diff options
Diffstat (limited to 'kernel')
141 files changed, 13274 insertions, 6626 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks new file mode 100644 index 00000000000..88c92fb4461 --- /dev/null +++ b/kernel/Kconfig.locks @@ -0,0 +1,202 @@ +# +# The ARCH_INLINE foo is necessary because select ignores "depends on" +# +config ARCH_INLINE_SPIN_TRYLOCK + bool + +config ARCH_INLINE_SPIN_TRYLOCK_BH + bool + +config ARCH_INLINE_SPIN_LOCK + bool + +config ARCH_INLINE_SPIN_LOCK_BH + bool + +config ARCH_INLINE_SPIN_LOCK_IRQ + bool + +config ARCH_INLINE_SPIN_LOCK_IRQSAVE + bool + +config ARCH_INLINE_SPIN_UNLOCK + bool + +config ARCH_INLINE_SPIN_UNLOCK_BH + bool + +config ARCH_INLINE_SPIN_UNLOCK_IRQ + bool + +config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE + bool + + +config ARCH_INLINE_READ_TRYLOCK + bool + +config ARCH_INLINE_READ_LOCK + bool + +config ARCH_INLINE_READ_LOCK_BH + bool + +config ARCH_INLINE_READ_LOCK_IRQ + bool + +config ARCH_INLINE_READ_LOCK_IRQSAVE + bool + +config ARCH_INLINE_READ_UNLOCK + bool + +config ARCH_INLINE_READ_UNLOCK_BH + bool + +config ARCH_INLINE_READ_UNLOCK_IRQ + bool + +config ARCH_INLINE_READ_UNLOCK_IRQRESTORE + bool + + +config ARCH_INLINE_WRITE_TRYLOCK + bool + +config ARCH_INLINE_WRITE_LOCK + bool + +config ARCH_INLINE_WRITE_LOCK_BH + bool + +config ARCH_INLINE_WRITE_LOCK_IRQ + bool + +config ARCH_INLINE_WRITE_LOCK_IRQSAVE + bool + +config ARCH_INLINE_WRITE_UNLOCK + bool + +config ARCH_INLINE_WRITE_UNLOCK_BH + bool + +config ARCH_INLINE_WRITE_UNLOCK_IRQ + bool + +config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + bool + +# +# lock_* functions are inlined when: +# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y +# +# trylock_* functions are inlined when: +# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y +# +# unlock and unlock_irq functions are inlined when: +# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y +# or +# - DEBUG_SPINLOCK=n and PREEMPT=n +# +# unlock_bh and unlock_irqrestore functions are inlined when: +# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y +# + +config INLINE_SPIN_TRYLOCK + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK + +config INLINE_SPIN_TRYLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH + +config INLINE_SPIN_LOCK + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK + +config INLINE_SPIN_LOCK_BH + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_SPIN_LOCK_BH + +config INLINE_SPIN_LOCK_IRQ + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_SPIN_LOCK_IRQ + +config INLINE_SPIN_LOCK_IRQSAVE + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_SPIN_LOCK_IRQSAVE + +config INLINE_SPIN_UNLOCK + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK) + +config INLINE_SPIN_UNLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH + +config INLINE_SPIN_UNLOCK_IRQ + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH) + +config INLINE_SPIN_UNLOCK_IRQRESTORE + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE + + +config INLINE_READ_TRYLOCK + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK + +config INLINE_READ_LOCK + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK + +config INLINE_READ_LOCK_BH + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_READ_LOCK_BH + +config INLINE_READ_LOCK_IRQ + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_READ_LOCK_IRQ + +config INLINE_READ_LOCK_IRQSAVE + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_READ_LOCK_IRQSAVE + +config INLINE_READ_UNLOCK + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK) + +config INLINE_READ_UNLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH + +config INLINE_READ_UNLOCK_IRQ + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH) + +config INLINE_READ_UNLOCK_IRQRESTORE + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE + + +config INLINE_WRITE_TRYLOCK + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK + +config INLINE_WRITE_LOCK + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK + +config INLINE_WRITE_LOCK_BH + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_WRITE_LOCK_BH + +config INLINE_WRITE_LOCK_IRQ + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_WRITE_LOCK_IRQ + +config INLINE_WRITE_LOCK_IRQSAVE + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_WRITE_LOCK_IRQSAVE + +config INLINE_WRITE_UNLOCK + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK) + +config INLINE_WRITE_UNLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH + +config INLINE_WRITE_UNLOCK_IRQ + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH) + +config INLINE_WRITE_UNLOCK_IRQRESTORE + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + +config MUTEX_SPIN_ON_OWNER + def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES diff --git a/kernel/Makefile b/kernel/Makefile index 187c89b4783..864ff75d65f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -4,7 +4,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ - sysctl.o capability.o ptrace.o timer.o user.o \ + sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ @@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o @@ -58,7 +59,6 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o -obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o @@ -83,6 +83,7 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o +obj-$(CONFIG_TINY_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o @@ -95,7 +96,10 @@ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o +obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index 9a4715a2f6b..a6605ca921b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -536,7 +536,8 @@ static void do_acct_process(struct bsd_acct_struct *acct, do_div(elapsed, AHZ); ac.ac_btime = get_seconds() - elapsed; /* we really need to bite the bullet and change layout */ - current_uid_gid(&ac.ac_uid, &ac.ac_gid); + ac.ac_uid = orig_cred->uid; + ac.ac_gid = orig_cred->gid; #if ACCT_VERSION==2 ac.ac_ahz = AHZ; #endif diff --git a/kernel/audit.c b/kernel/audit.c index defc2e6f1e3..5feed232be9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } case AUDIT_SIGNAL_INFO: - err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); - if (err) - return err; + len = 0; + if (audit_sig_sid) { + err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); + if (err) + return err; + } sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); if (!sig_data) { - security_release_secctx(ctx, len); + if (audit_sig_sid) + security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = audit_sig_uid; sig_data->pid = audit_sig_pid; - memcpy(sig_data->ctx, ctx, len); - security_release_secctx(ctx, len); + if (audit_sig_sid) { + memcpy(sig_data->ctx, ctx, len); + security_release_secctx(ctx, len); + } audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0e96dbc60ea..cc7e87936cb 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -45,8 +45,8 @@ struct audit_watch { atomic_t count; /* reference count */ - char *path; /* insertion path */ dev_t dev; /* associated superblock device */ + char *path; /* insertion path */ unsigned long ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 68d3c6a0ecd..267e484f019 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -168,12 +168,12 @@ struct audit_context { int in_syscall; /* 1 if task is in a syscall */ enum audit_state state, current_state; unsigned int serial; /* serial number for record */ - struct timespec ctime; /* time of syscall entry */ int major; /* syscall number */ + struct timespec ctime; /* time of syscall entry */ unsigned long argv[4]; /* syscall arguments */ - int return_valid; /* return code is valid */ long return_code;/* syscall return code */ u64 prio; + int return_valid; /* return code is valid */ int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ @@ -198,8 +198,8 @@ struct audit_context { char target_comm[TASK_COMM_LEN]; struct audit_tree_refs *trees, *first_trees; - int tree_count; struct list_head killed_trees; + int tree_count; int type; union { diff --git a/kernel/capability.c b/kernel/capability.c index 4e17041963f..7f876e60521 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set); EXPORT_SYMBOL(__cap_full_set); EXPORT_SYMBOL(__cap_init_eff_set); -#ifdef CONFIG_SECURITY_FILE_CAPABILITIES int file_caps_enabled = 1; static int __init file_caps_disable(char *str) @@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str) return 1; } __setup("no_file_caps", file_caps_disable); -#endif /* * More recent versions of libcap are available from: @@ -169,8 +167,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) kernel_cap_t pE, pI, pP; ret = cap_validate_magic(header, &tocopy); - if (ret != 0) - return ret; + if ((dataptr == NULL) || (ret != 0)) + return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -238,7 +236,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; - unsigned i, tocopy; + unsigned i, tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; @@ -255,8 +253,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; - if (copy_from_user(&kdata, data, - tocopy * sizeof(struct __user_cap_data_struct))) + copybytes = tocopy * sizeof(struct __user_cap_data_struct); + if (copybytes > sizeof(kdata)) + return -EFAULT; + + if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; for (i = 0; i < tocopy; i++) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cd83d9933b6..0249f4be9b5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -23,6 +23,7 @@ */ #include <linux/cgroup.h> +#include <linux/ctype.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -48,6 +49,8 @@ #include <linux/namei.h> #include <linux/smp_lock.h> #include <linux/pid_namespace.h> +#include <linux/idr.h> +#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <asm/atomic.h> @@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = { #include <linux/cgroup_subsys.h> }; +#define MAX_CGROUP_ROOT_NAMELEN 64 + /* * A cgroupfs_root represents the root of a cgroup hierarchy, * and may be associated with a superblock to form an active @@ -74,6 +79,9 @@ struct cgroupfs_root { */ unsigned long subsys_bits; + /* Unique id for this hierarchy. */ + int hierarchy_id; + /* The bitmask of subsystems currently attached to this hierarchy */ unsigned long actual_subsys_bits; @@ -94,6 +102,9 @@ struct cgroupfs_root { /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; + + /* The name for this hierarchy - may be empty */ + char name[MAX_CGROUP_ROOT_NAMELEN]; }; /* @@ -141,6 +152,10 @@ struct css_id { static LIST_HEAD(roots); static int root_count; +static DEFINE_IDA(hierarchy_ida); +static int next_hierarchy_id; +static DEFINE_SPINLOCK(hierarchy_id_lock); + /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) @@ -201,6 +216,7 @@ struct cg_cgroup_link { * cgroup, anchored on cgroup->css_sets */ struct list_head cgrp_link_list; + struct cgroup *cgrp; /* * List running through cg_cgroup_links pointing at a * single css_set object, anchored on css_set->cg_links @@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); static DEFINE_RWLOCK(css_set_lock); static int css_set_count; -/* hash table for cgroup groups. This improves the performance to - * find an existing css_set */ +/* + * hash table for cgroup groups. This improves the performance to find + * an existing css_set. This hash doesn't (currently) take into + * account cgroups in empty hierarchies. + */ #define CSS_SET_HASH_BITS 7 #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; @@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) return &css_set_table[index]; } +static void free_css_set_rcu(struct rcu_head *obj) +{ + struct css_set *cg = container_of(obj, struct css_set, rcu_head); + kfree(cg); +} + /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups * compiled into their kernel but not actually in use */ static int use_task_css_set_links __read_mostly; -/* When we create or destroy a css_set, the operation simply - * takes/releases a reference count on all the cgroups referenced - * by subsystems in this css_set. This can end up multiple-counting - * some cgroups, but that's OK - the ref-count is just a - * busy/not-busy indicator; ensuring that we only count each cgroup - * once would require taking a global lock to ensure that no - * subsystems moved between hierarchies while we were doing so. - * - * Possible TODO: decide at boot time based on the number of - * registered subsystems and the number of CPUs or NUMA nodes whether - * it's better for performance to ref-count every subsystem, or to - * take a global lock and only add one ref count to each hierarchy. - */ - -/* - * unlink a css_set from the list and free it - */ -static void unlink_css_set(struct css_set *cg) +static void __put_css_set(struct css_set *cg, int taskexit) { struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; - - hlist_del(&cg->hlist); - css_set_count--; - - list_for_each_entry_safe(link, saved_link, &cg->cg_links, - cg_link_list) { - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); - kfree(link); - } -} - -static void __put_css_set(struct css_set *cg, int taskexit) -{ - int i; /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an @@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit) write_unlock(&css_set_lock); return; } - unlink_css_set(cg); - write_unlock(&css_set_lock); - rcu_read_lock(); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); + /* This css_set is dead. unlink it and release cgroup refcounts */ + hlist_del(&cg->hlist); + css_set_count--; + + list_for_each_entry_safe(link, saved_link, &cg->cg_links, + cg_link_list) { + struct cgroup *cgrp = link->cgrp; + list_del(&link->cg_link_list); + list_del(&link->cgrp_link_list); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } + + kfree(link); } - rcu_read_unlock(); - kfree(cg); + + write_unlock(&css_set_lock); + call_rcu(&cg->rcu_head, free_css_set_rcu); } /* @@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg) } /* + * compare_css_sets - helper function for find_existing_css_set(). + * @cg: candidate css_set being tested + * @old_cg: existing css_set for a task + * @new_cgrp: cgroup that's being entered by the task + * @template: desired set of css pointers in css_set (pre-calculated) + * + * Returns true if "cg" matches "old_cg" except for the hierarchy + * which "new_cgrp" belongs to, for which it should match "new_cgrp". + */ +static bool compare_css_sets(struct css_set *cg, + struct css_set *old_cg, + struct cgroup *new_cgrp, + struct cgroup_subsys_state *template[]) +{ + struct list_head *l1, *l2; + + if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { + /* Not all subsystems matched */ + return false; + } + + /* + * Compare cgroup pointers in order to distinguish between + * different cgroups in heirarchies with no subsystems. We + * could get by with just this check alone (and skip the + * memcmp above) but on most setups the memcmp check will + * avoid the need for this more expensive check on almost all + * candidates. + */ + + l1 = &cg->cg_links; + l2 = &old_cg->cg_links; + while (1) { + struct cg_cgroup_link *cgl1, *cgl2; + struct cgroup *cg1, *cg2; + + l1 = l1->next; + l2 = l2->next; + /* See if we reached the end - both lists are equal length. */ + if (l1 == &cg->cg_links) { + BUG_ON(l2 != &old_cg->cg_links); + break; + } else { + BUG_ON(l2 == &old_cg->cg_links); + } + /* Locate the cgroups associated with these links. */ + cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); + cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); + cg1 = cgl1->cgrp; + cg2 = cgl2->cgrp; + /* Hierarchies should be linked in the same order. */ + BUG_ON(cg1->root != cg2->root); + + /* + * If this hierarchy is the hierarchy of the cgroup + * that's changing, then we need to check that this + * css_set points to the new cgroup; if it's any other + * hierarchy, then this css_set should point to the + * same cgroup as the old css_set. + */ + if (cg1->root == new_cgrp->root) { + if (cg1 != new_cgrp) + return false; + } else { + if (cg1 != cg2) + return false; + } + } + return true; +} + +/* * find_existing_css_set() is a helper for * find_css_set(), and checks to see whether an existing * css_set is suitable. @@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set( hhead = css_set_hash(template); hlist_for_each_entry(cg, node, hhead, hlist) { - if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { - /* All subsystems matched */ - return cg; - } + if (!compare_css_sets(cg, oldcg, cgrp, template)) + continue; + + /* This css_set matches what we need */ + return cg; } /* No existing cgroup group matched */ @@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links, link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, cgrp_link_list); link->cg = cg; + link->cgrp = cgrp; + atomic_inc(&cgrp->count); list_move(&link->cgrp_link_list, &cgrp->css_sets); - list_add(&link->cg_link_list, &cg->cg_links); + /* + * Always add links to the tail of the list so that the list + * is sorted by order of hierarchy creation + */ + list_add_tail(&link->cg_link_list, &cg->cg_links); } /* @@ -451,11 +530,11 @@ static struct css_set *find_css_set( { struct css_set *res; struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - int i; struct list_head tmp_cg_links; struct hlist_head *hhead; + struct cg_cgroup_link *link; /* First see if we already have a cgroup group that matches * the desired set */ @@ -489,20 +568,12 @@ static struct css_set *find_css_set( write_lock(&css_set_lock); /* Add reference counts and links from the new css_set. */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = res->subsys[i]->cgroup; - struct cgroup_subsys *ss = subsys[i]; - atomic_inc(&cgrp->count); - /* - * We want to add a link once per cgroup, so we - * only do it for the first subsystem in each - * hierarchy - */ - if (ss->root->subsys_list.next == &ss->sibling) - link_css_set(&tmp_cg_links, res, cgrp); + list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + if (c->root == cgrp->root) + c = cgrp; + link_css_set(&tmp_cg_links, res, c); } - if (list_empty(&rootnode.subsys_list)) - link_css_set(&tmp_cg_links, res, dummytop); BUG_ON(!list_empty(&tmp_cg_links)); @@ -518,6 +589,41 @@ static struct css_set *find_css_set( } /* + * Return the cgroup for "task" from the given hierarchy. Must be + * called with cgroup_mutex held. + */ +static struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroupfs_root *root) +{ + struct css_set *css; + struct cgroup *res = NULL; + + BUG_ON(!mutex_is_locked(&cgroup_mutex)); + read_lock(&css_set_lock); + /* + * No need to lock the task - since we hold cgroup_mutex the + * task can't change groups, so the only thing that can happen + * is that it exits and its css is set back to init_css_set. + */ + css = task->cgroups; + if (css == &init_css_set) { + res = &root->top_cgroup; + } else { + struct cg_cgroup_link *link; + list_for_each_entry(link, &css->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + if (c->root == root) { + res = c; + break; + } + } + } + read_unlock(&css_set_lock); + BUG_ON(!res); + return res; +} + +/* * There is one global cgroup mutex. We also require taking * task_lock() when dereferencing a task's cgroup subsys pointers. * See "The task_lock() exception", at the end of this comment. @@ -597,7 +703,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); static const struct inode_operations cgroup_dir_inode_operations; -static struct file_operations proc_cgroupstats_operations; +static const struct file_operations proc_cgroupstats_operations; static struct backing_dev_info cgroup_backing_dev_info = { .name = "cgroup", @@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ deactivate_super(cgrp->root->sb); + /* + * if we're getting rid of the cgroup, refcount should ensure + * that there are no pidlists left. + */ + BUG_ON(!list_empty(&cgrp->pidlists)); + call_rcu(&cgrp->rcu_head, free_cgroup_rcu); } iput(inode); @@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + if (strlen(root->name)) + seq_printf(seq, ",name=%s", root->name); mutex_unlock(&cgroup_mutex); return 0; } @@ -849,6 +963,12 @@ struct cgroup_sb_opts { unsigned long subsys_bits; unsigned long flags; char *release_agent; + char *name; + /* User explicitly requested empty subsystem */ + bool none; + + struct cgroupfs_root *new_root; + }; /* Convert a hierarchy specifier into a bitmask of subsystems and @@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data, mask = ~(1UL << cpuset_subsys_id); #endif - opts->subsys_bits = 0; - opts->flags = 0; - opts->release_agent = NULL; + memset(opts, 0, sizeof(*opts)); while ((token = strsep(&o, ",")) != NULL) { if (!*token) @@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data, if (!ss->disabled) opts->subsys_bits |= 1ul << i; } + } else if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); } else if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) return -EINVAL; - opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); + opts->release_agent = + kstrndup(token + 14, PATH_MAX, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; - strncpy(opts->release_agent, token + 14, PATH_MAX - 1); - opts->release_agent[PATH_MAX - 1] = 0; + } else if (!strncmp(token, "name=", 5)) { + int i; + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; } else { struct cgroup_subsys *ss; int i; @@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data, } } + /* Consistency checks */ + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just @@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data, (opts->subsys_bits & mask)) return -EINVAL; - /* We can't have an empty hierarchy */ - if (!opts->subsys_bits) + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_bits && opts->none) + return -EINVAL; + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_bits && !opts->name) return -EINVAL; return 0; @@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } + /* Don't allow name to change at remount */ + if (opts.name && strcmp(opts.name, root->name)) { + ret = -EINVAL; + goto out_unlock; + } + ret = rebind_subsystems(root, opts.subsys_bits); if (ret) goto out_unlock; @@ -955,6 +1114,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) strcpy(root->release_agent_path, opts.release_agent); out_unlock: kfree(opts.release_agent); + kfree(opts.name); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); unlock_kernel(); @@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); - INIT_LIST_HEAD(&cgrp->pids_list); - init_rwsem(&cgrp->pids_mutex); + INIT_LIST_HEAD(&cgrp->pidlists); + mutex_init(&cgrp->pidlist_mutex); } + static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; @@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root) init_cgroup_housekeeping(cgrp); } +static bool init_root_id(struct cgroupfs_root *root) +{ + int ret = 0; + + do { + if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) + return false; + spin_lock(&hierarchy_id_lock); + /* Try to allocate the next unused ID */ + ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, + &root->hierarchy_id); + if (ret == -ENOSPC) + /* Try again starting from 0 */ + ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); + if (!ret) { + next_hierarchy_id = root->hierarchy_id + 1; + } else if (ret != -EAGAIN) { + /* Can only get here if the 31-bit IDR is full ... */ + BUG_ON(ret); + } + spin_unlock(&hierarchy_id_lock); + } while (ret); + return true; +} + static int cgroup_test_super(struct super_block *sb, void *data) { - struct cgroupfs_root *new = data; + struct cgroup_sb_opts *opts = data; struct cgroupfs_root *root = sb->s_fs_info; - /* First check subsystems */ - if (new->subsys_bits != root->subsys_bits) - return 0; + /* If we asked for a name then it must match */ + if (opts->name && strcmp(opts->name, root->name)) + return 0; - /* Next check flags */ - if (new->flags != root->flags) + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match + */ + if ((opts->subsys_bits || opts->none) + && (opts->subsys_bits != root->subsys_bits)) return 0; return 1; } +static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) +{ + struct cgroupfs_root *root; + + if (!opts->subsys_bits && !opts->none) + return NULL; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + if (!init_root_id(root)) { + kfree(root); + return ERR_PTR(-ENOMEM); + } + init_cgroup_root(root); + + root->subsys_bits = opts->subsys_bits; + root->flags = opts->flags; + if (opts->release_agent) + strcpy(root->release_agent_path, opts->release_agent); + if (opts->name) + strcpy(root->name, opts->name); + return root; +} + +static void cgroup_drop_root(struct cgroupfs_root *root) +{ + if (!root) + return; + + BUG_ON(!root->hierarchy_id); + spin_lock(&hierarchy_id_lock); + ida_remove(&hierarchy_ida, root->hierarchy_id); + spin_unlock(&hierarchy_id_lock); + kfree(root); +} + static int cgroup_set_super(struct super_block *sb, void *data) { int ret; - struct cgroupfs_root *root = data; + struct cgroup_sb_opts *opts = data; + + /* If we don't have a new root, we can't set up a new sb */ + if (!opts->new_root) + return -EINVAL; + + BUG_ON(!opts->subsys_bits && !opts->none); ret = set_anon_super(sb, NULL); if (ret) return ret; - sb->s_fs_info = root; - root->sb = sb; + sb->s_fs_info = opts->new_root; + opts->new_root->sb = sb; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; @@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type, void *data, struct vfsmount *mnt) { struct cgroup_sb_opts opts; + struct cgroupfs_root *root; int ret = 0; struct super_block *sb; - struct cgroupfs_root *root; - struct list_head tmp_cg_links; + struct cgroupfs_root *new_root; /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); - if (ret) { - kfree(opts.release_agent); - return ret; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - kfree(opts.release_agent); - return -ENOMEM; - } + if (ret) + goto out_err; - init_cgroup_root(root); - root->subsys_bits = opts.subsys_bits; - root->flags = opts.flags; - if (opts.release_agent) { - strcpy(root->release_agent_path, opts.release_agent); - kfree(opts.release_agent); + /* + * Allocate a new cgroup root. We may not need it if we're + * reusing an existing hierarchy. + */ + new_root = cgroup_root_from_opts(&opts); + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + goto out_err; } + opts.new_root = new_root; - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); - + /* Locate an existing or new sb for this hierarchy */ + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); if (IS_ERR(sb)) { - kfree(root); - return PTR_ERR(sb); + ret = PTR_ERR(sb); + cgroup_drop_root(opts.new_root); + goto out_err; } - if (sb->s_fs_info != root) { - /* Reusing an existing superblock */ - BUG_ON(sb->s_root == NULL); - kfree(root); - root = NULL; - } else { - /* New superblock */ + root = sb->s_fs_info; + BUG_ON(!root); + if (root == opts.new_root) { + /* We used the new root structure, so this is a new hierarchy */ + struct list_head tmp_cg_links; struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; + struct cgroupfs_root *existing_root; int i; BUG_ON(sb->s_root != NULL); @@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); + if (strlen(root->name)) { + /* Check for name clashes with existing mounts */ + for_each_active_root(existing_root) { + if (!strcmp(existing_root->name, root->name)) { + ret = -EBUSY; + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + goto drop_new_super; + } + } + } + /* * We're accessing css_set_count without locking * css_set_lock here, but that's OK - it can only be @@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, if (ret == -EBUSY) { mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - goto free_cg_links; + free_cg_links(&tmp_cg_links); + goto drop_new_super; } /* EBUSY should be the only error here */ @@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); cgroup_populate_dir(root_cgrp); - mutex_unlock(&inode->i_mutex); mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + } else { + /* + * We re-used an existing hierarchy - the new root (if + * any) is not needed + */ + cgroup_drop_root(opts.new_root); } simple_set_mnt(mnt, sb); + kfree(opts.release_agent); + kfree(opts.name); return 0; - free_cg_links: - free_cg_links(&tmp_cg_links); drop_new_super: deactivate_locked_super(sb); + out_err: + kfree(opts.release_agent); + kfree(opts.name); + return ret; } @@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_mutex); kill_litter_super(sb); - kfree(root); + cgroup_drop_root(root); } static struct file_system_type cgroup_fs_type = { @@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) return 0; } -/* - * Return the first subsystem attached to a cgroup's hierarchy, and - * its subsystem id. - */ - -static void get_first_subsys(const struct cgroup *cgrp, - struct cgroup_subsys_state **css, int *subsys_id) -{ - const struct cgroupfs_root *root = cgrp->root; - const struct cgroup_subsys *test_ss; - BUG_ON(list_empty(&root->subsys_list)); - test_ss = list_entry(root->subsys_list.next, - struct cgroup_subsys, sibling); - if (css) { - *css = cgrp->subsys[test_ss->subsys_id]; - BUG_ON(!*css); - } - if (subsys_id) - *subsys_id = test_ss->subsys_id; -} - /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to @@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct css_set *cg; struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; - int subsys_id; - - get_first_subsys(cgrp, NULL, &subsys_id); /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup(tsk, subsys_id); + oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) return 0; for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk); + retval = ss->can_attach(ss, cgrp, tsk, false); if (retval) return retval; } @@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk); + ss->attach(ss, cgrp, oldcgrp, tsk, false); } set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); @@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) return ret; } -/* The various types of files and directories in a cgroup file system */ -enum cgroup_filetype { - FILE_ROOT, - FILE_DIR, - FILE_TASKLIST, - FILE_NOTIFY_ON_RELEASE, - FILE_RELEASE_AGENT, -}; - /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -1491,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, return -EFAULT; buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); if (cft->write_u64) { - u64 val = simple_strtoull(buffer, &end, 0); + u64 val = simple_strtoull(strstrip(buffer), &end, 0); if (*end) return -EINVAL; retval = cft->write_u64(cgrp, cft, val); } else { - s64 val = simple_strtoll(buffer, &end, 0); + s64 val = simple_strtoll(strstrip(buffer), &end, 0); if (*end) return -EINVAL; retval = cft->write_s64(cgrp, cft, val); @@ -1534,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, } buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); - retval = cft->write_string(cgrp, cft, buffer); + retval = cft->write_string(cgrp, cft, strstrip(buffer)); if (!retval) retval = nbytes; out: @@ -1644,7 +1861,7 @@ static int cgroup_seqfile_release(struct inode *inode, struct file *file) return single_release(inode, file); } -static struct file_operations cgroup_seqfile_operations = { +static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, @@ -1703,7 +1920,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, return simple_rename(old_dir, old_dentry, new_dir, new_dentry); } -static struct file_operations cgroup_file_operations = { +static const struct file_operations cgroup_file_operations = { .read = cgroup_file_read, .write = cgroup_file_write, .llseek = generic_file_llseek, @@ -1876,7 +2093,7 @@ int cgroup_task_count(const struct cgroup *cgrp) * the start of a css_set */ static void cgroup_advance_iter(struct cgroup *cgrp, - struct cgroup_iter *it) + struct cgroup_iter *it) { struct list_head *l = it->cg_link; struct cg_cgroup_link *link; @@ -2129,7 +2346,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) } /* - * Stuff for reading the 'tasks' file. + * Stuff for reading the 'tasks'/'procs' files. * * Reading this file can return large amounts of data if a cgroup has * *lots* of attached tasks. So it may need several calls to read(), @@ -2139,27 +2356,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ /* - * Load into 'pidarray' up to 'npids' of the tasks using cgroup - * 'cgrp'. Return actual number of pids loaded. No need to - * task_lock(p) when reading out p->cgroup, since we're in an RCU - * read section, so the css_set can't go away, and is - * immutable after creation. + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. + * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) +{ + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} +static void pidlist_free(void *p) +{ + if (is_vmalloc_addr(p)) + vfree(p); + else + kfree(p); +} +static void *pidlist_resize(void *p, int newcount) +{ + void *newlist; + /* note: if new alloc fails, old p will still be valid either way */ + if (is_vmalloc_addr(p)) { + newlist = vmalloc(newcount * sizeof(pid_t)); + if (!newlist) + return NULL; + memcpy(newlist, p, newcount * sizeof(pid_t)); + vfree(p); + } else { + newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); + } + return newlist; +} + +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * If the new stripped list is sufficiently smaller and there's enough memory + * to allocate a new buffer, will let go of the unneeded memory. Returns the + * number of unique elements. + */ +/* is the size difference enough that we should re-allocate the array? */ +#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) +static int pidlist_uniq(pid_t **p, int length) +{ + int src, dest = 1; + pid_t *list = *p; + pid_t *newlist; + + /* + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either + */ + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + /* + * if the length difference is large enough, we want to allocate a + * smaller buffer to save memory. if this fails due to out of memory, + * we'll just stay with what we've got. + */ + if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { + newlist = pidlist_resize(list, dest); + if (newlist) + *p = newlist; + } + return dest; +} + +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. */ -static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) { - int n = 0, pid; + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); + /* + * We can't drop the pidlist_mutex before taking the l->mutex in case + * the last ref-holder is trying to remove l from the list at the same + * time. Holding the pidlist_mutex precludes somebody taking whichever + * list we find out from under us - compare release_pid_array(). + */ + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry(l, &cgrp->pidlists, links) { + if (l->key.type == type && l->key.ns == ns) { + /* found a matching list - drop the extra refcount */ + put_pid_ns(ns); + /* make sure l doesn't vanish out from under us */ + down_write(&l->mutex); + mutex_unlock(&cgrp->pidlist_mutex); + l->use_count++; + return l; + } + } + /* entry not found; create a new one */ + l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) { + mutex_unlock(&cgrp->pidlist_mutex); + put_pid_ns(ns); + return l; + } + init_rwsem(&l->mutex); + down_write(&l->mutex); + l->key.type = type; + l->key.ns = ns; + l->use_count = 0; /* don't increment here */ + l->list = NULL; + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + mutex_unlock(&cgrp->pidlist_mutex); + return l; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ struct cgroup_iter it; struct task_struct *tsk; + struct cgroup_pidlist *l; + + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. + */ + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { - if (unlikely(n == npids)) + if (unlikely(n == length)) break; - pid = task_pid_vnr(tsk); - if (pid > 0) - pidarray[n++] = pid; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; } cgroup_iter_end(cgrp, &it); - return n; + length = n; + /* now sort & (if procs) strip out duplicates */ + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(&array, length); + l = cgroup_pidlist_find(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + /* store array, freeing old if necessary - lock already held */ + pidlist_free(l->list); + l->list = array; + l->length = length; + l->use_count++; + up_write(&l->mutex); + *lp = l; + return 0; } /** @@ -2216,37 +2602,14 @@ err: return ret; } -/* - * Cache pids for all threads in the same pid namespace that are - * opening the same "tasks" file. - */ -struct cgroup_pids { - /* The node in cgrp->pids_list */ - struct list_head list; - /* The cgroup those pids belong to */ - struct cgroup *cgrp; - /* The namepsace those pids belong to */ - struct pid_namespace *ns; - /* Array of process ids in the cgroup */ - pid_t *tasks_pids; - /* How many files are using the this tasks_pids array */ - int use_count; - /* Length of the current tasks_pids array */ - int length; -}; - -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} /* - * seq_file methods for the "tasks" file. The seq_file position is the + * seq_file methods for the tasks/procs files. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->tasks_pids array. + * in the cgroup->l->list array. */ -static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to @@ -2254,48 +2617,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_pids *cp = s->private; - struct cgroup *cgrp = cp->cgrp; + struct cgroup_pidlist *l = s->private; int index = 0, pid = *pos; int *iter; - down_read(&cgrp->pids_mutex); + down_read(&l->mutex); if (pid) { - int end = cp->length; + int end = l->length; while (index < end) { int mid = (index + end) / 2; - if (cp->tasks_pids[mid] == pid) { + if (l->list[mid] == pid) { index = mid; break; - } else if (cp->tasks_pids[mid] <= pid) + } else if (l->list[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ - if (index >= cp->length) + if (index >= l->length) return NULL; /* Update the abstract position to be the actual pid that we found */ - iter = cp->tasks_pids + index; + iter = l->list + index; *pos = *iter; return iter; } -static void cgroup_tasks_stop(struct seq_file *s, void *v) +static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_pids *cp = s->private; - struct cgroup *cgrp = cp->cgrp; - up_read(&cgrp->pids_mutex); + struct cgroup_pidlist *l = s->private; + up_read(&l->mutex); } -static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_pids *cp = s->private; - int *p = v; - int *end = cp->tasks_pids + cp->length; - + struct cgroup_pidlist *l = s->private; + pid_t *p = v; + pid_t *end = l->list + l->length; /* * Advance to the next pid in the array. If this goes off the * end, we're done @@ -2309,124 +2669,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) } } -static int cgroup_tasks_show(struct seq_file *s, void *v) +static int cgroup_pidlist_show(struct seq_file *s, void *v) { return seq_printf(s, "%d\n", *(int *)v); } -static const struct seq_operations cgroup_tasks_seq_operations = { - .start = cgroup_tasks_start, - .stop = cgroup_tasks_stop, - .next = cgroup_tasks_next, - .show = cgroup_tasks_show, +/* + * seq_operations functions for iterating on pidlists through seq_file - + * independent of whether it's tasks or procs + */ +static const struct seq_operations cgroup_pidlist_seq_operations = { + .start = cgroup_pidlist_start, + .stop = cgroup_pidlist_stop, + .next = cgroup_pidlist_next, + .show = cgroup_pidlist_show, }; -static void release_cgroup_pid_array(struct cgroup_pids *cp) +static void cgroup_release_pid_array(struct cgroup_pidlist *l) { - struct cgroup *cgrp = cp->cgrp; - - down_write(&cgrp->pids_mutex); - BUG_ON(!cp->use_count); - if (!--cp->use_count) { - list_del(&cp->list); - put_pid_ns(cp->ns); - kfree(cp->tasks_pids); - kfree(cp); + /* + * the case where we're the last user of this particular pidlist will + * have us remove it from the cgroup's list, which entails taking the + * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> + * pidlist_mutex, we have to take pidlist_mutex first. + */ + mutex_lock(&l->owner->pidlist_mutex); + down_write(&l->mutex); + BUG_ON(!l->use_count); + if (!--l->use_count) { + /* we're the last user if refcount is 0; remove and free */ + list_del(&l->links); + mutex_unlock(&l->owner->pidlist_mutex); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + up_write(&l->mutex); + kfree(l); + return; } - up_write(&cgrp->pids_mutex); + mutex_unlock(&l->owner->pidlist_mutex); + up_write(&l->mutex); } -static int cgroup_tasks_release(struct inode *inode, struct file *file) +static int cgroup_pidlist_release(struct inode *inode, struct file *file) { - struct seq_file *seq; - struct cgroup_pids *cp; - + struct cgroup_pidlist *l; if (!(file->f_mode & FMODE_READ)) return 0; - - seq = file->private_data; - cp = seq->private; - - release_cgroup_pid_array(cp); + /* + * the seq_file will only be initialized if the file was opened for + * reading; hence we check if it's not null only in that case. + */ + l = ((struct seq_file *)file->private_data)->private; + cgroup_release_pid_array(l); return seq_release(inode, file); } -static struct file_operations cgroup_tasks_operations = { +static const struct file_operations cgroup_pidlist_operations = { .read = seq_read, .llseek = seq_lseek, .write = cgroup_file_write, - .release = cgroup_tasks_release, + .release = cgroup_pidlist_release, }; /* - * Handle an open on 'tasks' file. Prepare an array containing the - * process id's of tasks currently attached to the cgroup being opened. + * The following functions handle opens on a file that displays a pidlist + * (tasks or procs). Prepare an array of the process/thread IDs of whoever's + * in the cgroup. */ - -static int cgroup_tasks_open(struct inode *unused, struct file *file) +/* helper function for the two below it */ +static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct pid_namespace *ns = current->nsproxy->pid_ns; - struct cgroup_pids *cp; - pid_t *pidarray; - int npids; + struct cgroup_pidlist *l; int retval; /* Nothing to do for write-only files */ if (!(file->f_mode & FMODE_READ)) return 0; - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. - */ - npids = cgroup_task_count(cgrp); - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); - if (!pidarray) - return -ENOMEM; - npids = pid_array_load(pidarray, npids, cgrp); - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - - /* - * Store the array in the cgroup, freeing the old - * array if necessary - */ - down_write(&cgrp->pids_mutex); - - list_for_each_entry(cp, &cgrp->pids_list, list) { - if (ns == cp->ns) - goto found; - } - - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) { - up_write(&cgrp->pids_mutex); - kfree(pidarray); - return -ENOMEM; - } - cp->cgrp = cgrp; - cp->ns = ns; - get_pid_ns(ns); - list_add(&cp->list, &cgrp->pids_list); -found: - kfree(cp->tasks_pids); - cp->tasks_pids = pidarray; - cp->length = npids; - cp->use_count++; - up_write(&cgrp->pids_mutex); - - file->f_op = &cgroup_tasks_operations; + /* have the array populated */ + retval = pidlist_array_load(cgrp, type, &l); + if (retval) + return retval; + /* configure file information */ + file->f_op = &cgroup_pidlist_operations; - retval = seq_open(file, &cgroup_tasks_seq_operations); + retval = seq_open(file, &cgroup_pidlist_seq_operations); if (retval) { - release_cgroup_pid_array(cp); + cgroup_release_pid_array(l); return retval; } - ((struct seq_file *)file->private_data)->private = cp; + ((struct seq_file *)file->private_data)->private = l; return 0; } +static int cgroup_tasks_open(struct inode *unused, struct file *file) +{ + return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); +} +static int cgroup_procs_open(struct inode *unused, struct file *file) +{ + return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); +} static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, struct cftype *cft) @@ -2449,21 +2792,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, /* * for the common functions, 'private' gives the type of file */ +/* for hysterical raisins, we can't put this on the older files */ +#define CGROUP_FILE_GENERIC_PREFIX "cgroup." static struct cftype files[] = { { .name = "tasks", .open = cgroup_tasks_open, .write_u64 = cgroup_tasks_write, - .release = cgroup_tasks_release, - .private = FILE_TASKLIST, + .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, - + { + .name = CGROUP_FILE_GENERIC_PREFIX "procs", + .open = cgroup_procs_open, + /* .write_u64 = cgroup_procs_write, TODO */ + .release = cgroup_pidlist_release, + .mode = S_IRUGO, + }, { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, - .private = FILE_NOTIFY_ON_RELEASE, }, }; @@ -2472,7 +2821,6 @@ static struct cftype cft_release_agent = { .read_seq_string = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, - .private = FILE_RELEASE_AGENT, }; static int cgroup_populate_dir(struct cgroup *cgrp) @@ -2879,6 +3227,7 @@ int __init cgroup_init_early(void) init_task.cgroups = &init_css_set; init_css_set_link.cg = &init_css_set; + init_css_set_link.cgrp = dummytop; list_add(&init_css_set_link.cgrp_link_list, &rootnode.top_cgroup.css_sets); list_add(&init_css_set_link.cg_link_list, @@ -2933,7 +3282,7 @@ int __init cgroup_init(void) /* Add init_css_set to the hash table */ hhead = css_set_hash(init_css_set.subsys); hlist_add_head(&init_css_set.hlist, hhead); - + BUG_ON(!init_root_id(&rootnode)); err = register_filesystem(&cgroup_fs_type); if (err < 0) goto out; @@ -2986,15 +3335,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v) for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; - int subsys_id; int count = 0; - seq_printf(m, "%lu:", root->subsys_bits); + seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + if (strlen(root->name)) + seq_printf(m, "%sname=%s", count ? "," : "", + root->name); seq_putc(m, ':'); - get_first_subsys(&root->top_cgroup, NULL, &subsys_id); - cgrp = task_cgroup(tsk, subsys_id); + cgrp = task_cgroup_from_root(tsk, root); retval = cgroup_path(cgrp, buf, PAGE_SIZE); if (retval < 0) goto out_unlock; @@ -3017,7 +3367,7 @@ static int cgroup_open(struct inode *inode, struct file *file) return single_open(file, proc_cgroup_show, pid); } -struct file_operations proc_cgroup_operations = { +const struct file_operations proc_cgroup_operations = { .open = cgroup_open, .read = seq_read, .llseek = seq_lseek, @@ -3033,8 +3383,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - seq_printf(m, "%s\t%lu\t%d\t%d\n", - ss->name, ss->root->subsys_bits, + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->name, ss->root->hierarchy_id, ss->root->number_of_cgroups, !ss->disabled); } mutex_unlock(&cgroup_mutex); @@ -3046,7 +3396,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file) return single_open(file, proc_cgroupstats_show, NULL); } -static struct file_operations proc_cgroupstats_operations = { +static const struct file_operations proc_cgroupstats_operations = { .open = cgroupstats_open, .read = seq_read, .llseek = seq_lseek, @@ -3320,13 +3670,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) { int ret; struct cgroup *target; - int subsys_id; if (cgrp == dummytop) return 1; - get_first_subsys(cgrp, NULL, &subsys_id); - target = task_cgroup(task, subsys_id); + target = task_cgroup_from_root(task, cgrp->root); while (cgrp != target && cgrp!= cgrp->top_cgroup) cgrp = cgrp->parent; ret = (cgrp == target); @@ -3358,8 +3706,10 @@ static void check_for_release(struct cgroup *cgrp) void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; + int val; rcu_read_lock(); - if (atomic_dec_return(&css->refcnt) == 1) { + val = atomic_dec_return(&css->refcnt); + if (val == 1) { if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); @@ -3367,6 +3717,7 @@ void __css_put(struct cgroup_subsys_state *css) cgroup_wakeup_rmdir_waiter(cgrp); } rcu_read_unlock(); + WARN_ON_ONCE(val < 1); } /* @@ -3693,3 +4044,154 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +{ + kfree(cont->subsys[debug_subsys_id]); +} + +static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) +{ + return atomic_read(&cont->count); +} + +static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) +{ + return cgroup_task_count(cont); +} + +static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup *cont, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(¤t->cgroups->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + struct cg_cgroup_link *link; + struct css_set *cg; + + read_lock(&css_set_lock); + rcu_read_lock(); + cg = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cg->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + const char *name; + + if (c->dentry) + name = c->dentry->d_name.name; + else + name = "?"; + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name); + } + rcu_read_unlock(); + read_unlock(&css_set_lock); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + struct cg_cgroup_link *link; + + read_lock(&css_set_lock); + list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + struct task_struct *task; + int count = 0; + seq_printf(seq, "css_set %p\n", cg); + list_for_each_entry(task, &cg->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) { + seq_puts(seq, " ...\n"); + break; + } else { + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); + } + } + } + read_unlock(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +{ + return test_bit(CGRP_RELEASABLE, &cgrp->flags); +} + +static struct cftype debug_files[] = { + { + .name = "cgroup_refcount", + .read_u64 = cgroup_refcount_read, + }, + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .read_seq_string = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .read_seq_string = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, +}; + +static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, debug_files, + ARRAY_SIZE(debug_files)); +} + +struct cgroup_subsys debug_subsys = { + .name = "debug", + .create = debug_create, + .destroy = debug_destroy, + .populate = debug_populate, + .subsys_id = debug_subsys_id, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c deleted file mode 100644 index 0c92d797baa..00000000000 --- a/kernel/cgroup_debug.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * kernel/cgroup_debug.c - Example cgroup subsystem that - * exposes debug info - * - * Copyright (C) Google Inc, 2007 - * - * Developed by Paul Menage (menage@google.com) - * - */ - -#include <linux/cgroup.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/rcupdate.h> - -#include <asm/atomic.h> - -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) -{ - kfree(cont->subsys[debug_subsys_id]); -} - -static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) -{ - return atomic_read(&cont->count); -} - -static u64 taskcount_read(struct cgroup *cont, struct cftype *cft) -{ - u64 count; - - count = cgroup_task_count(cont); - return count; -} - -static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) -{ - return (u64)(long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup *cont, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(¤t->cgroups->refcount); - rcu_read_unlock(); - return count; -} - -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) -{ - return test_bit(CGRP_RELEASABLE, &cgrp->flags); -} - -static struct cftype files[] = { - { - .name = "cgroup_refcount", - .read_u64 = cgroup_refcount_read, - }, - { - .name = "taskcount", - .read_u64 = taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, -}; - -static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); -} - -struct cgroup_subsys debug_subsys = { - .name = "debug", - .create = debug_create, - .destroy = debug_destroy, - .populate = debug_populate, - .subsys_id = debug_subsys_id, -}; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fb249e2bcad..59e9ef6aab4 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task) */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task) + struct task_struct *task, bool threadgroup) { struct freezer *freezer; @@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss, if (freezer->state == CGROUP_FROZEN) return -EBUSY; + if (threadgroup) { + struct task_struct *c; + + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + if (is_task_frozen_enough(c)) { + rcu_read_unlock(); + return -EBUSY; + } + } + rcu_read_unlock(); + } + return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 6ba0f1ecb21..291ac586f37 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { + set_cpu_active(cpu, true); + nr_calls--; __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); @@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) /* Ensure that we are not runnable on dying cpu */ cpumask_copy(old_allowed, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_any_but(cpu_online_mask, cpu))); + set_cpus_allowed_ptr(current, cpu_active_mask); err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { + set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) @@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu) err = _cpu_down(cpu, 0); - if (cpu_online(cpu)) - set_cpu_active(cpu, true); - out: cpu_maps_update_done(); stop_machine_destroy(); @@ -387,15 +386,23 @@ int disable_nonboot_cpus(void) * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); + + for_each_online_cpu(cpu) { + if (cpu == first_cpu) + continue; + set_cpu_active(cpu, false); + } + + synchronize_sched(); + printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; error = _cpu_down(cpu, 1); - if (!error) { + if (!error) cpumask_set_cpu(cpu, frozen_cpus); - printk("CPU%d is down\n", cpu); - } else { + else { printk(KERN_ERR "Error taking CPU%d down: %d\n", cpu, error); break; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7e75a41bd50..ba401fab459 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ -/* FIXME: see the FIXME in partition_sched_domains() */ -static int generate_sched_domains(struct cpumask **domains, +static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { LIST_HEAD(q); /* queue of cpusets to be scanned */ @@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains, struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ - struct cpumask *doms; /* resulting partition; i.e. sched domains */ + cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains, /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - doms = kmalloc(cpumask_size(), GFP_KERNEL); + ndoms = 1; + doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms, top_cpuset.cpus_allowed); + cpumask_copy(doms[0], top_cpuset.cpus_allowed); - ndoms = 1; goto done; } @@ -636,7 +635,7 @@ restart: * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. */ - doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); + doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -656,7 +655,7 @@ restart: continue; } - dp = doms + nslot; + dp = doms[nslot]; if (nslot == ndoms) { static int warnings = 10; @@ -718,7 +717,7 @@ done: static void do_rebuild_sched_domains(struct work_struct *unused) { struct sched_domain_attr *attr; - struct cpumask *doms; + cpumask_var_t *doms; int ndoms; get_online_cpus(); @@ -738,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused) { } -static int generate_sched_domains(struct cpumask **domains, +static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { *domains = NULL; @@ -873,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) + if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) return -EINVAL; } retval = validate_change(cs, trialcs); @@ -1324,9 +1323,10 @@ static int fmeter_getrate(struct fmeter *fmp) static cpumask_var_t cpus_attach; /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct task_struct *tsk) +static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct task_struct *tsk, bool threadgroup) { + int ret; struct cpuset *cs = cgroup_cs(cont); if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) @@ -1343,18 +1343,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - return security_task_setscheduler(tsk, 0, NULL); + ret = security_task_setscheduler(tsk, 0, NULL); + if (ret) + return ret; + if (threadgroup) { + struct task_struct *c; + + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + ret = security_task_setscheduler(c, 0, NULL); + if (ret) { + rcu_read_unlock(); + return ret; + } + } + rcu_read_unlock(); + } + return 0; +} + +static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, + struct cpuset *cs) +{ + int err; + /* + * can_attach beforehand should guarantee that this doesn't fail. + * TODO: have a better way to handle failure here + */ + err = set_cpus_allowed_ptr(tsk, cpus_attach); + WARN_ON_ONCE(err); + + task_lock(tsk); + cpuset_change_task_nodemask(tsk, to); + task_unlock(tsk); + cpuset_update_task_spread_flag(cs, tsk); + } -static void cpuset_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct cgroup *oldcont, - struct task_struct *tsk) +static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct cgroup *oldcont, struct task_struct *tsk, + bool threadgroup) { nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - int err; if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1363,15 +1396,19 @@ static void cpuset_attach(struct cgroup_subsys *ss, guarantee_online_cpus(cs, cpus_attach); guarantee_online_mems(cs, &to); } - err = set_cpus_allowed_ptr(tsk, cpus_attach); - if (err) - return; - task_lock(tsk); - cpuset_change_task_nodemask(tsk, &to); - task_unlock(tsk); - cpuset_update_task_spread_flag(cs, tsk); + /* do per-task migration stuff possibly for each in the threadgroup */ + cpuset_attach_task(tsk, &to, cs); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + cpuset_attach_task(c, &to, cs); + } + rcu_read_unlock(); + } + /* change mm; only needs to be done once even if threadgroup */ from = oldcs->mems_allowed; to = cs->mems_allowed; mm = get_task_mm(tsk); @@ -1973,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) } /* Continue past cpusets with all cpus, mems online */ - if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && + if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; @@ -1982,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_online_mask); + cpu_active_mask); nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); mutex_unlock(&callback_mutex); @@ -2014,14 +2051,16 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { struct sched_domain_attr *attr; - struct cpumask *doms; + cpumask_var_t *doms; int ndoms; switch (phase) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: break; default: @@ -2030,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, cgroup_lock(); mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); mutex_unlock(&callback_mutex); scan_for_empty_cpusets(&top_cpuset); ndoms = generate_sched_domains(&doms, &attr); @@ -2077,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, void __init cpuset_init_smp(void) { - cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_track_online_cpus, 0); @@ -2499,15 +2538,9 @@ const struct file_operations proc_cpuset_operations = { }; #endif /* CONFIG_PROC_PID_CPUSET */ -/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ +/* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { - seq_printf(m, "Cpus_allowed:\t"); - seq_cpumask(m, &task->cpus_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Cpus_allowed_list:\t"); - seq_cpumask_list(m, &task->cpus_allowed); - seq_printf(m, "\n"); seq_printf(m, "Mems_allowed:\t"); seq_nodemask(m, &task->mems_allowed); seq_printf(m, "\n"); diff --git a/kernel/cred.c b/kernel/cred.c index d7f7a01082e..dd76cfe5f5b 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -782,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as); #ifdef CONFIG_DEBUG_CREDENTIALS +bool creds_are_invalid(const struct cred *cred) +{ + if (cred->magic != CRED_MAGIC) + return true; + if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers)) + return true; +#ifdef CONFIG_SECURITY_SELINUX + if (selinux_is_enabled()) { + if ((unsigned long) cred->security < PAGE_SIZE) + return true; + if ((*(u32 *)cred->security & 0xffffff00) == + (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)) + return true; + } +#endif + return false; +} +EXPORT_SYMBOL(creds_are_invalid); + /* * dump invalid credentials */ diff --git a/kernel/exit.c b/kernel/exit.c index 60d6fdcc926..5962d7ccf24 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -49,6 +49,7 @@ #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> +#include <linux/hw_breakpoint.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -110,9 +111,9 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, task_utime(tsk)); - sig->stime = cputime_add(sig->stime, task_stime(tsk)); - sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); + sig->utime = cputime_add(sig->utime, tsk->utime); + sig->stime = cputime_add(sig->stime, tsk->stime); + sig->gtime = cputime_add(sig->gtime, tsk->gtime); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -359,10 +360,8 @@ void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - if (task_session(curr) != pid) { + if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); - proc_sid_connector(curr); - } if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); @@ -934,7 +933,7 @@ NORET_TYPE void do_exit(long code) * an exiting task cleaning up the robust pi futexes. */ smp_mb(); - spin_unlock_wait(&tsk->pi_lock); + raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", @@ -972,16 +971,18 @@ NORET_TYPE void do_exit(long code) exit_thread(); cgroup_exit(tsk, 1); - if (group_dead && tsk->signal->leader) + if (group_dead) disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); - if (tsk->binfmt) - module_put(tsk->binfmt->module); proc_exit_connector(tsk); /* + * FIXME: do that only when needed, using sched_exit tracepoint + */ + flush_ptrace_hw_breakpoint(tsk); + /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ @@ -993,8 +994,6 @@ NORET_TYPE void do_exit(long code) tsk->mempolicy = NULL; #endif #ifdef CONFIG_FUTEX - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif @@ -1010,7 +1009,7 @@ NORET_TYPE void do_exit(long code) tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) - exit_io_context(); + exit_io_context(tsk); if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); @@ -1097,28 +1096,28 @@ struct wait_opts { int __user *wo_stat; struct rusage __user *wo_rusage; + wait_queue_t child_wait; int notask_error; }; -static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +static inline +struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { - struct pid *pid = NULL; - if (type == PIDTYPE_PID) - pid = task->pids[type].pid; - else if (type < PIDTYPE_MAX) - pid = task->group_leader->pids[type].pid; - return pid; + if (type != PIDTYPE_PID) + task = task->group_leader; + return task->pids[type].pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { - int err; - - if (wo->wo_type < PIDTYPE_MAX) { - if (task_pid_type(p, wo->wo_type) != wo->wo_pid) - return 0; - } + return wo->wo_type == PIDTYPE_MAX || + task_pid_type(p, wo->wo_type) == wo->wo_pid; +} +static int eligible_child(struct wait_opts *wo, struct task_struct *p) +{ + if (!eligible_pid(wo, p)) + return 0; /* Wait for all children (clone and not) if __WALL is set; * otherwise, wait for clone children *only* if __WCLONE is * set; otherwise, wait for non-clone children *only*. (Note: @@ -1128,10 +1127,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p) && !(wo->wo_flags & __WALL)) return 0; - err = security_task_wait(p); - if (err) - return err; - return 1; } @@ -1144,18 +1139,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, put_task_struct(p); infop = wo->wo_info; - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); + if (infop) { + if (!retval) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(pid, &infop->si_pid); + if (!retval) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = put_user(status, &infop->si_status); + } if (!retval) retval = pid; return retval; @@ -1213,6 +1210,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) struct signal_struct *psig; struct signal_struct *sig; unsigned long maxrss; + cputime_t tgutime, tgstime; /* * The resource counters for the group leader are in its @@ -1228,20 +1226,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * need to protect the access to parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. + * + * We use thread_group_times() to get times for the thread + * group, which consolidates times for all threads in the + * group including the group leader. */ + thread_group_times(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; psig->cutime = cputime_add(psig->cutime, - cputime_add(p->utime, - cputime_add(sig->utime, - sig->cutime))); + cputime_add(tgutime, + sig->cutime)); psig->cstime = cputime_add(psig->cstime, - cputime_add(p->stime, - cputime_add(sig->stime, - sig->cstime))); + cputime_add(tgstime, + sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, @@ -1485,13 +1486,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) * then ->notask_error is 0 if @p is an eligible child, * or another error from security_task_wait(), or still -ECHILD. */ -static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, - int ptrace, struct task_struct *p) +static int wait_consider_task(struct wait_opts *wo, int ptrace, + struct task_struct *p) { int ret = eligible_child(wo, p); if (!ret) return ret; + ret = security_task_wait(p); if (unlikely(ret < 0)) { /* * If we have not yet seen any eligible child, @@ -1553,7 +1555,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) * Do not consider detached threads. */ if (!task_detached(p)) { - int ret = wait_consider_task(wo, tsk, 0, p); + int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } @@ -1567,7 +1569,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { - int ret = wait_consider_task(wo, tsk, 1, p); + int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } @@ -1575,15 +1577,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } +static int child_wait_callback(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct wait_opts *wo = container_of(wait, struct wait_opts, + child_wait); + struct task_struct *p = key; + + if (!eligible_pid(wo, p)) + return 0; + + if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) + return 0; + + return default_wake_function(wait, mode, sync, key); +} + +void __wake_up_parent(struct task_struct *p, struct task_struct *parent) +{ + __wake_up_sync_key(&parent->signal->wait_chldexit, + TASK_INTERRUPTIBLE, 1, p); +} + static long do_wait(struct wait_opts *wo) { - DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; int retval; trace_sched_process_wait(wo->wo_pid); - add_wait_queue(¤t->signal->wait_chldexit,&wait); + init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); + wo->child_wait.private = current; + add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); repeat: /* * If there is nothing that can match our critiera just get out. @@ -1624,32 +1649,7 @@ notask: } end: __set_current_state(TASK_RUNNING); - remove_wait_queue(¤t->signal->wait_chldexit,&wait); - if (wo->wo_info) { - struct siginfo __user *infop = wo->wo_info; - - if (retval > 0) - retval = 0; - else { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. - */ - if (!retval) - retval = put_user(0, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user(0, &infop->si_code); - if (!retval) - retval = put_user(0, &infop->si_pid); - if (!retval) - retval = put_user(0, &infop->si_uid); - if (!retval) - retval = put_user(0, &infop->si_status); - } - } + remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; } @@ -1694,6 +1694,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, wo.wo_stat = NULL; wo.wo_rusage = ru; ret = do_wait(&wo); + + if (ret > 0) { + ret = 0; + } else if (infop) { + /* + * For a WNOHANG return, clear out all the fields + * we would set so the user can easily tell the + * difference. + */ + if (!ret) + ret = put_user(0, &infop->si_signo); + if (!ret) + ret = put_user(0, &infop->si_errno); + if (!ret) + ret = put_user(0, &infop->si_code); + if (!ret) + ret = put_user(0, &infop->si_pid); + if (!ret) + ret = put_user(0, &infop->si_uid); + if (!ret) + ret = put_user(0, &infop->si_status); + } + put_pid(pid); /* avoid REGPARM breakage on x86: */ diff --git a/kernel/fork.c b/kernel/fork.c index 51ad0b0b726..202a0ba63d3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -64,6 +64,7 @@ #include <linux/magic.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> +#include <linux/user-return-notifier.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -91,7 +92,7 @@ int nr_processes(void) int cpu; int total = 0; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) total += per_cpu(process_counts, cpu); return total; @@ -249,6 +250,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto out; setup_thread_stack(tsk, orig); + clear_user_return_notifier(tsk); stackend = end_of_stack(tsk); *stackend = STACK_END_MAGIC; /* for overflow detection */ @@ -434,6 +436,14 @@ __setup("coredump_filter=", coredump_filter_setup); #include <linux/init_task.h> +static void mm_init_aio(struct mm_struct *mm) +{ +#ifdef CONFIG_AIO + spin_lock_init(&mm->ioctx_lock); + INIT_HLIST_HEAD(&mm->ioctx_list); +#endif +} + static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); @@ -447,10 +457,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); - spin_lock_init(&mm->ioctx_lock); - INIT_HLIST_HEAD(&mm->ioctx_list); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; + mm_init_aio(mm); mm_init_owner(mm, p); if (likely(!mm_alloc_pgd(mm))) { @@ -511,6 +520,8 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + if (mm->binfmt) + module_put(mm->binfmt->module); mmdrop(mm); } } @@ -561,12 +572,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) /* Get rid of any futexes when releasing the mm */ #ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) + if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); + tsk->robust_list = NULL; + } #ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) + if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); + tsk->compat_robust_list = NULL; + } #endif + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); #endif /* Get rid of any cached register state */ @@ -636,9 +653,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; + if (mm->binfmt && !try_module_get(mm->binfmt->module)) + goto free_pt; + return mm; free_pt: + /* don't put binfmt in mmput, we haven't got module yet */ + mm->binfmt = NULL; mmput(mm); fail_nomem: @@ -864,6 +886,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + sig->prev_utime = sig->prev_stime = cputime_zero; +#endif sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; @@ -914,9 +939,9 @@ SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) static void rt_mutex_init_task(struct task_struct *p) { - spin_lock_init(&p->pi_lock); + raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - plist_head_init(&p->pi_waiters, &p->pi_lock); + plist_head_init_raw(&p->pi_waiters, &p->pi_lock); p->pi_blocked_on = NULL; #endif } @@ -979,6 +1004,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); + /* + * Siblings of global init remain as zombies on exit since they are + * not reaped by their parent (swapper). To solve this and to avoid + * multi-rooted process trees, prevent global and container-inits + * from creating siblings. + */ + if ((clone_flags & CLONE_PARENT) && + current->signal->flags & SIGNAL_UNKILLABLE) + return ERR_PTR(-EINVAL); + retval = security_task_create(clone_flags); if (retval) goto fork_out; @@ -1020,9 +1055,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; - if (p->binfmt && !try_module_get(p->binfmt->module)) - goto bad_fork_cleanup_put_domain; - p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); @@ -1039,8 +1071,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->gtime = cputime_zero; p->utimescaled = cputime_zero; p->stimescaled = cputime_zero; +#ifndef CONFIG_VIRT_CPU_ACCOUNTING p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; +#endif p->default_timer_slack_ns = current->timer_slack_ns; @@ -1093,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif +#ifdef CONFIG_CGROUP_MEM_RES_CTLR + p->memcg_batch.do_batch = 0; + p->memcg_batch.memcg = NULL; +#endif p->bts = NULL; @@ -1172,9 +1210,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->sas_ss_sp = p->sas_ss_size = 0; /* - * Syscall tracing should be turned off in the child regardless - * of CLONE_PTRACE. + * Syscall tracing and stepping should be turned off in the + * child regardless of CLONE_PTRACE. */ + user_disable_single_step(p); clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); @@ -1283,7 +1322,8 @@ bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: - put_io_context(p->io_context); + if (p->io_context) + exit_io_context(p); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: @@ -1310,9 +1350,6 @@ bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); - if (p->binfmt) - module_put(p->binfmt->module); -bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); diff --git a/kernel/futex.c b/kernel/futex.c index 248dd119a86..8e3c3ffe1b9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -89,36 +89,36 @@ struct futex_pi_state { union futex_key key; }; -/* - * We use this hashed waitqueue instead of a normal wait_queue_t, so +/** + * struct futex_q - The hashed futex queue entry, one per waiting task + * @task: the task waiting on the futex + * @lock_ptr: the hash bucket lock + * @key: the key the futex is hashed on + * @pi_state: optional priority inheritance state + * @rt_waiter: rt_waiter storage for use with requeue_pi + * @requeue_pi_key: the requeue_pi target futex key + * @bitset: bitset for the optional bitmasked wakeup + * + * We use this hashed waitqueue, instead of a normal wait_queue_t, so * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. * The order of wakup is always to make the first condition true, then - * wake up q->waiter, then make the second condition true. + * the second. + * + * PI futexes are typically woken before they are removed from the hash list via + * the rt_mutex code. See unqueue_me_pi(). */ struct futex_q { struct plist_node list; - /* Waiter reference */ - struct task_struct *task; - /* Which hash list lock to use: */ + struct task_struct *task; spinlock_t *lock_ptr; - - /* Key which the futex is hashed on: */ union futex_key key; - - /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; - - /* rt_waiter storage for requeue_pi: */ struct rt_mutex_waiter *rt_waiter; - - /* The expected requeue pi target futex key: */ union futex_key *requeue_pi_key; - - /* Bitset for the optional bitmasked wakeup */ u32 bitset; }; @@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) */ static inline int match_futex(union futex_key *key1, union futex_key *key2) { - return (key1->both.word == key2->both.word + return (key1 && key2 + && key1->both.word == key2->both.word && key1->both.ptr == key2->both.ptr && key1->both.offset == key2->both.offset); } @@ -198,11 +199,12 @@ static void drop_futex_key_refs(union futex_key *key) } /** - * get_futex_key - Get parameters which are the keys for a futex. - * @uaddr: virtual address of the futex - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED - * @key: address where result is stored. - * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) + * get_futex_key() - Get parameters which are the keys for a futex + * @uaddr: virtual address of the futex + * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: VERIFY_READ, + * VERIFY_WRITE) * * Returns a negative error code or 0 * The key words are stored in *key on success. @@ -288,8 +290,8 @@ void put_futex_key(int fshared, union futex_key *key) drop_futex_key_refs(key); } -/* - * fault_in_user_writeable - fault in user address and verify RW access +/** + * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address * * Slow path to fixup the fault we just took in the atomic write @@ -302,15 +304,21 @@ void put_futex_key(int fshared, union futex_key *key) */ static int fault_in_user_writeable(u32 __user *uaddr) { - int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, - 1, 1, 0, NULL, NULL); + struct mm_struct *mm = current->mm; + int ret; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, (unsigned long)uaddr, + 1, 1, 0, NULL, NULL); + up_read(&mm->mmap_sem); + return ret < 0 ? ret : 0; } /** * futex_top_waiter() - Return the highest priority waiter on a futex - * @hb: the hash bucket the futex_q's reside in - * @key: the futex key (to distinguish it from other futex futex_q's) + * @hb: the hash bucket the futex_q's reside in + * @key: the futex key (to distinguish it from other futex futex_q's) * * Must be called with the hb lock held. */ @@ -395,9 +403,9 @@ static void free_pi_state(struct futex_pi_state *pi_state) * and has cleaned up the pi_state already */ if (pi_state->owner) { - spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock_irq(&pi_state->owner->pi_lock); list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock_irq(&pi_state->owner->pi_lock); rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); } @@ -462,18 +470,18 @@ void exit_pi_state_list(struct task_struct *curr) * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: */ - spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; hb = hash_futex(&key); - spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); - spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: @@ -487,15 +495,15 @@ void exit_pi_state_list(struct task_struct *curr) WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; - spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock_irq(&curr->pi_lock); rt_mutex_unlock(&pi_state->pi_mutex); spin_unlock(&hb->lock); - spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&curr->pi_lock); } - spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock_irq(&curr->pi_lock); } static int @@ -550,7 +558,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, * change of the task flags, we do this protected by * p->pi_lock: */ - spin_lock_irq(&p->pi_lock); + raw_spin_lock_irq(&p->pi_lock); if (unlikely(p->flags & PF_EXITING)) { /* * The task is on the way out. When PF_EXITPIDONE is @@ -559,7 +567,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, */ int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; - spin_unlock_irq(&p->pi_lock); + raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); return ret; } @@ -578,7 +586,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; - spin_unlock_irq(&p->pi_lock); + raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); @@ -588,7 +596,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, } /** - * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex + * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex * @uaddr: the pi futex user address * @hb: the pi futex hash bucket * @key: the futex key associated with uaddr and hb @@ -752,7 +760,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!pi_state) return -EINVAL; - spin_lock(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -781,23 +789,23 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) else if (curval != uval) ret = -EINVAL; if (ret) { - spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); return ret; } } - spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock_irq(&pi_state->owner->pi_lock); - spin_lock_irq(&new_owner->pi_lock); + raw_spin_lock_irq(&new_owner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; - spin_unlock_irq(&new_owner->pi_lock); + raw_spin_unlock_irq(&new_owner->pi_lock); - spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); rt_mutex_unlock(&pi_state->pi_mutex); return 0; @@ -915,8 +923,8 @@ retry: hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); - double_lock_hb(hb1, hb2); retry_private: + double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { @@ -1002,7 +1010,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; #ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.lock = &hb2->lock; + q->list.plist.spinlock = &hb2->lock; #endif } get_futex_key_refs(key2); @@ -1011,9 +1019,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, /** * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue - * q: the futex_q - * key: the key of the requeue target futex - * hb: the hash_bucket of the requeue target futex + * @q: the futex_q + * @key: the key of the requeue target futex + * @hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the * target futex if it is uncontended or via a lock steal. Set the futex_q key @@ -1027,7 +1035,6 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - drop_futex_key_refs(&q->key); get_futex_key_refs(key); q->key = *key; @@ -1039,7 +1046,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, q->lock_ptr = &hb->lock; #ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.lock = &hb->lock; + q->list.plist.spinlock = &hb->lock; #endif wake_up_state(q->task, TASK_NORMAL); @@ -1225,6 +1232,7 @@ retry_private: */ if (ret == 1) { WARN_ON(pi_state); + drop_count++; task_count++; ret = get_futex_value_locked(&curval2, uaddr2); if (!ret) @@ -1303,6 +1311,7 @@ retry_private: if (ret == 1) { /* We got the lock. */ requeue_pi_wake_futex(this, &key2, hb2); + drop_count++; continue; } else if (ret) { /* -EDEADLK */ @@ -1350,6 +1359,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) return hb; } +static inline void +queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) +{ + spin_unlock(&hb->lock); + drop_futex_key_refs(&q->key); +} + +/** + * queue_me() - Enqueue the futex_q on the futex_hash_bucket + * @q: The futex_q to enqueue + * @hb: The destination hash bucket + * + * The hb->lock must be held by the caller, and is released here. A call to + * queue_me() is typically paired with exactly one call to unqueue_me(). The + * exceptions involve the PI related operations, which may use unqueue_me_pi() + * or nothing if the unqueue is done as part of the wake process and the unqueue + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for + * an example). + */ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; @@ -1366,26 +1394,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) plist_node_init(&q->list, prio); #ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.lock = &hb->lock; + q->list.plist.spinlock = &hb->lock; #endif plist_add(&q->list, &hb->chain); q->task = current; spin_unlock(&hb->lock); } -static inline void -queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) -{ - spin_unlock(&hb->lock); - drop_futex_key_refs(&q->key); -} - -/* - * queue_me and unqueue_me must be called as a pair, each - * exactly once. They are called with the hashed spinlock held. +/** + * unqueue_me() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must + * be paired with exactly one earlier call to queue_me(). + * + * Returns: + * 1 - if the futex_q was still queued (and we removed unqueued it) + * 0 - if the futex_q was already removed by the waking thread */ - -/* Return 1 if we were still queued (ie. 0 means we were woken) */ static int unqueue_me(struct futex_q *q) { spinlock_t *lock_ptr; @@ -1503,18 +1529,18 @@ retry: * itself. */ if (pi_state->owner != NULL) { - spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock_irq(&pi_state->owner->pi_lock); } pi_state->owner = newowner; - spin_lock_irq(&newowner->pi_lock); + raw_spin_lock_irq(&newowner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); - spin_unlock_irq(&newowner->pi_lock); + raw_spin_unlock_irq(&newowner->pi_lock); return 0; /* @@ -1638,17 +1664,14 @@ out: static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout) { - queue_me(q, hb); - /* - * There might have been scheduling since the queue_me(), as we - * cannot hold a spinlock across the get_user() in case it - * faults, and we cannot just set TASK_INTERRUPTIBLE state when - * queueing ourselves into the futex hash. This code thus has to - * rely on the futex_wake() code removing us from hash when it - * wakes us up. + * The task state is guaranteed to be set before another task can + * wake it. set_current_state() is implemented using set_mb() and + * queue_me() calls spin_unlock() upon completion, both serializing + * access to the hash list and forcing another memory barrier. */ set_current_state(TASK_INTERRUPTIBLE); + queue_me(q, hb); /* Arm the timer */ if (timeout) { @@ -1658,8 +1681,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, } /* - * !plist_node_empty() is safe here without any lock. - * q.lock_ptr != 0 is not safe, because of ordering against wakeup. + * If we have been removed from the hash list, then another task + * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { /* @@ -1776,6 +1799,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, current->timer_slack_ns); } +retry: /* Prepare to wait on uaddr. */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) @@ -1793,9 +1817,14 @@ static int futex_wait(u32 __user *uaddr, int fshared, goto out_put_key; /* - * We expect signal_pending(current), but another thread may - * have handled it for us already. + * We expect signal_pending(current), but we might be the + * victim of a spurious wakeup as well. */ + if (!signal_pending(current)) { + put_futex_key(fshared, &q.key); + goto retry; + } + ret = -ERESTARTSYS; if (!abs_time) goto out_put_key; @@ -2102,11 +2131,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * Unqueue the futex_q and determine which it was. */ plist_del(&q->list, &q->list.plist); - drop_futex_key_refs(&q->key); + /* Handle spurious wakeups gracefully */ + ret = -EWOULDBLOCK; if (timeout && !timeout->task) ret = -ETIMEDOUT; - else + else if (signal_pending(current)) ret = -ERESTARTNOINTR; } return ret; @@ -2114,12 +2144,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, /** * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 - * @uaddr: the futex we initialyl wait on (non-pi) + * @uaddr: the futex we initially wait on (non-pi) * @fshared: whether the futexes are shared (1) or not (0). They must be * the same type, no requeueing from private to shared, etc. * @val: the expected value of uaddr * @abs_time: absolute timeout - * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) * @uaddr2: the pi futex we will take prior to returning to user-space * @@ -2246,7 +2276,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, res = fixup_owner(uaddr2, fshared, &q, !ret); /* * If fixup_owner() returned an error, proprogate that. If it - * acquired the lock, clear our -ETIMEDOUT or -EINTR. + * acquired the lock, clear -ETIMEDOUT or -EINTR. */ if (res) ret = (res < 0) ? res : 0; @@ -2302,9 +2332,9 @@ out: */ /** - * sys_set_robust_list - set the robust-futex list head of a task - * @head: pointer to the list-head - * @len: length of the list-head, as userspace expects + * sys_set_robust_list() - Set the robust-futex list head of a task + * @head: pointer to the list-head + * @len: length of the list-head, as userspace expects */ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len) @@ -2323,10 +2353,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, } /** - * sys_get_robust_list - get the robust-futex list head of a task - * @pid: pid of the process [zero for current task] - * @head_ptr: pointer to a list-head pointer, the kernel fills it in - * @len_ptr: pointer to a length field, the kernel fills in the header size + * sys_get_robust_list() - Get the robust-futex list head of a task + * @pid: pid of the process [zero for current task] + * @head_ptr: pointer to a list-head pointer, the kernel fills it in + * @len_ptr: pointer to a length field, the kernel fills in the header size */ SYSCALL_DEFINE3(get_robust_list, int, pid, struct robust_list_head __user * __user *, head_ptr, diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 654efd09f6a..70a298d6da7 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on S390 || X86 || (PPC && EXPERIMENTAL) + depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e5d98ce50f8..0086628b6e9 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -127,11 +127,11 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, for (;;) { base = timer->base; if (likely(base != NULL)) { - spin_lock_irqsave(&base->cpu_base->lock, *flags); + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ - spin_unlock_irqrestore(&base->cpu_base->lock, *flags); + raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } @@ -208,13 +208,13 @@ again: /* See the comment in lock_timer_base() */ timer->base = NULL; - spin_unlock(&base->cpu_base->lock); - spin_lock(&new_base->cpu_base->lock); + raw_spin_unlock(&base->cpu_base->lock); + raw_spin_lock(&new_base->cpu_base->lock); if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { cpu = this_cpu; - spin_unlock(&new_base->cpu_base->lock); - spin_lock(&base->cpu_base->lock); + raw_spin_unlock(&new_base->cpu_base->lock); + raw_spin_lock(&base->cpu_base->lock); timer->base = base; goto again; } @@ -230,7 +230,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { struct hrtimer_clock_base *base = timer->base; - spin_lock_irqsave(&base->cpu_base->lock, *flags); + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } @@ -509,13 +509,14 @@ static inline int hrtimer_hres_active(void) * next event * Called with interrupts disabled and base->lock held */ -static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { int i; struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires; + ktime_t expires, expires_next; - cpu_base->expires_next.tv64 = KTIME_MAX; + expires_next.tv64 = KTIME_MAX; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; @@ -531,10 +532,15 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) */ if (expires.tv64 < 0) expires.tv64 = 0; - if (expires.tv64 < cpu_base->expires_next.tv64) - cpu_base->expires_next = expires; + if (expires.tv64 < expires_next.tv64) + expires_next = expires; } + if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) + return; + + cpu_base->expires_next.tv64 = expires_next.tv64; + if (cpu_base->expires_next.tv64 != KTIME_MAX) tick_program_event(cpu_base->expires_next, 1); } @@ -551,7 +557,7 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) static int hrtimer_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { - ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); int res; @@ -576,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (expires.tv64 < 0) return -ETIME; - if (expires.tv64 >= expires_next->tv64) + if (expires.tv64 >= cpu_base->expires_next.tv64) + return 0; + + /* + * If a hang was detected in the last timer interrupt then we + * do not schedule a timer which is earlier than the expiry + * which we enforced in the hang detection. We want the system + * to make progress. + */ + if (cpu_base->hang_detected) return 0; /* @@ -584,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, */ res = tick_program_event(expires, 0); if (!IS_ERR_VALUE(res)) - *expires_next = expires; + cpu_base->expires_next = expires; return res; } @@ -613,12 +628,12 @@ static void retrigger_next_event(void *arg) base = &__get_cpu_var(hrtimer_bases); /* Adjust CLOCK_REALTIME offset */ - spin_lock(&base->lock); + raw_spin_lock(&base->lock); base->clock_base[CLOCK_REALTIME].offset = timespec_to_ktime(realtime_offset); - hrtimer_force_reprogram(base); - spin_unlock(&base->lock); + hrtimer_force_reprogram(base, 0); + raw_spin_unlock(&base->lock); } /* @@ -679,9 +694,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, { if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { if (wakeup) { - spin_unlock(&base->cpu_base->lock); + raw_spin_unlock(&base->cpu_base->lock); raise_softirq_irqoff(HRTIMER_SOFTIRQ); - spin_lock(&base->cpu_base->lock); + raw_spin_lock(&base->cpu_base->lock); } else __raise_softirq_irqoff(HRTIMER_SOFTIRQ); @@ -720,8 +735,6 @@ static int hrtimer_switch_to_hres(void) /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); - printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n", - smp_processor_id()); return 1; } @@ -730,7 +743,8 @@ static int hrtimer_switch_to_hres(void) static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } -static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base, int wakeup) @@ -742,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } #endif /* CONFIG_HIGH_RES_TIMERS */ -#ifdef CONFIG_TIMER_STATS -void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) +static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) { +#ifdef CONFIG_TIMER_STATS if (timer->start_site) return; - - timer->start_site = addr; + timer->start_site = __builtin_return_address(0); memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); timer->start_pid = current->pid; +#endif +} + +static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; +#endif } + +static inline void timer_stats_account_hrtimer(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + if (likely(!timer_stats_active)) + return; + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, 0); #endif +} /* * Counterpart to lock_hrtimer_base above: @@ -760,7 +790,7 @@ void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); + raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** @@ -873,19 +903,29 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { - if (timer->state & HRTIMER_STATE_ENQUEUED) { - /* - * Remove the timer from the rbtree and replace the - * first entry pointer if necessary. - */ - if (base->first == &timer->node) { - base->first = rb_next(&timer->node); - /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) - hrtimer_force_reprogram(base->cpu_base); + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; + + /* + * Remove the timer from the rbtree and replace the first + * entry pointer if necessary. + */ + if (base->first == &timer->node) { + base->first = rb_next(&timer->node); +#ifdef CONFIG_HIGH_RES_TIMERS + /* Reprogram the clock event device. if enabled */ + if (reprogram && hrtimer_hres_active()) { + ktime_t expires; + + expires = ktime_sub(hrtimer_get_expires(timer), + base->offset); + if (base->cpu_base->expires_next.tv64 == expires.tv64) + hrtimer_force_reprogram(base->cpu_base, 1); } - rb_erase(&timer->node, &base->active); +#endif } + rb_erase(&timer->node, &base->active); +out: timer->state = newstate; } @@ -1083,7 +1123,7 @@ ktime_t hrtimer_get_next_event(void) unsigned long flags; int i; - spin_lock_irqsave(&cpu_base->lock, flags); + raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!hrtimer_hres_active()) { for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { @@ -1100,7 +1140,7 @@ ktime_t hrtimer_get_next_event(void) } } - spin_unlock_irqrestore(&cpu_base->lock, flags); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); if (mindelta.tv64 < 0) mindelta.tv64 = 0; @@ -1182,11 +1222,11 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) * they get migrated to another cpu, therefore its safe to unlock * the timer base. */ - spin_unlock(&cpu_base->lock); + raw_spin_unlock(&cpu_base->lock); trace_hrtimer_expire_entry(timer, now); restart = fn(timer); trace_hrtimer_expire_exit(timer); - spin_lock(&cpu_base->lock); + raw_spin_lock(&cpu_base->lock); /* * Note: We clear the CALLBACK bit after enqueue_hrtimer and @@ -1202,29 +1242,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) #ifdef CONFIG_HIGH_RES_TIMERS -static int force_clock_reprogram; - -/* - * After 5 iteration's attempts, we consider that hrtimer_interrupt() - * is hanging, which could happen with something that slows the interrupt - * such as the tracing. Then we force the clock reprogramming for each future - * hrtimer interrupts to avoid infinite loops and use the min_delta_ns - * threshold that we will overwrite. - * The next tick event will be scheduled to 3 times we currently spend on - * hrtimer_interrupt(). This gives a good compromise, the cpus will spend - * 1/4 of their time to process the hrtimer interrupts. This is enough to - * let it running without serious starvation. - */ - -static inline void -hrtimer_interrupt_hanging(struct clock_event_device *dev, - ktime_t try_time) -{ - force_clock_reprogram = 1; - dev->min_delta_ns = (unsigned long)try_time.tv64 * 3; - printk(KERN_WARNING "hrtimer: interrupt too slow, " - "forcing clock min delta to %lu ns\n", dev->min_delta_ns); -} /* * High resolution timer interrupt * Called with interrupts disabled @@ -1233,24 +1250,18 @@ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; - ktime_t expires_next, now; - int nr_retries = 0; - int i; + ktime_t expires_next, now, entry_time, delta; + int i, retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event.tv64 = KTIME_MAX; - retry: - /* 5 retries is enough to notice a hang */ - if (!(++nr_retries % 5)) - hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now)); - - now = ktime_get(); - + entry_time = now = ktime_get(); +retry: expires_next.tv64 = KTIME_MAX; - spin_lock(&cpu_base->lock); + raw_spin_lock(&cpu_base->lock); /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1306,13 +1317,51 @@ void hrtimer_interrupt(struct clock_event_device *dev) * against it. */ cpu_base->expires_next = expires_next; - spin_unlock(&cpu_base->lock); + raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ - if (expires_next.tv64 != KTIME_MAX) { - if (tick_program_event(expires_next, force_clock_reprogram)) - goto retry; + if (expires_next.tv64 == KTIME_MAX || + !tick_program_event(expires_next, 0)) { + cpu_base->hang_detected = 0; + return; } + + /* + * The next timer was already expired due to: + * - tracing + * - long lasting callbacks + * - being scheduled away when running in a VM + * + * We need to prevent that we loop forever in the hrtimer + * interrupt routine. We give it 3 attempts to avoid + * overreacting on some spurious event. + */ + now = ktime_get(); + cpu_base->nr_retries++; + if (++retries < 3) + goto retry; + /* + * Give the system a chance to do something else than looping + * here. We stored the entry time, so we know exactly how long + * we spent here. We schedule the next event this amount of + * time away. + */ + cpu_base->nr_hangs++; + cpu_base->hang_detected = 1; + delta = ktime_sub(now, entry_time); + if (delta.tv64 > cpu_base->max_hang_time.tv64) + cpu_base->max_hang_time = delta; + /* + * Limit it to a sensible value as we enforce a longer + * delay. Give the CPU at least 100ms to catch up. + */ + if (delta.tv64 > 100 * NSEC_PER_MSEC) + expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); + else + expires_next = ktime_add(now, delta); + tick_program_event(expires_next, 1); + printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", + ktime_to_ns(delta)); } /* @@ -1408,7 +1457,7 @@ void hrtimer_run_queues(void) gettime = 0; } - spin_lock(&cpu_base->lock); + raw_spin_lock(&cpu_base->lock); while ((node = base->first)) { struct hrtimer *timer; @@ -1420,7 +1469,7 @@ void hrtimer_run_queues(void) __run_hrtimer(timer, &base->softirq_time); } - spin_unlock(&cpu_base->lock); + raw_spin_unlock(&cpu_base->lock); } } @@ -1576,7 +1625,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - spin_lock_init(&cpu_base->lock); + raw_spin_lock_init(&cpu_base->lock); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) cpu_base->clock_base[i].cpu_base = cpu_base; @@ -1634,16 +1683,16 @@ static void migrate_hrtimers(int scpu) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - spin_lock(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&new_base->lock); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); + raw_spin_unlock(&old_base->lock); + raw_spin_unlock(&new_base->lock); /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 022a4927b78..0c642d51aac 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -144,7 +144,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_lock(); do_each_thread(g, t) { - if (!--max_count) + if (!max_count--) goto unlock; if (!--batch_count) { batch_count = HUNG_TASK_BATCHING; @@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout) * Process updating of timeout sysctl */ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) goto out; diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c new file mode 100644 index 00000000000..dbcbf6a33a0 --- /dev/null +++ b/kernel/hw_breakpoint.c @@ -0,0 +1,453 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) IBM Corporation, 2009 + * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com> + * + * Thanks to Ingo Molnar for his many suggestions. + * + * Authors: Alan Stern <stern@rowland.harvard.edu> + * K.Prasad <prasad@linux.vnet.ibm.com> + * Frederic Weisbecker <fweisbec@gmail.com> + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + * This file contains the arch-independent routines. + */ + +#include <linux/irqflags.h> +#include <linux/kallsyms.h> +#include <linux/notifier.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/smp.h> + +#include <linux/hw_breakpoint.h> + +/* + * Constraints data + */ + +/* Number of pinned cpu breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); + +/* Number of pinned task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); + +/* Number of non-pinned cpu/task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); + +/* Gather the number of total pinned and un-pinned bp in a cpuset */ +struct bp_busy_slots { + unsigned int pinned; + unsigned int flexible; +}; + +/* Serialize accesses to the above constraints */ +static DEFINE_MUTEX(nr_bp_mutex); + +/* + * Report the maximum number of pinned breakpoints a task + * have in this cpu + */ +static unsigned int max_task_bp_pinned(int cpu) +{ + int i; + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + + for (i = HBP_NUM -1; i >= 0; i--) { + if (tsk_pinned[i] > 0) + return i + 1; + } + + return 0; +} + +static int task_bp_pinned(struct task_struct *tsk) +{ + struct perf_event_context *ctx = tsk->perf_event_ctxp; + struct list_head *list; + struct perf_event *bp; + unsigned long flags; + int count = 0; + + if (WARN_ONCE(!ctx, "No perf context for this task")) + return 0; + + list = &ctx->event_list; + + raw_spin_lock_irqsave(&ctx->lock, flags); + + /* + * The current breakpoint counter is not included in the list + * at the open() callback time + */ + list_for_each_entry(bp, list, event_entry) { + if (bp->attr.type == PERF_TYPE_BREAKPOINT) + count++; + } + + raw_spin_unlock_irqrestore(&ctx->lock, flags); + + return count; +} + +/* + * Report the number of pinned/un-pinned breakpoints we have in + * a given cpu (cpu > -1) or in all of them (cpu = -1). + */ +static void +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->ctx->task; + + if (cpu >= 0) { + slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); + if (!tsk) + slots->pinned += max_task_bp_pinned(cpu); + else + slots->pinned += task_bp_pinned(tsk); + slots->flexible = per_cpu(nr_bp_flexible, cpu); + + return; + } + + for_each_online_cpu(cpu) { + unsigned int nr; + + nr = per_cpu(nr_cpu_bp_pinned, cpu); + if (!tsk) + nr += max_task_bp_pinned(cpu); + else + nr += task_bp_pinned(tsk); + + if (nr > slots->pinned) + slots->pinned = nr; + + nr = per_cpu(nr_bp_flexible, cpu); + + if (nr > slots->flexible) + slots->flexible = nr; + } +} + +/* + * Add a pinned breakpoint for the given task in our constraint table + */ +static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) +{ + unsigned int *tsk_pinned; + int count = 0; + + count = task_bp_pinned(tsk); + + tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + if (enable) { + tsk_pinned[count]++; + if (count > 0) + tsk_pinned[count-1]--; + } else { + tsk_pinned[count]--; + if (count > 0) + tsk_pinned[count-1]++; + } +} + +/* + * Add/remove the given breakpoint in our constraint table + */ +static void toggle_bp_slot(struct perf_event *bp, bool enable) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->ctx->task; + + /* Pinned counter task profiling */ + if (tsk) { + if (cpu >= 0) { + toggle_bp_task_slot(tsk, cpu, enable); + return; + } + + for_each_online_cpu(cpu) + toggle_bp_task_slot(tsk, cpu, enable); + return; + } + + /* Pinned counter cpu profiling */ + if (enable) + per_cpu(nr_cpu_bp_pinned, bp->cpu)++; + else + per_cpu(nr_cpu_bp_pinned, bp->cpu)--; +} + +/* + * Contraints to check before allowing this new breakpoint counter: + * + * == Non-pinned counter == (Considered as pinned for now) + * + * - If attached to a single cpu, check: + * + * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM + * + * -> If there are already non-pinned counters in this cpu, it means + * there is already a free slot for them. + * Otherwise, we check that the maximum number of per task + * breakpoints (for this cpu) plus the number of per cpu breakpoint + * (for this cpu) doesn't cover every registers. + * + * - If attached to every cpus, check: + * + * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM + * + * -> This is roughly the same, except we check the number of per cpu + * bp for every cpu and we keep the max one. Same for the per tasks + * breakpoints. + * + * + * == Pinned counter == + * + * - If attached to a single cpu, check: + * + * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM + * + * -> Same checks as before. But now the nr_bp_flexible, if any, must keep + * one register at least (or they will never be fed). + * + * - If attached to every cpus, check: + * + * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM + */ +int reserve_bp_slot(struct perf_event *bp) +{ + struct bp_busy_slots slots = {0}; + int ret = 0; + + mutex_lock(&nr_bp_mutex); + + fetch_bp_busy_slots(&slots, bp); + + /* Flexible counters need to keep at least one slot */ + if (slots.pinned + (!!slots.flexible) == HBP_NUM) { + ret = -ENOSPC; + goto end; + } + + toggle_bp_slot(bp, true); + +end: + mutex_unlock(&nr_bp_mutex); + + return ret; +} + +void release_bp_slot(struct perf_event *bp) +{ + mutex_lock(&nr_bp_mutex); + + toggle_bp_slot(bp, false); + + mutex_unlock(&nr_bp_mutex); +} + + +int register_perf_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = reserve_bp_slot(bp); + if (ret) + return ret; + + /* + * Ptrace breakpoints can be temporary perf events only + * meant to reserve a slot. In this case, it is created disabled and + * we don't want to check the params right now (as we put a null addr) + * But perf tools create events as disabled and we want to check + * the params for them. + * This is a quick hack that will be removed soon, once we remove + * the tmp breakpoints from ptrace + */ + if (!bp->attr.disabled || !bp->overflow_handler) + ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + + return ret; +} + +/** + * register_user_hw_breakpoint - register a hardware breakpoint for user space + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + */ +struct perf_event * +register_user_hw_breakpoint(struct perf_event_attr *attr, + perf_overflow_handler_t triggered, + struct task_struct *tsk) +{ + return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); +} +EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); + +/** + * modify_user_hw_breakpoint - modify a user-space hardware breakpoint + * @bp: the breakpoint structure to modify + * @attr: new breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + */ +int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) +{ + u64 old_addr = bp->attr.bp_addr; + int old_type = bp->attr.bp_type; + int old_len = bp->attr.bp_len; + int err = 0; + + perf_event_disable(bp); + + bp->attr.bp_addr = attr->bp_addr; + bp->attr.bp_type = attr->bp_type; + bp->attr.bp_len = attr->bp_len; + + if (attr->disabled) + goto end; + + err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + if (!err) + perf_event_enable(bp); + + if (err) { + bp->attr.bp_addr = old_addr; + bp->attr.bp_type = old_type; + bp->attr.bp_len = old_len; + if (!bp->attr.disabled) + perf_event_enable(bp); + + return err; + } + +end: + bp->attr.disabled = attr->disabled; + + return 0; +} +EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); + +/** + * unregister_hw_breakpoint - unregister a user-space hardware breakpoint + * @bp: the breakpoint structure to unregister + */ +void unregister_hw_breakpoint(struct perf_event *bp) +{ + if (!bp) + return; + perf_event_release_kernel(bp); +} +EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); + +/** + * register_wide_hw_breakpoint - register a wide breakpoint in the kernel + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * + * @return a set of per_cpu pointers to perf events + */ +struct perf_event ** +register_wide_hw_breakpoint(struct perf_event_attr *attr, + perf_overflow_handler_t triggered) +{ + struct perf_event **cpu_events, **pevent, *bp; + long err; + int cpu; + + cpu_events = alloc_percpu(typeof(*cpu_events)); + if (!cpu_events) + return ERR_PTR(-ENOMEM); + + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); + + *pevent = bp; + + if (IS_ERR(bp)) { + err = PTR_ERR(bp); + goto fail; + } + } + + return cpu_events; + +fail: + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + if (IS_ERR(*pevent)) + break; + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); + /* return the error if any */ + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); + +/** + * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel + * @cpu_events: the per cpu set of events to unregister + */ +void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) +{ + int cpu; + struct perf_event **pevent; + + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); +} +EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); + +static struct notifier_block hw_breakpoint_exceptions_nb = { + .notifier_call = hw_breakpoint_exceptions_notify, + /* we need to be notified first */ + .priority = 0x7fffffff +}; + +static int __init init_hw_breakpoint(void) +{ + return register_die_notifier(&hw_breakpoint_exceptions_nb); +} +core_initcall(init_hw_breakpoint); + + +struct pmu perf_ops_bp = { + .enable = arch_install_hw_breakpoint, + .disable = arch_uninstall_hw_breakpoint, + .read = hw_breakpoint_pmu_read, + .unthrottle = hw_breakpoint_pmu_unthrottle +}; diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 1de9700f416..2295a31ef11 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -45,7 +45,7 @@ unsigned long probe_irq_on(void) * flush such a longstanding irq before considering it as spurious. */ for_each_irq_desc_reverse(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { /* * An old-style architecture might still have @@ -61,7 +61,7 @@ unsigned long probe_irq_on(void) desc->chip->set_type(i, IRQ_TYPE_PROBE); desc->chip->startup(i); } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } /* Wait for longstanding interrupts to trigger. */ @@ -73,13 +73,13 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for_each_irq_desc_reverse(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; if (desc->chip->startup(i)) desc->status |= IRQ_PENDING; } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } /* @@ -91,7 +91,7 @@ unsigned long probe_irq_on(void) * Now filter out any obviously spurious interrupts */ for_each_irq_desc(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -103,7 +103,7 @@ unsigned long probe_irq_on(void) if (i < 32) mask |= 1 << i; } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } return mask; @@ -129,7 +129,7 @@ unsigned int probe_irq_mask(unsigned long val) int i; for_each_irq_desc(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -139,7 +139,7 @@ unsigned int probe_irq_mask(unsigned long val) desc->status = status & ~IRQ_AUTODETECT; desc->chip->shutdown(i); } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); @@ -171,7 +171,7 @@ int probe_irq_off(unsigned long val) unsigned int status; for_each_irq_desc(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -183,7 +183,7 @@ int probe_irq_off(unsigned long val) desc->status = status & ~IRQ_AUTODETECT; desc->chip->shutdown(i); } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c1660194d11..ecc3fa28f66 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -34,7 +34,7 @@ void dynamic_irq_init(unsigned int irq) } /* Ensure we don't have left over values from a previous use of this irq */ - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->status = IRQ_DISABLED; desc->chip = &no_irq_chip; desc->handle_irq = handle_bad_irq; @@ -51,7 +51,7 @@ void dynamic_irq_init(unsigned int irq) cpumask_clear(desc->pending_mask); #endif #endif - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } /** @@ -68,9 +68,9 @@ void dynamic_irq_cleanup(unsigned int irq) return; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); if (desc->action) { - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", irq); return; @@ -82,7 +82,7 @@ void dynamic_irq_cleanup(unsigned int irq) desc->chip = &no_irq_chip; desc->name = NULL; clear_kstat_irqs(desc); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } @@ -104,10 +104,10 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->chip = chip; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -133,9 +133,9 @@ int set_irq_type(unsigned int irq, unsigned int type) if (type == IRQ_TYPE_NONE) return 0; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); ret = __irq_set_trigger(desc, irq, type); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } EXPORT_SYMBOL(set_irq_type); @@ -158,19 +158,19 @@ int set_irq_data(unsigned int irq, void *data) return -EINVAL; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->handler_data = data; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } EXPORT_SYMBOL(set_irq_data); /** - * set_irq_data - set irq type data for an irq + * set_irq_msi - set MSI descriptor data for an irq * @irq: Interrupt number * @entry: Pointer to MSI descriptor data * - * Set the hardware irq controller data for an irq + * Set the MSI descriptor entry for an irq */ int set_irq_msi(unsigned int irq, struct msi_desc *entry) { @@ -183,11 +183,11 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) return -EINVAL; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; if (entry) entry->irq = irq; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -214,9 +214,9 @@ int set_irq_chip_data(unsigned int irq, void *data) return -EINVAL; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->chip_data = data; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -241,12 +241,12 @@ void set_irq_nested_thread(unsigned int irq, int nest) if (!desc) return; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); if (nest) desc->status |= IRQ_NESTED_THREAD; else desc->status &= ~IRQ_NESTED_THREAD; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL_GPL(set_irq_nested_thread); @@ -343,7 +343,7 @@ void handle_nested_irq(unsigned int irq) might_sleep(); - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); kstat_incr_irqs_this_cpu(irq, desc); @@ -352,17 +352,17 @@ void handle_nested_irq(unsigned int irq) goto out_unlock; desc->status |= IRQ_INPROGRESS; - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out_unlock: - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } EXPORT_SYMBOL_GPL(handle_nested_irq); @@ -384,7 +384,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) struct irqaction *action; irqreturn_t action_ret; - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -396,16 +396,16 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->status |= IRQ_INPROGRESS; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out_unlock: - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } /** @@ -424,7 +424,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) struct irqaction *action; irqreturn_t action_ret; - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); mask_ack_irq(desc, irq); if (unlikely(desc->status & IRQ_INPROGRESS)) @@ -441,13 +441,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->status |= IRQ_INPROGRESS; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; if (unlikely(desc->status & IRQ_ONESHOT)) @@ -455,7 +455,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) desc->chip->unmask(irq); out_unlock: - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_level_irq); @@ -475,7 +475,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) struct irqaction *action; irqreturn_t action_ret; - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out; @@ -497,18 +497,18 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) desc->status |= IRQ_INPROGRESS; desc->status &= ~IRQ_PENDING; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } /** @@ -530,7 +530,7 @@ out: void handle_edge_irq(unsigned int irq, struct irq_desc *desc) { - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); @@ -576,21 +576,21 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) } desc->status &= ~IRQ_PENDING; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); desc->status &= ~IRQ_INPROGRESS; out_unlock: - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } /** - * handle_percpu_IRQ - Per CPU local irq handler + * handle_percpu_irq - Per CPU local irq handler * @irq: the interrupt number * @desc: the interrupt description structure for this irq * @@ -643,7 +643,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, } chip_bus_lock(irq, desc); - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); /* Uninstall? */ if (handle == handle_bad_irq) { @@ -661,7 +661,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->depth = 0; desc->chip->startup(irq); } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL_GPL(__set_irq_handler); @@ -692,9 +692,9 @@ void __init set_irq_noprobe(unsigned int irq) return; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_NOPROBE; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } void __init set_irq_probe(unsigned int irq) @@ -707,7 +707,7 @@ void __init set_irq_probe(unsigned int irq) return; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->status &= ~IRQ_NOPROBE; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a81cf80554d..814940e7f48 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,6 +11,7 @@ */ #include <linux/irq.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/random.h> @@ -79,7 +80,7 @@ static struct irq_desc irq_desc_init = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) @@ -107,7 +108,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); - spin_lock_init(&desc->lock); + raw_spin_lock_init(&desc->lock); desc->irq = irq; #ifdef CONFIG_SMP desc->node = node; @@ -129,7 +130,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) /* * Protect the sparse_irqs: */ -DEFINE_SPINLOCK(sparse_irq_lock); +DEFINE_RAW_SPINLOCK(sparse_irq_lock); struct irq_desc **irq_desc_ptrs __read_mostly; @@ -140,7 +141,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), } }; @@ -211,7 +212,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) if (desc) return desc; - spin_lock_irqsave(&sparse_irq_lock, flags); + raw_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ desc = irq_desc_ptrs[irq]; @@ -233,7 +234,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) irq_desc_ptrs[irq] = desc; out_unlock: - spin_unlock_irqrestore(&sparse_irq_lock, flags); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); return desc; } @@ -246,7 +247,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), } }; @@ -472,7 +473,7 @@ unsigned int __do_IRQ(unsigned int irq) return 1; } - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (desc->chip->ack) desc->chip->ack(irq); /* @@ -516,13 +517,13 @@ unsigned int __do_IRQ(unsigned int irq) for (;;) { irqreturn_t action_ret; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (likely(!(desc->status & IRQ_PENDING))) break; desc->status &= ~IRQ_PENDING; @@ -535,7 +536,7 @@ out: * disabled while the handler was running. */ desc->chip->end(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); return 1; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 1b5d742c6a7..b2821f070a3 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -18,7 +18,7 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); extern void clear_kstat_irqs(struct irq_desc *desc); -extern spinlock_t sparse_irq_lock; +extern raw_spinlock_t sparse_irq_lock; #ifdef CONFIG_SPARSE_IRQ /* irq_desc_ptrs allocated at boot time */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index bde4c667d24..eb6078ca60c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -46,9 +46,9 @@ void synchronize_irq(unsigned int irq) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); status = desc->status; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ } while (status & IRQ_INPROGRESS); @@ -114,7 +114,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) if (!desc->chip->set_affinity) return -EINVAL; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT) { @@ -134,7 +134,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) } #endif desc->status |= IRQ_AFFINITY_SET; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -181,11 +181,11 @@ int irq_select_affinity_usr(unsigned int irq) unsigned long flags; int ret; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); if (!ret) irq_set_thread_affinity(desc); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } @@ -231,9 +231,9 @@ void disable_irq_nosync(unsigned int irq) return; chip_bus_lock(irq, desc); - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, false); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(disable_irq_nosync); @@ -308,9 +308,9 @@ void enable_irq(unsigned int irq) return; chip_bus_lock(irq, desc); - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(enable_irq); @@ -347,7 +347,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); @@ -368,7 +368,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) } } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } EXPORT_SYMBOL(set_irq_wake); @@ -484,12 +484,12 @@ static int irq_wait_for_interrupt(struct irqaction *action) static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) { chip_bus_lock(irq, desc); - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; desc->chip->unmask(irq); } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(irq, desc); } @@ -514,9 +514,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) return; } - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); cpumask_copy(mask, desc->affinity); - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); @@ -545,7 +545,7 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); if (unlikely(desc->status & IRQ_DISABLED)) { /* * CHECKME: We might need a dedicated @@ -555,9 +555,9 @@ static int irq_thread(void *data) * retriggers the interrupt itself --- tglx */ desc->status |= IRQ_PENDING; - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } else { - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); action->thread_fn(action->irq, action->dev_id); @@ -679,7 +679,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* * The following block of code has to be executed atomically */ - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; old = *old_ptr; if (old) { @@ -775,7 +775,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) __enable_irq(desc, irq, false); } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); /* * Strictly no need to wake it up, but hung_task complains @@ -802,7 +802,7 @@ mismatch: ret = -EBUSY; out_thread: - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); if (new->thread) { struct task_struct *t = new->thread; @@ -844,7 +844,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc) return NULL; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); /* * There can be multiple actions per IRQ descriptor, find the right @@ -856,7 +856,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return NULL; } @@ -884,7 +884,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) desc->chip->disable(irq); } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -1067,7 +1067,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, kfree(action); #ifdef CONFIG_DEBUG_SHIRQ - if (irqflags & IRQF_SHARED) { + if (!retval && (irqflags & IRQF_SHARED)) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index fcb6c96f262..24196228083 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -27,7 +27,7 @@ void move_masked_irq(int irq) if (!desc->chip->set_affinity) return; - assert_spin_locked(&desc->lock); + assert_raw_spin_locked(&desc->lock); /* * If there was a valid mask to work with, please diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 3fd30197da2..26bac9d8f86 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -42,7 +42,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, "for migration.\n", irq); return false; } - spin_lock_init(&desc->lock); + raw_spin_lock_init(&desc->lock); desc->node = node; lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); @@ -67,7 +67,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, irq = old_desc->irq; - spin_lock_irqsave(&sparse_irq_lock, flags); + raw_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ desc = irq_desc_ptrs[irq]; @@ -91,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, } irq_desc_ptrs[irq] = desc; - spin_unlock_irqrestore(&sparse_irq_lock, flags); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); /* free the old one */ free_one_irq_desc(old_desc, desc); @@ -100,7 +100,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, return desc; out_unlock: - spin_unlock_irqrestore(&sparse_irq_lock, flags); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); return desc; } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index a0bb09e7986..0d4005d85b0 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -28,9 +28,9 @@ void suspend_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, true); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } for_each_irq_desc(irq, desc) @@ -56,9 +56,9 @@ void resume_device_irqs(void) if (!(desc->status & IRQ_SUSPENDED)) continue; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } } EXPORT_SYMBOL_GPL(resume_device_irqs); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 692363dd591..6f50eccc79c 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -136,7 +136,7 @@ out: static int default_affinity_open(struct inode *inode, struct file *file) { - return single_open(file, default_affinity_show, NULL); + return single_open(file, default_affinity_show, PDE(inode)->data); } static const struct file_operations default_affinity_proc_fops = { @@ -148,18 +148,28 @@ static const struct file_operations default_affinity_proc_fops = { }; #endif -static int irq_spurious_read(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int irq_spurious_proc_show(struct seq_file *m, void *v) { - struct irq_desc *desc = irq_to_desc((long) data); - return sprintf(page, "count %u\n" - "unhandled %u\n" - "last_unhandled %u ms\n", - desc->irq_count, - desc->irqs_unhandled, - jiffies_to_msecs(desc->last_unhandled)); + struct irq_desc *desc = irq_to_desc((long) m->private); + + seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n", + desc->irq_count, desc->irqs_unhandled, + jiffies_to_msecs(desc->last_unhandled)); + return 0; +} + +static int irq_spurious_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_spurious_proc_show, NULL); } +static const struct file_operations irq_spurious_proc_fops = { + .open = irq_spurious_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + #define MAX_NAMELEN 128 static int name_unique(unsigned int irq, struct irqaction *new_action) @@ -169,7 +179,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) unsigned long flags; int ret = 1; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); for (action = desc->action ; action; action = action->next) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) { @@ -177,7 +187,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) break; } } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } @@ -204,7 +214,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) void register_irq_proc(unsigned int irq, struct irq_desc *desc) { char name [MAX_NAMELEN]; - struct proc_dir_entry *entry; if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) return; @@ -214,6 +223,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) /* create /proc/irq/1234 */ desc->dir = proc_mkdir(name, root_irq_dir); + if (!desc->dir) + return; #ifdef CONFIG_SMP /* create /proc/irq/<irq>/smp_affinity */ @@ -221,11 +232,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) &irq_affinity_proc_fops, (void *)(long)irq); #endif - entry = create_proc_entry("spurious", 0444, desc->dir); - if (entry) { - entry->data = (void *)(long)irq; - entry->read_proc = irq_spurious_read; - } + proc_create_data("spurious", 0444, desc->dir, + &irq_spurious_proc_fops, (void *)(long)irq); } #undef MAX_NAMELEN diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 114e704760f..89fb90ae534 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -28,7 +28,7 @@ static int try_one_irq(int irq, struct irq_desc *desc) struct irqaction *action; int ok = 0, work = 0; - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); /* Already running on another processor */ if (desc->status & IRQ_INPROGRESS) { /* @@ -37,13 +37,13 @@ static int try_one_irq(int irq, struct irq_desc *desc) */ if (desc->action && (desc->action->flags & IRQF_SHARED)) desc->status |= IRQ_PENDING; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); return ok; } /* Honour the normal IRQ locking */ desc->status |= IRQ_INPROGRESS; action = desc->action; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); while (action) { /* Only shared IRQ handlers are safe to call */ @@ -56,7 +56,7 @@ static int try_one_irq(int irq, struct irq_desc *desc) } local_irq_disable(); /* Now clean up the flags */ - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); action = desc->action; /* @@ -68,9 +68,9 @@ static int try_one_irq(int irq, struct irq_desc *desc) * Perform real IRQ processing for the IRQ we deferred */ work = 1; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); handle_IRQ_event(irq, action); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~IRQ_PENDING; } desc->status &= ~IRQ_INPROGRESS; @@ -80,7 +80,7 @@ static int try_one_irq(int irq, struct irq_desc *desc) */ if (work && desc->chip && desc->chip->end) desc->chip->end(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); return ok; } @@ -104,7 +104,7 @@ static int misrouted_irq(int irq) return ok; } -static void poll_all_shared_irqs(void) +static void poll_spurious_irqs(unsigned long dummy) { struct irq_desc *desc; int i; @@ -121,25 +121,15 @@ static void poll_all_shared_irqs(void) if (!(status & IRQ_SPURIOUS_DISABLED)) continue; + local_irq_disable(); try_one_irq(i, desc); + local_irq_enable(); } -} - -static void poll_spurious_irqs(unsigned long dummy) -{ - poll_all_shared_irqs(); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } -#ifdef CONFIG_DEBUG_SHIRQ -void debug_poll_all_shared_irqs(void) -{ - poll_all_shared_irqs(); -} -#endif - /* * If 99,900 of the previous 100,000 interrupts have not been handled * then assume that the IRQ is stuck in some manner. Drop a diagnostic @@ -230,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, /* * If we are seeing only the odd spurious IRQ caused by * bus asynchronicity then don't eventually trigger an error, - * otherwise the couter becomes a doomsday timer for otherwise + * otherwise the counter becomes a doomsday timer for otherwise * working systems */ if (time_after(jiffies, desc->last_unhandled + HZ/10)) diff --git a/kernel/itimer.c b/kernel/itimer.c index b03451ede52..d802883153d 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -146,6 +146,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, { cputime_t cval, nval, cinterval, ninterval; s64 ns_ninterval, ns_nval; + u32 error, incr_error; struct cpu_itimer *it = &tsk->signal->it[clock_id]; nval = timeval_to_cputime(&value->it_value); @@ -153,8 +154,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, ninterval = timeval_to_cputime(&value->it_interval); ns_ninterval = timeval_to_ns(&value->it_interval); - it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); - it->error = cputime_sub_ns(nval, ns_nval); + error = cputime_sub_ns(nval, ns_nval); + incr_error = cputime_sub_ns(ninterval, ns_ninterval); spin_lock_irq(&tsk->sighand->siglock); @@ -168,6 +169,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, } it->expires = nval; it->incr = ninterval; + it->error = error; + it->incr_error = incr_error; trace_itimer_state(clock_id == CPUCLOCK_VIRT ? ITIMER_VIRTUAL : ITIMER_PROF, value, nval); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 8b6b8b697c6..8e5288a8a35 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name) } return module_kallsyms_lookup_name(name); } +EXPORT_SYMBOL_GPL(kallsyms_lookup_name); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), diff --git a/kernel/kexec.c b/kernel/kexec.c index f336e2107f9..433e9fcc1fc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -31,6 +31,7 @@ #include <linux/cpu.h> #include <linux/console.h> #include <linux/vmalloc.h> +#include <linux/swap.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -1082,6 +1083,64 @@ void crash_kexec(struct pt_regs *regs) } } +size_t crash_get_memory_size(void) +{ + size_t size; + mutex_lock(&kexec_mutex); + size = crashk_res.end - crashk_res.start + 1; + mutex_unlock(&kexec_mutex); + return size; +} + +static void free_reserved_phys_range(unsigned long begin, unsigned long end) +{ + unsigned long addr; + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); + init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); + free_page((unsigned long)__va(addr)); + totalram_pages++; + } +} + +int crash_shrink_memory(unsigned long new_size) +{ + int ret = 0; + unsigned long start, end; + + mutex_lock(&kexec_mutex); + + if (kexec_crash_image) { + ret = -ENOENT; + goto unlock; + } + start = crashk_res.start; + end = crashk_res.end; + + if (new_size >= end - start + 1) { + ret = -EINVAL; + if (new_size == end - start + 1) + ret = 0; + goto unlock; + } + + start = roundup(start, PAGE_SIZE); + end = roundup(start + new_size, PAGE_SIZE); + + free_reserved_phys_range(end, crashk_res.end); + + if (start == end) { + crashk_res.end = end; + release_resource(&crashk_res); + } else + crashk_res.end = end - 1; + +unlock: + mutex_unlock(&kexec_mutex); + return ret; +} + static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, size_t data_len) { diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 9147a3190c9..2eb517e2351 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -129,6 +129,7 @@ struct task_struct *kgdb_usethread; struct task_struct *kgdb_contthread; int kgdb_single_step; +pid_t kgdb_sstep_pid; /* Our I/O buffers. */ static char remcom_in_buffer[BUFMAX]; @@ -541,12 +542,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid) */ if (tid == 0 || tid == -1) tid = -atomic_read(&kgdb_active) - 2; - if (tid < 0) { + if (tid < -1 && tid > -NR_CPUS - 2) { if (kgdb_info[-tid - 2].task) return kgdb_info[-tid - 2].task; else return idle_task(-tid - 2); } + if (tid <= 0) { + printk(KERN_ERR "KGDB: Internal thread select error\n"); + dump_stack(); + return NULL; + } /* * find_task_by_pid_ns() does not take the tasklist lock anymore @@ -619,7 +625,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) static int kgdb_activate_sw_breakpoints(void) { unsigned long addr; - int error = 0; + int error; + int ret = 0; int i; for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { @@ -629,13 +636,16 @@ static int kgdb_activate_sw_breakpoints(void) addr = kgdb_break[i].bpt_addr; error = kgdb_arch_set_breakpoint(addr, kgdb_break[i].saved_instr); - if (error) - return error; + if (error) { + ret = error; + printk(KERN_INFO "KGDB: BP install failed: %lx", addr); + continue; + } kgdb_flush_swbreak_addr(addr); kgdb_break[i].state = BP_ACTIVE; } - return 0; + return ret; } static int kgdb_set_sw_break(unsigned long addr) @@ -682,7 +692,8 @@ static int kgdb_set_sw_break(unsigned long addr) static int kgdb_deactivate_sw_breakpoints(void) { unsigned long addr; - int error = 0; + int error; + int ret = 0; int i; for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { @@ -691,13 +702,15 @@ static int kgdb_deactivate_sw_breakpoints(void) addr = kgdb_break[i].bpt_addr; error = kgdb_arch_remove_breakpoint(addr, kgdb_break[i].saved_instr); - if (error) - return error; + if (error) { + printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); + ret = error; + } kgdb_flush_swbreak_addr(addr); kgdb_break[i].state = BP_SET; } - return 0; + return ret; } static int kgdb_remove_sw_break(unsigned long addr) @@ -870,7 +883,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) /* * All threads that don't have debuggerinfo should be - * in __schedule() sleeping, since all other CPUs + * in schedule() sleeping, since all other CPUs * are in kgdb_wait, and thus have debuggerinfo. */ if (local_debuggerinfo) { @@ -1204,8 +1217,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks) return 1; } else { - error_packet(remcom_out_buffer, -EINVAL); - return 0; + kgdb_msg_write("KGDB only knows signal 9 (pass)" + " and 15 (pass and disconnect)\n" + "Executing a continue without signal passing\n", 0); + remcom_in_buffer[0] = 'c'; } /* Indicate fall through */ @@ -1395,6 +1410,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; unsigned long flags; + int sstep_tries = 100; int error = 0; int i, cpu; @@ -1425,13 +1441,14 @@ acquirelock: cpu_relax(); /* - * Do not start the debugger connection on this CPU if the last - * instance of the exception handler wanted to come into the - * debugger on a different CPU via a single step + * For single stepping, try to only enter on the processor + * that was single stepping. To gaurd against a deadlock, the + * kernel will only try for the value of sstep_tries before + * giving up and continuing on. */ if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && - atomic_read(&kgdb_cpu_doing_single_step) != cpu) { - + (kgdb_info[cpu].task && + kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); touch_softlockup_watchdog(); clocksource_touch_watchdog(); @@ -1524,6 +1541,13 @@ acquirelock: } kgdb_restore: + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { + int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step); + if (kgdb_info[sstep_cpu].task) + kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid; + else + kgdb_sstep_pid = 0; + } /* Free kgdb_active */ atomic_set(&kgdb_active, -1); touch_softlockup_watchdog(); diff --git a/kernel/kmod.c b/kernel/kmod.c index 689d20f3930..25b10319036 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...) #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; - ret = security_kernel_module_request(); - if (ret) - return ret; - va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); if (ret >= MODULE_NAME_LEN) return -ENAMETOOLONG; + ret = security_kernel_module_request(module_name); + if (ret) + return ret; + /* If modprobe needs a service that is in a module, we get a recursive * loop. Limit the number of running kmod threads to max_threads/2 or * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method @@ -143,7 +143,6 @@ struct subprocess_info { static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; - enum umh_wait wait = sub_info->wait; int retval; BUG_ON(atomic_read(&sub_info->cred->usage) != 1); @@ -185,14 +184,10 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); - if (wait == UMH_WAIT_EXEC) - complete(sub_info->complete); - retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); /* Exec failed? */ - if (wait != UMH_WAIT_EXEC) - sub_info->retval = retval; + sub_info->retval = retval; do_exit(0); } @@ -271,14 +266,16 @@ static void __call_usermodehelper(struct work_struct *work) switch (wait) { case UMH_NO_WAIT: - case UMH_WAIT_EXEC: break; case UMH_WAIT_PROC: if (pid > 0) break; sub_info->retval = pid; - break; + /* FALLTHROUGH */ + + case UMH_WAIT_EXEC: + complete(sub_info->complete); } } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index cfadc1291d0..e5342a344c4 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) */ static struct kprobe_blackpoint kprobe_blacklist[] = { {"preempt_schedule",}, + {"native_get_debugreg",}, + {"irq_entries_start",}, + {"common_interrupt",}, {NULL} /* Terminator */ }; @@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) return (kprobe_opcode_t *)(((char *)addr) + p->offset); } +/* Check passed kprobe is valid and return kprobe in kprobe_table. */ +static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) +{ + struct kprobe *old_p, *list_p; + + old_p = get_kprobe(p->addr); + if (unlikely(!old_p)) + return NULL; + + if (p != old_p) { + list_for_each_entry_rcu(list_p, &old_p->list, list) + if (list_p == p) + /* kprobe p is a valid probe */ + goto valid; + return NULL; + } +valid: + return old_p; +} + +/* Return error if the kprobe is being re-registered */ +static inline int check_kprobe_rereg(struct kprobe *p) +{ + int ret = 0; + struct kprobe *old_p; + + mutex_lock(&kprobe_mutex); + old_p = __get_valid_kprobe(p); + if (old_p) + ret = -EINVAL; + mutex_unlock(&kprobe_mutex); + return ret; +} + int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; @@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p) return -EINVAL; p->addr = addr; + ret = check_kprobe_rereg(p); + if (ret) + return ret; + preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr)) { @@ -754,26 +795,6 @@ out: } EXPORT_SYMBOL_GPL(register_kprobe); -/* Check passed kprobe is valid and return kprobe in kprobe_table. */ -static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) -{ - struct kprobe *old_p, *list_p; - - old_p = get_kprobe(p->addr); - if (unlikely(!old_p)) - return NULL; - - if (p != old_p) { - list_for_each_entry_rcu(list_p, &old_p->list, list) - if (list_p == p) - /* kprobe p is a valid probe */ - goto valid; - return NULL; - } -valid: - return old_p; -} - /* * Unregister a kprobe without a scheduler synchronization. */ @@ -1014,9 +1035,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp) /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { #ifdef CONFIG_PREEMPT - rp->maxactive = max(10, 2 * NR_CPUS); + rp->maxactive = max(10, 2 * num_possible_cpus()); #else - rp->maxactive = NR_CPUS; + rp->maxactive = num_possible_cpus(); #endif } spin_lock_init(&rp->lock); @@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p) arch_remove_kprobe(p); } +void __kprobes dump_kprobe(struct kprobe *kp) +{ + printk(KERN_WARNING "Dumping kprobe:\n"); + printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", + kp->symbol_name, kp->addr, kp->offset); +} + /* Module notifier call back, checking kprobes on the module */ static int __kprobes kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -1333,7 +1361,7 @@ static int __kprobes kprobes_open(struct inode *inode, struct file *filp) return seq_open(filp, &kprobes_seq_ops); } -static struct file_operations debugfs_kprobes_operations = { +static const struct file_operations debugfs_kprobes_operations = { .open = kprobes_open, .read = seq_read, .llseek = seq_lseek, @@ -1515,7 +1543,7 @@ static ssize_t write_enabled_file_bool(struct file *file, return count; } -static struct file_operations fops_kp = { +static const struct file_operations fops_kp = { .read = read_enabled_file_bool, .write = write_enabled_file_bool, }; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 528dd78e7e7..3feaf5a7451 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -100,6 +100,26 @@ static ssize_t kexec_crash_loaded_show(struct kobject *kobj, } KERNEL_ATTR_RO(kexec_crash_loaded); +static ssize_t kexec_crash_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%zu\n", crash_get_memory_size()); +} +static ssize_t kexec_crash_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long cnt; + int ret; + + if (strict_strtoul(buf, 0, &cnt)) + return -EINVAL; + + ret = crash_shrink_memory(cnt); + return ret < 0 ? ret : count; +} +KERNEL_ATTR_RW(kexec_crash_size); + static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -147,6 +167,7 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_KEXEC &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, + &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif NULL diff --git a/kernel/kthread.c b/kernel/kthread.c index 5fe709982ca..ab7ae57773e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -150,29 +150,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), EXPORT_SYMBOL(kthread_create); /** - * kthread_bind - bind a just-created kthread to a cpu. - * @k: thread created by kthread_create(). - * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create()). - */ -void kthread_bind(struct task_struct *k, unsigned int cpu) -{ - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - set_task_cpu(k, cpu); - k->cpus_allowed = cpumask_of_cpu(cpu); - k->rt.nr_cpus_allowed = 1; - k->flags |= PF_THREAD_BOUND; -} -EXPORT_SYMBOL(kthread_bind); - -/** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3815ac1d58b..5feaddcdbe4 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -49,7 +49,7 @@ #include "lockdep_internals.h" #define CREATE_TRACE_POINTS -#include <trace/events/lockdep.h> +#include <trace/events/lock.h> #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; @@ -73,11 +73,11 @@ module_param(lock_stat, int, 0644); * to use a raw spinlock - we really dont want the spinlock * code to recurse back into the lockdep code... */ -static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static int graph_lock(void) { - __raw_spin_lock(&lockdep_lock); + arch_spin_lock(&lockdep_lock); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -85,7 +85,7 @@ static int graph_lock(void) * dropped already) */ if (!debug_locks) { - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); return 0; } /* prevent any recursions within lockdep from causing deadlocks */ @@ -95,11 +95,11 @@ static int graph_lock(void) static inline int graph_unlock(void) { - if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) + if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) return DEBUG_LOCKS_WARN_ON(1); current->lockdep_recursion--; - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); return 0; } @@ -111,7 +111,7 @@ static inline int debug_locks_off_graph_unlock(void) { int ret = debug_locks_off(); - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); return ret; } @@ -140,7 +140,13 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) } #ifdef CONFIG_LOCK_STAT -static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], + cpu_lock_stats); + +static inline u64 lockstat_clock(void) +{ + return cpu_clock(smp_processor_id()); +} static int lock_point(unsigned long points[], unsigned long ip) { @@ -158,12 +164,12 @@ static int lock_point(unsigned long points[], unsigned long ip) return i; } -static void lock_time_inc(struct lock_time *lt, s64 time) +static void lock_time_inc(struct lock_time *lt, u64 time) { if (time > lt->max) lt->max = time; - if (time < lt->min || !lt->min) + if (time < lt->min || !lt->nr) lt->min = time; lt->total += time; @@ -172,8 +178,15 @@ static void lock_time_inc(struct lock_time *lt, s64 time) static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) { - dst->min += src->min; - dst->max += src->max; + if (!src->nr) + return; + + if (src->max > dst->max) + dst->max = src->max; + + if (src->min < dst->min || !dst->nr) + dst->min = src->min; + dst->total += src->total; dst->nr += src->nr; } @@ -186,7 +199,7 @@ struct lock_class_stats lock_stats(struct lock_class *class) memset(&stats, 0, sizeof(struct lock_class_stats)); for_each_possible_cpu(cpu) { struct lock_class_stats *pcs = - &per_cpu(lock_stats, cpu)[class - lock_classes]; + &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) stats.contention_point[i] += pcs->contention_point[i]; @@ -213,7 +226,7 @@ void clear_lock_stats(struct lock_class *class) for_each_possible_cpu(cpu) { struct lock_class_stats *cpu_stats = - &per_cpu(lock_stats, cpu)[class - lock_classes]; + &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; memset(cpu_stats, 0, sizeof(struct lock_class_stats)); } @@ -223,23 +236,23 @@ void clear_lock_stats(struct lock_class *class) static struct lock_class_stats *get_lock_stats(struct lock_class *class) { - return &get_cpu_var(lock_stats)[class - lock_classes]; + return &get_cpu_var(cpu_lock_stats)[class - lock_classes]; } static void put_lock_stats(struct lock_class_stats *stats) { - put_cpu_var(lock_stats); + put_cpu_var(cpu_lock_stats); } static void lock_release_holdtime(struct held_lock *hlock) { struct lock_class_stats *stats; - s64 holdtime; + u64 holdtime; if (!lock_stat) return; - holdtime = sched_clock() - hlock->holdtime_stamp; + holdtime = lockstat_clock() - hlock->holdtime_stamp; stats = get_lock_stats(hlock_class(hlock)); if (hlock->read) @@ -374,7 +387,8 @@ static int save_trace(struct stack_trace *trace) * complete trace that maxes out the entries provided will be reported * as incomplete, friggin useless </rant> */ - if (trace->entries[trace->nr_entries-1] == ULONG_MAX) + if (trace->nr_entries != 0 && + trace->entries[trace->nr_entries-1] == ULONG_MAX) trace->nr_entries--; trace->max_entries = trace->nr_entries; @@ -1156,9 +1170,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) this.class = class; local_irq_save(flags); - __raw_spin_lock(&lockdep_lock); + arch_spin_lock(&lockdep_lock); ret = __lockdep_count_forward_deps(&this); - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); local_irq_restore(flags); return ret; @@ -1183,9 +1197,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) this.class = class; local_irq_save(flags); - __raw_spin_lock(&lockdep_lock); + arch_spin_lock(&lockdep_lock); ret = __lockdep_count_backward_deps(&this); - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); local_irq_restore(flags); return ret; @@ -2792,7 +2806,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->references = references; #ifdef CONFIG_LOCK_STAT hlock->waittime_stamp = 0; - hlock->holdtime_stamp = sched_clock(); + hlock->holdtime_stamp = lockstat_clock(); #endif if (check == 2 && !mark_irqflags(curr, hlock)) @@ -3322,7 +3336,7 @@ found_it: if (hlock->instance != lock) return; - hlock->waittime_stamp = sched_clock(); + hlock->waittime_stamp = lockstat_clock(); contention_point = lock_point(hlock_class(hlock)->contention_point, ip); contending_point = lock_point(hlock_class(hlock)->contending_point, @@ -3345,8 +3359,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) struct held_lock *hlock, *prev_hlock; struct lock_class_stats *stats; unsigned int depth; - u64 now; - s64 waittime = 0; + u64 now, waittime = 0; int i, cpu; depth = curr->lockdep_depth; @@ -3374,7 +3387,7 @@ found_it: cpu = smp_processor_id(); if (hlock->waittime_stamp) { - now = sched_clock(); + now = lockstat_clock(); waittime = now - hlock->waittime_stamp; hlock->holdtime_stamp = now; } diff --git a/kernel/module.c b/kernel/module.c index e6bc4b28aa6..12afc5a3ddd 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -370,8 +370,6 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA - static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) { @@ -395,154 +393,6 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ - -/* Number of blocks used and allocated. */ -static unsigned int pcpu_num_used, pcpu_num_allocated; -/* Size of each block. -ve means used. */ -static int *pcpu_size; - -static int split_block(unsigned int i, unsigned short size) -{ - /* Reallocation required? */ - if (pcpu_num_used + 1 > pcpu_num_allocated) { - int *new; - - new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2, - GFP_KERNEL); - if (!new) - return 0; - - pcpu_num_allocated *= 2; - pcpu_size = new; - } - - /* Insert a new subblock */ - memmove(&pcpu_size[i+1], &pcpu_size[i], - sizeof(pcpu_size[0]) * (pcpu_num_used - i)); - pcpu_num_used++; - - pcpu_size[i+1] -= size; - pcpu_size[i] = size; - return 1; -} - -static inline unsigned int block_size(int val) -{ - if (val < 0) - return -val; - return val; -} - -static void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) -{ - unsigned long extra; - unsigned int i; - void *ptr; - int cpu; - - if (align > PAGE_SIZE) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - name, align, PAGE_SIZE); - align = PAGE_SIZE; - } - - ptr = __per_cpu_start; - for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { - /* Extra for alignment requirement. */ - extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr; - BUG_ON(i == 0 && extra != 0); - - if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size) - continue; - - /* Transfer extra to previous block. */ - if (pcpu_size[i-1] < 0) - pcpu_size[i-1] -= extra; - else - pcpu_size[i-1] += extra; - pcpu_size[i] -= extra; - ptr += extra; - - /* Split block if warranted */ - if (pcpu_size[i] - size > sizeof(unsigned long)) - if (!split_block(i, size)) - return NULL; - - /* add the per-cpu scanning areas */ - for_each_possible_cpu(cpu) - kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0, - GFP_KERNEL); - - /* Mark allocated */ - pcpu_size[i] = -pcpu_size[i]; - return ptr; - } - - printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", - size); - return NULL; -} - -static void percpu_modfree(void *freeme) -{ - unsigned int i; - void *ptr = __per_cpu_start + block_size(pcpu_size[0]); - int cpu; - - /* First entry is core kernel percpu data. */ - for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { - if (ptr == freeme) { - pcpu_size[i] = -pcpu_size[i]; - goto free; - } - } - BUG(); - - free: - /* remove the per-cpu scanning areas */ - for_each_possible_cpu(cpu) - kmemleak_free(freeme + per_cpu_offset(cpu)); - - /* Merge with previous? */ - if (pcpu_size[i-1] >= 0) { - pcpu_size[i-1] += pcpu_size[i]; - pcpu_num_used--; - memmove(&pcpu_size[i], &pcpu_size[i+1], - (pcpu_num_used - i) * sizeof(pcpu_size[0])); - i--; - } - /* Merge with next? */ - if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) { - pcpu_size[i] += pcpu_size[i+1]; - pcpu_num_used--; - memmove(&pcpu_size[i+1], &pcpu_size[i+2], - (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0])); - } -} - -static int percpu_modinit(void) -{ - pcpu_num_used = 2; - pcpu_num_allocated = 2; - pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, - GFP_KERNEL); - /* Static in-kernel percpu data (used). */ - pcpu_size[0] = -(__per_cpu_end-__per_cpu_start); - /* Free room. */ - pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; - if (pcpu_size[1] < 0) { - printk(KERN_ERR "No per-cpu room for modules.\n"); - pcpu_num_used = 1; - } - - return 0; -} -__initcall(percpu_modinit); - -#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ - static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) @@ -1187,7 +1037,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, /* Count loaded sections and allocate structures */ for (i = 0; i < nsect; i++) - if (sechdrs[i].sh_flags & SHF_ALLOC) + if (sechdrs[i].sh_flags & SHF_ALLOC + && sechdrs[i].sh_size) nloaded++; size[0] = ALIGN(sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0]), @@ -1207,6 +1058,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, for (i = 0; i < nsect; i++) { if (! (sechdrs[i].sh_flags & SHF_ALLOC)) continue; + if (!sechdrs[i].sh_size) + continue; sattr->address = sechdrs[i].sh_addr; sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, GFP_KERNEL); @@ -1797,6 +1650,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } } +static void free_modinfo(struct module *mod) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (attr->free) + attr->free(mod); + } +} + #ifdef CONFIG_KALLSYMS /* lookup symbol in given range of kernel_symbols */ @@ -1862,13 +1726,93 @@ static char elf_type(const Elf_Sym *sym, return '?'; } +static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, + unsigned int shnum) +{ + const Elf_Shdr *sec; + + if (src->st_shndx == SHN_UNDEF + || src->st_shndx >= shnum + || !src->st_name) + return false; + + sec = sechdrs + src->st_shndx; + if (!(sec->sh_flags & SHF_ALLOC) +#ifndef CONFIG_KALLSYMS_ALL + || !(sec->sh_flags & SHF_EXECINSTR) +#endif + || (sec->sh_entsize & INIT_OFFSET_MASK)) + return false; + + return true; +} + +static unsigned long layout_symtab(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const Elf_Ehdr *hdr, + const char *secstrings, + unsigned long *pstroffs, + unsigned long *strmap) +{ + unsigned long symoffs; + Elf_Shdr *symsect = sechdrs + symindex; + Elf_Shdr *strsect = sechdrs + strindex; + const Elf_Sym *src; + const char *strtab; + unsigned int i, nsrc, ndst; + + /* Put symbol section at end of init part of module. */ + symsect->sh_flags |= SHF_ALLOC; + symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, + symindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + symsect->sh_name); + + src = (void *)hdr + symsect->sh_offset; + nsrc = symsect->sh_size / sizeof(*src); + strtab = (void *)hdr + strsect->sh_offset; + for (ndst = i = 1; i < nsrc; ++i, ++src) + if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { + unsigned int j = src->st_name; + + while(!__test_and_set_bit(j, strmap) && strtab[j]) + ++j; + ++ndst; + } + + /* Append room for core symbols at end of core part. */ + symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); + mod->core_size = symoffs + ndst * sizeof(Elf_Sym); + + /* Put string table section at end of init part of module. */ + strsect->sh_flags |= SHF_ALLOC; + strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, + strindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + strsect->sh_name); + + /* Append room for core symbols' strings at end of core part. */ + *pstroffs = mod->core_size; + __set_bit(0, strmap); + mod->core_size += bitmap_weight(strmap, strsect->sh_size); + + return symoffs; +} + static void add_kallsyms(struct module *mod, Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int symindex, unsigned int strindex, - const char *secstrings) + unsigned long symoffs, + unsigned long stroffs, + const char *secstrings, + unsigned long *strmap) { - unsigned int i; + unsigned int i, ndst; + const Elf_Sym *src; + Elf_Sym *dst; + char *s; mod->symtab = (void *)sechdrs[symindex].sh_addr; mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); @@ -1878,13 +1822,46 @@ static void add_kallsyms(struct module *mod, for (i = 0; i < mod->num_symtab; i++) mod->symtab[i].st_info = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); + + mod->core_symtab = dst = mod->module_core + symoffs; + src = mod->symtab; + *dst = *src; + for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { + if (!is_core_symbol(src, sechdrs, shnum)) + continue; + dst[ndst] = *src; + dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); + ++ndst; + } + mod->core_num_syms = ndst; + + mod->core_strtab = s = mod->module_core + stroffs; + for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) + if (test_bit(i, strmap)) + *++s = mod->strtab[i]; } #else +static inline unsigned long layout_symtab(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const Elf_Ehdr *hdr, + const char *secstrings, + unsigned long *pstroffs, + unsigned long *strmap) +{ + return 0; +} + static inline void add_kallsyms(struct module *mod, Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int symindex, unsigned int strindex, - const char *secstrings) + unsigned long symoffs, + unsigned long stroffs, + const char *secstrings, + const unsigned long *strmap) { } #endif /* CONFIG_KALLSYMS */ @@ -1959,6 +1936,8 @@ static noinline struct module *load_module(void __user *umod, struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ + unsigned long symoffs, stroffs, *strmap; + mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2040,11 +2019,6 @@ static noinline struct module *load_module(void __user *umod, /* Don't keep modinfo and version sections. */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; -#ifdef CONFIG_KALLSYMS - /* Keep symbol and string tables for decoding later. */ - sechdrs[symindex].sh_flags |= SHF_ALLOC; - sechdrs[strindex].sh_flags |= SHF_ALLOC; -#endif /* Check module struct version now, before we try to use module. */ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -2080,6 +2054,13 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } + strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) + * sizeof(long), GFP_KERNEL); + if (!strmap) { + err = -ENOMEM; + goto free_mod; + } + if (find_module(mod->name)) { err = -EEXIST; goto free_mod; @@ -2109,6 +2090,8 @@ static noinline struct module *load_module(void __user *umod, this is done generically; there doesn't appear to be any special cases for the architectures. */ layout_sections(mod, hdr, sechdrs, secstrings); + symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, + secstrings, &stroffs, strmap); /* Do the allocs. */ ptr = module_alloc_update_bounds(mod->core_size); @@ -2313,7 +2296,10 @@ static noinline struct module *load_module(void __user *umod, percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, sechdrs[pcpuindex].sh_size); - add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, + symoffs, stroffs, secstrings, strmap); + kfree(strmap); + strmap = NULL; if (!mod->taints) { struct _ddebug *debug; @@ -2385,13 +2371,14 @@ static noinline struct module *load_module(void __user *umod, synchronize_sched(); module_arch_cleanup(mod); cleanup: + free_modinfo(mod); kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - free_init: percpu_modfree(mod->refptr); + free_init: #endif module_free(mod, mod->module_init); free_core: @@ -2402,6 +2389,7 @@ static noinline struct module *load_module(void __user *umod, percpu_modfree(percpu); free_mod: kfree(args); + kfree(strmap); free_hdr: vfree(hdr); return ERR_PTR(err); @@ -2491,6 +2479,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, /* Drop initial reference. */ module_put(mod); trim_init_extable(mod); +#ifdef CONFIG_KALLSYMS + mod->num_symtab = mod->core_num_syms; + mod->symtab = mod->core_symtab; + mod->strtab = mod->core_strtab; +#endif module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; @@ -2952,7 +2945,6 @@ void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, struct kernel_symbol *ks, - struct marker *marker, struct tracepoint *tp) { } diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 50d022e5a56..ec815a960b5 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -16,6 +16,7 @@ #include <linux/delay.h> #include <linux/module.h> #include <linux/poison.h> +#include <linux/sched.h> #include <linux/spinlock.h> #include <linux/kallsyms.h> #include <linux/interrupt.h> diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 6b2d735846a..57d527a16f9 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex *lock) \ DEBUG_LOCKS_WARN_ON(in_interrupt()); \ local_irq_save(flags); \ - __raw_spin_lock(&(lock)->raw_lock); \ + arch_spin_lock(&(lock)->rlock.raw_lock);\ DEBUG_LOCKS_WARN_ON(l->magic != l); \ } while (0) -#define spin_unlock_mutex(lock, flags) \ - do { \ - __raw_spin_unlock(&(lock)->raw_lock); \ - local_irq_restore(flags); \ - preempt_check_resched(); \ +#define spin_unlock_mutex(lock, flags) \ + do { \ + arch_spin_unlock(&(lock)->rlock.raw_lock); \ + local_irq_restore(flags); \ + preempt_check_resched(); \ } while (0) diff --git a/kernel/mutex.c b/kernel/mutex.c index 947b3ad551f..632f04c57d8 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -148,8 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire(&lock->dep_map, subclass, 0, ip); -#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \ - !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES) + +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* * Optimistic spinning. * diff --git a/kernel/notifier.c b/kernel/notifier.c index 61d5aa5eced..acd24e7643e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); static ATOMIC_NOTIFIER_HEAD(die_chain); -int notrace notify_die(enum die_val val, const char *str, +int notrace __kprobes notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 5aa854f9e5a..2a5dfec8efe 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid) * (hence either you are in the same cgroup as task, or in an * ancestor cgroup thereof) */ -static int ns_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, struct task_struct *task) +static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, + struct task_struct *task, bool threadgroup) { if (current != task) { if (!capable(CAP_SYS_ADMIN)) @@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss, if (!cgroup_is_descendant(new_cgroup, task)) return -EPERM; + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + if (!cgroup_is_descendant(new_cgroup, c)) { + rcu_read_unlock(); + return -EPERM; + } + } + rcu_read_unlock(); + } + return 0; } diff --git a/kernel/panic.c b/kernel/panic.c index 8c43226a544..5827f7b9725 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -92,6 +92,8 @@ NORET_TYPE void panic(const char * fmt, ...) atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + bust_spinlocks(0); + if (!panic_blink) panic_blink = no_blink; @@ -138,7 +140,6 @@ NORET_TYPE void panic(const char * fmt, ...) mdelay(1); i++; } - bust_spinlocks(0); } EXPORT_SYMBOL(panic); diff --git a/kernel/params.c b/kernel/params.c index 7f6912ced2b..cf1b6918312 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -23,6 +23,8 @@ #include <linux/device.h> #include <linux/err.h> #include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/string.h> #if 0 #define DEBUGP printk @@ -87,7 +89,7 @@ static char *next_arg(char *args, char **param, char **val) } for (i = 0; args[i]; i++) { - if (args[i] == ' ' && !in_quote) + if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') @@ -121,9 +123,7 @@ static char *next_arg(char *args, char **param, char **val) next = args + i; /* Chew up trailing spaces. */ - while (*next == ' ') - next++; - return next; + return skip_spaces(next); } /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ @@ -138,8 +138,7 @@ int parse_args(const char *name, DEBUGP("Parsing ARGS: %s\n", args); /* Chew leading spaces */ - while (*args == ' ') - args++; + args = skip_spaces(args); while (*args) { int ret; @@ -217,15 +216,11 @@ int param_set_charp(const char *val, struct kernel_param *kp) return -ENOSPC; } - if (kp->flags & KPARAM_KMALLOCED) - kfree(*(char **)kp->arg); - /* This is a hack. We can't need to strdup in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { - kp->flags |= KPARAM_KMALLOCED; *(char **)kp->arg = kstrdup(val, GFP_KERNEL); - if (!kp->arg) + if (!*(char **)kp->arg) return -ENOMEM; } else *(const char **)kp->arg = val; @@ -303,6 +298,7 @@ static int param_array(const char *name, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, struct kernel_param *kp), + u16 flags, unsigned int *num) { int ret; @@ -312,6 +308,7 @@ static int param_array(const char *name, /* Get the name right for errors. */ kp.name = name; kp.arg = elem; + kp.flags = flags; /* No equals sign? */ if (!val) { @@ -357,7 +354,8 @@ int param_array_set(const char *val, struct kernel_param *kp) unsigned int temp_num; return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->set, arr->num ?: &temp_num); + arr->elemsize, arr->set, kp->flags, + arr->num ?: &temp_num); } int param_array_get(char *buffer, struct kernel_param *kp) @@ -604,11 +602,7 @@ void module_param_sysfs_remove(struct module *mod) void destroy_params(const struct kernel_param *params, unsigned num) { - unsigned int i; - - for (i = 0; i < num; i++) - if (params[i].flags & KPARAM_KMALLOCED) - kfree(*(char **)params[i].arg); + /* FIXME: This should free kmalloced charp parameters. It doesn't. */ } static void __init kernel_add_sysfs_param(const char *name, diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 76ac4db405e..9052d6c8c9f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -20,6 +20,7 @@ #include <linux/percpu.h> #include <linux/ptrace.h> #include <linux/vmstat.h> +#include <linux/vmalloc.h> #include <linux/hardirq.h> #include <linux/rculist.h> #include <linux/uaccess.h> @@ -27,13 +28,15 @@ #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> #include <linux/perf_event.h> +#include <linux/ftrace_event.h> +#include <linux/hw_breakpoint.h> #include <asm/irq_regs.h> /* * Each CPU has a list of per CPU events: */ -DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); +static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); int perf_max_events __read_mostly = 1; static int perf_reserved_percpu __read_mostly; @@ -200,14 +203,14 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * if so. If we locked the right context, then it * can't get swapped on us any more. */ - spin_lock_irqsave(&ctx->lock, *flags); + raw_spin_lock_irqsave(&ctx->lock, *flags); if (ctx != rcu_dereference(task->perf_event_ctxp)) { - spin_unlock_irqrestore(&ctx->lock, *flags); + raw_spin_unlock_irqrestore(&ctx->lock, *flags); goto retry; } if (!atomic_inc_not_zero(&ctx->refcount)) { - spin_unlock_irqrestore(&ctx->lock, *flags); + raw_spin_unlock_irqrestore(&ctx->lock, *flags); ctx = NULL; } } @@ -228,7 +231,7 @@ static struct perf_event_context *perf_pin_task_context(struct task_struct *task ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); + raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; } @@ -237,12 +240,55 @@ static void perf_unpin_context(struct perf_event_context *ctx) { unsigned long flags; - spin_lock_irqsave(&ctx->lock, flags); + raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); + raw_spin_unlock_irqrestore(&ctx->lock, flags); put_ctx(ctx); } +static inline u64 perf_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + + if (ctx->is_active) + run_end = ctx->time; + else + run_end = event->tstamp_stopped; + + event->total_time_enabled = run_end - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = ctx->time; + + event->total_time_running = run_end - event->tstamp_running; +} + /* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -291,6 +337,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader != event) event->group_leader->nr_siblings--; + update_event_times(event); + + /* + * If event was in error state, then keep it + * that way, otherwise bogus counts will be + * returned on read(). The only way to get out + * of error state is by explicit re-enabling + * of the event + */ + if (event->state > PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_OFF; + /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them @@ -369,7 +427,7 @@ static void __perf_event_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the * events on a global level. @@ -391,7 +449,7 @@ static void __perf_event_remove_from_context(void *info) } perf_enable(); - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } @@ -418,7 +476,7 @@ static void perf_event_remove_from_context(struct perf_event *event) if (!task) { /* * Per cpu events are removed via an smp call and - * the removal is always sucessful. + * the removal is always successful. */ smp_call_function_single(event->cpu, __perf_event_remove_from_context, @@ -430,12 +488,12 @@ retry: task_oncpu_function_call(task, __perf_event_remove_from_context, event); - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); /* * If the context is active we need to retry the smp call. */ if (ctx->nr_active && !list_empty(&event->group_entry)) { - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); goto retry; } @@ -444,48 +502,9 @@ retry: * can remove the event safely, if the call above did not * succeed. */ - if (!list_empty(&event->group_entry)) { + if (!list_empty(&event->group_entry)) list_del_event(event, ctx); - } - spin_unlock_irq(&ctx->lock); -} - -static inline u64 perf_clock(void) -{ - return cpu_clock(smp_processor_id()); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_event_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -/* - * Update the total_time_enabled and total_time_running fields for a event. - */ -static void update_event_times(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - u64 run_end; - - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; - - event->total_time_enabled = ctx->time - event->tstamp_enabled; - - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = ctx->time; - - event->total_time_running = run_end - event->tstamp_running; + raw_spin_unlock_irq(&ctx->lock); } /* @@ -516,7 +535,7 @@ static void __perf_event_disable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); /* * If the event is on, turn it off. @@ -532,7 +551,7 @@ static void __perf_event_disable(void *info) event->state = PERF_EVENT_STATE_OFF; } - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -548,7 +567,7 @@ static void __perf_event_disable(void *info) * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ -static void perf_event_disable(struct perf_event *event) +void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -565,12 +584,12 @@ static void perf_event_disable(struct perf_event *event) retry: task_oncpu_function_call(task, __perf_event_disable, event); - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); /* * If the event is still active, we need to retry the cross-call. */ if (event->state == PERF_EVENT_STATE_ACTIVE) { - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); goto retry; } @@ -583,7 +602,7 @@ static void perf_event_disable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; } - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); } static int @@ -751,7 +770,7 @@ static void __perf_install_in_context(void *info) cpuctx->task_ctx = ctx; } - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); @@ -801,7 +820,7 @@ static void __perf_install_in_context(void *info) unlock: perf_enable(); - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -826,7 +845,7 @@ perf_install_in_context(struct perf_event_context *ctx, if (!task) { /* * Per cpu events are installed via an smp call and - * the install is always sucessful. + * the install is always successful. */ smp_call_function_single(cpu, __perf_install_in_context, event, 1); @@ -837,12 +856,12 @@ retry: task_oncpu_function_call(task, __perf_install_in_context, event); - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); /* * we need to retry the smp call. */ if (ctx->is_active && list_empty(&event->group_entry)) { - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); goto retry; } @@ -853,7 +872,7 @@ retry: */ if (list_empty(&event->group_entry)) add_event_to_ctx(event, ctx); - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); } /* @@ -898,7 +917,7 @@ static void __perf_event_enable(void *info) cpuctx->task_ctx = ctx; } - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); @@ -940,7 +959,7 @@ static void __perf_event_enable(void *info) } unlock: - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -952,7 +971,7 @@ static void __perf_event_enable(void *info) * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ -static void perf_event_enable(struct perf_event *event) +void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -966,7 +985,7 @@ static void perf_event_enable(struct perf_event *event) return; } - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE) goto out; @@ -981,10 +1000,10 @@ static void perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; retry: - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); task_oncpu_function_call(task, __perf_event_enable, event); - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); /* * If the context is active and the event is still off, @@ -1001,7 +1020,7 @@ static void perf_event_enable(struct perf_event *event) __perf_event_mark_enabled(event, ctx); out: - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); } static int perf_event_refresh(struct perf_event *event, int refresh) @@ -1023,7 +1042,7 @@ void __perf_event_sched_out(struct perf_event_context *ctx, { struct perf_event *event; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); ctx->is_active = 0; if (likely(!ctx->nr_events)) goto out; @@ -1031,16 +1050,12 @@ void __perf_event_sched_out(struct perf_event_context *ctx, perf_disable(); if (ctx->nr_active) { - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (event != event->group_leader) - event_sched_out(event, cpuctx, ctx); - else - group_sched_out(event, cpuctx, ctx); - } + list_for_each_entry(event, &ctx->group_list, group_entry) + group_sched_out(event, cpuctx, ctx); } perf_enable(); out: - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -1062,8 +1077,6 @@ static int context_equiv(struct perf_event_context *ctx1, && !ctx1->pin_count && !ctx2->pin_count; } -static void __perf_event_read(void *event); - static void __perf_event_sync_stat(struct perf_event *event, struct perf_event *next_event) { @@ -1081,8 +1094,8 @@ static void __perf_event_sync_stat(struct perf_event *event, */ switch (event->state) { case PERF_EVENT_STATE_ACTIVE: - __perf_event_read(event); - break; + event->pmu->read(event); + /* fall-through */ case PERF_EVENT_STATE_INACTIVE: update_event_times(event); @@ -1121,6 +1134,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, if (!ctx->nr_stat) return; + update_context_time(ctx); + event = list_first_entry(&ctx->event_list, struct perf_event, event_entry); @@ -1164,8 +1179,6 @@ void perf_event_task_sched_out(struct task_struct *task, if (likely(!ctx || !cpuctx->task_ctx)) return; - update_context_time(ctx); - rcu_read_lock(); parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_event_ctxp; @@ -1180,8 +1193,8 @@ void perf_event_task_sched_out(struct task_struct *task, * order we take the locks because no other cpu could * be trying to lock both of these tasks. */ - spin_lock(&ctx->lock); - spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&ctx->lock); + raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { /* * XXX do we need a memory barrier of sorts @@ -1195,8 +1208,8 @@ void perf_event_task_sched_out(struct task_struct *task, perf_event_sync_stat(ctx, next_ctx); } - spin_unlock(&next_ctx->lock); - spin_unlock(&ctx->lock); + raw_spin_unlock(&next_ctx->lock); + raw_spin_unlock(&ctx->lock); } rcu_read_unlock(); @@ -1238,7 +1251,7 @@ __perf_event_sched_in(struct perf_event_context *ctx, struct perf_event *event; int can_add_hw = 1; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_events)) goto out; @@ -1258,12 +1271,8 @@ __perf_event_sched_in(struct perf_event_context *ctx, if (event->cpu != -1 && event->cpu != cpu) continue; - if (event != event->group_leader) - event_sched_in(event, cpuctx, ctx, cpu); - else { - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx, cpu); - } + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx, cpu); /* * If this pinned group hasn't been scheduled, @@ -1291,19 +1300,13 @@ __perf_event_sched_in(struct perf_event_context *ctx, if (event->cpu != -1 && event->cpu != cpu) continue; - if (event != event->group_leader) { - if (event_sched_in(event, cpuctx, ctx, cpu)) + if (group_can_go_on(event, cpuctx, can_add_hw)) + if (group_sched_in(event, cpuctx, ctx, cpu)) can_add_hw = 0; - } else { - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx, cpu)) - can_add_hw = 0; - } - } } perf_enable(); out: - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -1367,8 +1370,8 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) struct hw_perf_event *hwc; u64 interrupts, freq; - spin_lock(&ctx->lock); - list_for_each_entry(event, &ctx->group_list, group_entry) { + raw_spin_lock(&ctx->lock); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -1422,7 +1425,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) perf_enable(); } } - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -1435,7 +1438,7 @@ static void rotate_ctx(struct perf_event_context *ctx) if (!ctx->nr_events) return; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); /* * Rotate the first entry last (works just fine for group events too): */ @@ -1446,7 +1449,7 @@ static void rotate_ctx(struct perf_event_context *ctx) } perf_enable(); - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } void perf_event_task_tick(struct task_struct *curr, int cpu) @@ -1495,7 +1498,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) __perf_event_task_sched_out(ctx); - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); list_for_each_entry(event, &ctx->group_list, group_entry) { if (!event->attr.enable_on_exec) @@ -1513,7 +1516,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) if (enabled) unclone_ctx(ctx); - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); perf_event_task_sched_in(task, smp_processor_id()); out: @@ -1528,7 +1531,6 @@ static void __perf_event_read(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; - unsigned long flags; /* * If this is a task context, we need to check whether it is @@ -1540,12 +1542,12 @@ static void __perf_event_read(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - local_irq_save(flags); - if (ctx->is_active) - update_context_time(ctx); - event->pmu->read(event); + raw_spin_lock(&ctx->lock); + update_context_time(ctx); update_event_times(event); - local_irq_restore(flags); + raw_spin_unlock(&ctx->lock); + + event->pmu->read(event); } static u64 perf_event_read(struct perf_event *event) @@ -1558,7 +1560,13 @@ static u64 perf_event_read(struct perf_event *event) smp_call_function_single(event->oncpu, __perf_event_read, event, 1); } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + struct perf_event_context *ctx = event->ctx; + unsigned long flags; + + raw_spin_lock_irqsave(&ctx->lock, flags); + update_context_time(ctx); update_event_times(event); + raw_spin_unlock_irqrestore(&ctx->lock, flags); } return atomic64_read(&event->count); @@ -1571,8 +1579,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx, struct task_struct *task) { - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); + raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->group_list); INIT_LIST_HEAD(&ctx->event_list); @@ -1642,11 +1649,11 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) ctx = perf_lock_task_context(task, &flags); if (ctx) { unclone_ctx(ctx); - spin_unlock_irqrestore(&ctx->lock, flags); + raw_spin_unlock_irqrestore(&ctx->lock, flags); } if (!ctx) { - ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); err = -ENOMEM; if (!ctx) goto errout; @@ -1671,6 +1678,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) return ERR_PTR(err); } +static void perf_event_free_filter(struct perf_event *event); + static void free_event_rcu(struct rcu_head *head) { struct perf_event *event; @@ -1678,6 +1687,7 @@ static void free_event_rcu(struct rcu_head *head) event = container_of(head, struct perf_event, rcu_head); if (event->ns) put_pid_ns(event->ns); + perf_event_free_filter(event); kfree(event); } @@ -1709,16 +1719,10 @@ static void free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) +int perf_event_release_kernel(struct perf_event *event) { - struct perf_event *event = file->private_data; struct perf_event_context *ctx = event->ctx; - file->private_data = NULL; - WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_event_remove_from_context(event); @@ -1733,6 +1737,19 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } +EXPORT_SYMBOL_GPL(perf_event_release_kernel); + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_event *event = file->private_data; + + file->private_data = NULL; + + return perf_event_release_kernel(event); +} static int perf_event_read_size(struct perf_event *event) { @@ -1759,91 +1776,94 @@ static int perf_event_read_size(struct perf_event *event) return size; } -static u64 perf_event_read_value(struct perf_event *event) +u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; + *enabled = 0; + *running = 0; + + mutex_lock(&event->child_mutex); total += perf_event_read(event); - list_for_each_entry(child, &event->child_list, child_list) + *enabled += event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + *running += event->total_time_running + + atomic64_read(&event->child_total_time_running); + + list_for_each_entry(child, &event->child_list, child_list) { total += perf_event_read(child); + *enabled += child->total_time_enabled; + *running += child->total_time_running; + } + mutex_unlock(&event->child_mutex); return total; } - -static int perf_event_read_entry(struct perf_event *event, - u64 read_format, char __user *buf) -{ - int n = 0, count = 0; - u64 values[2]; - - values[n++] = perf_event_read_value(event); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(event); - - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; -} +EXPORT_SYMBOL_GPL(perf_event_read_value); static int perf_event_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *sub; - int n = 0, size = 0, err = -EFAULT; - u64 values[3]; + int n = 0, size = 0, ret = -EFAULT; + struct perf_event_context *ctx = leader->ctx; + u64 values[5]; + u64 count, enabled, running; + + mutex_lock(&ctx->mutex); + count = perf_event_read_value(leader, &enabled, &running); values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = leader->total_time_enabled + - atomic64_read(&leader->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = leader->total_time_running + - atomic64_read(&leader->child_total_time_running); - } + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; + values[n++] = count; + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); size = n * sizeof(u64); if (copy_to_user(buf, values, size)) - return -EFAULT; - - err = perf_event_read_entry(leader, read_format, buf + size); - if (err < 0) - return err; + goto unlock; - size += err; + ret = size; list_for_each_entry(sub, &leader->sibling_list, group_entry) { - err = perf_event_read_entry(sub, read_format, - buf + size); - if (err < 0) - return err; + n = 0; + + values[n++] = perf_event_read_value(sub, &enabled, &running); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + size = n * sizeof(u64); + + if (copy_to_user(buf + ret, values, size)) { + ret = -EFAULT; + goto unlock; + } - size += err; + ret += size; } +unlock: + mutex_unlock(&ctx->mutex); - return size; + return ret; } static int perf_event_read_one(struct perf_event *event, u64 read_format, char __user *buf) { + u64 enabled, running; u64 values[4]; int n = 0; - values[n++] = perf_event_read_value(event); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = event->total_time_enabled + - atomic64_read(&event->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = event->total_time_running + - atomic64_read(&event->child_total_time_running); - } + values[n++] = perf_event_read_value(event, &enabled, &running); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); @@ -1874,12 +1894,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); - mutex_lock(&event->child_mutex); if (read_format & PERF_FORMAT_GROUP) ret = perf_event_read_group(event, read_format, buf); else ret = perf_event_read_one(event, read_format, buf); - mutex_unlock(&event->child_mutex); return ret; } @@ -1969,7 +1987,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (!value) return -EINVAL; - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); if (event->attr.freq) { if (value > sysctl_perf_event_sample_rate) { ret = -EINVAL; @@ -1982,12 +2000,13 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) event->hw.sample_period = value; } unlock: - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); return ret; } -int perf_event_set_output(struct perf_event *event, int output_fd); +static int perf_event_set_output(struct perf_event *event, int output_fd); +static int perf_event_set_filter(struct perf_event *event, void __user *arg); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2015,6 +2034,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: return perf_event_set_output(event, arg); + case PERF_EVENT_IOC_SET_FILTER: + return perf_event_set_filter(event, (void __user *)arg); + default: return -ENOTTY; } @@ -2105,49 +2127,31 @@ unlock: rcu_read_unlock(); } -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static unsigned long perf_data_size(struct perf_mmap_data *data) { - struct perf_event *event = vma->vm_file->private_data; - struct perf_mmap_data *data; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) - goto unlock; - - if (vmf->pgoff == 0) { - vmf->page = virt_to_page(data->user_page); - } else { - int nr = vmf->pgoff - 1; + return data->nr_pages << (PAGE_SHIFT + data->data_order); +} - if ((unsigned)nr > data->nr_pages) - goto unlock; +#ifndef CONFIG_PERF_USE_VMALLOC - if (vmf->flags & FAULT_FLAG_WRITE) - goto unlock; +/* + * Back perf_mmap() with regular GFP_KERNEL-0 pages. + */ - vmf->page = virt_to_page(data->data_pages[nr]); - } +static struct page * +perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +{ + if (pgoff > data->nr_pages) + return NULL; - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; + if (pgoff == 0) + return virt_to_page(data->user_page); - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; + return virt_to_page(data->data_pages[pgoff - 1]); } -static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_mmap_data * +perf_mmap_data_alloc(struct perf_event *event, int nr_pages) { struct perf_mmap_data *data; unsigned long size; @@ -2172,19 +2176,10 @@ static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) goto fail_data_pages; } + data->data_order = 0; data->nr_pages = nr_pages; - atomic_set(&data->lock, -1); - if (event->attr.watermark) { - data->watermark = min_t(long, PAGE_SIZE * nr_pages, - event->attr.wakeup_watermark); - } - if (!data->watermark) - data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); - - rcu_assign_pointer(event->data, data); - - return 0; + return data; fail_data_pages: for (i--; i >= 0; i--) @@ -2196,7 +2191,7 @@ fail_user_page: kfree(data); fail: - return -ENOMEM; + return NULL; } static void perf_mmap_free_page(unsigned long addr) @@ -2207,28 +2202,170 @@ static void perf_mmap_free_page(unsigned long addr) __free_page(page); } -static void __perf_mmap_data_free(struct rcu_head *rcu_head) +static void perf_mmap_data_free(struct perf_mmap_data *data) { - struct perf_mmap_data *data; int i; - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - perf_mmap_free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) perf_mmap_free_page((unsigned long)data->data_pages[i]); + kfree(data); +} + +#else + +/* + * Back perf_mmap() with vmalloc memory. + * + * Required for architectures that have d-cache aliasing issues. + */ + +static struct page * +perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +{ + if (pgoff > (1UL << data->data_order)) + return NULL; + + return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); +} + +static void perf_mmap_unmark_page(void *addr) +{ + struct page *page = vmalloc_to_page(addr); + + page->mapping = NULL; +} + +static void perf_mmap_data_free_work(struct work_struct *work) +{ + struct perf_mmap_data *data; + void *base; + int i, nr; + data = container_of(work, struct perf_mmap_data, work); + nr = 1 << data->data_order; + + base = data->user_page; + for (i = 0; i < nr + 1; i++) + perf_mmap_unmark_page(base + (i * PAGE_SIZE)); + + vfree(base); + kfree(data); +} + +static void perf_mmap_data_free(struct perf_mmap_data *data) +{ + schedule_work(&data->work); +} + +static struct perf_mmap_data * +perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + void *all_buf; + + WARN_ON(atomic_read(&event->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + INIT_WORK(&data->work, perf_mmap_data_free_work); + + all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); + if (!all_buf) + goto fail_all_buf; + + data->user_page = all_buf; + data->data_pages[0] = all_buf + PAGE_SIZE; + data->data_order = ilog2(nr_pages); + data->nr_pages = 1; + + return data; + +fail_all_buf: kfree(data); + +fail: + return NULL; +} + +#endif + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_event *event = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; + + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) + goto unlock; + + vmf->page = perf_mmap_to_page(data, vmf->pgoff); + if (!vmf->page) + goto unlock; + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static void +perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) +{ + long max_size = perf_data_size(data); + + atomic_set(&data->lock, -1); + + if (event->attr.watermark) { + data->watermark = min_t(long, max_size, + event->attr.wakeup_watermark); + } + + if (!data->watermark) + data->watermark = max_size / 2; + + + rcu_assign_pointer(event->data, data); +} + +static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data; + + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + perf_mmap_data_free(data); } -static void perf_mmap_data_free(struct perf_event *event) +static void perf_mmap_data_release(struct perf_event *event) { struct perf_mmap_data *data = event->data; WARN_ON(atomic_read(&event->mmap_count)); rcu_assign_pointer(event->data, NULL); - call_rcu(&data->rcu_head, __perf_mmap_data_free); + call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -2244,16 +2381,17 @@ static void perf_mmap_close(struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + unsigned long size = perf_data_size(event->data); struct user_struct *user = current_user(); - atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm); + atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->locked_vm -= event->data->nr_locked; - perf_mmap_data_free(event); + perf_mmap_data_release(event); mutex_unlock(&event->mmap_mutex); } } -static struct vm_operations_struct perf_mmap_vmops = { +static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, .fault = perf_mmap_fault, @@ -2266,6 +2404,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; + struct perf_mmap_data *data; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; @@ -2328,10 +2467,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } WARN_ON(event->data); - ret = perf_mmap_data_alloc(event, nr_pages); - if (ret) + + data = perf_mmap_data_alloc(event, nr_pages); + ret = -ENOMEM; + if (!data) goto unlock; + ret = 0; + perf_mmap_data_init(event, data); + atomic_set(&event->mmap_count, 1); atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; @@ -2519,7 +2663,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, if (!data->writable) return true; - mask = (data->nr_pages << PAGE_SHIFT) - 1; + mask = perf_data_size(data) - 1; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -2558,20 +2702,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle) static void perf_output_lock(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; - int cpu; + int cur, cpu = get_cpu(); handle->locked = 0; - local_irq_save(handle->flags); - cpu = smp_processor_id(); - - if (in_nmi() && atomic_read(&data->lock) == cpu) - return; + for (;;) { + cur = atomic_cmpxchg(&data->lock, -1, cpu); + if (cur == -1) { + handle->locked = 1; + break; + } + if (cur == cpu) + break; - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) cpu_relax(); - - handle->locked = 1; + } } static void perf_output_unlock(struct perf_output_handle *handle) @@ -2617,14 +2762,14 @@ again: if (atomic_xchg(&data->wakeup, 0)) perf_output_wakeup(handle); out: - local_irq_restore(handle->flags); + put_cpu(); } void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { unsigned int pages_mask; - unsigned int offset; + unsigned long offset; unsigned int size; void **pages; @@ -2633,12 +2778,14 @@ void perf_output_copy(struct perf_output_handle *handle, pages = handle->data->data_pages; do { - unsigned int page_offset; + unsigned long page_offset; + unsigned long page_size; int nr; nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + page_size = 1UL << (handle->data->data_order + PAGE_SHIFT); + page_offset = offset & (page_size - 1); + size = min_t(unsigned int, page_size - page_offset, len); memcpy(pages[nr] + page_offset, buf, size); @@ -3126,15 +3273,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_task_match(event)) perf_event_task_output(event, task_event); } - rcu_read_unlock(); } static void perf_event_task_event(struct perf_task_event *task_event) @@ -3142,11 +3284,11 @@ static void perf_event_task_event(struct perf_task_event *task_event) struct perf_cpu_context *cpuctx; struct perf_event_context *ctx = task_event->task_ctx; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); if (!ctx) ctx = rcu_dereference(task_event->task->perf_event_ctxp); if (ctx) @@ -3238,15 +3380,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_comm_match(event)) perf_event_comm_output(event, comm_event); } - rcu_read_unlock(); } static void perf_event_comm_event(struct perf_comm_event *comm_event) @@ -3257,7 +3394,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) char comm[TASK_COMM_LEN]; memset(comm, 0, sizeof(comm)); - strncpy(comm, comm_event->task->comm, sizeof(comm)); + strlcpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -3265,11 +3402,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3362,15 +3499,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_mmap_match(event, mmap_event)) perf_event_mmap_output(event, mmap_event); } - rcu_read_unlock(); } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) @@ -3426,11 +3558,11 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3569,7 +3701,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, perf_event_disable(event); } - perf_event_output(event, nmi, data, regs); + if (event->overflow_handler) + event->overflow_handler(event, nmi, data, regs); + else + perf_event_output(event, nmi, data, regs); + return ret; } @@ -3614,16 +3750,16 @@ again: return nr; } -static void perf_swevent_overflow(struct perf_event *event, +static void perf_swevent_overflow(struct perf_event *event, u64 overflow, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; int throttle = 0; - u64 overflow; data->period = event->hw.last_period; - overflow = perf_swevent_set_period(event); + if (!overflow) + overflow = perf_swevent_set_period(event); if (hwc->interrupts == MAX_INTERRUPTS) return; @@ -3656,14 +3792,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, atomic64_add(nr, &event->count); + if (!regs) + return; + if (!hwc->sample_period) return; - if (!regs) + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) + return perf_swevent_overflow(event, 1, nmi, data, regs); + + if (atomic64_add_negative(nr, &hwc->period_left)) return; - if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swevent_overflow(event, nmi, data, regs); + perf_swevent_overflow(event, 0, nmi, data, regs); } static int perf_swevent_is_counting(struct perf_event *event) @@ -3696,25 +3837,44 @@ static int perf_swevent_is_counting(struct perf_event *event) return 1; } +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data); + +static int perf_exclude_event(struct perf_event *event, + struct pt_regs *regs) +{ + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 1; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 1; + } + + return 0; +} + static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, - u32 event_id, struct pt_regs *regs) + u32 event_id, + struct perf_sample_data *data, + struct pt_regs *regs) { if (!perf_swevent_is_counting(event)) return 0; if (event->attr.type != type) return 0; + if (event->attr.config != event_id) return 0; - if (regs) { - if (event->attr.exclude_user && user_mode(regs)) - return 0; + if (perf_exclude_event(event, regs)) + return 0; - if (event->attr.exclude_kernel && !user_mode(regs)) - return 0; - } + if (event->attr.type == PERF_TYPE_TRACEPOINT && + !perf_tp_event_match(event, data)) + return 0; return 1; } @@ -3727,49 +3887,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_swevent_match(event, type, event_id, regs)) + if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } - rcu_read_unlock(); } -static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) +int perf_swevent_get_recursion_context(void) { + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int rctx; + if (in_nmi()) - return &cpuctx->recursion[3]; + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; - if (in_irq()) - return &cpuctx->recursion[2]; + if (cpuctx->recursion[rctx]) { + put_cpu_var(perf_cpu_context); + return -1; + } - if (in_softirq()) - return &cpuctx->recursion[1]; + cpuctx->recursion[rctx]++; + barrier(); - return &cpuctx->recursion[0]; + return rctx; +} +EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); + +void perf_swevent_put_recursion_context(int rctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + barrier(); + cpuctx->recursion[rctx]--; + put_cpu_var(perf_cpu_context); } +EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); static void do_perf_sw_event(enum perf_type_id type, u32 event_id, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swevent_recursion_context(cpuctx); + struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - if (*recursion) - goto out; - - (*recursion)++; - barrier(); - + cpuctx = &__get_cpu_var(perf_cpu_context); + rcu_read_lock(); perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, nr, nmi, data, regs); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3778,23 +3948,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, if (ctx) perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); rcu_read_unlock(); - - barrier(); - (*recursion)--; - -out: - put_cpu_var(perf_cpu_context); } void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - struct perf_sample_data data = { - .addr = addr, - }; + struct perf_sample_data data; + int rctx; - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, - &data, regs); + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + return; + + data.addr = addr; + data.raw = NULL; + + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); + + perf_swevent_put_recursion_context(rctx); } static void perf_swevent_read(struct perf_event *event) @@ -3839,6 +4010,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event->pmu->read(event); data.addr = 0; + data.raw = NULL; + data.period = event->hw.last_period; regs = get_irq_regs(); /* * In case we exclude kernel IPs or are somehow not in interrupt @@ -3849,8 +4022,9 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) { - if (perf_event_overflow(event, 0, &data, regs)) - ret = HRTIMER_NORESTART; + if (!(event->attr.exclude_idle && current->pid == 0)) + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; } period = max_t(u64, 10000, event->hw.sample_period); @@ -3859,6 +4033,42 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) return ret; } +static void perf_swevent_start_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period; + + if (hwc->remaining) { + if (hwc->remaining < 0) + period = 10000; + else + period = hwc->remaining; + hwc->remaining = 0; + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } +} + +static void perf_swevent_cancel_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->sample_period) { + ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); + hwc->remaining = ktime_to_ns(remaining); + + hrtimer_cancel(&hwc->hrtimer); + } +} + /* * Software event: cpu wall time clock */ @@ -3870,8 +4080,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event) u64 now; now = cpu_clock(cpu); - prev = atomic64_read(&event->hw.prev_count); - atomic64_set(&event->hw.prev_count, now); + prev = atomic64_xchg(&event->hw.prev_count, now); atomic64_add(now - prev, &event->count); } @@ -3881,22 +4090,14 @@ static int cpu_clock_perf_event_enable(struct perf_event *event) int cpu = raw_smp_processor_id(); atomic64_set(&hwc->prev_count, cpu_clock(cpu)); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } + perf_swevent_start_hrtimer(event); return 0; } static void cpu_clock_perf_event_disable(struct perf_event *event) { - if (event->hw.sample_period) - hrtimer_cancel(&event->hw.hrtimer); + perf_swevent_cancel_hrtimer(event); cpu_clock_perf_event_update(event); } @@ -3933,22 +4134,15 @@ static int task_clock_perf_event_enable(struct perf_event *event) now = event->ctx->time; atomic64_set(&hwc->prev_count, now); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } + + perf_swevent_start_hrtimer(event); return 0; } static void task_clock_perf_event_disable(struct perf_event *event) { - if (event->hw.sample_period) - hrtimer_cancel(&event->hw.hrtimer); + perf_swevent_cancel_hrtimer(event); task_clock_perf_event_update(event, event->ctx->time); } @@ -3976,6 +4170,7 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE + void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { @@ -3994,13 +4189,21 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record, if (!regs) regs = task_pt_regs(current); + /* Trace events already protected against recursion */ do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data, regs); } EXPORT_SYMBOL_GPL(perf_tp_event); -extern int ftrace_profile_enable(int); -extern void ftrace_profile_disable(int); +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} static void tp_perf_event_destroy(struct perf_event *event) { @@ -4025,11 +4228,93 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) return &perf_ops_generic; } + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + + kfree(filter_str); + return ret; +} + +static void perf_event_free_filter(struct perf_event *event) +{ + ftrace_profile_free_filter(event); +} + #else + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + return 1; +} + static const struct pmu *tp_perf_event_init(struct perf_event *event) { return NULL; } + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + return -ENOENT; +} + +static void perf_event_free_filter(struct perf_event *event) +{ +} + +#endif /* CONFIG_EVENT_PROFILE */ + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +static void bp_perf_event_destroy(struct perf_event *event) +{ + release_bp_slot(event); +} + +static const struct pmu *bp_perf_event_init(struct perf_event *bp) +{ + int err; + + err = register_perf_hw_breakpoint(bp); + if (err) + return ERR_PTR(err); + + bp->destroy = bp_perf_event_destroy; + + return &perf_ops_bp; +} + +void perf_bp_event(struct perf_event *bp, void *data) +{ + struct perf_sample_data sample; + struct pt_regs *regs = data; + + sample.raw = NULL; + sample.addr = bp->attr.bp_addr; + + if (!perf_exclude_event(bp, regs)) + perf_swevent_add(bp, 1, 1, &sample, regs); +} +#else +static const struct pmu *bp_perf_event_init(struct perf_event *bp) +{ + return NULL; +} + +void perf_bp_event(struct perf_event *bp, void *regs) +{ +} #endif atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; @@ -4076,6 +4361,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) case PERF_COUNT_SW_PAGE_FAULTS_MAJ: case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: + case PERF_COUNT_SW_ALIGNMENT_FAULTS: + case PERF_COUNT_SW_EMULATION_FAULTS: if (!event->parent) { atomic_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; @@ -4096,6 +4383,7 @@ perf_event_alloc(struct perf_event_attr *attr, struct perf_event_context *ctx, struct perf_event *group_leader, struct perf_event *parent_event, + perf_overflow_handler_t overflow_handler, gfp_t gfpflags) { const struct pmu *pmu; @@ -4138,6 +4426,11 @@ perf_event_alloc(struct perf_event_attr *attr, event->state = PERF_EVENT_STATE_INACTIVE; + if (!overflow_handler && parent_event) + overflow_handler = parent_event->overflow_handler; + + event->overflow_handler = overflow_handler; + if (attr->disabled) event->state = PERF_EVENT_STATE_OFF; @@ -4172,6 +4465,11 @@ perf_event_alloc(struct perf_event_attr *attr, pmu = tp_perf_event_init(event); break; + case PERF_TYPE_BREAKPOINT: + pmu = bp_perf_event_init(event); + break; + + default: break; } @@ -4284,7 +4582,7 @@ err_size: goto out; } -int perf_event_set_output(struct perf_event *event, int output_fd) +static int perf_event_set_output(struct perf_event *event, int output_fd) { struct perf_event *output_event = NULL; struct file *output_file = NULL; @@ -4414,7 +4712,7 @@ SYSCALL_DEFINE5(perf_event_open, } event = perf_event_alloc(&attr, cpu, ctx, group_leader, - NULL, GFP_KERNEL); + NULL, NULL, GFP_KERNEL); err = PTR_ERR(event); if (IS_ERR(event)) goto err_put_context; @@ -4462,6 +4760,61 @@ err_put_context: return err; } +/** + * perf_event_create_kernel_counter + * + * @attr: attributes of the counter to create + * @cpu: cpu in which the counter is bound + * @pid: task to profile + */ +struct perf_event * +perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, + pid_t pid, + perf_overflow_handler_t overflow_handler) +{ + struct perf_event *event; + struct perf_event_context *ctx; + int err; + + /* + * Get the target context (task or percpu): + */ + + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_exit; + } + + event = perf_event_alloc(attr, cpu, ctx, NULL, + NULL, overflow_handler, GFP_KERNEL); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err_put_context; + } + + event->filp = NULL; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + mutex_unlock(&ctx->mutex); + + event->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); + + return event; + + err_put_context: + put_ctx(ctx); + err_exit: + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); + /* * inherit a event from parent task to child task: */ @@ -4487,7 +4840,7 @@ inherit_event(struct perf_event *parent_event, child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child_ctx, group_leader, parent_event, - GFP_KERNEL); + NULL, GFP_KERNEL); if (IS_ERR(child_event)) return child_event; get_ctx(child_ctx); @@ -4505,6 +4858,8 @@ inherit_event(struct perf_event *parent_event, if (parent_event->attr.freq) child_event->hw.sample_period = parent_event->hw.sample_period; + child_event->overflow_handler = parent_event->overflow_handler; + /* * Link it up in the child's context: */ @@ -4594,7 +4949,6 @@ __perf_event_exit_task(struct perf_event *child_event, { struct perf_event *parent_event; - update_event_times(child_event); perf_event_remove_from_context(child_event); parent_event = child_event->parent; @@ -4638,7 +4992,7 @@ void perf_event_exit_task(struct task_struct *child) * reading child->perf_event_ctxp, we wait until it has * incremented the context's refcount before we do put_ctx below. */ - spin_lock(&child_ctx->lock); + raw_spin_lock(&child_ctx->lock); child->perf_event_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get @@ -4646,7 +5000,8 @@ void perf_event_exit_task(struct task_struct *child) * the events from it. */ unclone_ctx(child_ctx); - spin_unlock_irqrestore(&child_ctx->lock, flags); + update_context_time(child_ctx); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* * Report the task dead after unscheduling the events so that we @@ -4729,7 +5084,7 @@ again: */ int perf_event_init_task(struct task_struct *child) { - struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *child_ctx = NULL, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; @@ -4745,20 +5100,6 @@ int perf_event_init_task(struct task_struct *child) return 0; /* - * This is executed from the parent task context, so inherit - * events that have been marked for cloning. - * First allocate and initialize a context for the child. - */ - - child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); - if (!child_ctx) - return -ENOMEM; - - __perf_event_init_context(child_ctx, child); - child->perf_event_ctxp = child_ctx; - get_task_struct(child); - - /* * If the parent's context is a clone, pin it so it won't get * swapped under us. */ @@ -4781,15 +5122,33 @@ int perf_event_init_task(struct task_struct *child) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) { - if (event != event->group_leader) - continue; + list_for_each_entry(event, &parent_ctx->group_list, group_entry) { if (!event->attr.inherit) { inherited_all = 0; continue; } + if (!child->perf_event_ctxp) { + /* + * This is executed from the parent task context, so + * inherit events that have been marked for cloning. + * First allocate and initialize a context for the + * child. + */ + + child_ctx = kzalloc(sizeof(struct perf_event_context), + GFP_KERNEL); + if (!child_ctx) { + ret = -ENOMEM; + goto exit; + } + + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + } + ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) { @@ -4818,6 +5177,7 @@ int perf_event_init_task(struct task_struct *child) get_ctx(child_ctx->parent_ctx); } +exit: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); @@ -4932,11 +5292,11 @@ perf_set_reserve_percpu(struct sysdev_class *class, perf_reserved_percpu = val; for_each_online_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_irq(&cpuctx->ctx.lock); + raw_spin_lock_irq(&cpuctx->ctx.lock); mpt = min(perf_max_events - cpuctx->ctx.nr_events, perf_max_events - perf_reserved_percpu); cpuctx->max_pertask = mpt; - spin_unlock_irq(&cpuctx->ctx.lock); + raw_spin_unlock_irq(&cpuctx->ctx.lock); } spin_unlock(&perf_resource_lock); diff --git a/kernel/pid.c b/kernel/pid.c index d3f722d20f9..2e17c9c92cb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -141,11 +141,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) * installing it: */ spin_lock_irq(&pidmap_lock); - if (map->page) - kfree(page); - else + if (!map->page) { map->page = page; + page = NULL; + } spin_unlock_irq(&pidmap_lock); + kfree(page); if (unlikely(!map->page)) break; } @@ -268,12 +269,11 @@ struct pid *alloc_pid(struct pid_namespace *ns) for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); + upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - for (i = ns->level; i >= 0; i--) { - upid = &pid->numbers[i]; + for ( ; upid >= pid->numbers; --upid) hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); - } spin_unlock_irq(&pidmap_lock); out: diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 821722ae58a..86b3796b043 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & CLONE_THREAD) + if (flags & (CLONE_THREAD|CLONE_PARENT)) return ERR_PTR(-EINVAL); return create_pid_namespace(old_ns); } diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index dfdec524d1b..3db49b9ca37 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -29,7 +29,6 @@ #include <linux/pm_qos_params.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/time.h> @@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) } EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); -#define PID_NAME_LEN sizeof("process_1234567890") -static char name[PID_NAME_LEN]; +#define PID_NAME_LEN 32 static int pm_qos_power_open(struct inode *inode, struct file *filp) { int ret; long pm_qos_class; + char name[PID_NAME_LEN]; - lock_kernel(); pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { filp->private_data = (void *)pm_qos_class; - sprintf(name, "process_%d", current->pid); + snprintf(name, PID_NAME_LEN, "process_%d", current->pid); ret = pm_qos_add_requirement(pm_qos_class, name, PM_QOS_DEFAULT_VALUE); - if (ret >= 0) { - unlock_kernel(); + if (ret >= 0) return 0; - } } - unlock_kernel(); - return -EPERM; } static int pm_qos_power_release(struct inode *inode, struct file *filp) { int pm_qos_class; + char name[PID_NAME_LEN]; pm_qos_class = (long)filp->private_data; - sprintf(name, "process_%d", current->pid); + snprintf(name, PID_NAME_LEN, "process_%d", current->pid); pm_qos_remove_requirement(pm_qos_class, name); return 0; @@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, { s32 value; int pm_qos_class; + char name[PID_NAME_LEN]; pm_qos_class = (long)filp->private_data; if (count != sizeof(s32)) return -EINVAL; if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; - sprintf(name, "process_%d", current->pid); + snprintf(name, PID_NAME_LEN, "process_%d", current->pid); pm_qos_update_requirement(pm_qos_class, name, value); return sizeof(s32); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 5c9dc228747..438ff452351 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) /* * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. - * This is called from sys_timer_create with the new timer already locked. + * This is called from sys_timer_create() and do_cpu_nanosleep() with the + * new timer already all-zeros initialized. */ int posix_cpu_timer_create(struct k_itimer *new_timer) { @@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) return -EINVAL; INIT_LIST_HEAD(&new_timer->it.cpu.entry); - new_timer->it.cpu.incr.sched = 0; - new_timer->it.cpu.expires.sched = 0; read_lock(&tasklist_lock); if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c3b81c30e5d..43191815f87 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_PM_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/console.c b/kernel/power/console.c index 5187136fe1d..218e5af9015 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -6,7 +6,7 @@ #include <linux/vt_kern.h> #include <linux/kbd_kern.h> -#include <linux/console.h> +#include <linux/vt.h> #include <linux/module.h> #include "power.h" @@ -21,8 +21,7 @@ int pm_prepare_console(void) if (orig_fgconsole < 0) return 1; - orig_kmsg = kmsg_redirect; - kmsg_redirect = SUSPEND_CONSOLE; + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); return 0; } @@ -30,7 +29,7 @@ void pm_restore_console(void) { if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); - kmsg_redirect = orig_kmsg; + vt_kmsg_redirect(orig_kmsg); } } #endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 04b3a83d686..bbfe472d752 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -32,6 +32,7 @@ static int noresume = 0; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; +int in_suspend __nosavedata = 0; enum { HIBERNATION_INVALID, @@ -202,6 +203,35 @@ static void platform_recover(int platform_mode) } /** + * swsusp_show_speed - print the time elapsed between two events. + * @start: Starting event. + * @stop: Final event. + * @nr_pages - number of pages processed between @start and @stop + * @msg - introductory message to print + */ + +void swsusp_show_speed(struct timeval *start, struct timeval *stop, + unsigned nr_pages, char *msg) +{ + s64 elapsed_centisecs64; + int centisecs; + int k; + int kps; + + elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); + do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); + centisecs = elapsed_centisecs64; + if (centisecs == 0) + centisecs = 1; /* avoid div-by-zero */ + k = nr_pages * (PAGE_SIZE / 1024); + kps = (k * 100) / centisecs; + printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", + msg, k, + centisecs / 100, centisecs % 100, + kps / 1000, (kps % 1000) / 10); +} + +/** * create_image - freeze devices that need to be frozen with interrupts * off, create the hibernation image and thaw those devices. Control * reappears in this routine after a restore. @@ -693,21 +723,22 @@ static int software_resume(void) /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; + swsusp_close(FMODE_READ); goto Unlock; } pm_prepare_console(); error = pm_notifier_call_chain(PM_RESTORE_PREPARE); if (error) - goto Finish; + goto close_finish; error = usermodehelper_disable(); if (error) - goto Finish; + goto close_finish; error = create_basic_memory_bitmaps(); if (error) - goto Finish; + goto close_finish; pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); @@ -719,6 +750,7 @@ static int software_resume(void) pr_debug("PM: Reading hibernation image.\n"); error = swsusp_read(&flags); + swsusp_close(FMODE_READ); if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); @@ -737,6 +769,9 @@ static int software_resume(void) mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); return error; +close_finish: + swsusp_close(FMODE_READ); + goto Finish; } late_initcall(software_resume); diff --git a/kernel/power/main.c b/kernel/power/main.c index 347d2cc88cd..0998c713905 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -220,6 +220,7 @@ static struct attribute_group attr_group = { #ifdef CONFIG_PM_RUNTIME struct workqueue_struct *pm_wq; +EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueue(void) { diff --git a/kernel/power/process.c b/kernel/power/process.c index cc2e55373b6..5ade1bdcf36 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/syscalls.h> #include <linux/freezer.h> +#include <linux/delay.h> /* * Timeout for stopping processes @@ -41,7 +42,7 @@ static int try_to_freeze_tasks(bool sig_only) do_gettimeofday(&start); end_time = jiffies + TIMEOUT; - do { + while (true) { todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -62,10 +63,15 @@ static int try_to_freeze_tasks(bool sig_only) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); - yield(); /* Yield is okay here */ - if (time_after(jiffies, end_time)) + if (!todo || time_after(jiffies, end_time)) break; - } while (todo); + + /* + * We need to retry, but first give the freezing tasks some + * time to enter the regrigerator. + */ + msleep(10); + } do_gettimeofday(&end); elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 17d8bb1acf9..25596e450ac 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -19,7 +19,7 @@ * The time it takes is system-specific though, so when we test this * during system bootup we allow a LOT of time. */ -#define TEST_SUSPEND_SECONDS 5 +#define TEST_SUSPEND_SECONDS 10 static unsigned long suspend_test_start_time; @@ -49,7 +49,8 @@ void suspend_test_finish(const char *label) * has some performance issues. The stack dump of a WARN_ON * is more likely to get the right attention than a printk... */ - WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), + "Component: %s, time: %u\n", label, msec); } /* diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8ba052c86d4..09b2b0ae9e9 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -13,7 +13,6 @@ #include <linux/module.h> #include <linux/file.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/genhd.h> @@ -39,6 +38,107 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; +/** + * The following functions are used for tracing the allocated + * swap pages, so that they can be freed in case of an error. + */ + +struct swsusp_extent { + struct rb_node node; + unsigned long start; + unsigned long end; +}; + +static struct rb_root swsusp_extents = RB_ROOT; + +static int swsusp_extents_insert(unsigned long swap_offset) +{ + struct rb_node **new = &(swsusp_extents.rb_node); + struct rb_node *parent = NULL; + struct swsusp_extent *ext; + + /* Figure out where to put the new node */ + while (*new) { + ext = container_of(*new, struct swsusp_extent, node); + parent = *new; + if (swap_offset < ext->start) { + /* Try to merge */ + if (swap_offset == ext->start - 1) { + ext->start--; + return 0; + } + new = &((*new)->rb_left); + } else if (swap_offset > ext->end) { + /* Try to merge */ + if (swap_offset == ext->end + 1) { + ext->end++; + return 0; + } + new = &((*new)->rb_right); + } else { + /* It already is in the tree */ + return -EINVAL; + } + } + /* Add the new node and rebalance the tree. */ + ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); + if (!ext) + return -ENOMEM; + + ext->start = swap_offset; + ext->end = swap_offset; + rb_link_node(&ext->node, parent, new); + rb_insert_color(&ext->node, &swsusp_extents); + return 0; +} + +/** + * alloc_swapdev_block - allocate a swap page and register that it has + * been allocated, so that it can be freed in case of an error. + */ + +sector_t alloc_swapdev_block(int swap) +{ + unsigned long offset; + + offset = swp_offset(get_swap_page_of_type(swap)); + if (offset) { + if (swsusp_extents_insert(offset)) + swap_free(swp_entry(swap, offset)); + else + return swapdev_block(swap, offset); + } + return 0; +} + +/** + * free_all_swap_pages - free swap pages allocated for saving image data. + * It also frees the extents used to register which swap entres had been + * allocated. + */ + +void free_all_swap_pages(int swap) +{ + struct rb_node *node; + + while ((node = swsusp_extents.rb_node)) { + struct swsusp_extent *ext; + unsigned long offset; + + ext = container_of(node, struct swsusp_extent, node); + rb_erase(node, &swsusp_extents); + for (offset = ext->start; offset <= ext->end; offset++) + swap_free(swp_entry(swap, offset)); + + kfree(ext); + } +} + +int swsusp_swap_in_use(void) +{ + return (swsusp_extents.rb_node != NULL); +} + /* * General things */ @@ -315,7 +415,6 @@ static int save_image(struct swap_map_handle *handle, { unsigned int m; int ret; - int error = 0; int nr_pages; int err2; struct bio *bio; @@ -330,26 +429,27 @@ static int save_image(struct swap_map_handle *handle, nr_pages = 0; bio = NULL; do_gettimeofday(&start); - do { + while (1) { ret = snapshot_read_next(snapshot, PAGE_SIZE); - if (ret > 0) { - error = swap_write_page(handle, data_of(*snapshot), - &bio); - if (error) - break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - } while (ret > 0); + if (ret <= 0) + break; + ret = swap_write_page(handle, data_of(*snapshot), &bio); + if (ret) + break; + if (!(nr_pages % m)) + printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } err2 = wait_on_bio_chain(&bio); do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) - printk("\b\b\b\bdone\n"); + if (!ret) + ret = err2; + if (!ret) + printk(KERN_CONT "\b\b\b\bdone\n"); + else + printk(KERN_CONT "\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); - return error; + return ret; } /** @@ -537,7 +637,8 @@ static int load_image(struct swap_map_handle *handle, snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) error = -ENODATA; - } + } else + printk("\n"); swsusp_show_speed(&start, &stop, nr_to_read, "Read"); return error; } @@ -573,8 +674,6 @@ int swsusp_read(unsigned int *flags_p) error = load_image(&handle, &snapshot, header->pages - 1); release_swap_reader(&handle); - blkdev_put(resume_bdev, FMODE_READ); - if (!error) pr_debug("PM: Image successfully loaded\n"); else @@ -597,7 +696,7 @@ int swsusp_check(void) error = bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) - return error; + goto put; if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); @@ -605,8 +704,10 @@ int swsusp_check(void) error = bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { - return -EINVAL; + error = -EINVAL; } + +put: if (error) blkdev_put(resume_bdev, FMODE_READ); else diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 6a07f4dbf2f..5b3601bd189 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -56,133 +56,3 @@ #include "power.h" int in_suspend __nosavedata = 0; - -/** - * The following functions are used for tracing the allocated - * swap pages, so that they can be freed in case of an error. - */ - -struct swsusp_extent { - struct rb_node node; - unsigned long start; - unsigned long end; -}; - -static struct rb_root swsusp_extents = RB_ROOT; - -static int swsusp_extents_insert(unsigned long swap_offset) -{ - struct rb_node **new = &(swsusp_extents.rb_node); - struct rb_node *parent = NULL; - struct swsusp_extent *ext; - - /* Figure out where to put the new node */ - while (*new) { - ext = container_of(*new, struct swsusp_extent, node); - parent = *new; - if (swap_offset < ext->start) { - /* Try to merge */ - if (swap_offset == ext->start - 1) { - ext->start--; - return 0; - } - new = &((*new)->rb_left); - } else if (swap_offset > ext->end) { - /* Try to merge */ - if (swap_offset == ext->end + 1) { - ext->end++; - return 0; - } - new = &((*new)->rb_right); - } else { - /* It already is in the tree */ - return -EINVAL; - } - } - /* Add the new node and rebalance the tree. */ - ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); - if (!ext) - return -ENOMEM; - - ext->start = swap_offset; - ext->end = swap_offset; - rb_link_node(&ext->node, parent, new); - rb_insert_color(&ext->node, &swsusp_extents); - return 0; -} - -/** - * alloc_swapdev_block - allocate a swap page and register that it has - * been allocated, so that it can be freed in case of an error. - */ - -sector_t alloc_swapdev_block(int swap) -{ - unsigned long offset; - - offset = swp_offset(get_swap_page_of_type(swap)); - if (offset) { - if (swsusp_extents_insert(offset)) - swap_free(swp_entry(swap, offset)); - else - return swapdev_block(swap, offset); - } - return 0; -} - -/** - * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entres had been - * allocated. - */ - -void free_all_swap_pages(int swap) -{ - struct rb_node *node; - - while ((node = swsusp_extents.rb_node)) { - struct swsusp_extent *ext; - unsigned long offset; - - ext = container_of(node, struct swsusp_extent, node); - rb_erase(node, &swsusp_extents); - for (offset = ext->start; offset <= ext->end; offset++) - swap_free(swp_entry(swap, offset)); - - kfree(ext); - } -} - -int swsusp_swap_in_use(void) -{ - return (swsusp_extents.rb_node != NULL); -} - -/** - * swsusp_show_speed - print the time elapsed between two events represented by - * @start and @stop - * - * @nr_pages - number of pages processed between @start and @stop - * @msg - introductory message to print - */ - -void swsusp_show_speed(struct timeval *start, struct timeval *stop, - unsigned nr_pages, char *msg) -{ - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; - - elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); - do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); - centisecs = elapsed_centisecs64; - if (centisecs == 0) - centisecs = 1; /* avoid div-by-zero */ - k = nr_pages * (PAGE_SIZE / 1024); - kps = (k * 100) / centisecs; - printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", - msg, k, - centisecs / 100, centisecs % 100, - kps / 1000, (kps % 1000) / 10); -} diff --git a/kernel/printk.c b/kernel/printk.c index 2a564570f82..1ded8e7dd19 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -33,6 +33,7 @@ #include <linux/bootmem.h> #include <linux/syscalls.h> #include <linux/kexec.h> +#include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <asm/uaccess.h> @@ -1377,11 +1378,11 @@ late_initcall(disable_boot_consoles); */ DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); -int printk_ratelimit(void) +int __printk_ratelimit(const char *func) { - return __ratelimit(&printk_ratelimit_state); + return ___ratelimit(&printk_ratelimit_state, func); } -EXPORT_SYMBOL(printk_ratelimit); +EXPORT_SYMBOL(__printk_ratelimit); /** * printk_timed_ratelimit - caller-controlled printk ratelimiting diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 307c285af59..23bd09cd042 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh) * or self-reaping. Do notification now if it would have happened earlier. * If it should reap itself, return true. * - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. + * If it's our own child, there is no notification to do. But if our normal + * children self-reap, then this child was prevented by ptrace and we must + * reap it now, in that case we must also wake up sub-threads sleeping in + * do_wait(). */ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) { @@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) if (!task_detached(p) && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) + else if (ignoring_children(tracer->sighand)) { + __wake_up_parent(p, tracer); p->exit_signal = -1; + } } if (task_detached(p)) { /* Mark it as in the process of being reaped. */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 37ac4548308..9b7fd472387 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -44,23 +44,13 @@ #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/module.h> -#include <linux/kernel_stat.h> -enum rcu_barrier { - RCU_BARRIER_STD, - RCU_BARRIER_BH, - RCU_BARRIER_SCHED, -}; - -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -static atomic_t rcu_barrier_cpu_count; -static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; -int rcu_scheduler_active __read_mostly; - -static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); -static struct rcu_head rcu_migrate_head[3]; -static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_key; +struct lockdep_map rcu_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +EXPORT_SYMBOL_GPL(rcu_lock_map); +#endif /* * Awaken the corresponding synchronize_rcu() instance now that a @@ -73,241 +63,3 @@ void wakeme_after_rcu(struct rcu_head *head) rcu = container_of(head, struct rcu_synchronize, head); complete(&rcu->completion); } - -#ifdef CONFIG_TREE_PREEMPT_RCU - -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void synchronize_rcu(void) -{ - struct rcu_synchronize rcu; - - if (!rcu_scheduler_active) - return; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - -/** - * synchronize_sched - wait until an rcu-sched grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-sched - * grace period has elapsed, in other words after all currently executing - * rcu-sched read-side critical sections have completed. These read-side - * critical sections are delimited by rcu_read_lock_sched() and - * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), - * local_irq_disable(), and so on may be used in place of - * rcu_read_lock_sched(). - * - * This means that all preempt_disable code sequences, including NMI and - * hardware-interrupt handlers, in progress on entry will have completed - * before this primitive returns. However, this does not guarantee that - * softirq handlers will have completed, since in some kernels, these - * handlers can run in process context, and can block. - * - * This primitive provides the guarantees made by the (now removed) - * synchronize_kernel() API. In contrast, synchronize_rcu() only - * guarantees that rcu_read_lock() sections will have completed. - * In "classic RCU", these two guarantees happen to be one and - * the same, but can differ in realtime RCU implementations. - */ -void synchronize_sched(void) -{ - struct rcu_synchronize rcu; - - if (rcu_blocking_is_gp()) - return; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_sched(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); -} -EXPORT_SYMBOL_GPL(synchronize_sched); - -/** - * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. - * - * Control will return to the caller some time after a full rcu_bh grace - * period has elapsed, in other words after all currently executing rcu_bh - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), - * and may be nested. - */ -void synchronize_rcu_bh(void) -{ - struct rcu_synchronize rcu; - - if (rcu_blocking_is_gp()) - return; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_bh(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_bh); - -static void rcu_barrier_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); -} - -/* - * Called with preemption disabled, and from cross-cpu IRQ context. - */ -static void rcu_barrier_func(void *type) -{ - int cpu = smp_processor_id(); - struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - - atomic_inc(&rcu_barrier_cpu_count); - switch ((enum rcu_barrier)type) { - case RCU_BARRIER_STD: - call_rcu(head, rcu_barrier_callback); - break; - case RCU_BARRIER_BH: - call_rcu_bh(head, rcu_barrier_callback); - break; - case RCU_BARRIER_SCHED: - call_rcu_sched(head, rcu_barrier_callback); - break; - } -} - -static inline void wait_migrated_callbacks(void) -{ - wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); - smp_mb(); /* In case we didn't sleep. */ -} - -/* - * Orchestrate the specified type of RCU barrier, waiting for all - * RCU callbacks of the specified type to complete. - */ -static void _rcu_barrier(enum rcu_barrier type) -{ - BUG_ON(in_interrupt()); - /* Take cpucontrol mutex to protect against CPU hotplug */ - mutex_lock(&rcu_barrier_mutex); - init_completion(&rcu_barrier_completion); - /* - * Initialize rcu_barrier_cpu_count to 1, then invoke - * rcu_barrier_func() on each CPU, so that each CPU also has - * incremented rcu_barrier_cpu_count. Only then is it safe to - * decrement rcu_barrier_cpu_count -- otherwise the first CPU - * might complete its grace period before all of the other CPUs - * did their increment, causing this function to return too - * early. - */ - atomic_set(&rcu_barrier_cpu_count, 1); - on_each_cpu(rcu_barrier_func, (void *)type, 1); - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); - wait_for_completion(&rcu_barrier_completion); - mutex_unlock(&rcu_barrier_mutex); - wait_migrated_callbacks(); -} - -/** - * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. - */ -void rcu_barrier(void) -{ - _rcu_barrier(RCU_BARRIER_STD); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - -/** - * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. - */ -void rcu_barrier_bh(void) -{ - _rcu_barrier(RCU_BARRIER_BH); -} -EXPORT_SYMBOL_GPL(rcu_barrier_bh); - -/** - * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. - */ -void rcu_barrier_sched(void) -{ - _rcu_barrier(RCU_BARRIER_SCHED); -} -EXPORT_SYMBOL_GPL(rcu_barrier_sched); - -static void rcu_migrate_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_migrate_type_count)) - wake_up(&rcu_migrate_wq); -} - -extern int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); - -static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - rcu_cpu_notify(self, action, hcpu); - if (action == CPU_DYING) { - /* - * preempt_disable() in on_each_cpu() prevents stop_machine(), - * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" - * returns, all online cpus have queued rcu_barrier_func(), - * and the dead cpu(if it exist) queues rcu_migrate_callback()s. - * - * These callbacks ensure _rcu_barrier() waits for all - * RCU callbacks of the specified type to complete. - */ - atomic_set(&rcu_migrate_type_count, 3); - call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); - call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); - call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); - } else if (action == CPU_DOWN_PREPARE) { - /* Don't need to wait until next removal operation. */ - /* rcu_migrate_head is protected by cpu_add_remove_lock */ - wait_migrated_callbacks(); - } - - return NOTIFY_OK; -} - -void __init rcu_init(void) -{ - int i; - - __rcu_init(); - cpu_notifier(rcu_barrier_cpu_hotplug, 0); - - /* - * We don't need protection against CPU-hotplug here because - * this is called early in boot, before either interrupts - * or the scheduler are operational. - */ - for_each_online_cpu(i) - rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i); -} - -void rcu_scheduler_starting(void) -{ - WARN_ON(num_online_cpus() != 1); - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; -} diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c new file mode 100644 index 00000000000..9f6d9ff2572 --- /dev/null +++ b/kernel/rcutiny.c @@ -0,0 +1,282 @@ +/* + * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + */ +#include <linux/moduleparam.h> +#include <linux/completion.h> +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/rcupdate.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/time.h> +#include <linux/cpu.h> + +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ + struct rcu_head **curtail; /* ->next pointer of last CB. */ +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_ctrlblk = { + .donetail = &rcu_ctrlblk.rcucblist, + .curtail = &rcu_ctrlblk.rcucblist, +}; + +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .donetail = &rcu_bh_ctrlblk.rcucblist, + .curtail = &rcu_bh_ctrlblk.rcucblist, +}; + +#ifdef CONFIG_NO_HZ + +static long rcu_dynticks_nesting = 1; + +/* + * Enter dynticks-idle mode, which is an extended quiescent state + * if we have fully entered that mode (i.e., if the new value of + * dynticks_nesting is zero). + */ +void rcu_enter_nohz(void) +{ + if (--rcu_dynticks_nesting == 0) + rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ +} + +/* + * Exit dynticks-idle mode, so that we are no longer in an extended + * quiescent state. + */ +void rcu_exit_nohz(void) +{ + rcu_dynticks_nesting++; +} + +#endif /* #ifdef CONFIG_NO_HZ */ + +/* + * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). + * Also disable irqs to avoid confusion due to interrupt handlers + * invoking call_rcu(). + */ +static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + local_irq_save(flags); + if (rcp->rcucblist != NULL && + rcp->donetail != rcp->curtail) { + rcp->donetail = rcp->curtail; + local_irq_restore(flags); + return 1; + } + local_irq_restore(flags); + + return 0; +} + +/* + * Record an rcu quiescent state. And an rcu_bh quiescent state while we + * are at it, given that any rcu quiescent state is also an rcu_bh + * quiescent state. Use "+" instead of "||" to defeat short circuiting. + */ +void rcu_sched_qs(int cpu) +{ + if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Record an rcu_bh quiescent state. + */ +void rcu_bh_qs(int cpu) +{ + if (rcu_qsctr_help(&rcu_bh_ctrlblk)) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Check to see if the scheduling-clock interrupt came from an extended + * quiescent state, and, if so, tell RCU about it. + */ +void rcu_check_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && + !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) + rcu_sched_qs(cpu); + else if (!in_softirq()) + rcu_bh_qs(cpu); +} + +/* + * Helper function for rcu_process_callbacks() that operates on the + * specified rcu_ctrlkblk structure. + */ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) +{ + struct rcu_head *next, *list; + unsigned long flags; + + /* If no RCU callbacks ready to invoke, just return. */ + if (&rcp->rcucblist == rcp->donetail) + return; + + /* Move the ready-to-invoke callbacks to a local list. */ + local_irq_save(flags); + list = rcp->rcucblist; + rcp->rcucblist = *rcp->donetail; + *rcp->donetail = NULL; + if (rcp->curtail == rcp->donetail) + rcp->curtail = &rcp->rcucblist; + rcp->donetail = &rcp->rcucblist; + local_irq_restore(flags); + + /* Invoke the callbacks on the local list. */ + while (list) { + next = list->next; + prefetch(next); + list->func(list); + list = next; + } +} + +/* + * Invoke any callbacks whose grace period has completed. + */ +static void rcu_process_callbacks(struct softirq_action *unused) +{ + __rcu_process_callbacks(&rcu_ctrlblk); + __rcu_process_callbacks(&rcu_bh_ctrlblk); +} + +/* + * Wait for a grace period to elapse. But it is illegal to invoke + * synchronize_sched() from within an RCU read-side critical section. + * Therefore, any legal call to synchronize_sched() is a quiescent + * state, and so on a UP system, synchronize_sched() need do nothing. + * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the + * benefits of doing might_sleep() to reduce latency.) + * + * Cool, huh? (Due to Josh Triplett.) + * + * But we want to make this a static inline later. + */ +void synchronize_sched(void) +{ + cond_resched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + +void synchronize_rcu_bh(void) +{ + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_bh); + +/* + * Helper function for call_rcu() and call_rcu_bh(). + */ +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + head->func = func; + head->next = NULL; + + local_irq_save(flags); + *rcp->curtail = head; + rcp->curtail = &head->next; + local_irq_restore(flags); +} + +/* + * Post an RCU callback to be invoked after the end of an RCU grace + * period. But since we have but one CPU, that would be after any + * quiescent state. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_ctrlblk); +} +EXPORT_SYMBOL_GPL(call_rcu); + +/* + * Post an RCU bottom-half callback to be invoked after any subsequent + * quiescent state. + */ +void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_bh_ctrlblk); +} +EXPORT_SYMBOL_GPL(call_rcu_bh); + +void rcu_barrier(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +void rcu_barrier_bh(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_bh(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +void rcu_barrier_sched(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + +void __init rcu_init(void) +{ + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 233768f21f9..9bb52177af0 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -327,6 +327,11 @@ rcu_torture_cb(struct rcu_head *p) cur_ops->deferred_free(rp); } +static int rcu_no_completed(void) +{ + return 0; +} + static void rcu_torture_deferred_free(struct rcu_torture *p) { call_rcu(&p->rtort_rcu, rcu_torture_cb); @@ -388,6 +393,21 @@ static struct rcu_torture_ops rcu_sync_ops = { .name = "rcu_sync" }; +static struct rcu_torture_ops rcu_expedited_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu_expedited, + .cb_barrier = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_expedited" +}; + /* * Definitions for rcu_bh torture testing. */ @@ -547,6 +567,25 @@ static struct rcu_torture_ops srcu_ops = { .name = "srcu" }; +static void srcu_torture_synchronize_expedited(void) +{ + synchronize_srcu_expedited(&srcu_ctl); +} + +static struct rcu_torture_ops srcu_expedited_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize_expedited, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu_expedited" +}; + /* * Definitions for sched torture testing. */ @@ -562,11 +601,6 @@ static void sched_torture_read_unlock(int idx) preempt_enable(); } -static int sched_torture_completed(void) -{ - return 0; -} - static void rcu_sched_torture_deferred_free(struct rcu_torture *p) { call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); @@ -583,7 +617,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, + .completed = rcu_no_completed, .deferred_free = rcu_sched_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = rcu_barrier_sched, @@ -592,13 +626,13 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -static struct rcu_torture_ops sched_ops_sync = { +static struct rcu_torture_ops sched_sync_ops = { .init = rcu_sync_torture_init, .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, + .completed = rcu_no_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = NULL, @@ -606,15 +640,13 @@ static struct rcu_torture_ops sched_ops_sync = { .name = "sched_sync" }; -extern int rcu_expedited_torture_stats(char *page); - static struct rcu_torture_ops sched_expedited_ops = { .init = rcu_sync_torture_init, .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, + .completed = rcu_no_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_sched_expedited, .cb_barrier = NULL, @@ -650,7 +682,7 @@ rcu_torture_writer(void *arg) old_rp = rcu_torture_current; rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); - smp_wmb(); + smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ if (old_rp) { i = old_rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) @@ -731,13 +763,13 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - ++__get_cpu_var(rcu_torture_count)[pipe_count]; + __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - ++__get_cpu_var(rcu_torture_batch)[completed]; + __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); preempt_enable(); cur_ops->readunlock(idx); } @@ -786,13 +818,13 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - ++__get_cpu_var(rcu_torture_count)[pipe_count]; + __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - ++__get_cpu_var(rcu_torture_batch)[completed]; + __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); preempt_enable(); cur_ops->readunlock(idx); schedule(); @@ -1099,9 +1131,10 @@ rcu_torture_init(void) int cpu; int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, - &sched_expedited_ops, - &srcu_ops, &sched_ops, &sched_ops_sync, }; + { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, + &rcu_bh_ops, &rcu_bh_sync_ops, + &srcu_ops, &srcu_expedited_ops, + &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); @@ -1112,8 +1145,12 @@ rcu_torture_init(void) break; } if (i == ARRAY_SIZE(torture_ops)) { - printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", + printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", torture_type); + printk(KERN_ALERT "rcu-torture types:"); + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) + printk(KERN_ALERT " %s", torture_ops[i]->name); + printk(KERN_ALERT "\n"); mutex_unlock(&fullstop_mutex); return -EINVAL; } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 52b06f6e158..53ae9598f79 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -46,30 +46,30 @@ #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/time.h> +#include <linux/kernel_stat.h> #include "rcutree.h" -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); -EXPORT_SYMBOL_GPL(rcu_lock_map); -#endif - /* Data structures. */ +static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; + #define RCU_STATE_INITIALIZER(name) { \ .level = { &name.node[0] }, \ .levelcnt = { \ NUM_RCU_LVL_0, /* root of hierarchy. */ \ NUM_RCU_LVL_1, \ NUM_RCU_LVL_2, \ - NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ + NUM_RCU_LVL_3, \ + NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ }, \ - .signaled = RCU_SIGNAL_INIT, \ + .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ + .orphan_cbs_list = NULL, \ + .orphan_cbs_tail = &name.orphan_cbs_list, \ + .orphan_qlen = 0, \ .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -81,24 +81,18 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); -extern long rcu_batches_completed_sched(void); -static struct rcu_node *rcu_get_root(struct rcu_state *rsp); -static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags); -static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags); -#ifdef CONFIG_HOTPLUG_CPU -static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void __rcu_process_callbacks(struct rcu_state *rsp, - struct rcu_data *rdp); -static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp); -static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp); -static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, - int preemptable); +static int rcu_scheduler_active __read_mostly; -#include "rcutree_plugin.h" + +/* + * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s + * permit this function to be invoked without holding the root rcu_node + * structure's ->lock, but of course results can be subject to change. + */ +static int rcu_gp_in_progress(struct rcu_state *rsp) +{ + return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); +} /* * Note a quiescent state. Because we do not need to know @@ -110,7 +104,7 @@ void rcu_sched_qs(int cpu) struct rcu_data *rdp; rdp = &per_cpu(rcu_sched_data, cpu); - rdp->passed_quiesc_completed = rdp->completed; + rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; rcu_preempt_note_context_switch(cpu); @@ -121,7 +115,7 @@ void rcu_bh_qs(int cpu) struct rcu_data *rdp; rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc_completed = rdp->completed; + rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; } @@ -137,6 +131,10 @@ static int blimit = 10; /* Maximum callbacks per softirq. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -173,9 +171,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - /* ACCESS_ONCE() because we are accessing outside of lock. */ - return *rdp->nxttail[RCU_DONE_TAIL] && - ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum); + return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); } /* @@ -345,31 +341,12 @@ void rcu_irq_exit(void) set_need_resched(); } -/* - * Record the specified "completed" value, which is later used to validate - * dynticks counter manipulations. Specify "rsp->completed - 1" to - * unconditionally invalidate any future dynticks manipulations (which is - * useful at the beginning of a grace period). - */ -static void dyntick_record_completed(struct rcu_state *rsp, long comp) -{ - rsp->dynticks_completed = comp; -} - #ifdef CONFIG_SMP /* - * Recall the previously recorded value of the completion for dynticks. - */ -static long dyntick_recall_completed(struct rcu_state *rsp) -{ - return rsp->dynticks_completed; -} - -/* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU - * is already in a quiescent state courtesy of dynticks idle mode. + * is in dynticks idle mode, which is an extended quiescent state. */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { @@ -429,24 +406,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #else /* #ifdef CONFIG_NO_HZ */ -static void dyntick_record_completed(struct rcu_state *rsp, long comp) -{ -} - #ifdef CONFIG_SMP -/* - * If there are no dynticks, then the only way that a CPU can passively - * be in a quiescent state is to be offline. Unlike dynticks idle, which - * is a point in time during the prior (already finished) grace period, - * an offline CPU is always in a quiescent state, and thus can be - * unconditionally applied. So just return the current value of completed. - */ -static long dyntick_recall_completed(struct rcu_state *rsp) -{ - return rsp->completed; -} - static int dyntick_save_progress_counter(struct rcu_data *rdp) { return 0; @@ -475,30 +436,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp) long delta; unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; /* Only let one CPU complain about others per time interval. */ spin_lock_irqsave(&rnp->lock, flags); delta = jiffies - rsp->jiffies_stall; - if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) { + if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { spin_unlock_irqrestore(&rnp->lock, flags); return; } rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + + /* + * Now rat on any tasks that got kicked up to the root rcu_node + * due to CPU offlining. + */ + rcu_print_task_stall(rnp); spin_unlock_irqrestore(&rnp->lock, flags); /* OK, time to rat on our buddy... */ printk(KERN_ERR "INFO: RCU detected CPU stalls:"); - for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_for_each_leaf_node(rsp, rnp) { rcu_print_task_stall(rnp); - if (rnp_cur->qsmask == 0) + if (rnp->qsmask == 0) continue; - for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) - if (rnp_cur->qsmask & (1UL << cpu)) - printk(" %d", rnp_cur->grplo + cpu); + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) + printk(" %d", rnp->grplo + cpu); } printk(" (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); @@ -537,8 +502,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); - } else if (rsp->gpnum != rsp->completed && - delta >= RCU_STALL_RAT_DELAY) { + } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { /* They had two time units to dump stack, so complain. */ print_other_cpu_stall(rsp); @@ -560,13 +524,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) /* * Update CPU-local rcu_data state to record the newly noticed grace period. * This is used both when we started the grace period and when we notice - * that someone else started the grace period. + * that someone else started the grace period. The caller must hold the + * ->lock of the leaf rcu_node structure corresponding to the current CPU, + * and must have irqs disabled. */ +static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + if (rdp->gpnum != rnp->gpnum) { + rdp->qs_pending = 1; + rdp->passed_quiesc = 0; + rdp->gpnum = rnp->gpnum; + } +} + static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) { - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; - rdp->gpnum = rsp->gpnum; + unsigned long flags; + struct rcu_node *rnp; + + local_irq_save(flags); + rnp = rdp->mynode; + if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ + !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ + local_irq_restore(flags); + return; + } + __note_new_gpnum(rsp, rnp, rdp); + spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -590,6 +574,79 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) } /* + * Advance this CPU's callbacks, but only if the current grace period + * has ended. This may be called only from the CPU to whom the rdp + * belongs. In addition, the corresponding leaf rcu_node structure's + * ->lock must be held by the caller, with irqs disabled. + */ +static void +__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + /* Did another grace period end? */ + if (rdp->completed != rnp->completed) { + + /* Advance callbacks. No harm if list empty. */ + rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + /* Remember that we saw this grace-period completion. */ + rdp->completed = rnp->completed; + } +} + +/* + * Advance this CPU's callbacks, but only if the current grace period + * has ended. This may be called only from the CPU to whom the rdp + * belongs. + */ +static void +rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_node *rnp; + + local_irq_save(flags); + rnp = rdp->mynode; + if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ + !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ + local_irq_restore(flags); + return; + } + __rcu_process_gp_end(rsp, rnp, rdp); + spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Do per-CPU grace-period initialization for running CPU. The caller + * must hold the lock of the leaf rcu_node structure corresponding to + * this CPU. + */ +static void +rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + /* Prior grace period ended, so advance callbacks for current CPU. */ + __rcu_process_gp_end(rsp, rnp, rdp); + + /* + * Because this CPU just now started the new grace period, we know + * that all of its callbacks will be covered by this upcoming grace + * period, even the ones that were registered arbitrarily recently. + * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. + * + * Other CPUs cannot be sure exactly when the grace period started. + * Therefore, their recently registered callbacks must pass through + * an additional RCU_NEXT_READY stage, so that they will be handled + * by the next RCU grace period. + */ + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + /* Set state so that this CPU will detect the next quiescent state. */ + __note_new_gpnum(rsp, rnp, rdp); +} + +/* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold * the root node's ->lock, which is released before return. Hard irqs must @@ -603,7 +660,23 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) struct rcu_node *rnp = rcu_get_root(rsp); if (!cpu_needs_another_gp(rsp, rdp)) { - spin_unlock_irqrestore(&rnp->lock, flags); + if (rnp->completed == rsp->completed) { + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + + /* + * Propagate new ->completed value to rcu_node structures + * so that other CPUs don't have to wait until the start + * of the next grace period to process their callbacks. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->completed = rsp->completed; + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + local_irq_restore(flags); return; } @@ -613,23 +686,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); - dyntick_record_completed(rsp, rsp->completed - 1); - note_new_gpnum(rsp, rdp); - - /* - * Because we are first, we know that all our callbacks will - * be covered by this upcoming grace period, even the ones - * that were registered arbitrarily recently. - */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; /* Special-case the common single-level case. */ if (NUM_RCU_NODES == 1) { rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ + rcu_start_gp_per_cpu(rsp, rnp, rdp); spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -657,70 +722,51 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * one corresponding to this CPU, due to the fact that we have * irqs disabled. */ - for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_for_each_node_breadth_first(rsp, rnp) { + spin_lock(&rnp->lock); /* irqs already disabled. */ rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; - spin_unlock(&rnp->lock); /* irqs already disabled. */ + rnp->completed = rsp->completed; + if (rnp == rdp->mynode) + rcu_start_gp_per_cpu(rsp, rnp, rdp); + spin_unlock(&rnp->lock); /* irqs remain disabled. */ } + rnp = rcu_get_root(rsp); + spin_lock(&rnp->lock); /* irqs already disabled. */ rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + spin_unlock(&rnp->lock); /* irqs remain disabled. */ spin_unlock_irqrestore(&rsp->onofflock, flags); } /* - * Advance this CPU's callbacks, but only if the current grace period - * has ended. This may be called only from the CPU to whom the rdp - * belongs. - */ -static void -rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) -{ - long completed_snap; - unsigned long flags; - - local_irq_save(flags); - completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */ - - /* Did another grace period end? */ - if (rdp->completed != completed_snap) { - - /* Advance callbacks. No harm if list empty. */ - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - - /* Remember that we saw this grace-period completion. */ - rdp->completed = completed_snap; - } - local_irq_restore(flags); -} - -/* - * Clean up after the prior grace period and let rcu_start_gp() start up - * the next grace period if one is needed. Note that the caller must - * hold rnp->lock, as required by rcu_start_gp(), which will release it. + * Report a full set of quiescent states to the specified rcu_state + * data structure. This involves cleaning up after the prior grace + * period and letting rcu_start_gp() start up the next grace period + * if one is needed. Note that the caller must hold rnp->lock, as + * required by rcu_start_gp(), which will release it. */ -static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) - __releases(rnp->lock) +static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) + __releases(rcu_get_root(rsp)->lock) { - WARN_ON_ONCE(rsp->completed == rsp->gpnum); + WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); rsp->completed = rsp->gpnum; - rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); + rsp->signaled = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ } /* - * Similar to cpu_quiet(), for which it is a helper function. Allows - * a group of CPUs to be quieted at one go, though all the CPUs in the - * group must be represented by the same leaf rcu_node structure. - * That structure's lock must be held upon entry, and it is released - * before return. + * Similar to rcu_report_qs_rdp(), for which it is a helper function. + * Allows quiescent states for a group of CPUs to be reported at one go + * to the specified rcu_node structure, though all the CPUs in the group + * must be represented by the same rcu_node structure (which need not be + * a leaf rcu_node structure, though it often will be). That structure's + * lock must be held upon entry, and it is released before return. */ static void -cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, - unsigned long flags) +rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { struct rcu_node *rnp_c; @@ -756,21 +802,23 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, /* * Get here if we are the last CPU to pass through a quiescent - * state for this grace period. Invoke cpu_quiet_msk_finish() + * state for this grace period. Invoke rcu_report_qs_rsp() * to clean up and start the next grace period if one is needed. */ - cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ + rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ } /* - * Record a quiescent state for the specified CPU, which must either be - * the current CPU. The lastcomp argument is used to make sure we are - * still in the grace period of interest. We don't want to end the current - * grace period based on quiescent states detected in an earlier grace - * period! + * Record a quiescent state for the specified CPU to that CPU's rcu_data + * structure. This must be either called from the specified CPU, or + * called when the specified CPU is known to be offline (and when it is + * also known that no other CPU is concurrently trying to help the offline + * CPU). The lastcomp argument is used to make sure we are still in the + * grace period of interest. We don't want to end the current grace period + * based on quiescent states detected in an earlier grace period! */ static void -cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) { unsigned long flags; unsigned long mask; @@ -778,15 +826,15 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) rnp = rdp->mynode; spin_lock_irqsave(&rnp->lock, flags); - if (lastcomp != ACCESS_ONCE(rsp->completed)) { + if (lastcomp != rnp->completed) { /* * Someone beat us to it for this grace period, so leave. * The race with GP start is resolved by the fact that we * hold the leaf rcu_node lock, so that the per-CPU bits * cannot yet be initialized -- so we would simply find our - * CPU's bit already cleared in cpu_quiet_msk() if this race - * occurred. + * CPU's bit already cleared in rcu_report_qs_rnp() if this + * race occurred. */ rdp->passed_quiesc = 0; /* try again later! */ spin_unlock_irqrestore(&rnp->lock, flags); @@ -804,7 +852,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) */ rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ + rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ } } @@ -835,24 +883,73 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) if (!rdp->passed_quiesc) return; - /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ - cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); + /* + * Tell RCU we are done (but rcu_report_qs_rdp() will be the + * judge of that). + */ + rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); } #ifdef CONFIG_HOTPLUG_CPU /* + * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the + * specified flavor of RCU. The callbacks will be adopted by the next + * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever + * comes first. Because this is invoked from the CPU_DYING notifier, + * irqs are already disabled. + */ +static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +{ + int i; + struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + + if (rdp->nxtlist == NULL) + return; /* irqs disabled, so comparison is stable. */ + spin_lock(&rsp->onofflock); /* irqs already disabled. */ + *rsp->orphan_cbs_tail = rdp->nxtlist; + rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rsp->orphan_qlen += rdp->qlen; + rdp->qlen = 0; + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ +} + +/* + * Adopt previously orphaned RCU callbacks. + */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp; + + spin_lock_irqsave(&rsp->onofflock, flags); + rdp = rsp->rda[smp_processor_id()]; + if (rsp->orphan_cbs_list == NULL) { + spin_unlock_irqrestore(&rsp->onofflock, flags); + return; + } + *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; + rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; + rdp->qlen += rsp->orphan_qlen; + rsp->orphan_cbs_list = NULL; + rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; + rsp->orphan_qlen = 0; + spin_unlock_irqrestore(&rsp->onofflock, flags); +} + +/* * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy * and move all callbacks from the outgoing CPU to the current one. */ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) { - int i; unsigned long flags; - long lastcomp; unsigned long mask; + int need_report = 0; struct rcu_data *rdp = rsp->rda[cpu]; - struct rcu_data *rdp_me; struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ @@ -865,42 +962,34 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + if (rnp != rdp->mynode) + spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } - rcu_preempt_offline_tasks(rsp, rnp, rdp); + if (rnp == rdp->mynode) + need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + else + spin_unlock(&rnp->lock); /* irqs remain disabled. */ mask = rnp->grpmask; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ rnp = rnp->parent; } while (rnp != NULL); - lastcomp = rsp->completed; - - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ /* - * Move callbacks from the outgoing CPU to the running CPU. - * Note that the outgoing CPU is now quiscent, so it is now - * (uncharacteristically) safe to access its rcu_data structure. - * Note also that we must carefully retain the order of the - * outgoing CPU's callbacks in order for rcu_barrier() to work - * correctly. Finally, note that we start all the callbacks - * afresh, even those that have passed through a grace period - * and are therefore ready to invoke. The theory is that hotplug - * events are rare, and that if they are frequent enough to - * indefinitely delay callbacks, you have far worse things to - * be worrying about. + * We still hold the leaf rcu_node structure lock here, and + * irqs are still disabled. The reason for this subterfuge is + * because invoking rcu_report_unblock_qs_rnp() with ->onofflock + * held leads to deadlock. */ - rdp_me = rsp->rda[smp_processor_id()]; - if (rdp->nxtlist != NULL) { - *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; - rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp_me->qlen += rdp->qlen; - rdp->qlen = 0; - } - local_irq_restore(flags); + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + rnp = rdp->mynode; + if (need_report & RCU_OFL_TASKS_NORM_GP) + rcu_report_unblock_qs_rnp(rnp, flags); + else + spin_unlock_irqrestore(&rnp->lock, flags); + if (need_report & RCU_OFL_TASKS_EXP_GP) + rcu_report_exp_rnp(rsp, rnp); + + rcu_adopt_orphan_cbs(rsp); } /* @@ -918,6 +1007,14 @@ static void rcu_offline_cpu(int cpu) #else /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +{ +} + +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ +} + static void rcu_offline_cpu(int cpu) { } @@ -928,7 +1025,7 @@ static void rcu_offline_cpu(int cpu) * Invoke any RCU callbacks that have made it to the end of their grace * period. Thottle as specified by rdp->blimit. */ -static void rcu_do_batch(struct rcu_data *rdp) +static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; @@ -981,6 +1078,13 @@ static void rcu_do_batch(struct rcu_data *rdp) if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; + /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ + if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; + } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = rdp->qlen; + local_irq_restore(flags); /* Re-raise the RCU softirq if there are callbacks remaining. */ @@ -1050,33 +1154,32 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, int cpu; unsigned long flags; unsigned long mask; - struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; + struct rcu_node *rnp; - for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_for_each_leaf_node(rsp, rnp) { mask = 0; - spin_lock_irqsave(&rnp_cur->lock, flags); - if (rsp->completed != lastcomp) { - spin_unlock_irqrestore(&rnp_cur->lock, flags); + spin_lock_irqsave(&rnp->lock, flags); + if (rnp->completed != lastcomp) { + spin_unlock_irqrestore(&rnp->lock, flags); return 1; } - if (rnp_cur->qsmask == 0) { - spin_unlock_irqrestore(&rnp_cur->lock, flags); + if (rnp->qsmask == 0) { + spin_unlock_irqrestore(&rnp->lock, flags); continue; } - cpu = rnp_cur->grplo; + cpu = rnp->grplo; bit = 1; - for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) { - if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu])) + for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) mask |= bit; } - if (mask != 0 && rsp->completed == lastcomp) { + if (mask != 0 && rnp->completed == lastcomp) { - /* cpu_quiet_msk() releases rnp_cur->lock. */ - cpu_quiet_msk(mask, rsp, rnp_cur, flags); + /* rcu_report_qs_rnp() releases rnp->lock. */ + rcu_report_qs_rnp(mask, rsp, rnp, flags); continue; } - spin_unlock_irqrestore(&rnp_cur->lock, flags); + spin_unlock_irqrestore(&rnp->lock, flags); } return 0; } @@ -1091,8 +1194,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) long lastcomp; struct rcu_node *rnp = rcu_get_root(rsp); u8 signaled; + u8 forcenow; - if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) + if (!rcu_gp_in_progress(rsp)) return; /* No grace period in progress, nothing to force. */ if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ @@ -1103,19 +1207,20 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) goto unlock_ret; /* no emergency and done recently. */ rsp->n_force_qs++; spin_lock(&rnp->lock); - lastcomp = rsp->completed; + lastcomp = rsp->gpnum - 1; signaled = rsp->signaled; rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - if (lastcomp == rsp->gpnum) { + if(!rcu_gp_in_progress(rsp)) { rsp->n_force_qs_ngp++; spin_unlock(&rnp->lock); goto unlock_ret; /* no GP in progress, time updated. */ } spin_unlock(&rnp->lock); switch (signaled) { + case RCU_GP_IDLE: case RCU_GP_INIT: - break; /* grace period still initializing, ignore. */ + break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: @@ -1126,20 +1231,29 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) if (rcu_process_dyntick(rsp, lastcomp, dyntick_save_progress_counter)) goto unlock_ret; + /* fall into next case. */ + + case RCU_SAVE_COMPLETED: /* Update state, record completion counter. */ + forcenow = 0; spin_lock(&rnp->lock); - if (lastcomp == rsp->completed) { + if (lastcomp + 1 == rsp->gpnum && + lastcomp == rsp->completed && + rsp->signaled == signaled) { rsp->signaled = RCU_FORCE_QS; - dyntick_record_completed(rsp, lastcomp); + rsp->completed_fqs = lastcomp; + forcenow = signaled == RCU_SAVE_COMPLETED; } spin_unlock(&rnp->lock); - break; + if (!forcenow) + break; + /* fall into next case. */ case RCU_FORCE_QS: /* Check dyntick-idle state, send IPI to laggarts. */ - if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), + if (rcu_process_dyntick(rsp, rsp->completed_fqs, rcu_implicit_dynticks_qs)) goto unlock_ret; @@ -1195,7 +1309,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) } /* If there are callbacks ready, invoke them. */ - rcu_do_batch(rdp); + rcu_do_batch(rsp, rdp); } /* @@ -1251,7 +1365,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp->nxttail[RCU_NEXT_TAIL] = &head->next; /* Start a new grace period if one not already started. */ - if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) { + if (!rcu_gp_in_progress(rsp)) { unsigned long nestflag; struct rcu_node *rnp_root = rcu_get_root(rsp); @@ -1259,10 +1373,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ } - /* Force the grace period if too many callbacks or too long waiting. */ - if (unlikely(++rdp->qlen > qhimark)) { + /* + * Force the grace period if too many callbacks or too long waiting. + * Enforce hysteresis, and don't invoke force_quiescent_state() + * if some other CPU has recently done so. Also, don't bother + * invoking force_quiescent_state() if the newly enqueued callback + * is the only one waiting for a grace period to complete. + */ + if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { rdp->blimit = LONG_MAX; - force_quiescent_state(rsp, 0); + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) force_quiescent_state(rsp, 1); local_irq_restore(flags); @@ -1286,6 +1410,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); +/** + * synchronize_sched - wait until an rcu-sched grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-sched + * grace period has elapsed, in other words after all currently executing + * rcu-sched read-side critical sections have completed. These read-side + * critical sections are delimited by rcu_read_lock_sched() and + * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), + * local_irq_disable(), and so on may be used in place of + * rcu_read_lock_sched(). + * + * This means that all preempt_disable code sequences, including NMI and + * hardware-interrupt handlers, in progress on entry will have completed + * before this primitive returns. However, this does not guarantee that + * softirq handlers will have completed, since in some kernels, these + * handlers can run in process context, and can block. + * + * This primitive provides the guarantees made by the (now removed) + * synchronize_kernel() API. In contrast, synchronize_rcu() only + * guarantees that rcu_read_lock() sections will have completed. + * In "classic RCU", these two guarantees happen to be one and + * the same, but can differ in realtime RCU implementations. + */ +void synchronize_sched(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + +/** + * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. + * + * Control will return to the caller some time after a full rcu_bh grace + * period has elapsed, in other words after all currently executing rcu_bh + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), + * and may be nested. + */ +void synchronize_rcu_bh(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_bh(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_bh); + /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -1295,6 +1481,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); */ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) { + struct rcu_node *rnp = rdp->mynode; + rdp->n_rcu_pending++; /* Check for CPU stalls, if enabled. */ @@ -1319,19 +1507,19 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has another RCU grace period completed? */ - if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ + if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ rdp->n_rp_gp_completed++; return 1; } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ + if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ rdp->n_rp_gp_started++; return 1; } /* Has an RCU GP gone long enough to send resched IPIs &c? */ - if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && + if (rcu_gp_in_progress(rsp) && ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { rdp->n_rp_need_fqs++; return 1; @@ -1369,6 +1557,97 @@ int rcu_needs_cpu(int cpu) } /* + * This function is invoked towards the end of the scheduler's initialization + * process. Before this is called, the idle task might contain + * RCU read-side critical sections (during which time, this idle + * task is booting the system). After this function is called, the + * idle tasks are prohibited from containing RCU read-side critical + * sections. + */ +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + +static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; +static atomic_t rcu_barrier_cpu_count; +static DEFINE_MUTEX(rcu_barrier_mutex); +static struct completion rcu_barrier_completion; + +static void rcu_barrier_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_func(void *type) +{ + int cpu = smp_processor_id(); + struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head)); + + atomic_inc(&rcu_barrier_cpu_count); + call_rcu_func = type; + call_rcu_func(head, rcu_barrier_callback); +} + +/* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. + */ +static void _rcu_barrier(struct rcu_state *rsp, + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head))) +{ + BUG_ON(in_interrupt()); + /* Take mutex to serialize concurrent rcu_barrier() requests. */ + mutex_lock(&rcu_barrier_mutex); + init_completion(&rcu_barrier_completion); + /* + * Initialize rcu_barrier_cpu_count to 1, then invoke + * rcu_barrier_func() on each CPU, so that each CPU also has + * incremented rcu_barrier_cpu_count. Only then is it safe to + * decrement rcu_barrier_cpu_count -- otherwise the first CPU + * might complete its grace period before all of the other CPUs + * did their increment, causing this function to return too + * early. + */ + atomic_set(&rcu_barrier_cpu_count, 1); + preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ + rcu_adopt_orphan_cbs(rsp); + on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); + preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); + wait_for_completion(&rcu_barrier_completion); + mutex_unlock(&rcu_barrier_mutex); +} + +/** + * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. + */ +void rcu_barrier_bh(void) +{ + _rcu_barrier(&rcu_bh_state, call_rcu_bh); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +/** + * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. + */ +void rcu_barrier_sched(void) +{ + _rcu_barrier(&rcu_sched_state, call_rcu_sched); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + +/* * Do boot-time initialization of a CPU's per-CPU RCU data. */ static void __init @@ -1403,21 +1682,18 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) { unsigned long flags; - long lastcomp; unsigned long mask; struct rcu_data *rdp = rsp->rda[cpu]; struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ spin_lock_irqsave(&rnp->lock, flags); - lastcomp = rsp->completed; - rdp->completed = lastcomp; - rdp->gpnum = lastcomp; rdp->passed_quiesc = 0; /* We could be racing with new GP, */ rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ rdp->preemptable = preemptable; - rdp->passed_quiesc_completed = lastcomp - 1; + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -1437,6 +1713,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit |= mask; mask = rnp->grpmask; + if (rnp == rdp->mynode) { + rdp->gpnum = rnp->completed; /* if GP in progress... */ + rdp->completed = rnp->completed; + rdp->passed_quiesc_completed = rnp->completed - 1; + } spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); @@ -1454,8 +1735,8 @@ static void __cpuinit rcu_online_cpu(int cpu) /* * Handle CPU online/offline notification events. */ -int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1464,6 +1745,22 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); break; + case CPU_DYING: + case CPU_DYING_FROZEN: + /* + * preempt_disable() in _rcu_barrier() prevents stop_machine(), + * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" + * returns, all online cpus have queued rcu_barrier_func(). + * The dying CPU clears its cpu_online_mask bit and + * moves all of its RCU callbacks to ->orphan_cbs_list + * in the context of stop_machine(), so subsequent calls + * to _rcu_barrier() will adopt these callbacks and only + * then queue rcu_barrier_func() on all remaining CPUs. + */ + rcu_send_cbs_to_orphanage(&rcu_bh_state); + rcu_send_cbs_to_orphanage(&rcu_sched_state); + rcu_preempt_send_cbs_to_orphanage(); + break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: @@ -1527,6 +1824,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { spin_lock_init(&rnp->lock); + lockdep_set_class(&rnp->lock, &rcu_node_class[i]); rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; @@ -1547,6 +1845,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->level = i; INIT_LIST_HEAD(&rnp->blocked_tasks[0]); INIT_LIST_HEAD(&rnp->blocked_tasks[1]); + INIT_LIST_HEAD(&rnp->blocked_tasks[2]); + INIT_LIST_HEAD(&rnp->blocked_tasks[3]); } } } @@ -1558,6 +1858,10 @@ static void __init rcu_init_one(struct rcu_state *rsp) */ #define RCU_INIT_FLAVOR(rsp, rcu_data) \ do { \ + int i; \ + int j; \ + struct rcu_node *rnp; \ + \ rcu_init_one(rsp); \ rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ j = 0; \ @@ -1570,41 +1874,30 @@ do { \ } \ } while (0) -#ifdef CONFIG_TREE_PREEMPT_RCU - -void __init __rcu_init_preempt(void) -{ - int i; /* All used by RCU_INIT_FLAVOR(). */ - int j; - struct rcu_node *rnp; - - RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); -} - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - -void __init __rcu_init_preempt(void) +void __init rcu_init(void) { -} - -#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ - -void __init __rcu_init(void) -{ - int i; /* All used by RCU_INIT_FLAVOR(). */ - int j; - struct rcu_node *rnp; + int i; rcu_bootup_announce(); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#if NUM_RCU_LVL_4 != 0 + printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n"); +#endif /* #if NUM_RCU_LVL_4 != 0 */ RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + + /* + * We don't need protection against CPU-hotplug here because + * this is called early in boot, before either interrupts + * or the scheduler are operational. + */ + cpu_notifier(rcu_cpu_notify, 0); + for_each_online_cpu(i) + rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); } -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); +#include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 8e8287a983c..d2a0046f63b 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -34,10 +34,11 @@ * In practice, this has not been tested, so there is probably some * bug somewhere. */ -#define MAX_RCU_LVLS 3 +#define MAX_RCU_LVLS 4 #define RCU_FANOUT (CONFIG_RCU_FANOUT) #define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) #define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) +#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) #if NR_CPUS <= RCU_FANOUT # define NUM_RCU_LVLS 1 @@ -45,23 +46,33 @@ # define NUM_RCU_LVL_1 (NR_CPUS) # define NUM_RCU_LVL_2 0 # define NUM_RCU_LVL_3 0 +# define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_SQ # define NUM_RCU_LVLS 2 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 +# define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_CUBE # define NUM_RCU_LVLS 3 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) -# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) # define NUM_RCU_LVL_3 NR_CPUS +# define NUM_RCU_LVL_4 0 +#elif NR_CPUS <= RCU_FANOUT_FOURTH +# define NUM_RCU_LVLS 4 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) +# define NUM_RCU_LVL_4 NR_CPUS #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT */ -#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) /* @@ -79,24 +90,67 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - spinlock_t lock; + spinlock_t lock; /* Root rcu_node's lock protects some */ + /* rcu_state fields as well as following. */ long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ + long completed; /* Last grace period completed for this node. */ + /* This will either be equal to or one */ + /* behind the root rcu_node's gpnum. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ + /* In leaf rcu_node, each bit corresponds to */ + /* an rcu_data structure, otherwise, each */ + /* bit corresponds to a child rcu_node */ + /* structure. */ + unsigned long expmask; /* Groups that have ->blocked_tasks[] */ + /* elements that need to drain to allow the */ + /* current expedited grace period to */ + /* complete (only for TREE_PREEMPT_RCU). */ unsigned long qsmaskinit; - /* Per-GP initialization for qsmask. */ + /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ + /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ int grphi; /* highest-numbered CPU or group here. */ u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ struct rcu_node *parent; - struct list_head blocked_tasks[2]; + struct list_head blocked_tasks[4]; /* Tasks blocked in RCU read-side critsect. */ + /* Grace period number (->gpnum) x blocked */ + /* by tasks on the (x & 0x1) element of the */ + /* blocked_tasks[] array. */ } ____cacheline_internodealigned_in_smp; +/* + * Do a full breadth-first scan of the rcu_node structures for the + * specified rcu_state structure. + */ +#define rcu_for_each_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + +/* + * Do a breadth-first scan of the non-leaf rcu_node structures for the + * specified rcu_state structure. Note that if there is a singleton + * rcu_node tree with but one rcu_node structure, this loop is a no-op. + */ +#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) + +/* + * Scan the leaves of the rcu_node hierarchy for the specified rcu_state + * structure. Note that if there is a singleton rcu_node tree with but + * one rcu_node structure, this loop -will- visit the rcu_node structure. + * It is still a leaf node, even if it is also the root node. + */ +#define rcu_for_each_leaf_node(rsp, rnp) \ + for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ + (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ @@ -126,23 +180,30 @@ struct rcu_data { * Any of the partitions might be empty, in which case the * pointer to that partition will be equal to the pointer for * the following partition. When the list is empty, all of - * the nxttail elements point to nxtlist, which is NULL. + * the nxttail elements point to the ->nxtlist pointer itself, + * which in that case is NULL. * - * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]): - * Entries that might have arrived after current GP ended - * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): - * Entries known to have arrived before current GP ended - * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): - * Entries that batch # <= ->completed - 1: waiting for current GP * [nxtlist, *nxttail[RCU_DONE_TAIL]): * Entries that batch # <= ->completed * The grace period for these entries has completed, and * the other grace-period-completed entries may be moved * here temporarily in rcu_process_callbacks(). + * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): + * Entries that batch # <= ->completed - 1: waiting for current GP + * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): + * Entries known to have arrived before current GP ended + * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): + * Entries that might have arrived after current GP ended + * Note that the value of *nxttail[RCU_NEXT_TAIL] will + * always be NULL, as this is the end of the list. */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; long qlen; /* # of queued callbacks */ + long qlen_last_fqs_check; + /* qlen at last check for QS forcing */ + unsigned long n_force_qs_snap; + /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ #ifdef CONFIG_NO_HZ @@ -173,13 +234,15 @@ struct rcu_data { }; /* Values for signaled field in struct rcu_state. */ -#define RCU_GP_INIT 0 /* Grace period being initialized. */ -#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ -#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ +#define RCU_GP_IDLE 0 /* No grace period in progress. */ +#define RCU_GP_INIT 1 /* Grace period being initialized. */ +#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ +#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ +#define RCU_FORCE_QS 4 /* Need to force quiescent state. */ #ifdef CONFIG_NO_HZ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK #else /* #ifdef CONFIG_NO_HZ */ -#define RCU_SIGNAL_INIT RCU_FORCE_QS +#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED #endif /* #else #ifdef CONFIG_NO_HZ */ #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ @@ -216,10 +279,23 @@ struct rcu_state { /* Force QS state. */ long gpnum; /* Current gp number. */ long completed; /* # of last completed gp. */ + + /* End of fields guarded by root rcu_node's lock. */ + spinlock_t onofflock; /* exclude on/offline and */ - /* starting new GP. */ + /* starting new GP. Also */ + /* protects the following */ + /* orphan_cbs fields. */ + struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */ + /* orphaned by all CPUs in */ + /* a given leaf rcu_node */ + /* going offline. */ + struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ + long orphan_qlen; /* Number of orphaned cbs. */ spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ + long completed_fqs; /* Value of completed @ snap. */ + /* Protected by fqslock. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ @@ -234,11 +310,15 @@ struct rcu_state { unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ -#ifdef CONFIG_NO_HZ - long dynticks_completed; /* Value of completed @ snap. */ -#endif /* #ifdef CONFIG_NO_HZ */ }; +/* Return values for rcu_preempt_offline_tasks(). */ + +#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ + /* GP were moved to root. */ +#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ + /* GP were moved to root. */ + #ifdef RCU_TREE_NONCORE /* @@ -255,5 +335,37 @@ extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#endif /* #ifdef RCU_TREE_NONCORE */ +#else /* #ifdef RCU_TREE_NONCORE */ + +/* Forward declarations for rcutree_plugin.h */ +static void rcu_bootup_announce(void); +long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); +static int rcu_preempted_readers(struct rcu_node *rnp); +#ifdef CONFIG_HOTPLUG_CPU +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, + unsigned long flags); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +static void rcu_print_task_stall(struct rcu_node *rnp); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); +#ifdef CONFIG_HOTPLUG_CPU +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); +static void rcu_preempt_offline_cpu(int cpu); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_preempt_check_callbacks(int cpu); +static void rcu_preempt_process_callbacks(void); +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); +#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ +static int rcu_preempt_pending(int cpu); +static int rcu_preempt_needs_cpu(int cpu); +static void __cpuinit rcu_preempt_init_percpu_data(int cpu); +static void rcu_preempt_send_cbs_to_orphanage(void); +static void __init __rcu_init_preempt(void); +#endif /* #else #ifdef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 1cee04f627e..37fbccdf41d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -24,16 +24,19 @@ * Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#include <linux/delay.h> #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); +static int rcu_preempted_readers_exp(struct rcu_node *rnp); + /* * Tell them what RCU they are running. */ -static inline void rcu_bootup_announce(void) +static void __init rcu_bootup_announce(void) { printk(KERN_INFO "Experimental preemptable hierarchical RCU implementation.\n"); @@ -67,7 +70,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - rdp->passed_quiesc_completed = rdp->completed; + rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; } @@ -150,11 +153,65 @@ void __rcu_read_lock(void) } EXPORT_SYMBOL_GPL(__rcu_read_lock); +/* + * Check for preempted RCU readers blocking the current grace period + * for the specified rcu_node structure. If the caller needs a reliable + * answer, it must hold the rcu_node's ->lock. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) +{ + int phase = rnp->gpnum & 0x1; + + return !list_empty(&rnp->blocked_tasks[phase]) || + !list_empty(&rnp->blocked_tasks[phase + 2]); +} + +/* + * Record a quiescent state for all tasks that were previously queued + * on the specified rcu_node structure and that were blocking the current + * RCU grace period. The caller must hold the specified rnp->lock with + * irqs disabled, and this lock is released upon return, but irqs remain + * disabled. + */ +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + struct rcu_node *rnp_p; + + if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { + spin_unlock_irqrestore(&rnp->lock, flags); + return; /* Still need more quiescent states! */ + } + + rnp_p = rnp->parent; + if (rnp_p == NULL) { + /* + * Either there is only one rcu_node in the tree, + * or tasks were kicked up to root rcu_node due to + * CPUs going offline. + */ + rcu_report_qs_rsp(&rcu_preempt_state, flags); + return; + } + + /* Report up the rest of the hierarchy. */ + mask = rnp->grpmask; + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + spin_lock(&rnp_p->lock); /* irqs already disabled. */ + rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); +} + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ static void rcu_read_unlock_special(struct task_struct *t) { int empty; + int empty_exp; unsigned long flags; - unsigned long mask; struct rcu_node *rnp; int special; @@ -196,37 +253,31 @@ static void rcu_read_unlock_special(struct task_struct *t) break; spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); + empty = !rcu_preempted_readers(rnp); + empty_exp = !rcu_preempted_readers_exp(rnp); + smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. - * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() - * drop rnp->lock and restore irq. + * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. */ - if (!empty && rnp->qsmask == 0 && - list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) { - struct rcu_node *rnp_p; - - if (rnp->parent == NULL) { - /* Only one rcu_node in the tree. */ - cpu_quiet_msk_finish(&rcu_preempt_state, flags); - return; - } - /* Report up the rest of the hierarchy. */ - mask = rnp->grpmask; + if (empty) spin_unlock_irqrestore(&rnp->lock, flags); - rnp_p = rnp->parent; - spin_lock_irqsave(&rnp_p->lock, flags); - WARN_ON_ONCE(rnp->qsmask); - cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); - return; - } - spin_unlock(&rnp->lock); + else + rcu_report_unblock_qs_rnp(rnp, flags); + + /* + * If this was the last task on the expedited lists, + * then we need to report up the rcu_node hierarchy. + */ + if (!empty_exp && !rcu_preempted_readers_exp(rnp)) + rcu_report_exp_rnp(&rcu_preempt_state, rnp); + } else { + local_irq_restore(flags); } - local_irq_restore(flags); } /* @@ -257,12 +308,12 @@ static void rcu_print_task_stall(struct rcu_node *rnp) { unsigned long flags; struct list_head *lp; - int phase = rnp->gpnum & 0x1; + int phase; struct task_struct *t; - if (!list_empty(&rnp->blocked_tasks[phase])) { + if (rcu_preempted_readers(rnp)) { spin_lock_irqsave(&rnp->lock, flags); - phase = rnp->gpnum & 0x1; /* re-read under lock. */ + phase = rnp->gpnum & 0x1; lp = &rnp->blocked_tasks[phase]; list_for_each_entry(t, lp, rcu_node_entry) printk(" P%d", t->pid); @@ -281,20 +332,10 @@ static void rcu_print_task_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { - WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])); + WARN_ON_ONCE(rcu_preempted_readers(rnp)); WARN_ON_ONCE(rnp->qsmask); } -/* - * Check for preempted RCU readers for the specified rcu_node structure. - * If the caller needs a reliable answer, it must hold the rcu_node's - * >lock. - */ -static int rcu_preempted_readers(struct rcu_node *rnp) -{ - return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); -} - #ifdef CONFIG_HOTPLUG_CPU /* @@ -303,26 +344,34 @@ static int rcu_preempted_readers(struct rcu_node *rnp) * rcu_node. The reason for not just moving them to the immediate * parent is to remove the need for rcu_read_unlock_special() to * make more than two attempts to acquire the target rcu_node's lock. + * Returns true if there were tasks blocking the current RCU grace + * period. + * + * Returns 1 if there was previously a task blocking the current grace + * period on the specified rcu_node structure. * * The caller must hold rnp->lock with irqs disabled. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { int i; struct list_head *lp; struct list_head *lp_root; + int retval = 0; struct rcu_node *rnp_root = rcu_get_root(rsp); struct task_struct *tp; if (rnp == rnp_root) { WARN_ONCE(1, "Last CPU thought to be offlined?"); - return; /* Shouldn't happen: at least one CPU online. */ + return 0; /* Shouldn't happen: at least one CPU online. */ } WARN_ON_ONCE(rnp != rdp->mynode && (!list_empty(&rnp->blocked_tasks[0]) || - !list_empty(&rnp->blocked_tasks[1]))); + !list_empty(&rnp->blocked_tasks[1]) || + !list_empty(&rnp->blocked_tasks[2]) || + !list_empty(&rnp->blocked_tasks[3]))); /* * Move tasks up to root rcu_node. Rely on the fact that the @@ -330,7 +379,11 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, * rcu_nodes in terms of gp_num value. This fact allows us to * move the blocked_tasks[] array directly, element by element. */ - for (i = 0; i < 2; i++) { + if (rcu_preempted_readers(rnp)) + retval |= RCU_OFL_TASKS_NORM_GP; + if (rcu_preempted_readers_exp(rnp)) + retval |= RCU_OFL_TASKS_EXP_GP; + for (i = 0; i < 4; i++) { lp = &rnp->blocked_tasks[i]; lp_root = &rnp_root->blocked_tasks[i]; while (!list_empty(lp)) { @@ -342,6 +395,7 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, spin_unlock(&rnp_root->lock); /* irqs remain disabled */ } } + return retval; } /* @@ -392,6 +446,186 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu); +/** + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void synchronize_rcu(void) +{ + struct rcu_synchronize rcu; + + if (!rcu_scheduler_active) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + +static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static long sync_rcu_preempt_exp_count; +static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); + +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. + */ +static int rcu_preempted_readers_exp(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blocked_tasks[2]) || + !list_empty(&rnp->blocked_tasks[3]); +} + +/* + * return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold sync_rcu_preempt_exp_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return !rcu_preempted_readers_exp(rnp) && + ACCESS_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold sync_rcu_preempt_exp_mutex. + */ +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +{ + unsigned long flags; + unsigned long mask; + + spin_lock_irqsave(&rnp->lock, flags); + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) + break; + if (rnp->parent == NULL) { + wake_up(&sync_rcu_preempt_exp_wq); + break; + } + mask = rnp->grpmask; + spin_unlock(&rnp->lock); /* irqs remain disabled */ + rnp = rnp->parent; + spin_lock(&rnp->lock); /* irqs already disabled */ + rnp->expmask &= ~mask; + } + spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Snapshot the tasks blocking the newly started preemptible-RCU expedited + * grace period for the specified rcu_node structure. If there are no such + * tasks, report it up the rcu_node hierarchy. + * + * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. + */ +static void +sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) +{ + int must_wait; + + spin_lock(&rnp->lock); /* irqs already disabled */ + list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); + list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); + must_wait = rcu_preempted_readers_exp(rnp); + spin_unlock(&rnp->lock); /* irqs remain disabled */ + if (!must_wait) + rcu_report_exp_rnp(rsp, rnp); +} + +/* + * Wait for an rcu-preempt grace period, but expedite it. The basic idea + * is to invoke synchronize_sched_expedited() to push all the tasks to + * the ->blocked_tasks[] lists, move all entries from the first set of + * ->blocked_tasks[] lists to the second set, and finally wait for this + * second set to drain. + */ +void synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_preempt_state; + long snap; + int trycount = 0; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; + smp_mb(); /* Above access cannot bleed into critical section. */ + + /* + * Acquire lock, falling back to synchronize_rcu() if too many + * lock-acquisition failures. Of course, if someone does the + * expedited grace period for us, just leave. + */ + while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_rcu(); + return; + } + if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) + goto mb_ret; /* Others did our work for us. */ + } + if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) + goto unlock_mb_ret; /* Others did our work for us. */ + + /* force all RCU readers onto blocked_tasks[]. */ + synchronize_sched_expedited(); + + spin_lock_irqsave(&rsp->onofflock, flags); + + /* Initialize ->expmask for all non-leaf rcu_node structures. */ + rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { + spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->expmask = rnp->qsmaskinit; + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + + /* Snapshot current state of ->blocked_tasks[] lists. */ + rcu_for_each_leaf_node(rsp, rnp) + sync_rcu_preempt_exp_init(rsp, rnp); + if (NUM_RCU_NODES > 1) + sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); + + spin_unlock_irqrestore(&rsp->onofflock, flags); + + /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ + rnp = rcu_get_root(rsp); + wait_event(sync_rcu_preempt_exp_wq, + sync_rcu_preempt_exp_done(rnp)); + + /* Clean up and exit. */ + smp_mb(); /* ensure expedited GP seen before counter increment. */ + ACCESS_ONCE(sync_rcu_preempt_exp_count)++; +unlock_mb_ret: + mutex_unlock(&sync_rcu_preempt_exp_mutex); +mb_ret: + smp_mb(); /* ensure subsequent action seen after grace period. */ +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + /* * Check to see if there is any immediate preemptable-RCU-related work * to be done. @@ -410,6 +644,15 @@ static int rcu_preempt_needs_cpu(int cpu) return !!per_cpu(rcu_preempt_data, cpu).nxtlist; } +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + */ +void rcu_barrier(void) +{ + _rcu_barrier(&rcu_preempt_state, call_rcu); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + /* * Initialize preemptable RCU's per-CPU data. */ @@ -419,6 +662,22 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* + * Move preemptable RCU's callbacks to ->orphan_cbs_list. + */ +static void rcu_preempt_send_cbs_to_orphanage(void) +{ + rcu_send_cbs_to_orphanage(&rcu_preempt_state); +} + +/* + * Initialize preemptable RCU's state structures. + */ +static void __init __rcu_init_preempt(void) +{ + RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); +} + +/* * Check for a task exiting while in a preemptable-RCU read-side * critical section, clean up if so. No need to issue warnings, * as debug_check_no_locks_held() already does this if lockdep @@ -439,7 +698,7 @@ void exit_rcu(void) /* * Tell them what RCU they are running. */ -static inline void rcu_bootup_announce(void) +static void __init rcu_bootup_announce(void) { printk(KERN_INFO "Hierarchical RCU implementation.\n"); } @@ -461,6 +720,25 @@ static void rcu_preempt_note_context_switch(int cpu) { } +/* + * Because preemptable RCU does not exist, there are never any preempted + * RCU readers. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) +{ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* Because preemptible RCU does not exist, no quieting of tasks. */ +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) +{ + spin_unlock_irqrestore(&rnp->lock, flags); +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + #ifdef CONFIG_RCU_CPU_STALL_DETECTOR /* @@ -483,25 +761,19 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) WARN_ON_ONCE(rnp->qsmask); } -/* - * Because preemptable RCU does not exist, there are never any preempted - * RCU readers. - */ -static int rcu_preempted_readers(struct rcu_node *rnp) -{ - return 0; -} - #ifdef CONFIG_HOTPLUG_CPU /* * Because preemptable RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections. + * tasks that were blocked within RCU read-side critical sections, and + * such non-existent tasks cannot possibly have been blocking the current + * grace period. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { + return 0; } /* @@ -518,7 +790,7 @@ static void rcu_preempt_offline_cpu(int cpu) * Because preemptable RCU does not exist, it never has any callbacks * to check. */ -void rcu_preempt_check_callbacks(int cpu) +static void rcu_preempt_check_callbacks(int cpu) { } @@ -526,7 +798,7 @@ void rcu_preempt_check_callbacks(int cpu) * Because preemptable RCU does not exist, it never has any callbacks * to process. */ -void rcu_preempt_process_callbacks(void) +static void rcu_preempt_process_callbacks(void) { } @@ -540,6 +812,30 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) EXPORT_SYMBOL_GPL(call_rcu); /* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptable RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Because preemptable RCU does not exist, there is never any need to + * report on tasks preempted in RCU read-side critical sections during + * expedited RCU grace periods. + */ +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +{ + return; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* * Because preemptable RCU does not exist, it never has any work to do. */ static int rcu_preempt_pending(int cpu) @@ -556,6 +852,16 @@ static int rcu_preempt_needs_cpu(int cpu) } /* + * Because preemptable RCU does not exist, rcu_barrier() is just + * another name for rcu_barrier_sched(). + */ +void rcu_barrier(void) +{ + rcu_barrier_sched(); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* * Because preemptable RCU does not exist, there is no per-CPU * data to initialize. */ @@ -563,4 +869,18 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) { } +/* + * Because there is no preemptable RCU, there are no callbacks to move. + */ +static void rcu_preempt_send_cbs_to_orphanage(void) +{ +} + +/* + * Because preemptable RCU does not exist, it need not be initialized. + */ +static void __init __rcu_init_preempt(void) +{ +} + #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index c89f5e9fd17..9d2c88423b3 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -93,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file) return single_open(file, show_rcudata, NULL); } -static struct file_operations rcudata_fops = { +static const struct file_operations rcudata_fops = { .owner = THIS_MODULE, .open = rcudata_open, .read = seq_read, @@ -145,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file) return single_open(file, show_rcudata_csv, NULL); } -static struct file_operations rcudata_csv_fops = { +static const struct file_operations rcudata_csv_fops = { .owner = THIS_MODULE, .open = rcudata_csv_open, .read = seq_read, @@ -155,24 +155,32 @@ static struct file_operations rcudata_csv_fops = { static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) { + long gpnum; int level = 0; + int phase; struct rcu_node *rnp; + gpnum = rsp->gpnum; seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", - rsp->completed, rsp->gpnum, rsp->signaled, + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", + rsp->completed, gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh); + rsp->n_force_qs_lh, rsp->orphan_qlen); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); level = rnp->level; } - seq_printf(m, "%lx/%lx %d:%d ^%d ", + phase = gpnum & 0x1; + seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ", rnp->qsmask, rnp->qsmaskinit, + "T."[list_empty(&rnp->blocked_tasks[phase])], + "E."[list_empty(&rnp->blocked_tasks[phase + 2])], + "T."[list_empty(&rnp->blocked_tasks[!phase])], + "E."[list_empty(&rnp->blocked_tasks[!phase + 2])], rnp->grplo, rnp->grphi, rnp->grpnum); } seq_puts(m, "\n"); @@ -196,7 +204,7 @@ static int rcuhier_open(struct inode *inode, struct file *file) return single_open(file, show_rcuhier, NULL); } -static struct file_operations rcuhier_fops = { +static const struct file_operations rcuhier_fops = { .owner = THIS_MODULE, .open = rcuhier_open, .read = seq_read, @@ -222,7 +230,7 @@ static int rcugp_open(struct inode *inode, struct file *file) return single_open(file, show_rcugp, NULL); } -static struct file_operations rcugp_fops = { +static const struct file_operations rcugp_fops = { .owner = THIS_MODULE, .open = rcugp_open, .read = seq_read, @@ -276,7 +284,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file) return single_open(file, show_rcu_pending, NULL); } -static struct file_operations rcu_pending_fops = { +static const struct file_operations rcu_pending_fops = { .owner = THIS_MODULE, .open = rcu_pending_open, .read = seq_read, diff --git a/kernel/relay.c b/kernel/relay.c index bc188549788..c705a41b4ba 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -60,7 +60,7 @@ static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /* * vm_ops for relay file mappings. */ -static struct vm_operations_struct relay_file_mmap_ops = { +static const struct vm_operations_struct relay_file_mmap_ops = { .fault = relay_buf_fault, .close = relay_file_mmap_close, }; @@ -1198,7 +1198,7 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, relay_consume_bytes(rbuf, buf->private); } -static struct pipe_buf_operations relay_pipe_buf_ops = { +static const struct pipe_buf_operations relay_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, diff --git a/kernel/res_counter.c b/kernel/res_counter.c index e1338f07431..bcdabf37c40 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); counter->limit = RESOURCE_MAX; + counter->soft_limit = RESOURCE_MAX; counter->parent = parent; } @@ -101,6 +102,8 @@ res_counter_member(struct res_counter *counter, int member) return &counter->limit; case RES_FAILCNT: return &counter->failcnt; + case RES_SOFT_LIMIT: + return &counter->soft_limit; }; BUG(); diff --git a/kernel/resource.c b/kernel/resource.c index fb11a58b959..dc15686b7a7 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -308,35 +308,37 @@ static int find_resource(struct resource *root, struct resource *new, void *alignf_data) { struct resource *this = root->child; + resource_size_t start, end; - new->start = root->start; + start = root->start; /* * Skip past an allocated resource that starts at 0, since the assignment * of this->start - 1 to new->end below would cause an underflow. */ if (this && this->start == 0) { - new->start = this->end + 1; + start = this->end + 1; this = this->sibling; } for(;;) { if (this) - new->end = this->start - 1; + end = this->start - 1; else - new->end = root->end; - if (new->start < min) - new->start = min; - if (new->end > max) - new->end = max; - new->start = ALIGN(new->start, align); + end = root->end; + if (start < min) + start = min; + if (end > max) + end = max; + start = ALIGN(start, align); if (alignf) alignf(alignf_data, new, size, align); - if (new->start < new->end && new->end - new->start >= size - 1) { - new->end = new->start + size - 1; + if (start < end && end - start >= size - 1) { + new->start = start; + new->end = start + size - 1; return 0; } if (!this) break; - new->start = this->end + 1; + start = this->end + 1; this = this->sibling; } return -EBUSY; diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 5fcb4fe645e..ddabb54bb5c 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -37,8 +37,8 @@ do { \ if (rt_trace_on) { \ rt_trace_on = 0; \ console_verbose(); \ - if (spin_is_locked(¤t->pi_lock)) \ - spin_unlock(¤t->pi_lock); \ + if (raw_spin_is_locked(¤t->pi_lock)) \ + raw_spin_unlock(¤t->pi_lock); \ } \ } while (0) diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 29bd4baf9e7..a9604815786 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -138,9 +138,9 @@ static void rt_mutex_adjust_prio(struct task_struct *task) { unsigned long flags; - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); __rt_mutex_adjust_prio(task); - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); } /* @@ -195,7 +195,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * Task can not go away as we did a get_task() before ! */ - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; /* @@ -231,8 +231,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto out_unlock_pi; lock = waiter->lock; - if (!spin_trylock(&lock->wait_lock)) { - spin_unlock_irqrestore(&task->pi_lock, flags); + if (!raw_spin_trylock(&lock->wait_lock)) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); cpu_relax(); goto retry; } @@ -240,7 +240,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Deadlock detection */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); ret = deadlock_detect ? -EDEADLK : 0; goto out_unlock_pi; } @@ -253,13 +253,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, plist_add(&waiter->list_entry, &lock->wait_list); /* Release the task */ - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); put_task_struct(task); /* Grab the next task */ task = rt_mutex_owner(lock); get_task_struct(task); - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { /* Boost the owner */ @@ -277,10 +277,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, __rt_mutex_adjust_prio(task); } - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); top_waiter = rt_mutex_top_waiter(lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -288,7 +288,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto again; out_unlock_pi: - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); out_put_task: put_task_struct(task); @@ -313,9 +313,9 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, if (pendowner == task) return 1; - spin_lock_irqsave(&pendowner->pi_lock, flags); + raw_spin_lock_irqsave(&pendowner->pi_lock, flags); if (task->prio >= pendowner->prio) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); return 0; } @@ -325,7 +325,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, * priority. */ if (likely(!rt_mutex_has_waiters(lock))) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); return 1; } @@ -333,7 +333,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, next = rt_mutex_top_waiter(lock); plist_del(&next->pi_list_entry, &pendowner->pi_waiters); __rt_mutex_adjust_prio(pendowner); - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); /* * We are going to steal the lock and a waiter was @@ -350,10 +350,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, * might be task: */ if (likely(next->task != task)) { - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); plist_add(&next->pi_list_entry, &task->pi_waiters); __rt_mutex_adjust_prio(task); - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); } return 1; } @@ -420,7 +420,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0, res; - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -434,17 +434,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, task->pi_blocked_on = waiter; - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { - spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock_irqsave(&owner->pi_lock, flags); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); plist_add(&waiter->pi_list_entry, &owner->pi_waiters); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) chain_walk = 1; @@ -459,12 +459,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, task); - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); return res; } @@ -483,7 +483,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) struct task_struct *pendowner; unsigned long flags; - spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock_irqsave(¤t->pi_lock, flags); waiter = rt_mutex_top_waiter(lock); plist_del(&waiter->list_entry, &lock->wait_list); @@ -500,7 +500,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); - spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); /* * Clear the pi_blocked_on variable and enqueue a possible @@ -509,7 +509,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * waiter with higher priority than pending-owner->normal_prio * is blocked on the unboosted (pending) owner. */ - spin_lock_irqsave(&pendowner->pi_lock, flags); + raw_spin_lock_irqsave(&pendowner->pi_lock, flags); WARN_ON(!pendowner->pi_blocked_on); WARN_ON(pendowner->pi_blocked_on != waiter); @@ -523,7 +523,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) next = rt_mutex_top_waiter(lock); plist_add(&next->pi_list_entry, &pendowner->pi_waiters); } - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); wake_up_process(pendowner); } @@ -541,15 +541,15 @@ static void remove_waiter(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0; - spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock_irqsave(¤t->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); waiter->task = NULL; current->pi_blocked_on = NULL; - spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); if (first && owner != current) { - spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock_irqsave(&owner->pi_lock, flags); plist_del(&waiter->pi_list_entry, &owner->pi_waiters); @@ -564,7 +564,7 @@ static void remove_waiter(struct rt_mutex *lock, if (owner->pi_blocked_on) chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); @@ -575,11 +575,11 @@ static void remove_waiter(struct rt_mutex *lock, /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); } /* @@ -592,15 +592,15 @@ void rt_mutex_adjust_pi(struct task_struct *task) struct rt_mutex_waiter *waiter; unsigned long flags; - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; if (!waiter || waiter->list_entry.prio == task->prio) { - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); @@ -672,14 +672,14 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); if (waiter->task) schedule_rt_mutex(lock); - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); set_current_state(state); } @@ -700,11 +700,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, debug_rt_mutex_init_waiter(&waiter); waiter.task = NULL; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock)) { - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); return 0; } @@ -731,7 +731,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ fixup_rt_mutex_waiters(lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); /* Remove pending timer: */ if (unlikely(timeout)) @@ -758,7 +758,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) { int ret = 0; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); if (likely(rt_mutex_owner(lock) != current)) { @@ -770,7 +770,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) fixup_rt_mutex_waiters(lock); } - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); return ret; } @@ -781,7 +781,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) { - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); debug_rt_mutex_unlock(lock); @@ -789,13 +789,13 @@ rt_mutex_slowunlock(struct rt_mutex *lock) if (!rt_mutex_has_waiters(lock)) { lock->owner = NULL; - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); return; } wakeup_next_waiter(lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); /* Undo pi boosting if necessary: */ rt_mutex_adjust_prio(current); @@ -970,8 +970,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; - spin_lock_init(&lock->wait_lock); - plist_head_init(&lock->wait_list, &lock->wait_lock); + raw_spin_lock_init(&lock->wait_lock); + plist_head_init_raw(&lock->wait_list, &lock->wait_lock); debug_rt_mutex_init(lock, name); } @@ -1032,7 +1032,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, { int ret; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); mark_rt_mutex_waiters(lock); @@ -1040,7 +1040,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, /* We got the lock for task. */ debug_rt_mutex_lock(lock); rt_mutex_set_owner(lock, task, 0); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); rt_mutex_deadlock_account_lock(lock, task); return 1; } @@ -1056,7 +1056,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, */ ret = 0; } - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1106,7 +1106,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, { int ret; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -1124,7 +1124,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); /* * Readjust priority, when we did not get the lock. We might have been diff --git a/kernel/sched.c b/kernel/sched.c index 2f76e06bea5..18cceeecce3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -141,7 +141,7 @@ struct rt_prio_array { struct rt_bandwidth { /* nests inside the rq lock: */ - spinlock_t rt_runtime_lock; + raw_spinlock_t rt_runtime_lock; ktime_t rt_period; u64 rt_runtime; struct hrtimer rt_period_timer; @@ -178,7 +178,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period = ns_to_ktime(period); rt_b->rt_runtime = runtime; - spin_lock_init(&rt_b->rt_runtime_lock); + raw_spin_lock_init(&rt_b->rt_runtime_lock); hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -200,7 +200,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) if (hrtimer_active(&rt_b->rt_period_timer)) return; - spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_b->rt_runtime_lock); for (;;) { unsigned long delta; ktime_t soft, hard; @@ -217,7 +217,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, HRTIMER_MODE_ABS_PINNED, 0); } - spin_unlock(&rt_b->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); } #ifdef CONFIG_RT_GROUP_SCHED @@ -298,7 +298,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var); #endif /* CONFIG_RT_GROUP_SCHED */ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group @@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); */ static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_FAIR_GROUP_SCHED + #ifdef CONFIG_SMP static int root_task_group_empty(void) { @@ -316,7 +318,6 @@ static int root_task_group_empty(void) } #endif -#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) #else /* !CONFIG_USER_SCHED */ @@ -469,7 +470,7 @@ struct rt_rq { u64 rt_time; u64 rt_runtime; /* Nests inside the rq lock: */ - spinlock_t rt_runtime_lock; + raw_spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED unsigned long rt_nr_boosted; @@ -524,7 +525,7 @@ static struct root_domain def_root_domain; */ struct rq { /* runqueue lock: */ - spinlock_t lock; + raw_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -534,14 +535,12 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ - unsigned long last_tick_seen; unsigned char in_nohz_recently; #endif /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; - u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -590,6 +589,8 @@ struct rq { u64 rt_avg; u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; #endif /* calc_load related fields */ @@ -676,6 +677,7 @@ inline void update_rq_clock(struct rq *rq) /** * runqueue_is_locked + * @cpu: the processor in question. * * Returns true if the current cpu runqueue is locked. * This interface allows printk to be called with the runqueue lock @@ -683,7 +685,7 @@ inline void update_rq_clock(struct rq *rq) */ int runqueue_is_locked(int cpu) { - return spin_is_locked(&cpu_rq(cpu)->lock); + return raw_spin_is_locked(&cpu_rq(cpu)->lock); } /* @@ -770,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, if (!sched_feat_names[i]) return -EINVAL; - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -780,7 +782,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp) return single_open(filp, sched_feat_show, NULL); } -static struct file_operations sched_feat_fops = { +static const struct file_operations sched_feat_fops = { .open = sched_feat_open, .write = sched_feat_write, .read = seq_read, @@ -812,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; * default: 0.25ms */ unsigned int sysctl_sched_shares_ratelimit = 250000; +unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; /* * Inject some fuzzyness into changing the per-cpu group shares @@ -890,7 +893,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -914,9 +917,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) next->oncpu = 1; #endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); #else - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); #endif } @@ -946,10 +949,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) { for (;;) { struct rq *rq = task_rq(p); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } } @@ -966,10 +969,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) for (;;) { local_irq_save(*flags); rq = task_rq(p); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; - spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock_irqrestore(&rq->lock, *flags); } } @@ -978,19 +981,19 @@ void task_rq_unlock_wait(struct task_struct *p) struct rq *rq = task_rq(p); smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - spin_unlock_wait(&rq->lock); + raw_spin_unlock_wait(&rq->lock); } static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) __releases(rq->lock) { - spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock_irqrestore(&rq->lock, *flags); } /* @@ -1003,7 +1006,7 @@ static struct rq *this_rq_lock(void) local_irq_disable(); rq = this_rq(); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); return rq; } @@ -1050,10 +1053,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); return HRTIMER_NORESTART; } @@ -1066,10 +1069,10 @@ static void __hrtick_start(void *arg) { struct rq *rq = arg; - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); hrtimer_restart(&rq->hrtick_timer); rq->hrtick_csd_pending = 0; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } /* @@ -1176,7 +1179,7 @@ static void resched_task(struct task_struct *p) { int cpu; - assert_spin_locked(&task_rq(p)->lock); + assert_raw_spin_locked(&task_rq(p)->lock); if (test_tsk_need_resched(p)) return; @@ -1198,10 +1201,10 @@ static void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!spin_trylock_irqsave(&rq->lock, flags)) + if (!raw_spin_trylock_irqsave(&rq->lock, flags)) return; resched_task(cpu_curr(cpu)); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } #ifdef CONFIG_NO_HZ @@ -1270,7 +1273,7 @@ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) #else /* !CONFIG_SMP */ static void resched_task(struct task_struct *p) { - assert_spin_locked(&task_rq(p)->lock); + assert_raw_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } @@ -1563,11 +1566,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -struct update_shares_data { - unsigned long rq_weight[NR_CPUS]; -}; - -static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); +static __read_mostly unsigned long *update_shares_data; static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1577,12 +1576,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight, - struct update_shares_data *usd) + unsigned long *usd_rq_weight) { unsigned long shares, rq_weight; int boost = 0; - rq_weight = usd->rq_weight[cpu]; + rq_weight = usd_rq_weight[cpu]; if (!rq_weight) { boost = 1; rq_weight = NICE_0_LOAD; @@ -1601,11 +1600,11 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, struct rq *rq = cpu_rq(cpu); unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; tg->cfs_rq[cpu]->shares = boost ? 0 : shares; __set_se_shares(tg->se[cpu], shares); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } } @@ -1616,8 +1615,8 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0, shares = 0; - struct update_shares_data *usd; + unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; + unsigned long *usd_rq_weight; struct sched_domain *sd = data; unsigned long flags; int i; @@ -1626,12 +1625,13 @@ static int tg_shares_up(struct task_group *tg, void *data) return 0; local_irq_save(flags); - usd = &__get_cpu_var(update_shares_data); + usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); for_each_cpu(i, sched_domain_span(sd)) { weight = tg->cfs_rq[i]->load.weight; - usd->rq_weight[i] = weight; + usd_rq_weight[i] = weight; + rq_weight += weight; /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@ -1640,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!weight) weight = NICE_0_LOAD; - rq_weight += weight; + sum_weight += weight; shares += tg->cfs_rq[i]->shares; } + if (!rq_weight) + rq_weight = sum_weight; + if ((!shares && rq_weight) || shares > tg->shares) shares = tg->shares; @@ -1651,7 +1654,7 @@ static int tg_shares_up(struct task_group *tg, void *data) shares = tg->shares; for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd); + update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); local_irq_restore(flags); @@ -1703,9 +1706,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) if (root_task_group_empty()) return; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); update_shares(sd); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); } static void update_h_load(long cpu) @@ -1745,7 +1748,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - spin_unlock(&this_rq->lock); + raw_spin_unlock(&this_rq->lock); double_rq_lock(this_rq, busiest); return 1; @@ -1766,14 +1769,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) { int ret = 0; - if (unlikely(!spin_trylock(&busiest->lock))) { + if (unlikely(!raw_spin_trylock(&busiest->lock))) { if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); + raw_spin_unlock(&this_rq->lock); + raw_spin_lock(&busiest->lock); + raw_spin_lock_nested(&this_rq->lock, + SINGLE_DEPTH_NESTING); ret = 1; } else - spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock_nested(&busiest->lock, + SINGLE_DEPTH_NESTING); } return ret; } @@ -1787,7 +1792,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) { if (unlikely(!irqs_disabled())) { /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); + raw_spin_unlock(&this_rq->lock); BUG_ON(1); } @@ -1797,7 +1802,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - spin_unlock(&busiest->lock); + raw_spin_unlock(&busiest->lock); lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } #endif @@ -1812,6 +1817,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #endif static void calc_load_account_active(struct rq *this_rq); +static void update_sysctl(void); +static int get_update_sysctl_factor(void); + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif +} #include "sched_stats.h" #include "sched_idletask.c" @@ -1969,20 +1990,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - set_task_rq(p, cpu); -#ifdef CONFIG_SMP - /* - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. - */ - smp_wmb(); - task_thread_info(p)->cpu = cpu; -#endif -} - static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio, int running) @@ -1995,6 +2002,39 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @p: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). + * + * Function lives here instead of kthread.c because it messes with + * scheduler internals which require locking. + */ +void kthread_bind(struct task_struct *p, unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + set_task_cpu(p, cpu); + p->cpus_allowed = cpumask_of_cpu(cpu); + p->rt.nr_cpus_allowed = 1; + p->flags |= PF_THREAD_BOUND; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} +EXPORT_SYMBOL(kthread_bind); + #ifdef CONFIG_SMP /* * Is this task likely cache-hot: @@ -2007,7 +2047,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; @@ -2029,30 +2069,13 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); struct cfs_rq *old_cfsrq = task_cfs_rq(p), *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); - u64 clock_offset; - - clock_offset = old_rq->clock - new_rq->clock; trace_sched_migrate_task(p, new_cpu); -#ifdef CONFIG_SCHEDSTATS - if (p->se.wait_start) - p->se.wait_start -= clock_offset; - if (p->se.sleep_start) - p->se.sleep_start -= clock_offset; - if (p->se.block_start) - p->se.block_start -= clock_offset; -#endif if (old_cpu != new_cpu) { p->se.nr_migrations++; - new_rq->nr_migrations_in++; -#ifdef CONFIG_SCHEDSTATS - if (task_hot(p, old_rq->clock, NULL)) - schedstat_inc(p, se.nr_forced2_migrations); -#endif perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } @@ -2085,6 +2108,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * it is sufficient to simply update the task's cpu field. */ if (!p->se.on_rq && !task_running(rq, p)) { + update_rq_clock(rq); set_task_cpu(p, dest_cpu); return 0; } @@ -2292,6 +2316,14 @@ void task_oncpu_function_call(struct task_struct *p, preempt_enable(); } +#ifdef CONFIG_SMP +static inline +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +{ + return p->sched_class->select_task_rq(p, sd_flags, wake_flags); +} +#endif + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2311,7 +2343,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - struct rq *rq; + struct rq *rq, *orig_rq; if (!sched_feat(SYNC_WAKEUPS)) wake_flags &= ~WF_SYNC; @@ -2319,7 +2351,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, this_cpu = get_cpu(); smp_wmb(); - rq = task_rq_lock(p, &flags); + rq = orig_rq = task_rq_lock(p, &flags); update_rq_clock(rq); if (!(p->state & state)) goto out; @@ -2343,13 +2375,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; p->state = TASK_WAKING; - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) set_task_cpu(p, cpu); - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); + update_rq_clock(rq); + WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -2406,6 +2440,17 @@ out_running: #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } #endif out: task_rq_unlock(rq, &flags); @@ -2452,7 +2497,6 @@ static void __sched_fork(struct task_struct *p) p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; - p->se.avg_running = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -2474,7 +2518,6 @@ static void __sched_fork(struct task_struct *p) p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; p->se.nr_forced_migrations = 0; - p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; p->se.nr_wakeups_sync = 0; @@ -2515,22 +2558,17 @@ void sched_fork(struct task_struct *p, int clone_flags) __sched_fork(p); /* - * Make sure we do not leak PI boosting priority to the child. - */ - p->prio = current->normal_prio; - - /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { p->policy = SCHED_NORMAL; - - if (p->normal_prio < DEFAULT_PRIO) - p->prio = DEFAULT_PRIO; + p->normal_prio = p->static_prio; + } if (PRIO_TO_NICE(p->static_prio) < 0) { p->static_prio = NICE_TO_PRIO(0); + p->normal_prio = p->static_prio; set_load_weight(p); } @@ -2541,11 +2579,19 @@ void sched_fork(struct task_struct *p, int clone_flags) p->sched_reset_on_fork = 0; } + /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = current->normal_prio; + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + #ifdef CONFIG_SMP - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); #endif set_task_cpu(p, cpu); @@ -2580,19 +2626,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); update_rq_clock(rq); - - p->prio = effective_prio(p); - - if (!p->sched_class->task_new || !current->se.on_rq) { - activate_task(rq, p, 0); - } else { - /* - * Let the scheduling class do new task startup - * management (if any): - */ - p->sched_class->task_new(rq, p); - inc_nr_running(rq); - } + activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -2749,10 +2783,10 @@ static inline void post_schedule(struct rq *rq) if (rq->post_schedule) { unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (rq->curr->sched_class->post_schedule) rq->curr->sched_class->post_schedule(rq); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->post_schedule = 0; } @@ -2816,14 +2850,14 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); - if (unlikely(!mm)) { + if (likely(!mm)) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (likely(!prev->mm)) { prev->active_mm = NULL; rq->prev_mm = oldmm; } @@ -2986,15 +3020,6 @@ static void calc_load_account_active(struct rq *this_rq) } /* - * Externally visible per-cpu scheduler statistics: - * cpu_nr_migrations(cpu) - number of migrations into that cpu - */ -u64 cpu_nr_migrations(int cpu) -{ - return cpu_rq(cpu)->nr_migrations_in; -} - -/* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). */ @@ -3043,15 +3068,15 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) { BUG_ON(!irqs_disabled()); if (rq1 == rq2) { - spin_lock(&rq1->lock); + raw_spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ } else { if (rq1 < rq2) { - spin_lock(&rq1->lock); - spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); } else { - spin_lock(&rq2->lock); - spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); } } update_rq_clock(rq1); @@ -3068,9 +3093,9 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - spin_unlock(&rq1->lock); + raw_spin_unlock(&rq1->lock); if (rq1 != rq2) - spin_unlock(&rq2->lock); + raw_spin_unlock(&rq2->lock); else __release(rq2->lock); } @@ -3116,7 +3141,7 @@ out: void sched_exec(void) { int new_cpu, this_cpu = get_cpu(); - new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); + new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0); put_cpu(); if (new_cpu != this_cpu) sched_migrate_task(current, new_cpu); @@ -3132,10 +3157,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ check_preempt_curr(this_rq, p, 0); } @@ -3658,6 +3679,7 @@ static void update_group_power(struct sched_domain *sd, int cpu) /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @sd: The sched_domain whose statistics are to be updated. * @group: sched_group whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu @@ -4093,7 +4115,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4166,14 +4188,15 @@ redo: if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - spin_lock_irqsave(&busiest->lock, flags); + raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { - spin_unlock_irqrestore(&busiest->lock, flags); + raw_spin_unlock_irqrestore(&busiest->lock, + flags); all_pinned = 1; goto out_one_pinned; } @@ -4183,7 +4206,7 @@ redo: busiest->push_cpu = this_cpu; active_balance = 1; } - spin_unlock_irqrestore(&busiest->lock, flags); + raw_spin_unlock_irqrestore(&busiest->lock, flags); if (active_balance) wake_up_process(busiest->migration_thread); @@ -4256,7 +4279,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int all_pinned = 0; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4365,10 +4388,10 @@ redo: /* * Should not call ttwu while holding a rq->lock */ - spin_unlock(&this_rq->lock); + raw_spin_unlock(&this_rq->lock); if (active_balance) wake_up_process(busiest->migration_thread); - spin_lock(&this_rq->lock); + raw_spin_lock(&this_rq->lock); } else sd->nr_balance_failed = 0; @@ -4396,6 +4419,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq) int pulled_task = 0; unsigned long next_balance = jiffies + HZ; + this_rq->idle_stamp = this_rq->clock; + + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4410,8 +4438,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + if (pulled_task) { + this_rq->idle_stamp = 0; break; + } } if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* @@ -4646,7 +4676,7 @@ int select_nohz_load_balancer(int stop_tick) cpumask_set_cpu(cpu, nohz.cpu_mask); /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@ -5013,8 +5043,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime = cputime_add(p->gtime, cputime); /* Add guest time to cpustat. */ - cpustat->user = cputime64_add(cpustat->user, tmp); - cpustat->guest = cputime64_add(cpustat->guest, tmp); + if (TASK_NICE(p) > 0) { + cpustat->nice = cputime64_add(cpustat->nice, tmp); + cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); + } else { + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); + } } /* @@ -5129,60 +5164,86 @@ void account_idle_ticks(unsigned long ticks) * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -cputime_t task_utime(struct task_struct *p) +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - return p->utime; + *ut = p->utime; + *st = p->stime; } -cputime_t task_stime(struct task_struct *p) +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - return p->stime; + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; } #else -cputime_t task_utime(struct task_struct *p) + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) +#endif + +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); - u64 temp; + cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); /* * Use CFS's precise accounting: */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - temp *= utime; + u64 temp; + + temp = (u64)(rtime * utime); do_div(temp, total); - } - utime = (clock_t)temp; + utime = (cputime_t)temp; + } else + utime = rtime; - p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); - return p->prev_utime; + /* + * Compare with previous values, to keep monotonicity: + */ + p->prev_utime = max(p->prev_utime, utime); + p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); + + *ut = p->prev_utime; + *st = p->prev_stime; } -cputime_t task_stime(struct task_struct *p) +/* + * Must be called with siglock held. + */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - clock_t stime; + struct signal_struct *sig = p->signal; + struct task_cputime cputime; + cputime_t rtime, utime, total; - /* - * Use CFS's precise accounting. (we subtract utime from - * the total, to make sure the total observed by userspace - * grows monotonically - apps rely on that): - */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - - cputime_to_clock_t(task_utime(p)); + thread_group_cputime(p, &cputime); - if (stime >= 0) - p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + total = cputime_add(cputime.utime, cputime.stime); + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - return p->prev_stime; -} -#endif + if (total) { + u64 temp; -inline cputime_t task_gtime(struct task_struct *p) -{ - return p->gtime; + temp = (u64)(rtime * cputime.utime); + do_div(temp, total); + utime = (cputime_t)temp; + } else + utime = rtime; + + sig->prev_utime = max(sig->prev_utime, utime); + sig->prev_stime = max(sig->prev_stime, + cputime_sub(rtime, sig->prev_utime)); + + *ut = sig->prev_utime; + *st = sig->prev_stime; } +#endif /* * This function gets called by the timer code, with HZ frequency. @@ -5199,11 +5260,11 @@ void scheduler_tick(void) sched_clock_tick(); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); perf_event_task_tick(curr, cpu); @@ -5317,13 +5378,14 @@ static inline void schedule_debug(struct task_struct *prev) #endif } -static void put_prev_task(struct rq *rq, struct task_struct *p) +static void put_prev_task(struct rq *rq, struct task_struct *prev) { - u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; + if (prev->state == TASK_RUNNING) { + u64 runtime = prev->se.sum_exec_runtime; - update_avg(&p->se.avg_running, runtime); + runtime -= prev->se.prev_sum_exec_runtime; + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - if (p->state == TASK_RUNNING) { /* * In order to avoid avg_overlap growing stale when we are * indeed overlapping and hence not getting put to sleep, grow @@ -5333,12 +5395,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p) * correlates to the amount of cache footprint a task can * build up. */ - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - update_avg(&p->se.avg_overlap, runtime); - } else { - update_avg(&p->se.avg_running, 0); + update_avg(&prev->se.avg_overlap, runtime); } - p->sched_class->put_prev_task(rq, p); + prev->sched_class->put_prev_task(rq, prev); } /* @@ -5399,7 +5458,7 @@ need_resched_nonpreemptible: if (sched_feat(HRTICK)) hrtick_clear(rq); - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); clear_tsk_need_resched(prev); @@ -5435,7 +5494,7 @@ need_resched_nonpreemptible: cpu = smp_processor_id(); rq = cpu_rq(cpu); } else - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); post_schedule(rq); @@ -5448,7 +5507,7 @@ need_resched_nonpreemptible: } EXPORT_SYMBOL(schedule); -#ifdef CONFIG_SMP +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* * Look out! "owner" is an entirely speculative pointer * access and not reliable. @@ -6142,22 +6201,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) BUG_ON(p->se.on_rq); p->policy = policy; - switch (p->policy) { - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - p->sched_class = &fair_sched_class; - break; - case SCHED_FIFO: - case SCHED_RR: - p->sched_class = &rt_sched_class; - break; - } - p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); + if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; set_load_weight(p); } @@ -6272,7 +6323,7 @@ recheck: * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: */ - spin_lock_irqsave(&p->pi_lock, flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); /* * To be able to change p->policy safely, the apropriate * runqueue lock must be held. @@ -6282,7 +6333,7 @@ recheck: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } update_rq_clock(rq); @@ -6306,7 +6357,7 @@ recheck: check_class_changed(rq, p, prev_class, oldprio, running); } __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); rt_mutex_adjust_pi(p); @@ -6560,6 +6611,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; + unsigned long flags; + struct rq *rq; int retval; get_online_cpus(); @@ -6574,7 +6627,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) if (retval) goto out_unlock; + rq = task_rq_lock(p, &flags); cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + task_rq_unlock(rq, &flags); out_unlock: read_unlock(&tasklist_lock); @@ -6632,7 +6687,7 @@ SYSCALL_DEFINE0(sched_yield) */ __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - _raw_spin_unlock(&rq->lock); + do_raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); schedule(); @@ -6720,9 +6775,6 @@ EXPORT_SYMBOL(yield); /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. - * - * But don't do that if it is a deliberate, throttling IO wait (this task - * has set its backing_dev_info: the queue against which it should throttle) */ void __sched io_schedule(void) { @@ -6815,6 +6867,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, { struct task_struct *p; unsigned int time_slice; + unsigned long flags; + struct rq *rq; int retval; struct timespec t; @@ -6831,7 +6885,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - time_slice = p->sched_class->get_rr_interval(p); + rq = task_rq_lock(p, &flags); + time_slice = p->sched_class->get_rr_interval(rq, p); + task_rq_unlock(rq, &flags); read_unlock(&tasklist_lock); jiffies_to_timespec(time_slice, &t); @@ -6905,7 +6961,7 @@ void show_state_filter(unsigned long state_filter) /* * Only show locks if all tasks are dumped: */ - if (state_filter == -1) + if (!state_filter) debug_show_all_locks(); } @@ -6927,12 +6983,11 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __sched_fork(idle); idle->se.exec_start = sched_clock(); - idle->prio = idle->normal_prio = MAX_PRIO; cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); @@ -6940,7 +6995,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) idle->oncpu = 1; #endif - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ #if defined(CONFIG_PREEMPT) @@ -6973,22 +7028,43 @@ cpumask_var_t nohz_cpu_mask; * * This idea comes from the SD scheduler of Con Kolivas: */ -static inline void sched_init_granularity(void) +static int get_update_sysctl_factor(void) { - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 200000000; + unsigned int cpus = min_t(int, num_online_cpus(), 8); + unsigned int factor; - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; + switch (sysctl_sched_tunable_scaling) { + case SCHED_TUNABLESCALING_NONE: + factor = 1; + break; + case SCHED_TUNABLESCALING_LINEAR: + factor = cpus; + break; + case SCHED_TUNABLESCALING_LOG: + default: + factor = 1 + ilog2(cpus); + break; + } - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; + return factor; +} - sysctl_sched_wakeup_granularity *= factor; +static void update_sysctl(void) +{ + unsigned int factor = get_update_sysctl_factor(); + +#define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) + SET_SYSCTL(sched_min_granularity); + SET_SYSCTL(sched_latency); + SET_SYSCTL(sched_wakeup_granularity); + SET_SYSCTL(sched_shares_ratelimit); +#undef SET_SYSCTL +} - sysctl_sched_shares_ratelimit *= factor; +static inline void sched_init_granularity(void) +{ + update_sysctl(); } #ifdef CONFIG_SMP @@ -7025,7 +7101,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpumask_intersects(new_mask, cpu_online_mask)) { + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; } @@ -7047,7 +7123,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ struct task_struct *mt = rq->migration_thread; @@ -7136,10 +7212,10 @@ static int migration_thread(void *data) struct migration_req *req; struct list_head *head; - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); if (cpu_is_offline(cpu)) { - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); break; } @@ -7151,7 +7227,7 @@ static int migration_thread(void *data) head = &rq->migration_queue; if (list_empty(head)) { - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); schedule(); set_current_state(TASK_INTERRUPTIBLE); continue; @@ -7160,14 +7236,14 @@ static int migration_thread(void *data) list_del_init(head->next); if (req->task != NULL) { - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); __migrate_task(req->task, cpu, req->dest_cpu); } else if (likely(cpu == (badcpu = smp_processor_id()))) { req->dest_cpu = RCU_MIGRATION_GOT_QS; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } else { req->dest_cpu = RCU_MIGRATION_MUST_SYNC; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); } local_irq_enable(); @@ -7201,19 +7277,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) again: /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto move; /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); if (dest_cpu < nr_cpu_ids) goto move; /* No more Mr. Nice Guy. */ if (dest_cpu >= nr_cpu_ids) { cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); /* * Don't tell them about moving exiting tasks or @@ -7242,7 +7318,7 @@ move: */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); unsigned long flags; local_irq_save(flags); @@ -7290,14 +7366,14 @@ void sched_idle_next(void) * Strictly not necessary since rest of the CPUs are stopped by now * and interrupts disabled on the current cpu. */ - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); update_rq_clock(rq); activate_task(rq, p, 0); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -7333,9 +7409,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) * that's OK. No task can be added to this CPU, so iteration is * fine. */ - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); move_task_off_dead_cpu(dead_cpu, p); - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); put_task_struct(p); } @@ -7376,17 +7452,16 @@ static struct ctl_table sd_ctl_dir[] = { .procname = "sched_domain", .mode = 0555, }, - {0, }, + {} }; static struct ctl_table sd_ctl_root[] = { { - .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, .child = sd_ctl_dir, }, - {0, }, + {} }; static struct ctl_table *sd_alloc_ctl_entry(int n) @@ -7496,7 +7571,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) static struct ctl_table_header *sd_sysctl_header; static void register_sched_domain_sysctl(void) { - int i, cpu_num = num_online_cpus(); + int i, cpu_num = num_possible_cpus(); struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); char buf[32]; @@ -7506,7 +7581,7 @@ static void register_sched_domain_sysctl(void) if (entry == NULL) return; - for_each_online_cpu(i) { + for_each_possible_cpu(i) { snprintf(buf, 32, "cpu%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); entry->mode = 0555; @@ -7602,13 +7677,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); break; #ifdef CONFIG_HOTPLUG_CPU @@ -7633,14 +7708,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) put_task_struct(rq->migration_thread); rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); - rq->idle->static_prio = MAX_PRIO; __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); @@ -7650,30 +7724,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) * they didn't take sched_hotcpu_mutex. Just wake up * the requestors. */ - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); while (!list_empty(&rq->migration_queue)) { struct migration_req *req; req = list_entry(rq->migration_queue.next, struct migration_req, list); list_del_init(&req->list); - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); complete(&req->done); - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); } - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); break; case CPU_DYING: case CPU_DYING_FROZEN: /* Update our root-domain */ rq = cpu_rq(cpu); - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); break; #endif } @@ -7710,6 +7784,16 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG +static __read_mostly int sched_domain_debug_enabled; + +static int __init sched_domain_debug_setup(char *str) +{ + sched_domain_debug_enabled = 1; + + return 0; +} +early_param("sched_debug", sched_domain_debug_setup); + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { @@ -7796,6 +7880,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) cpumask_var_t groupmask; int level = 0; + if (!sched_domain_debug_enabled) + return; + if (!sd) { printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); return; @@ -7875,6 +7962,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void free_rootdomain(struct root_domain *rd) { + synchronize_sched(); + cpupri_cleanup(&rd->cpupri); free_cpumask_var(rd->rto_mask); @@ -7888,7 +7977,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) struct root_domain *old_rd = NULL; unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { old_rd = rq->rd; @@ -7914,7 +8003,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) free_rootdomain(old_rd); @@ -8015,6 +8104,7 @@ static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { + alloc_bootmem_cpumask_var(&cpu_isolated_map); cpulist_parse(str, cpu_isolated_map); return 1; } @@ -8199,14 +8289,14 @@ enum s_alloc { */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); +static DEFINE_PER_CPU(struct static_sched_group, sched_groups); static int cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *unused) { if (sg) - *sg = &per_cpu(sched_group_cpus, cpu).sg; + *sg = &per_cpu(sched_groups, cpu).sg; return cpu; } #endif /* CONFIG_SCHED_SMT */ @@ -8851,7 +8941,7 @@ static int build_sched_domains(const struct cpumask *cpu_map) return __build_sched_domains(cpu_map, NULL); } -static struct cpumask *doms_cur; /* current sched domains */ +static cpumask_var_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ @@ -8873,6 +8963,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void) return 0; } +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ + int i; + cpumask_var_t *doms; + + doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + if (!doms) + return NULL; + for (i = 0; i < ndoms; i++) { + if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { + free_sched_domains(doms, i); + return NULL; + } + } + return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ + unsigned int i; + for (i = 0; i < ndoms; i++) + free_cpumask_var(doms[i]); + kfree(doms); +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -8884,12 +8999,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); + doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) - doms_cur = fallback_doms; - cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); + doms_cur = &fallback_doms; + cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); dattr_cur = NULL; - err = build_sched_domains(doms_cur); + err = build_sched_domains(doms_cur[0]); register_sched_domain_sysctl(); return err; @@ -8939,19 +9054,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the * current 'doms_cur' domains and in the new 'doms_new', we can leave * it as it is. * - * The passed in 'doms_new' should be kmalloc'd. This routine takes - * ownership of it and will kfree it when done with it. If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL && - * ndoms_new == 1, and partition_sched_domains() will fallback to - * the single partition 'fallback_doms', it also forces the domains - * to be rebuilt. + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains. This routine takes ownership of it and will + * free_sched_domains it when done with it. If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. * * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, @@ -8959,8 +9074,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * Call with hotplug lock held */ -/* FIXME: Change to struct cpumask *doms_new[] */ -void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { int i, j, n; @@ -8979,40 +9093,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(&doms_cur[i], &doms_new[j]) + if (cpumask_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur + i); + detach_destroy_domains(doms_cur[i]); match1: ; } if (doms_new == NULL) { ndoms_cur = 0; - doms_new = fallback_doms; - cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); + doms_new = &fallback_doms; + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur && !new_topology; j++) { - if (cpumask_equal(&doms_new[i], &doms_cur[j]) + if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* no match - add a new doms_new */ - __build_sched_domains(doms_new + i, + __build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); match2: ; } /* Remember the new sched domains */ - if (doms_cur != fallback_doms) - kfree(doms_cur); + if (doms_cur != &fallback_doms) + free_sched_domains(doms_cur, ndoms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; dattr_cur = dattr_new; @@ -9123,8 +9237,10 @@ static int update_sched_domains(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; @@ -9171,7 +9287,7 @@ void __init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(cpu_online_mask); + arch_init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); @@ -9244,13 +9360,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks, &rq->lock); + plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; - spin_lock_init(&rt_rq->rt_runtime_lock); + raw_spin_lock_init(&rt_rq->rt_runtime_lock); #ifdef CONFIG_RT_GROUP_SCHED rt_rq->rt_nr_boosted = 0; @@ -9334,10 +9450,6 @@ void __init sched_init(void) #ifdef CONFIG_CPUMASK_OFFSTACK alloc_size += num_possible_cpus() * cpumask_size(); #endif - /* - * As sched_init() is called before page_alloc is setup, - * we use alloc_bootmem(). - */ if (alloc_size) { ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); @@ -9406,11 +9518,15 @@ void __init sched_init(void) #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_GROUP_SCHED */ +#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP + update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), + __alignof__(unsigned long)); +#endif for_each_possible_cpu(i) { struct rq *rq; rq = cpu_rq(i); - spin_lock_init(&rq->lock); + raw_spin_lock_init(&rq->lock); rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; @@ -9470,7 +9586,7 @@ void __init sched_init(void) #elif defined CONFIG_USER_SCHED init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); init_tg_rt_entry(&init_task_group, - &per_cpu(init_rt_rq, i), + &per_cpu(init_rt_rq_var, i), &per_cpu(init_sched_rt_entity, i), i, 1, root_task_group.rt_se[i]); #endif @@ -9488,6 +9604,8 @@ void __init sched_init(void) rq->cpu = i; rq->online = 0; rq->migration_thread = NULL; + rq->idle_stamp = 0; + rq->avg_idle = 2*sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); #endif @@ -9506,7 +9624,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_MUTEXES - plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); + plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); #endif /* @@ -9531,13 +9649,15 @@ void __init sched_init(void) current->sched_class = &fair_sched_class; /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + /* May be allocated at isolcpus cmdline parse time */ + if (cpu_isolated_map == NULL) + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ perf_event_init(); @@ -9629,13 +9749,13 @@ void normalize_rt_tasks(void) continue; } - spin_lock(&p->pi_lock); + raw_spin_lock(&p->pi_lock); rq = __task_rq_lock(p); normalize_task(rq, p); __task_rq_unlock(rq); - spin_unlock(&p->pi_lock); + raw_spin_unlock(&p->pi_lock); } while_each_thread(g, p); read_unlock_irqrestore(&tasklist_lock, flags); @@ -9731,13 +9851,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) se = kzalloc_node(sizeof(struct sched_entity), GFP_KERNEL, cpu_to_node(i)); if (!se) - goto err; + goto err_free_rq; init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); } return 1; + err_free_rq: + kfree(cfs_rq); err: return 0; } @@ -9819,13 +9941,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) rt_se = kzalloc_node(sizeof(struct sched_rt_entity), GFP_KERNEL, cpu_to_node(i)); if (!rt_se) - goto err; + goto err_free_rq; init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); } return 1; + err_free_rq: + kfree(rt_rq); err: return 0; } @@ -9994,9 +10118,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) struct rq *rq = cfs_rq->rq; unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __set_se_shares(se, shares); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static DEFINE_MUTEX(shares_mutex); @@ -10181,18 +10305,18 @@ static int tg_set_bandwidth(struct task_group *tg, if (err) goto unlock; - spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); tg->rt_bandwidth.rt_runtime = rt_runtime; for_each_possible_cpu(i) { struct rt_rq *rt_rq = tg->rt_rq[i]; - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_runtime; - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); } - spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); + raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); @@ -10297,22 +10421,22 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_runtime == 0) return -EBUSY; - spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = global_rt_runtime(); - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); } - spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); return 0; } #endif /* CONFIG_RT_GROUP_SCHED */ int sched_rt_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -10323,7 +10447,7 @@ int sched_rt_handler(struct ctl_table *table, int write, old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { ret = sched_rt_global_constraints(); @@ -10377,8 +10501,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) } static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk) +cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) @@ -10388,15 +10511,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, if (tsk->sched_class != &fair_sched_class) return -EINVAL; #endif + return 0; +} +static int +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *tsk, bool threadgroup) +{ + int retval = cpu_cgroup_can_attach_task(cgrp, tsk); + if (retval) + return retval; + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + retval = cpu_cgroup_can_attach_task(cgrp, c); + if (retval) { + rcu_read_unlock(); + return retval; + } + } + rcu_read_unlock(); + } return 0; } static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk) + struct cgroup *old_cont, struct task_struct *tsk, + bool threadgroup) { sched_move_task(tsk); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + sched_move_task(c); + } + rcu_read_unlock(); + } } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -10567,9 +10720,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. */ - spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_lock_irq(&cpu_rq(cpu)->lock); data = *cpuusage; - spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); #else data = *cpuusage; #endif @@ -10585,9 +10738,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. */ - spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_lock_irq(&cpu_rq(cpu)->lock); *cpuusage = val; - spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); #else *cpuusage = val; #endif @@ -10821,9 +10974,9 @@ void synchronize_sched_expedited(void) init_completion(&req->done); req->task = NULL; req->dest_cpu = RCU_MIGRATION_NEED_QS; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); list_add(&req->list, &rq->migration_queue); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); wake_up_process(rq->migration_thread); } for_each_online_cpu(cpu) { @@ -10831,13 +10984,14 @@ void synchronize_sched_expedited(void) req = &per_cpu(rcu_migration_req, cpu); rq = cpu_rq(cpu); wait_for_completion(&req->done); - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) need_full_sync = 1; req->dest_cpu = RCU_MIGRATION_IDLE; - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + synchronize_sched_expedited_count++; mutex_unlock(&rcu_sched_expedited_mutex); put_online_cpus(); if (need_full_sync) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ac2e1dc708b..479ce5682d7 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -127,7 +127,7 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (cmpxchg(&scd->clock, old_clock, clock) != old_clock) + if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) goto again; return clock; @@ -163,7 +163,7 @@ again: val = remote_clock; } - if (cmpxchg(ptr, old_val, val) != old_val) + if (cmpxchg64(ptr, old_val, val) != old_val) goto again; return val; diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 0f052fc674d..597b33099df 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -135,26 +135,26 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) if (likely(newpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; - spin_lock_irqsave(&vec->lock, flags); + raw_spin_lock_irqsave(&vec->lock, flags); cpumask_set_cpu(cpu, vec->mask); vec->count++; if (vec->count == 1) set_bit(newpri, cp->pri_active); - spin_unlock_irqrestore(&vec->lock, flags); + raw_spin_unlock_irqrestore(&vec->lock, flags); } if (likely(oldpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; - spin_lock_irqsave(&vec->lock, flags); + raw_spin_lock_irqsave(&vec->lock, flags); vec->count--; if (!vec->count) clear_bit(oldpri, cp->pri_active); cpumask_clear_cpu(cpu, vec->mask); - spin_unlock_irqrestore(&vec->lock, flags); + raw_spin_unlock_irqrestore(&vec->lock, flags); } *currpri = newpri; @@ -180,7 +180,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem) for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { struct cpupri_vec *vec = &cp->pri_to_cpu[i]; - spin_lock_init(&vec->lock); + raw_spin_lock_init(&vec->lock); vec->count = 0; if (!zalloc_cpumask_var(&vec->mask, gfp)) goto cleanup; diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 9a7e859b8fb..7cb5bb6b95b 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -12,7 +12,7 @@ /* values 2-101 are RT priorities 0-99 */ struct cpupri_vec { - spinlock_t lock; + raw_spinlock_t lock; int count; cpumask_var_t mask; }; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index efb84409bc4..67f95aada4b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -184,7 +184,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); @@ -192,7 +192,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) max_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", @@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu) #ifdef CONFIG_SCHEDSTATS #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P(yld_count); P(sched_switch); P(sched_count); P(sched_goidle); +#ifdef CONFIG_SMP + P64(avg_idle); +#endif P(ttwu_count); P(ttwu_local); @@ -305,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu) print_rq(m, rq, cpu); } +static const char *sched_tunable_scaling_names[] = { + "none", + "logaritmic", + "linear" +}; + static int sched_debug_show(struct seq_file *m, void *v) { u64 now = ktime_to_ns(ktime_get()); @@ -330,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v) #undef PN #undef P + SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", + sysctl_sched_tunable_scaling, + sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + for_each_online_cpu(cpu) print_cpu(m, cpu); @@ -395,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.sum_exec_runtime); PN(se.avg_overlap); PN(se.avg_wakeup); - PN(se.avg_running); nr_switches = p->nvcsw + p->nivcsw; @@ -419,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.nr_failed_migrations_running); P(se.nr_failed_migrations_hot); P(se.nr_forced_migrations); - P(se.nr_forced2_migrations); P(se.nr_wakeups); P(se.nr_wakeups_sync); P(se.nr_wakeups_migrate); @@ -495,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p) p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; p->se.nr_forced_migrations = 0; - p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; p->se.nr_wakeups_sync = 0; p->se.nr_wakeups_migrate = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ecc637a0d59..5bedf6e3ebf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -21,6 +21,7 @@ */ #include <linux/latencytop.h> +#include <linux/sched.h> /* * Targeted preemption latency for CPU-bound tasks: @@ -35,12 +36,26 @@ * run vmstat and monitor the context-switches (cs) field) */ unsigned int sysctl_sched_latency = 5000000ULL; +unsigned int normalized_sysctl_sched_latency = 5000000ULL; + +/* + * The initial- and re-scaling of tunables is configurable + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * + * Options are: + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + */ +enum sched_tunable_scaling sysctl_sched_tunable_scaling + = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 1000000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity @@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield; * have immediate wakeup/sleep latencies. */ unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ #ifdef CONFIG_SCHED_DEBUG -int sched_nr_latency_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, +int sched_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int factor = get_update_sysctl_factor(); if (ret || !write) return ret; @@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, sysctl_sched_min_granularity); +#define WRT_SYSCTL(name) \ + (normalized_sysctl_##name = sysctl_##name / (factor)) + WRT_SYSCTL(sched_min_granularity); + WRT_SYSCTL(sched_latency); + WRT_SYSCTL(sched_wakeup_granularity); + WRT_SYSCTL(sched_shares_ratelimit); +#undef WRT_SYSCTL + return 0; } #endif @@ -822,6 +847,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * re-elected due to buddy favours. */ clear_buddies(cfs_rq, curr); + return; + } + + /* + * Ensure that a task that missed wakeup preemption by a + * narrow margin doesn't have to wait for a full slice. + * This also mitigates buddy induced latencies under load. + */ + if (!sched_feat(WAKEUP_PREEMPT)) + return; + + if (delta_exec < sysctl_sched_min_granularity) + return; + + if (cfs_rq->nr_running > 1) { + struct sched_entity *se = __pick_next_entity(cfs_rq); + s64 delta = curr->vruntime - se->vruntime; + + if (delta > ideal_runtime) + resched_task(rq_of(cfs_rq)->curr); } } @@ -861,12 +906,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *left = se; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) - return cfs_rq->next; + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) - return cfs_rq->last; + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) + se = cfs_rq->last; + + clear_buddies(cfs_rq, se); return se; } @@ -1319,6 +1370,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } /* + * Try and locate an idle CPU in the sched_domain. + */ +static int +select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) +{ + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int i; + + /* + * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE + * test in select_task_rq_fair) and the prev_cpu is idle then that's + * always a better target than the current cpu. + */ + if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) + return prev_cpu; + + /* + * Otherwise, iterate the domain and find an elegible idle cpu. + */ + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (!cpu_rq(i)->cfs.nr_running) { + target = i; + break; + } + } + + return target; +} + +/* * sched_balance_self: balance the current task (running on cpu) in domains * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and * SD_BALANCE_EXEC. @@ -1346,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag new_cpu = prev_cpu; } - rcu_read_lock(); for_each_domain(cpu, tmp) { /* * If power savings logic is enabled for a domain, see if we @@ -1372,11 +1453,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag want_sd = 0; } - if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && - cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + /* + * While iterating the domains looking for a spanning + * WAKE_AFFINE domain, adjust the affine target to any idle cpu + * in cache sharing domains along the way. + */ + if (want_affine) { + int target = -1; - affine_sd = tmp; - want_affine = 0; + /* + * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. + */ + if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) + target = cpu; + + /* + * If there's an idle sibling in this domain, make that + * the wake_affine target instead of the current cpu. + */ + if (tmp->flags & SD_PREFER_SIBLING) + target = select_idle_sibling(p, tmp, target); + + if (target >= 0) { + if (tmp->flags & SD_WAKE_AFFINE) { + affine_sd = tmp; + want_affine = 0; + } + cpu = target; + } } if (!want_sd && !want_affine) @@ -1403,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag update_shares(tmp); } - if (affine_sd && wake_affine(affine_sd, p, sync)) { - new_cpu = cpu; - goto out; - } + if (affine_sd && wake_affine(affine_sd, p, sync)) + return cpu; while (sd) { int load_idx = sd->forkexec_idx; @@ -1447,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag /* while loop will break here if sd == NULL */ } -out: - rcu_read_unlock(); return new_cpu; } #endif /* CONFIG_SMP */ @@ -1568,13 +1669,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int sync = wake_flags & WF_SYNC; + int scale = cfs_rq->nr_running >= sched_nr_latency; - update_curr(cfs_rq); - - if (unlikely(rt_prio(p->prio))) { - resched_task(curr); - return; - } + if (unlikely(rt_prio(p->prio))) + goto preempt; if (unlikely(p->sched_class != &fair_sched_class)) return; @@ -1582,18 +1680,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(se == pse)) return; - /* - * Only set the backward buddy when the current task is still on the - * rq. This can happen when a wakeup gets interleaved with schedule on - * the ->pre_schedule() or idle_balance() point, either of which can - * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, for - * obvious reasons its a bad idea to schedule back to the idle thread. - */ - if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) - set_last_buddy(se); - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) set_next_buddy(pse); /* @@ -1611,36 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* Idle tasks are by definition preempted by everybody. */ - if (unlikely(curr->policy == SCHED_IDLE)) { - resched_task(curr); - return; - } + if (unlikely(curr->policy == SCHED_IDLE)) + goto preempt; - if ((sched_feat(WAKEUP_SYNC) && sync) || - (sched_feat(WAKEUP_OVERLAP) && - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { - resched_task(curr); - return; - } + if (sched_feat(WAKEUP_SYNC) && sync) + goto preempt; - if (sched_feat(WAKEUP_RUNNING)) { - if (pse->avg_running < se->avg_running) { - set_next_buddy(pse); - resched_task(curr); - return; - } - } + if (sched_feat(WAKEUP_OVERLAP) && + se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost) + goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; + update_curr(cfs_rq); find_matching_se(&se, &pse); - BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) - resched_task(curr); + goto preempt; + + return; + +preempt: + resched_task(curr); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved + * with schedule on the ->pre_schedule() or idle_balance() + * point, either of which can * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, + * for obvious reasons its a bad idea to schedule back to it. + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; + + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); } static struct task_struct *pick_next_task_fair(struct rq *rq) @@ -1649,21 +1744,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; - if (unlikely(!cfs_rq->nr_running)) + if (!cfs_rq->nr_running) return NULL; do { se = pick_next_entity(cfs_rq); - /* - * If se was a buddy, clear it so that it will have to earn - * the favour again. - * - * If se was not a buddy, clear the buddies because neither - * was elegible to run, let them earn it again. - * - * IOW. unconditionally clear buddies. - */ - __clear_buddies(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -1830,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } + +static void rq_online_fair(struct rq *rq) +{ + update_sysctl(); +} + +static void rq_offline_fair(struct rq *rq) +{ + update_sysctl(); +} + #endif /* CONFIG_SMP */ /* @@ -1847,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) } /* - * Share the fairness runtime between parent and child, thus the - * total amount of pressure for CPU stays equal - new tasks - * get a chance to run but frequent forkers are not allowed to - * monopolize the CPU. Note: the parent runqueue is locked, - * the child is not running yet. + * called on fork with the child task as argument from the parent's context + * - child not yet on the tasklist + * - preemption disabled */ -static void task_new_fair(struct rq *rq, struct task_struct *p) +static void task_fork_fair(struct task_struct *p) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct cfs_rq *cfs_rq = task_cfs_rq(current); struct sched_entity *se = &p->se, *curr = cfs_rq->curr; int this_cpu = smp_processor_id(); + struct rq *rq = this_rq(); + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); - sched_info_queued(p); + if (unlikely(task_cpu(p) != this_cpu)) + __set_task_cpu(p, this_cpu); update_curr(cfs_rq); + if (curr) se->vruntime = curr->vruntime; place_entity(cfs_rq, se, 1); - /* 'curr' will be NULL if the child belongs to a different group */ - if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && - curr && entity_before(curr, se)) { + if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. @@ -1877,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } - enqueue_task_fair(rq, p, 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -1939,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p) } #endif -unsigned int get_rr_interval_fair(struct task_struct *task) +unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) { struct sched_entity *se = &task->se; - unsigned long flags; - struct rq *rq; unsigned int rr_interval = 0; /* * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise * idle runqueue: */ - rq = task_rq_lock(task, &flags); if (rq->cfs.load.weight) rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - task_rq_unlock(rq, &flags); return rr_interval; } @@ -1977,11 +2071,13 @@ static const struct sched_class fair_sched_class = { .load_balance = load_balance_fair, .move_one_task = move_one_task_fair, + .rq_online = rq_online_fair, + .rq_offline = rq_offline_fair, #endif .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, - .task_new = task_new_fair, + .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 0d94083582c..d5059fd761d 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0) SCHED_FEAT(WAKEUP_OVERLAP, 0) /* - * Wakeup preemption towards tasks that run short - */ -SCHED_FEAT(WAKEUP_RUNNING, 0) - -/* * Use the SYNC wakeup hint, pipes and the likes use this to indicate * the remote end is likely to consume the data we just wrote, and * therefore has cache benefit from being placed on the same cpu, see diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index b133a28fcde..5f93b570d38 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -34,10 +34,10 @@ static struct task_struct *pick_next_task_idle(struct rq *rq) static void dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) { - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); } static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) @@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, check_preempt_curr(rq, p, 0); } -unsigned int get_rr_interval_idle(struct task_struct *task) +unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) { return 0; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a4d790cddb1..d2ea2828164 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -327,7 +327,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq) weight = cpumask_weight(rd->span); - spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); @@ -336,7 +336,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq) if (iter == rt_rq) continue; - spin_lock(&iter->rt_runtime_lock); + raw_spin_lock(&iter->rt_runtime_lock); /* * Either all rqs have inf runtime and there's nothing to steal * or __disable_runtime() below sets a specific rq to inf to @@ -358,14 +358,14 @@ static int do_balance_runtime(struct rt_rq *rt_rq) rt_rq->rt_runtime += diff; more = 1; if (rt_rq->rt_runtime == rt_period) { - spin_unlock(&iter->rt_runtime_lock); + raw_spin_unlock(&iter->rt_runtime_lock); break; } } next: - spin_unlock(&iter->rt_runtime_lock); + raw_spin_unlock(&iter->rt_runtime_lock); } - spin_unlock(&rt_b->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); return more; } @@ -386,8 +386,8 @@ static void __disable_runtime(struct rq *rq) s64 want; int i; - spin_lock(&rt_b->rt_runtime_lock); - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); /* * Either we're all inf and nobody needs to borrow, or we're * already disabled and thus have nothing to do, or we have @@ -396,7 +396,7 @@ static void __disable_runtime(struct rq *rq) if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) goto balanced; - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); /* * Calculate the difference between what we started out with @@ -418,7 +418,7 @@ static void __disable_runtime(struct rq *rq) if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; - spin_lock(&iter->rt_runtime_lock); + raw_spin_lock(&iter->rt_runtime_lock); if (want > 0) { diff = min_t(s64, iter->rt_runtime, want); iter->rt_runtime -= diff; @@ -427,13 +427,13 @@ static void __disable_runtime(struct rq *rq) iter->rt_runtime -= want; want -= want; } - spin_unlock(&iter->rt_runtime_lock); + raw_spin_unlock(&iter->rt_runtime_lock); if (!want) break; } - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); /* * We cannot be left wanting - that would mean some runtime * leaked out of the system. @@ -445,8 +445,8 @@ balanced: * runtime - in which case borrowing doesn't make sense. */ rt_rq->rt_runtime = RUNTIME_INF; - spin_unlock(&rt_rq->rt_runtime_lock); - spin_unlock(&rt_b->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); } } @@ -454,9 +454,9 @@ static void disable_runtime(struct rq *rq) { unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __disable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static void __enable_runtime(struct rq *rq) @@ -472,13 +472,13 @@ static void __enable_runtime(struct rq *rq) for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - spin_lock(&rt_b->rt_runtime_lock); - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_b->rt_runtime; rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; - spin_unlock(&rt_rq->rt_runtime_lock); - spin_unlock(&rt_b->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); } } @@ -486,9 +486,9 @@ static void enable_runtime(struct rq *rq) { unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __enable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static int balance_runtime(struct rt_rq *rt_rq) @@ -496,9 +496,9 @@ static int balance_runtime(struct rt_rq *rt_rq) int more = 0; if (rt_rq->rt_time > rt_rq->rt_runtime) { - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); more = do_balance_runtime(rt_rq); - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); } return more; @@ -524,11 +524,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); if (rt_rq->rt_time) { u64 runtime; - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); if (rt_rq->rt_throttled) balance_runtime(rt_rq); runtime = rt_rq->rt_runtime; @@ -539,13 +539,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); } else if (rt_rq->rt_nr_running) idle = 0; if (enqueue) sched_rt_rq_enqueue(rt_rq); - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } return idle; @@ -624,11 +624,11 @@ static void update_curr_rt(struct rq *rq) rt_rq = rt_rq_of_se(rt_se); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); } } } @@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); -static inline int pick_optimal_cpu(int this_cpu, - const struct cpumask *mask) -{ - int first; - - /* "this_cpu" is cheaper to preempt than a remote processor */ - if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask)) - return this_cpu; - - first = cpumask_first(mask); - if (first < nr_cpu_ids) - return first; - - return -1; -} - static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); - cpumask_var_t domain_mask; if (task->rt.nr_cpus_allowed == 1) return -1; /* No other targets possible */ @@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task) * Otherwise, we consult the sched_domains span maps to figure * out which cpu is logically closest to our hot cache data. */ - if (this_cpu == cpu) - this_cpu = -1; /* Skip this_cpu opt if the same */ - - if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) { - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_AFFINE) { - int best_cpu; + if (!cpumask_test_cpu(this_cpu, lowest_mask)) + this_cpu = -1; /* Skip this_cpu opt if not among lowest */ - cpumask_and(domain_mask, - sched_domain_span(sd), - lowest_mask); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; - best_cpu = pick_optimal_cpu(this_cpu, - domain_mask); - - if (best_cpu != -1) { - free_cpumask_var(domain_mask); - return best_cpu; - } - } + /* + * "this_cpu" is cheaper to preempt than a + * remote processor. + */ + if (this_cpu != -1 && + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) + return this_cpu; + + best_cpu = cpumask_first_and(lowest_mask, + sched_domain_span(sd)); + if (best_cpu < nr_cpu_ids) + return best_cpu; } - free_cpumask_var(domain_mask); } /* @@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task) * just give the caller *something* to work with from the compatible * locations. */ - return pick_optimal_cpu(this_cpu, lowest_mask); + if (this_cpu != -1) + return this_cpu; + + cpu = cpumask_any(lowest_mask); + if (cpu < nr_cpu_ids) + return cpu; + return -1; } /* Will lock the rq it finds */ @@ -1259,7 +1246,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) task_running(rq, task) || !task->se.on_rq)) { - spin_unlock(&lowest_rq->lock); + raw_spin_unlock(&lowest_rq->lock); lowest_rq = NULL; break; } @@ -1734,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq) dequeue_pushable_task(rq, p); } -unsigned int get_rr_interval_rt(struct task_struct *task) +unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) { /* * Time slice is 0 for SCHED_FIFO tasks diff --git a/kernel/signal.c b/kernel/signal.c index 64c5deeaca5..1814e68e4de 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -22,12 +22,14 @@ #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/signalfd.h> +#include <linux/ratelimit.h> #include <linux/tracehook.h> #include <linux/capability.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> -#include <trace/events/sched.h> +#define CREATE_TRACE_POINTS +#include <trace/events/signal.h> #include <asm/param.h> #include <asm/uaccess.h> @@ -41,6 +43,8 @@ static struct kmem_cache *sigqueue_cachep; +int print_fatal_signals __read_mostly; + static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; @@ -159,7 +163,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; int sig = 0; - + s = pending->signal.sig; m = mask->sig; switch (_NSIG_WORDS) { @@ -184,17 +188,31 @@ int next_signal(struct sigpending *pending, sigset_t *mask) sig = ffz(~x) + 1; break; } - + return sig; } +static inline void print_dropped_signal(int sig) +{ + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); + + if (!print_fatal_signals) + return; + + if (!__ratelimit(&ratelimit_state)) + return; + + printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", + current->comm, current->pid, sig); +} + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appopriate lock must be held to stop the target task from exiting */ -static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, - int override_rlimit) +static struct sigqueue * +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; struct user_struct *user; @@ -207,10 +225,15 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, */ user = get_uid(__task_cred(t)->user); atomic_inc(&user->sigpending); + if (override_rlimit || atomic_read(&user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { q = kmem_cache_alloc(sigqueue_cachep, flags); + } else { + print_dropped_signal(sig); + } + if (unlikely(q == NULL)) { atomic_dec(&user->sigpending); free_uid(user); @@ -400,7 +423,7 @@ still_pending: */ info->si_signo = sig; info->si_errno = 0; - info->si_code = 0; + info->si_code = SI_USER; info->si_pid = 0; info->si_uid = 0; } @@ -584,6 +607,17 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s) return 1; } +static inline int is_si_special(const struct siginfo *info) +{ + return info <= SEND_SIG_FORCED; +} + +static inline bool si_fromuser(const struct siginfo *info) +{ + return info == SEND_SIG_NOINFO || + (!is_si_special(info) && SI_FROMUSER(info)); +} + /* * Bad permissions for sending the signal * - the caller must hold at least the RCU read lock @@ -598,7 +632,7 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return -EINVAL; - if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) + if (!si_fromuser(info)) return 0; error = audit_signal_info(sig, t); /* Let audit system see the signal */ @@ -705,7 +739,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) if (why) { /* - * The first thread which returns from finish_stop() + * The first thread which returns from do_signal_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal_to_deliver(). */ @@ -834,7 +868,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigqueue *q; int override_rlimit; - trace_sched_signal_send(sig, t); + trace_signal_generate(sig, info, t); assert_spin_locked(&t->sighand->siglock); @@ -869,7 +903,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, else override_rlimit = 0; - q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, + q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); @@ -896,12 +930,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, break; } } else if (!is_si_special(info)) { - if (sig >= SIGRTMIN && info->si_code != SI_USER) - /* - * Queue overflow, abort. We may abort if the signal was rt - * and sent by user using something other than kill(). - */ + if (sig >= SIGRTMIN && info->si_code != SI_USER) { + /* + * Queue overflow, abort. We may abort if the + * signal was rt and sent by user using something + * other than kill(). + */ + trace_signal_overflow_fail(sig, group, info); return -EAGAIN; + } else { + /* + * This is a silent loss of information. We still + * send the signal, but the *info bits are lost. + */ + trace_signal_lose_info(sig, group, info); + } } out_set: @@ -917,16 +960,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, int from_ancestor_ns = 0; #ifdef CONFIG_PID_NS - if (!is_si_special(info) && SI_FROMUSER(info) && - task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0) - from_ancestor_ns = 1; + from_ancestor_ns = si_fromuser(info) && + !task_pid_nr_ns(current, task_active_pid_ns(t)); #endif return __send_signal(sig, info, t, group, from_ancestor_ns); } -int print_fatal_signals; - static void print_fatal_signal(struct pt_regs *regs, int signr) { printk("%s/%d: potentially unexpected fatal signal %d.\n", @@ -971,6 +1011,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) return send_signal(sig, info, t, 0); } +int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, + bool group) +{ + unsigned long flags; + int ret = -ESRCH; + + if (lock_task_sighand(p, &flags)) { + ret = send_signal(sig, info, p, group); + unlock_task_sighand(p, &flags); + } + + return ret; +} + /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. @@ -1008,12 +1062,6 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) return ret; } -void -force_sig_specific(int sig, struct task_struct *t) -{ - force_sig_info(sig, SEND_SIG_FORCED, t); -} - /* * Nuke all other threads in the group. */ @@ -1036,12 +1084,6 @@ void zap_other_threads(struct task_struct *p) } } -int __fatal_signal_pending(struct task_struct *tsk) -{ - return sigismember(&tsk->pending.signal, SIGKILL); -} -EXPORT_SYMBOL(__fatal_signal_pending); - struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) { struct sighand_struct *sighand; @@ -1068,18 +1110,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - unsigned long flags; - int ret; - - ret = check_kill_permission(sig, info, p); + int ret = check_kill_permission(sig, info, p); - if (!ret && sig) { - ret = -ESRCH; - if (lock_task_sighand(p, &flags)) { - ret = __group_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); - } - } + if (!ret && sig) + ret = do_send_sig_info(sig, info, p, true); return ret; } @@ -1156,8 +1190,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, goto out_unlock; } pcred = __task_cred(p); - if ((info == SEND_SIG_NOINFO || - (!is_si_special(info) && SI_FROMUSER(info))) && + if (si_fromuser(info) && euid != pcred->suid && euid != pcred->uid && uid != pcred->suid && uid != pcred->uid) { ret = -EPERM; @@ -1224,15 +1257,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ -/* - * The caller must ensure the task can't exit. - */ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret; - unsigned long flags; - /* * Make sure legacy kernel users don't send in bad values * (normal paths check this in check_kill_permission). @@ -1240,10 +1267,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) if (!valid_signal(sig)) return -EINVAL; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = specific_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - return ret; + return do_send_sig_info(sig, info, p, false); } #define __si_special(priv) \ @@ -1302,19 +1326,19 @@ EXPORT_SYMBOL(kill_pid); * These functions support sending signals using preallocated sigqueue * structures. This is needed "because realtime applications cannot * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of Posix Timers + * expirations or I/O completions". In the case of Posix Timers * we allocate the sigqueue structure from the timer_create. If this * allocation fails we are able to report the failure to the application * with an EAGAIN error. */ - struct sigqueue *sigqueue_alloc(void) { - struct sigqueue *q; + struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); - if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) + if (q) q->flags |= SIGQUEUE_PREALLOC; - return(q); + + return q; } void sigqueue_free(struct sigqueue *q) @@ -1383,15 +1407,6 @@ ret: } /* - * Wake up any threads in the parent blocked in wait* syscalls. - */ -static inline void __wake_up_parent(struct task_struct *p, - struct task_struct *parent) -{ - wake_up_interruptible_sync(&parent->signal->wait_chldexit); -} - -/* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. * @@ -1673,29 +1688,6 @@ void ptrace_notify(int exit_code) spin_unlock_irq(¤t->sighand->siglock); } -static void -finish_stop(int stop_count) -{ - /* - * If there are no other threads in the group, or if there is - * a group stop in progress and we are the last to stop, - * report to the parent. When ptraced, every thread reports itself. - */ - if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, CLD_STOPPED); - read_unlock(&tasklist_lock); - } - - do { - schedule(); - } while (try_to_freeze()); - /* - * Now we don't run again until continued. - */ - current->exit_code = 0; -} - /* * This performs the stopping for SIGSTOP and other stop signals. * We have to stop all threads in the thread group. @@ -1705,15 +1697,9 @@ finish_stop(int stop_count) static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - int stop_count; + int notify; - if (sig->group_stop_count > 0) { - /* - * There is a group stop in progress. We don't need to - * start another one. - */ - stop_count = --sig->group_stop_count; - } else { + if (!sig->group_stop_count) { struct task_struct *t; if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || @@ -1725,7 +1711,7 @@ static int do_signal_stop(int signr) */ sig->group_exit_code = signr; - stop_count = 0; + sig->group_stop_count = 1; for (t = next_thread(current); t != current; t = next_thread(t)) /* * Setting state to TASK_STOPPED for a group @@ -1734,19 +1720,44 @@ static int do_signal_stop(int signr) */ if (!(t->flags & PF_EXITING) && !task_is_stopped_or_traced(t)) { - stop_count++; + sig->group_stop_count++; signal_wake_up(t, 0); } - sig->group_stop_count = stop_count; } + /* + * If there are no other threads in the group, or if there is + * a group stop in progress and we are the last to stop, report + * to the parent. When ptraced, every thread reports itself. + */ + notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; + notify = tracehook_notify_jctl(notify, CLD_STOPPED); + /* + * tracehook_notify_jctl() can drop and reacquire siglock, so + * we keep ->group_stop_count != 0 before the call. If SIGCONT + * or SIGKILL comes in between ->group_stop_count == 0. + */ + if (sig->group_stop_count) { + if (!--sig->group_stop_count) + sig->flags = SIGNAL_STOP_STOPPED; + current->exit_code = sig->group_exit_code; + __set_current_state(TASK_STOPPED); + } + spin_unlock_irq(¤t->sighand->siglock); - if (stop_count == 0) - sig->flags = SIGNAL_STOP_STOPPED; - current->exit_code = sig->group_exit_code; - __set_current_state(TASK_STOPPED); + if (notify) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, notify); + read_unlock(&tasklist_lock); + } + + /* Now we don't run again until woken by SIGCONT or SIGKILL */ + do { + schedule(); + } while (try_to_freeze()); + + tracehook_finish_jctl(); + current->exit_code = 0; - spin_unlock_irq(¤t->sighand->siglock); - finish_stop(stop_count); return 1; } @@ -1815,24 +1826,20 @@ relock: int why = (signal->flags & SIGNAL_STOP_CONTINUED) ? CLD_CONTINUED : CLD_STOPPED; signal->flags &= ~SIGNAL_CLD_MASK; - spin_unlock_irq(&sighand->siglock); - if (unlikely(!tracehook_notify_jctl(1, why))) - goto relock; + why = tracehook_notify_jctl(why, CLD_CONTINUED); + spin_unlock_irq(&sighand->siglock); - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, why); - read_unlock(&tasklist_lock); + if (why) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current->group_leader, why); + read_unlock(&tasklist_lock); + } goto relock; } for (;;) { struct k_sigaction *ka; - - if (unlikely(signal->group_stop_count > 0) && - do_signal_stop(0)) - goto relock; - /* * Tracing can induce an artifical signal and choose sigaction. * The return value in @signr determines the default action, @@ -1844,6 +1851,10 @@ relock: if (unlikely(signr != 0)) ka = return_ka; else { + if (unlikely(signal->group_stop_count > 0) && + do_signal_stop(0)) + goto relock; + signr = dequeue_signal(current, ¤t->blocked, info); @@ -1860,6 +1871,9 @@ relock: ka = &sighand->action[signr-1]; } + /* Trace actually delivered signals. */ + trace_signal_deliver(signr, info, ka); + if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { @@ -1987,14 +2001,14 @@ void exit_signals(struct task_struct *tsk) if (unlikely(tsk->signal->group_stop_count) && !--tsk->signal->group_stop_count) { tsk->signal->flags = SIGNAL_STOP_STOPPED; - group_stop = 1; + group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); } out: spin_unlock_irq(&tsk->sighand->siglock); - if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { + if (unlikely(group_stop)) { read_lock(&tasklist_lock); - do_notify_parent_cldstop(tsk, CLD_STOPPED); + do_notify_parent_cldstop(tsk, group_stop); read_unlock(&tasklist_lock); } } @@ -2290,7 +2304,6 @@ static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) { struct task_struct *p; - unsigned long flags; int error = -ESRCH; rcu_read_lock(); @@ -2300,14 +2313,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. - * - * If lock_task_sighand() fails we pretend the task dies - * after receiving the signal. The window is tiny, and the - * signal is private anyway. */ - if (!error && sig && lock_task_sighand(p, &flags)) { - error = specific_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); + if (!error && sig) { + error = do_send_sig_info(sig, info, p, false); + /* + * If lock_task_sighand() failed we pretend the task + * dies after receiving the signal. The window is tiny, + * and the signal is private anyway. + */ + if (unlikely(error == -ESRCH)) + error = 0; } } rcu_read_unlock(); diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c new file mode 100644 index 00000000000..e45c4364529 --- /dev/null +++ b/kernel/slow-work-debugfs.c @@ -0,0 +1,227 @@ +/* Slow work debugging + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/slow-work.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/seq_file.h> +#include "slow-work.h" + +#define ITERATOR_SHIFT (BITS_PER_LONG - 4) +#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) +#define ITERATOR_COUNTER (~ITERATOR_SELECTOR) + +void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) +{ + seq_puts(m, "Slow-work: New thread"); +} + +/* + * Render the time mark field on a work item into a 5-char time with units plus + * a space + */ +static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) +{ + struct timespec now, diff; + + now = CURRENT_TIME; + diff = timespec_sub(now, work->mark); + + if (diff.tv_sec < 0) + seq_puts(m, " -ve "); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) + seq_printf(m, "%3luns ", diff.tv_nsec); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) + seq_printf(m, "%3luus ", diff.tv_nsec / 1000); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) + seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); + else if (diff.tv_sec <= 1) + seq_puts(m, " 1s "); + else if (diff.tv_sec < 60) + seq_printf(m, "%4lus ", diff.tv_sec); + else if (diff.tv_sec < 60 * 60) + seq_printf(m, "%4lum ", diff.tv_sec / 60); + else if (diff.tv_sec < 60 * 60 * 24) + seq_printf(m, "%4luh ", diff.tv_sec / 3600); + else + seq_puts(m, "exces "); +} + +/* + * Describe a slow work item for debugfs + */ +static int slow_work_runqueue_show(struct seq_file *m, void *v) +{ + struct slow_work *work; + struct list_head *p = v; + unsigned long id; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); + return 0; + case 2: + seq_puts(m, "=== ===== ================ == ===== ==========\n"); + return 0; + + case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: + id = (unsigned long) v - 3; + + read_lock(&slow_work_execs_lock); + work = slow_work_execs[id]; + if (work) { + smp_read_barrier_depends(); + + seq_printf(m, "%3lu %5d %16p %2lx ", + id, slow_work_pids[id], work, work->flags); + slow_work_print_mark(m, work); + + if (work->ops->desc) + work->ops->desc(work, m); + seq_putc(m, '\n'); + } + read_unlock(&slow_work_execs_lock); + return 0; + + default: + work = list_entry(p, struct slow_work, link); + seq_printf(m, "%3s - %16p %2lx ", + work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", + work, work->flags); + slow_work_print_mark(m, work); + + if (work->ops->desc) + work->ops->desc(work, m); + seq_putc(m, '\n'); + return 0; + } +} + +/* + * map the iterator to a work item + */ +static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) +{ + struct list_head *p; + unsigned long count, id; + + switch (*_pos >> ITERATOR_SHIFT) { + case 0x0: + if (*_pos == 0) + *_pos = 1; + if (*_pos < 3) + return (void *)(unsigned long) *_pos; + if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) + for (id = *_pos - 3; + id < SLOW_WORK_THREAD_LIMIT; + id++, (*_pos)++) + if (slow_work_execs[id]) + return (void *)(unsigned long) *_pos; + *_pos = 0x1UL << ITERATOR_SHIFT; + + case 0x1: + count = *_pos & ITERATOR_COUNTER; + list_for_each(p, &slow_work_queue) { + if (count == 0) + return p; + count--; + } + *_pos = 0x2UL << ITERATOR_SHIFT; + + case 0x2: + count = *_pos & ITERATOR_COUNTER; + list_for_each(p, &vslow_work_queue) { + if (count == 0) + return p; + count--; + } + *_pos = 0x3UL << ITERATOR_SHIFT; + + default: + return NULL; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) +{ + spin_lock_irq(&slow_work_queue_lock); + return slow_work_runqueue_index(m, _pos); +} + +/* + * move to the next line + */ +static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) +{ + struct list_head *p = v; + unsigned long selector = *_pos >> ITERATOR_SHIFT; + + (*_pos)++; + switch (selector) { + case 0x0: + return slow_work_runqueue_index(m, _pos); + + case 0x1: + if (*_pos >> ITERATOR_SHIFT == 0x1) { + p = p->next; + if (p != &slow_work_queue) + return p; + } + *_pos = 0x2UL << ITERATOR_SHIFT; + p = &vslow_work_queue; + + case 0x2: + if (*_pos >> ITERATOR_SHIFT == 0x2) { + p = p->next; + if (p != &vslow_work_queue) + return p; + } + *_pos = 0x3UL << ITERATOR_SHIFT; + + default: + return NULL; + } +} + +/* + * clean up after reading + */ +static void slow_work_runqueue_stop(struct seq_file *m, void *v) +{ + spin_unlock_irq(&slow_work_queue_lock); +} + +static const struct seq_operations slow_work_runqueue_ops = { + .start = slow_work_runqueue_start, + .stop = slow_work_runqueue_stop, + .next = slow_work_runqueue_next, + .show = slow_work_runqueue_show, +}; + +/* + * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents + */ +static int slow_work_runqueue_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slow_work_runqueue_ops); +} + +const struct file_operations slow_work_runqueue_fops = { + .owner = THIS_MODULE, + .open = slow_work_runqueue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 09d7519557d..7494bbf5a27 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -16,20 +16,17 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/wait.h> - -#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of - * things to do */ -#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after - * OOM */ +#include <linux/debugfs.h> +#include "slow-work.h" static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); #ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, +static int slow_work_min_threads_sysctl(struct ctl_table *, int, void __user *, size_t *, loff_t *); -static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, +static int slow_work_max_threads_sysctl(struct ctl_table *, int , void __user *, size_t *, loff_t *); #endif @@ -46,13 +43,12 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process #ifdef CONFIG_SYSCTL static const int slow_work_min_min_threads = 2; -static int slow_work_max_max_threads = 255; +static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT; static const int slow_work_min_vslow = 1; static const int slow_work_max_vslow = 99; ctl_table slow_work_sysctls[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "min-threads", .data = &slow_work_min_threads, .maxlen = sizeof(unsigned), @@ -62,7 +58,6 @@ ctl_table slow_work_sysctls[] = { .extra2 = &slow_work_max_threads, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "max-threads", .data = &slow_work_max_threads, .maxlen = sizeof(unsigned), @@ -72,16 +67,15 @@ ctl_table slow_work_sysctls[] = { .extra2 = (void *) &slow_work_max_max_threads, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "vslow-percentage", .data = &vslow_work_proportion, .maxlen = sizeof(unsigned), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = (void *) &slow_work_min_vslow, .extra2 = (void *) &slow_work_max_vslow, }, - { .ctl_name = 0 } + {} }; #endif @@ -98,6 +92,56 @@ static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); static struct slow_work slow_work_new_thread; /* new thread starter */ /* + * slow work ID allocation (use slow_work_queue_lock) + */ +static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); + +/* + * Unregistration tracking to prevent put_ref() from disappearing during module + * unload + */ +#ifdef CONFIG_MODULES +static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT]; +static struct module *slow_work_unreg_module; +static struct slow_work *slow_work_unreg_work_item; +static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); +static DEFINE_MUTEX(slow_work_unreg_sync_lock); + +static void slow_work_set_thread_processing(int id, struct slow_work *work) +{ + if (work) + slow_work_thread_processing[id] = work->owner; +} +static void slow_work_done_thread_processing(int id, struct slow_work *work) +{ + struct module *module = slow_work_thread_processing[id]; + + slow_work_thread_processing[id] = NULL; + smp_mb(); + if (slow_work_unreg_work_item == work || + slow_work_unreg_module == module) + wake_up_all(&slow_work_unreg_wq); +} +static void slow_work_clear_thread_processing(int id) +{ + slow_work_thread_processing[id] = NULL; +} +#else +static void slow_work_set_thread_processing(int id, struct slow_work *work) {} +static void slow_work_done_thread_processing(int id, struct slow_work *work) {} +static void slow_work_clear_thread_processing(int id) {} +#endif + +/* + * Data for tracking currently executing items for indication through /proc + */ +#ifdef CONFIG_SLOW_WORK_DEBUG +struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; +pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; +DEFINE_RWLOCK(slow_work_execs_lock); +#endif + +/* * The queues of work items and the lock governing access to them. These are * shared between all the CPUs. It doesn't make sense to have per-CPU queues * as the number of threads bears no relation to the number of CPUs. @@ -105,9 +149,18 @@ static struct slow_work slow_work_new_thread; /* new thread starter */ * There are two queues of work items: one for slow work items, and one for * very slow work items. */ -static LIST_HEAD(slow_work_queue); -static LIST_HEAD(vslow_work_queue); -static DEFINE_SPINLOCK(slow_work_queue_lock); +LIST_HEAD(slow_work_queue); +LIST_HEAD(vslow_work_queue); +DEFINE_SPINLOCK(slow_work_queue_lock); + +/* + * The following are two wait queues that get pinged when a work item is placed + * on an empty queue. These allow work items that are hogging a thread by + * sleeping in a way that could be deferred to yield their thread and enqueue + * themselves. + */ +static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation); +static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation); /* * The thread controls. A variable used to signal to the threads that they @@ -126,6 +179,20 @@ static DECLARE_COMPLETION(slow_work_last_thread_exited); static int slow_work_user_count; static DEFINE_MUTEX(slow_work_user_lock); +static inline int slow_work_get_ref(struct slow_work *work) +{ + if (work->ops->get_ref) + return work->ops->get_ref(work); + + return 0; +} + +static inline void slow_work_put_ref(struct slow_work *work) +{ + if (work->ops->put_ref) + work->ops->put_ref(work); +} + /* * Calculate the maximum number of active threads in the pool that are * permitted to process very slow work items. @@ -149,7 +216,7 @@ static unsigned slow_work_calc_vsmax(void) * Attempt to execute stuff queued on a slow thread. Return true if we managed * it, false if there was nothing to do. */ -static bool slow_work_execute(void) +static noinline bool slow_work_execute(int id) { struct slow_work *work = NULL; unsigned vsmax; @@ -186,6 +253,13 @@ static bool slow_work_execute(void) } else { very_slow = false; /* avoid the compiler warning */ } + + slow_work_set_thread_processing(id, work); + if (work) { + slow_work_mark_time(work); + slow_work_begin_exec(id, work); + } + spin_unlock_irq(&slow_work_queue_lock); if (!work) @@ -194,12 +268,19 @@ static bool slow_work_execute(void) if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) BUG(); - work->ops->execute(work); + /* don't execute if the work is in the process of being cancelled */ + if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) + work->ops->execute(work); if (very_slow) atomic_dec(&vslow_work_executing_count); clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); + /* wake up anyone waiting for this work to be complete */ + wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); + + slow_work_end_exec(id, work); + /* if someone tried to enqueue the item whilst we were executing it, * then it'll be left unenqueued to avoid multiple threads trying to * execute it simultaneously @@ -219,7 +300,10 @@ static bool slow_work_execute(void) spin_unlock_irq(&slow_work_queue_lock); } - work->ops->put_ref(work); + /* sort out the race between module unloading and put_ref() */ + slow_work_put_ref(work); + slow_work_done_thread_processing(id, work); + return true; auto_requeue: @@ -227,15 +311,61 @@ auto_requeue: * - we transfer our ref on the item back to the appropriate queue * - don't wake another thread up as we're awake already */ + slow_work_mark_time(work); if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) list_add_tail(&work->link, &vslow_work_queue); else list_add_tail(&work->link, &slow_work_queue); spin_unlock_irq(&slow_work_queue_lock); + slow_work_clear_thread_processing(id); return true; } /** + * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work + * work: The work item under execution that wants to sleep + * _timeout: Scheduler sleep timeout + * + * Allow a requeueable work item to sleep on a slow-work processor thread until + * that thread is needed to do some other work or the sleep is interrupted by + * some other event. + * + * The caller must set up a wake up event before calling this and must have set + * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own + * condition before calling this function as no test is made here. + * + * False is returned if there is nothing on the queue; true is returned if the + * work item should be requeued + */ +bool slow_work_sleep_till_thread_needed(struct slow_work *work, + signed long *_timeout) +{ + wait_queue_head_t *wfo_wq; + struct list_head *queue; + + DEFINE_WAIT(wait); + + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { + wfo_wq = &vslow_work_queue_waits_for_occupation; + queue = &vslow_work_queue; + } else { + wfo_wq = &slow_work_queue_waits_for_occupation; + queue = &slow_work_queue; + } + + if (!list_empty(queue)) + return true; + + add_wait_queue_exclusive(wfo_wq, &wait); + if (list_empty(queue)) + *_timeout = schedule_timeout(*_timeout); + finish_wait(wfo_wq, &wait); + + return !list_empty(queue); +} +EXPORT_SYMBOL(slow_work_sleep_till_thread_needed); + +/** * slow_work_enqueue - Schedule a slow work item for processing * @work: The work item to queue * @@ -260,16 +390,22 @@ auto_requeue: * allowed to pick items to execute. This ensures that very slow items won't * overly block ones that are just ordinarily slow. * - * Returns 0 if successful, -EAGAIN if not. + * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is + * attempted queued) */ int slow_work_enqueue(struct slow_work *work) { + wait_queue_head_t *wfo_wq; + struct list_head *queue; unsigned long flags; + int ret; + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + return -ECANCELED; BUG_ON(slow_work_user_count <= 0); BUG_ON(!work); BUG_ON(!work->ops); - BUG_ON(!work->ops->get_ref); /* when honouring an enqueue request, we only promise that we will run * the work function in the future; we do not promise to run it once @@ -280,8 +416,19 @@ int slow_work_enqueue(struct slow_work *work) * maintaining our promise */ if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { + wfo_wq = &vslow_work_queue_waits_for_occupation; + queue = &vslow_work_queue; + } else { + wfo_wq = &slow_work_queue_waits_for_occupation; + queue = &slow_work_queue; + } + spin_lock_irqsave(&slow_work_queue_lock, flags); + if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) + goto cancelled; + /* we promise that we will not attempt to execute the work * function in more than one thread simultaneously * @@ -299,25 +446,221 @@ int slow_work_enqueue(struct slow_work *work) if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); } else { - if (work->ops->get_ref(work) < 0) - goto cant_get_ref; - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) - list_add_tail(&work->link, &vslow_work_queue); - else - list_add_tail(&work->link, &slow_work_queue); + ret = slow_work_get_ref(work); + if (ret < 0) + goto failed; + slow_work_mark_time(work); + list_add_tail(&work->link, queue); wake_up(&slow_work_thread_wq); + + /* if someone who could be requeued is sleeping on a + * thread, then ask them to yield their thread */ + if (work->link.prev == queue) + wake_up(wfo_wq); } spin_unlock_irqrestore(&slow_work_queue_lock, flags); } return 0; -cant_get_ref: +cancelled: + ret = -ECANCELED; +failed: spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return -EAGAIN; + return ret; } EXPORT_SYMBOL(slow_work_enqueue); +static int slow_work_wait(void *word) +{ + schedule(); + return 0; +} + +/** + * slow_work_cancel - Cancel a slow work item + * @work: The work item to cancel + * + * This function will cancel a previously enqueued work item. If we cannot + * cancel the work item, it is guarenteed to have run when this function + * returns. + */ +void slow_work_cancel(struct slow_work *work) +{ + bool wait = true, put = false; + + set_bit(SLOW_WORK_CANCELLING, &work->flags); + smp_mb(); + + /* if the work item is a delayed work item with an active timer, we + * need to wait for the timer to finish _before_ getting the spinlock, + * lest we deadlock against the timer routine + * + * the timer routine will leave DELAYED set if it notices the + * CANCELLING flag in time + */ + if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { + struct delayed_slow_work *dwork = + container_of(work, struct delayed_slow_work, work); + del_timer_sync(&dwork->timer); + } + + spin_lock_irq(&slow_work_queue_lock); + + if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { + /* the timer routine aborted or never happened, so we are left + * holding the timer's reference on the item and should just + * drop the pending flag and wait for any ongoing execution to + * finish */ + struct delayed_slow_work *dwork = + container_of(work, struct delayed_slow_work, work); + + BUG_ON(timer_pending(&dwork->timer)); + BUG_ON(!list_empty(&work->link)); + + clear_bit(SLOW_WORK_DELAYED, &work->flags); + put = true; + clear_bit(SLOW_WORK_PENDING, &work->flags); + + } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && + !list_empty(&work->link)) { + /* the link in the pending queue holds a reference on the item + * that we will need to release */ + list_del_init(&work->link); + wait = false; + put = true; + clear_bit(SLOW_WORK_PENDING, &work->flags); + + } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { + /* the executor is holding our only reference on the item, so + * we merely need to wait for it to finish executing */ + clear_bit(SLOW_WORK_PENDING, &work->flags); + } + + spin_unlock_irq(&slow_work_queue_lock); + + /* the EXECUTING flag is set by the executor whilst the spinlock is set + * and before the item is dequeued - so assuming the above doesn't + * actually dequeue it, simply waiting for the EXECUTING flag to be + * released here should be sufficient */ + if (wait) + wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, + TASK_UNINTERRUPTIBLE); + + clear_bit(SLOW_WORK_CANCELLING, &work->flags); + if (put) + slow_work_put_ref(work); +} +EXPORT_SYMBOL(slow_work_cancel); + +/* + * Handle expiry of the delay timer, indicating that a delayed slow work item + * should now be queued if not cancelled + */ +static void delayed_slow_work_timer(unsigned long data) +{ + wait_queue_head_t *wfo_wq; + struct list_head *queue; + struct slow_work *work = (struct slow_work *) data; + unsigned long flags; + bool queued = false, put = false, first = false; + + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { + wfo_wq = &vslow_work_queue_waits_for_occupation; + queue = &vslow_work_queue; + } else { + wfo_wq = &slow_work_queue_waits_for_occupation; + queue = &slow_work_queue; + } + + spin_lock_irqsave(&slow_work_queue_lock, flags); + if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { + clear_bit(SLOW_WORK_DELAYED, &work->flags); + + if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { + /* we discard the reference the timer was holding in + * favour of the one the executor holds */ + set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); + put = true; + } else { + slow_work_mark_time(work); + list_add_tail(&work->link, queue); + queued = true; + if (work->link.prev == queue) + first = true; + } + } + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + if (put) + slow_work_put_ref(work); + if (first) + wake_up(wfo_wq); + if (queued) + wake_up(&slow_work_thread_wq); +} + +/** + * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing + * @dwork: The delayed work item to queue + * @delay: When to start executing the work, in jiffies from now + * + * This is similar to slow_work_enqueue(), but it adds a delay before the work + * is actually queued for processing. + * + * The item can have delayed processing requested on it whilst it is being + * executed. The delay will begin immediately, and if it expires before the + * item finishes executing, the item will be placed back on the queue when it + * has done executing. + */ +int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, + unsigned long delay) +{ + struct slow_work *work = &dwork->work; + unsigned long flags; + int ret; + + if (delay == 0) + return slow_work_enqueue(&dwork->work); + + BUG_ON(slow_work_user_count <= 0); + BUG_ON(!work); + BUG_ON(!work->ops); + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + return -ECANCELED; + + if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { + spin_lock_irqsave(&slow_work_queue_lock, flags); + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + goto cancelled; + + /* the timer holds a reference whilst it is pending */ + ret = work->ops->get_ref(work); + if (ret < 0) + goto cant_get_ref; + + if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) + BUG(); + dwork->timer.expires = jiffies + delay; + dwork->timer.data = (unsigned long) work; + dwork->timer.function = delayed_slow_work_timer; + add_timer(&dwork->timer); + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + } + + return 0; + +cancelled: + ret = -ECANCELED; +cant_get_ref: + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + return ret; +} +EXPORT_SYMBOL(delayed_slow_work_enqueue); + /* * Schedule a cull of the thread pool at some time in the near future */ @@ -368,13 +711,23 @@ static inline bool slow_work_available(int vsmax) */ static int slow_work_thread(void *_data) { - int vsmax; + int vsmax, id; DEFINE_WAIT(wait); set_freezable(); set_user_nice(current, -5); + /* allocate ourselves an ID */ + spin_lock_irq(&slow_work_queue_lock); + id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); + BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); + __set_bit(id, slow_work_ids); + slow_work_set_thread_pid(id, current->pid); + spin_unlock_irq(&slow_work_queue_lock); + + sprintf(current->comm, "kslowd%03u", id); + for (;;) { vsmax = vslow_work_proportion; vsmax *= atomic_read(&slow_work_thread_count); @@ -395,7 +748,7 @@ static int slow_work_thread(void *_data) vsmax *= atomic_read(&slow_work_thread_count); vsmax /= 100; - if (slow_work_available(vsmax) && slow_work_execute()) { + if (slow_work_available(vsmax) && slow_work_execute(id)) { cond_resched(); if (list_empty(&slow_work_queue) && list_empty(&vslow_work_queue) && @@ -412,6 +765,11 @@ static int slow_work_thread(void *_data) break; } + spin_lock_irq(&slow_work_queue_lock); + slow_work_set_thread_pid(id, 0); + __clear_bit(id, slow_work_ids); + spin_unlock_irq(&slow_work_queue_lock); + if (atomic_dec_and_test(&slow_work_thread_count)) complete_and_exit(&slow_work_last_thread_exited, 0); return 0; @@ -427,21 +785,6 @@ static void slow_work_cull_timeout(unsigned long data) } /* - * Get a reference on slow work thread starter - */ -static int slow_work_new_thread_get_ref(struct slow_work *work) -{ - return 0; -} - -/* - * Drop a reference on slow work thread starter - */ -static void slow_work_new_thread_put_ref(struct slow_work *work) -{ -} - -/* * Start a new slow work thread */ static void slow_work_new_thread_execute(struct slow_work *work) @@ -475,9 +818,11 @@ static void slow_work_new_thread_execute(struct slow_work *work) } static const struct slow_work_ops slow_work_new_thread_ops = { - .get_ref = slow_work_new_thread_get_ref, - .put_ref = slow_work_new_thread_put_ref, + .owner = THIS_MODULE, .execute = slow_work_new_thread_execute, +#ifdef CONFIG_SLOW_WORK_DEBUG + .desc = slow_work_new_thread_desc, +#endif }; /* @@ -493,10 +838,10 @@ static void slow_work_oom_timeout(unsigned long data) * Handle adjustment of the minimum number of threads */ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { @@ -521,10 +866,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, * Handle adjustment of the maximum number of threads */ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { @@ -546,12 +891,13 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, /** * slow_work_register_user - Register a user of the facility + * @module: The module about to make use of the facility * * Register a user of the facility, starting up the initial threads if there * aren't any other users at this point. This will return 0 if successful, or * an error if not. */ -int slow_work_register_user(void) +int slow_work_register_user(struct module *module) { struct task_struct *p; int loop; @@ -598,14 +944,81 @@ error: } EXPORT_SYMBOL(slow_work_register_user); +/* + * wait for all outstanding items from the calling module to complete + * - note that more items may be queued whilst we're waiting + */ +static void slow_work_wait_for_items(struct module *module) +{ +#ifdef CONFIG_MODULES + DECLARE_WAITQUEUE(myself, current); + struct slow_work *work; + int loop; + + mutex_lock(&slow_work_unreg_sync_lock); + add_wait_queue(&slow_work_unreg_wq, &myself); + + for (;;) { + spin_lock_irq(&slow_work_queue_lock); + + /* first of all, we wait for the last queued item in each list + * to be processed */ + list_for_each_entry_reverse(work, &vslow_work_queue, link) { + if (work->owner == module) { + set_current_state(TASK_UNINTERRUPTIBLE); + slow_work_unreg_work_item = work; + goto do_wait; + } + } + list_for_each_entry_reverse(work, &slow_work_queue, link) { + if (work->owner == module) { + set_current_state(TASK_UNINTERRUPTIBLE); + slow_work_unreg_work_item = work; + goto do_wait; + } + } + + /* then we wait for the items being processed to finish */ + slow_work_unreg_module = module; + smp_mb(); + for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) { + if (slow_work_thread_processing[loop] == module) + goto do_wait; + } + spin_unlock_irq(&slow_work_queue_lock); + break; /* okay, we're done */ + + do_wait: + spin_unlock_irq(&slow_work_queue_lock); + schedule(); + slow_work_unreg_work_item = NULL; + slow_work_unreg_module = NULL; + } + + remove_wait_queue(&slow_work_unreg_wq, &myself); + mutex_unlock(&slow_work_unreg_sync_lock); +#endif /* CONFIG_MODULES */ +} + /** * slow_work_unregister_user - Unregister a user of the facility + * @module: The module whose items should be cleared * * Unregister a user of the facility, killing all the threads if this was the * last one. + * + * This waits for all the work items belonging to the nominated module to go + * away before proceeding. */ -void slow_work_unregister_user(void) +void slow_work_unregister_user(struct module *module) { + /* first of all, wait for all outstanding items from the calling module + * to complete */ + if (module) + slow_work_wait_for_items(module); + + /* then we can actually go about shutting down the facility if need + * be */ mutex_lock(&slow_work_user_lock); BUG_ON(slow_work_user_count <= 0); @@ -639,6 +1052,16 @@ static int __init init_slow_work(void) if (slow_work_max_max_threads < nr_cpus * 2) slow_work_max_max_threads = nr_cpus * 2; #endif +#ifdef CONFIG_SLOW_WORK_DEBUG + { + struct dentry *dbdir; + + dbdir = debugfs_create_dir("slow_work", NULL); + if (dbdir && !IS_ERR(dbdir)) + debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, + NULL, &slow_work_runqueue_fops); + } +#endif return 0; } diff --git a/kernel/slow-work.h b/kernel/slow-work.h new file mode 100644 index 00000000000..321f3c59d73 --- /dev/null +++ b/kernel/slow-work.h @@ -0,0 +1,72 @@ +/* Slow work private definitions + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of + * things to do */ +#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after + * OOM */ + +#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ + +/* + * slow-work.c + */ +#ifdef CONFIG_SLOW_WORK_DEBUG +extern struct slow_work *slow_work_execs[]; +extern pid_t slow_work_pids[]; +extern rwlock_t slow_work_execs_lock; +#endif + +extern struct list_head slow_work_queue; +extern struct list_head vslow_work_queue; +extern spinlock_t slow_work_queue_lock; + +/* + * slow-work-debugfs.c + */ +#ifdef CONFIG_SLOW_WORK_DEBUG +extern const struct file_operations slow_work_runqueue_fops; + +extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); +#endif + +/* + * Helper functions + */ +static inline void slow_work_set_thread_pid(int id, pid_t pid) +{ +#ifdef CONFIG_SLOW_WORK_PROC + slow_work_pids[id] = pid; +#endif +} + +static inline void slow_work_mark_time(struct slow_work *work) +{ +#ifdef CONFIG_SLOW_WORK_PROC + work->mark = CURRENT_TIME; +#endif +} + +static inline void slow_work_begin_exec(int id, struct slow_work *work) +{ +#ifdef CONFIG_SLOW_WORK_PROC + slow_work_execs[id] = work; +#endif +} + +static inline void slow_work_end_exec(int id, struct slow_work *work) +{ +#ifdef CONFIG_SLOW_WORK_PROC + write_lock(&slow_work_execs_lock); + slow_work_execs[id] = NULL; + write_unlock(&slow_work_execs_lock); +#endif +} diff --git a/kernel/smp.c b/kernel/smp.c index fd47a256a24..de735a6637d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -16,11 +16,11 @@ static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); static struct { struct list_head queue; - spinlock_t lock; + raw_spinlock_t lock; } call_function __cacheline_aligned_in_smp = { .queue = LIST_HEAD_INIT(call_function.queue), - .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock), }; enum { @@ -35,7 +35,7 @@ struct call_function_data { struct call_single_queue { struct list_head list; - spinlock_t lock; + raw_spinlock_t lock; }; static DEFINE_PER_CPU(struct call_function_data, cfd_data); @@ -80,7 +80,7 @@ static int __cpuinit init_call_single_data(void) for_each_possible_cpu(i) { struct call_single_queue *q = &per_cpu(call_single_queue, i); - spin_lock_init(&q->lock); + raw_spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->list); } @@ -141,10 +141,10 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) unsigned long flags; int ipi; - spin_lock_irqsave(&dst->lock, flags); + raw_spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); list_add_tail(&data->list, &dst->list); - spin_unlock_irqrestore(&dst->lock, flags); + raw_spin_unlock_irqrestore(&dst->lock, flags); /* * The list addition should be visible before sending the IPI @@ -171,7 +171,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) void generic_smp_call_function_interrupt(void) { struct call_function_data *data; - int cpu = get_cpu(); + int cpu = smp_processor_id(); /* * Shouldn't receive this interrupt on a cpu that is not yet online. @@ -201,9 +201,9 @@ void generic_smp_call_function_interrupt(void) refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); if (!refs) { - spin_lock(&call_function.lock); + raw_spin_lock(&call_function.lock); list_del_rcu(&data->csd.list); - spin_unlock(&call_function.lock); + raw_spin_unlock(&call_function.lock); } if (refs) @@ -212,7 +212,6 @@ void generic_smp_call_function_interrupt(void) csd_unlock(&data->csd); } - put_cpu(); } /* @@ -230,9 +229,9 @@ void generic_smp_call_function_single_interrupt(void) */ WARN_ON_ONCE(!cpu_online(smp_processor_id())); - spin_lock(&q->lock); + raw_spin_lock(&q->lock); list_replace_init(&q->list, &list); - spin_unlock(&q->lock); + raw_spin_unlock(&q->lock); while (!list_empty(&list)) { struct call_single_data *data; @@ -265,9 +264,7 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data); * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. Note that @wait - * will be implicitly turned on in case of allocation failures, since - * we fall back to on-stack allocation. + * Returns 0 on success, else a negative status code. */ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) @@ -321,6 +318,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); +/* + * smp_call_function_any - Run a function on any of the given cpus + * @mask: The mask of cpus it can run on. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait until function has completed. + * + * Returns 0 on success, else a negative status code (if no cpus were online). + * Note that @wait will be implicitly turned on in case of allocation failures, + * since we fall back to on-stack allocation. + * + * Selection preference: + * 1) current cpu if in @mask + * 2) any cpu of current node if in @mask + * 3) any other online cpu in @mask + */ +int smp_call_function_any(const struct cpumask *mask, + void (*func)(void *info), void *info, int wait) +{ + unsigned int cpu; + const struct cpumask *nodemask; + int ret; + + /* Try for same CPU (cheapest) */ + cpu = get_cpu(); + if (cpumask_test_cpu(cpu, mask)) + goto call; + + /* Try for same node. */ + nodemask = cpumask_of_node(cpu); + for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; + cpu = cpumask_next_and(cpu, nodemask, mask)) { + if (cpu_online(cpu)) + goto call; + } + + /* Any online will do: smp_call_function_single handles nr_cpu_ids. */ + cpu = cpumask_any_and(mask, cpu_online_mask); +call: + ret = smp_call_function_single(cpu, func, info, wait); + put_cpu(); + return ret; +} +EXPORT_SYMBOL_GPL(smp_call_function_any); + /** * __smp_call_function_single(): Run a function on another CPU * @cpu: The CPU to run on. @@ -347,13 +389,6 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, generic_exec_single(cpu, data, wait); } -/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */ - -#ifndef arch_send_call_function_ipi_mask -# define arch_send_call_function_ipi_mask(maskp) \ - arch_send_call_function_ipi(*(maskp)) -#endif - /** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). @@ -362,9 +397,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, * @wait: If true, wait (atomically) until function has completed * on other CPUs. * - * If @wait is true, then returns once @func has returned. Note that @wait - * will be implicitly turned on in case of allocation failures, since - * we fall back to on-stack allocation. + * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption @@ -415,14 +448,14 @@ void smp_call_function_many(const struct cpumask *mask, cpumask_clear_cpu(this_cpu, data->cpumask); atomic_set(&data->refs, cpumask_weight(data->cpumask)); - spin_lock_irqsave(&call_function.lock, flags); + raw_spin_lock_irqsave(&call_function.lock, flags); /* * Place entry at the _HEAD_ of the list, so that any cpu still * observing the entry in generic_smp_call_function_interrupt() * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); - spin_unlock_irqrestore(&call_function.lock, flags); + raw_spin_unlock_irqrestore(&call_function.lock, flags); /* * Make the list addition visible before sending the ipi. @@ -450,8 +483,7 @@ EXPORT_SYMBOL(smp_call_function_many); * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. In case of allocation - * failure, @wait will be implicitly turned on. + * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. @@ -468,20 +500,20 @@ EXPORT_SYMBOL(smp_call_function); void ipi_call_lock(void) { - spin_lock(&call_function.lock); + raw_spin_lock(&call_function.lock); } void ipi_call_unlock(void) { - spin_unlock(&call_function.lock); + raw_spin_unlock(&call_function.lock); } void ipi_call_lock_irq(void) { - spin_lock_irq(&call_function.lock); + raw_spin_lock_irq(&call_function.lock); } void ipi_call_unlock_irq(void) { - spin_unlock_irq(&call_function.lock); + raw_spin_unlock_irq(&call_function.lock); } diff --git a/kernel/softirq.c b/kernel/softirq.c index f8749e5216e..a09502e2ef7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -302,9 +302,9 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); + rcu_irq_exit(); #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ - rcu_irq_exit(); if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) tick_nohz_stop_sched_tick(0); #endif @@ -697,7 +697,7 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -static int ksoftirqd(void * __bind_cpu) +static int run_ksoftirqd(void * __bind_cpu) { set_current_state(TASK_INTERRUPTIBLE); @@ -810,7 +810,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); + p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); return NOTIFY_BAD; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 88796c33083..d22579087e2 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -22,9 +22,9 @@ static DEFINE_SPINLOCK(print_lock); -static DEFINE_PER_CPU(unsigned long, touch_timestamp); -static DEFINE_PER_CPU(unsigned long, print_timestamp); -static DEFINE_PER_CPU(struct task_struct *, watchdog_task); +static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ +static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ +static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static int __read_mostly did_panic; int __read_mostly softlockup_thresh = 60; @@ -70,12 +70,12 @@ static void __touch_softlockup_watchdog(void) { int this_cpu = raw_smp_processor_id(); - __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); + __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu); } void touch_softlockup_watchdog(void) { - __raw_get_cpu_var(touch_timestamp) = 0; + __raw_get_cpu_var(softlockup_touch_ts) = 0; } EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -85,16 +85,16 @@ void touch_all_softlockup_watchdogs(void) /* Cause each CPU to re-update its timestamp rather than complain */ for_each_online_cpu(cpu) - per_cpu(touch_timestamp, cpu) = 0; + per_cpu(softlockup_touch_ts, cpu) = 0; } EXPORT_SYMBOL(touch_all_softlockup_watchdogs); int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { touch_all_softlockup_watchdogs(); - return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } /* @@ -104,28 +104,28 @@ int proc_dosoftlockup_thresh(struct ctl_table *table, int write, void softlockup_tick(void) { int this_cpu = smp_processor_id(); - unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); - unsigned long print_timestamp; + unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu); + unsigned long print_ts; struct pt_regs *regs = get_irq_regs(); unsigned long now; /* Is detection switched off? */ - if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { + if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) { /* Be sure we don't false trigger if switched back on */ - if (touch_timestamp) - per_cpu(touch_timestamp, this_cpu) = 0; + if (touch_ts) + per_cpu(softlockup_touch_ts, this_cpu) = 0; return; } - if (touch_timestamp == 0) { + if (touch_ts == 0) { __touch_softlockup_watchdog(); return; } - print_timestamp = per_cpu(print_timestamp, this_cpu); + print_ts = per_cpu(softlockup_print_ts, this_cpu); /* report at most once a second */ - if (print_timestamp == touch_timestamp || did_panic) + if (print_ts == touch_ts || did_panic) return; /* do not print during early bootup: */ @@ -140,18 +140,18 @@ void softlockup_tick(void) * Wake up the high-prio watchdog task twice per * threshold timespan. */ - if (now > touch_timestamp + softlockup_thresh/2) - wake_up_process(per_cpu(watchdog_task, this_cpu)); + if (now > touch_ts + softlockup_thresh/2) + wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); /* Warn about unreasonable delays: */ - if (now <= (touch_timestamp + softlockup_thresh)) + if (now <= (touch_ts + softlockup_thresh)) return; - per_cpu(print_timestamp, this_cpu) = touch_timestamp; + per_cpu(softlockup_print_ts, this_cpu) = touch_ts; spin_lock(&print_lock); printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", - this_cpu, now - touch_timestamp, + this_cpu, now - touch_ts, current->comm, task_pid_nr(current)); print_modules(); print_irqtrace_events(current); @@ -209,32 +209,32 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - BUG_ON(per_cpu(watchdog_task, hotcpu)); + BUG_ON(per_cpu(softlockup_watchdog, hotcpu)); p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); if (IS_ERR(p)) { printk(KERN_ERR "watchdog for %i failed\n", hotcpu); return NOTIFY_BAD; } - per_cpu(touch_timestamp, hotcpu) = 0; - per_cpu(watchdog_task, hotcpu) = p; + per_cpu(softlockup_touch_ts, hotcpu) = 0; + per_cpu(softlockup_watchdog, hotcpu) = p; kthread_bind(p, hotcpu); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(watchdog_task, hotcpu)); + wake_up_process(per_cpu(softlockup_watchdog, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(watchdog_task, hotcpu)) + if (!per_cpu(softlockup_watchdog, hotcpu)) break; /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(watchdog_task, hotcpu), + kthread_bind(per_cpu(softlockup_watchdog, hotcpu), cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: - p = per_cpu(watchdog_task, hotcpu); - per_cpu(watchdog_task, hotcpu) = NULL; + p = per_cpu(softlockup_watchdog, hotcpu); + per_cpu(softlockup_watchdog, hotcpu) = NULL; kthread_stop(p); break; #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5ddab730cb2..be6517fb9c1 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -21,193 +21,72 @@ #include <linux/debug_locks.h> #include <linux/module.h> -#ifndef _spin_trylock -int __lockfunc _spin_trylock(spinlock_t *lock) -{ - return __spin_trylock(lock); -} -EXPORT_SYMBOL(_spin_trylock); -#endif - -#ifndef _read_trylock -int __lockfunc _read_trylock(rwlock_t *lock) -{ - return __read_trylock(lock); -} -EXPORT_SYMBOL(_read_trylock); -#endif - -#ifndef _write_trylock -int __lockfunc _write_trylock(rwlock_t *lock) -{ - return __write_trylock(lock); -} -EXPORT_SYMBOL(_write_trylock); -#endif - /* * If lockdep is enabled then we use the non-preemption spin-ops * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) - -#ifndef _read_lock -void __lockfunc _read_lock(rwlock_t *lock) -{ - __read_lock(lock); -} -EXPORT_SYMBOL(_read_lock); -#endif - -#ifndef _spin_lock_irqsave -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) -{ - return __spin_lock_irqsave(lock); -} -EXPORT_SYMBOL(_spin_lock_irqsave); -#endif - -#ifndef _spin_lock_irq -void __lockfunc _spin_lock_irq(spinlock_t *lock) -{ - __spin_lock_irq(lock); -} -EXPORT_SYMBOL(_spin_lock_irq); -#endif - -#ifndef _spin_lock_bh -void __lockfunc _spin_lock_bh(spinlock_t *lock) -{ - __spin_lock_bh(lock); -} -EXPORT_SYMBOL(_spin_lock_bh); -#endif - -#ifndef _read_lock_irqsave -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) -{ - return __read_lock_irqsave(lock); -} -EXPORT_SYMBOL(_read_lock_irqsave); -#endif - -#ifndef _read_lock_irq -void __lockfunc _read_lock_irq(rwlock_t *lock) -{ - __read_lock_irq(lock); -} -EXPORT_SYMBOL(_read_lock_irq); -#endif - -#ifndef _read_lock_bh -void __lockfunc _read_lock_bh(rwlock_t *lock) -{ - __read_lock_bh(lock); -} -EXPORT_SYMBOL(_read_lock_bh); -#endif - -#ifndef _write_lock_irqsave -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) -{ - return __write_lock_irqsave(lock); -} -EXPORT_SYMBOL(_write_lock_irqsave); -#endif - -#ifndef _write_lock_irq -void __lockfunc _write_lock_irq(rwlock_t *lock) -{ - __write_lock_irq(lock); -} -EXPORT_SYMBOL(_write_lock_irq); -#endif - -#ifndef _write_lock_bh -void __lockfunc _write_lock_bh(rwlock_t *lock) -{ - __write_lock_bh(lock); -} -EXPORT_SYMBOL(_write_lock_bh); -#endif - -#ifndef _spin_lock -void __lockfunc _spin_lock(spinlock_t *lock) -{ - __spin_lock(lock); -} -EXPORT_SYMBOL(_spin_lock); -#endif - -#ifndef _write_lock -void __lockfunc _write_lock(rwlock_t *lock) -{ - __write_lock(lock); -} -EXPORT_SYMBOL(_write_lock); -#endif - -#else /* CONFIG_PREEMPT: */ - /* + * The __lock_function inlines are taken from + * include/linux/spinlock_api_smp.h + */ +#else +#define raw_read_can_lock(l) read_can_lock(l) +#define raw_write_can_lock(l) write_can_lock(l) +/* + * We build the __lock_function inlines here. They are too large for + * inlining all over the place, but here is only one user per function + * which embedds them into the calling _lock_function below. + * * This could be a long-held lock. We both prepare to spin for a long * time (making _this_ CPU preemptable if possible), and we also signal * towards that other CPU that it should break the lock ASAP. - * - * (We do this in a function because inlining it would be excessive.) */ - #define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc _##op##_lock(locktype##_t *lock) \ +void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ { \ for (;;) { \ preempt_disable(); \ - if (likely(_raw_##op##_trylock(lock))) \ + if (likely(do_raw_##op##_trylock(lock))) \ break; \ preempt_enable(); \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ + while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ } \ \ -EXPORT_SYMBOL(_##op##_lock); \ - \ -unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ +unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ { \ unsigned long flags; \ \ for (;;) { \ preempt_disable(); \ local_irq_save(flags); \ - if (likely(_raw_##op##_trylock(lock))) \ + if (likely(do_raw_##op##_trylock(lock))) \ break; \ local_irq_restore(flags); \ preempt_enable(); \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ + while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ return flags; \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irqsave); \ - \ -void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ +void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ { \ - _##op##_lock_irqsave(lock); \ + _raw_##op##_lock_irqsave(lock); \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irq); \ - \ -void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ +void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -216,164 +95,283 @@ void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ /* irq-disabling. We use the generic preemption-aware */ \ /* function: */ \ /**/ \ - flags = _##op##_lock_irqsave(lock); \ + flags = _raw_##op##_lock_irqsave(lock); \ local_bh_disable(); \ local_irq_restore(flags); \ } \ - \ -EXPORT_SYMBOL(_##op##_lock_bh) /* * Build preemption-friendly versions of the following * lock-spinning functions: * - * _[spin|read|write]_lock() - * _[spin|read|write]_lock_irq() - * _[spin|read|write]_lock_irqsave() - * _[spin|read|write]_lock_bh() + * __[spin|read|write]_lock() + * __[spin|read|write]_lock_irq() + * __[spin|read|write]_lock_irqsave() + * __[spin|read|write]_lock_bh() */ -BUILD_LOCK_OPS(spin, spinlock); +BUILD_LOCK_OPS(spin, raw_spinlock); BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); -#endif /* CONFIG_PREEMPT */ +#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#ifndef CONFIG_INLINE_SPIN_TRYLOCK +int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) +{ + return __raw_spin_trylock(lock); +} +EXPORT_SYMBOL(_raw_spin_trylock); +#endif -void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) +#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH +int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) { - preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + return __raw_spin_trylock_bh(lock); } -EXPORT_SYMBOL(_spin_lock_nested); +EXPORT_SYMBOL(_raw_spin_trylock_bh); +#endif -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +#ifndef CONFIG_INLINE_SPIN_LOCK +void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { - unsigned long flags; + __raw_spin_lock(lock); +} +EXPORT_SYMBOL(_raw_spin_lock); +#endif - local_irq_save(flags); - preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock, - _raw_spin_lock_flags, &flags); - return flags; +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE +unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) +{ + return __raw_spin_lock_irqsave(lock); } -EXPORT_SYMBOL(_spin_lock_irqsave_nested); +EXPORT_SYMBOL(_raw_spin_lock_irqsave); +#endif -void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, - struct lockdep_map *nest_lock) +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ +void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) { - preempt_disable(); - spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + __raw_spin_lock_irq(lock); } -EXPORT_SYMBOL(_spin_lock_nest_lock); +EXPORT_SYMBOL(_raw_spin_lock_irq); +#endif +#ifndef CONFIG_INLINE_SPIN_LOCK_BH +void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) +{ + __raw_spin_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_bh); #endif -#ifndef _spin_unlock -void __lockfunc _spin_unlock(spinlock_t *lock) +#ifndef CONFIG_INLINE_SPIN_UNLOCK +void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { - __spin_unlock(lock); + __raw_spin_unlock(lock); } -EXPORT_SYMBOL(_spin_unlock); +EXPORT_SYMBOL(_raw_spin_unlock); #endif -#ifndef _write_unlock -void __lockfunc _write_unlock(rwlock_t *lock) +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE +void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { - __write_unlock(lock); + __raw_spin_unlock_irqrestore(lock, flags); } -EXPORT_SYMBOL(_write_unlock); +EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); #endif -#ifndef _read_unlock -void __lockfunc _read_unlock(rwlock_t *lock) +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ +void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) { - __read_unlock(lock); + __raw_spin_unlock_irq(lock); } -EXPORT_SYMBOL(_read_unlock); +EXPORT_SYMBOL(_raw_spin_unlock_irq); #endif -#ifndef _spin_unlock_irqrestore -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH +void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { - __spin_unlock_irqrestore(lock, flags); + __raw_spin_unlock_bh(lock); } -EXPORT_SYMBOL(_spin_unlock_irqrestore); +EXPORT_SYMBOL(_raw_spin_unlock_bh); #endif -#ifndef _spin_unlock_irq -void __lockfunc _spin_unlock_irq(spinlock_t *lock) +#ifndef CONFIG_INLINE_READ_TRYLOCK +int __lockfunc _raw_read_trylock(rwlock_t *lock) { - __spin_unlock_irq(lock); + return __raw_read_trylock(lock); } -EXPORT_SYMBOL(_spin_unlock_irq); +EXPORT_SYMBOL(_raw_read_trylock); #endif -#ifndef _spin_unlock_bh -void __lockfunc _spin_unlock_bh(spinlock_t *lock) +#ifndef CONFIG_INLINE_READ_LOCK +void __lockfunc _raw_read_lock(rwlock_t *lock) { - __spin_unlock_bh(lock); + __raw_read_lock(lock); } -EXPORT_SYMBOL(_spin_unlock_bh); +EXPORT_SYMBOL(_raw_read_lock); #endif -#ifndef _read_unlock_irqrestore -void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE +unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) { - __read_unlock_irqrestore(lock, flags); + return __raw_read_lock_irqsave(lock); } -EXPORT_SYMBOL(_read_unlock_irqrestore); +EXPORT_SYMBOL(_raw_read_lock_irqsave); #endif -#ifndef _read_unlock_irq -void __lockfunc _read_unlock_irq(rwlock_t *lock) +#ifndef CONFIG_INLINE_READ_LOCK_IRQ +void __lockfunc _raw_read_lock_irq(rwlock_t *lock) { - __read_unlock_irq(lock); + __raw_read_lock_irq(lock); } -EXPORT_SYMBOL(_read_unlock_irq); +EXPORT_SYMBOL(_raw_read_lock_irq); #endif -#ifndef _read_unlock_bh -void __lockfunc _read_unlock_bh(rwlock_t *lock) +#ifndef CONFIG_INLINE_READ_LOCK_BH +void __lockfunc _raw_read_lock_bh(rwlock_t *lock) { - __read_unlock_bh(lock); + __raw_read_lock_bh(lock); } -EXPORT_SYMBOL(_read_unlock_bh); +EXPORT_SYMBOL(_raw_read_lock_bh); #endif -#ifndef _write_unlock_irqrestore -void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +#ifndef CONFIG_INLINE_READ_UNLOCK +void __lockfunc _raw_read_unlock(rwlock_t *lock) { - __write_unlock_irqrestore(lock, flags); + __raw_read_unlock(lock); } -EXPORT_SYMBOL(_write_unlock_irqrestore); +EXPORT_SYMBOL(_raw_read_unlock); #endif -#ifndef _write_unlock_irq -void __lockfunc _write_unlock_irq(rwlock_t *lock) +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE +void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { - __write_unlock_irq(lock); + __raw_read_unlock_irqrestore(lock, flags); } -EXPORT_SYMBOL(_write_unlock_irq); +EXPORT_SYMBOL(_raw_read_unlock_irqrestore); #endif -#ifndef _write_unlock_bh -void __lockfunc _write_unlock_bh(rwlock_t *lock) +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ +void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) { - __write_unlock_bh(lock); + __raw_read_unlock_irq(lock); } -EXPORT_SYMBOL(_write_unlock_bh); +EXPORT_SYMBOL(_raw_read_unlock_irq); #endif -#ifndef _spin_trylock_bh -int __lockfunc _spin_trylock_bh(spinlock_t *lock) +#ifndef CONFIG_INLINE_READ_UNLOCK_BH +void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) { - return __spin_trylock_bh(lock); + __raw_read_unlock_bh(lock); } -EXPORT_SYMBOL(_spin_trylock_bh); +EXPORT_SYMBOL(_raw_read_unlock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_TRYLOCK +int __lockfunc _raw_write_trylock(rwlock_t *lock) +{ + return __raw_write_trylock(lock); +} +EXPORT_SYMBOL(_raw_write_trylock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK +void __lockfunc _raw_write_lock(rwlock_t *lock) +{ + __raw_write_lock(lock); +} +EXPORT_SYMBOL(_raw_write_lock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE +unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) +{ + return __raw_write_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_write_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ +void __lockfunc _raw_write_lock_irq(rwlock_t *lock) +{ + __raw_write_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_write_lock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_BH +void __lockfunc _raw_write_lock_bh(rwlock_t *lock) +{ + __raw_write_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_write_lock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK +void __lockfunc _raw_write_unlock(rwlock_t *lock) +{ + __raw_write_unlock(lock); +} +EXPORT_SYMBOL(_raw_write_unlock); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE +void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ + __raw_write_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_write_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ +void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) +{ + __raw_write_unlock_irq(lock); +} +EXPORT_SYMBOL(_raw_write_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH +void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) +{ + __raw_write_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_write_unlock_bh); +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) +{ + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_nested); + +unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, + int subclass) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, + do_raw_spin_lock_flags, &flags); + return flags; +} +EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); + +void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + preempt_disable(); + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_nest_lock); + #endif notrace int in_lock_functions(unsigned long addr) diff --git a/kernel/srcu.c b/kernel/srcu.c index b0aeeaf22ce..818d7d9aa03 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -49,6 +49,7 @@ int init_srcu_struct(struct srcu_struct *sp) sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); return (sp->per_cpu_ref ? 0 : -ENOMEM); } +EXPORT_SYMBOL_GPL(init_srcu_struct); /* * srcu_readers_active_idx -- returns approximate number of readers @@ -97,6 +98,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) free_percpu(sp->per_cpu_ref); sp->per_cpu_ref = NULL; } +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /** * srcu_read_lock - register a new reader for an SRCU-protected structure. @@ -118,6 +120,7 @@ int srcu_read_lock(struct srcu_struct *sp) preempt_enable(); return idx; } +EXPORT_SYMBOL_GPL(srcu_read_lock); /** * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. @@ -136,22 +139,12 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx) per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; preempt_enable(); } +EXPORT_SYMBOL_GPL(srcu_read_unlock); -/** - * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. - * - * Flip the completed counter, and wait for the old count to drain to zero. - * As with classic RCU, the updater must use some separate means of - * synchronizing concurrent updates. Can block; must be called from - * process context. - * - * Note that it is illegal to call synchornize_srcu() from the corresponding - * SRCU read-side critical section; doing so will result in deadlock. - * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section. +/* + * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ -void synchronize_srcu(struct srcu_struct *sp) +void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) { int idx; @@ -173,7 +166,7 @@ void synchronize_srcu(struct srcu_struct *sp) return; } - synchronize_sched(); /* Force memory barrier on all CPUs. */ + sync_func(); /* Force memory barrier on all CPUs. */ /* * The preceding synchronize_sched() ensures that any CPU that @@ -190,7 +183,7 @@ void synchronize_srcu(struct srcu_struct *sp) idx = sp->completed & 0x1; sp->completed++; - synchronize_sched(); /* Force memory barrier on all CPUs. */ + sync_func(); /* Force memory barrier on all CPUs. */ /* * At this point, because of the preceding synchronize_sched(), @@ -203,7 +196,7 @@ void synchronize_srcu(struct srcu_struct *sp) while (srcu_readers_active_idx(sp, idx)) schedule_timeout_interruptible(1); - synchronize_sched(); /* Force memory barrier on all CPUs. */ + sync_func(); /* Force memory barrier on all CPUs. */ /* * The preceding synchronize_sched() forces all srcu_read_unlock() @@ -237,6 +230,47 @@ void synchronize_srcu(struct srcu_struct *sp) } /** + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + * @sp: srcu_struct with which to synchronize. + * + * Flip the completed counter, and wait for the old count to drain to zero. + * As with classic RCU, the updater must use some separate means of + * synchronizing concurrent updates. Can block; must be called from + * process context. + * + * Note that it is illegal to call synchronize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. + * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, synchronize_sched); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); + +/** + * synchronize_srcu_expedited - like synchronize_srcu, but less patient + * @sp: srcu_struct with which to synchronize. + * + * Flip the completed counter, and wait for the old count to drain to zero. + * As with classic RCU, the updater must use some separate means of + * synchronizing concurrent updates. Can block; must be called from + * process context. + * + * Note that it is illegal to call synchronize_srcu_expedited() + * from the corresponding SRCU read-side critical section; doing so + * will result in deadlock. However, it is perfectly legal to call + * synchronize_srcu_expedited() on one srcu_struct from some other + * srcu_struct's read-side critical section. + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, synchronize_sched_expedited); +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + +/** * srcu_batches_completed - return batches completed. * @sp: srcu_struct on which to report batch completion. * @@ -248,10 +282,4 @@ long srcu_batches_completed(struct srcu_struct *sp) { return sp->completed; } - -EXPORT_SYMBOL_GPL(init_srcu_struct); -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -EXPORT_SYMBOL_GPL(srcu_read_lock); -EXPORT_SYMBOL_GPL(srcu_read_unlock); -EXPORT_SYMBOL_GPL(synchronize_srcu); EXPORT_SYMBOL_GPL(srcu_batches_completed); diff --git a/kernel/sys.c b/kernel/sys.c index ebcb1561172..20ccfb5da6a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -8,7 +8,6 @@ #include <linux/mm.h> #include <linux/utsname.h> #include <linux/mman.h> -#include <linux/smp_lock.h> #include <linux/notifier.h> #include <linux/reboot.h> #include <linux/prctl.h> @@ -190,10 +189,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) + do_each_thread(g, p) { if (__task_cred(p)->uid == who) error = set_one_prio(p, niceval, error); - while_each_thread(g, p); + } while_each_thread(g, p); if (who != cred->uid) free_uid(user); /* For find_user() */ break; @@ -253,13 +252,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) + do_each_thread(g, p) { if (__task_cred(p)->uid == who) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } - while_each_thread(g, p); + } while_each_thread(g, p); if (who != cred->uid) free_uid(user); /* for find_user() */ break; @@ -349,6 +348,9 @@ void kernel_power_off(void) machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); + +static DEFINE_MUTEX(reboot_mutex); + /* * Reboot system call: for obvious reasons only root may call it, * and even root needs to set up some magic numbers in the registers @@ -381,7 +383,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) cmd = LINUX_REBOOT_CMD_HALT; - lock_kernel(); + mutex_lock(&reboot_mutex); switch (cmd) { case LINUX_REBOOT_CMD_RESTART: kernel_restart(NULL); @@ -397,20 +399,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, case LINUX_REBOOT_CMD_HALT: kernel_halt(); - unlock_kernel(); do_exit(0); panic("cannot halt"); case LINUX_REBOOT_CMD_POWER_OFF: kernel_power_off(); - unlock_kernel(); do_exit(0); break; case LINUX_REBOOT_CMD_RESTART2: if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { - unlock_kernel(); - return -EFAULT; + ret = -EFAULT; + break; } buffer[sizeof(buffer) - 1] = '\0'; @@ -433,7 +433,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, ret = -EINVAL; break; } - unlock_kernel(); + mutex_unlock(&reboot_mutex); return ret; } @@ -911,16 +911,15 @@ change_okay: void do_sys_times(struct tms *tms) { - struct task_cputime cputime; - cputime_t cutime, cstime; + cputime_t tgutime, tgstime, cutime, cstime; - thread_group_cputime(current, &cputime); spin_lock_irq(¤t->sighand->siglock); + thread_group_times(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; spin_unlock_irq(¤t->sighand->siglock); - tms->tms_utime = cputime_to_clock_t(cputime.utime); - tms->tms_stime = cputime_to_clock_t(cputime.stime); + tms->tms_utime = cputime_to_clock_t(tgutime); + tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); tms->tms_cstime = cputime_to_clock_t(cstime); } @@ -1110,6 +1109,8 @@ SYSCALL_DEFINE0(setsid) err = session; out: write_unlock_irq(&tasklist_lock); + if (err > 0) + proc_sid_connector(group_leader); return err; } @@ -1336,16 +1337,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; - cputime_t utime, stime; - struct task_cputime cputime; + cputime_t tgutime, tgstime, utime, stime; unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { - utime = task_utime(current); - stime = task_stime(current); + task_times(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = p->signal->maxrss; goto out; @@ -1371,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - thread_group_cputime(p, &cputime); - utime = cputime_add(utime, cputime.utime); - stime = cputime_add(stime, cputime.stime); + thread_group_times(p, &tgutime, &tgstime); + utime = cputime_add(utime, tgutime); + stime = cputime_add(stime, tgstime); r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; @@ -1542,6 +1541,41 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, current->timer_slack_ns = arg2; error = 0; break; + case PR_MCE_KILL: + if (arg4 | arg5) + return -EINVAL; + switch (arg2) { + case PR_MCE_KILL_CLEAR: + if (arg3 != 0) + return -EINVAL; + current->flags &= ~PF_MCE_PROCESS; + break; + case PR_MCE_KILL_SET: + current->flags |= PF_MCE_PROCESS; + if (arg3 == PR_MCE_KILL_EARLY) + current->flags |= PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_LATE) + current->flags &= ~PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_DEFAULT) + current->flags &= + ~(PF_MCE_EARLY|PF_MCE_PROCESS); + else + return -EINVAL; + break; + default: + return -EINVAL; + } + error = 0; + break; + case PR_MCE_KILL_GET: + if (arg2 | arg3 | arg4 | arg5) + return -EINVAL; + if (current->flags & PF_MCE_PROCESS) + error = (current->flags & PF_MCE_EARLY) ? + PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + else + error = PR_MCE_KILL_DEFAULT; + break; default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 515bc230ac2..695384f12a7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -48,7 +48,10 @@ cond_syscall(sys_shutdown); cond_syscall(sys_sendmsg); cond_syscall(compat_sys_sendmsg); cond_syscall(sys_recvmsg); +cond_syscall(sys_recvmmsg); cond_syscall(compat_sys_recvmsg); +cond_syscall(compat_sys_recvfrom); +cond_syscall(compat_sys_recvmmsg); cond_syscall(sys_socketcall); cond_syscall(sys_futex); cond_syscall(compat_sys_futex); @@ -138,7 +141,6 @@ cond_syscall(sys_pciconfig_read); cond_syscall(sys_pciconfig_write); cond_syscall(sys_pciconfig_iobase); cond_syscall(sys32_ipc); -cond_syscall(sys32_sysctl); cond_syscall(ppc_rtas); cond_syscall(sys_spu_run); cond_syscall(sys_spu_create); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0dfaa47d7cb..45e4bef0012 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -26,9 +26,7 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> -#include <linux/utsname.h> #include <linux/kmemcheck.h> -#include <linux/smp_lock.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> @@ -37,6 +35,7 @@ #include <linux/sysrq.h> #include <linux/highuid.h> #include <linux/writeback.h> +#include <linux/ratelimit.h> #include <linux/hugetlb.h> #include <linux/initrd.h> #include <linux/key.h> @@ -61,7 +60,6 @@ #include <asm/io.h> #endif -static int deprecated_sysctl_warning(struct __sysctl_args *args); #if defined(CONFIG_SYSCTL) @@ -77,6 +75,7 @@ extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; +extern unsigned int core_pipe_limit; extern int pid_max; extern int min_free_kbytes; extern int pid_max_min, pid_max_max; @@ -158,14 +157,16 @@ extern int no_unaligned_warning; extern int unaligned_dump_stack; #endif +extern struct ratelimit_state printk_ratelimit_state; + #ifdef CONFIG_RT_MUTEXES extern int max_lock_depth; #endif #ifdef CONFIG_PROC_SYSCTL -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -207,31 +208,26 @@ extern int lock_stat; static struct ctl_table root_table[] = { { - .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, .child = kern_table, }, { - .ctl_name = CTL_VM, .procname = "vm", .mode = 0555, .child = vm_table, }, { - .ctl_name = CTL_FS, .procname = "fs", .mode = 0555, .child = fs_table, }, { - .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, .child = debug_table, }, { - .ctl_name = CTL_DEV, .procname = "dev", .mode = 0555, .child = dev_table, @@ -240,7 +236,7 @@ static struct ctl_table root_table[] = { * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ - { .ctl_name = 0 } + { } }; #ifdef CONFIG_SCHED_DEBUG @@ -248,188 +244,178 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; +static int min_sched_shares_ratelimit = 100000; /* 100 usec */ +static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif static struct ctl_table kern_table[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_SCHED_DEBUG { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_min_granularity_ns", .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_nr_latency_handler, - .strategy = &sysctl_intvec, + .proc_handler = sched_proc_update_handler, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_latency_ns", .data = &sysctl_sched_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_nr_latency_handler, - .strategy = &sysctl_intvec, + .proc_handler = sched_proc_update_handler, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_wakeup_granularity_ns", .data = &sysctl_sched_wakeup_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = sched_proc_update_handler, .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_shares_ratelimit", .data = &sysctl_sched_shares_ratelimit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_shares_ratelimit, + .extra2 = &max_sched_shares_ratelimit, }, { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_shares_thresh", - .data = &sysctl_sched_shares_thresh, - .maxlen = sizeof(unsigned int), + .procname = "sched_tunable_scaling", + .data = &sysctl_sched_tunable_scaling, + .maxlen = sizeof(enum sched_tunable_scaling), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_tunable_scaling, + .extra2 = &max_sched_tunable_scaling, }, { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_features", - .data = &sysctl_sched_features, + .procname = "sched_shares_thresh", + .data = &sysctl_sched_shares_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_migration_cost", .data = &sysctl_sched_migration_cost, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_nr_migrate", .data = &sysctl_sched_nr_migrate, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_time_avg", .data = &sysctl_sched_time_avg, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "timer_migration", .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_rt_handler, + .proc_handler = sched_rt_handler, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_rt_runtime_us", .data = &sysctl_sched_rt_runtime, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &sched_rt_handler, + .proc_handler = sched_rt_handler, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_compat_yield", .data = &sysctl_sched_compat_yield, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_PROVE_LOCKING { - .ctl_name = CTL_UNNUMBERED, .procname = "prove_locking", .data = &prove_locking, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_LOCK_STAT { - .ctl_name = CTL_UNNUMBERED, .procname = "lock_stat", .data = &lock_stat, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = KERN_PANIC, .procname = "panic", .data = &panic_timeout, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_CORE_USES_PID, .procname = "core_uses_pid", .data = &core_uses_pid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", .data = core_pattern, .maxlen = CORENAME_MAX_SIZE, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, + }, + { + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", .maxlen = sizeof(long), .mode = 0644, - .proc_handler = &proc_taint, + .proc_handler = proc_taint, }, #endif #ifdef CONFIG_LATENCYTOP @@ -438,181 +424,160 @@ static struct ctl_table kern_table[] = { .data = &latencytop_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_BLK_DEV_INITRD { - .ctl_name = KERN_REALROOTDEV, .procname = "real-root-dev", .data = &real_root_dev, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "print-fatal-signals", .data = &print_fatal_signals, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_SPARC { - .ctl_name = KERN_SPARC_REBOOT, .procname = "reboot-cmd", .data = reboot_command, .maxlen = 256, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, { - .ctl_name = KERN_SPARC_STOP_A, .procname = "stop-a", .data = &stop_a_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_SPARC_SCONS_PWROFF, .procname = "scons-poweroff", .data = &scons_pwroff, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_SPARC64 { - .ctl_name = CTL_UNNUMBERED, .procname = "tsb-ratio", .data = &sysctl_tsb_ratio, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef __hppa__ { - .ctl_name = KERN_HPPA_PWRSW, .procname = "soft-power", .data = &pwrsw_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_HPPA_UNALIGNED, .procname = "unaligned-trap", .data = &unaligned_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = KERN_CTLALTDEL, .procname = "ctrl-alt-del", .data = &C_A_D, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_FUNCTION_TRACER { - .ctl_name = CTL_UNNUMBERED, .procname = "ftrace_enabled", .data = &ftrace_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &ftrace_enable_sysctl, + .proc_handler = ftrace_enable_sysctl, }, #endif #ifdef CONFIG_STACK_TRACER { - .ctl_name = CTL_UNNUMBERED, .procname = "stack_tracer_enabled", .data = &stack_tracer_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &stack_trace_sysctl, + .proc_handler = stack_trace_sysctl, }, #endif #ifdef CONFIG_TRACING { - .ctl_name = CTL_UNNUMBERED, .procname = "ftrace_dump_on_oops", .data = &ftrace_dump_on_oops, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_MODULES { - .ctl_name = KERN_MODPROBE, .procname = "modprobe", .data = &modprobe_path, .maxlen = KMOD_PATH_LEN, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "modules_disabled", .data = &modules_disabled, .maxlen = sizeof(int), .mode = 0644, /* only handle a transition from default "0" to "1" */ - .proc_handler = &proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &one, .extra2 = &one, }, #endif #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) { - .ctl_name = KERN_HOTPLUG, .procname = "hotplug", .data = &uevent_helper, .maxlen = UEVENT_HELPER_PATH_LEN, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, #endif #ifdef CONFIG_CHR_DEV_SG { - .ctl_name = KERN_SG_BIG_BUFF, .procname = "sg-big-buff", .data = &sg_big_buff, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_BSD_PROCESS_ACCT { - .ctl_name = KERN_ACCT, .procname = "acct", .data = &acct_parm, .maxlen = 3*sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_MAGIC_SYSRQ { - .ctl_name = KERN_SYSRQ, .procname = "sysrq", .data = &__sysrq_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_PROC_SYSCTL @@ -621,215 +586,188 @@ static struct ctl_table kern_table[] = { .data = NULL, .maxlen = sizeof (int), .mode = 0600, - .proc_handler = &proc_do_cad_pid, + .proc_handler = proc_do_cad_pid, }, #endif { - .ctl_name = KERN_MAX_THREADS, .procname = "threads-max", .data = &max_threads, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_RANDOM, .procname = "random", .mode = 0555, .child = random_table, }, { - .ctl_name = KERN_OVERFLOWUID, .procname = "overflowuid", .data = &overflowuid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &minolduid, .extra2 = &maxolduid, }, { - .ctl_name = KERN_OVERFLOWGID, .procname = "overflowgid", .data = &overflowgid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &minolduid, .extra2 = &maxolduid, }, #ifdef CONFIG_S390 #ifdef CONFIG_MATHEMU { - .ctl_name = KERN_IEEE_EMULATION_WARNINGS, .procname = "ieee_emulation_warnings", .data = &sysctl_ieee_emulation_warnings, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = KERN_S390_USER_DEBUG_LOGGING, .procname = "userprocess_debug", .data = &sysctl_userprocess_debug, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = KERN_PIDMAX, .procname = "pid_max", .data = &pid_max, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &pid_max_min, .extra2 = &pid_max_max, }, { - .ctl_name = KERN_PANIC_ON_OOPS, .procname = "panic_on_oops", .data = &panic_on_oops, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #if defined CONFIG_PRINTK { - .ctl_name = KERN_PRINTK, .procname = "printk", .data = &console_loglevel, .maxlen = 4*sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_PRINTK_RATELIMIT, .procname = "printk_ratelimit", .data = &printk_ratelimit_state.interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = KERN_PRINTK_RATELIMIT_BURST, .procname = "printk_ratelimit_burst", .data = &printk_ratelimit_state.burst, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "printk_delay", .data = &printk_delay_msec, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &ten_thousand, }, #endif { - .ctl_name = KERN_NGROUPS_MAX, .procname = "ngroups_max", .data = &ngroups_max, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { - .ctl_name = KERN_UNKNOWN_NMI_PANIC, .procname = "unknown_nmi_panic", .data = &unknown_nmi_panic, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .procname = "nmi_watchdog", .data = &nmi_watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_nmi_enabled, + .proc_handler = proc_nmi_enabled, }, #endif #if defined(CONFIG_X86) { - .ctl_name = KERN_PANIC_ON_NMI, .procname = "panic_on_unrecovered_nmi", .data = &panic_on_unrecovered_nmi, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "panic_on_io_nmi", .data = &panic_on_io_nmi, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_BOOTLOADER_TYPE, .procname = "bootloader_type", .data = &bootloader_type, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "bootloader_version", .data = &bootloader_version, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "kstack_depth_to_print", .data = &kstack_depth_to_print, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "io_delay_type", .data = &io_delay_type, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #if defined(CONFIG_MMU) { - .ctl_name = KERN_RANDOMIZE, .procname = "randomize_va_space", .data = &randomize_va_space, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #if defined(CONFIG_S390) && defined(CONFIG_SMP) { - .ctl_name = KERN_SPIN_RETRY, .procname = "spin_retry", .data = &spin_retry, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) @@ -838,123 +776,104 @@ static struct ctl_table kern_table[] = { .data = &acpi_realmode_flags, .maxlen = sizeof (unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, #endif #ifdef CONFIG_IA64 { - .ctl_name = KERN_IA64_UNALIGNED, .procname = "ignore-unaligned-usertrap", .data = &no_unaligned_warning, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "unaligned-dump-stack", .data = &unaligned_dump_stack, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_DETECT_SOFTLOCKUP { - .ctl_name = CTL_UNNUMBERED, .procname = "softlockup_panic", .data = &softlockup_panic, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "softlockup_thresh", .data = &softlockup_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dosoftlockup_thresh, - .strategy = &sysctl_intvec, + .proc_handler = proc_dosoftlockup_thresh, .extra1 = &neg_one, .extra2 = &sixty, }, #endif #ifdef CONFIG_DETECT_HUNG_TASK { - .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_panic", .data = &sysctl_hung_task_panic, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_check_count", .data = &sysctl_hung_task_check_count, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_doulongvec_minmax, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_timeout_secs", .data = &sysctl_hung_task_timeout_secs, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_dohung_task_timeout_secs, - .strategy = &sysctl_intvec, + .proc_handler = proc_dohung_task_timeout_secs, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_warnings", .data = &sysctl_hung_task_warnings, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_doulongvec_minmax, }, #endif #ifdef CONFIG_COMPAT { - .ctl_name = KERN_COMPAT_LOG, .procname = "compat-log", .data = &compat_log, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_RT_MUTEXES { - .ctl_name = KERN_MAX_LOCK_DEPTH, .procname = "max_lock_depth", .data = &max_lock_depth, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "poweroff_cmd", .data = &poweroff_cmd, .maxlen = POWEROFF_CMD_PATH_LEN, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, #ifdef CONFIG_KEYS { - .ctl_name = CTL_UNNUMBERED, .procname = "keys", .mode = 0555, .child = key_sysctls, @@ -962,17 +881,15 @@ static struct ctl_table kern_table[] = { #endif #ifdef CONFIG_RCU_TORTURE_TEST { - .ctl_name = CTL_UNNUMBERED, .procname = "rcutorture_runnable", .data = &rcutorture_runnable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_SLOW_WORK { - .ctl_name = CTL_UNNUMBERED, .procname = "slow-work", .mode = 0555, .child = slow_work_sysctls, @@ -980,146 +897,127 @@ static struct ctl_table kern_table[] = { #endif #ifdef CONFIG_PERF_EVENTS { - .ctl_name = CTL_UNNUMBERED, .procname = "perf_event_paranoid", .data = &sysctl_perf_event_paranoid, .maxlen = sizeof(sysctl_perf_event_paranoid), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "perf_event_mlock_kb", .data = &sysctl_perf_event_mlock, .maxlen = sizeof(sysctl_perf_event_mlock), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "perf_event_max_sample_rate", .data = &sysctl_perf_event_sample_rate, .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_KMEMCHECK { - .ctl_name = CTL_UNNUMBERED, .procname = "kmemcheck", .data = &kmemcheck_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_BLOCK { - .ctl_name = CTL_UNNUMBERED, .procname = "blk_iopoll", .data = &blk_iopoll_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ - { .ctl_name = 0 } + { } }; static struct ctl_table vm_table[] = { { - .ctl_name = VM_OVERCOMMIT_MEMORY, .procname = "overcommit_memory", .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_PANIC_ON_OOM, .procname = "panic_on_oom", .data = &sysctl_panic_on_oom, .maxlen = sizeof(sysctl_panic_on_oom), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "oom_kill_allocating_task", .data = &sysctl_oom_kill_allocating_task, .maxlen = sizeof(sysctl_oom_kill_allocating_task), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "oom_dump_tasks", .data = &sysctl_oom_dump_tasks, .maxlen = sizeof(sysctl_oom_dump_tasks), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_OVERCOMMIT_RATIO, .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_PAGE_CLUSTER, .procname = "page-cluster", .data = &page_cluster, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_DIRTY_BACKGROUND, .procname = "dirty_background_ratio", .data = &dirty_background_ratio, .maxlen = sizeof(dirty_background_ratio), .mode = 0644, - .proc_handler = &dirty_background_ratio_handler, - .strategy = &sysctl_intvec, + .proc_handler = dirty_background_ratio_handler, .extra1 = &zero, .extra2 = &one_hundred, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "dirty_background_bytes", .data = &dirty_background_bytes, .maxlen = sizeof(dirty_background_bytes), .mode = 0644, - .proc_handler = &dirty_background_bytes_handler, - .strategy = &sysctl_intvec, + .proc_handler = dirty_background_bytes_handler, .extra1 = &one_ul, }, { - .ctl_name = VM_DIRTY_RATIO, .procname = "dirty_ratio", .data = &vm_dirty_ratio, .maxlen = sizeof(vm_dirty_ratio), .mode = 0644, - .proc_handler = &dirty_ratio_handler, - .strategy = &sysctl_intvec, + .proc_handler = dirty_ratio_handler, .extra1 = &zero, .extra2 = &one_hundred, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "dirty_bytes", .data = &vm_dirty_bytes, .maxlen = sizeof(vm_dirty_bytes), .mode = 0644, - .proc_handler = &dirty_bytes_handler, - .strategy = &sysctl_intvec, + .proc_handler = dirty_bytes_handler, .extra1 = &dirty_bytes_min, }, { @@ -1127,383 +1025,363 @@ static struct ctl_table vm_table[] = { .data = &dirty_writeback_interval, .maxlen = sizeof(dirty_writeback_interval), .mode = 0644, - .proc_handler = &dirty_writeback_centisecs_handler, + .proc_handler = dirty_writeback_centisecs_handler, }, { .procname = "dirty_expire_centisecs", .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_NR_PDFLUSH_THREADS, .procname = "nr_pdflush_threads", .data = &nr_pdflush_threads, .maxlen = sizeof nr_pdflush_threads, .mode = 0444 /* read-only*/, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_SWAPPINESS, .procname = "swappiness", .data = &vm_swappiness, .maxlen = sizeof(vm_swappiness), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one_hundred, }, #ifdef CONFIG_HUGETLB_PAGE - { + { .procname = "nr_hugepages", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &hugetlb_sysctl_handler, + .proc_handler = hugetlb_sysctl_handler, .extra1 = (void *)&hugetlb_zero, .extra2 = (void *)&hugetlb_infinity, - }, + }, +#ifdef CONFIG_NUMA + { + .procname = "nr_hugepages_mempolicy", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_mempolicy_sysctl_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, + }, +#endif { - .ctl_name = VM_HUGETLB_GROUP, .procname = "hugetlb_shm_group", .data = &sysctl_hugetlb_shm_group, .maxlen = sizeof(gid_t), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "hugepages_treat_as_movable", .data = &hugepages_treat_as_movable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &hugetlb_treat_movable_handler, + .proc_handler = hugetlb_treat_movable_handler, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "nr_overcommit_hugepages", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &hugetlb_overcommit_handler, + .proc_handler = hugetlb_overcommit_handler, .extra1 = (void *)&hugetlb_zero, .extra2 = (void *)&hugetlb_infinity, }, #endif { - .ctl_name = VM_LOWMEM_RESERVE_RATIO, .procname = "lowmem_reserve_ratio", .data = &sysctl_lowmem_reserve_ratio, .maxlen = sizeof(sysctl_lowmem_reserve_ratio), .mode = 0644, - .proc_handler = &lowmem_reserve_ratio_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = lowmem_reserve_ratio_sysctl_handler, }, { - .ctl_name = VM_DROP_PAGECACHE, .procname = "drop_caches", .data = &sysctl_drop_caches, .maxlen = sizeof(int), .mode = 0644, .proc_handler = drop_caches_sysctl_handler, - .strategy = &sysctl_intvec, }, { - .ctl_name = VM_MIN_FREE_KBYTES, .procname = "min_free_kbytes", .data = &min_free_kbytes, .maxlen = sizeof(min_free_kbytes), .mode = 0644, - .proc_handler = &min_free_kbytes_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = min_free_kbytes_sysctl_handler, .extra1 = &zero, }, { - .ctl_name = VM_PERCPU_PAGELIST_FRACTION, .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, - .proc_handler = &percpu_pagelist_fraction_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = percpu_pagelist_fraction_sysctl_handler, .extra1 = &min_percpu_pagelist_fract, }, #ifdef CONFIG_MMU { - .ctl_name = VM_MAX_MAP_COUNT, .procname = "max_map_count", .data = &sysctl_max_map_count, .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = proc_dointvec, + .extra1 = &zero, }, #else { - .ctl_name = CTL_UNNUMBERED, .procname = "nr_trim_pages", .data = &sysctl_nr_trim_pages, .maxlen = sizeof(sysctl_nr_trim_pages), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, #endif { - .ctl_name = VM_LAPTOP_MODE, .procname = "laptop_mode", .data = &laptop_mode, .maxlen = sizeof(laptop_mode), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = VM_BLOCK_DUMP, .procname = "block_dump", .data = &block_dump, .maxlen = sizeof(block_dump), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, { - .ctl_name = VM_VFS_CACHE_PRESSURE, .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, .maxlen = sizeof(sysctl_vfs_cache_pressure), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT { - .ctl_name = VM_LEGACY_VA_LAYOUT, .procname = "legacy_va_layout", .data = &sysctl_legacy_va_layout, .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, #endif #ifdef CONFIG_NUMA { - .ctl_name = VM_ZONE_RECLAIM_MODE, .procname = "zone_reclaim_mode", .data = &zone_reclaim_mode, .maxlen = sizeof(zone_reclaim_mode), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, { - .ctl_name = VM_MIN_UNMAPPED, .procname = "min_unmapped_ratio", .data = &sysctl_min_unmapped_ratio, .maxlen = sizeof(sysctl_min_unmapped_ratio), .mode = 0644, - .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, .extra1 = &zero, .extra2 = &one_hundred, }, { - .ctl_name = VM_MIN_SLAB, .procname = "min_slab_ratio", .data = &sysctl_min_slab_ratio, .maxlen = sizeof(sysctl_min_slab_ratio), .mode = 0644, - .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = sysctl_min_slab_ratio_sysctl_handler, .extra1 = &zero, .extra2 = &one_hundred, }, #endif #ifdef CONFIG_SMP { - .ctl_name = CTL_UNNUMBERED, .procname = "stat_interval", .data = &sysctl_stat_interval, .maxlen = sizeof(sysctl_stat_interval), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "mmap_min_addr", .data = &dac_mmap_min_addr, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &mmap_min_addr_handler, + .proc_handler = mmap_min_addr_handler, }, #ifdef CONFIG_NUMA { - .ctl_name = CTL_UNNUMBERED, .procname = "numa_zonelist_order", .data = &numa_zonelist_order, .maxlen = NUMA_ZONELIST_ORDER_LEN, .mode = 0644, - .proc_handler = &numa_zonelist_order_handler, - .strategy = &sysctl_string, + .proc_handler = numa_zonelist_order_handler, }, #endif #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { - .ctl_name = VM_VDSO_ENABLED, .procname = "vdso_enabled", .data = &vdso_enabled, .maxlen = sizeof(vdso_enabled), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, #endif #ifdef CONFIG_HIGHMEM { - .ctl_name = CTL_UNNUMBERED, .procname = "highmem_is_dirtyable", .data = &vm_highmem_is_dirtyable, .maxlen = sizeof(vm_highmem_is_dirtyable), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "scan_unevictable_pages", .data = &scan_unevictable_pages, .maxlen = sizeof(scan_unevictable_pages), .mode = 0644, - .proc_handler = &scan_unevictable_handler, + .proc_handler = scan_unevictable_handler, }, +#ifdef CONFIG_MEMORY_FAILURE + { + .procname = "memory_failure_early_kill", + .data = &sysctl_memory_failure_early_kill, + .maxlen = sizeof(sysctl_memory_failure_early_kill), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "memory_failure_recovery", + .data = &sysctl_memory_failure_recovery, + .maxlen = sizeof(sysctl_memory_failure_recovery), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif + /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ - { .ctl_name = 0 } + { } }; #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) static struct ctl_table binfmt_misc_table[] = { - { .ctl_name = 0 } + { } }; #endif static struct ctl_table fs_table[] = { { - .ctl_name = FS_NRINODE, .procname = "inode-nr", .data = &inodes_stat, .maxlen = 2*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = FS_STATINODE, .procname = "inode-state", .data = &inodes_stat, .maxlen = 7*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .procname = "file-nr", .data = &files_stat, .maxlen = 3*sizeof(int), .mode = 0444, - .proc_handler = &proc_nr_files, + .proc_handler = proc_nr_files, }, { - .ctl_name = FS_MAXFILE, .procname = "file-max", .data = &files_stat.max_files, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "nr_open", .data = &sysctl_nr_open, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, { - .ctl_name = FS_DENTRY, .procname = "dentry-state", .data = &dentry_stat, .maxlen = 6*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = FS_OVERFLOWUID, .procname = "overflowuid", .data = &fs_overflowuid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &minolduid, .extra2 = &maxolduid, }, { - .ctl_name = FS_OVERFLOWGID, .procname = "overflowgid", .data = &fs_overflowgid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &minolduid, .extra2 = &maxolduid, }, #ifdef CONFIG_FILE_LOCKING { - .ctl_name = FS_LEASES, .procname = "leases-enable", .data = &leases_enable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_DNOTIFY { - .ctl_name = FS_DIR_NOTIFY, .procname = "dir-notify-enable", .data = &dir_notify_enable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_MMU #ifdef CONFIG_FILE_LOCKING { - .ctl_name = FS_LEASE_TIME, .procname = "lease-break-time", .data = &lease_break_time, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_AIO @@ -1512,19 +1390,18 @@ static struct ctl_table fs_table[] = { .data = &aio_nr, .maxlen = sizeof(aio_nr), .mode = 0444, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "aio-max-nr", .data = &aio_max_nr, .maxlen = sizeof(aio_max_nr), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, #endif /* CONFIG_AIO */ #ifdef CONFIG_INOTIFY_USER { - .ctl_name = FS_INOTIFY, .procname = "inotify", .mode = 0555, .child = inotify_table, @@ -1539,19 +1416,16 @@ static struct ctl_table fs_table[] = { #endif #endif { - .ctl_name = KERN_SETUID_DUMPABLE, .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &two, }, #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) { - .ctl_name = CTL_UNNUMBERED, .procname = "binfmt_misc", .mode = 0555, .child = binfmt_misc_table, @@ -1561,13 +1435,12 @@ static struct ctl_table fs_table[] = { * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ - { .ctl_name = 0 } + { } }; static struct ctl_table debug_table[] = { #if defined(CONFIG_X86) || defined(CONFIG_PPC) { - .ctl_name = CTL_UNNUMBERED, .procname = "exception-trace", .data = &show_unhandled_signals, .maxlen = sizeof(int), @@ -1575,11 +1448,11 @@ static struct ctl_table debug_table[] = { .proc_handler = proc_dointvec }, #endif - { .ctl_name = 0 } + { } }; static struct ctl_table dev_table[] = { - { .ctl_name = 0 } + { } }; static DEFINE_SPINLOCK(sysctl_lock); @@ -1733,122 +1606,6 @@ void register_sysctl_root(struct ctl_table_root *root) spin_unlock(&sysctl_lock); } -#ifdef CONFIG_SYSCTL_SYSCALL -/* Perform the actual read/write of a sysctl table entry. */ -static int do_sysctl_strategy(struct ctl_table_root *root, - struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - int op = 0, rc; - - if (oldval) - op |= MAY_READ; - if (newval) - op |= MAY_WRITE; - if (sysctl_perm(root, table, op)) - return -EPERM; - - if (table->strategy) { - rc = table->strategy(table, oldval, oldlenp, newval, newlen); - if (rc < 0) - return rc; - if (rc > 0) - return 0; - } - - /* If there is no strategy routine, or if the strategy returns - * zero, proceed with automatic r/w */ - if (table->data && table->maxlen) { - rc = sysctl_data(table, oldval, oldlenp, newval, newlen); - if (rc < 0) - return rc; - } - return 0; -} - -static int parse_table(int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, - struct ctl_table_root *root, - struct ctl_table *table) -{ - int n; -repeat: - if (!nlen) - return -ENOTDIR; - if (get_user(n, name)) - return -EFAULT; - for ( ; table->ctl_name || table->procname; table++) { - if (!table->ctl_name) - continue; - if (n == table->ctl_name) { - int error; - if (table->child) { - if (sysctl_perm(root, table, MAY_EXEC)) - return -EPERM; - name++; - nlen--; - table = table->child; - goto repeat; - } - error = do_sysctl_strategy(root, table, - oldval, oldlenp, - newval, newlen); - return error; - } - } - return -ENOTDIR; -} - -int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - struct ctl_table_header *head; - int error = -ENOTDIR; - - if (nlen <= 0 || nlen >= CTL_MAXNAME) - return -ENOTDIR; - if (oldval) { - int old_len; - if (!oldlenp || get_user(old_len, oldlenp)) - return -EFAULT; - } - - for (head = sysctl_head_next(NULL); head; - head = sysctl_head_next(head)) { - error = parse_table(name, nlen, oldval, oldlenp, - newval, newlen, - head->root, head->ctl_table); - if (error != -ENOTDIR) { - sysctl_head_finish(head); - break; - } - } - return error; -} - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - int error; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - error = deprecated_sysctl_warning(&tmp); - if (error) - goto out; - - lock_kernel(); - error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp, - tmp.newval, tmp.newlen); - unlock_kernel(); -out: - return error; -} -#endif /* CONFIG_SYSCTL_SYSCALL */ - /* * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. @@ -1884,7 +1641,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) { - for (; table->ctl_name || table->procname; table++) { + for (; table->procname; table++) { table->parent = parent; if (table->child) sysctl_set_parent(table, table->child); @@ -1916,11 +1673,11 @@ static struct ctl_table *is_branch_in(struct ctl_table *branch, return NULL; /* ... and nothing else */ - if (branch[1].procname || branch[1].ctl_name) + if (branch[1].procname) return NULL; /* table should contain subdirectory with the same name */ - for (p = table; p->procname || p->ctl_name; p++) { + for (p = table; p->procname; p++) { if (!p->child) continue; if (p->procname && strcmp(p->procname, s) == 0) @@ -1965,9 +1722,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) * * The members of the &struct ctl_table structure are used as follows: * - * ctl_name - This is the numeric sysctl value used by sysctl(2). The number - * must be unique within that level of sysctl - * * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not * enter a sysctl file * @@ -1982,8 +1736,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) * * proc_handler - the text handler routine (described below) * - * strategy - the strategy routine (described below) - * * de - for internal use by the sysctl routines * * extra1, extra2 - extra pointers usable by the proc handler routines @@ -1996,19 +1748,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) * struct enable minimal validation of the values being written to be * performed, and the mode field allows minimal authentication. * - * More sophisticated management can be enabled by the provision of a - * strategy routine with the table entry. This will be called before - * any automatic read or write of the data is performed. - * - * The strategy routine may return - * - * < 0 - Error occurred (error is passed to user process) - * - * 0 - OK - proceed with automatic read or write. - * - * > 0 - OK - read or write has been done by the strategy routine, so - * return immediately. - * * There must be a proc_handler routine for any terminal nodes * mirrored under /proc/sys (non-terminals are handled by a built-in * directory handler). Several default handlers are available to @@ -2035,13 +1774,13 @@ struct ctl_table_header *__register_sysctl_paths( struct ctl_table_set *set; /* Count the path components */ - for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) + for (npath = 0; path[npath].procname; ++npath) ; /* * For each path component, allocate a 2-element ctl_table array. * The first array element will be filled with the sysctl entry - * for this, the second will be the sentinel (ctl_name == 0). + * for this, the second will be the sentinel (procname == 0). * * We allocate everything in one go so that we don't have to * worry about freeing additional memory in unregister_sysctl_table. @@ -2058,7 +1797,6 @@ struct ctl_table_header *__register_sysctl_paths( for (n = 0; n < npath; ++n, ++path) { /* Copy the procname */ new->procname = path->procname; - new->ctl_name = path->ctl_name; new->mode = 0555; *prevp = new; @@ -2218,7 +1956,7 @@ void sysctl_head_put(struct ctl_table_header *head) #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(void* data, int maxlen, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -2279,7 +2017,6 @@ static int _proc_do_string(void* data, int maxlen, int write, * proc_dostring - read a string sysctl * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2293,10 +2030,10 @@ static int _proc_do_string(void* data, int maxlen, int write, * * Returns 0 on success. */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, filp, + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, ppos); } @@ -2321,7 +2058,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, } static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, struct file *filp, void __user *buffer, + int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -2428,13 +2165,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, #undef TMPBUFLEN } -static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, +static int do_proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { - return __do_proc_dointvec(table->data, table, write, filp, + return __do_proc_dointvec(table->data, table, write, buffer, lenp, ppos, conv, data); } @@ -2442,7 +2179,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * proc_dointvec - read a vector of integers * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2452,10 +2188,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, NULL,NULL); } @@ -2463,7 +2199,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, * Taint values can only be increased * This means we can safely use a temporary. */ -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -2475,7 +2211,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp, t = *table; t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; @@ -2527,7 +2263,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * proc_dointvec_minmax - read a vector of integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2540,19 +2275,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, .max = (int *) table->extra2, }; - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_minmax_conv, ¶m); } static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, @@ -2657,21 +2391,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, - filp, buffer, lenp, ppos, convmul, convdiv); + buffer, lenp, ppos, convmul, convdiv); } /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2684,17 +2416,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } /** * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2709,11 +2440,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); } @@ -2789,7 +2519,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2801,10 +2530,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); } @@ -2812,7 +2541,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: pointer to the file position @@ -2824,10 +2552,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * * Returns 0 on success. */ -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); } @@ -2835,7 +2563,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2848,14 +2575,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * * Returns 0 on success. */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct pid *new_pid; @@ -2864,7 +2591,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp tmp = pid_vnr(cad_pid); - r = __do_proc_dointvec(&tmp, table, write, filp, buffer, + r = __do_proc_dointvec(&tmp, table, write, buffer, lenp, ppos, NULL, NULL); if (r || !write) return r; @@ -2879,50 +2606,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp #else /* CONFIG_PROC_FS */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2932,286 +2658,6 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, #endif /* CONFIG_PROC_FS */ - -#ifdef CONFIG_SYSCTL_SYSCALL -/* - * General sysctl support routines - */ - -/* The generic sysctl data routine (used if no strategy routine supplied) */ -int sysctl_data(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - size_t len; - - /* Get out of I don't have a variable */ - if (!table->data || !table->maxlen) - return -ENOTDIR; - - if (oldval && oldlenp) { - if (get_user(len, oldlenp)) - return -EFAULT; - if (len) { - if (len > table->maxlen) - len = table->maxlen; - if (copy_to_user(oldval, table->data, len)) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - - if (newval && newlen) { - if (newlen > table->maxlen) - newlen = table->maxlen; - - if (copy_from_user(table->data, newval, newlen)) - return -EFAULT; - } - return 1; -} - -/* The generic string strategy routine: */ -int sysctl_string(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - if (!table->data || !table->maxlen) - return -ENOTDIR; - - if (oldval && oldlenp) { - size_t bufsize; - if (get_user(bufsize, oldlenp)) - return -EFAULT; - if (bufsize) { - size_t len = strlen(table->data), copied; - - /* This shouldn't trigger for a well-formed sysctl */ - if (len > table->maxlen) - len = table->maxlen; - - /* Copy up to a max of bufsize-1 bytes of the string */ - copied = (len >= bufsize) ? bufsize - 1 : len; - - if (copy_to_user(oldval, table->data, copied) || - put_user(0, (char __user *)(oldval + copied))) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - if (newval && newlen) { - size_t len = newlen; - if (len > table->maxlen) - len = table->maxlen; - if(copy_from_user(table->data, newval, len)) - return -EFAULT; - if (len == table->maxlen) - len--; - ((char *) table->data)[len] = 0; - } - return 1; -} - -/* - * This function makes sure that all of the integers in the vector - * are between the minimum and maximum values given in the arrays - * table->extra1 and table->extra2, respectively. - */ -int sysctl_intvec(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - - if (newval && newlen) { - int __user *vec = (int __user *) newval; - int *min = (int *) table->extra1; - int *max = (int *) table->extra2; - size_t length; - int i; - - if (newlen % sizeof(int) != 0) - return -EINVAL; - - if (!table->extra1 && !table->extra2) - return 0; - - if (newlen > table->maxlen) - newlen = table->maxlen; - length = newlen / sizeof(int); - - for (i = 0; i < length; i++) { - int value; - if (get_user(value, vec + i)) - return -EFAULT; - if (min && value < min[i]) - return -EINVAL; - if (max && value > max[i]) - return -EINVAL; - } - } - return 0; -} - -/* Strategy function to convert jiffies to seconds */ -int sysctl_jiffies(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - if (oldval && oldlenp) { - size_t olen; - - if (get_user(olen, oldlenp)) - return -EFAULT; - if (olen) { - int val; - - if (olen < sizeof(int)) - return -EINVAL; - - val = *(int *)(table->data) / HZ; - if (put_user(val, (int __user *)oldval)) - return -EFAULT; - if (put_user(sizeof(int), oldlenp)) - return -EFAULT; - } - } - if (newval && newlen) { - int new; - if (newlen != sizeof(int)) - return -EINVAL; - if (get_user(new, (int __user *)newval)) - return -EFAULT; - *(int *)(table->data) = new*HZ; - } - return 1; -} - -/* Strategy function to convert jiffies to seconds */ -int sysctl_ms_jiffies(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - if (oldval && oldlenp) { - size_t olen; - - if (get_user(olen, oldlenp)) - return -EFAULT; - if (olen) { - int val; - - if (olen < sizeof(int)) - return -EINVAL; - - val = jiffies_to_msecs(*(int *)(table->data)); - if (put_user(val, (int __user *)oldval)) - return -EFAULT; - if (put_user(sizeof(int), oldlenp)) - return -EFAULT; - } - } - if (newval && newlen) { - int new; - if (newlen != sizeof(int)) - return -EINVAL; - if (get_user(new, (int __user *)newval)) - return -EFAULT; - *(int *)(table->data) = msecs_to_jiffies(new); - } - return 1; -} - - - -#else /* CONFIG_SYSCTL_SYSCALL */ - - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - int error; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - error = deprecated_sysctl_warning(&tmp); - - /* If no error reading the parameters then just -ENOSYS ... */ - if (!error) - error = -ENOSYS; - - return error; -} - -int sysctl_data(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_string(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_intvec(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_jiffies(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_ms_jiffies(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -#endif /* CONFIG_SYSCTL_SYSCALL */ - -static int deprecated_sysctl_warning(struct __sysctl_args *args) -{ - static int msg_count; - int name[CTL_MAXNAME]; - int i; - - /* Check args->nlen. */ - if (args->nlen < 0 || args->nlen > CTL_MAXNAME) - return -ENOTDIR; - - /* Read in the sysctl name for better debug message logging */ - for (i = 0; i < args->nlen; i++) - if (get_user(name[i], args->name + i)) - return -EFAULT; - - /* Ignore accesses to kernel.version */ - if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION)) - return 0; - - if (msg_count < 5) { - msg_count++; - printk(KERN_INFO - "warning: process `%s' used the deprecated sysctl " - "system call with ", current->comm); - for (i = 0; i < args->nlen; i++) - printk("%d.", name[i]); - printk("\n"); - } - return 0; -} - /* * No sense putting this after each symbol definition, twice, * exception granted :-) @@ -3226,9 +2672,4 @@ EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); EXPORT_SYMBOL(register_sysctl_table); EXPORT_SYMBOL(register_sysctl_paths); -EXPORT_SYMBOL(sysctl_intvec); -EXPORT_SYMBOL(sysctl_jiffies); -EXPORT_SYMBOL(sysctl_ms_jiffies); -EXPORT_SYMBOL(sysctl_string); -EXPORT_SYMBOL(sysctl_data); EXPORT_SYMBOL(unregister_sysctl_table); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c new file mode 100644 index 00000000000..b75dbf40f57 --- /dev/null +++ b/kernel/sysctl_binary.c @@ -0,0 +1,1507 @@ +#include <linux/stat.h> +#include <linux/sysctl.h> +#include "../fs/xfs/linux-2.6/xfs_sysctl.h" +#include <linux/sunrpc/debug.h> +#include <linux/string.h> +#include <net/ip_vs.h> +#include <linux/syscalls.h> +#include <linux/namei.h> +#include <linux/mount.h> +#include <linux/fs.h> +#include <linux/nsproxy.h> +#include <linux/pid_namespace.h> +#include <linux/file.h> +#include <linux/ctype.h> +#include <linux/netdevice.h> + +#ifdef CONFIG_SYSCTL_SYSCALL + +struct bin_table; +typedef ssize_t bin_convert_t(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen); + +static bin_convert_t bin_dir; +static bin_convert_t bin_string; +static bin_convert_t bin_intvec; +static bin_convert_t bin_ulongvec; +static bin_convert_t bin_uuid; +static bin_convert_t bin_dn_node_address; + +#define CTL_DIR bin_dir +#define CTL_STR bin_string +#define CTL_INT bin_intvec +#define CTL_ULONG bin_ulongvec +#define CTL_UUID bin_uuid +#define CTL_DNADR bin_dn_node_address + +#define BUFSZ 256 + +struct bin_table { + bin_convert_t *convert; + int ctl_name; + const char *procname; + const struct bin_table *child; +}; + +static const struct bin_table bin_random_table[] = { + { CTL_INT, RANDOM_POOLSIZE, "poolsize" }, + { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" }, + { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" }, + { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" }, + { CTL_UUID, RANDOM_BOOT_ID, "boot_id" }, + { CTL_UUID, RANDOM_UUID, "uuid" }, + {} +}; + +static const struct bin_table bin_pty_table[] = { + { CTL_INT, PTY_MAX, "max" }, + { CTL_INT, PTY_NR, "nr" }, + {} +}; + +static const struct bin_table bin_kern_table[] = { + { CTL_STR, KERN_OSTYPE, "ostype" }, + { CTL_STR, KERN_OSRELEASE, "osrelease" }, + /* KERN_OSREV not used */ + { CTL_STR, KERN_VERSION, "version" }, + /* KERN_SECUREMASK not used */ + /* KERN_PROF not used */ + { CTL_STR, KERN_NODENAME, "hostname" }, + { CTL_STR, KERN_DOMAINNAME, "domainname" }, + + { CTL_INT, KERN_PANIC, "panic" }, + { CTL_INT, KERN_REALROOTDEV, "real-root-dev" }, + + { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" }, + { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" }, + { CTL_INT, KERN_PRINTK, "printk" }, + + /* KERN_NAMETRANS not used */ + /* KERN_PPC_HTABRECLAIM not used */ + /* KERN_PPC_ZEROPAGED not used */ + { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" }, + + { CTL_STR, KERN_MODPROBE, "modprobe" }, + { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" }, + { CTL_INT, KERN_ACCT, "acct" }, + /* KERN_PPC_L2CR "l2cr" no longer used */ + + /* KERN_RTSIGNR not used */ + /* KERN_RTSIGMAX not used */ + + { CTL_ULONG, KERN_SHMMAX, "shmmax" }, + { CTL_INT, KERN_MSGMAX, "msgmax" }, + { CTL_INT, KERN_MSGMNB, "msgmnb" }, + /* KERN_MSGPOOL not used*/ + { CTL_INT, KERN_SYSRQ, "sysrq" }, + { CTL_INT, KERN_MAX_THREADS, "threads-max" }, + { CTL_DIR, KERN_RANDOM, "random", bin_random_table }, + { CTL_ULONG, KERN_SHMALL, "shmall" }, + { CTL_INT, KERN_MSGMNI, "msgmni" }, + { CTL_INT, KERN_SEM, "sem" }, + { CTL_INT, KERN_SPARC_STOP_A, "stop-a" }, + { CTL_INT, KERN_SHMMNI, "shmmni" }, + + { CTL_INT, KERN_OVERFLOWUID, "overflowuid" }, + { CTL_INT, KERN_OVERFLOWGID, "overflowgid" }, + + { CTL_STR, KERN_HOTPLUG, "hotplug", }, + { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" }, + + { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" }, + { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" }, + /* KERN_TAINTED "tainted" no longer used */ + { CTL_INT, KERN_CADPID, "cad_pid" }, + { CTL_INT, KERN_PIDMAX, "pid_max" }, + { CTL_STR, KERN_CORE_PATTERN, "core_pattern" }, + { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" }, + { CTL_INT, KERN_HPPA_PWRSW, "soft-power" }, + { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" }, + + { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" }, + { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" }, + + { CTL_DIR, KERN_PTY, "pty", bin_pty_table }, + { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" }, + { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, + /* KERN_HZ_TIMER "hz_timer" no longer used */ + { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, + { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" }, + { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" }, + + { CTL_INT, KERN_SPIN_RETRY, "spin_retry" }, + /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */ + { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, + { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, + { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, + { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" }, + { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, + {} +}; + +static const struct bin_table bin_vm_table[] = { + { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, + { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" }, + { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, + { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, + /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ + /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ + { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, + { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, + /* VM_PAGEBUF unused */ + /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ + { CTL_INT, VM_SWAPPINESS, "swappiness" }, + { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" }, + { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" }, + { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" }, + { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" }, + { CTL_INT, VM_BLOCK_DUMP, "block_dump" }, + { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" }, + { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" }, + { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" }, + /* VM_SWAP_TOKEN_TIMEOUT unused */ + { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" }, + { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" }, + { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" }, + { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" }, + { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" }, + { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" }, + { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" }, + + {} +}; + +static const struct bin_table bin_net_core_table[] = { + { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" }, + { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" }, + { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" }, + { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" }, + /* NET_CORE_DESTROY_DELAY unused */ + { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" }, + /* NET_CORE_FASTROUTE unused */ + { CTL_INT, NET_CORE_MSG_COST, "message_cost" }, + { CTL_INT, NET_CORE_MSG_BURST, "message_burst" }, + { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" }, + /* NET_CORE_HOT_LIST_LENGTH unused */ + /* NET_CORE_DIVERT_VERSION unused */ + /* NET_CORE_NO_CONG_THRESH unused */ + /* NET_CORE_NO_CONG unused */ + /* NET_CORE_LO_CONG unused */ + /* NET_CORE_MOD_CONG unused */ + { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" }, + { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" }, + { CTL_INT, NET_CORE_BUDGET, "netdev_budget" }, + { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, + { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, + { CTL_INT, NET_CORE_WARNINGS, "warnings" }, + {}, +}; + +static const struct bin_table bin_net_unix_table[] = { + /* NET_UNIX_DESTROY_DELAY unused */ + /* NET_UNIX_DELETE_DELAY unused */ + { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, + {} +}; + +static const struct bin_table bin_net_ipv4_route_table[] = { + { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" }, + /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */ + /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */ + { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" }, + { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" }, + { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, + { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, + { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, + { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, + { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, + { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, + { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, + { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" }, + { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" }, + { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" }, + { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, + { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, + { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, + { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" }, + {} +}; + +static const struct bin_table bin_net_ipv4_conf_vars_table[] = { + { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" }, + { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, + + { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" }, + { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" }, + { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" }, + { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" }, + { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" }, + { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" }, + { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" }, + { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" }, + { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" }, + { CTL_INT, NET_IPV4_CONF_TAG, "tag" }, + { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" }, + { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" }, + { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, + { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, + { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, + + { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" }, + { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" }, + { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, + { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, + {} +}; + +static const struct bin_table bin_net_ipv4_conf_table[] = { + { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table }, + { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table }, + { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table }, + {} +}; + +static const struct bin_table bin_net_neigh_vars_table[] = { + { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, + { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, + { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" }, + /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */ + { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" }, + { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" }, + { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" }, + { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" }, + { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" }, + /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */ + /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */ + /* NET_NEIGH_LOCKTIME "locktime" no longer used */ + { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" }, + { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" }, + { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" }, + { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" }, + { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, + { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, + {} +}; + +static const struct bin_table bin_net_neigh_table[] = { + { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table }, + { CTL_DIR, 0, NULL, bin_net_neigh_vars_table }, + {} +}; + +static const struct bin_table bin_net_ipv4_netfilter_table[] = { + { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, + + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */ + + /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */ + /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */ + /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */ + /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */ + + { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" }, + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */ + { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" }, + + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ + + { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" }, + {} +}; + +static const struct bin_table bin_net_ipv4_table[] = { + {CTL_INT, NET_IPV4_FORWARD, "ip_forward" }, + + { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table }, + { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table }, + { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table }, + /* NET_IPV4_FIB_HASH unused */ + { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table }, + + { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, + { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, + { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" }, + { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" }, + { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" }, + /* NET_IPV4_AUTOCONFIG unused */ + { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" }, + { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" }, + { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" }, + { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" }, + { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" }, + { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" }, + { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" }, + { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" }, + { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" }, + { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, + { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, + { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, + { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, + { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, + { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, + { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, + { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" }, + { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" }, + { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" }, + { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" }, + { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" }, + { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" }, + { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" }, + { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" }, + { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" }, + { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" }, + { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" }, + { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" }, + { CTL_INT, NET_TCP_FACK, "tcp_fack" }, + { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" }, + { CTL_INT, NET_TCP_ECN, "tcp_ecn" }, + { CTL_INT, NET_TCP_DSACK, "tcp_dsack" }, + { CTL_INT, NET_TCP_MEM, "tcp_mem" }, + { CTL_INT, NET_TCP_WMEM, "tcp_wmem" }, + { CTL_INT, NET_TCP_RMEM, "tcp_rmem" }, + { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" }, + { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" }, + { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" }, + { CTL_INT, NET_TCP_FRTO, "tcp_frto" }, + { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, + { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" }, + { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" }, + { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, + { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, + { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, + { CTL_INT, NET_TCP_ABC, "tcp_abc" }, + { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, + { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, + { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, + { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, + { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, + { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, + { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, + { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" }, + { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" }, + /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */ + { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, + { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, + + { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" }, + { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" }, + { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" }, + { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" }, + { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" }, + { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" }, + + { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" }, + { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" }, + { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" }, + + { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, + /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ + + { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, + + /* NET_TCP_DEFAULT_WIN_SCALE unused */ + /* NET_TCP_BIC_BETA unused */ + /* NET_IPV4_TCP_MAX_KA_PROBES unused */ + /* NET_IPV4_IP_MASQ_DEBUG unused */ + /* NET_TCP_SYN_TAILDROP unused */ + /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */ + /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */ + /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */ + /* NET_IPV4_ICMP_PARAMPROB_RATE unused */ + /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */ + /* NET_IPV4_ALWAYS_DEFRAG unused */ + {} +}; + +static const struct bin_table bin_net_ipx_table[] = { + { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, + /* NET_IPX_FORWARDING unused */ + {} +}; + +static const struct bin_table bin_net_atalk_table[] = { + { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, + { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, + { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, + { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" }, + {}, +}; + +static const struct bin_table bin_net_netrom_table[] = { + { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, + { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, + { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, + { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" }, + { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" }, + { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" }, + { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" }, + { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" }, + { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" }, + { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" }, + { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" }, + { CTL_INT, NET_NETROM_RESET, "reset" }, + {} +}; + +static const struct bin_table bin_net_ax25_param_table[] = { + { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, + { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, + { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" }, + { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" }, + { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" }, + { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" }, + { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" }, + { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" }, + { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" }, + { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" }, + { CTL_INT, NET_AX25_N2, "maximum_retry_count" }, + { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" }, + { CTL_INT, NET_AX25_PROTOCOL, "protocol" }, + { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" }, + {} +}; + +static const struct bin_table bin_net_ax25_table[] = { + { CTL_DIR, 0, NULL, bin_net_ax25_param_table }, + {} +}; + +static const struct bin_table bin_net_rose_table[] = { + { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, + { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, + { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, + { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, + { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" }, + { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" }, + { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" }, + { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" }, + { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" }, + { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" }, + {} +}; + +static const struct bin_table bin_net_ipv6_conf_var_table[] = { + { CTL_INT, NET_IPV6_FORWARDING, "forwarding" }, + { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" }, + { CTL_INT, NET_IPV6_MTU, "mtu" }, + { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" }, + { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" }, + { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" }, + { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" }, + { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" }, + { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" }, + { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" }, + { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" }, + { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" }, + { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" }, + { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" }, + { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" }, + { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" }, + { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" }, + { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, + { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, + { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + {} +}; + +static const struct bin_table bin_net_ipv6_conf_table[] = { + { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table }, + { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table }, + { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table }, + {} +}; + +static const struct bin_table bin_net_ipv6_route_table[] = { + /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */ + { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, + { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, + { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, + { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" }, + { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" }, + { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" }, + { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" }, + { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" }, + { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, + {} +}; + +static const struct bin_table bin_net_ipv6_icmp_table[] = { + { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, + {} +}; + +static const struct bin_table bin_net_ipv6_table[] = { + { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table }, + { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table }, + { CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table }, + { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table }, + { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" }, + { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" }, + { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" }, + { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" }, + { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" }, + { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" }, + { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" }, + {} +}; + +static const struct bin_table bin_net_x25_table[] = { + { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, + { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, + { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, + { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, + { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" }, + { CTL_INT, NET_X25_FORWARD, "x25_forward" }, + {} +}; + +static const struct bin_table bin_net_tr_table[] = { + { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" }, + {} +}; + + +static const struct bin_table bin_net_decnet_conf_vars[] = { + { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, + { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" }, + { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" }, + { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" }, + {} +}; + +static const struct bin_table bin_net_decnet_conf[] = { + { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars }, + { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars }, + {} +}; + +static const struct bin_table bin_net_decnet_table[] = { + { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf }, + { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" }, + { CTL_STR, NET_DECNET_NODE_NAME, "node_name" }, + { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" }, + { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" }, + { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" }, + { CTL_INT, NET_DECNET_DI_COUNT, "di_count" }, + { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" }, + { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" }, + { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" }, + { CTL_INT, NET_DECNET_MEM, "decnet_mem" }, + { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" }, + { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" }, + { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" }, + {} +}; + +static const struct bin_table bin_net_sctp_table[] = { + { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" }, + { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" }, + { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" }, + { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" }, + { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" }, + { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" }, + { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" }, + { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" }, + { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" }, + { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" }, + { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" }, + { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" }, + { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" }, + { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" }, + { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" }, + { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" }, + { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" }, + {} +}; + +static const struct bin_table bin_net_llc_llc2_timeout_table[] = { + { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" }, + { CTL_INT, NET_LLC2_P_TIMEOUT, "p" }, + { CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" }, + { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" }, + {} +}; + +static const struct bin_table bin_net_llc_station_table[] = { + { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, + {} +}; + +static const struct bin_table bin_net_llc_llc2_table[] = { + { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table }, + {} +}; + +static const struct bin_table bin_net_llc_table[] = { + { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table }, + { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table }, + {} +}; + +static const struct bin_table bin_net_netfilter_table[] = { + { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, + /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */ + /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */ + /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */ + /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */ + /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" }, + { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" }, + /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" }, + { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" }, + { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" }, + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" }, + /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */ + /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" }, + { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" }, + { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" }, + + {} +}; + +static const struct bin_table bin_net_irda_table[] = { + { CTL_INT, NET_IRDA_DISCOVERY, "discovery" }, + { CTL_STR, NET_IRDA_DEVNAME, "devname" }, + { CTL_INT, NET_IRDA_DEBUG, "debug" }, + { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" }, + { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" }, + { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" }, + { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" }, + { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" }, + { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" }, + { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" }, + { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" }, + { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" }, + { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" }, + { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" }, + {} +}; + +static const struct bin_table bin_net_table[] = { + { CTL_DIR, NET_CORE, "core", bin_net_core_table }, + /* NET_ETHER not used */ + /* NET_802 not used */ + { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table }, + { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table }, + { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table }, + { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table }, + { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table }, + { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table }, + /* NET_BRIDGE "bridge" no longer used */ + { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table }, + { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table }, + { CTL_DIR, NET_X25, "x25", bin_net_x25_table }, + { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table }, + { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table }, + /* NET_ECONET not used */ + { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table }, + { CTL_DIR, NET_LLC, "llc", bin_net_llc_table }, + { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table }, + /* NET_DCCP "dccp" no longer used */ + { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table }, + { CTL_INT, 2089, "nf_conntrack_max" }, + {} +}; + +static const struct bin_table bin_fs_quota_table[] = { + { CTL_INT, FS_DQ_LOOKUPS, "lookups" }, + { CTL_INT, FS_DQ_DROPS, "drops" }, + { CTL_INT, FS_DQ_READS, "reads" }, + { CTL_INT, FS_DQ_WRITES, "writes" }, + { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" }, + { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" }, + { CTL_INT, FS_DQ_FREE, "free_dquots" }, + { CTL_INT, FS_DQ_SYNCS, "syncs" }, + { CTL_INT, FS_DQ_WARNINGS, "warnings" }, + {} +}; + +static const struct bin_table bin_fs_xfs_table[] = { + { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" }, + { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" }, + { CTL_INT, XFS_PANIC_MASK, "panic_mask" }, + + { CTL_INT, XFS_ERRLEVEL, "error_level" }, + { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" }, + { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" }, + { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" }, + { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" }, + { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" }, + { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" }, + { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" }, + { CTL_INT, XFS_ROTORSTEP, "rotorstep" }, + { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" }, + { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" }, + { CTL_INT, XFS_STATS_CLEAR, "stats_clear" }, + {} +}; + +static const struct bin_table bin_fs_ocfs2_nm_table[] = { + { CTL_STR, 1, "hb_ctl_path" }, + {} +}; + +static const struct bin_table bin_fs_ocfs2_table[] = { + { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table }, + {} +}; + +static const struct bin_table bin_inotify_table[] = { + { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, + { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, + { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, + {} +}; + +static const struct bin_table bin_fs_table[] = { + { CTL_INT, FS_NRINODE, "inode-nr" }, + { CTL_INT, FS_STATINODE, "inode-state" }, + /* FS_MAXINODE unused */ + /* FS_NRDQUOT unused */ + /* FS_MAXDQUOT unused */ + /* FS_NRFILE "file-nr" no longer used */ + { CTL_INT, FS_MAXFILE, "file-max" }, + { CTL_INT, FS_DENTRY, "dentry-state" }, + /* FS_NRSUPER unused */ + /* FS_MAXUPSER unused */ + { CTL_INT, FS_OVERFLOWUID, "overflowuid" }, + { CTL_INT, FS_OVERFLOWGID, "overflowgid" }, + { CTL_INT, FS_LEASES, "leases-enable" }, + { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" }, + { CTL_INT, FS_LEASE_TIME, "lease-break-time" }, + { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table }, + { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table }, + { CTL_ULONG, FS_AIO_NR, "aio-nr" }, + { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" }, + { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table }, + { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table }, + { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" }, + {} +}; + +static const struct bin_table bin_ipmi_table[] = { + { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, + {} +}; + +static const struct bin_table bin_mac_hid_files[] = { + /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ + /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ + { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, + { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" }, + { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" }, + /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */ + {} +}; + +static const struct bin_table bin_raid_table[] = { + { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, + { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, + {} +}; + +static const struct bin_table bin_scsi_table[] = { + { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" }, + {} +}; + +static const struct bin_table bin_dev_table[] = { + /* DEV_CDROM "cdrom" no longer used */ + /* DEV_HWMON unused */ + /* DEV_PARPORT "parport" no longer used */ + { CTL_DIR, DEV_RAID, "raid", bin_raid_table }, + { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files }, + { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table }, + { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table }, + {} +}; + +static const struct bin_table bin_bus_isa_table[] = { + { CTL_INT, BUS_ISA_MEM_BASE, "membase" }, + { CTL_INT, BUS_ISA_PORT_BASE, "portbase" }, + { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" }, + {} +}; + +static const struct bin_table bin_bus_table[] = { + { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table }, + {} +}; + + +static const struct bin_table bin_s390dbf_table[] = { + { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, + { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, + {} +}; + +static const struct bin_table bin_sunrpc_table[] = { + /* CTL_RPCDEBUG "rpc_debug" no longer used */ + /* CTL_NFSDEBUG "nfs_debug" no longer used */ + /* CTL_NFSDDEBUG "nfsd_debug" no longer used */ + /* CTL_NLMDEBUG "nlm_debug" no longer used */ + + { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" }, + { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" }, + { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" }, + { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" }, + {} +}; + +static const struct bin_table bin_pm_table[] = { + /* frv specific */ + /* 1 == CTL_PM_SUSPEND "suspend" no longer used" */ + { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" }, + { CTL_INT, 3 /* CTL_PM_P0 */, "p0" }, + { CTL_INT, 4 /* CTL_PM_CM */, "cm" }, + {} +}; + +static const struct bin_table bin_root_table[] = { + { CTL_DIR, CTL_KERN, "kernel", bin_kern_table }, + { CTL_DIR, CTL_VM, "vm", bin_vm_table }, + { CTL_DIR, CTL_NET, "net", bin_net_table }, + /* CTL_PROC not used */ + { CTL_DIR, CTL_FS, "fs", bin_fs_table }, + /* CTL_DEBUG "debug" no longer used */ + { CTL_DIR, CTL_DEV, "dev", bin_dev_table }, + { CTL_DIR, CTL_BUS, "bus", bin_bus_table }, + { CTL_DIR, CTL_ABI, "abi" }, + /* CTL_CPU not used */ + /* CTL_ARLAN "arlan" no longer used */ + { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table }, + { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table }, + { CTL_DIR, CTL_PM, "pm", bin_pm_table }, + {} +}; + +static ssize_t bin_dir(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + return -ENOTDIR; +} + + +static ssize_t bin_string(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + ssize_t result, copied = 0; + + if (oldval && oldlen) { + char __user *lastp; + loff_t pos = 0; + int ch; + + result = vfs_read(file, oldval, oldlen, &pos); + if (result < 0) + goto out; + + copied = result; + lastp = oldval + copied - 1; + + result = -EFAULT; + if (get_user(ch, lastp)) + goto out; + + /* Trim off the trailing newline */ + if (ch == '\n') { + result = -EFAULT; + if (put_user('\0', lastp)) + goto out; + copied -= 1; + } + } + + if (newval && newlen) { + loff_t pos = 0; + + result = vfs_write(file, newval, newlen, &pos); + if (result < 0) + goto out; + } + + result = copied; +out: + return result; +} + +static ssize_t bin_intvec(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t copied = 0; + char *buffer; + ssize_t result; + + result = -ENOMEM; + buffer = kmalloc(BUFSZ, GFP_KERNEL); + if (!buffer) + goto out; + + if (oldval && oldlen) { + unsigned __user *vec = oldval; + size_t length = oldlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + set_fs(KERNEL_DS); + result = vfs_read(file, buffer, BUFSZ - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + + str = buffer; + end = str + result; + *end++ = '\0'; + for (i = 0; i < length; i++) { + unsigned long value; + + value = simple_strtoul(str, &str, 10); + while (isspace(*str)) + str++; + + result = -EFAULT; + if (put_user(value, vec + i)) + goto out_kfree; + + copied += sizeof(*vec); + if (!isdigit(*str)) + break; + } + } + + if (newval && newlen) { + unsigned __user *vec = newval; + size_t length = newlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + str = buffer; + end = str + BUFSZ; + for (i = 0; i < length; i++) { + unsigned long value; + + result = -EFAULT; + if (get_user(value, vec + i)) + goto out_kfree; + + str += snprintf(str, end - str, "%lu\t", value); + } + + set_fs(KERNEL_DS); + result = vfs_write(file, buffer, str - buffer, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + } + result = copied; +out_kfree: + kfree(buffer); +out: + return result; +} + +static ssize_t bin_ulongvec(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t copied = 0; + char *buffer; + ssize_t result; + + result = -ENOMEM; + buffer = kmalloc(BUFSZ, GFP_KERNEL); + if (!buffer) + goto out; + + if (oldval && oldlen) { + unsigned long __user *vec = oldval; + size_t length = oldlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + set_fs(KERNEL_DS); + result = vfs_read(file, buffer, BUFSZ - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + + str = buffer; + end = str + result; + *end++ = '\0'; + for (i = 0; i < length; i++) { + unsigned long value; + + value = simple_strtoul(str, &str, 10); + while (isspace(*str)) + str++; + + result = -EFAULT; + if (put_user(value, vec + i)) + goto out_kfree; + + copied += sizeof(*vec); + if (!isdigit(*str)) + break; + } + } + + if (newval && newlen) { + unsigned long __user *vec = newval; + size_t length = newlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + str = buffer; + end = str + BUFSZ; + for (i = 0; i < length; i++) { + unsigned long value; + + result = -EFAULT; + if (get_user(value, vec + i)) + goto out_kfree; + + str += snprintf(str, end - str, "%lu\t", value); + } + + set_fs(KERNEL_DS); + result = vfs_write(file, buffer, str - buffer, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + } + result = copied; +out_kfree: + kfree(buffer); +out: + return result; +} + +static unsigned hex_value(int ch) +{ + return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10; +} + +static ssize_t bin_uuid(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t result, copied = 0; + + /* Only supports reads */ + if (oldval && oldlen) { + loff_t pos = 0; + char buf[40], *str = buf; + unsigned char uuid[16]; + int i; + + set_fs(KERNEL_DS); + result = vfs_read(file, buf, sizeof(buf) - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out; + + buf[result] = '\0'; + + /* Convert the uuid to from a string to binary */ + for (i = 0; i < 16; i++) { + result = -EIO; + if (!isxdigit(str[0]) || !isxdigit(str[1])) + goto out; + + uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]); + str += 2; + if (*str == '-') + str++; + } + + if (oldlen > 16) + oldlen = 16; + + result = -EFAULT; + if (copy_to_user(oldval, uuid, oldlen)) + goto out; + + copied = oldlen; + } + result = copied; +out: + return result; +} + +static ssize_t bin_dn_node_address(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t result, copied = 0; + + if (oldval && oldlen) { + loff_t pos = 0; + char buf[15], *nodep; + unsigned long area, node; + __le16 dnaddr; + + set_fs(KERNEL_DS); + result = vfs_read(file, buf, sizeof(buf) - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out; + + buf[result] = '\0'; + + /* Convert the decnet addresss to binary */ + result = -EIO; + nodep = strchr(buf, '.') + 1; + if (!nodep) + goto out; + + area = simple_strtoul(buf, NULL, 10); + node = simple_strtoul(nodep, NULL, 10); + + result = -EIO; + if ((area > 63)||(node > 1023)) + goto out; + + dnaddr = cpu_to_le16((area << 10) | node); + + result = -EFAULT; + if (put_user(dnaddr, (__le16 __user *)oldval)) + goto out; + + copied = sizeof(dnaddr); + } + + if (newval && newlen) { + loff_t pos = 0; + __le16 dnaddr; + char buf[15]; + int len; + + result = -EINVAL; + if (newlen != sizeof(dnaddr)) + goto out; + + result = -EFAULT; + if (get_user(dnaddr, (__le16 __user *)newval)) + goto out; + + len = snprintf(buf, sizeof(buf), "%hu.%hu", + le16_to_cpu(dnaddr) >> 10, + le16_to_cpu(dnaddr) & 0x3ff); + + set_fs(KERNEL_DS); + result = vfs_write(file, buf, len, &pos); + set_fs(old_fs); + if (result < 0) + goto out; + } + + result = copied; +out: + return result; +} + +static const struct bin_table *get_sysctl(const int *name, int nlen, char *path) +{ + const struct bin_table *table = &bin_root_table[0]; + int ctl_name; + + /* The binary sysctl tables have a small maximum depth so + * there is no danger of overflowing our path as it PATH_MAX + * bytes long. + */ + memcpy(path, "sys/", 4); + path += 4; + +repeat: + if (!nlen) + return ERR_PTR(-ENOTDIR); + ctl_name = *name; + name++; + nlen--; + for ( ; table->convert; table++) { + int len = 0; + + /* + * For a wild card entry map from ifindex to network + * device name. + */ + if (!table->ctl_name) { +#ifdef CONFIG_NET + struct net *net = current->nsproxy->net_ns; + struct net_device *dev; + dev = dev_get_by_index(net, ctl_name); + if (dev) { + len = strlen(dev->name); + memcpy(path, dev->name, len); + dev_put(dev); + } +#endif + /* Use the well known sysctl number to proc name mapping */ + } else if (ctl_name == table->ctl_name) { + len = strlen(table->procname); + memcpy(path, table->procname, len); + } + if (len) { + path += len; + if (table->child) { + *path++ = '/'; + table = table->child; + goto repeat; + } + *path = '\0'; + return table; + } + } + return ERR_PTR(-ENOTDIR); +} + +static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep) +{ + char *tmp, *result; + + result = ERR_PTR(-ENOMEM); + tmp = __getname(); + if (tmp) { + const struct bin_table *table = get_sysctl(name, nlen, tmp); + result = tmp; + *tablep = table; + if (IS_ERR(table)) { + __putname(tmp); + result = ERR_CAST(table); + } + } + return result; +} + +static ssize_t binary_sysctl(const int *name, int nlen, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + const struct bin_table *table = NULL; + struct nameidata nd; + struct vfsmount *mnt; + struct file *file; + ssize_t result; + char *pathname; + int flags; + int acc_mode, fmode; + + pathname = sysctl_getname(name, nlen, &table); + result = PTR_ERR(pathname); + if (IS_ERR(pathname)) + goto out; + + /* How should the sysctl be accessed? */ + if (oldval && oldlen && newval && newlen) { + flags = O_RDWR; + acc_mode = MAY_READ | MAY_WRITE; + fmode = FMODE_READ | FMODE_WRITE; + } else if (newval && newlen) { + flags = O_WRONLY; + acc_mode = MAY_WRITE; + fmode = FMODE_WRITE; + } else if (oldval && oldlen) { + flags = O_RDONLY; + acc_mode = MAY_READ; + fmode = FMODE_READ; + } else { + result = 0; + goto out_putname; + } + + mnt = current->nsproxy->pid_ns->proc_mnt; + result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); + if (result) + goto out_putname; + + result = may_open(&nd.path, acc_mode, fmode); + if (result) + goto out_putpath; + + file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred()); + result = PTR_ERR(file); + if (IS_ERR(file)) + goto out_putname; + + result = table->convert(file, oldval, oldlen, newval, newlen); + + fput(file); +out_putname: + putname(pathname); +out: + return result; + +out_putpath: + path_put(&nd.path); + goto out_putname; +} + + +#else /* CONFIG_SYSCTL_SYSCALL */ + +static ssize_t binary_sysctl(const int *name, int nlen, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + return -ENOSYS; +} + +#endif /* CONFIG_SYSCTL_SYSCALL */ + + +static void deprecated_sysctl_warning(const int *name, int nlen) +{ + int i; + + if (printk_ratelimit()) { + printk(KERN_INFO + "warning: process `%s' used the deprecated sysctl " + "system call with ", current->comm); + for (i = 0; i < nlen; i++) + printk("%d.", name[i]); + printk("\n"); + } + return; +} + +static ssize_t do_sysctl(int __user *args_name, int nlen, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + int name[CTL_MAXNAME]; + int i; + + /* Check args->nlen. */ + if (nlen < 0 || nlen > CTL_MAXNAME) + return -ENOTDIR; + /* Read in the sysctl name for simplicity */ + for (i = 0; i < nlen; i++) + if (get_user(name[i], args_name + i)) + return -EFAULT; + + deprecated_sysctl_warning(name, nlen); + + return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); +} + +SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) +{ + struct __sysctl_args tmp; + size_t oldlen = 0; + ssize_t result; + + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + + if (tmp.oldval && !tmp.oldlenp) + return -EFAULT; + + if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp)) + return -EFAULT; + + result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen, + tmp.newval, tmp.newlen); + + if (result >= 0) { + oldlen = result; + result = 0; + } + + if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp)) + return -EFAULT; + + return result; +} + + +#ifdef CONFIG_COMPAT +#include <asm/compat.h> + +struct compat_sysctl_args { + compat_uptr_t name; + int nlen; + compat_uptr_t oldval; + compat_uptr_t oldlenp; + compat_uptr_t newval; + compat_size_t newlen; + compat_ulong_t __unused[4]; +}; + +asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args) +{ + struct compat_sysctl_args tmp; + compat_size_t __user *compat_oldlenp; + size_t oldlen = 0; + ssize_t result; + + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + + if (tmp.oldval && !tmp.oldlenp) + return -EFAULT; + + compat_oldlenp = compat_ptr(tmp.oldlenp); + if (compat_oldlenp && get_user(oldlen, compat_oldlenp)) + return -EFAULT; + + result = do_sysctl(compat_ptr(tmp.name), tmp.nlen, + compat_ptr(tmp.oldval), oldlen, + compat_ptr(tmp.newval), tmp.newlen); + + if (result >= 0) { + oldlen = result; + result = 0; + } + + if (compat_oldlenp && put_user(oldlen, compat_oldlenp)) + return -EFAULT; + + return result; +} + +#endif /* CONFIG_COMPAT */ diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index b38423ca711..04cdcf72c82 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -5,1239 +5,6 @@ #include <linux/string.h> #include <net/ip_vs.h> -struct trans_ctl_table { - int ctl_name; - const char *procname; - const struct trans_ctl_table *child; -}; - -static const struct trans_ctl_table trans_random_table[] = { - { RANDOM_POOLSIZE, "poolsize" }, - { RANDOM_ENTROPY_COUNT, "entropy_avail" }, - { RANDOM_READ_THRESH, "read_wakeup_threshold" }, - { RANDOM_WRITE_THRESH, "write_wakeup_threshold" }, - { RANDOM_BOOT_ID, "boot_id" }, - { RANDOM_UUID, "uuid" }, - {} -}; - -static const struct trans_ctl_table trans_pty_table[] = { - { PTY_MAX, "max" }, - { PTY_NR, "nr" }, - {} -}; - -static const struct trans_ctl_table trans_kern_table[] = { - { KERN_OSTYPE, "ostype" }, - { KERN_OSRELEASE, "osrelease" }, - /* KERN_OSREV not used */ - { KERN_VERSION, "version" }, - /* KERN_SECUREMASK not used */ - /* KERN_PROF not used */ - { KERN_NODENAME, "hostname" }, - { KERN_DOMAINNAME, "domainname" }, - - { KERN_PANIC, "panic" }, - { KERN_REALROOTDEV, "real-root-dev" }, - - { KERN_SPARC_REBOOT, "reboot-cmd" }, - { KERN_CTLALTDEL, "ctrl-alt-del" }, - { KERN_PRINTK, "printk" }, - - /* KERN_NAMETRANS not used */ - /* KERN_PPC_HTABRECLAIM not used */ - /* KERN_PPC_ZEROPAGED not used */ - { KERN_PPC_POWERSAVE_NAP, "powersave-nap" }, - - { KERN_MODPROBE, "modprobe" }, - { KERN_SG_BIG_BUFF, "sg-big-buff" }, - { KERN_ACCT, "acct" }, - { KERN_PPC_L2CR, "l2cr" }, - - /* KERN_RTSIGNR not used */ - /* KERN_RTSIGMAX not used */ - - { KERN_SHMMAX, "shmmax" }, - { KERN_MSGMAX, "msgmax" }, - { KERN_MSGMNB, "msgmnb" }, - /* KERN_MSGPOOL not used*/ - { KERN_SYSRQ, "sysrq" }, - { KERN_MAX_THREADS, "threads-max" }, - { KERN_RANDOM, "random", trans_random_table }, - { KERN_SHMALL, "shmall" }, - { KERN_MSGMNI, "msgmni" }, - { KERN_SEM, "sem" }, - { KERN_SPARC_STOP_A, "stop-a" }, - { KERN_SHMMNI, "shmmni" }, - - { KERN_OVERFLOWUID, "overflowuid" }, - { KERN_OVERFLOWGID, "overflowgid" }, - - { KERN_HOTPLUG, "hotplug", }, - { KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" }, - - { KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" }, - { KERN_CORE_USES_PID, "core_uses_pid" }, - { KERN_TAINTED, "tainted" }, - { KERN_CADPID, "cad_pid" }, - { KERN_PIDMAX, "pid_max" }, - { KERN_CORE_PATTERN, "core_pattern" }, - { KERN_PANIC_ON_OOPS, "panic_on_oops" }, - { KERN_HPPA_PWRSW, "soft-power" }, - { KERN_HPPA_UNALIGNED, "unaligned-trap" }, - - { KERN_PRINTK_RATELIMIT, "printk_ratelimit" }, - { KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" }, - - { KERN_PTY, "pty", trans_pty_table }, - { KERN_NGROUPS_MAX, "ngroups_max" }, - { KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, - { KERN_HZ_TIMER, "hz_timer" }, - { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, - { KERN_BOOTLOADER_TYPE, "bootloader_type" }, - { KERN_RANDOMIZE, "randomize_va_space" }, - - { KERN_SPIN_RETRY, "spin_retry" }, - { KERN_ACPI_VIDEO_FLAGS, "acpi_video_flags" }, - { KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, - { KERN_COMPAT_LOG, "compat-log" }, - { KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, - { KERN_NMI_WATCHDOG, "nmi_watchdog" }, - { KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, - {} -}; - -static const struct trans_ctl_table trans_vm_table[] = { - { VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, - { VM_PAGE_CLUSTER, "page-cluster" }, - { VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, - { VM_DIRTY_RATIO, "dirty_ratio" }, - { VM_DIRTY_WB_CS, "dirty_writeback_centisecs" }, - { VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs" }, - { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, - { VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, - /* VM_PAGEBUF unused */ - { VM_HUGETLB_PAGES, "nr_hugepages" }, - { VM_SWAPPINESS, "swappiness" }, - { VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" }, - { VM_MIN_FREE_KBYTES, "min_free_kbytes" }, - { VM_MAX_MAP_COUNT, "max_map_count" }, - { VM_LAPTOP_MODE, "laptop_mode" }, - { VM_BLOCK_DUMP, "block_dump" }, - { VM_HUGETLB_GROUP, "hugetlb_shm_group" }, - { VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" }, - { VM_LEGACY_VA_LAYOUT, "legacy_va_layout" }, - /* VM_SWAP_TOKEN_TIMEOUT unused */ - { VM_DROP_PAGECACHE, "drop_caches" }, - { VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" }, - { VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" }, - { VM_MIN_UNMAPPED, "min_unmapped_ratio" }, - { VM_PANIC_ON_OOM, "panic_on_oom" }, - { VM_VDSO_ENABLED, "vdso_enabled" }, - { VM_MIN_SLAB, "min_slab_ratio" }, - - {} -}; - -static const struct trans_ctl_table trans_net_core_table[] = { - { NET_CORE_WMEM_MAX, "wmem_max" }, - { NET_CORE_RMEM_MAX, "rmem_max" }, - { NET_CORE_WMEM_DEFAULT, "wmem_default" }, - { NET_CORE_RMEM_DEFAULT, "rmem_default" }, - /* NET_CORE_DESTROY_DELAY unused */ - { NET_CORE_MAX_BACKLOG, "netdev_max_backlog" }, - /* NET_CORE_FASTROUTE unused */ - { NET_CORE_MSG_COST, "message_cost" }, - { NET_CORE_MSG_BURST, "message_burst" }, - { NET_CORE_OPTMEM_MAX, "optmem_max" }, - /* NET_CORE_HOT_LIST_LENGTH unused */ - /* NET_CORE_DIVERT_VERSION unused */ - /* NET_CORE_NO_CONG_THRESH unused */ - /* NET_CORE_NO_CONG unused */ - /* NET_CORE_LO_CONG unused */ - /* NET_CORE_MOD_CONG unused */ - { NET_CORE_DEV_WEIGHT, "dev_weight" }, - { NET_CORE_SOMAXCONN, "somaxconn" }, - { NET_CORE_BUDGET, "netdev_budget" }, - { NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, - { NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, - { NET_CORE_WARNINGS, "warnings" }, - {}, -}; - -static const struct trans_ctl_table trans_net_unix_table[] = { - /* NET_UNIX_DESTROY_DELAY unused */ - /* NET_UNIX_DELETE_DELAY unused */ - { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_route_table[] = { - { NET_IPV4_ROUTE_FLUSH, "flush" }, - { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" }, - { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" }, - { NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" }, - { NET_IPV4_ROUTE_MAX_SIZE, "max_size" }, - { NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, - { NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, - { NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, - { NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, - { NET_IPV4_ROUTE_ERROR_COST, "error_cost" }, - { NET_IPV4_ROUTE_ERROR_BURST, "error_burst" }, - { NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, - { NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" }, - { NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { - { NET_IPV4_CONF_FORWARDING, "forwarding" }, - { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, - - { NET_IPV4_CONF_PROXY_ARP, "proxy_arp" }, - { NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" }, - { NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" }, - { NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" }, - { NET_IPV4_CONF_SHARED_MEDIA, "shared_media" }, - { NET_IPV4_CONF_RP_FILTER, "rp_filter" }, - { NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - { NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" }, - { NET_IPV4_CONF_LOG_MARTIANS, "log_martians" }, - { NET_IPV4_CONF_TAG, "tag" }, - { NET_IPV4_CONF_ARPFILTER, "arp_filter" }, - { NET_IPV4_CONF_MEDIUM_ID, "medium_id" }, - { NET_IPV4_CONF_NOXFRM, "disable_xfrm" }, - { NET_IPV4_CONF_NOPOLICY, "disable_policy" }, - { NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, - - { NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" }, - { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, - { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, - { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, - { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_conf_table[] = { - { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table }, - { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table }, - { 0, NULL, trans_net_ipv4_conf_vars_table }, - {} -}; - -static const struct trans_ctl_table trans_net_neigh_vars_table[] = { - { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, - { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, - { NET_NEIGH_APP_SOLICIT, "app_solicit" }, - { NET_NEIGH_RETRANS_TIME, "retrans_time" }, - { NET_NEIGH_REACHABLE_TIME, "base_reachable_time" }, - { NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" }, - { NET_NEIGH_GC_STALE_TIME, "gc_stale_time" }, - { NET_NEIGH_UNRES_QLEN, "unres_qlen" }, - { NET_NEIGH_PROXY_QLEN, "proxy_qlen" }, - { NET_NEIGH_ANYCAST_DELAY, "anycast_delay" }, - { NET_NEIGH_PROXY_DELAY, "proxy_delay" }, - { NET_NEIGH_LOCKTIME, "locktime" }, - { NET_NEIGH_GC_INTERVAL, "gc_interval" }, - { NET_NEIGH_GC_THRESH1, "gc_thresh1" }, - { NET_NEIGH_GC_THRESH2, "gc_thresh2" }, - { NET_NEIGH_GC_THRESH3, "gc_thresh3" }, - { NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, - { NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, - {} -}; - -static const struct trans_ctl_table trans_net_neigh_table[] = { - { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table }, - { 0, NULL, trans_net_neigh_vars_table }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = { - { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, - - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "ip_conntrack_tcp_timeout_syn_recv" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "ip_conntrack_tcp_timeout_established" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "ip_conntrack_tcp_timeout_fin_wait" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "ip_conntrack_tcp_timeout_close_wait" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "ip_conntrack_tcp_timeout_last_ack" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "ip_conntrack_tcp_timeout_time_wait" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "ip_conntrack_tcp_timeout_close" }, - - { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, "ip_conntrack_udp_timeout" }, - { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "ip_conntrack_udp_timeout_stream" }, - { NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, "ip_conntrack_icmp_timeout" }, - { NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, "ip_conntrack_generic_timeout" }, - - { NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" }, - { NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "ip_conntrack_tcp_timeout_max_retrans" }, - { NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" }, - { NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" }, - { NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" }, - - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "ip_conntrack_sctp_timeout_closed" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "ip_conntrack_sctp_timeout_cookie_wait" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "ip_conntrack_sctp_timeout_cookie_echoed" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "ip_conntrack_sctp_timeout_established" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "ip_conntrack_sctp_timeout_shutdown_sent" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "ip_conntrack_sctp_timeout_shutdown_recd" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "ip_conntrack_sctp_timeout_shutdown_ack_sent" }, - - { NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" }, - { NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_table[] = { - { NET_IPV4_FORWARD, "ip_forward" }, - { NET_IPV4_DYNADDR, "ip_dynaddr" }, - - { NET_IPV4_CONF, "conf", trans_net_ipv4_conf_table }, - { NET_IPV4_NEIGH, "neigh", trans_net_neigh_table }, - { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table }, - /* NET_IPV4_FIB_HASH unused */ - { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table }, - - { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, - { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, - { NET_IPV4_TCP_SACK, "tcp_sack" }, - { NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" }, - { NET_IPV4_DEFAULT_TTL, "ip_default_ttl" }, - /* NET_IPV4_AUTOCONFIG unused */ - { NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" }, - { NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" }, - { NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" }, - { NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" }, - { NET_IPV4_IPFRAG_TIME, "ipfrag_time" }, - /* NET_IPV4_TCP_MAX_KA_PROBES unused */ - { NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" }, - { NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" }, - { NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, - { NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, - { NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, - /* NET_IPV4_IP_MASQ_DEBUG unused */ - { NET_TCP_SYNCOOKIES, "tcp_syncookies" }, - { NET_TCP_STDURG, "tcp_stdurg" }, - { NET_TCP_RFC1337, "tcp_rfc1337" }, - /* NET_TCP_SYN_TAILDROP unused */ - { NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" }, - { NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" }, - { NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" }, - { NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" }, - /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */ - /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */ - /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */ - /* NET_IPV4_ICMP_PARAMPROB_RATE unused */ - /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */ - { NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" }, - { NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" }, - { NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, - /* NET_IPV4_ALWAYS_DEFRAG unused */ - { NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, - { NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" }, - { NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" }, - { NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" }, - { NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" }, - { NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" }, - { NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" }, - { NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, - { NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" }, - { NET_TCP_MAX_ORPHANS, "tcp_max_orphans" }, - { NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" }, - { NET_TCP_FACK, "tcp_fack" }, - { NET_TCP_REORDERING, "tcp_reordering" }, - { NET_TCP_ECN, "tcp_ecn" }, - { NET_TCP_DSACK, "tcp_dsack" }, - { NET_TCP_MEM, "tcp_mem" }, - { NET_TCP_WMEM, "tcp_wmem" }, - { NET_TCP_RMEM, "tcp_rmem" }, - { NET_TCP_APP_WIN, "tcp_app_win" }, - { NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" }, - { NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" }, - { NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" }, - { NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" }, - { NET_TCP_TW_REUSE, "tcp_tw_reuse" }, - { NET_TCP_FRTO, "tcp_frto" }, - { NET_TCP_LOW_LATENCY, "tcp_low_latency" }, - { NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, - { NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" }, - { NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" }, - /* NET_TCP_DEFAULT_WIN_SCALE unused */ - { NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, - { NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, - /* NET_TCP_BIC_BETA unused */ - { NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" }, - { NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, - { NET_TCP_ABC, "tcp_abc" }, - { NET_IPV4_IPFRAG_MAX_DIST, "ipfrag_max_dist" }, - { NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, - { NET_TCP_BASE_MSS, "tcp_base_mss" }, - { NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, - { NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, - { NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, - { NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, - { NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" }, - { NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" }, - { NET_TCP_AVAIL_CONG_CONTROL, "tcp_available_congestion_control" }, - { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, - { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, - { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, - { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipx_table[] = { - { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, - /* NET_IPX_FORWARDING unused */ - {} -}; - -static const struct trans_ctl_table trans_net_atalk_table[] = { - { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, - { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, - { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, - { NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" }, - {}, -}; - -static const struct trans_ctl_table trans_net_netrom_table[] = { - { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, - { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, - { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, - { NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" }, - { NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" }, - { NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" }, - { NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" }, - { NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" }, - { NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" }, - { NET_NETROM_ROUTING_CONTROL, "routing_control" }, - { NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" }, - { NET_NETROM_RESET, "reset" }, - {} -}; - -static const struct trans_ctl_table trans_net_ax25_param_table[] = { - { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, - { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, - { NET_AX25_BACKOFF_TYPE, "backoff_type" }, - { NET_AX25_CONNECT_MODE, "connect_mode" }, - { NET_AX25_STANDARD_WINDOW, "standard_window_size" }, - { NET_AX25_EXTENDED_WINDOW, "extended_window_size" }, - { NET_AX25_T1_TIMEOUT, "t1_timeout" }, - { NET_AX25_T2_TIMEOUT, "t2_timeout" }, - { NET_AX25_T3_TIMEOUT, "t3_timeout" }, - { NET_AX25_IDLE_TIMEOUT, "idle_timeout" }, - { NET_AX25_N2, "maximum_retry_count" }, - { NET_AX25_PACLEN, "maximum_packet_length" }, - { NET_AX25_PROTOCOL, "protocol" }, - { NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" }, - {} -}; - -static const struct trans_ctl_table trans_net_ax25_table[] = { - { 0, NULL, trans_net_ax25_param_table }, - {} -}; - -static const struct trans_ctl_table trans_net_bridge_table[] = { - { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, - { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, - { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" }, - { NET_BRIDGE_NF_FILTER_VLAN_TAGGED, "bridge-nf-filter-vlan-tagged" }, - { NET_BRIDGE_NF_FILTER_PPPOE_TAGGED, "bridge-nf-filter-pppoe-tagged" }, - {} -}; - -static const struct trans_ctl_table trans_net_rose_table[] = { - { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" }, - { NET_ROSE_ROUTING_CONTROL, "routing_control" }, - { NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" }, - { NET_ROSE_MAX_VCS, "maximum_virtual_circuits" }, - { NET_ROSE_WINDOW_SIZE, "window_size" }, - { NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = { - { NET_IPV6_FORWARDING, "forwarding" }, - { NET_IPV6_HOP_LIMIT, "hop_limit" }, - { NET_IPV6_MTU, "mtu" }, - { NET_IPV6_ACCEPT_RA, "accept_ra" }, - { NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" }, - { NET_IPV6_AUTOCONF, "autoconf" }, - { NET_IPV6_DAD_TRANSMITS, "dad_transmits" }, - { NET_IPV6_RTR_SOLICITS, "router_solicitations" }, - { NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" }, - { NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" }, - { NET_IPV6_USE_TEMPADDR, "use_tempaddr" }, - { NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" }, - { NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" }, - { NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" }, - { NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" }, - { NET_IPV6_MAX_ADDRESSES, "max_addresses" }, - { NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" }, - { NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" }, - { NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" }, - { NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" }, - { NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" }, - { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, - { NET_IPV6_PROXY_NDP, "proxy_ndp" }, - { NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_conf_table[] = { - { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table }, - { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table }, - { 0, NULL, trans_net_ipv6_conf_var_table }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_route_table[] = { - { NET_IPV6_ROUTE_FLUSH, "flush" }, - { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, - { NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, - { NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" }, - { NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = { - { NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_table[] = { - { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table }, - { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table }, - { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table }, - { NET_IPV6_ICMP, "icmp", trans_net_ipv6_icmp_table }, - { NET_IPV6_BINDV6ONLY, "bindv6only" }, - { NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" }, - { NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" }, - { NET_IPV6_IP6FRAG_TIME, "ip6frag_time" }, - { NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" }, - { NET_IPV6_MLD_MAX_MSF, "mld_max_msf" }, - { 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" }, - {} -}; - -static const struct trans_ctl_table trans_net_x25_table[] = { - { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" }, - { NET_X25_FORWARD, "x25_forward" }, - {} -}; - -static const struct trans_ctl_table trans_net_tr_table[] = { - { NET_TR_RIF_TIMEOUT, "rif_timeout" }, - {} -}; - - -static const struct trans_ctl_table trans_net_decnet_conf_vars[] = { - { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, - { NET_DECNET_CONF_DEV_PRIORITY, "priority" }, - { NET_DECNET_CONF_DEV_T2, "t2" }, - { NET_DECNET_CONF_DEV_T3, "t3" }, - {} -}; - -static const struct trans_ctl_table trans_net_decnet_conf[] = { - { 0, NULL, trans_net_decnet_conf_vars }, - {} -}; - -static const struct trans_ctl_table trans_net_decnet_table[] = { - { NET_DECNET_CONF, "conf", trans_net_decnet_conf }, - { NET_DECNET_NODE_ADDRESS, "node_address" }, - { NET_DECNET_NODE_NAME, "node_name" }, - { NET_DECNET_DEFAULT_DEVICE, "default_device" }, - { NET_DECNET_TIME_WAIT, "time_wait" }, - { NET_DECNET_DN_COUNT, "dn_count" }, - { NET_DECNET_DI_COUNT, "di_count" }, - { NET_DECNET_DR_COUNT, "dr_count" }, - { NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" }, - { NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" }, - { NET_DECNET_MEM, "decnet_mem" }, - { NET_DECNET_RMEM, "decnet_rmem" }, - { NET_DECNET_WMEM, "decnet_wmem" }, - { NET_DECNET_DEBUG_LEVEL, "debug" }, - {} -}; - -static const struct trans_ctl_table trans_net_sctp_table[] = { - { NET_SCTP_RTO_INITIAL, "rto_initial" }, - { NET_SCTP_RTO_MIN, "rto_min" }, - { NET_SCTP_RTO_MAX, "rto_max" }, - { NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" }, - { NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" }, - { NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" }, - { NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" }, - { NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" }, - { NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" }, - { NET_SCTP_HB_INTERVAL, "hb_interval" }, - { NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" }, - { NET_SCTP_MAX_BURST, "max_burst" }, - { NET_SCTP_ADDIP_ENABLE, "addip_enable" }, - { NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" }, - { NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" }, - { NET_SCTP_SACK_TIMEOUT, "sack_timeout" }, - { NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = { - { NET_LLC2_ACK_TIMEOUT, "ack" }, - { NET_LLC2_P_TIMEOUT, "p" }, - { NET_LLC2_REJ_TIMEOUT, "rej" }, - { NET_LLC2_BUSY_TIMEOUT, "busy" }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_station_table[] = { - { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_llc2_table[] = { - { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_table[] = { - { NET_LLC2, "llc2", trans_net_llc_llc2_table }, - { NET_LLC_STATION, "station", trans_net_llc_station_table }, - {} -}; - -static const struct trans_ctl_table trans_net_netfilter_table[] = { - { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "nf_conntrack_tcp_timeout_established" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "nf_conntrack_tcp_timeout_fin_wait" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "nf_conntrack_tcp_timeout_close_wait" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "nf_conntrack_tcp_timeout_last_ack" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "nf_conntrack_tcp_timeout_time_wait" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "nf_conntrack_tcp_timeout_close" }, - { NET_NF_CONNTRACK_UDP_TIMEOUT, "nf_conntrack_udp_timeout" }, - { NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "nf_conntrack_udp_timeout_stream" }, - { NET_NF_CONNTRACK_ICMP_TIMEOUT, "nf_conntrack_icmp_timeout" }, - { NET_NF_CONNTRACK_GENERIC_TIMEOUT, "nf_conntrack_generic_timeout" }, - { NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" }, - { NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "nf_conntrack_tcp_timeout_max_retrans" }, - { NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" }, - { NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" }, - { NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "nf_conntrack_sctp_timeout_closed" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "nf_conntrack_sctp_timeout_cookie_wait" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "nf_conntrack_sctp_timeout_cookie_echoed" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "nf_conntrack_sctp_timeout_established" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "nf_conntrack_sctp_timeout_shutdown_sent" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "nf_conntrack_sctp_timeout_shutdown_recd" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "nf_conntrack_sctp_timeout_shutdown_ack_sent" }, - { NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" }, - { NET_NF_CONNTRACK_ICMPV6_TIMEOUT, "nf_conntrack_icmpv6_timeout" }, - { NET_NF_CONNTRACK_FRAG6_TIMEOUT, "nf_conntrack_frag6_timeout" }, - { NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" }, - { NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" }, - { NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" }, - - {} -}; - -static const struct trans_ctl_table trans_net_dccp_table[] = { - { NET_DCCP_DEFAULT, "default" }, - {} -}; - -static const struct trans_ctl_table trans_net_irda_table[] = { - { NET_IRDA_DISCOVERY, "discovery" }, - { NET_IRDA_DEVNAME, "devname" }, - { NET_IRDA_DEBUG, "debug" }, - { NET_IRDA_FAST_POLL, "fast_poll_increase" }, - { NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" }, - { NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" }, - { NET_IRDA_SLOT_TIMEOUT, "slot_timeout" }, - { NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" }, - { NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" }, - { NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" }, - { NET_IRDA_MAX_TX_WINDOW, "max_tx_window" }, - { NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" }, - { NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" }, - { NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" }, - {} -}; - -static const struct trans_ctl_table trans_net_table[] = { - { NET_CORE, "core", trans_net_core_table }, - /* NET_ETHER not used */ - /* NET_802 not used */ - { NET_UNIX, "unix", trans_net_unix_table }, - { NET_IPV4, "ipv4", trans_net_ipv4_table }, - { NET_IPX, "ipx", trans_net_ipx_table }, - { NET_ATALK, "appletalk", trans_net_atalk_table }, - { NET_NETROM, "netrom", trans_net_netrom_table }, - { NET_AX25, "ax25", trans_net_ax25_table }, - { NET_BRIDGE, "bridge", trans_net_bridge_table }, - { NET_ROSE, "rose", trans_net_rose_table }, - { NET_IPV6, "ipv6", trans_net_ipv6_table }, - { NET_X25, "x25", trans_net_x25_table }, - { NET_TR, "token-ring", trans_net_tr_table }, - { NET_DECNET, "decnet", trans_net_decnet_table }, - /* NET_ECONET not used */ - { NET_SCTP, "sctp", trans_net_sctp_table }, - { NET_LLC, "llc", trans_net_llc_table }, - { NET_NETFILTER, "netfilter", trans_net_netfilter_table }, - { NET_DCCP, "dccp", trans_net_dccp_table }, - { NET_IRDA, "irda", trans_net_irda_table }, - { 2089, "nf_conntrack_max" }, - {} -}; - -static const struct trans_ctl_table trans_fs_quota_table[] = { - { FS_DQ_LOOKUPS, "lookups" }, - { FS_DQ_DROPS, "drops" }, - { FS_DQ_READS, "reads" }, - { FS_DQ_WRITES, "writes" }, - { FS_DQ_CACHE_HITS, "cache_hits" }, - { FS_DQ_ALLOCATED, "allocated_dquots" }, - { FS_DQ_FREE, "free_dquots" }, - { FS_DQ_SYNCS, "syncs" }, - { FS_DQ_WARNINGS, "warnings" }, - {} -}; - -static const struct trans_ctl_table trans_fs_xfs_table[] = { - { XFS_SGID_INHERIT, "irix_sgid_inherit" }, - { XFS_SYMLINK_MODE, "irix_symlink_mode" }, - { XFS_PANIC_MASK, "panic_mask" }, - - { XFS_ERRLEVEL, "error_level" }, - { XFS_SYNCD_TIMER, "xfssyncd_centisecs" }, - { XFS_INHERIT_SYNC, "inherit_sync" }, - { XFS_INHERIT_NODUMP, "inherit_nodump" }, - { XFS_INHERIT_NOATIME, "inherit_noatime" }, - { XFS_BUF_TIMER, "xfsbufd_centisecs" }, - { XFS_BUF_AGE, "age_buffer_centisecs" }, - { XFS_INHERIT_NOSYM, "inherit_nosymlinks" }, - { XFS_ROTORSTEP, "rotorstep" }, - { XFS_INHERIT_NODFRG, "inherit_nodefrag" }, - { XFS_FILESTREAM_TIMER, "filestream_centisecs" }, - { XFS_STATS_CLEAR, "stats_clear" }, - {} -}; - -static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = { - { 1, "hb_ctl_path" }, - {} -}; - -static const struct trans_ctl_table trans_fs_ocfs2_table[] = { - { 1, "nm", trans_fs_ocfs2_nm_table }, - {} -}; - -static const struct trans_ctl_table trans_inotify_table[] = { - { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, - { INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, - { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, - {} -}; - -static const struct trans_ctl_table trans_fs_table[] = { - { FS_NRINODE, "inode-nr" }, - { FS_STATINODE, "inode-state" }, - /* FS_MAXINODE unused */ - /* FS_NRDQUOT unused */ - /* FS_MAXDQUOT unused */ - { FS_NRFILE, "file-nr" }, - { FS_MAXFILE, "file-max" }, - { FS_DENTRY, "dentry-state" }, - /* FS_NRSUPER unused */ - /* FS_MAXUPSER unused */ - { FS_OVERFLOWUID, "overflowuid" }, - { FS_OVERFLOWGID, "overflowgid" }, - { FS_LEASES, "leases-enable" }, - { FS_DIR_NOTIFY, "dir-notify-enable" }, - { FS_LEASE_TIME, "lease-break-time" }, - { FS_DQSTATS, "quota", trans_fs_quota_table }, - { FS_XFS, "xfs", trans_fs_xfs_table }, - { FS_AIO_NR, "aio-nr" }, - { FS_AIO_MAX_NR, "aio-max-nr" }, - { FS_INOTIFY, "inotify", trans_inotify_table }, - { FS_OCFS2, "ocfs2", trans_fs_ocfs2_table }, - { KERN_SETUID_DUMPABLE, "suid_dumpable" }, - {} -}; - -static const struct trans_ctl_table trans_debug_table[] = { - {} -}; - -static const struct trans_ctl_table trans_cdrom_table[] = { - { DEV_CDROM_INFO, "info" }, - { DEV_CDROM_AUTOCLOSE, "autoclose" }, - { DEV_CDROM_AUTOEJECT, "autoeject" }, - { DEV_CDROM_DEBUG, "debug" }, - { DEV_CDROM_LOCK, "lock" }, - { DEV_CDROM_CHECK_MEDIA, "check_media" }, - {} -}; - -static const struct trans_ctl_table trans_ipmi_table[] = { - { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, - {} -}; - -static const struct trans_ctl_table trans_mac_hid_files[] = { - /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ - /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ - { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, - { DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" }, - { DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" }, - /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */ - {} -}; - -static const struct trans_ctl_table trans_raid_table[] = { - { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, - { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, - {} -}; - -static const struct trans_ctl_table trans_scsi_table[] = { - { DEV_SCSI_LOGGING_LEVEL, "logging_level" }, - {} -}; - -static const struct trans_ctl_table trans_parport_default_table[] = { - { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" }, - { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" }, - {} -}; - -static const struct trans_ctl_table trans_parport_device_table[] = { - { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" }, - {} -}; - -static const struct trans_ctl_table trans_parport_devices_table[] = { - { DEV_PARPORT_DEVICES_ACTIVE, "active" }, - { 0, NULL, trans_parport_device_table }, - {} -}; - -static const struct trans_ctl_table trans_parport_parport_table[] = { - { DEV_PARPORT_SPINTIME, "spintime" }, - { DEV_PARPORT_BASE_ADDR, "base-addr" }, - { DEV_PARPORT_IRQ, "irq" }, - { DEV_PARPORT_DMA, "dma" }, - { DEV_PARPORT_MODES, "modes" }, - { DEV_PARPORT_DEVICES, "devices", trans_parport_devices_table }, - { DEV_PARPORT_AUTOPROBE, "autoprobe" }, - { DEV_PARPORT_AUTOPROBE + 1, "autoprobe0" }, - { DEV_PARPORT_AUTOPROBE + 2, "autoprobe1" }, - { DEV_PARPORT_AUTOPROBE + 3, "autoprobe2" }, - { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" }, - {} -}; -static const struct trans_ctl_table trans_parport_table[] = { - { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table }, - { 0, NULL, trans_parport_parport_table }, - {} -}; - -static const struct trans_ctl_table trans_dev_table[] = { - { DEV_CDROM, "cdrom", trans_cdrom_table }, - /* DEV_HWMON unused */ - { DEV_PARPORT, "parport", trans_parport_table }, - { DEV_RAID, "raid", trans_raid_table }, - { DEV_MAC_HID, "mac_hid", trans_mac_hid_files }, - { DEV_SCSI, "scsi", trans_scsi_table }, - { DEV_IPMI, "ipmi", trans_ipmi_table }, - {} -}; - -static const struct trans_ctl_table trans_bus_isa_table[] = { - { BUS_ISA_MEM_BASE, "membase" }, - { BUS_ISA_PORT_BASE, "portbase" }, - { BUS_ISA_PORT_SHIFT, "portshift" }, - {} -}; - -static const struct trans_ctl_table trans_bus_table[] = { - { CTL_BUS_ISA, "isa", trans_bus_isa_table }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table0[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan0-txRing" }, - { 151, "arlan0-rxRing" }, - { 152, "arlan0-18" }, - { 153, "arlan0-ring" }, - { 154, "arlan0-shm-cpy" }, - { 155, "config0" }, - { 156, "reset0" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table1[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan1-txRing" }, - { 151, "arlan1-rxRing" }, - { 152, "arlan1-18" }, - { 153, "arlan1-ring" }, - { 154, "arlan1-shm-cpy" }, - { 155, "config1" }, - { 156, "reset1" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table2[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan2-txRing" }, - { 151, "arlan2-rxRing" }, - { 152, "arlan2-18" }, - { 153, "arlan2-ring" }, - { 154, "arlan2-shm-cpy" }, - { 155, "config2" }, - { 156, "reset2" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table3[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan3-txRing" }, - { 151, "arlan3-rxRing" }, - { 152, "arlan3-18" }, - { 153, "arlan3-ring" }, - { 154, "arlan3-shm-cpy" }, - { 155, "config3" }, - { 156, "reset3" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_table[] = { - { 1, "arlan0", trans_arlan_conf_table0 }, - { 2, "arlan1", trans_arlan_conf_table1 }, - { 3, "arlan2", trans_arlan_conf_table2 }, - { 4, "arlan3", trans_arlan_conf_table3 }, - {} -}; - -static const struct trans_ctl_table trans_s390dbf_table[] = { - { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, - { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, - {} -}; - -static const struct trans_ctl_table trans_sunrpc_table[] = { - { CTL_RPCDEBUG, "rpc_debug" }, - { CTL_NFSDEBUG, "nfs_debug" }, - { CTL_NFSDDEBUG, "nfsd_debug" }, - { CTL_NLMDEBUG, "nlm_debug" }, - { CTL_SLOTTABLE_UDP, "udp_slot_table_entries" }, - { CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" }, - { CTL_MIN_RESVPORT, "min_resvport" }, - { CTL_MAX_RESVPORT, "max_resvport" }, - {} -}; - -static const struct trans_ctl_table trans_pm_table[] = { - { 1 /* CTL_PM_SUSPEND */, "suspend" }, - { 2 /* CTL_PM_CMODE */, "cmode" }, - { 3 /* CTL_PM_P0 */, "p0" }, - { 4 /* CTL_PM_CM */, "cm" }, - {} -}; - -static const struct trans_ctl_table trans_frv_table[] = { - { 1, "cache-mode" }, - { 2, "pin-cxnr" }, - {} -}; - -static const struct trans_ctl_table trans_root_table[] = { - { CTL_KERN, "kernel", trans_kern_table }, - { CTL_VM, "vm", trans_vm_table }, - { CTL_NET, "net", trans_net_table }, - /* CTL_PROC not used */ - { CTL_FS, "fs", trans_fs_table }, - { CTL_DEBUG, "debug", trans_debug_table }, - { CTL_DEV, "dev", trans_dev_table }, - { CTL_BUS, "bus", trans_bus_table }, - { CTL_ABI, "abi" }, - /* CTL_CPU not used */ - { CTL_ARLAN, "arlan", trans_arlan_table }, - { CTL_S390DBF, "s390dbf", trans_s390dbf_table }, - { CTL_SUNRPC, "sunrpc", trans_sunrpc_table }, - { CTL_PM, "pm", trans_pm_table }, - { CTL_FRV, "frv", trans_frv_table }, - {} -}; - - - static int sysctl_depth(struct ctl_table *table) { @@ -1261,47 +28,6 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) return table; } -static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table) -{ - struct ctl_table *test; - const struct trans_ctl_table *ref; - int cur_depth; - - cur_depth = sysctl_depth(table); - - ref = trans_root_table; -repeat: - test = sysctl_parent(table, cur_depth); - for (; ref->ctl_name || ref->procname || ref->child; ref++) { - int match = 0; - - if (cur_depth && !ref->child) - continue; - - if (test->procname && ref->procname && - (strcmp(test->procname, ref->procname) == 0)) - match++; - - if (test->ctl_name && ref->ctl_name && - (test->ctl_name == ref->ctl_name)) - match++; - - if (!ref->ctl_name && !ref->procname) - match++; - - if (match) { - if (cur_depth != 0) { - cur_depth--; - ref = ref->child; - goto repeat; - } - goto out; - } - } - ref = NULL; -out: - return ref; -} static void sysctl_print_path(struct ctl_table *table) { @@ -1315,26 +41,6 @@ static void sysctl_print_path(struct ctl_table *table) } } printk(" "); - if (table->ctl_name) { - for (i = depth; i >= 0; i--) { - tmp = sysctl_parent(table, i); - printk(".%d", tmp->ctl_name); - } - } -} - -static void sysctl_repair_table(struct ctl_table *table) -{ - /* Don't complain about the classic default - * sysctl strategy routine. Maybe later we - * can get the tables fixed and complain about - * this. - */ - if (table->ctl_name && table->procname && - (table->proc_handler == proc_dointvec) && - (!table->strategy)) { - table->strategy = sysctl_data; - } } static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, @@ -1352,7 +58,7 @@ static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, ref = head->ctl_table; repeat: test = sysctl_parent(table, cur_depth); - for (; ref->ctl_name || ref->procname; ref++) { + for (; ref->procname; ref++) { int match = 0; if (cur_depth && !ref->child) continue; @@ -1361,10 +67,6 @@ repeat: (strcmp(test->procname, ref->procname) == 0)) match++; - if (test->ctl_name && ref->ctl_name && - (test->ctl_name == ref->ctl_name)) - match++; - if (match) { if (cur_depth != 0) { cur_depth--; @@ -1392,38 +94,6 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str *fail = str; } -static int sysctl_check_dir(struct nsproxy *namespaces, - struct ctl_table *table) -{ - struct ctl_table *ref; - int error; - - error = 0; - ref = sysctl_check_lookup(namespaces, table); - if (ref) { - int match = 0; - if ((!table->procname && !ref->procname) || - (table->procname && ref->procname && - (strcmp(table->procname, ref->procname) == 0))) - match++; - - if ((!table->ctl_name && !ref->ctl_name) || - (table->ctl_name && ref->ctl_name && - (table->ctl_name == ref->ctl_name))) - match++; - - if (match != 2) { - printk(KERN_ERR "%s: failed: ", __func__); - sysctl_print_path(table); - printk(" ref: "); - sysctl_print_path(ref); - printk("\n"); - error = -EINVAL; - } - } - return error; -} - static void sysctl_check_leaf(struct nsproxy *namespaces, struct ctl_table *table, const char **fail) { @@ -1434,37 +104,15 @@ static void sysctl_check_leaf(struct nsproxy *namespaces, set_fail(fail, table, "Sysctl already exists"); } -static void sysctl_check_bin_path(struct ctl_table *table, const char **fail) -{ - const struct trans_ctl_table *ref; - - ref = sysctl_binary_lookup(table); - if (table->ctl_name && !ref) - set_fail(fail, table, "Unknown sysctl binary path"); - if (ref) { - if (ref->procname && - (!table->procname || - (strcmp(table->procname, ref->procname) != 0))) - set_fail(fail, table, "procname does not match binary path procname"); - - if (ref->ctl_name && table->ctl_name && - (table->ctl_name != ref->ctl_name)) - set_fail(fail, table, "ctl_name does not match binary path ctl_name"); - } -} - int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) { int error = 0; - for (; table->ctl_name || table->procname; table++) { + for (; table->procname; table++) { const char *fail = NULL; - sysctl_repair_table(table); if (table->parent) { if (table->procname && !table->parent->procname) set_fail(&fail, table, "Parent without procname"); - if (table->ctl_name && !table->parent->ctl_name) - set_fail(&fail, table, "Parent without ctl_name"); } if (!table->procname) set_fail(&fail, table, "No procname"); @@ -1477,21 +125,12 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) set_fail(&fail, table, "Writable sysctl directory"); if (table->proc_handler) set_fail(&fail, table, "Directory with proc_handler"); - if (table->strategy) - set_fail(&fail, table, "Directory with strategy"); if (table->extra1) set_fail(&fail, table, "Directory with extra1"); if (table->extra2) set_fail(&fail, table, "Directory with extra2"); - if (sysctl_check_dir(namespaces, table)) - set_fail(&fail, table, "Inconsistent directory names"); } else { - if ((table->strategy == sysctl_data) || - (table->strategy == sysctl_string) || - (table->strategy == sysctl_intvec) || - (table->strategy == sysctl_jiffies) || - (table->strategy == sysctl_ms_jiffies) || - (table->proc_handler == proc_dostring) || + if ((table->proc_handler == proc_dostring) || (table->proc_handler == proc_dointvec) || (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || @@ -1513,15 +152,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) set_fail(&fail, table, "No max"); } } -#ifdef CONFIG_SYSCTL_SYSCALL - if (table->ctl_name && !table->strategy) - set_fail(&fail, table, "Missing strategy"); -#endif -#if 0 - if (!table->ctl_name && table->strategy) - set_fail(&fail, table, "Strategy without ctl_name"); -#endif -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL if (table->procname && !table->proc_handler) set_fail(&fail, table, "No proc_handler"); #endif @@ -1531,7 +162,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) #endif sysctl_check_leaf(namespaces, table, &fail); } - sysctl_check_bin_path(table, &fail); if (table->mode > 0777) set_fail(&fail, table, "bogus .mode"); if (fail) { diff --git a/kernel/time.c b/kernel/time.c index 2e2e469a7fe..c6324d96009 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -136,7 +136,6 @@ static inline void warp_clock(void) write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; - update_xtime_cache(0); write_sequnlock_irq(&xtime_lock); clock_was_set(); } @@ -662,6 +661,36 @@ u64 nsec_to_clock_t(u64 x) #endif } +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) +{ +#if (NSEC_PER_SEC % HZ) == 0 + /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ + return div_u64(n, NSEC_PER_SEC / HZ); +#elif (HZ % 512) == 0 + /* overflow after 292 years if HZ = 1024 */ + return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * Generic case - optimized for cases where HZ is a multiple of 3. + * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. + */ + return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); +#endif +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 0b0a6366c9d..ee266620b06 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 620b58abdc3..3d5fc0fd1cc 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -20,6 +20,8 @@ #include <linux/sysdev.h> #include <linux/tick.h> +#include "tick-internal.h" + /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); @@ -28,7 +30,7 @@ static LIST_HEAD(clockevents_released); static RAW_NOTIFIER_HEAD(clockevents_chain); /* Protection for the above */ -static DEFINE_SPINLOCK(clockevents_lock); +static DEFINE_RAW_SPINLOCK(clockevents_lock); /** * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds @@ -37,10 +39,9 @@ static DEFINE_SPINLOCK(clockevents_lock); * * Math helper, returns latch value converted to nanoseconds (bound checked) */ -unsigned long clockevent_delta2ns(unsigned long latch, - struct clock_event_device *evt) +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) { - u64 clc = ((u64) latch << evt->shift); + u64 clc = (u64) latch << evt->shift; if (unlikely(!evt->mult)) { evt->mult = 1; @@ -50,10 +51,10 @@ unsigned long clockevent_delta2ns(unsigned long latch, do_div(clc, evt->mult); if (clc < 1000) clc = 1000; - if (clc > LONG_MAX) - clc = LONG_MAX; + if (clc > KTIME_MAX) + clc = KTIME_MAX; - return (unsigned long) clc; + return clc; } EXPORT_SYMBOL_GPL(clockevent_delta2ns); @@ -140,9 +141,9 @@ int clockevents_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - spin_lock_irqsave(&clockevents_lock, flags); + raw_spin_lock_irqsave(&clockevents_lock, flags); ret = raw_notifier_chain_register(&clockevents_chain, nb); - spin_unlock_irqrestore(&clockevents_lock, flags); + raw_spin_unlock_irqrestore(&clockevents_lock, flags); return ret; } @@ -184,13 +185,13 @@ void clockevents_register_device(struct clock_event_device *dev) BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); BUG_ON(!dev->cpumask); - spin_lock_irqsave(&clockevents_lock, flags); + raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); clockevents_notify_released(); - spin_unlock_irqrestore(&clockevents_lock, flags); + raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_register_device); @@ -240,7 +241,7 @@ void clockevents_notify(unsigned long reason, void *arg) struct list_head *node, *tmp; unsigned long flags; - spin_lock_irqsave(&clockevents_lock, flags); + raw_spin_lock_irqsave(&clockevents_lock, flags); clockevents_do_notify(reason, arg); switch (reason) { @@ -255,7 +256,7 @@ void clockevents_notify(unsigned long reason, void *arg) default: break; } - spin_unlock_irqrestore(&clockevents_lock, flags); + raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_notify); #endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 09113347d32..e85c23404d3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -39,7 +39,7 @@ void timecounter_init(struct timecounter *tc, tc->cycle_last = cc->read(cc); tc->nsec = start_tstamp; } -EXPORT_SYMBOL(timecounter_init); +EXPORT_SYMBOL_GPL(timecounter_init); /** * timecounter_read_delta - get nanoseconds since last call of this function @@ -83,7 +83,7 @@ u64 timecounter_read(struct timecounter *tc) return nsec; } -EXPORT_SYMBOL(timecounter_read); +EXPORT_SYMBOL_GPL(timecounter_read); u64 timecounter_cyc2time(struct timecounter *tc, cycle_t cycle_tstamp) @@ -105,7 +105,60 @@ u64 timecounter_cyc2time(struct timecounter *tc, return nsec; } -EXPORT_SYMBOL(timecounter_cyc2time); +EXPORT_SYMBOL_GPL(timecounter_cyc2time); + +/** + * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks + * @mult: pointer to mult variable + * @shift: pointer to shift variable + * @from: frequency to convert from + * @to: frequency to convert to + * @minsec: guaranteed runtime conversion range in seconds + * + * The function evaluates the shift/mult pair for the scaled math + * operations of clocksources and clockevents. + * + * @to and @from are frequency values in HZ. For clock sources @to is + * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock + * event @to is the counter frequency and @from is NSEC_PER_SEC. + * + * The @minsec conversion range argument controls the time frame in + * seconds which must be covered by the runtime conversion with the + * calculated mult and shift factors. This guarantees that no 64bit + * overflow happens when the input value of the conversion is + * multiplied with the calculated mult factor. Larger ranges may + * reduce the conversion accuracy by chosing smaller mult and shift + * factors. + */ +void +clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) +{ + u64 tmp; + u32 sft, sftacc= 32; + + /* + * Calculate the shift factor which is limiting the conversion + * range: + */ + tmp = ((u64)minsec * from) >> 32; + while (tmp) { + tmp >>=1; + sftacc--; + } + + /* + * Find the conversion shift/mult pair which has the best + * accuracy and fits the maxsec conversion range: + */ + for (sft = 32; sft > 0; sft--) { + tmp = (u64) to << sft; + do_div(tmp, from); + if ((tmp >> sftacc) == 0) + break; + } + *mult = tmp; + *shift = sft; +} /*[Clocksource internal variables]--------- * curr_clocksource: @@ -394,15 +447,11 @@ void clocksource_resume(void) { struct clocksource *cs; - mutex_lock(&clocksource_mutex); - list_for_each_entry(cs, &clocksource_list, list) if (cs->resume) cs->resume(); clocksource_resume_watchdog(); - - mutex_unlock(&clocksource_mutex); } /** @@ -417,6 +466,47 @@ void clocksource_touch_watchdog(void) clocksource_resume_watchdog(); } +/** + * clocksource_max_deferment - Returns max time the clocksource can be deferred + * @cs: Pointer to clocksource + * + */ +static u64 clocksource_max_deferment(struct clocksource *cs) +{ + u64 max_nsecs, max_cycles; + + /* + * Calculate the maximum number of cycles that we can pass to the + * cyc2ns function without overflowing a 64-bit signed result. The + * maximum number of cycles is equal to ULLONG_MAX/cs->mult which + * is equivalent to the below. + * max_cycles < (2^63)/cs->mult + * max_cycles < 2^(log2((2^63)/cs->mult)) + * max_cycles < 2^(log2(2^63) - log2(cs->mult)) + * max_cycles < 2^(63 - log2(cs->mult)) + * max_cycles < 1 << (63 - log2(cs->mult)) + * Please note that we add 1 to the result of the log2 to account for + * any rounding errors, ensure the above inequality is satisfied and + * no overflow will occur. + */ + max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); + + /* + * The actual maximum number of cycles we can defer the clocksource is + * determined by the minimum of max_cycles and cs->mask. + */ + max_cycles = min_t(u64, max_cycles, (u64) cs->mask); + max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); + + /* + * To ensure that the clocksource does not wrap whilst we are idle, + * limit the time the clocksource can be deferred by 12.5%. Please + * note a margin of 12.5% is used because this can be computed with + * a shift, versus say 10% which would require division. + */ + return max_nsecs - (max_nsecs >> 5); +} + #ifdef CONFIG_GENERIC_TIME /** @@ -515,6 +605,9 @@ static void clocksource_enqueue(struct clocksource *cs) */ int clocksource_register(struct clocksource *cs) { + /* calculate max idle time permitted for this clocksource */ + cs->max_idle_ns = clocksource_max_deferment(cs); + mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_select(); @@ -584,7 +677,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, * @count: length of buffer * * Takes input from sysfs interface for manually overriding the default - * clocksource selction. + * clocksource selection. */ static ssize_t sysfs_override_clocksource(struct sys_device *dev, struct sysdev_attribute *attr, diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index c2ec25087a3..b3bafd5fc66 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -31,7 +31,7 @@ static struct tick_device tick_broadcast_device; /* FIXME: Use cpumask_var_t. */ static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); static DECLARE_BITMAP(tmpmask, NR_CPUS); -static DEFINE_SPINLOCK(tick_broadcast_lock); +static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; #ifdef CONFIG_TICK_ONESHOT @@ -96,7 +96,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) unsigned long flags; int ret = 0; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Devices might be registered with both periodic and oneshot @@ -122,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) tick_broadcast_clear_oneshot(cpu); } } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); return ret; } @@ -161,13 +161,13 @@ static void tick_do_broadcast(struct cpumask *mask) */ static void tick_do_periodic_broadcast(void) { - spin_lock(&tick_broadcast_lock); + raw_spin_lock(&tick_broadcast_lock); cpumask_and(to_cpumask(tmpmask), cpu_online_mask, tick_get_broadcast_mask()); tick_do_broadcast(to_cpumask(tmpmask)); - spin_unlock(&tick_broadcast_lock); + raw_spin_unlock(&tick_broadcast_lock); } /* @@ -212,7 +212,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) unsigned long flags; int cpu, bc_stopped; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); cpu = smp_processor_id(); td = &per_cpu(tick_cpu_device, cpu); @@ -263,7 +263,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) tick_broadcast_setup_oneshot(bc); } out: - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* @@ -299,7 +299,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) unsigned long flags; unsigned int cpu = *cpup; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); @@ -309,7 +309,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) clockevents_shutdown(bc); } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } void tick_suspend_broadcast(void) @@ -317,13 +317,13 @@ void tick_suspend_broadcast(void) struct clock_event_device *bc; unsigned long flags; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; if (bc) clockevents_shutdown(bc); - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } int tick_resume_broadcast(void) @@ -332,7 +332,7 @@ int tick_resume_broadcast(void) unsigned long flags; int broadcast = 0; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; @@ -351,7 +351,7 @@ int tick_resume_broadcast(void) break; } } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); return broadcast; } @@ -405,7 +405,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) ktime_t now, next_event; int cpu; - spin_lock(&tick_broadcast_lock); + raw_spin_lock(&tick_broadcast_lock); again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; @@ -443,7 +443,7 @@ again: if (tick_broadcast_set_event(next_event, 0)) goto again; } - spin_unlock(&tick_broadcast_lock); + raw_spin_unlock(&tick_broadcast_lock); } /* @@ -457,7 +457,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) unsigned long flags; int cpu; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Periodic mode does not care about the enter/exit of power @@ -492,7 +492,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) } out: - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* @@ -563,13 +563,13 @@ void tick_broadcast_switch_to_oneshot(void) struct clock_event_device *bc; unsigned long flags; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc); - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -581,7 +581,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) unsigned long flags; unsigned int cpu = *cpup; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Clear the broadcast mask flag for the dead cpu, but do not @@ -589,7 +589,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) */ cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 83c4417b6a3..b6b898d2eee 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); ktime_t tick_next_period; ktime_t tick_period; int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -DEFINE_SPINLOCK(tick_device_lock); +static DEFINE_RAW_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c @@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) int cpu, ret = NOTIFY_OK; unsigned long flags; - spin_lock_irqsave(&tick_device_lock, flags); + raw_spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, newdev->cpumask)) @@ -268,7 +268,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); return NOTIFY_STOP; out_bc: @@ -278,7 +278,7 @@ out_bc: if (tick_check_broadcast_device(newdev)) ret = NOTIFY_STOP; - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); return ret; } @@ -311,7 +311,7 @@ static void tick_shutdown(unsigned int *cpup) struct clock_event_device *dev = td->evtdev; unsigned long flags; - spin_lock_irqsave(&tick_device_lock, flags); + raw_spin_lock_irqsave(&tick_device_lock, flags); td->mode = TICKDEV_MODE_PERIODIC; if (dev) { /* @@ -322,7 +322,7 @@ static void tick_shutdown(unsigned int *cpup) clockevents_exchange_device(dev, NULL); td->evtdev = NULL; } - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_suspend(void) @@ -330,9 +330,9 @@ static void tick_suspend(void) struct tick_device *td = &__get_cpu_var(tick_cpu_device); unsigned long flags; - spin_lock_irqsave(&tick_device_lock, flags); + raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_shutdown(td->evtdev); - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_resume(void) @@ -341,7 +341,7 @@ static void tick_resume(void) unsigned long flags; int broadcast = tick_resume_broadcast(); - spin_lock_irqsave(&tick_device_lock, flags); + raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); if (!broadcast) { @@ -350,7 +350,7 @@ static void tick_resume(void) else tick_resume_oneshot(); } - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index b1c05bf75ee..290eefbc1f6 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -6,7 +6,6 @@ #define TICK_DO_TIMER_BOOT -2 DECLARE_PER_CPU(struct tick_device, tick_cpu_device); -extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; extern ktime_t tick_period; extern int tick_do_timer_cpu __read_mostly; diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index a96c0e2b89c..0a8a213016f 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -50,9 +50,9 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, dev->min_delta_ns += dev->min_delta_ns >> 1; printk(KERN_WARNING - "CE: %s increasing min_delta_ns to %lu nsec\n", + "CE: %s increasing min_delta_ns to %llu nsec\n", dev->name ? dev->name : "?", - dev->min_delta_ns << 1); + (unsigned long long) dev->min_delta_ns << 1); i = 0; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e0f59a21c06..f992762d7f5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz); * value. We do this unconditionally on any cpu, as we don't know whether the * cpu, which has the update task assigned is in a long sleep. */ -static void tick_nohz_update_jiffies(void) +static void tick_nohz_update_jiffies(ktime_t now) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long flags; - ktime_t now; - - if (!ts->tick_stopped) - return; cpumask_clear_cpu(cpu, nohz_cpu_mask); - now = ktime_get(); ts->idle_waketime = now; local_irq_save(flags); @@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void) touch_softlockup_watchdog(); } -static void tick_nohz_stop_idle(int cpu) +static void tick_nohz_stop_idle(int cpu, ktime_t now) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t delta; - if (ts->idle_active) { - ktime_t now, delta; - now = ktime_get(); - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_lastupdate = now; - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - ts->idle_active = 0; + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_lastupdate = now; + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + ts->idle_active = 0; - sched_clock_idle_wakeup_event(0); - } + sched_clock_idle_wakeup_event(0); } static ktime_t tick_nohz_start_idle(struct tick_sched *ts) @@ -216,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle) struct tick_sched *ts; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + u64 time_delta; int cpu; local_irq_save(flags); @@ -231,6 +224,13 @@ void tick_nohz_stop_sched_tick(int inidle) if (!inidle && !ts->inidle) goto end; + /* + * Set ts->inidle unconditionally. Even if the system did not + * switch to NOHZ mode the cpu frequency governers rely on the + * update of the idle time accounting in tick_nohz_start_idle(). + */ + ts->inidle = 1; + now = tick_nohz_start_idle(ts); /* @@ -248,8 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; - ts->inidle = 1; - if (need_resched()) goto end; @@ -258,7 +256,7 @@ void tick_nohz_stop_sched_tick(int inidle) if (ratelimit < 10) { printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - local_softirq_pending()); + (unsigned int) local_softirq_pending()); ratelimit++; } goto end; @@ -270,14 +268,18 @@ void tick_nohz_stop_sched_tick(int inidle) seq = read_seqbegin(&xtime_lock); last_update = last_jiffies_update; last_jiffies = jiffies; + time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); - /* Get the next timer wheel timer */ - next_jiffies = get_next_timer_interrupt(last_jiffies); - delta_jiffies = next_jiffies - last_jiffies; - - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu)) + if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + arch_needs_cpu(cpu)) { + next_jiffies = last_jiffies + 1; delta_jiffies = 1; + } else { + /* Get the next timer wheel timer */ + next_jiffies = get_next_timer_interrupt(last_jiffies); + delta_jiffies = next_jiffies - last_jiffies; + } /* * Do not stop the tick, if we are only one off * or if the cpu is required for rcu @@ -289,22 +291,51 @@ void tick_nohz_stop_sched_tick(int inidle) if ((long)delta_jiffies >= 1) { /* - * calculate the expiry time for the next timer wheel - * timer - */ - expires = ktime_add_ns(last_update, tick_period.tv64 * - delta_jiffies); - - /* * If this cpu is the one which updates jiffies, then * give up the assignment and let it be taken by the * cpu which runs the tick timer next, which might be * this cpu as well. If we don't drop this here the * jiffies might be stale and do_timer() never - * invoked. + * invoked. Keep track of the fact that it was the one + * which had the do_timer() duty last. If this cpu is + * the one which had the do_timer() duty last, we + * limit the sleep time to the timekeeping + * max_deferement value which we retrieved + * above. Otherwise we can sleep as long as we want. */ - if (cpu == tick_do_timer_cpu) + if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; + ts->do_timer_last = 1; + } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + time_delta = KTIME_MAX; + ts->do_timer_last = 0; + } else if (!ts->do_timer_last) { + time_delta = KTIME_MAX; + } + + /* + * calculate the expiry time for the next timer wheel + * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals + * that there is no timer pending or at least extremely + * far into the future (12 days for HZ=1000). In this + * case we set the expiry to the end of time. + */ + if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { + /* + * Calculate the time delta for the next timer event. + * If the time delta exceeds the maximum time delta + * permitted by the current clocksource then adjust + * the time delta accordingly to ensure the + * clocksource does not wrap. + */ + time_delta = min_t(u64, time_delta, + tick_period.tv64 * delta_jiffies); + } + + if (time_delta < KTIME_MAX) + expires = ktime_add_ns(last_update, time_delta); + else + expires.tv64 = KTIME_MAX; if (delta_jiffies > 1) cpumask_set_cpu(cpu, nohz_cpu_mask); @@ -337,22 +368,19 @@ void tick_nohz_stop_sched_tick(int inidle) ts->idle_sleeps++; + /* Mark expires */ + ts->idle_expires = expires; + /* - * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that - * there is no timer pending or at least extremly far - * into the future (12 days for HZ=1000). In this case - * we simply stop the tick timer: + * If the expiration time == KTIME_MAX, then + * in this case we simply stop the tick timer. */ - if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { - ts->idle_expires.tv64 = KTIME_MAX; + if (unlikely(expires.tv64 == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); goto out; } - /* Mark expiries */ - ts->idle_expires = expires; - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED); @@ -431,7 +459,11 @@ void tick_nohz_restart_sched_tick(void) ktime_t now; local_irq_disable(); - tick_nohz_stop_idle(cpu); + if (ts->idle_active || (ts->inidle && ts->tick_stopped)) + now = ktime_get(); + + if (ts->idle_active) + tick_nohz_stop_idle(cpu, now); if (!ts->inidle || !ts->tick_stopped) { ts->inidle = 0; @@ -445,7 +477,6 @@ void tick_nohz_restart_sched_tick(void) /* Update jiffies first */ select_nohz_load_balancer(0); - now = ktime_get(); tick_do_update_jiffies64(now); cpumask_clear_cpu(cpu, nohz_cpu_mask); @@ -579,22 +610,18 @@ static void tick_nohz_switch_to_nohz(void) * timer and do not touch the other magic bits which need to be done * when idle is left. */ -static void tick_nohz_kick_tick(int cpu) +static void tick_nohz_kick_tick(int cpu, ktime_t now) { #if 0 /* Switch back to 2.6.27 behaviour */ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t delta, now; - - if (!ts->tick_stopped) - return; + ktime_t delta; /* * Do not touch the tick device, when the next expiry is either * already reached or less/equal than the tick period. */ - now = ktime_get(); delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); if (delta.tv64 <= tick_period.tv64) return; @@ -603,9 +630,26 @@ static void tick_nohz_kick_tick(int cpu) #endif } +static inline void tick_check_nohz(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now; + + if (!ts->idle_active && !ts->tick_stopped) + return; + now = ktime_get(); + if (ts->idle_active) + tick_nohz_stop_idle(cpu, now); + if (ts->tick_stopped) { + tick_nohz_update_jiffies(now); + tick_nohz_kick_tick(cpu, now); + } +} + #else static inline void tick_nohz_switch_to_nohz(void) { } +static inline void tick_check_nohz(int cpu) { } #endif /* NO_HZ */ @@ -615,11 +659,7 @@ static inline void tick_nohz_switch_to_nohz(void) { } void tick_check_idle(int cpu) { tick_check_oneshot_broadcast(cpu); -#ifdef CONFIG_NO_HZ - tick_nohz_stop_idle(cpu); - tick_nohz_update_jiffies(); - tick_nohz_kick_tick(cpu); -#endif + tick_check_nohz(cpu); } /* diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index 71e7f1a1915..12f5c55090b 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c @@ -40,7 +40,7 @@ ktime_t timecompare_transform(struct timecompare *sync, return ns_to_ktime(nsec); } -EXPORT_SYMBOL(timecompare_transform); +EXPORT_SYMBOL_GPL(timecompare_transform); int timecompare_offset(struct timecompare *sync, s64 *offset, @@ -89,7 +89,7 @@ int timecompare_offset(struct timecompare *sync, * source time */ sample.offset = - ktime_to_ns(ktime_add(end, start)) / 2 - + (ktime_to_ns(end) + ktime_to_ns(start)) / 2 - ts; /* simple insertion sort based on duration */ @@ -131,7 +131,7 @@ int timecompare_offset(struct timecompare *sync, return used; } -EXPORT_SYMBOL(timecompare_offset); +EXPORT_SYMBOL_GPL(timecompare_offset); void __timecompare_update(struct timecompare *sync, u64 source_tstamp) @@ -188,4 +188,4 @@ void __timecompare_update(struct timecompare *sync, } } } -EXPORT_SYMBOL(__timecompare_update); +EXPORT_SYMBOL_GPL(__timecompare_update); diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c new file mode 100644 index 00000000000..86628e755f3 --- /dev/null +++ b/kernel/time/timeconv.c @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + * This file is part of the GNU C Library. + * Contributed by Paul Eggert (eggert@twinsun.com). + * + * The GNU C Library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * The GNU C Library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the GNU C Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Converts the calendar time to broken-down time representation + * Based on code from glibc-2.6 + * + * 2009-7-14: + * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com> + */ + +#include <linux/time.h> +#include <linux/module.h> + +/* + * Nonzero if YEAR is a leap year (every 4 years, + * except every 100th isn't, and every 400th is). + */ +static int __isleap(long year) +{ + return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0); +} + +/* do a mathdiv for long type */ +static long math_div(long a, long b) +{ + return a / b - (a % b < 0); +} + +/* How many leap years between y1 and y2, y1 must less or equal to y2 */ +static long leaps_between(long y1, long y2) +{ + long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100) + + math_div(y1 - 1, 400); + long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100) + + math_div(y2 - 1, 400); + return leaps2 - leaps1; +} + +/* How many days come before each month (0-12). */ +static const unsigned short __mon_yday[2][13] = { + /* Normal years. */ + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, + /* Leap years. */ + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} +}; + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. + * @result pointer to struct tm variable to receive broken-down time + */ +void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + long days, rem, y; + const unsigned short *ip; + + days = totalsecs / SECS_PER_DAY; + rem = totalsecs % SECS_PER_DAY; + rem += offset; + while (rem < 0) { + rem += SECS_PER_DAY; + --days; + } + while (rem >= SECS_PER_DAY) { + rem -= SECS_PER_DAY; + ++days; + } + + result->tm_hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + result->tm_min = rem / 60; + result->tm_sec = rem % 60; + + /* January 1, 1970 was a Thursday. */ + result->tm_wday = (4 + days) % 7; + if (result->tm_wday < 0) + result->tm_wday += 7; + + y = 1970; + + while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { + /* Guess a corrected year, assuming 365 days per year. */ + long yg = y + math_div(days, 365); + + /* Adjust DAYS and Y to match the guessed year. */ + days -= (yg - y) * 365 + leaps_between(y, yg); + y = yg; + } + + result->tm_year = y - 1900; + + result->tm_yday = days; + + ip = __mon_yday[__isleap(y)]; + for (y = 11; days < ip[y]; y--) + continue; + days -= ip[y]; + + result->tm_mon = y; + result->tm_mday = days + 1; +} +EXPORT_SYMBOL(time_to_tm); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb0f46fa1ec..af4135f0582 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -13,6 +13,7 @@ #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> +#include <linux/sched.h> #include <linux/sysdev.h> #include <linux/clocksource.h> #include <linux/jiffies.h> @@ -164,19 +165,12 @@ struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -static struct timespec xtime_cache __attribute__ ((aligned (16))); -void update_xtime_cache(u64 nsec) -{ - xtime_cache = xtime; - timespec_add_ns(&xtime_cache, nsec); -} - /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { xtime.tv_sec += leapsecond; wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, timekeeper.clock); + update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); } #ifdef CONFIG_GENERIC_TIME @@ -331,12 +325,10 @@ int do_settimeofday(struct timespec *tv) xtime = *tv; - update_xtime_cache(0); - timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, timekeeper.clock); + update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -487,6 +479,17 @@ int timekeeping_valid_for_hres(void) } /** + * timekeeping_max_deferment - Returns max time the clocksource can be deferred + * + * Caller must observe xtime_lock via read_seqbegin/read_seqretry to + * ensure that the clocksource does not change! + */ +u64 timekeeping_max_deferment(void) +{ + return timekeeper.clock->max_idle_ns; +} + +/** * read_persistent_clock - Return time from the persistent clock. * * Weak dummy function for arches that do not yet support it. @@ -547,7 +550,6 @@ void __init timekeeping_init(void) } set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); - update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -581,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); total_sleep_time = timespec_add_safe(total_sleep_time, ts); } - update_xtime_cache(0); /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; @@ -722,6 +723,49 @@ static void timekeeping_adjust(s64 offset) } /** + * logarithmic_accumulation - shifted accumulation of cycles + * + * This functions accumulates a shifted interval of cycles into + * into a shifted interval nanoseconds. Allows for O(log) accumulation + * loop. + * + * Returns the unconsumed cycles. + */ +static cycle_t logarithmic_accumulation(cycle_t offset, int shift) +{ + u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; + + /* If the offset is smaller then a shifted interval, do nothing */ + if (offset < timekeeper.cycle_interval<<shift) + return offset; + + /* Accumulate one shifted interval */ + offset -= timekeeper.cycle_interval << shift; + timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; + + timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; + while (timekeeper.xtime_nsec >= nsecps) { + timekeeper.xtime_nsec -= nsecps; + xtime.tv_sec++; + second_overflow(); + } + + /* Accumulate into raw time */ + raw_time.tv_nsec += timekeeper.raw_interval << shift;; + while (raw_time.tv_nsec >= NSEC_PER_SEC) { + raw_time.tv_nsec -= NSEC_PER_SEC; + raw_time.tv_sec++; + } + + /* Accumulate error between NTP and clock interval */ + timekeeper.ntp_error += tick_length << shift; + timekeeper.ntp_error -= timekeeper.xtime_interval << + (timekeeper.ntp_error_shift + shift); + + return offset; +} + +/** * update_wall_time - Uses the current clocksource to increment the wall time * * Called from the timer interrupt, must hold a write on xtime_lock. @@ -730,7 +774,7 @@ void update_wall_time(void) { struct clocksource *clock; cycle_t offset; - u64 nsecs; + int shift = 0, maxshift; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -744,33 +788,22 @@ void update_wall_time(void) #endif timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; - /* normally this loop will run just once, however in the - * case of lost or late ticks, it will accumulate correctly. + /* + * With NO_HZ we may have to accumulate many cycle_intervals + * (think "ticks") worth of time at once. To do this efficiently, + * we calculate the largest doubling multiple of cycle_intervals + * that is smaller then the offset. We then accumulate that + * chunk in one go, and then try to consume the next smaller + * doubled multiple. */ + shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); + shift = max(0, shift); + /* Bound shift to one less then what overflows tick_length */ + maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; + shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { - u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; - - /* accumulate one interval */ - offset -= timekeeper.cycle_interval; - clock->cycle_last += timekeeper.cycle_interval; - - timekeeper.xtime_nsec += timekeeper.xtime_interval; - if (timekeeper.xtime_nsec >= nsecps) { - timekeeper.xtime_nsec -= nsecps; - xtime.tv_sec++; - second_overflow(); - } - - raw_time.tv_nsec += timekeeper.raw_interval; - if (raw_time.tv_nsec >= NSEC_PER_SEC) { - raw_time.tv_nsec -= NSEC_PER_SEC; - raw_time.tv_sec++; - } - - /* accumulate error between NTP and clock interval */ - timekeeper.ntp_error += tick_length; - timekeeper.ntp_error -= timekeeper.xtime_interval << - timekeeper.ntp_error_shift; + offset = logarithmic_accumulation(offset, shift); + shift--; } /* correct the clock when NTP error is too big */ @@ -806,11 +839,8 @@ void update_wall_time(void) timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; - nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); - update_xtime_cache(nsecs); - /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, timekeeper.clock); + update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); } /** @@ -845,13 +875,13 @@ void monotonic_to_bootbased(struct timespec *ts) unsigned long get_seconds(void) { - return xtime_cache.tv_sec; + return xtime.tv_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime_cache; + return xtime; } struct timespec current_kernel_time(void) @@ -861,8 +891,7 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&xtime_lock); - - now = xtime_cache; + now = xtime; } while (read_seqretry(&xtime_lock, seq)); return now; @@ -876,8 +905,7 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&xtime_lock); - - now = xtime_cache; + now = xtime; mono = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index fddd69d16e0..28265636b6c 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -84,7 +84,7 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, next_one: i = 0; - spin_lock_irqsave(&base->cpu_base->lock, flags); + raw_spin_lock_irqsave(&base->cpu_base->lock, flags); curr = base->first; /* @@ -100,13 +100,13 @@ next_one: timer = rb_entry(curr, struct hrtimer, node); tmp = *timer; - spin_unlock_irqrestore(&base->cpu_base->lock, flags); + raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); print_timer(m, timer, &tmp, i, now); next++; goto next_one; } - spin_unlock_irqrestore(&base->cpu_base->lock, flags); + raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); } static void @@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(expires_next); P(hres_active); P(nr_events); + P(nr_retries); + P(nr_hangs); + P_ns(max_hang_time); #endif #undef P #undef P_ns @@ -204,10 +207,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) return; } SEQ_printf(m, "%s\n", dev->name); - SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns); - SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns); - SEQ_printf(m, " mult: %lu\n", dev->mult); - SEQ_printf(m, " shift: %d\n", dev->shift); + SEQ_printf(m, " max_delta_ns: %llu\n", + (unsigned long long) dev->max_delta_ns); + SEQ_printf(m, " min_delta_ns: %llu\n", + (unsigned long long) dev->min_delta_ns); + SEQ_printf(m, " mult: %u\n", dev->mult); + SEQ_printf(m, " shift: %u\n", dev->shift); SEQ_printf(m, " mode: %d\n", dev->mode); SEQ_printf(m, " next_event: %Ld nsecs\n", (unsigned long long) ktime_to_ns(dev->next_event)); @@ -252,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.4\n"); + SEQ_printf(m, "Timer List Version: v0.5\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); @@ -275,7 +280,7 @@ static int timer_list_open(struct inode *inode, struct file *filp) return single_open(filp, timer_list_show, NULL); } -static struct file_operations timer_list_fops = { +static const struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 4cde8b9c716..2f3b585b8d7 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -86,7 +86,7 @@ static DEFINE_SPINLOCK(table_lock); /* * Per-CPU lookup locks for fast hash lookup: */ -static DEFINE_PER_CPU(spinlock_t, lookup_lock); +static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock); /* * Mutex to serialize state changes with show-stats activities: @@ -238,14 +238,14 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, /* * It doesnt matter which lock we take: */ - spinlock_t *lock; + raw_spinlock_t *lock; struct entry *entry, input; unsigned long flags; if (likely(!timer_stats_active)) return; - lock = &per_cpu(lookup_lock, raw_smp_processor_id()); + lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id()); input.timer = timer; input.start_func = startf; @@ -253,7 +253,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.pid = pid; input.timer_flag = timer_flag; - spin_lock_irqsave(lock, flags); + raw_spin_lock_irqsave(lock, flags); if (!timer_stats_active) goto out_unlock; @@ -264,7 +264,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, atomic_inc(&overflow_count); out_unlock: - spin_unlock_irqrestore(lock, flags); + raw_spin_unlock_irqrestore(lock, flags); } static void print_name_offset(struct seq_file *m, unsigned long addr) @@ -348,9 +348,11 @@ static void sync_access(void) int cpu; for_each_online_cpu(cpu) { - spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); + raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu); + + raw_spin_lock_irqsave(lock, flags); /* nothing */ - spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); + raw_spin_unlock_irqrestore(lock, flags); } } @@ -395,7 +397,7 @@ static int tstats_open(struct inode *inode, struct file *filp) return single_open(filp, tstats_show, NULL); } -static struct file_operations tstats_fops = { +static const struct file_operations tstats_fops = { .open = tstats_open, .read = seq_read, .write = tstats_write, @@ -408,7 +410,7 @@ void __init init_timer_stats(void) int cpu; for_each_possible_cpu(cpu) - spin_lock_init(&per_cpu(lookup_lock, cpu)); + raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu)); } static int __init init_tstats_procfs(void) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b416512ad17..d006554888d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -339,6 +339,27 @@ config POWER_TRACER power management decisions, specifically the C-state and P-state behavior. +config KSYM_TRACER + bool "Trace read and write access on kernel memory locations" + depends on HAVE_HW_BREAKPOINT + select TRACING + help + This tracer helps find read and write operations on any given kernel + symbol i.e. /proc/kallsyms. + +config PROFILE_KSYM_TRACER + bool "Profile all kernel memory accesses on 'watched' variables" + depends on KSYM_TRACER + help + This tracer profiles kernel accesses on variables watched through the + ksym tracer ftrace plugin. Depending upon the hardware, all read + and write operations on kernel variables can be monitored for + accesses. + + The results will be displayed in: + /debugfs/tracing/profile_ksym + + Say N if unsure. config STACK_TRACER bool "Trace max stack" @@ -428,6 +449,23 @@ config BLK_DEV_IO_TRACE If unsure, say N. +config KPROBE_EVENT + depends on KPROBES + depends on X86 + bool "Enable kprobes-based dynamic events" + select TRACING + default y + help + This allows the user to add tracing events (similar to tracepoints) on the fly + via the ftrace interface. See Documentation/trace/kprobetrace.txt + for more details. + + Those events can be inserted wherever kprobes can probe, and record + various register and memory values. + + This option is also required by perf-probe subcommand of perf tools. If + you want to use perf tools, this option is strongly recommended. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 26f03ac07c2..cd9ecd89ec7 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,6 +53,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o +obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3eb159c277c..d9d6206e0b1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -856,6 +856,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, } /** + * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @q: queue the io is for + * @rq: the source request + * @dev: target device + * @from: source sector + * + * Description: + * Device mapper remaps request to other devices. + * Add a trace for that action. + * + **/ +static void blk_add_trace_rq_remap(struct request_queue *q, + struct request *rq, dev_t dev, + sector_t from) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, + sizeof(r), &r); +} + +/** * blk_add_driver_data - Add binary message with driver-specific data * @q: queue the io is for * @rq: io request @@ -922,10 +953,13 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_remap(blk_add_trace_remap); WARN_ON(ret); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); + WARN_ON(ret); } static void blk_unregister_tracepoints(void) { + unregister_trace_block_rq_remap(blk_add_trace_rq_remap); unregister_trace_block_remap(blk_add_trace_remap); unregister_trace_block_split(blk_add_trace_split); unregister_trace_block_unplug_io(blk_add_trace_unplug_io); @@ -1657,6 +1691,11 @@ int blk_trace_init_sysfs(struct device *dev) return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); } +void blk_trace_remove_sysfs(struct device *dev) +{ + sysfs_remove_group(&dev->kobj, &blk_trace_attr_group); +} + #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_EVENT_TRACING diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 23df7771c93..e51a1bcb7be 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -60,6 +60,13 @@ static int last_ftrace_enabled; /* Quick disabling of function tracer. */ int function_trace_stop; +/* List for set_ftrace_pid's pids. */ +LIST_HEAD(ftrace_pids); +struct ftrace_pid { + struct list_head list; + struct pid *pid; +}; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -78,6 +85,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); +#endif + static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; @@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) else func = ftrace_list_func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_list->next == &ftrace_list_end) { ftrace_func_t func = ftrace_list->func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -225,9 +236,13 @@ static void ftrace_update_pid_func(void) if (ftrace_trace_function == ftrace_stub) return; +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST func = ftrace_trace_function; +#else + func = __ftrace_trace_function; +#endif - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } else { @@ -736,7 +751,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&ftrace_profile_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -817,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) } #endif /* CONFIG_FUNCTION_PROFILER */ -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; static struct pid * const ftrace_swapper_pid = &init_struct_pid; #ifdef CONFIG_DYNAMIC_FTRACE @@ -1074,14 +1087,9 @@ static void ftrace_replace_code(int enable) failed = __ftrace_replace_code(rec, enable); if (failed) { rec->flags |= FTRACE_FL_FAILED; - if ((system_state == SYSTEM_BOOTING) || - !core_kernel_text(rec->ip)) { - ftrace_free_rec(rec); - } else { - ftrace_bug(failed, rec->ip); - /* Stop processing */ - return; - } + ftrace_bug(failed, rec->ip); + /* Stop processing */ + return; } } while_for_each_ftrace_rec(); } @@ -1262,12 +1270,34 @@ static int ftrace_update_code(struct module *mod) ftrace_new_addrs = p->newlist; p->flags = 0L; - /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(mod, p)) { - p->flags |= FTRACE_FL_CONVERTED; - ftrace_update_cnt++; - } else + /* + * Do the initial record convertion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) { ftrace_free_rec(p); + continue; + } + + p->flags |= FTRACE_FL_CONVERTED; + ftrace_update_cnt++; + + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up) { + int failed = __ftrace_replace_code(p, 1); + if (failed) { + ftrace_bug(failed, p->ip); + ftrace_free_rec(p); + } + } } stop = ftrace_now(raw_smp_processor_id()); @@ -1621,8 +1651,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (!ret) { struct seq_file *m = file->private_data; m->private = iter; - } else + } else { + trace_parser_put(&iter->parser); kfree(iter); + } } else file->private_data = iter; mutex_unlock(&ftrace_regex_lock); @@ -1655,60 +1687,6 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin) return ret; } -enum { - MATCH_FULL, - MATCH_FRONT_ONLY, - MATCH_MIDDLE_ONLY, - MATCH_END_ONLY, -}; - -/* - * (static function - no need for kernel doc) - * - * Pass in a buffer containing a glob and this function will - * set search to point to the search part of the buffer and - * return the type of search it is (see enum above). - * This does modify buff. - * - * Returns enum type. - * search returns the pointer to use for comparison. - * not returns 1 if buff started with a '!' - * 0 otherwise. - */ -static int -ftrace_setup_glob(char *buff, int len, char **search, int *not) -{ - int type = MATCH_FULL; - int i; - - if (buff[0] == '!') { - *not = 1; - buff++; - len--; - } else - *not = 0; - - *search = buff; - - for (i = 0; i < len; i++) { - if (buff[i] == '*') { - if (!i) { - *search = buff + 1; - type = MATCH_END_ONLY; - } else { - if (type == MATCH_END_ONLY) - type = MATCH_MIDDLE_ONLY; - else - type = MATCH_FRONT_ONLY; - buff[i] = 0; - break; - } - } - } - - return type; -} - static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; @@ -1757,7 +1735,7 @@ static void ftrace_match_records(char *buff, int len, int enable) int not; flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - type = ftrace_setup_glob(buff, len, &search, ¬); + type = filter_parse_regex(buff, len, &search, ¬); search_len = strlen(search); @@ -1825,7 +1803,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) } if (strlen(buff)) { - type = ftrace_setup_glob(buff, strlen(buff), &search, ¬); + type = filter_parse_regex(buff, strlen(buff), &search, ¬); search_len = strlen(search); } @@ -1990,7 +1968,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int count = 0; char *search; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2067,7 +2045,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, else if (glob) { int not; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2202,7 +2180,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, struct trace_parser *parser; ssize_t ret, read; - if (!cnt || cnt < 0) + if (!cnt) return 0; mutex_lock(&ftrace_regex_lock); @@ -2216,20 +2194,20 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, parser = &iter->parser; read = trace_get_user(parser, ubuf, cnt, ppos); - if (trace_parser_loaded(parser) && + if (read >= 0 && trace_parser_loaded(parser) && !trace_parser_cont(parser)) { ret = ftrace_process_regex(parser->buffer, parser->idx, enable); if (ret) - goto out; + goto out_unlock; trace_parser_clear(parser); } ret = read; - +out_unlock: mutex_unlock(&ftrace_regex_lock); -out: + return ret; } @@ -2311,6 +2289,32 @@ static int __init set_ftrace_filter(char *str) } __setup("ftrace_filter=", set_ftrace_filter); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static int __init set_graph_function(char *str) +{ + strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_filter=", set_graph_function); + +static void __init set_ftrace_early_graph(char *buf) +{ + int ret; + char *func; + + while (buf) { + func = strsep(&buf, ","); + /* we allow only one expression at a time */ + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + func); + if (ret) + printk(KERN_DEBUG "ftrace: function %s not " + "traceable\n", func); + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static void __init set_ftrace_early_filter(char *buf, int enable) { char *func; @@ -2327,6 +2331,10 @@ static void __init set_ftrace_early_filters(void) set_ftrace_early_filter(ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) set_ftrace_early_filter(ftrace_notrace_buf, 0); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (ftrace_graph_buf[0]) + set_ftrace_early_graph(ftrace_graph_buf); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } static int @@ -2512,7 +2520,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) return -ENODEV; /* decode regex */ - type = ftrace_setup_glob(buffer, strlen(buffer), &search, ¬); + type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); if (not) return -EINVAL; @@ -2552,8 +2560,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; - size_t read = 0; - ssize_t ret; + ssize_t read, ret; if (!cnt || cnt < 0) return 0; @@ -2562,29 +2569,31 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { ret = -EBUSY; - goto out; + goto out_unlock; } if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { ret = -ENOMEM; - goto out; + goto out_unlock; } read = trace_get_user(&parser, ubuf, cnt, ppos); - if (trace_parser_loaded((&parser))) { + if (read >= 0 && trace_parser_loaded((&parser))) { parser.buffer[parser.idx] = 0; /* we allow only one expression at a time */ ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, parser.buffer); if (ret) - goto out; + goto out_free; } ret = read; - out: + +out_free: trace_parser_put(&parser); +out_unlock: mutex_unlock(&graph_lock); return ret; @@ -2622,7 +2631,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } -static int ftrace_convert_nops(struct module *mod, +static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { @@ -2655,19 +2664,17 @@ static int ftrace_convert_nops(struct module *mod, } #ifdef CONFIG_MODULES -void ftrace_release(void *start, void *end) +void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page *pg; - unsigned long s = (unsigned long)start; - unsigned long e = (unsigned long)end; - if (ftrace_disabled || !start || start == end) + if (ftrace_disabled) return; mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) { + if (within_module_core(rec->ip, mod)) { /* * rec->ip is changed in ftrace_free_rec() * It should not between s and e if record was freed. @@ -2684,7 +2691,7 @@ static void ftrace_init_module(struct module *mod, { if (ftrace_disabled || start == end) return; - ftrace_convert_nops(mod, start, end); + ftrace_process_locs(mod, start, end); } static int ftrace_module_notify(struct notifier_block *self, @@ -2699,9 +2706,7 @@ static int ftrace_module_notify(struct notifier_block *self, mod->num_ftrace_callsites); break; case MODULE_STATE_GOING: - ftrace_release(mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); + ftrace_release_mod(mod); break; } @@ -2747,7 +2752,7 @@ void __init ftrace_init(void) last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_convert_nops(NULL, + ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); @@ -2780,23 +2785,6 @@ static inline void ftrace_startup_enable(int command) { } # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ -static ssize_t -ftrace_pid_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - if (ftrace_pid_trace == ftrace_swapper_pid) - r = sprintf(buf, "swapper tasks\n"); - else if (ftrace_pid_trace) - r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace)); - else - r = sprintf(buf, "no pid\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - static void clear_ftrace_swapper(void) { struct task_struct *p; @@ -2847,14 +2835,12 @@ static void set_ftrace_pid(struct pid *pid) rcu_read_unlock(); } -static void clear_ftrace_pid_task(struct pid **pid) +static void clear_ftrace_pid_task(struct pid *pid) { - if (*pid == ftrace_swapper_pid) + if (pid == ftrace_swapper_pid) clear_ftrace_swapper(); else - clear_ftrace_pid(*pid); - - *pid = NULL; + clear_ftrace_pid(pid); } static void set_ftrace_pid_task(struct pid *pid) @@ -2865,74 +2851,184 @@ static void set_ftrace_pid_task(struct pid *pid) set_ftrace_pid(pid); } -static ssize_t -ftrace_pid_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int ftrace_pid_add(int p) { struct pid *pid; - char buf[64]; - long val; - int ret; + struct ftrace_pid *fpid; + int ret = -EINVAL; - if (cnt >= sizeof(buf)) - return -EINVAL; + mutex_lock(&ftrace_lock); - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; + if (!p) + pid = ftrace_swapper_pid; + else + pid = find_get_pid(p); - buf[cnt] = 0; + if (!pid) + goto out; - ret = strict_strtol(buf, 10, &val); - if (ret < 0) - return ret; + ret = 0; - mutex_lock(&ftrace_lock); - if (val < 0) { - /* disable pid tracing */ - if (!ftrace_pid_trace) - goto out; + list_for_each_entry(fpid, &ftrace_pids, list) + if (fpid->pid == pid) + goto out_put; - clear_ftrace_pid_task(&ftrace_pid_trace); + ret = -ENOMEM; - } else { - /* swapper task is special */ - if (!val) { - pid = ftrace_swapper_pid; - if (pid == ftrace_pid_trace) - goto out; - } else { - pid = find_get_pid(val); + fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); + if (!fpid) + goto out_put; - if (pid == ftrace_pid_trace) { - put_pid(pid); - goto out; - } - } + list_add(&fpid->list, &ftrace_pids); + fpid->pid = pid; - if (ftrace_pid_trace) - clear_ftrace_pid_task(&ftrace_pid_trace); + set_ftrace_pid_task(pid); - if (!pid) - goto out; + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + mutex_unlock(&ftrace_lock); + return 0; + +out_put: + if (pid != ftrace_swapper_pid) + put_pid(pid); + +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +static void ftrace_pid_reset(void) +{ + struct ftrace_pid *fpid, *safe; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { + struct pid *pid = fpid->pid; - ftrace_pid_trace = pid; + clear_ftrace_pid_task(pid); - set_ftrace_pid_task(ftrace_pid_trace); + list_del(&fpid->list); + kfree(fpid); } - /* update the function call */ ftrace_update_pid_func(); ftrace_startup_enable(0); - out: mutex_unlock(&ftrace_lock); +} - return cnt; +static void *fpid_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&ftrace_lock); + + if (list_empty(&ftrace_pids) && (!*pos)) + return (void *) 1; + + return seq_list_start(&ftrace_pids, *pos); +} + +static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) +{ + if (v == (void *)1) + return NULL; + + return seq_list_next(v, &ftrace_pids, pos); +} + +static void fpid_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&ftrace_lock); +} + +static int fpid_show(struct seq_file *m, void *v) +{ + const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); + + if (v == (void *)1) { + seq_printf(m, "no pid\n"); + return 0; + } + + if (fpid->pid == ftrace_swapper_pid) + seq_printf(m, "swapper tasks\n"); + else + seq_printf(m, "%u\n", pid_vnr(fpid->pid)); + + return 0; +} + +static const struct seq_operations ftrace_pid_sops = { + .start = fpid_start, + .next = fpid_next, + .stop = fpid_stop, + .show = fpid_show, +}; + +static int +ftrace_pid_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_pid_reset(); + + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &ftrace_pid_sops); + + return ret; +} + +static ssize_t +ftrace_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64], *tmp; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* + * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" + * to clean the filter quietly. + */ + tmp = strstrip(buf); + if (strlen(tmp) == 0) + return 1; + + ret = strict_strtol(tmp, 10, &val); + if (ret < 0) + return ret; + + ret = ftrace_pid_add(val); + + return ret ? ret : cnt; +} + +static int +ftrace_pid_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + return 0; } static const struct file_operations ftrace_pid_fops = { - .read = ftrace_pid_read, - .write = ftrace_pid_write, + .open = ftrace_pid_open, + .write = ftrace_pid_write, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_pid_release, }; static __init int ftrace_init_debugfs(void) @@ -3015,7 +3111,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -3025,7 +3121,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_lock(&ftrace_lock); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; @@ -3295,4 +3391,3 @@ void ftrace_graph_stop(void) ftrace_stop(); } #endif - diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 81b1645c854..a91da69f153 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -501,7 +501,7 @@ static int __init init_kmem_tracer(void) return 1; } - if (!register_tracer(&kmem_tracer)) { + if (register_tracer(&kmem_tracer) != 0) { pr_warning("Warning: could not register the kmem tracer\n"); return 1; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d4ff0197054..f58c9ad1583 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -397,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s) int ret; ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\n", - (unsigned int)sizeof(field.time_stamp)); + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); ret = trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit)); + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); ret = trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE); + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); return ret; } @@ -420,7 +423,7 @@ struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; spinlock_t reader_lock; /* serialize readers */ - raw_spinlock_t lock; + arch_spinlock_t lock; struct lock_class_key lock_key; struct list_head *pages; struct buffer_page *head_page; /* read from head */ @@ -483,7 +486,7 @@ struct ring_buffer_iter { /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 -static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +static inline u64 rb_time_stamp(struct ring_buffer *buffer) { /* shift to debug/test normalization and TIME_EXTENTS */ return buffer->clock() << DEBUG_SHIFT; @@ -494,7 +497,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) u64 time; preempt_disable_notrace(); - time = rb_time_stamp(buffer, cpu); + time = rb_time_stamp(buffer); preempt_enable_no_resched_notrace(); return time; @@ -599,7 +602,7 @@ static struct list_head *rb_list_head(struct list_head *list) } /* - * rb_is_head_page - test if the give page is the head page + * rb_is_head_page - test if the given page is the head page * * Because the reader may move the head_page pointer, we can * not trust what the head page is (it may be pointing to @@ -995,7 +998,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->buffer = buffer; spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); - cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1193,6 +1196,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1207,6 +1211,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) return; rb_reset_cpu(cpu_buffer); + spin_unlock_irq(&cpu_buffer->reader_lock); rb_check_pages(cpu_buffer); @@ -1785,9 +1790,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length, unsigned long tail, - struct buffer_page *commit_page, struct buffer_page *tail_page, u64 *ts) { + struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; @@ -1868,7 +1873,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * Nested commits always have zero deltas, so * just reread the time stamp */ - *ts = rb_time_stamp(buffer, cpu_buffer->cpu); + *ts = rb_time_stamp(buffer); next_page->page->time_stamp = *ts; } @@ -1890,13 +1895,10 @@ static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { - struct buffer_page *tail_page, *commit_page; + struct buffer_page *tail_page; struct ring_buffer_event *event; unsigned long tail, write; - commit_page = cpu_buffer->commit_page; - /* we just need to protect against interrupts */ - barrier(); tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); @@ -1907,7 +1909,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* See if we shot pass the end of this buffer page */ if (write > BUF_PAGE_SIZE) return rb_move_tail(cpu_buffer, length, tail, - commit_page, tail_page, ts); + tail_page, ts); /* We reserved something on the buffer */ @@ -2111,7 +2113,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer); /* * Only the first commit can update the timestamp. @@ -2681,7 +2683,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_entries); /** - * ring_buffer_overrun_cpu - get the number of overruns in buffer + * ring_buffer_overruns - get the number of overruns in buffer * @buffer: The ring buffer * * Returns the total number of overruns in the ring buffer @@ -2832,7 +2834,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) int ret; local_irq_save(flags); - __raw_spin_lock(&cpu_buffer->lock); + arch_spin_lock(&cpu_buffer->lock); again: /* @@ -2921,7 +2923,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto again; out: - __raw_spin_unlock(&cpu_buffer->lock); + arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return reader; @@ -3284,9 +3286,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) synchronize_sched(); spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - __raw_spin_lock(&cpu_buffer->lock); + arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); - __raw_spin_unlock(&cpu_buffer->lock); + arch_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return iter; @@ -3406,11 +3408,11 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) goto out; - __raw_spin_lock(&cpu_buffer->lock); + arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); - __raw_spin_unlock(&cpu_buffer->lock); + arch_spin_unlock(&cpu_buffer->lock); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 573d3cc762c..b2477caf09c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -35,6 +35,28 @@ static int disable_reader; module_param(disable_reader, uint, 0644); MODULE_PARM_DESC(disable_reader, "only run producer"); +static int write_iteration = 50; +module_param(write_iteration, uint, 0644); +MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); + +static int producer_nice = 19; +static int consumer_nice = 19; + +static int producer_fifo = -1; +static int consumer_fifo = -1; + +module_param(producer_nice, uint, 0644); +MODULE_PARM_DESC(producer_nice, "nice prio for producer"); + +module_param(consumer_nice, uint, 0644); +MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); + +module_param(producer_fifo, uint, 0644); +MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); + +module_param(consumer_fifo, uint, 0644); +MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); + static int read_events; static int kill_test; @@ -208,15 +230,18 @@ static void ring_buffer_producer(void) do { struct ring_buffer_event *event; int *entry; - - event = ring_buffer_lock_reserve(buffer, 10); - if (!event) { - missed++; - } else { - hit++; - entry = ring_buffer_event_data(event); - *entry = smp_processor_id(); - ring_buffer_unlock_commit(buffer, event); + int i; + + for (i = 0; i < write_iteration; i++) { + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer, event); + } } do_gettimeofday(&end_tv); @@ -263,6 +288,27 @@ static void ring_buffer_producer(void) if (kill_test) trace_printk("ERROR!\n"); + + if (!disable_reader) { + if (consumer_fifo < 0) + trace_printk("Running Consumer at nice: %d\n", + consumer_nice); + else + trace_printk("Running Consumer at SCHED_FIFO %d\n", + consumer_fifo); + } + if (producer_fifo < 0) + trace_printk("Running Producer at nice: %d\n", + producer_nice); + else + trace_printk("Running Producer at SCHED_FIFO %d\n", + producer_fifo); + + /* Let the user know that the test is running at low priority */ + if (producer_fifo < 0 && consumer_fifo < 0 && + producer_nice == 19 && consumer_nice == 19) + trace_printk("WARNING!!! This test is running at lowest priority.\n"); + trace_printk("Time: %lld (usecs)\n", time); trace_printk("Overruns: %lld\n", overruns); if (disable_reader) @@ -392,6 +438,27 @@ static int __init ring_buffer_benchmark_init(void) if (IS_ERR(producer)) goto out_kill; + /* + * Run them as low-prio background tasks by default: + */ + if (!disable_reader) { + if (consumer_fifo >= 0) { + struct sched_param param = { + .sched_priority = consumer_fifo + }; + sched_setscheduler(consumer, SCHED_FIFO, ¶m); + } else + set_user_nice(consumer, consumer_nice); + } + + if (producer_fifo >= 0) { + struct sched_param param = { + .sched_priority = consumer_fifo + }; + sched_setscheduler(producer, SCHED_FIFO, ¶m); + } else + set_user_nice(producer, producer_nice); + return 0; out_kill: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6c0f6a8a22e..31118ae16f0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -86,17 +86,17 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) */ static int tracing_disabled = 1; -DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); +DEFINE_PER_CPU(int, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) { preempt_disable(); - local_inc(&__get_cpu_var(ftrace_cpu_disabled)); + __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); } static inline void ftrace_enable_cpu(void) { - local_dec(&__get_cpu_var(ftrace_cpu_disabled)); + __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); preempt_enable(); } @@ -129,7 +129,7 @@ static int tracing_set_tracer(const char *buf); static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; -static int __init set_ftrace(char *str) +static int __init set_cmdline_ftrace(char *str) { strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; @@ -137,7 +137,7 @@ static int __init set_ftrace(char *str) ring_buffer_expanded = 1; return 1; } -__setup("ftrace=", set_ftrace); +__setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { @@ -203,7 +203,7 @@ cycle_t ftrace_now(int cpu) */ static struct trace_array max_tr; -static DEFINE_PER_CPU(struct trace_array_cpu, max_data); +static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); /* tracer_enabled is used to toggle activation of a tracer */ static int tracer_enabled = 1; @@ -415,7 +415,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* read the non-space input */ while (cnt && !isspace(ch)) { - if (parser->idx < parser->size) + if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; else { ret = -EINVAL; @@ -493,15 +493,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) * protected by per_cpu spinlocks. But the action of the swap * needs its own lock. * - * This is defined as a raw_spinlock_t in order to help + * This is defined as a arch_spinlock_t in order to help * with performance when lockdep debugging is enabled. * * It is also used in other places outside the update_max_tr * so it needs to be defined outside of the * CONFIG_TRACER_MAX_TRACE. */ -static raw_spinlock_t ftrace_max_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t ftrace_max_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE unsigned long __read_mostly tracing_max_latency; @@ -555,13 +555,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; max_tr.buffer = buf; __update_max_tr(tr, tsk, cpu); - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); } /** @@ -581,7 +581,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); ftrace_disable_cpu(); @@ -603,7 +603,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); } #endif /* CONFIG_TRACER_MAX_TRACE */ @@ -802,7 +802,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; static int cmdline_idx; -static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; /* temporary disable recording */ static atomic_t trace_record_cmdline_disabled __read_mostly; @@ -915,7 +915,7 @@ static void trace_save_cmdline(struct task_struct *tsk) * nor do we want to disable interrupts, * so if we miss here, then better luck next time. */ - if (!__raw_spin_trylock(&trace_cmdline_lock)) + if (!arch_spin_trylock(&trace_cmdline_lock)) return; idx = map_pid_to_cmdline[tsk->pid]; @@ -940,7 +940,7 @@ static void trace_save_cmdline(struct task_struct *tsk) memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); - __raw_spin_unlock(&trace_cmdline_lock); + arch_spin_unlock(&trace_cmdline_lock); } void trace_find_cmdline(int pid, char comm[]) @@ -958,14 +958,14 @@ void trace_find_cmdline(int pid, char comm[]) } preempt_disable(); - __raw_spin_lock(&trace_cmdline_lock); + arch_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) strcpy(comm, saved_cmdlines[map]); else strcpy(comm, "<...>"); - __raw_spin_unlock(&trace_cmdline_lock); + arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); } @@ -1085,7 +1085,7 @@ trace_function(struct trace_array *tr, struct ftrace_entry *entry; /* If we are reading the ring buffer, don't trace */ - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) return; event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), @@ -1251,8 +1251,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - static raw_spinlock_t trace_buf_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + static arch_spinlock_t trace_buf_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static u32 trace_buf[TRACE_BUF_SIZE]; struct ftrace_event_call *call = &event_bprint; @@ -1283,7 +1283,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) /* Lockdep uses trace_printk for lock tracing */ local_irq_save(flags); - __raw_spin_lock(&trace_buf_lock); + arch_spin_lock(&trace_buf_lock); len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); if (len > TRACE_BUF_SIZE || len < 0) @@ -1304,7 +1304,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) ring_buffer_unlock_commit(buffer, event); out_unlock: - __raw_spin_unlock(&trace_buf_lock); + arch_spin_unlock(&trace_buf_lock); local_irq_restore(flags); out: @@ -1334,7 +1334,7 @@ int trace_array_printk(struct trace_array *tr, int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { - static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; + static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; struct ftrace_event_call *call = &event_print; @@ -1360,12 +1360,9 @@ int trace_array_vprintk(struct trace_array *tr, pause_graph_tracing(); raw_local_irq_save(irq_flags); - __raw_spin_lock(&trace_buf_lock); + arch_spin_lock(&trace_buf_lock); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); - len = min(len, TRACE_BUF_SIZE-1); - trace_buf[len] = 0; - size = sizeof(*entry) + len + 1; buffer = tr->buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, @@ -1373,15 +1370,15 @@ int trace_array_vprintk(struct trace_array *tr, if (!event) goto out_unlock; entry = ring_buffer_event_data(event); - entry->ip = ip; + entry->ip = ip; memcpy(&entry->buf, trace_buf, len); - entry->buf[len] = 0; + entry->buf[len] = '\0'; if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); out_unlock: - __raw_spin_unlock(&trace_buf_lock); + arch_spin_unlock(&trace_buf_lock); raw_local_irq_restore(irq_flags); unpause_graph_tracing(); out: @@ -1393,7 +1390,7 @@ int trace_array_vprintk(struct trace_array *tr, int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { - return trace_array_printk(&global_trace, ip, fmt, args); + return trace_array_vprintk(&global_trace, ip, fmt, args); } EXPORT_SYMBOL_GPL(trace_vprintk); @@ -1515,6 +1512,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) int i = (int)*pos; void *ent; + WARN_ON_ONCE(iter->leftover); + (*pos)++; /* can't go backwards */ @@ -1613,8 +1612,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) ; } else { - l = *pos - 1; - p = s_next(m, p, &l); + /* + * If we overflowed the seq_file before, then we want + * to just reuse the trace_seq buffer again. + */ + if (iter->leftover) + p = iter; + else { + l = *pos - 1; + p = s_next(m, p, &l); + } } trace_event_read_lock(); @@ -1922,6 +1929,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; + int ret; if (iter->ent == NULL) { if (iter->tr) { @@ -1941,9 +1949,27 @@ static int s_show(struct seq_file *m, void *v) if (!(trace_flags & TRACE_ITER_VERBOSE)) print_func_help_header(m); } + } else if (iter->leftover) { + /* + * If we filled the seq_file buffer earlier, we + * want to just show it now. + */ + ret = trace_print_seq(m, &iter->seq); + + /* ret should this time be zero, but you never know */ + iter->leftover = ret; + } else { print_trace_line(iter); - trace_print_seq(m, &iter->seq); + ret = trace_print_seq(m, &iter->seq); + /* + * If we overflow the seq_file buffer, then it will + * ask us for this data again at start up. + * Use that instead. + * ret is 0 if seq_file write succeeded. + * -1 otherwise. + */ + iter->leftover = ret; } return 0; @@ -1984,11 +2010,9 @@ __tracing_open(struct inode *inode, struct file *file) if (current_trace) *iter->trace = *current_trace; - if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) + if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - cpumask_clear(iter->started); - if (current_trace && current_trace->print_max) iter->tr = &max_tr; else @@ -2255,7 +2279,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); local_irq_disable(); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are @@ -2270,7 +2294,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, atomic_dec(&global_trace.data[cpu]->disabled); } } - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); local_irq_enable(); cpumask_copy(tracing_cpumask, tracing_cpumask_new); @@ -2442,7 +2466,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return ret; } - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2584,7 +2608,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, } mutex_unlock(&trace_types_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2766,7 +2790,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (err) return err; - filp->f_pos += ret; + *ppos += ret; return ret; } @@ -2899,6 +2923,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) else cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); + + if (iter->trace->pipe_close) + iter->trace->pipe_close(iter); + mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); @@ -3105,7 +3133,7 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, __free_page(spd->pages[idx]); } -static struct pipe_buf_operations tracing_pipe_buf_ops = { +static const struct pipe_buf_operations tracing_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -3301,7 +3329,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } } - filp->f_pos += cnt; + *ppos += cnt; /* If check pages failed, return ENOMEM */ if (tracing_disabled) @@ -3336,7 +3364,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { char *buf; - char *end; if (tracing_disabled) return -EINVAL; @@ -3344,7 +3371,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; - buf = kmalloc(cnt + 1, GFP_KERNEL); + buf = kmalloc(cnt + 2, GFP_KERNEL); if (buf == NULL) return -ENOMEM; @@ -3352,14 +3379,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, kfree(buf); return -EFAULT; } + if (buf[cnt-1] != '\n') { + buf[cnt] = '\n'; + buf[cnt+1] = '\0'; + } else + buf[cnt] = '\0'; - /* Cut from the first nil or newline. */ - buf[cnt] = '\0'; - end = strchr(buf, '\n'); - if (end) - *end = '\0'; - - cnt = mark_printk("%s\n", buf); + cnt = mark_printk("%s", buf); kfree(buf); *fpos += cnt; @@ -3591,7 +3617,7 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, } /* Pipe buffer operations for a buffer. */ -static struct pipe_buf_operations buffer_pipe_buf_ops = { +static const struct pipe_buf_operations buffer_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -3732,7 +3758,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) - return ENOMEM; + return -ENOMEM; trace_seq_init(s); @@ -4281,8 +4307,8 @@ trace_printk_seq(struct trace_seq *s) static void __ftrace_dump(bool disable_tracing) { - static raw_spinlock_t ftrace_dump_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + static arch_spinlock_t ftrace_dump_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; unsigned int old_userobj; @@ -4292,7 +4318,7 @@ static void __ftrace_dump(bool disable_tracing) /* only one dump */ local_irq_save(flags); - __raw_spin_lock(&ftrace_dump_lock); + arch_spin_lock(&ftrace_dump_lock); if (dump_ran) goto out; @@ -4367,7 +4393,7 @@ static void __ftrace_dump(bool disable_tracing) } out: - __raw_spin_unlock(&ftrace_dump_lock); + arch_spin_unlock(&ftrace_dump_lock); local_irq_restore(flags); } @@ -4389,7 +4415,7 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; - if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) goto out_free_tracing_cpumask; /* To save memory, keep the ring buffer size to its minimum */ @@ -4400,7 +4426,6 @@ __init static int tracer_alloc_buffers(void) cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); - cpumask_clear(tracing_reader_cpumask); /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(ring_buf_size, @@ -4429,7 +4454,7 @@ __init static int tracer_alloc_buffers(void) /* Allocate the first page for all buffers */ for_each_tracing_cpu(i) { global_trace.data[i] = &per_cpu(global_trace_cpu, i); - max_tr.data[i] = &per_cpu(max_data, i); + max_tr.data[i] = &per_cpu(max_tr_data, i); } trace_init_cmdlines(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 405cb850b75..a52bed2eedd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include <linux/ftrace.h> #include <trace/boot.h> #include <linux/kmemtrace.h> +#include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> #include <linux/ftrace_event.h> @@ -37,6 +38,7 @@ enum trace_type { TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_BLK, + TRACE_KSYM, __TRACE_LAST_TYPE, }; @@ -98,9 +100,32 @@ struct syscall_trace_enter { struct syscall_trace_exit { struct trace_entry ent; int nr; - unsigned long ret; + long ret; }; +struct kprobe_trace_entry { + struct trace_entry ent; + unsigned long ip; + int nargs; + unsigned long args[]; +}; + +#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ + (offsetof(struct kprobe_trace_entry, args) + \ + (sizeof(unsigned long) * (n))) + +struct kretprobe_trace_entry { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; + int nargs; + unsigned long args[]; +}; + +#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \ + (offsetof(struct kretprobe_trace_entry, args) + \ + (sizeof(unsigned long) * (n))) + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ + IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) @@ -246,6 +272,7 @@ struct tracer_flags { * @pipe_open: called when the trace_pipe file is opened * @wait_pipe: override how the user waits for traces on trace_pipe * @close: called when the trace file is released + * @pipe_close: called when the trace_pipe file is released * @read: override the default read callback on trace_pipe * @splice_read: override the default splice_read callback on trace_pipe * @selftest: selftest to run on boot (see trace_selftest.c) @@ -264,6 +291,7 @@ struct tracer { void (*pipe_open)(struct trace_iterator *iter); void (*wait_pipe)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); + void (*pipe_close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); @@ -364,6 +392,8 @@ int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); +extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); + extern unsigned long nsecs_to_usecs(unsigned long nsecs); #ifdef CONFIG_TRACER_MAX_TRACE @@ -413,7 +443,7 @@ extern int DYN_FTRACE_TEST_NAME(void); extern int ring_buffer_expanded; extern bool tracing_selftest_disabled; -DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); +DECLARE_PER_CPU(int, ftrace_cpu_disabled); #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, @@ -438,6 +468,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_ksym(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); @@ -483,10 +515,6 @@ static inline int ftrace_graph_addr(unsigned long addr) return 0; } #else -static inline int ftrace_trace_addr(unsigned long addr) -{ - return 1; -} static inline int ftrace_graph_addr(unsigned long addr) { return 1; @@ -500,12 +528,12 @@ print_graph_function(struct trace_iterator *iter) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -extern struct pid *ftrace_pid_trace; +extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER static inline int ftrace_trace_task(struct task_struct *task) { - if (!ftrace_pid_trace) + if (list_empty(&ftrace_pids)) return 1; return test_tsk_trace_trace(task); @@ -687,7 +715,6 @@ struct event_filter { int n_preds; struct filter_pred **preds; char *filter_string; - bool no_reset; }; struct event_subsystem { @@ -699,22 +726,40 @@ struct event_subsystem { }; struct filter_pred; +struct regex; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, int val1, int val2); +typedef int (*regex_match_func)(char *str, struct regex *r, int len); + +enum regex_type { + MATCH_FULL = 0, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +struct regex { + char pattern[MAX_FILTER_STR_VAL]; + int len; + int field_len; + regex_match_func match; +}; + struct filter_pred { - filter_pred_fn_t fn; - u64 val; - char str_val[MAX_FILTER_STR_VAL]; - int str_len; - char *field_name; - int offset; - int not; - int op; - int pop_n; + filter_pred_fn_t fn; + u64 val; + struct regex regex; + char *field_name; + int offset; + int not; + int op; + int pop_n; }; +extern enum regex_type +filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); extern int apply_event_filter(struct ftrace_event_call *call, @@ -730,7 +775,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { + if (unlikely(call->filter_active) && + !filter_match_preds(call->filter, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7a7a9fd249a..4a194f08f88 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -34,6 +34,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; + struct ring_buffer *buffer; unsigned long flags; int cpu, pc; const char *p; @@ -54,7 +55,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) goto out; pc = preempt_count(); - event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, sizeof(*entry), flags, pc); if (!event) goto out; @@ -74,8 +76,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 20c5f92e28a..84a3a7ba072 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -20,6 +20,8 @@ #include <linux/ktime.h> #include <linux/trace_clock.h> +#include "trace.h" + /* * trace_clock_local(): the simplest and least coherent tracing clock. * @@ -28,17 +30,17 @@ */ u64 notrace trace_clock_local(void) { - unsigned long flags; u64 clock; + int resched; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ - raw_local_irq_save(flags); + resched = ftrace_preempt_disable(); clock = sched_clock(); - raw_local_irq_restore(flags); + ftrace_preempt_enable(resched); return clock; } @@ -69,10 +71,10 @@ u64 notrace trace_clock(void) /* keep prev_time and lock in the same cacheline. */ static struct { u64 prev_time; - raw_spinlock_t lock; + arch_spinlock_t lock; } trace_clock_struct ____cacheline_aligned_in_smp = { - .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, + .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED, }; u64 notrace trace_clock_global(void) @@ -92,7 +94,7 @@ u64 notrace trace_clock_global(void) if (unlikely(in_nmi())) goto out; - __raw_spin_lock(&trace_clock_struct.lock); + arch_spin_lock(&trace_clock_struct.lock); /* * TODO: if this happens often then maybe we should reset @@ -104,7 +106,7 @@ u64 notrace trace_clock_global(void) trace_clock_struct.prev_time = now; - __raw_spin_unlock(&trace_clock_struct.lock); + arch_spin_unlock(&trace_clock_struct.lock); out: raw_local_irq_restore(flags); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ead3d724599..c16a08f399d 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, F_printk("type:%u call_site:%lx ptr:%p", __entry->type_id, __entry->call_site, __entry->ptr) ); + +FTRACE_ENTRY(ksym_trace, ksym_trace_entry, + + TRACE_KSYM, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned char, type ) + __array( char , cmd, TASK_COMM_LEN ) + __field( unsigned long, addr ) + ), + + F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", + (void *)__entry->ip, (unsigned int)__entry->type, + (void *)__entry->addr, __entry->cmd) +); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index dd44b876886..d9c60f80aa0 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -8,17 +8,14 @@ #include <linux/module.h> #include "trace.h" -/* - * We can't use a size but a type in alloc_percpu() - * So let's create a dummy type that matches the desired size - */ -typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t; -char *trace_profile_buf; -EXPORT_SYMBOL_GPL(trace_profile_buf); +char *perf_trace_buf; +EXPORT_SYMBOL_GPL(perf_trace_buf); + +char *perf_trace_buf_nmi; +EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); -char *trace_profile_buf_nmi; -EXPORT_SYMBOL_GPL(trace_profile_buf_nmi); +typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; /* Count the events in use (per event id, not per instance) */ static int total_profile_count; @@ -31,29 +28,34 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) if (atomic_inc_return(&event->profile_count)) return 0; - if (!total_profile_count++) { - buf = (char *)alloc_percpu(profile_buf_t); + if (!total_profile_count) { + buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf; - rcu_assign_pointer(trace_profile_buf, buf); + rcu_assign_pointer(perf_trace_buf, buf); - buf = (char *)alloc_percpu(profile_buf_t); + buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf_nmi; - rcu_assign_pointer(trace_profile_buf_nmi, buf); + rcu_assign_pointer(perf_trace_buf_nmi, buf); } - ret = event->profile_enable(); - if (!ret) + ret = event->profile_enable(event); + if (!ret) { + total_profile_count++; return 0; + } - kfree(trace_profile_buf_nmi); fail_buf_nmi: - kfree(trace_profile_buf); + if (!total_profile_count) { + free_percpu(perf_trace_buf_nmi); + free_percpu(perf_trace_buf); + perf_trace_buf_nmi = NULL; + perf_trace_buf = NULL; + } fail_buf: - total_profile_count--; atomic_dec(&event->profile_count); return ret; @@ -84,14 +86,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event) if (!atomic_add_negative(-1, &event->profile_count)) return; - event->profile_disable(); + event->profile_disable(event); if (!--total_profile_count) { - buf = trace_profile_buf; - rcu_assign_pointer(trace_profile_buf, NULL); + buf = perf_trace_buf; + rcu_assign_pointer(perf_trace_buf, NULL); - nmi_buf = trace_profile_buf_nmi; - rcu_assign_pointer(trace_profile_buf_nmi, NULL); + nmi_buf = perf_trace_buf_nmi; + rcu_assign_pointer(perf_trace_buf_nmi, NULL); /* * Ensure every events in profiling have finished before diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6f03c8a1105..1d18315dc83 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_define_common_fields); -#ifdef CONFIG_MODULES - -static void trace_destroy_fields(struct ftrace_event_call *call) +void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; @@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call) } } -#endif /* CONFIG_MODULES */ - static void ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { @@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, if (call->enabled) { call->enabled = 0; tracing_stop_cmdline_record(); - call->unregfunc(call->data); + call->unregfunc(call); } break; case 1: if (!call->enabled) { call->enabled = 1; tracing_start_cmdline_record(); - call->regfunc(call->data); + call->regfunc(call); } break; } @@ -232,10 +228,9 @@ ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; - size_t read = 0; - ssize_t ret; + ssize_t read, ret; - if (!cnt || cnt < 0) + if (!cnt) return 0; ret = tracing_update_buffers(); @@ -247,7 +242,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf, read = trace_get_user(&parser, ubuf, cnt, ppos); - if (trace_parser_loaded((&parser))) { + if (read >= 0 && trace_parser_loaded((&parser))) { int set = 1; if (*parser.buffer == '!') @@ -508,7 +503,7 @@ extern char *__bad_type_size(void); #define FIELD(type, name) \ sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name) + sizeof(field.name), is_signed_type(type) static int trace_write_header(struct trace_seq *s) { @@ -516,17 +511,17 @@ static int trace_write_header(struct trace_seq *s) /* struct trace_entry */ return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\n", - FIELD(unsigned short, type), - FIELD(unsigned char, flags), - FIELD(unsigned char, preempt_count), - FIELD(int, pid), - FIELD(int, lock_depth)); + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\n", + FIELD(unsigned short, type), + FIELD(unsigned char, flags), + FIELD(unsigned char, preempt_count), + FIELD(int, pid), + FIELD(int, lock_depth)); } static ssize_t @@ -879,9 +874,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } - entry = trace_create_file("enable", 0644, system->entry, - (void *)system->name, - &ftrace_system_enable_fops); + trace_create_file("enable", 0644, system->entry, + (void *)system->name, + &ftrace_system_enable_fops); return system->entry; } @@ -893,7 +888,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *filter, const struct file_operations *format) { - struct dentry *entry; int ret; /* @@ -911,12 +905,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, } if (call->regfunc) - entry = trace_create_file("enable", 0644, call->dir, call, - enable); + trace_create_file("enable", 0644, call->dir, call, + enable); if (call->id && call->profile_enable) - entry = trace_create_file("id", 0444, call->dir, call, - id); + trace_create_file("id", 0444, call->dir, call, + id); if (call->define_fields) { ret = call->define_fields(call); @@ -925,41 +919,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, " events/%s\n", call->name); return ret; } - entry = trace_create_file("filter", 0644, call->dir, call, - filter); + trace_create_file("filter", 0644, call->dir, call, + filter); } /* A trace may not want to export its format */ if (!call->show_format) return 0; - entry = trace_create_file("format", 0444, call->dir, call, - format); + trace_create_file("format", 0444, call->dir, call, + format); return 0; } -#define for_each_event(event, start, end) \ - for (event = start; \ - (unsigned long)event < (unsigned long)end; \ - event++) +static int __trace_add_event_call(struct ftrace_event_call *call) +{ + struct dentry *d_events; + int ret; -#ifdef CONFIG_MODULES + if (!call->name) + return -EINVAL; -static LIST_HEAD(ftrace_module_file_list); + if (call->raw_init) { + ret = call->raw_init(call); + if (ret < 0) { + if (ret != -ENOSYS) + pr_warning("Could not initialize trace " + "events/%s\n", call->name); + return ret; + } + } -/* - * Modules must own their file_operations to keep up with - * reference counting. - */ -struct ftrace_module_file_ops { - struct list_head list; - struct module *mod; - struct file_operations id; - struct file_operations enable; - struct file_operations format; - struct file_operations filter; -}; + d_events = event_trace_events_dir(); + if (!d_events) + return -ENOENT; + + ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (!ret) + list_add(&call->list, &ftrace_events); + + return ret; +} + +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct ftrace_event_call *call) +{ + int ret; + mutex_lock(&event_mutex); + ret = __trace_add_event_call(call); + mutex_unlock(&event_mutex); + return ret; +} static void remove_subsystem_dir(const char *name) { @@ -987,6 +1000,53 @@ static void remove_subsystem_dir(const char *name) } } +/* + * Must be called under locking both of event_mutex and trace_event_mutex. + */ +static void __trace_remove_event_call(struct ftrace_event_call *call) +{ + ftrace_event_enable_disable(call, 0); + if (call->event) + __unregister_ftrace_event(call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); + remove_subsystem_dir(call->system); +} + +/* Remove an event_call */ +void trace_remove_event_call(struct ftrace_event_call *call) +{ + mutex_lock(&event_mutex); + down_write(&trace_event_mutex); + __trace_remove_event_call(call); + up_write(&trace_event_mutex); + mutex_unlock(&event_mutex); +} + +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +#ifdef CONFIG_MODULES + +static LIST_HEAD(ftrace_module_file_list); + +/* + * Modules must own their file_operations to keep up with + * reference counting. + */ +struct ftrace_module_file_ops { + struct list_head list; + struct module *mod; + struct file_operations id; + struct file_operations enable; + struct file_operations format; + struct file_operations filter; +}; + static struct ftrace_module_file_ops * trace_create_file_ops(struct module *mod) { @@ -1044,7 +1104,7 @@ static void trace_module_add_events(struct module *mod) if (!call->name) continue; if (call->raw_init) { - ret = call->raw_init(); + ret = call->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1062,10 +1122,11 @@ static void trace_module_add_events(struct module *mod) return; } call->mod = mod; - list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); + ret = event_create_dir(call, d_events, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); + if (!ret) + list_add(&call->list, &ftrace_events); } } @@ -1079,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod) list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; - ftrace_event_enable_disable(call, 0); - if (call->event) - __unregister_ftrace_event(call->event); - debugfs_remove_recursive(call->dir); - list_del(&call->list); - trace_destroy_fields(call); - destroy_preds(call); - remove_subsystem_dir(call->system); + __trace_remove_event_call(call); } } @@ -1204,7 +1258,7 @@ static __init int event_trace_init(void) if (!call->name) continue; if (call->raw_init) { - ret = call->raw_init(); + ret = call->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1212,10 +1266,12 @@ static __init int event_trace_init(void) continue; } } - list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events, &ftrace_event_id_fops, - &ftrace_enable_fops, &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (!ret) + list_add(&call->list, &ftrace_events); } while (true) { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 23245785927..50504cb228d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -18,11 +18,10 @@ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> */ -#include <linux/debugfs.h> -#include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> +#include <linux/perf_event.h> #include "trace.h" #include "trace_output.h" @@ -31,6 +30,7 @@ enum filter_op_ids { OP_OR, OP_AND, + OP_GLOB, OP_NE, OP_EQ, OP_LT, @@ -48,16 +48,17 @@ struct filter_op { }; static struct filter_op filter_ops[] = { - { OP_OR, "||", 1 }, - { OP_AND, "&&", 2 }, - { OP_NE, "!=", 4 }, - { OP_EQ, "==", 4 }, - { OP_LT, "<", 5 }, - { OP_LE, "<=", 5 }, - { OP_GT, ">", 5 }, - { OP_GE, ">=", 5 }, - { OP_NONE, "OP_NONE", 0 }, - { OP_OPEN_PAREN, "(", 0 }, + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_GLOB, "~", 4 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, }; enum { @@ -197,9 +198,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event, char *addr = (char *)(event + pred->offset); int cmp, match; - cmp = strncmp(addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -211,9 +212,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, char **addr = (char **)(event + pred->offset); int cmp, match; - cmp = strncmp(*addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -237,9 +238,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, char *addr = (char *)(event + str_loc); int cmp, match; - cmp = strncmp(addr, pred->str_val, str_len); + cmp = pred->regex.match(addr, &pred->regex, str_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -250,10 +251,121 @@ static int filter_pred_none(struct filter_pred *pred, void *event, return 0; } +/* Basic regex callbacks */ +static int regex_match_full(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_front(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_middle(char *str, struct regex *r, int len) +{ + if (strstr(str, r->pattern)) + return 1; + return 0; +} + +static int regex_match_end(char *str, struct regex *r, int len) +{ + char *ptr = strstr(str, r->pattern); + + if (ptr && (ptr[r->len] == 0)) + return 1; + return 0; +} + +/** + * filter_parse_regex - parse a basic regex + * @buff: the raw regex + * @len: length of the regex + * @search: will point to the beginning of the string to compare + * @not: tell whether the match will have to be inverted + * + * This passes in a buffer containing a regex and this function will + * set search to point to the search part of the buffer and + * return the type of search it is (see enum above). + * This does modify buff. + * + * Returns enum type. + * search returns the pointer to use for comparison. + * not returns 1 if buff started with a '!' + * 0 otherwise. + */ +enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) +{ + int type = MATCH_FULL; + int i; + + if (buff[0] == '!') { + *not = 1; + buff++; + len--; + } else + *not = 0; + + *search = buff; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + *search = buff + 1; + type = MATCH_END_ONLY; + } else { + if (type == MATCH_END_ONLY) + type = MATCH_MIDDLE_ONLY; + else + type = MATCH_FRONT_ONLY; + buff[i] = 0; + break; + } + } + } + + return type; +} + +static void filter_build_regex(struct filter_pred *pred) +{ + struct regex *r = &pred->regex; + char *search; + enum regex_type type = MATCH_FULL; + int not = 0; + + if (pred->op == OP_GLOB) { + type = filter_parse_regex(r->pattern, r->len, &search, ¬); + r->len = strlen(search); + memmove(r->pattern, search, r->len+1); + } + + switch (type) { + case MATCH_FULL: + r->match = regex_match_full; + break; + case MATCH_FRONT_ONLY: + r->match = regex_match_front; + break; + case MATCH_MIDDLE_ONLY: + r->match = regex_match_middle; + break; + case MATCH_END_ONLY: + r->match = regex_match_end; + break; + } + + pred->not ^= not; +} + /* return 1 if event matches, 0 otherwise (discard) */ -int filter_match_preds(struct ftrace_event_call *call, void *rec) +int filter_match_preds(struct event_filter *filter, void *rec) { - struct event_filter *filter = call->filter; int match, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; @@ -396,7 +508,7 @@ static void filter_clear_pred(struct filter_pred *pred) { kfree(pred->field_name); pred->field_name = NULL; - pred->str_len = 0; + pred->regex.len = 0; } static int filter_set_pred(struct filter_pred *dest, @@ -426,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } -void destroy_preds(struct ftrace_event_call *call) +static void __free_preds(struct event_filter *filter) { - struct event_filter *filter = call->filter; int i; if (!filter) @@ -441,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call) kfree(filter->preds); kfree(filter->filter_string); kfree(filter); +} + +void destroy_preds(struct ftrace_event_call *call) +{ + __free_preds(call->filter); call->filter = NULL; + call->filter_active = 0; } -static int init_preds(struct ftrace_event_call *call) +static struct event_filter *__alloc_preds(void) { struct event_filter *filter; struct filter_pred *pred; int i; - if (call->filter) - return 0; - - filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!call->filter) - return -ENOMEM; + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); filter->n_preds = 0; @@ -471,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call) filter->preds[i] = pred; } - return 0; + return filter; oom: - destroy_preds(call); + __free_preds(filter); + return ERR_PTR(-ENOMEM); +} + +static int init_preds(struct ftrace_event_call *call) +{ + if (call->filter) + return 0; + + call->filter_active = 0; + call->filter = __alloc_preds(); + if (IS_ERR(call->filter)) + return PTR_ERR(call->filter); - return -ENOMEM; + return 0; } static int init_subsystem_preds(struct event_subsystem *system) @@ -499,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system) return 0; } -enum { - FILTER_DISABLE_ALL, - FILTER_INIT_NO_RESET, - FILTER_SKIP_NO_RESET, -}; - -static void filter_free_subsystem_preds(struct event_subsystem *system, - int flag) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; @@ -517,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, if (strcmp(call->system, system->name) != 0) continue; - if (flag == FILTER_INIT_NO_RESET) { - call->filter->no_reset = false; - continue; - } - - if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset) - continue; - filter_disable_preds(call); remove_filter_string(call->filter); } @@ -532,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, filter_pred_fn_t fn) { - struct event_filter *filter = call->filter; int idx, err; if (filter->n_preds == MAX_FILTER_PRED) { @@ -550,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return err; filter->n_preds++; - call->filter_active = 1; return 0; } @@ -575,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field) static int is_legal_op(struct ftrace_event_field *field, int op) { - if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) + if (is_string_field(field) && + (op != OP_EQ && op != OP_NE && op != OP_GLOB)) + return 0; + if (!is_string_field(field) && op == OP_GLOB) return 0; return 1; @@ -626,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, bool dry_run) { @@ -660,21 +774,22 @@ static int filter_add_pred(struct filter_parse_state *ps, } if (is_string_field(field)) { - pred->str_len = field->size; + filter_build_regex(pred); - if (field->filter_type == FILTER_STATIC_STRING) + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; - else if (field->filter_type == FILTER_DYN_STRING) + pred->regex.field_len = field->size; + } else if (field->filter_type == FILTER_DYN_STRING) fn = filter_pred_strloc; else { fn = filter_pred_pchar; - pred->str_len = strlen(pred->str_val); + pred->regex.field_len = strlen(pred->regex.pattern); } } else { if (field->is_signed) - ret = strict_strtoll(pred->str_val, 0, &val); + ret = strict_strtoll(pred->regex.pattern, 0, &val); else - ret = strict_strtoull(pred->str_val, 0, &val); + ret = strict_strtoull(pred->regex.pattern, 0, &val); if (ret) { parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; @@ -694,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, pred, fn); - return 0; -} - -static int filter_add_subsystem_pred(struct filter_parse_state *ps, - struct event_subsystem *system, - struct filter_pred *pred, - char *filter_string, - bool dry_run) -{ - struct ftrace_event_call *call; - int err = 0; - bool fail = true; - - list_for_each_entry(call, &ftrace_events, list) { - - if (!call->define_fields) - continue; - - if (strcmp(call->system, system->name)) - continue; - - if (call->filter->no_reset) - continue; - - err = filter_add_pred(ps, call, pred, dry_run); - if (err) - call->filter->no_reset = true; - else - fail = false; - - if (!dry_run) - replace_filter_string(call->filter, filter_string); - } - - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return err; - } + return filter_add_pred_fn(ps, call, filter, pred, fn); return 0; } @@ -933,8 +1010,9 @@ static void postfix_clear(struct filter_parse_state *ps) while (!list_empty(&ps->postfix)) { elt = list_first_entry(&ps->postfix, struct postfix_elt, list); - kfree(elt->operand); list_del(&elt->list); + kfree(elt->operand); + kfree(elt); } } @@ -1044,8 +1122,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2) return NULL; } - strcpy(pred->str_val, operand2); - pred->str_len = strlen(operand2); + strcpy(pred->regex.pattern, operand2); + pred->regex.len = strlen(pred->regex.pattern); pred->op = op; @@ -1089,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps) return 0; } -static int replace_preds(struct event_subsystem *system, - struct ftrace_event_call *call, +static int replace_preds(struct ftrace_event_call *call, + struct event_filter *filter, struct filter_parse_state *ps, char *filter_string, bool dry_run) @@ -1137,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system, add_pred: if (!pred) return -ENOMEM; - if (call) - err = filter_add_pred(ps, call, pred, false); - else - err = filter_add_subsystem_pred(ps, system, pred, - filter_string, dry_run); + err = filter_add_pred(ps, call, filter, pred, dry_run); filter_free_pred(pred); if (err) return err; @@ -1152,10 +1226,50 @@ add_pred: return 0; } -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +static int replace_system_preds(struct event_subsystem *system, + struct filter_parse_state *ps, + char *filter_string) { + struct ftrace_event_call *call; + bool fail = true; int err; + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (!call->define_fields) + continue; + + if (strcmp(call->system, system->name) != 0) + continue; + + /* try to see if the filter can be applied */ + err = replace_preds(call, filter, ps, filter_string, true); + if (err) + continue; + + /* really apply the filter */ + filter_disable_preds(call); + err = replace_preds(call, filter, ps, filter_string, false); + if (err) + filter_disable_preds(call); + else { + call->filter_active = 1; + replace_filter_string(filter, filter_string); + } + fail = false; + } + + if (fail) { + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; + } + return 0; +} + +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + int err; struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1167,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); remove_filter_string(call->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1186,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } - err = replace_preds(NULL, call, ps, filter_string, false); + err = replace_preds(call, call->filter, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); - + else + call->filter_active = 1; out: filter_opstack_clear(ps); postfix_clear(ps); @@ -1204,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { int err; - struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1214,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out_unlock; if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); + filter_free_subsystem_preds(system); remove_filter_string(system->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1234,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out; } - filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); - - /* try to see the filter can be applied to which events */ - err = replace_preds(system, NULL, ps, filter_string, true); - if (err) { + err = replace_system_preds(system, ps, filter_string); + if (err) append_filter_err(ps, system->filter); - goto out; + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +#ifdef CONFIG_EVENT_PROFILE + +void ftrace_profile_free_filter(struct perf_event *event) +{ + struct event_filter *filter = event->filter; + + event->filter = NULL; + __free_preds(filter); +} + +int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str) +{ + int err; + struct event_filter *filter; + struct filter_parse_state *ps; + struct ftrace_event_call *call = NULL; + + mutex_lock(&event_mutex); + + list_for_each_entry(call, &ftrace_events, list) { + if (call->id == event_id) + break; } - filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); + err = -EINVAL; + if (!call) + goto out_unlock; - /* really apply the filter to the events */ - err = replace_preds(system, NULL, ps, filter_string, false); - if (err) { - append_filter_err(ps, system->filter); - filter_free_subsystem_preds(system, 2); + err = -EEXIST; + if (event->filter) + goto out_unlock; + + filter = __alloc_preds(); + if (IS_ERR(filter)) { + err = PTR_ERR(filter); + goto out_unlock; } -out: + err = -ENOMEM; + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto free_preds; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err) + goto free_ps; + + err = replace_preds(call, filter, ps, filter_str, false); + if (!err) + event->filter = filter; + +free_ps: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); + +free_preds: + if (err) + __free_preds(filter); + out_unlock: mutex_unlock(&event_mutex); return err; } +#endif /* CONFIG_EVENT_PROFILE */ + diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 9753fcc61bc..dff8c84ddf1 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -48,11 +48,11 @@ struct ____ftrace_##name { \ tstruct \ }; \ -static void __used ____ftrace_check_##name(void) \ +static void __always_unused ____ftrace_check_##name(void) \ { \ struct ____ftrace_##name *__entry = NULL; \ \ - /* force cmpile-time check on F_printk() */ \ + /* force compile-time check on F_printk() */ \ printk(print); \ } @@ -66,44 +66,47 @@ static void __used ____ftrace_check_##name(void) \ #undef __field #define __field(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (!ret) \ return 0; #undef __field_desc #define __field_desc(type, container, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), container.item), \ - sizeof(field.container.item)); \ + sizeof(field.container.item), \ + is_signed_type(type)); \ if (!ret) \ return 0; #undef __array #define __array(type, item, len) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed_type(type)); \ if (!ret) \ return 0; #undef __array_desc #define __array_desc(type, container, item, len) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), container.item), \ - sizeof(field.container.item)); \ + sizeof(field.container.item), \ + is_signed_type(type)); \ if (!ret) \ return 0; #undef __dynamic_array #define __dynamic_array(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:0;\n", \ - offsetof(typeof(field), item)); \ + "offset:%zu;\tsize:0;\tsigned:%u;\n", \ + offsetof(typeof(field), item), \ + is_signed_type(type)); \ if (!ret) \ return 0; @@ -131,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ #include "trace_entries.h" - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ @@ -193,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #include "trace_entries.h" +static int ftrace_raw_init_event(struct ftrace_event_call *call) +{ + INIT_LIST_HEAD(&call->fields); + return 0; +} #undef __field #define __field(type, item) @@ -211,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ -static int ftrace_raw_init_event_##call(void); \ \ struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ @@ -219,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .id = type, \ .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event_##call, \ + .raw_init = ftrace_raw_init_event, \ .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ }; \ -static int ftrace_raw_init_event_##call(void) \ -{ \ - INIT_LIST_HEAD(&event_##call.fields); \ - return 0; \ -} \ #include "trace_entries.h" diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 45e6c01b2e4..b1342c5d37c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -14,9 +14,20 @@ #include "trace.h" #include "trace_output.h" -struct fgraph_data { +struct fgraph_cpu_data { pid_t last_pid; int depth; + int ignore; +}; + +struct fgraph_data { + struct fgraph_cpu_data *cpu_data; + + /* Place to preserve last processed entry. */ + struct ftrace_graph_ent_entry ent; + struct ftrace_graph_ret_entry ret; + int failed; + int cpu; }; #define TRACE_GRAPH_INDENT 2 @@ -176,7 +187,7 @@ static int __trace_graph_entry(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ent_entry *entry; - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) return 0; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, @@ -240,7 +251,7 @@ static void __trace_graph_return(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ret_entry *entry; - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) return; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, @@ -384,7 +395,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) if (!data) return TRACE_TYPE_HANDLED; - last_pid = &(per_cpu_ptr(data, cpu)->last_pid); + last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); if (*last_pid == pid) return TRACE_TYPE_HANDLED; @@ -435,26 +446,49 @@ static struct ftrace_graph_ret_entry * get_return_for_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *curr) { - struct ring_buffer_iter *ring_iter; + struct fgraph_data *data = iter->private; + struct ring_buffer_iter *ring_iter = NULL; struct ring_buffer_event *event; struct ftrace_graph_ret_entry *next; - ring_iter = iter->buffer_iter[iter->cpu]; + /* + * If the previous output failed to write to the seq buffer, + * then we just reuse the data from before. + */ + if (data && data->failed) { + curr = &data->ent; + next = &data->ret; + } else { - /* First peek to compare current entry and the next one */ - if (ring_iter) - event = ring_buffer_iter_peek(ring_iter, NULL); - else { - /* We need to consume the current entry to see the next one */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); - event = ring_buffer_peek(iter->tr->buffer, iter->cpu, - NULL); - } + ring_iter = iter->buffer_iter[iter->cpu]; + + /* First peek to compare current entry and the next one */ + if (ring_iter) + event = ring_buffer_iter_peek(ring_iter, NULL); + else { + /* + * We need to consume the current entry to see + * the next one. + */ + ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + event = ring_buffer_peek(iter->tr->buffer, iter->cpu, + NULL); + } - if (!event) - return NULL; + if (!event) + return NULL; + + next = ring_buffer_event_data(event); - next = ring_buffer_event_data(event); + if (data) { + /* + * Save current and next entries for later reference + * if the output fails. + */ + data->ent = *curr; + data->ret = *next; + } + } if (next->ent.type != TRACE_GRAPH_RET) return NULL; @@ -640,7 +674,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, if (data) { int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); /* * Comments display at + 1 to depth. Since @@ -688,7 +722,7 @@ print_graph_entry_nested(struct trace_iterator *iter, if (data) { int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); *depth = call->depth; } @@ -782,19 +816,34 @@ static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter) { - int cpu = iter->cpu; + struct fgraph_data *data = iter->private; struct ftrace_graph_ent *call = &field->graph_ent; struct ftrace_graph_ret_entry *leaf_ret; + static enum print_line_t ret; + int cpu = iter->cpu; if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) return TRACE_TYPE_PARTIAL_LINE; leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) - return print_graph_entry_leaf(iter, field, leaf_ret, s); + ret = print_graph_entry_leaf(iter, field, leaf_ret, s); else - return print_graph_entry_nested(iter, field, s, cpu); + ret = print_graph_entry_nested(iter, field, s, cpu); + if (data) { + /* + * If we failed to write our output, then we need to make + * note of it. Because we already consumed our entry. + */ + if (s->full) { + data->failed = 1; + data->cpu = cpu; + } else + data->failed = 0; + } + + return ret; } static enum print_line_t @@ -810,7 +859,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (data) { int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); /* * Comments display at + 1 to depth. This is the @@ -873,7 +922,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, int i; if (data) - depth = per_cpu_ptr(data, iter->cpu)->depth; + depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; if (print_graph_prologue(iter, s, 0, 0)) return TRACE_TYPE_PARTIAL_LINE; @@ -941,8 +990,33 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t print_graph_function(struct trace_iterator *iter) { + struct ftrace_graph_ent_entry *field; + struct fgraph_data *data = iter->private; struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; + int cpu = iter->cpu; + int ret; + + if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) { + per_cpu_ptr(data->cpu_data, cpu)->ignore = 0; + return TRACE_TYPE_HANDLED; + } + + /* + * If the last output failed, there's a possibility we need + * to print out the missing entry which would never go out. + */ + if (data && data->failed) { + field = &data->ent; + iter->cpu = data->cpu; + ret = print_graph_entry(field, s, iter); + if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { + per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; + ret = TRACE_TYPE_NO_CONSUME; + } + iter->cpu = cpu; + return ret; + } switch (entry->type) { case TRACE_GRAPH_ENT: { @@ -952,7 +1026,7 @@ print_graph_function(struct trace_iterator *iter) * sizeof(struct ftrace_graph_ent_entry) is very small, * it can be safely saved at the stack. */ - struct ftrace_graph_ent_entry *field, saved; + struct ftrace_graph_ent_entry saved; trace_assign_type(field, entry); saved = *field; return print_graph_entry(&saved, s, iter); @@ -1030,31 +1104,54 @@ static void print_graph_headers(struct seq_file *s) static void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ - struct fgraph_data *data = alloc_percpu(struct fgraph_data); + struct fgraph_data *data; int cpu; + iter->private = NULL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) - pr_warning("function graph tracer: not enough memory\n"); - else - for_each_possible_cpu(cpu) { - pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); - int *depth = &(per_cpu_ptr(data, cpu)->depth); - *pid = -1; - *depth = 0; - } + goto out_err; + + data->cpu_data = alloc_percpu(struct fgraph_cpu_data); + if (!data->cpu_data) + goto out_err_free; + + for_each_possible_cpu(cpu) { + pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); + *pid = -1; + *depth = 0; + *ignore = 0; + } iter->private = data; + + return; + + out_err_free: + kfree(data); + out_err: + pr_warning("function graph tracer: not enough memory\n"); } static void graph_trace_close(struct trace_iterator *iter) { - free_percpu(iter->private); + struct fgraph_data *data = iter->private; + + if (data) { + free_percpu(data->cpu_data); + kfree(data); + } } static struct tracer graph_trace __read_mostly = { .name = "function_graph", .open = graph_trace_open, + .pipe_open = graph_trace_open, .close = graph_trace_close, + .pipe_close = graph_trace_close, .wait_pipe = poll_wait_pipe, .init = graph_trace_init, .reset = graph_trace_reset, diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 23b63859130..7b97000745f 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -20,10 +20,10 @@ #define BTS_BUFFER_SIZE (1 << 13) -static DEFINE_PER_CPU(struct bts_tracer *, tracer); -static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); +static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer); +static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer); -#define this_tracer per_cpu(tracer, smp_processor_id()) +#define this_tracer per_cpu(hwb_tracer, smp_processor_id()) static int trace_hw_branches_enabled __read_mostly; static int trace_hw_branches_suspended __read_mostly; @@ -32,12 +32,13 @@ static struct trace_array *hw_branch_trace __read_mostly; static void bts_trace_init_cpu(int cpu) { - per_cpu(tracer, cpu) = - ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); + per_cpu(hwb_tracer, cpu) = + ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu), + BTS_BUFFER_SIZE, NULL, (size_t)-1, + BTS_KERNEL); - if (IS_ERR(per_cpu(tracer, cpu))) - per_cpu(tracer, cpu) = NULL; + if (IS_ERR(per_cpu(hwb_tracer, cpu))) + per_cpu(hwb_tracer, cpu) = NULL; } static int bts_trace_init(struct trace_array *tr) @@ -51,7 +52,7 @@ static int bts_trace_init(struct trace_array *tr) for_each_online_cpu(cpu) { bts_trace_init_cpu(cpu); - if (likely(per_cpu(tracer, cpu))) + if (likely(per_cpu(hwb_tracer, cpu))) trace_hw_branches_enabled = 1; } trace_hw_branches_suspended = 0; @@ -67,9 +68,9 @@ static void bts_trace_reset(struct trace_array *tr) get_online_cpus(); for_each_online_cpu(cpu) { - if (likely(per_cpu(tracer, cpu))) { - ds_release_bts(per_cpu(tracer, cpu)); - per_cpu(tracer, cpu) = NULL; + if (likely(per_cpu(hwb_tracer, cpu))) { + ds_release_bts(per_cpu(hwb_tracer, cpu)); + per_cpu(hwb_tracer, cpu) = NULL; } } trace_hw_branches_enabled = 0; @@ -83,8 +84,8 @@ static void bts_trace_start(struct trace_array *tr) get_online_cpus(); for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_resume_bts(per_cpu(tracer, cpu)); + if (likely(per_cpu(hwb_tracer, cpu))) + ds_resume_bts(per_cpu(hwb_tracer, cpu)); trace_hw_branches_suspended = 0; put_online_cpus(); } @@ -95,8 +96,8 @@ static void bts_trace_stop(struct trace_array *tr) get_online_cpus(); for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_suspend_bts(per_cpu(tracer, cpu)); + if (likely(per_cpu(hwb_tracer, cpu))) + ds_suspend_bts(per_cpu(hwb_tracer, cpu)); trace_hw_branches_suspended = 1; put_online_cpus(); } @@ -114,16 +115,16 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, bts_trace_init_cpu(cpu); if (trace_hw_branches_suspended && - likely(per_cpu(tracer, cpu))) - ds_suspend_bts(per_cpu(tracer, cpu)); + likely(per_cpu(hwb_tracer, cpu))) + ds_suspend_bts(per_cpu(hwb_tracer, cpu)); } break; case CPU_DOWN_PREPARE: /* The notification is sent with interrupts enabled. */ - if (likely(per_cpu(tracer, cpu))) { - ds_release_bts(per_cpu(tracer, cpu)); - per_cpu(tracer, cpu) = NULL; + if (likely(per_cpu(hwb_tracer, cpu))) { + ds_release_bts(per_cpu(hwb_tracer, cpu)); + per_cpu(hwb_tracer, cpu) = NULL; } } @@ -165,6 +166,7 @@ void trace_hw_branch(u64 from, u64 to) struct ftrace_event_call *call = &event_hw_branch; struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; + struct ring_buffer *buf; struct hw_branch_entry *entry; unsigned long irq1; int cpu; @@ -180,7 +182,8 @@ void trace_hw_branch(u64 from, u64 to) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, + buf = tr->buffer; + event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES, sizeof(*entry), 0, 0); if (!event) goto out; @@ -189,8 +192,8 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, buf, event)) + trace_buffer_unlock_commit(buf, event, 0, 0); out: atomic_dec(&tr->data[cpu]->disabled); @@ -256,8 +259,8 @@ static void trace_bts_prepare(struct trace_iterator *iter) get_online_cpus(); for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_suspend_bts(per_cpu(tracer, cpu)); + if (likely(per_cpu(hwb_tracer, cpu))) + ds_suspend_bts(per_cpu(hwb_tracer, cpu)); /* * We need to collect the trace on the respective cpu since ftrace * implicitly adds the record for the current cpu. @@ -266,8 +269,8 @@ static void trace_bts_prepare(struct trace_iterator *iter) on_each_cpu(trace_bts_cpu, iter->tr, 1); for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_resume_bts(per_cpu(tracer, cpu)); + if (likely(per_cpu(hwb_tracer, cpu))) + ds_resume_bts(per_cpu(hwb_tracer, cpu)); put_online_cpus(); } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c new file mode 100644 index 00000000000..b52d397e57e --- /dev/null +++ b/kernel/trace/trace_kprobe.c @@ -0,0 +1,1542 @@ +/* + * Kprobes-based tracing events + * + * Created by Masami Hiramatsu <mhiramat@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/kprobes.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/debugfs.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/ptrace.h> +#include <linux/perf_event.h> + +#include "trace.h" +#include "trace_output.h" + +#define MAX_TRACE_ARGS 128 +#define MAX_ARGSTR_LEN 63 +#define MAX_EVENT_NAME_LEN 64 +#define KPROBE_EVENT_SYSTEM "kprobes" + +/* Reserved field names */ +#define FIELD_STRING_IP "__probe_ip" +#define FIELD_STRING_NARGS "__probe_nargs" +#define FIELD_STRING_RETIP "__probe_ret_ip" +#define FIELD_STRING_FUNC "__probe_func" + +const char *reserved_field_names[] = { + "common_type", + "common_flags", + "common_preempt_count", + "common_pid", + "common_tgid", + "common_lock_depth", + FIELD_STRING_IP, + FIELD_STRING_NARGS, + FIELD_STRING_RETIP, + FIELD_STRING_FUNC, +}; + +struct fetch_func { + unsigned long (*func)(struct pt_regs *, void *); + void *data; +}; + +static __kprobes unsigned long call_fetch(struct fetch_func *f, + struct pt_regs *regs) +{ + return f->func(regs, f->data); +} + +/* fetch handlers */ +static __kprobes unsigned long fetch_register(struct pt_regs *regs, + void *offset) +{ + return regs_get_register(regs, (unsigned int)((unsigned long)offset)); +} + +static __kprobes unsigned long fetch_stack(struct pt_regs *regs, + void *num) +{ + return regs_get_kernel_stack_nth(regs, + (unsigned int)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) +{ + unsigned long retval; + + if (probe_kernel_address(addr, retval)) + return 0; + return retval; +} + +static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num) +{ + return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, + void *dummy) +{ + return regs_return_value(regs); +} + +static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, + void *dummy) +{ + return kernel_stack_pointer(regs); +} + +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; +}; + +static unsigned long update_symbol_cache(struct symbol_cache *sc) +{ + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + if (sc->addr) + sc->addr += sc->offset; + return sc->addr; +} + +static void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + + if (!sym || strlen(sym) == 0) + return NULL; + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + + update_symbol_cache(sc); + return sc; +} + +static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) +{ + struct symbol_cache *sc = data; + + if (sc->addr) + return fetch_memory(regs, (void *)sc->addr); + else + return 0; +} + +/* Special indirect memory access interface */ +struct indirect_fetch_data { + struct fetch_func orig; + long offset; +}; + +static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) +{ + struct indirect_fetch_data *ind = data; + unsigned long addr; + + addr = call_fetch(&ind->orig, regs); + if (addr) { + addr += ind->offset; + return fetch_memory(regs, (void *)addr); + } else + return 0; +} + +static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +{ + if (data->orig.func == fetch_indirect) + free_indirect_fetch_data(data->orig.data); + else if (data->orig.func == fetch_symbol) + free_symbol_cache(data->orig.data); + kfree(data); +} + +/** + * Kprobe event core functions + */ + +struct probe_arg { + struct fetch_func fetch; + const char *name; +}; + +/* Flags for trace_probe */ +#define TP_FLAG_TRACE 1 +#define TP_FLAG_PROFILE 2 + +struct trace_probe { + struct list_head list; + struct kretprobe rp; /* Use rp.kp for kprobe use */ + unsigned long nhit; + unsigned int flags; /* For TP_FLAG_* */ + const char *symbol; /* symbol name */ + struct ftrace_event_call call; + struct trace_event event; + unsigned int nr_args; + struct probe_arg args[]; +}; + +#define SIZEOF_TRACE_PROBE(n) \ + (offsetof(struct trace_probe, args) + \ + (sizeof(struct probe_arg) * (n))) + +static __kprobes int probe_is_return(struct trace_probe *tp) +{ + return tp->rp.handler != NULL; +} + +static __kprobes const char *probe_symbol(struct trace_probe *tp) +{ + return tp->symbol ? tp->symbol : "unknown"; +} + +static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) +{ + int ret = -EINVAL; + + if (ff->func == fetch_argument) + ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data); + else if (ff->func == fetch_register) { + const char *name; + name = regs_query_register_name((unsigned int)((long)ff->data)); + ret = snprintf(buf, n, "%%%s", name); + } else if (ff->func == fetch_stack) + ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); + else if (ff->func == fetch_memory) + ret = snprintf(buf, n, "@0x%p", ff->data); + else if (ff->func == fetch_symbol) { + struct symbol_cache *sc = ff->data; + if (sc->offset) + ret = snprintf(buf, n, "@%s%+ld", sc->symbol, + sc->offset); + else + ret = snprintf(buf, n, "@%s", sc->symbol); + } else if (ff->func == fetch_retvalue) + ret = snprintf(buf, n, "$retval"); + else if (ff->func == fetch_stack_address) + ret = snprintf(buf, n, "$stack"); + else if (ff->func == fetch_indirect) { + struct indirect_fetch_data *id = ff->data; + size_t l = 0; + ret = snprintf(buf, n, "%+ld(", id->offset); + if (ret >= n) + goto end; + l += ret; + ret = probe_arg_string(buf + l, n - l, &id->orig); + if (ret < 0) + goto end; + l += ret; + ret = snprintf(buf + l, n - l, ")"); + ret += l; + } +end: + if (ret >= n) + return -ENOSPC; + return ret; +} + +static int register_probe_event(struct trace_probe *tp); +static void unregister_probe_event(struct trace_probe *tp); + +static DEFINE_MUTEX(probe_lock); +static LIST_HEAD(probe_list); + +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); +static int kretprobe_dispatcher(struct kretprobe_instance *ri, + struct pt_regs *regs); + +/* + * Allocate new trace_probe and initialize it (including kprobes). + */ +static struct trace_probe *alloc_trace_probe(const char *group, + const char *event, + void *addr, + const char *symbol, + unsigned long offs, + int nargs, int is_return) +{ + struct trace_probe *tp; + + tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + if (symbol) { + tp->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tp->symbol) + goto error; + tp->rp.kp.symbol_name = tp->symbol; + tp->rp.kp.offset = offs; + } else + tp->rp.kp.addr = addr; + + if (is_return) + tp->rp.handler = kretprobe_dispatcher; + else + tp->rp.kp.pre_handler = kprobe_dispatcher; + + if (!event) + goto error; + tp->call.name = kstrdup(event, GFP_KERNEL); + if (!tp->call.name) + goto error; + + if (!group) + goto error; + tp->call.system = kstrdup(group, GFP_KERNEL); + if (!tp->call.system) + goto error; + + INIT_LIST_HEAD(&tp->list); + return tp; +error: + kfree(tp->call.name); + kfree(tp->symbol); + kfree(tp); + return ERR_PTR(-ENOMEM); +} + +static void free_probe_arg(struct probe_arg *arg) +{ + if (arg->fetch.func == fetch_symbol) + free_symbol_cache(arg->fetch.data); + else if (arg->fetch.func == fetch_indirect) + free_indirect_fetch_data(arg->fetch.data); + kfree(arg->name); +} + +static void free_trace_probe(struct trace_probe *tp) +{ + int i; + + for (i = 0; i < tp->nr_args; i++) + free_probe_arg(&tp->args[i]); + + kfree(tp->call.system); + kfree(tp->call.name); + kfree(tp->symbol); + kfree(tp); +} + +static struct trace_probe *find_probe_event(const char *event, + const char *group) +{ + struct trace_probe *tp; + + list_for_each_entry(tp, &probe_list, list) + if (strcmp(tp->call.name, event) == 0 && + strcmp(tp->call.system, group) == 0) + return tp; + return NULL; +} + +/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +static void unregister_trace_probe(struct trace_probe *tp) +{ + if (probe_is_return(tp)) + unregister_kretprobe(&tp->rp); + else + unregister_kprobe(&tp->rp.kp); + list_del(&tp->list); + unregister_probe_event(tp); +} + +/* Register a trace_probe and probe_event */ +static int register_trace_probe(struct trace_probe *tp) +{ + struct trace_probe *old_tp; + int ret; + + mutex_lock(&probe_lock); + + /* register as an event */ + old_tp = find_probe_event(tp->call.name, tp->call.system); + if (old_tp) { + /* delete old event */ + unregister_trace_probe(old_tp); + free_trace_probe(old_tp); + } + ret = register_probe_event(tp); + if (ret) { + pr_warning("Faild to register probe event(%d)\n", ret); + goto end; + } + + tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; + if (probe_is_return(tp)) + ret = register_kretprobe(&tp->rp); + else + ret = register_kprobe(&tp->rp.kp); + + if (ret) { + pr_warning("Could not insert probe(%d)\n", ret); + if (ret == -EILSEQ) { + pr_warning("Probing address(0x%p) is not an " + "instruction boundary.\n", + tp->rp.kp.addr); + ret = -EINVAL; + } + unregister_probe_event(tp); + } else + list_add_tail(&tp->list, &probe_list); +end: + mutex_unlock(&probe_lock); + return ret; +} + +/* Split symbol and offset. */ +static int split_symbol_offset(char *symbol, unsigned long *offset) +{ + char *tmp; + int ret; + + if (!offset) + return -EINVAL; + + tmp = strchr(symbol, '+'); + if (tmp) { + /* skip sign because strict_strtol doesn't accept '+' */ + ret = strict_strtoul(tmp + 1, 0, offset); + if (ret) + return ret; + *tmp = '\0'; + } else + *offset = 0; + return 0; +} + +#define PARAM_MAX_ARGS 16 +#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) + +static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) +{ + int ret = 0; + unsigned long param; + + if (strcmp(arg, "retval") == 0) { + if (is_return) { + ff->func = fetch_retvalue; + ff->data = NULL; + } else + ret = -EINVAL; + } else if (strncmp(arg, "stack", 5) == 0) { + if (arg[5] == '\0') { + ff->func = fetch_stack_address; + ff->data = NULL; + } else if (isdigit(arg[5])) { + ret = strict_strtoul(arg + 5, 10, ¶m); + if (ret || param > PARAM_MAX_STACK) + ret = -EINVAL; + else { + ff->func = fetch_stack; + ff->data = (void *)param; + } + } else + ret = -EINVAL; + } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) { + ret = strict_strtoul(arg + 3, 10, ¶m); + if (ret || param > PARAM_MAX_ARGS) + ret = -EINVAL; + else { + ff->func = fetch_argument; + ff->data = (void *)param; + } + } else + ret = -EINVAL; + return ret; +} + +/* Recursive argument parser */ +static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +{ + int ret = 0; + unsigned long param; + long offset; + char *tmp; + + switch (arg[0]) { + case '$': + ret = parse_probe_vars(arg + 1, ff, is_return); + break; + case '%': /* named register */ + ret = regs_query_register_offset(arg + 1); + if (ret >= 0) { + ff->func = fetch_register; + ff->data = (void *)(unsigned long)ret; + ret = 0; + } + break; + case '@': /* memory or symbol */ + if (isdigit(arg[1])) { + ret = strict_strtoul(arg + 1, 0, ¶m); + if (ret) + break; + ff->func = fetch_memory; + ff->data = (void *)param; + } else { + ret = split_symbol_offset(arg + 1, &offset); + if (ret) + break; + ff->data = alloc_symbol_cache(arg + 1, offset); + if (ff->data) + ff->func = fetch_symbol; + else + ret = -EINVAL; + } + break; + case '+': /* indirect memory */ + case '-': + tmp = strchr(arg, '('); + if (!tmp) { + ret = -EINVAL; + break; + } + *tmp = '\0'; + ret = strict_strtol(arg + 1, 0, &offset); + if (ret) + break; + if (arg[0] == '-') + offset = -offset; + arg = tmp + 1; + tmp = strrchr(arg, ')'); + if (tmp) { + struct indirect_fetch_data *id; + *tmp = '\0'; + id = kzalloc(sizeof(struct indirect_fetch_data), + GFP_KERNEL); + if (!id) + return -ENOMEM; + id->offset = offset; + ret = __parse_probe_arg(arg, &id->orig, is_return); + if (ret) + kfree(id); + else { + ff->func = fetch_indirect; + ff->data = (void *)id; + } + } else + ret = -EINVAL; + break; + default: + /* TODO: support custom handler */ + ret = -EINVAL; + } + return ret; +} + +/* String length checking wrapper */ +static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +{ + if (strlen(arg) > MAX_ARGSTR_LEN) { + pr_info("Argument is too long.: %s\n", arg); + return -ENOSPC; + } + return __parse_probe_arg(arg, ff, is_return); +} + +/* Return 1 if name is reserved or already used by another argument */ +static int conflict_field_name(const char *name, + struct probe_arg *args, int narg) +{ + int i; + for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) + if (strcmp(reserved_field_names[i], name) == 0) + return 1; + for (i = 0; i < narg; i++) + if (strcmp(args[i].name, name) == 0) + return 1; + return 0; +} + +static int create_trace_probe(int argc, char **argv) +{ + /* + * Argument syntax: + * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] + * Fetch args: + * $argN : fetch Nth of function argument. (N:0-) + * $retval : fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth of stack (N:0-) + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * %REG : fetch register REG + * Indirect memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. + */ + struct trace_probe *tp; + int i, ret = 0; + int is_return = 0, is_delete = 0; + char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; + unsigned long offset = 0; + void *addr = NULL; + char buf[MAX_EVENT_NAME_LEN]; + + /* argc must be >= 1 */ + if (argv[0][0] == 'p') + is_return = 0; + else if (argv[0][0] == 'r') + is_return = 1; + else if (argv[0][0] == '-') + is_delete = 1; + else { + pr_info("Probe definition must be started with 'p', 'r' or" + " '-'.\n"); + return -EINVAL; + } + + if (argv[0][1] == ':') { + event = &argv[0][2]; + if (strchr(event, '/')) { + group = event; + event = strchr(group, '/') + 1; + event[-1] = '\0'; + if (strlen(group) == 0) { + pr_info("Group name is not specifiled\n"); + return -EINVAL; + } + } + if (strlen(event) == 0) { + pr_info("Event name is not specifiled\n"); + return -EINVAL; + } + } + if (!group) + group = KPROBE_EVENT_SYSTEM; + + if (is_delete) { + if (!event) { + pr_info("Delete command needs an event name.\n"); + return -EINVAL; + } + tp = find_probe_event(event, group); + if (!tp) { + pr_info("Event %s/%s doesn't exist.\n", group, event); + return -ENOENT; + } + /* delete an event */ + unregister_trace_probe(tp); + free_trace_probe(tp); + return 0; + } + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } + if (isdigit(argv[1][0])) { + if (is_return) { + pr_info("Return probe point must be a symbol.\n"); + return -EINVAL; + } + /* an address specified */ + ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + if (ret) { + pr_info("Failed to parse address.\n"); + return ret; + } + } else { + /* a symbol specified */ + symbol = argv[1]; + /* TODO: support .init module functions */ + ret = split_symbol_offset(symbol, &offset); + if (ret) { + pr_info("Failed to parse symbol.\n"); + return ret; + } + if (offset && is_return) { + pr_info("Return probe must be used without offset.\n"); + return -EINVAL; + } + } + argc -= 2; argv += 2; + + /* setup a probe */ + if (!event) { + /* Make a new event name */ + if (symbol) + snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld", + is_return ? 'r' : 'p', symbol, offset); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p", + is_return ? 'r' : 'p', addr); + event = buf; + } + tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, + is_return); + if (IS_ERR(tp)) { + pr_info("Failed to allocate trace_probe.(%d)\n", + (int)PTR_ERR(tp)); + return PTR_ERR(tp); + } + + /* parse arguments */ + ret = 0; + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + /* Parse argument name */ + arg = strchr(argv[i], '='); + if (arg) + *arg++ = '\0'; + else + arg = argv[i]; + + if (conflict_field_name(argv[i], tp->args, i)) { + pr_info("Argument%d name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } + + tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + if (!tp->args[i].name) { + pr_info("Failed to allocate argument%d name '%s'.\n", + i, argv[i]); + ret = -ENOMEM; + goto error; + } + + /* Parse fetch argument */ + ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); + if (ret) { + pr_info("Parse error at argument%d. (%d)\n", i, ret); + kfree(tp->args[i].name); + goto error; + } + + tp->nr_args++; + } + + ret = register_trace_probe(tp); + if (ret) + goto error; + return 0; + +error: + free_trace_probe(tp); + return ret; +} + +static void cleanup_all_probes(void) +{ + struct trace_probe *tp; + + mutex_lock(&probe_lock); + /* TODO: Use batch unregistration */ + while (!list_empty(&probe_list)) { + tp = list_entry(probe_list.next, struct trace_probe, list); + unregister_trace_probe(tp); + free_trace_probe(tp); + } + mutex_unlock(&probe_lock); +} + + +/* Probes listing interfaces */ +static void *probes_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&probe_lock); + return seq_list_start(&probe_list, *pos); +} + +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &probe_list, pos); +} + +static void probes_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&probe_lock); +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + int i, ret; + char buf[MAX_ARGSTR_LEN + 1]; + + seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); + seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); + + if (!tp->symbol) + seq_printf(m, " 0x%p", tp->rp.kp.addr); + else if (tp->rp.kp.offset) + seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); + else + seq_printf(m, " %s", probe_symbol(tp)); + + for (i = 0; i < tp->nr_args; i++) { + ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); + if (ret < 0) { + pr_warning("Argument%d decoding error(%d).\n", i, ret); + return ret; + } + seq_printf(m, " %s=%s", tp->args[i].name, buf); + } + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations probes_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + cleanup_all_probes(); + + return seq_open(file, &probes_seq_op); +} + +static int command_trace_probe(const char *buf) +{ + char **argv; + int argc = 0, ret = 0; + + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = create_trace_probe(argc, argv); + + argv_free(argv); + return ret; +} + +#define WRITE_BUFSIZE 128 + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *kbuf, *tmp; + int ret; + size_t done; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = done = 0; + while (done < count) { + size = count - done; + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + tmp = strchr(kbuf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - kbuf + 1; + } else if (done + size < count) { + pr_warning("Line length is too long: " + "Should be less than %d.", WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + done += size; + /* Remove comments */ + tmp = strchr(kbuf, '#'); + if (tmp) + *tmp = '\0'; + + ret = command_trace_probe(kbuf); + if (ret) + goto out; + } + ret = done; +out: + kfree(kbuf); + return ret; +} + +static const struct file_operations kprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + + seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, + tp->rp.kp.nmissed); + + return 0; +} + +static const struct seq_operations profile_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &profile_seq_op); +} + +static const struct file_operations kprobe_profile_ops = { + .owner = THIS_MODULE, + .open = profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* Kprobe handler */ +static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct kprobe_trace_entry *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &tp->call; + + tp->nhit++; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + + event = trace_current_buffer_lock_reserve(&buffer, call->id, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->ip = (unsigned long)kp->addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + + if (!filter_current_check_discard(buffer, call, entry, event)) + trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + return 0; +} + +/* Kretprobe handler */ +static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct kretprobe_trace_entry *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &tp->call; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + + event = trace_current_buffer_lock_reserve(&buffer, call->id, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + + if (!filter_current_check_discard(buffer, call, entry, event)) + trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + + return 0; +} + +/* Event entry printers */ +enum print_line_t +print_kprobe_event(struct trace_iterator *iter, int flags) +{ + struct kprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + struct trace_event *event; + struct trace_probe *tp; + int i; + + field = (struct kprobe_trace_entry *)iter->ent; + event = ftrace_find_event(field->ent.type); + tp = container_of(event, struct trace_probe, event); + + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ")")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " %s=%lx", + tp->args[i].name, field->args[i])) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +enum print_line_t +print_kretprobe_event(struct trace_iterator *iter, int flags) +{ + struct kretprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + struct trace_event *event; + struct trace_probe *tp; + int i; + + field = (struct kretprobe_trace_entry *)iter->ent; + event = ftrace_find_event(field->ent.type); + tp = container_of(event, struct trace_probe, event); + + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, " <- ")) + goto partial; + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ")")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " %s=%lx", + tp->args[i].name, field->args[i])) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int probe_event_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags |= TP_FLAG_TRACE; + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->rp.kp); +} + +static void probe_event_disable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags &= ~TP_FLAG_TRACE; + if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} + +static int probe_event_raw_init(struct ftrace_event_call *event_call) +{ + INIT_LIST_HEAD(&event_call->fields); + + return 0; +} + +#undef DEFINE_FIELD +#define DEFINE_FIELD(type, item, name, is_signed) \ + do { \ + ret = trace_define_field(event_call, #type, name, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed, \ + FILTER_OTHER); \ + if (ret) \ + return ret; \ + } while (0) + +static int kprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kprobe_trace_entry field; + struct trace_probe *tp = (struct trace_probe *)event_call->data; + + ret = trace_define_common_fields(event_call); + if (ret) + return ret; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) + DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + return 0; +} + +static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kretprobe_trace_entry field; + struct trace_probe *tp = (struct trace_probe *)event_call->data; + + ret = trace_define_common_fields(event_call); + if (ret) + return ret; + + DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); + DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) + DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + return 0; +} + +static int __probe_event_show_format(struct trace_seq *s, + struct trace_probe *tp, const char *fmt, + const char *arg) +{ + int i; + + /* Show format */ + if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) + return 0; + + for (i = 0; i < tp->nr_args; i++) + if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) + return 0; + + if (!trace_seq_printf(s, "\", %s", arg)) + return 0; + + for (i = 0; i < tp->nr_args; i++) + if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) + return 0; + + return trace_seq_puts(s, "\n"); +} + +#undef SHOW_FIELD +#define SHOW_FIELD(type, item, name) \ + do { \ + ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ + "offset:%u;\tsize:%u;\n", name, \ + (unsigned int)offsetof(typeof(field), item),\ + (unsigned int)sizeof(type)); \ + if (!ret) \ + return 0; \ + } while (0) + +static int kprobe_event_show_format(struct ftrace_event_call *call, + struct trace_seq *s) +{ + struct kprobe_trace_entry field __attribute__((unused)); + int ret, i; + struct trace_probe *tp = (struct trace_probe *)call->data; + + SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); + SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + + /* Show fields */ + for (i = 0; i < tp->nr_args; i++) + SHOW_FIELD(unsigned long, args[i], tp->args[i].name); + trace_seq_puts(s, "\n"); + + return __probe_event_show_format(s, tp, "(%lx)", + "REC->" FIELD_STRING_IP); +} + +static int kretprobe_event_show_format(struct ftrace_event_call *call, + struct trace_seq *s) +{ + struct kretprobe_trace_entry field __attribute__((unused)); + int ret, i; + struct trace_probe *tp = (struct trace_probe *)call->data; + + SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); + SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); + SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + + /* Show fields */ + for (i = 0; i < tp->nr_args; i++) + SHOW_FIELD(unsigned long, args[i], tp->args[i].name); + trace_seq_puts(s, "\n"); + + return __probe_event_show_format(s, tp, "(%lx <- %lx)", + "REC->" FIELD_STRING_FUNC + ", REC->" FIELD_STRING_RETIP); +} + +#ifdef CONFIG_EVENT_PROFILE + +/* Kprobe profile handler */ +static __kprobes int kprobe_profile_func(struct kprobe *kp, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct ftrace_event_call *call = &tp->call; + struct kprobe_trace_entry *entry; + struct trace_entry *ent; + int size, __size, i, pc, __cpu; + unsigned long irq_flags; + char *trace_buf; + char *raw_data; + int rctx; + + pc = preempt_count(); + __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return 0; + + /* + * Protect the non nmi buffer + * This also protects the rcu read side + */ + local_irq_save(irq_flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + + __cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto end; + + raw_data = per_cpu_ptr(trace_buf, __cpu); + + /* Zero dead bytes from alignment to avoid buffer leak to userspace */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + entry = (struct kprobe_trace_entry *)raw_data; + ent = &entry->ent; + + tracing_generic_entry_update(ent, irq_flags, pc); + ent->type = call->id; + entry->nargs = tp->nr_args; + entry->ip = (unsigned long)kp->addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + perf_tp_event(call->id, entry->ip, 1, entry, size); + +end: + perf_swevent_put_recursion_context(rctx); +end_recursion: + local_irq_restore(irq_flags); + + return 0; +} + +/* Kretprobe profile handler */ +static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct ftrace_event_call *call = &tp->call; + struct kretprobe_trace_entry *entry; + struct trace_entry *ent; + int size, __size, i, pc, __cpu; + unsigned long irq_flags; + char *trace_buf; + char *raw_data; + int rctx; + + pc = preempt_count(); + __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return 0; + + /* + * Protect the non nmi buffer + * This also protects the rcu read side + */ + local_irq_save(irq_flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + + __cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto end; + + raw_data = per_cpu_ptr(trace_buf, __cpu); + + /* Zero dead bytes from alignment to avoid buffer leak to userspace */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + entry = (struct kretprobe_trace_entry *)raw_data; + ent = &entry->ent; + + tracing_generic_entry_update(ent, irq_flags, pc); + ent->type = call->id; + entry->nargs = tp->nr_args; + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + perf_tp_event(call->id, entry->ret_ip, 1, entry, size); + +end: + perf_swevent_put_recursion_context(rctx); +end_recursion: + local_irq_restore(irq_flags); + + return 0; +} + +static int probe_profile_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags |= TP_FLAG_PROFILE; + + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->rp.kp); +} + +static void probe_profile_disable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags &= ~TP_FLAG_PROFILE; + + if (!(tp->flags & TP_FLAG_TRACE)) { + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} +#endif /* CONFIG_EVENT_PROFILE */ + + +static __kprobes +int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + + if (tp->flags & TP_FLAG_TRACE) + kprobe_trace_func(kp, regs); +#ifdef CONFIG_EVENT_PROFILE + if (tp->flags & TP_FLAG_PROFILE) + kprobe_profile_func(kp, regs); +#endif /* CONFIG_EVENT_PROFILE */ + return 0; /* We don't tweek kernel, so just return 0 */ +} + +static __kprobes +int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + + if (tp->flags & TP_FLAG_TRACE) + kretprobe_trace_func(ri, regs); +#ifdef CONFIG_EVENT_PROFILE + if (tp->flags & TP_FLAG_PROFILE) + kretprobe_profile_func(ri, regs); +#endif /* CONFIG_EVENT_PROFILE */ + return 0; /* We don't tweek kernel, so just return 0 */ +} + +static int register_probe_event(struct trace_probe *tp) +{ + struct ftrace_event_call *call = &tp->call; + int ret; + + /* Initialize ftrace_event_call */ + if (probe_is_return(tp)) { + tp->event.trace = print_kretprobe_event; + call->raw_init = probe_event_raw_init; + call->show_format = kretprobe_event_show_format; + call->define_fields = kretprobe_event_define_fields; + } else { + tp->event.trace = print_kprobe_event; + call->raw_init = probe_event_raw_init; + call->show_format = kprobe_event_show_format; + call->define_fields = kprobe_event_define_fields; + } + call->event = &tp->event; + call->id = register_ftrace_event(&tp->event); + if (!call->id) + return -ENODEV; + call->enabled = 0; + call->regfunc = probe_event_enable; + call->unregfunc = probe_event_disable; + +#ifdef CONFIG_EVENT_PROFILE + atomic_set(&call->profile_count, -1); + call->profile_enable = probe_profile_enable; + call->profile_disable = probe_profile_disable; +#endif + call->data = tp; + ret = trace_add_event_call(call); + if (ret) { + pr_info("Failed to register kprobe event: %s\n", call->name); + unregister_ftrace_event(&tp->event); + } + return ret; +} + +static void unregister_probe_event(struct trace_probe *tp) +{ + /* tp->event is unregistered in trace_remove_event_call() */ + trace_remove_event_call(&tp->call); +} + +/* Make a debugfs interface for controling probe points */ +static __init int init_kprobe_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("kprobe_events", 0644, d_tracer, + NULL, &kprobe_events_ops); + + /* Event list interface */ + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_events' entry\n"); + + /* Profile interface */ + entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, + NULL, &kprobe_profile_ops); + + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_profile' entry\n"); + return 0; +} +fs_initcall(init_kprobe_trace); + + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static int kprobe_trace_selftest_target(int a1, int a2, int a3, + int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} + +static __init int kprobe_trace_self_tests_init(void) +{ + int ret; + int (*target)(int, int, int, int, int, int); + + target = kprobe_trace_selftest_target; + + pr_info("Testing kprobe tracing: "); + + ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " + "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function entry\n"); + + ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " + "$retval"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function return\n"); + + ret = target(1, 2, 3, 4, 5, 6); + + cleanup_all_probes(); + + pr_cont("OK\n"); + return 0; +} + +late_initcall(kprobe_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c new file mode 100644 index 00000000000..acb87d4a4ac --- /dev/null +++ b/kernel/trace/trace_ksym.c @@ -0,0 +1,551 @@ +/* + * trace_ksym.c - Kernel Symbol Tracer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + */ + +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/fs.h> + +#include "trace_output.h" +#include "trace_stat.h" +#include "trace.h" + +#include <linux/hw_breakpoint.h> +#include <asm/hw_breakpoint.h> + +/* + * For now, let us restrict the no. of symbols traced simultaneously to number + * of available hardware breakpoint registers. + */ +#define KSYM_TRACER_MAX HBP_NUM + +#define KSYM_TRACER_OP_LEN 3 /* rw- */ + +struct trace_ksym { + struct perf_event **ksym_hbp; + struct perf_event_attr attr; +#ifdef CONFIG_PROFILE_KSYM_TRACER + unsigned long counter; +#endif + struct hlist_node ksym_hlist; +}; + +static struct trace_array *ksym_trace_array; + +static unsigned int ksym_filter_entry_count; +static unsigned int ksym_tracing_enabled; + +static HLIST_HEAD(ksym_filter_head); + +static DEFINE_MUTEX(ksym_tracer_mutex); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + +#define MAX_UL_INT 0xffffffff + +void ksym_collect_stats(unsigned long hbp_hit_addr) +{ + struct hlist_node *node; + struct trace_ksym *entry; + + rcu_read_lock(); + hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { + if ((entry->attr.bp_addr == hbp_hit_addr) && + (entry->counter <= MAX_UL_INT)) { + entry->counter++; + break; + } + } + rcu_read_unlock(); +} +#endif /* CONFIG_PROFILE_KSYM_TRACER */ + +void ksym_hbp_handler(struct perf_event *hbp, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct ring_buffer_event *event; + struct ksym_trace_entry *entry; + struct ring_buffer *buffer; + int pc; + + if (!ksym_tracing_enabled) + return; + + buffer = ksym_trace_array->buffer; + + pc = preempt_count(); + + event = trace_buffer_lock_reserve(buffer, TRACE_KSYM, + sizeof(*entry), 0, pc); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->ip = instruction_pointer(regs); + entry->type = hw_breakpoint_type(hbp); + entry->addr = hw_breakpoint_addr(hbp); + strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + ksym_collect_stats(hw_breakpoint_addr(hbp)); +#endif /* CONFIG_PROFILE_KSYM_TRACER */ + + trace_buffer_unlock_commit(buffer, event, 0, pc); +} + +/* Valid access types are represented as + * + * rw- : Set Read/Write Access Breakpoint + * -w- : Set Write Access Breakpoint + * --- : Clear Breakpoints + * --x : Set Execution Break points (Not available yet) + * + */ +static int ksym_trace_get_access_type(char *str) +{ + int access = 0; + + if (str[0] == 'r') + access |= HW_BREAKPOINT_R; + + if (str[1] == 'w') + access |= HW_BREAKPOINT_W; + + if (str[2] == 'x') + access |= HW_BREAKPOINT_X; + + switch (access) { + case HW_BREAKPOINT_R: + case HW_BREAKPOINT_W: + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + return access; + default: + return -EINVAL; + } +} + +/* + * There can be several possible malformed requests and we attempt to capture + * all of them. We enumerate some of the rules + * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. + * i.e. multiple ':' symbols disallowed. Possible uses are of the form + * <module>:<ksym_name>:<op>. + * 2. No delimiter symbol ':' in the input string + * 3. Spurious operator symbols or symbols not in their respective positions + * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file + * 5. Kernel symbol not a part of /proc/kallsyms + * 6. Duplicate requests + */ +static int parse_ksym_trace_str(char *input_string, char **ksymname, + unsigned long *addr) +{ + int ret; + + *ksymname = strsep(&input_string, ":"); + *addr = kallsyms_lookup_name(*ksymname); + + /* Check for malformed request: (2), (1) and (5) */ + if ((!input_string) || + (strlen(input_string) != KSYM_TRACER_OP_LEN) || + (*addr == 0)) + return -EINVAL;; + + ret = ksym_trace_get_access_type(input_string); + + return ret; +} + +int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) +{ + struct trace_ksym *entry; + int ret = -ENOMEM; + + if (ksym_filter_entry_count >= KSYM_TRACER_MAX) { + printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No" + " new requests for tracing can be accepted now.\n", + KSYM_TRACER_MAX); + return -ENOSPC; + } + + entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + hw_breakpoint_init(&entry->attr); + + entry->attr.bp_type = op; + entry->attr.bp_addr = addr; + entry->attr.bp_len = HW_BREAKPOINT_LEN_4; + + ret = -EAGAIN; + entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, + ksym_hbp_handler); + + if (IS_ERR(entry->ksym_hbp)) { + ret = PTR_ERR(entry->ksym_hbp); + printk(KERN_INFO "ksym_tracer request failed. Try again" + " later!!\n"); + goto err; + } + + hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); + ksym_filter_entry_count++; + + return 0; + +err: + kfree(entry); + + return ret; +} + +static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_ksym *entry; + struct hlist_node *node; + struct trace_seq *s; + ssize_t cnt = 0; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + trace_seq_init(s); + + mutex_lock(&ksym_tracer_mutex); + + hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { + ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr); + if (entry->attr.bp_type == HW_BREAKPOINT_R) + ret = trace_seq_puts(s, "r--\n"); + else if (entry->attr.bp_type == HW_BREAKPOINT_W) + ret = trace_seq_puts(s, "-w-\n"); + else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) + ret = trace_seq_puts(s, "rw-\n"); + WARN_ON_ONCE(!ret); + } + + cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + + mutex_unlock(&ksym_tracer_mutex); + + kfree(s); + + return cnt; +} + +static void __ksym_trace_reset(void) +{ + struct trace_ksym *entry; + struct hlist_node *node, *node1; + + mutex_lock(&ksym_tracer_mutex); + hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, + ksym_hlist) { + unregister_wide_hw_breakpoint(entry->ksym_hbp); + ksym_filter_entry_count--; + hlist_del_rcu(&(entry->ksym_hlist)); + synchronize_rcu(); + kfree(entry); + } + mutex_unlock(&ksym_tracer_mutex); +} + +static ssize_t ksym_trace_filter_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct trace_ksym *entry; + struct hlist_node *node; + char *input_string, *ksymname = NULL; + unsigned long ksym_addr = 0; + int ret, op, changed = 0; + + input_string = kzalloc(count + 1, GFP_KERNEL); + if (!input_string) + return -ENOMEM; + + if (copy_from_user(input_string, buffer, count)) { + kfree(input_string); + return -EFAULT; + } + input_string[count] = '\0'; + + strstrip(input_string); + + /* + * Clear all breakpoints if: + * 1: echo > ksym_trace_filter + * 2: echo 0 > ksym_trace_filter + * 3: echo "*:---" > ksym_trace_filter + */ + if (!input_string[0] || !strcmp(input_string, "0") || + !strcmp(input_string, "*:---")) { + __ksym_trace_reset(); + kfree(input_string); + return count; + } + + ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); + if (ret < 0) { + kfree(input_string); + return ret; + } + + mutex_lock(&ksym_tracer_mutex); + + ret = -EINVAL; + hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { + if (entry->attr.bp_addr == ksym_addr) { + /* Check for malformed request: (6) */ + if (entry->attr.bp_type != op) + changed = 1; + else + goto out; + break; + } + } + if (changed) { + unregister_wide_hw_breakpoint(entry->ksym_hbp); + entry->attr.bp_type = op; + ret = 0; + if (op > 0) { + entry->ksym_hbp = + register_wide_hw_breakpoint(&entry->attr, + ksym_hbp_handler); + if (IS_ERR(entry->ksym_hbp)) + ret = PTR_ERR(entry->ksym_hbp); + else + goto out; + } + /* Error or "symbol:---" case: drop it */ + ksym_filter_entry_count--; + hlist_del_rcu(&(entry->ksym_hlist)); + synchronize_rcu(); + kfree(entry); + goto out; + } else { + /* Check for malformed request: (4) */ + if (op == 0) + goto out; + ret = process_new_ksym_entry(ksymname, op, ksym_addr); + } +out: + mutex_unlock(&ksym_tracer_mutex); + + kfree(input_string); + + if (!ret) + ret = count; + return ret; +} + +static const struct file_operations ksym_tracing_fops = { + .open = tracing_open_generic, + .read = ksym_trace_filter_read, + .write = ksym_trace_filter_write, +}; + +static void ksym_trace_reset(struct trace_array *tr) +{ + ksym_tracing_enabled = 0; + __ksym_trace_reset(); +} + +static int ksym_trace_init(struct trace_array *tr) +{ + int cpu, ret = 0; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + ksym_tracing_enabled = 1; + ksym_trace_array = tr; + + return ret; +} + +static void ksym_trace_print_header(struct seq_file *m) +{ + seq_puts(m, + "# TASK-PID CPU# Symbol " + "Type Function\n"); + seq_puts(m, + "# | | | " + " | |\n"); +} + +static enum print_line_t ksym_trace_output(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct ksym_trace_entry *field; + char str[KSYM_SYMBOL_LEN]; + int ret; + + if (entry->type != TRACE_KSYM) + return TRACE_TYPE_UNHANDLED; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, + entry->pid, iter->cpu, (char *)field->addr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + switch (field->type) { + case HW_BREAKPOINT_R: + ret = trace_seq_printf(s, " R "); + break; + case HW_BREAKPOINT_W: + ret = trace_seq_printf(s, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + ret = trace_seq_printf(s, " RW "); + break; + default: + return TRACE_TYPE_PARTIAL_LINE; + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + sprint_symbol(str, field->ip); + ret = trace_seq_printf(s, "%s\n", str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +struct tracer ksym_tracer __read_mostly = +{ + .name = "ksym_tracer", + .init = ksym_trace_init, + .reset = ksym_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_ksym, +#endif + .print_header = ksym_trace_print_header, + .print_line = ksym_trace_output +}; + +__init static int init_ksym_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + ksym_filter_entry_count = 0; + + entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer, + NULL, &ksym_tracing_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'ksym_trace_filter' file\n"); + + return register_tracer(&ksym_tracer); +} +device_initcall(init_ksym_trace); + + +#ifdef CONFIG_PROFILE_KSYM_TRACER +static int ksym_tracer_stat_headers(struct seq_file *m) +{ + seq_puts(m, " Access Type "); + seq_puts(m, " Symbol Counter\n"); + seq_puts(m, " ----------- "); + seq_puts(m, " ------ -------\n"); + return 0; +} + +static int ksym_tracer_stat_show(struct seq_file *m, void *v) +{ + struct hlist_node *stat = v; + struct trace_ksym *entry; + int access_type = 0; + char fn_name[KSYM_NAME_LEN]; + + entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); + + access_type = entry->attr.bp_type; + + switch (access_type) { + case HW_BREAKPOINT_R: + seq_puts(m, " R "); + break; + case HW_BREAKPOINT_W: + seq_puts(m, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + seq_puts(m, " RW "); + break; + default: + seq_puts(m, " NA "); + } + + if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) + seq_printf(m, " %-36s", fn_name); + else + seq_printf(m, " %-36s", "<NA>"); + seq_printf(m, " %15lu\n", entry->counter); + + return 0; +} + +static void *ksym_tracer_stat_start(struct tracer_stat *trace) +{ + return ksym_filter_head.first; +} + +static void * +ksym_tracer_stat_next(void *v, int idx) +{ + struct hlist_node *stat = v; + + return stat->next; +} + +static struct tracer_stat ksym_tracer_stats = { + .name = "ksym_tracer", + .stat_start = ksym_tracer_stat_start, + .stat_next = ksym_tracer_stat_next, + .stat_headers = ksym_tracer_stat_headers, + .stat_show = ksym_tracer_stat_show +}; + +__init static int ksym_tracer_stat_init(void) +{ + int ret; + + ret = register_stat_tracer(&ksym_tracer_stats); + if (ret) { + printk(KERN_WARNING "Warning: could not register " + "ksym tracer stats\n"); + return 1; + } + + return 0; +} +fs_initcall(ksym_tracer_stat_init); +#endif /* CONFIG_PROFILE_KSYM_TRACER */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index f572f44c6e1..8e46b3323cd 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; -void trace_print_seq(struct seq_file *m, struct trace_seq *s) +int trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + int ret; + + ret = seq_write(m, s->buffer, len); - seq_write(m, s->buffer, len); + /* + * Only reset this buffer if we successfully wrote to the + * seq_file buffer. + */ + if (!ret) + trace_seq_init(s); - trace_seq_init(s); + return ret; } enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) @@ -69,6 +77,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) * @s: trace sequence descriptor * @fmt: printf format string * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * * The tracer may use either sequence operations or its own * copy to user routines. To simplify formating of a trace * trace_seq_printf is used to store strings into a special @@ -82,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) va_list ap; int ret; - if (!len) + if (s->full || !len) return 0; va_start(ap, fmt); @@ -90,12 +101,14 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) va_end(ap); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; - return len; + return 1; } EXPORT_SYMBOL_GPL(trace_seq_printf); @@ -116,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) int len = (PAGE_SIZE - 1) - s->len; int ret; - if (!len) + if (s->full || !len) return 0; ret = vsnprintf(s->buffer + s->len, len, fmt, args); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; @@ -136,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) int len = (PAGE_SIZE - 1) - s->len; int ret; - if (!len) + if (s->full || !len) return 0; ret = bstr_printf(s->buffer + s->len, len, fmt, binary); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; @@ -164,9 +181,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str) { int len = strlen(str); - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) return 0; + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; + return 0; + } + memcpy(s->buffer + s->len, str, len); s->len += len; @@ -175,9 +197,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str) int trace_seq_putc(struct trace_seq *s, unsigned char c) { - if (s->len >= (PAGE_SIZE - 1)) + if (s->full) return 0; + if (s->len >= (PAGE_SIZE - 1)) { + s->full = 1; + return 0; + } + s->buffer[s->len++] = c; return 1; @@ -185,8 +212,13 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c) int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) { - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) + return 0; + + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; return 0; + } memcpy(s->buffer + s->len, mem, len); s->len += len; @@ -200,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) const unsigned char *data = mem; int i, j; + if (s->full) + return 0; + #ifdef __BIG_ENDIAN for (i = 0, j = 0; i < len; i++) { #else @@ -217,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) { void *ret; - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) + return 0; + + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; return NULL; + } ret = s->buffer + s->len; s->len += len; @@ -230,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path) { unsigned char *p; - if (s->len >= (PAGE_SIZE - 1)) + if (s->full) + return 0; + + if (s->len >= (PAGE_SIZE - 1)) { + s->full = 1; return 0; + } + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); if (!IS_ERR(p)) { p = mangle_path(s->buffer + s->len, p, "\n"); @@ -244,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path) return 1; } + s->full = 1; return 0; } @@ -370,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, unsigned long vmstart = 0; int ret = 1; + if (s->full) + return 0; + if (mm) { const struct vm_area_struct *vma; @@ -486,16 +536,18 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) hardirq ? 'h' : softirq ? 's' : '.')) return 0; - if (entry->lock_depth < 0) - ret = trace_seq_putc(s, '.'); + if (entry->preempt_count) + ret = trace_seq_printf(s, "%x", entry->preempt_count); else - ret = trace_seq_printf(s, "%d", entry->lock_depth); + ret = trace_seq_putc(s, '.'); + if (!ret) return 0; - if (entry->preempt_count) - return trace_seq_printf(s, "%x", entry->preempt_count); - return trace_seq_putc(s, '.'); + if (entry->lock_depth < 0) + return trace_seq_putc(s, '.'); + + return trace_seq_printf(s, "%d", entry->lock_depth); } static int @@ -883,7 +935,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - task_state_char(field->prev_state); + S = task_state_char(field->prev_state); T = task_state_char(field->next_state); if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, @@ -918,7 +970,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - task_state_char(field->prev_state); + S = task_state_char(field->prev_state); T = task_state_char(field->next_state); SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 26185d72767..0271742abb8 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -28,8 +28,8 @@ static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; -static raw_spinlock_t wakeup_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t wakeup_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static void __wakeup_reset(struct trace_array *tr); @@ -143,7 +143,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, goto out; local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); + arch_spin_lock(&wakeup_lock); /* We could race with grabbing wakeup_lock */ if (unlikely(!tracer_enabled || next != wakeup_task)) @@ -169,7 +169,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, out_unlock: __wakeup_reset(wakeup_trace); - __raw_spin_unlock(&wakeup_lock); + arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: atomic_dec(&wakeup_trace->data[cpu]->disabled); @@ -193,9 +193,9 @@ static void wakeup_reset(struct trace_array *tr) tracing_reset_online_cpus(tr); local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); + arch_spin_lock(&wakeup_lock); __wakeup_reset(tr); - __raw_spin_unlock(&wakeup_lock); + arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); } @@ -225,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) goto out; /* interrupts should be off from try_to_wake_up */ - __raw_spin_lock(&wakeup_lock); + arch_spin_lock(&wakeup_lock); /* check for races. */ if (!tracer_enabled || p->prio >= wakeup_prio) @@ -255,7 +255,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: - __raw_spin_unlock(&wakeup_lock); + arch_spin_unlock(&wakeup_lock); out: atomic_dec(&wakeup_trace->data[cpu]->disabled); } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index d2cdbabb4ea..280fea470d6 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: case TRACE_HW_BRANCHES: + case TRACE_KSYM: return 1; } return 0; @@ -66,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) /* Don't allow flipping of max traces now */ local_irq_save(flags); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); cnt = ring_buffer_entries(tr->buffer); @@ -84,7 +85,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) break; } tracing_on(); - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); local_irq_restore(flags); if (count) @@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace, return ret; } #endif /* CONFIG_HW_BRANCH_TRACER */ + +#ifdef CONFIG_KSYM_TRACER +static int ksym_selftest_dummy; + +int +trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + ksym_selftest_dummy = 0; + /* Register the read-write tracing request */ + + ret = process_new_ksym_entry("ksym_selftest_dummy", + HW_BREAKPOINT_R | HW_BREAKPOINT_W, + (unsigned long)(&ksym_selftest_dummy)); + + if (ret < 0) { + printk(KERN_CONT "ksym_trace read-write startup test failed\n"); + goto ret_path; + } + /* Perform a read and a write operation over the dummy variable to + * trigger the tracer + */ + if (ksym_selftest_dummy == 0) + ksym_selftest_dummy++; + + /* stop the tracing. */ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + /* read & write operations - one each is performed on the dummy variable + * triggering two entries in the trace buffer + */ + if (!ret && count != 2) { + printk(KERN_CONT "Ksym tracer startup test failed"); + ret = -1; + } + +ret_path: + return ret; +} +#endif /* CONFIG_KSYM_TRACER */ + diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0f6facb050a..678a5120ee3 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -27,8 +27,8 @@ static struct stack_trace max_stack_trace = { }; static unsigned long max_stack_size; -static raw_spinlock_t max_stack_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t max_stack_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); @@ -54,7 +54,7 @@ static inline void check_stack(void) return; local_irq_save(flags); - __raw_spin_lock(&max_stack_lock); + arch_spin_lock(&max_stack_lock); /* a race could have already updated it */ if (this_size <= max_stack_size) @@ -103,7 +103,7 @@ static inline void check_stack(void) } out: - __raw_spin_unlock(&max_stack_lock); + arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); } @@ -171,9 +171,9 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return ret; local_irq_save(flags); - __raw_spin_lock(&max_stack_lock); + arch_spin_lock(&max_stack_lock); *ptr = val; - __raw_spin_unlock(&max_stack_lock); + arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); return count; @@ -207,7 +207,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { local_irq_disable(); - __raw_spin_lock(&max_stack_lock); + arch_spin_lock(&max_stack_lock); if (*pos == 0) return SEQ_START_TOKEN; @@ -217,7 +217,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { - __raw_spin_unlock(&max_stack_lock); + arch_spin_unlock(&max_stack_lock); local_irq_enable(); } @@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = { int stack_trace_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&stack_sysctl_mutex); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_stack_tracer_enabled == !!stack_tracer_enabled)) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 9fbce6c9d2e..57501d90096 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -14,6 +14,43 @@ static int sys_refcount_exit; static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +extern unsigned long __start_syscalls_metadata[]; +extern unsigned long __stop_syscalls_metadata[]; + +static struct syscall_metadata **syscalls_metadata; + +static struct syscall_metadata *find_syscall_meta(unsigned long syscall) +{ + struct syscall_metadata *start; + struct syscall_metadata *stop; + char str[KSYM_SYMBOL_LEN]; + + + start = (struct syscall_metadata *)__start_syscalls_metadata; + stop = (struct syscall_metadata *)__stop_syscalls_metadata; + kallsyms_lookup(syscall, NULL, NULL, NULL, str); + + for ( ; start < stop; start++) { + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + if (start->name && !strcmp(start->name + 3, str + 3)) + return start; + } + return NULL; +} + +static struct syscall_metadata *syscall_nr_to_meta(int nr) +{ + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) + return NULL; + + return syscalls_metadata[nr]; +} + enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags) { @@ -30,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags) if (!entry) goto end; - if (entry->enter_id != ent->type) { + if (entry->enter_event->id != ent->type) { WARN_ON_ONCE(1); goto end; } @@ -85,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } - if (entry->exit_id != ent->type) { + if (entry->exit_event->id != ent->type) { WARN_ON_ONCE(1); return TRACE_TYPE_UNHANDLED; } @@ -103,24 +140,19 @@ extern char *__bad_type_size(void); #define SYSCALL_FIELD(type, name) \ sizeof(type) != sizeof(trace.name) ? \ __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) + #type, #name, offsetof(typeof(trace), name), \ + sizeof(trace.name), is_signed_type(type) int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) { int i; - int nr; int ret; - struct syscall_metadata *entry; + struct syscall_metadata *entry = call->data; struct syscall_trace_enter trace; int offset = offsetof(struct syscall_trace_enter, args); - nr = syscall_name_to_nr(call->data); - entry = syscall_nr_to_meta(nr); - - if (!entry) - return 0; - - ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n", SYSCALL_FIELD(int, nr)); if (!ret) return 0; @@ -130,8 +162,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) entry->args[i]); if (!ret) return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, - sizeof(unsigned long)); + ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" + "\tsigned:%u;\n", offset, + sizeof(unsigned long), + is_signed_type(unsigned long)); if (!ret) return 0; offset += sizeof(unsigned long); @@ -163,10 +197,12 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) struct syscall_trace_exit trace; ret = trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n", SYSCALL_FIELD(int, nr), - SYSCALL_FIELD(unsigned long, ret)); + SYSCALL_FIELD(long, ret)); if (!ret) return 0; @@ -176,22 +212,19 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) int syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; - struct syscall_metadata *meta; + struct syscall_metadata *meta = call->data; int ret; - int nr; int i; int offset = offsetof(typeof(trace), args); - nr = syscall_name_to_nr(call->data); - meta = syscall_nr_to_meta(nr); - - if (!meta) - return 0; - ret = trace_define_common_fields(call); if (ret) return ret; + ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + if (ret) + return ret; + for (i = 0; i < meta->nb_args; i++) { ret = trace_define_field(call, meta->types[i], meta->args[i], offset, @@ -212,7 +245,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, + ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + if (ret) + return ret; + + ret = trace_define_field(call, SYSCALL_FIELD(long, ret), FILTER_OTHER); return ret; @@ -239,8 +276,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, - size, 0, 0); + event = trace_current_buffer_lock_reserve(&buffer, + sys_data->enter_event->id, size, 0, 0); if (!event) return; @@ -271,8 +308,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, - sizeof(*entry), 0, 0); + event = trace_current_buffer_lock_reserve(&buffer, + sys_data->exit_event->id, sizeof(*entry), 0, 0); if (!event) return; @@ -285,14 +322,12 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -int reg_event_syscall_enter(void *ptr) +int reg_event_syscall_enter(struct ftrace_event_call *call) { int ret = 0; int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); @@ -309,13 +344,11 @@ int reg_event_syscall_enter(void *ptr) return ret; } -void unreg_event_syscall_enter(void *ptr) +void unreg_event_syscall_enter(struct ftrace_event_call *call) { int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); @@ -326,14 +359,12 @@ void unreg_event_syscall_enter(void *ptr) mutex_unlock(&syscall_trace_lock); } -int reg_event_syscall_exit(void *ptr) +int reg_event_syscall_exit(struct ftrace_event_call *call) { int ret = 0; int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); @@ -350,13 +381,11 @@ int reg_event_syscall_exit(void *ptr) return ret; } -void unreg_event_syscall_exit(void *ptr) +void unreg_event_syscall_exit(struct ftrace_event_call *call) { int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); @@ -367,13 +396,44 @@ void unreg_event_syscall_exit(void *ptr) mutex_unlock(&syscall_trace_lock); } -struct trace_event event_syscall_enter = { - .trace = print_syscall_enter, -}; +int init_syscall_trace(struct ftrace_event_call *call) +{ + int id; + + id = register_ftrace_event(call->event); + if (!id) + return -ENODEV; + call->id = id; + INIT_LIST_HEAD(&call->fields); + return 0; +} + +int __init init_ftrace_syscalls(void) +{ + struct syscall_metadata *meta; + unsigned long addr; + int i; + + syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * + NR_syscalls, GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return -ENOMEM; + } + + for (i = 0; i < NR_syscalls; i++) { + addr = arch_syscall_addr(i); + meta = find_syscall_meta(addr); + if (!meta) + continue; + + meta->syscall_nr = i; + syscalls_metadata[i] = meta; + } -struct trace_event event_syscall_exit = { - .trace = print_syscall_exit, -}; + return 0; +} +core_initcall(init_ftrace_syscalls); #ifdef CONFIG_EVENT_PROFILE @@ -387,8 +447,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; unsigned long flags; + char *trace_buf; char *raw_data; int syscall_nr; + int rctx; int size; int cpu; @@ -412,41 +474,42 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) /* Protect the per cpu buffer, begin the rcu read side */ local_irq_save(flags); + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + cpu = smp_processor_id(); - if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); - else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, cpu); + raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; rec = (struct syscall_trace_enter *) raw_data; tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->enter_id; + rec->ent.type = sys_data->enter_event->id; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_tp_event(sys_data->enter_id, 0, 1, rec, size); + perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); end: + perf_swevent_put_recursion_context(rctx); +end_recursion: local_irq_restore(flags); } -int reg_prof_syscall_enter(char *name) +int prof_sysenter_enable(struct ftrace_event_call *call) { int ret = 0; int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return -ENOSYS; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_enter) @@ -462,13 +525,11 @@ int reg_prof_syscall_enter(char *name) return ret; } -void unreg_prof_syscall_enter(char *name) +void prof_sysenter_disable(struct ftrace_event_call *call) { int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); sys_prof_refcount_enter--; @@ -484,7 +545,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) struct syscall_trace_exit *rec; unsigned long flags; int syscall_nr; + char *trace_buf; char *raw_data; + int rctx; int size; int cpu; @@ -510,17 +573,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) /* Protect the per cpu buffer, begin the rcu read side */ local_irq_save(flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + cpu = smp_processor_id(); - if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); - else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, cpu); + raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -528,24 +593,24 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) rec = (struct syscall_trace_exit *)raw_data; tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->exit_id; + rec->ent.type = sys_data->exit_event->id; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_tp_event(sys_data->exit_id, 0, 1, rec, size); + perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); end: + perf_swevent_put_recursion_context(rctx); +end_recursion: local_irq_restore(flags); } -int reg_prof_syscall_exit(char *name) +int prof_sysexit_enable(struct ftrace_event_call *call) { int ret = 0; int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return -ENOSYS; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_exit) @@ -561,13 +626,11 @@ int reg_prof_syscall_exit(char *name) return ret; } -void unreg_prof_syscall_exit(char *name) +void prof_sysexit_disable(struct ftrace_event_call *call) { int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); sys_prof_refcount_exit--; diff --git a/kernel/uid16.c b/kernel/uid16.c index 0314501688b..419209893d8 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -4,7 +4,6 @@ */ #include <linux/mm.h> -#include <linux/utsname.h> #include <linux/mman.h> #include <linux/notifier.h> #include <linux/reboot.h> diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c new file mode 100644 index 00000000000..eb27fd3430a --- /dev/null +++ b/kernel/user-return-notifier.c @@ -0,0 +1,44 @@ + +#include <linux/user-return-notifier.h> +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/module.h> + +static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); + +/* + * Request a notification when the current cpu returns to userspace. Must be + * called in atomic context. The notifier will also be called in atomic + * context. + */ +void user_return_notifier_register(struct user_return_notifier *urn) +{ + set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); + hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list)); +} +EXPORT_SYMBOL_GPL(user_return_notifier_register); + +/* + * Removes a registered user return notifier. Must be called from atomic + * context, and from the same cpu registration occured in. + */ +void user_return_notifier_unregister(struct user_return_notifier *urn) +{ + hlist_del(&urn->link); + if (hlist_empty(&__get_cpu_var(return_notifier_list))) + clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); +} +EXPORT_SYMBOL_GPL(user_return_notifier_unregister); + +/* Calls registered user return notifiers */ +void fire_user_return_notifiers(void) +{ + struct user_return_notifier *urn; + struct hlist_node *tmp1, *tmp2; + struct hlist_head *head; + + head = &get_cpu_var(return_notifier_list); + hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link) + urn->on_user_return(urn); + put_cpu_var(return_notifier_list); +} diff --git a/kernel/user.c b/kernel/user.c index 2c000e7132a..46d0165ca70 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -330,9 +330,9 @@ done: */ static void free_user(struct user_struct *up, unsigned long flags) { - spin_unlock_irqrestore(&uidhash_lock, flags); INIT_DELAYED_WORK(&up->work, cleanup_user_struct); schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); + spin_unlock_irqrestore(&uidhash_lock, flags); } #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 92359cc747a..a2cd77e70d4 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, +static int proc_do_uts_string(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); + r = proc_dostring(&uts_table,write,buffer,lenp, ppos); put_uts(table, write, uts_table.data); return r; } @@ -57,78 +57,47 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, #define proc_do_uts_string NULL #endif - -#ifdef CONFIG_SYSCTL_SYSCALL -/* The generic string strategy routine: */ -static int sysctl_uts_string(ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - struct ctl_table uts_table; - int r, write; - write = newval && newlen; - memcpy(&uts_table, table, sizeof(uts_table)); - uts_table.data = get_uts(table, write); - r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen); - put_uts(table, write, uts_table.data); - return r; -} -#else -#define sysctl_uts_string NULL -#endif - static struct ctl_table uts_kern_table[] = { { - .ctl_name = KERN_OSTYPE, .procname = "ostype", .data = init_uts_ns.name.sysname, .maxlen = sizeof(init_uts_ns.name.sysname), .mode = 0444, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, { - .ctl_name = KERN_OSRELEASE, .procname = "osrelease", .data = init_uts_ns.name.release, .maxlen = sizeof(init_uts_ns.name.release), .mode = 0444, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, { - .ctl_name = KERN_VERSION, .procname = "version", .data = init_uts_ns.name.version, .maxlen = sizeof(init_uts_ns.name.version), .mode = 0444, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, { - .ctl_name = KERN_NODENAME, .procname = "hostname", .data = init_uts_ns.name.nodename, .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, { - .ctl_name = KERN_DOMAINNAME, .procname = "domainname", .data = init_uts_ns.name.domainname, .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, {} }; static struct ctl_table uts_root_table[] = { { - .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, .child = uts_kern_table, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index addfe2df93b..dee48658805 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,6 +68,116 @@ struct workqueue_struct { #endif }; +#ifdef CONFIG_DEBUG_OBJECTS_WORK + +static struct debug_obj_descr work_debug_descr; + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int work_fixup_init(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + cancel_work_sync(work); + debug_object_init(work, &work_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int work_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. The work struct was + * statically initialized. We just make sure that it + * is tracked in the object tracker. + */ + if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { + debug_object_init(work, &work_debug_descr); + debug_object_activate(work, &work_debug_descr); + return 0; + } + WARN_ON_ONCE(1); + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int work_fixup_free(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + cancel_work_sync(work); + debug_object_free(work, &work_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr work_debug_descr = { + .name = "work_struct", + .fixup_init = work_fixup_init, + .fixup_activate = work_fixup_activate, + .fixup_free = work_fixup_free, +}; + +static inline void debug_work_activate(struct work_struct *work) +{ + debug_object_activate(work, &work_debug_descr); +} + +static inline void debug_work_deactivate(struct work_struct *work) +{ + debug_object_deactivate(work, &work_debug_descr); +} + +void __init_work(struct work_struct *work, int onstack) +{ + if (onstack) + debug_object_init_on_stack(work, &work_debug_descr); + else + debug_object_init(work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(__init_work); + +void destroy_work_on_stack(struct work_struct *work) +{ + debug_object_free(work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_work_on_stack); + +#else +static inline void debug_work_activate(struct work_struct *work) { } +static inline void debug_work_deactivate(struct work_struct *work) { } +#endif + /* Serializes the accesses to the list of workqueues. */ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); @@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, { unsigned long flags; + debug_work_activate(work); spin_lock_irqsave(&cwq->lock, flags); insert_work(cwq, work, &cwq->worklist); spin_unlock_irqrestore(&cwq->lock, flags); @@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) struct lockdep_map lockdep_map = work->lockdep_map; #endif trace_workqueue_execution(cwq->thread, work); + debug_work_deactivate(work); cwq->current_work = work; list_del_init(cwq->worklist.next); spin_unlock_irq(&cwq->lock); @@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work) static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, struct wq_barrier *barr, struct list_head *head) { - INIT_WORK(&barr->work, wq_barrier_func); + /* + * debugobject calls are safe here even with cwq->lock locked + * as we know for sure that this will not trigger any of the + * checks and call back into the fixup functions where we + * might deadlock. + */ + INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); init_completion(&barr->done); + debug_work_activate(&barr->work); insert_work(cwq, &barr->work, head); } @@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) } spin_unlock_irq(&cwq->lock); - if (active) + if (active) { wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + } return active; } @@ -451,6 +572,7 @@ out: return 0; wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); return 1; } EXPORT_SYMBOL_GPL(flush_work); @@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work) */ smp_rmb(); if (cwq == get_wq_data(work)) { + debug_work_deactivate(work); list_del_init(&work->entry); ret = 1; } @@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, } spin_unlock_irq(&cwq->lock); - if (unlikely(running)) + if (unlikely(running)) { wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + } } static void wait_on_work(struct work_struct *work) @@ -640,6 +765,24 @@ int schedule_delayed_work(struct delayed_work *dwork, EXPORT_SYMBOL(schedule_delayed_work); /** + * flush_delayed_work - block until a dwork_struct's callback has terminated + * @dwork: the delayed work which is to be flushed + * + * Any timeout is cancelled, and any pending work is run immediately. + */ +void flush_delayed_work(struct delayed_work *dwork) +{ + if (del_timer_sync(&dwork->timer)) { + struct cpu_workqueue_struct *cwq; + cwq = wq_per_cpu(keventd_wq, get_cpu()); + __queue_work(cwq, &dwork->work); + put_cpu(); + } + flush_work(&dwork->work); +} +EXPORT_SYMBOL(flush_delayed_work); + +/** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done @@ -667,6 +810,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); int schedule_on_each_cpu(work_func_t func) { int cpu; + int orig = -1; struct work_struct *works; works = alloc_percpu(struct work_struct); @@ -674,14 +818,28 @@ int schedule_on_each_cpu(work_func_t func) return -ENOMEM; get_online_cpus(); + + /* + * When running in keventd don't schedule a work item on + * itself. Can just call directly because the work queue is + * already bound. This also is faster. + */ + if (current_is_keventd()) + orig = raw_smp_processor_id(); + for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); - schedule_work_on(cpu, work); + if (cpu != orig) + schedule_work_on(cpu, work); } + if (orig >= 0) + func(per_cpu_ptr(works, orig)); + for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); + put_online_cpus(); free_percpu(works); return 0; |