From c685689fd24d310343ac33942e9a54a974ae9c43 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Mon, 24 Feb 2014 11:29:50 +0800 Subject: genirq: Remove racy waitqueue_active check We hit one rare case below: T1 calling disable_irq(), but hanging at synchronize_irq() always; The corresponding irq thread is in sleeping state; And all CPUs are in idle state; After analysis, we found there is one possible scenerio which causes T1 is waiting there forever: CPU0 CPU1 synchronize_irq() wait_event() spin_lock() atomic_dec_and_test(&threads_active) insert the __wait into queue spin_unlock() if(waitqueue_active) atomic_read(&threads_active) wake_up() Here after inserted the __wait into queue on CPU0, and before test if queue is empty on CPU1, there is no barrier, it maybe cause it is not visible for CPU1 immediately, although CPU0 has updated the queue list. It is similar for CPU0 atomic_read() threads_active also. So we'd need one smp_mb() before waitqueue_active.that, but removing the waitqueue_active() check solves it as wel l and it makes things simple and clear. Signed-off-by: Chuansheng Liu Cc: Xiaoming Wang Link: http://lkml.kernel.org/r/1393212590-32543-1-git-send-email-chuansheng.liu@intel.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 481a13c43b1..d3bf660cb57 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -802,8 +802,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, static void wake_threads_waitq(struct irq_desc *desc) { - if (atomic_dec_and_test(&desc->threads_active) && - waitqueue_active(&desc->wait_for_threads)) + if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } -- cgit v1.2.3-70-g09d2 From 791c9e0292671a3bfa95286bb5c08129d8605618 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Tue, 18 Feb 2014 17:56:51 -0600 Subject: sched: Fix double normalization of vruntime dequeue_entity() is called when p->on_rq and sets se->on_rq = 0 which appears to guarentee that the !se->on_rq condition is met. If the task has done set_current_state(TASK_INTERRUPTIBLE) without schedule() the second condition will be met and vruntime will be incorrectly adjusted twice. In certain cases this can result in the task's vruntime never increasing past the vruntime of other tasks on the CFS' run queue, starving them of CPU time. This patch changes switched_from_fair() to use !p->on_rq instead of !se->on_rq. I'm able to cause a task with a priority of 120 to starve all other tasks with the same priority on an ARM platform running 3.2.51-rt72 PREEMPT RT by writing one character at time to a serial tty (16550 UART) in a tight loop. I'm also able to verify making this change corrects the problem on that platform and kernel version. Signed-off-by: George McCollister Signed-off-by: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1392767811-28916-1-git-send-email-george.mccollister@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 78157099b16..9b4c4f32013 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7001,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when its + * Ensure the task's vruntime is normalized, so that when it's * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!se->on_rq && p->state != TASK_RUNNING) { + if (!p->on_rq && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. -- cgit v1.2.3-70-g09d2 From 3908ac13b550c93f97d8db136ff572be5495bc06 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 25 Feb 2014 19:52:23 +0400 Subject: sched/deadline: Cleanup RT leftovers from {inc/dec}_dl_migration In deadline class we do not have group scheduling. So, let's remove unnecessary X = X; equations. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Juri Lelli Link: http://lkml.kernel.org/r/1393343543.4089.5.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 15cbc17fbf8..aecf93030e0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -135,7 +135,6 @@ static void update_dl_migration(struct dl_rq *dl_rq) static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; @@ -146,7 +145,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; -- cgit v1.2.3-70-g09d2 From eec751ed41a0ae7e92a43c33a458d7bd1b941631 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 24 Feb 2014 11:47:12 +0100 Subject: sched/deadline: Switch CPU's presence test order Commit 82b9580 ("sched/deadline: Test for CPU's presence explicitly") changed how we check if a CPU returned by cpudeadline machinery is valid. But, we don't want to call cpu_present() if best_cpu is equal to -1. So, switch the order of tests inside WARN_ON(). Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: boris.ostrovsky@oracle.com Cc: konrad.wilk@oracle.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1393238832-9100-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5b8838b56d1..5b9bb42b2d4 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx) static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { - WARN_ON(!cpu_present(idx) || idx == IDX_INVALID); + WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); if (dl_time_before(new_dl, cp->elements[idx].dl)) { cp->elements[idx].dl = new_dl; @@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } out: - WARN_ON(!cpu_present(best_cpu) && best_cpu != -1); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); return best_cpu; } -- cgit v1.2.3-70-g09d2 From faa5993736d9b44b508cab4f1f3a77d66641c6f4 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 21 Feb 2014 11:37:15 +0100 Subject: sched/deadline: Prevent rt_time growth to infinity Kirill Tkhai noted: Since deadline tasks share rt bandwidth, we must care about bandwidth timer set. Otherwise rt_time may grow up to infinity in update_curr_dl(), if there are no other available RT tasks on top level bandwidth. RT task were in fact throttled right after they got enqueued, and never executed again (rt_time never again went below rt_runtime). Peter then proposed to accrue DL execution on rt_time only when rt timer is active, and proposed a patch (this patch is a slight modification of that) to implement that behavior. While this solves Kirill problem, it has a drawback. Indeed, Kirill noted again: It looks we may get into a situation, when all CPU time is shared between RT and DL tasks: rt_runtime = n rt_period = 2n | RT working, DL sleeping | DL working, RT sleeping | ----------------------------------------------------------- | (1) duration = n | (2) duration = n | (repeat) |--------------------------|------------------------------| | (rt_bw timer is running) | (rt_bw timer is not running) | No time for fair tasks at all. While this can happen during the first period, if rq is always backlogged, RT tasks won't have the opportunity to execute anymore: rt_time reached rt_runtime during (1), suppose after (2) RT is enqueued back, it gets throttled since rt timer didn't fire, replenishment is from now on eaten up by DL tasks that accrue their execution on rt_time (while rt timer is active - we have an RT task waiting for replenishment). FAIR tasks are not touched after this first period. Ok, this is not ideal, and the situation is even worse! What above (the nice case), practically never happens in reality, where your rt timer is not aligned to tasks periods, tasks are in general not periodic, etc.. Long story short, you always risk to overload your system. This patch is based on Peter's idea, but exploits an additional fact: if you don't have RT tasks enqueued, it makes little sense to continue incrementing rt_time once you reached the upper limit (DL tasks have their own mechanism for throttling). This cures both problems: - no matter how many DL instances in the past, you'll have an rt_time slightly above rt_runtime when an RT task is enqueued, and from that point on (after the first replenishment), the task will normally execute; - you can still eat up all bandwidth during the first period, but not anymore after that, remember that DL execution will increment rt_time till the upper limit is reached. The situation is still not perfect! But, we have a simple solution for now, that limits how much you can jeopardize your system, as we keep working towards the right answer: RT groups scheduled using deadline servers. Reported-by: Kirill Tkhai Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140225151515.617714e2f2cd6c558531ba61@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 ++++++-- kernel/sched/rt.c | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index aecf93030e0..6e79b3faa4c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -562,6 +562,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) return 1; } +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + /* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). @@ -625,11 +627,13 @@ static void update_curr_dl(struct rq *rq) struct rt_rq *rt_rq = &rq->rt; raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; /* * We'll let actual RT tasks worry about the overflow here, we - * have our own CBS to keep us inline -- see above. + * have our own CBS to keep us inline; only account when RT + * bandwidth is relevant. */ + if (sched_rt_bandwidth_account(rt_rq)) + rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a2740b775b4..1999021042c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ +bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + return (hrtimer_active(&rt_b->rt_period_timer) || + rt_rq->rt_time < rt_b->rt_runtime); +} + #ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. -- cgit v1.2.3-70-g09d2 From 64be38ab03e9b238a1299857fef8b3707c0ed045 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:10:12 +0530 Subject: genirq: Include missing header file in irqdomain.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include appropriate header file include/linux/of_irq.h in kernel/irq/irqdomain.c because it contains prototype definition of function define in kernel/irq/irqdomain.c. This eliminates the following warning in kernel/irq/irqdomain.c: kernel/irq/irqdomain.c:468:14: warning: no previous prototype for ‘irq_create_of_mapping’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Cc: Benjamin Herrenschmidt Link: http://lkml.kernel.org/r/eb89aebea7ff1a46122918ac389ebecf8248be9a.1393493276.git.rashika.kheria@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf68bb36fe5..f14033700c2 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 4729583006772b9530404bc1bb7c3aa4a10ffd4d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 27 Feb 2014 18:19:03 +0800 Subject: cpuset: fix a locking issue in cpuset_migrate_mm() I can trigger a lockdep warning: # mount -t cgroup -o cpuset xxx /cgroup # mkdir /cgroup/cpuset # mkdir /cgroup/tmp # echo 0 > /cgroup/tmp/cpuset.cpus # echo 0 > /cgroup/tmp/cpuset.mems # echo 1 > /cgroup/tmp/cpuset.memory_migrate # echo $$ > /cgroup/tmp/tasks # echo 1 > /cgruop/tmp/cpuset.mems =============================== [ INFO: suspicious RCU usage. ] 3.14.0-rc1-0.1-default+ #32 Not tainted ------------------------------- include/linux/cgroup.h:682 suspicious rcu_dereference_check() usage! ... [] dump_stack+0x72/0x86 [] lockdep_rcu_suspicious+0x101/0x140 [] cpuset_migrate_mm+0xb1/0xe0 ... We used to hold cgroup_mutex when calling cpuset_migrate_mm(), but now we hold cpuset_mutex, which causes task_css() to complain. This is not a false-positive but a real issue. Holding cpuset_mutex won't prevent a task from migrating to another cpuset, and it won't prevent the original task->cgroup from destroying during this change. Fixes: 5d21cc2db040 (cpuset: replace cgroup_mutex locking with cpuset internal locking) Cc: # 3.9+ Signed-off-by: Li Zefan Sigend-off-by: Tejun Heo --- kernel/cpuset.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4410ac6a55f..dba9e4aef69 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -974,12 +974,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding cpuset_mutex, so current's cpuset won't change - * during this call, as manage_mutex holds off any cpuset_attach() - * calls. Therefore we don't need to take task_lock around the - * call to guarantee_online_mems(), as we know no one is changing - * our task's cpuset. - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -996,8 +990,10 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + rcu_read_lock(); mems_cs = effective_nodemask_cpuset(task_cs(tsk)); guarantee_online_mems(mems_cs, &tsk->mems_allowed); + rcu_read_unlock(); } /* -- cgit v1.2.3-70-g09d2 From 99afb0fd5f05aac467ffa85c36778fec4396209b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 27 Feb 2014 18:19:36 +0800 Subject: cpuset: fix a race condition in __cpuset_node_allowed_softwall() It's not safe to access task's cpuset after releasing task_lock(). Holding callback_mutex won't help. Cc: Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index dba9e4aef69..e6b1b66afe5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2482,9 +2482,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) task_lock(current); cs = nearest_hardwall_ancestor(task_cs(current)); + allowed = node_isset(node, cs->mems_allowed); task_unlock(current); - allowed = node_isset(node, cs->mems_allowed); mutex_unlock(&callback_mutex); return allowed; } -- cgit v1.2.3-70-g09d2 From 48095d991d85687569ac025b18a6c7ae1632c9f7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 3 Feb 2014 17:25:33 -0800 Subject: audit: Use struct net not pid_t to remember the network namespce to reply in In struct audit_netlink_list and audit_reply add a reference to the network namespace of the caller and remove the userspace pid of the caller. This cleanly remembers the callers network namespace, and removes a huge class of races and nasty failure modes that can occur when attempting to relook up the callers network namespace from a pid_t (including the caller's network namespace changing, pid wraparound, and the pid simply not being present). Signed-off-by: "Eric W. Biederman" --- kernel/audit.c | 10 ++++++---- kernel/audit.h | 2 +- kernel/auditfilter.c | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 34c5a2310fb..1e5756f16f6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -182,7 +182,7 @@ struct audit_buffer { struct audit_reply { __u32 portid; - pid_t pid; + struct net *net; struct sk_buff *skb; }; @@ -500,7 +500,7 @@ int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; - struct net *net = get_net_ns_by_pid(dest->pid); + struct net *net = dest->net; struct audit_net *aunet = net_generic(net, audit_net_id); /* wait for parent to finish and send an ACK */ @@ -510,6 +510,7 @@ int audit_send_list(void *_dest) while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(aunet->nlsk, skb, dest->portid, 0); + put_net(net); kfree(dest); return 0; @@ -543,7 +544,7 @@ out_kfree_skb: static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; - struct net *net = get_net_ns_by_pid(reply->pid); + struct net *net = reply->net; struct audit_net *aunet = net_generic(net, audit_net_id); mutex_lock(&audit_cmd_mutex); @@ -552,6 +553,7 @@ static int audit_send_reply_thread(void *arg) /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0); + put_net(net); kfree(reply); return 0; } @@ -583,8 +585,8 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done, if (!skb) goto out; + reply->net = get_net(current->nsproxy->net_ns); reply->portid = portid; - reply->pid = task_pid_vnr(current); reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); diff --git a/kernel/audit.h b/kernel/audit.h index 57cc64d6771..8df13221460 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -247,7 +247,7 @@ extern void audit_panic(const char *message); struct audit_netlink_list { __u32 portid; - pid_t pid; + struct net *net; struct sk_buff_head q; }; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 14a78cca384..a5e3d73d73e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "audit.h" /* @@ -1083,8 +1084,8 @@ int audit_list_rules_send(__u32 portid, int seq) dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; + dest->net = get_net(current->nsproxy->net_ns); dest->portid = portid; - dest->pid = task_pid_vnr(current); skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); -- cgit v1.2.3-70-g09d2 From 6f285b19d09f72e801525f5eea1bdad22e559bf0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Feb 2014 19:44:55 -0800 Subject: audit: Send replies in the proper network namespace. In perverse cases of file descriptor passing the current network namespace of a process and the network namespace of a socket used by that socket may differ. Therefore use the network namespace of the appropiate socket to ensure replies always go to the appropiate socket. Signed-off-by: "Eric W. Biederman" --- include/linux/audit.h | 3 ++- kernel/audit.c | 21 ++++++++++----------- kernel/auditfilter.c | 7 +++++-- 3 files changed, 17 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index aa865a9a4c4..ec1464df4c6 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -43,6 +43,7 @@ struct mq_attr; struct mqstat; struct audit_watch; struct audit_tree; +struct sk_buff; struct audit_krule { int vers_ops; @@ -463,7 +464,7 @@ extern int audit_filter_user(int type); extern int audit_filter_type(int type); extern int audit_rule_change(int type, __u32 portid, int seq, void *data, size_t datasz); -extern int audit_list_rules_send(__u32 portid, int seq); +extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); extern u32 audit_enabled; #else /* CONFIG_AUDIT */ diff --git a/kernel/audit.c b/kernel/audit.c index 1e5756f16f6..32086bff556 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -570,9 +570,11 @@ static int audit_send_reply_thread(void *arg) * Allocates an skb, builds the netlink message, and sends it to the port id. * No failure notifications. */ -static void audit_send_reply(__u32 portid, int seq, int type, int done, +static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct sk_buff *skb; struct task_struct *tsk; struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), @@ -585,7 +587,7 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done, if (!skb) goto out; - reply->net = get_net(current->nsproxy->net_ns); + reply->net = get_net(net); reply->portid = portid; reply->skb = skb; @@ -675,8 +677,7 @@ static int audit_get_feature(struct sk_buff *skb) seq = nlmsg_hdr(skb)->nlmsg_seq; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &af, sizeof(af)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); return 0; } @@ -796,8 +797,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.backlog = skb_queue_len(&audit_skb_queue); s.version = AUDIT_VERSION_LATEST; s.backlog_wait_time = audit_backlog_wait_time; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_SET: { @@ -907,7 +907,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) seq, data, nlmsg_len(nlh)); break; case AUDIT_LIST_RULES: - err = audit_list_rules_send(NETLINK_CB(skb).portid, seq); + err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: audit_trim_trees(); @@ -972,8 +972,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, - 0, 0, sig_data, sizeof(*sig_data) + len); + audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, + sig_data, sizeof(*sig_data) + len); kfree(sig_data); break; case AUDIT_TTY_GET: { @@ -985,8 +985,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.log_passwd = tsk->signal->audit_tty_log_passwd; spin_unlock(&tsk->sighand->siglock); - audit_send_reply(NETLINK_CB(skb).portid, seq, - AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a5e3d73d73e..e8d1c7c515d 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "audit.h" /* @@ -1069,8 +1070,10 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, * @portid: target portid for netlink audit messages * @seq: netlink audit message sequence (serial) number */ -int audit_list_rules_send(__u32 portid, int seq) +int audit_list_rules_send(struct sk_buff *request_skb, int seq) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct task_struct *tsk; struct audit_netlink_list *dest; int err = 0; @@ -1084,7 +1087,7 @@ int audit_list_rules_send(__u32 portid, int seq) dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; - dest->net = get_net(current->nsproxy->net_ns); + dest->net = get_net(net); dest->portid = portid; skb_queue_head_init(&dest->q); -- cgit v1.2.3-70-g09d2 From 45ab2813d40d88fc575e753c38478de242d03f88 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 26 Feb 2014 13:37:38 -0500 Subject: tracing: Do not add event files for modules that fail tracepoints If a module fails to add its tracepoints due to module tainting, do not create the module event infrastructure in the debugfs directory. As the events will not work and worse yet, they will silently fail, making the user wonder why the events they enable do not display anything. Having a warning on module load and the events not visible to the users will make the cause of the problem much clearer. Link: http://lkml.kernel.org/r/20140227154923.265882695@goodmis.org Fixes: 6d723736e472 "tracing/events: add support for modules to TRACE_EVENT" Acked-by: Mathieu Desnoyers Cc: stable@vger.kernel.org # 2.6.31+ Cc: Rusty Russell Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 6 ++++++ kernel/trace/trace_events.c | 10 ++++++++++ kernel/tracepoint.c | 7 ++++++- 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index accc497f8d7..7159a0a933d 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -60,6 +60,12 @@ struct tp_module { unsigned int num_tracepoints; struct tracepoint * const *tracepoints_ptrs; }; +bool trace_module_has_bad_taint(struct module *mod); +#else +static inline bool trace_module_has_bad_taint(struct module *mod) +{ + return false; +} #endif /* CONFIG_MODULES */ struct tracepoint_iter { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e71ffd4eccb..f3989ceb5cd 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1777,6 +1777,16 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_event_call **call, **start, **end; + if (!mod->num_trace_events) + return; + + /* Don't add infrastructure for mods without tracepoints */ + if (trace_module_has_bad_taint(mod)) { + pr_err("%s: module has bad taint, not creating trace events\n", + mod->name); + return; + } + start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 29f26540e9c..031cc5655a5 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES +bool trace_module_has_bad_taint(struct module *mod) +{ + return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); +} + static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod, *iter; @@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod) * module headers (for forced load), to make sure we don't cause a crash. * Staging and out-of-tree GPL modules are fine. */ - if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) + if (trace_module_has_bad_taint(mod)) return 0; mutex_lock(&tracepoints_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); -- cgit v1.2.3-70-g09d2 From d211f177b28ec070c25b3d0b960aa55f352f731f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 8 Mar 2014 15:31:54 -0800 Subject: audit: Update kdoc for audit_send_reply and audit_list_rules_send The kbuild test robot reported: > tree: git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-next > head: 6f285b19d09f72e801525f5eea1bdad22e559bf0 > commit: 6f285b19d09f72e801525f5eea1bdad22e559bf0 [2/2] audit: Send replies in the proper network namespace. > reproduce: make htmldocs > > >> Warning(kernel/audit.c:575): No description found for parameter 'request_skb' > >> Warning(kernel/audit.c:575): Excess function parameter 'portid' description in 'audit_send_reply' > >> Warning(kernel/auditfilter.c:1074): No description found for parameter 'request_skb' > >> Warning(kernel/auditfilter.c:1074): Excess function parameter 'portid' description in 'audit_list_rules_s Which was caused by my failure to update the kdoc annotations when I updated the functions. Fix that small oversight now. Signed-off-by: "Eric W. Biederman" --- kernel/audit.c | 2 +- kernel/auditfilter.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 32086bff556..3392d3e0254 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -559,7 +559,7 @@ static int audit_send_reply_thread(void *arg) } /** * audit_send_reply - send an audit reply message via netlink - * @portid: netlink port to which to send reply + * @request_skb: skb of request we are replying to (used to target the reply) * @seq: sequence number * @type: audit message type * @done: done (last) flag diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index e8d1c7c515d..92062fd6cc8 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1067,7 +1067,7 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, /** * audit_list_rules_send - list the audit rules - * @portid: target portid for netlink audit messages + * @request_skb: skb of request we are replying to (used to target the reply) * @seq: netlink audit message sequence (serial) number */ int audit_list_rules_send(struct sk_buff *request_skb, int seq) -- cgit v1.2.3-70-g09d2 From e97ca8e5b864f88b028c1759ba8536fa827d6d96 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 10 Mar 2014 15:49:43 -0700 Subject: mm: fix GFP_THISNODE callers and clarify GFP_THISNODE is for callers that implement their own clever fallback to remote nodes. It restricts the allocation to the specified node and does not invoke reclaim, assuming that the caller will take care of it when the fallback fails, e.g. through a subsequent allocation request without GFP_THISNODE set. However, many current GFP_THISNODE users only want the node exclusive aspect of the flag, without actually implementing their own fallback or triggering reclaim if necessary. This results in things like page migration failing prematurely even when there is easily reclaimable memory available, unless kswapd happens to be running already or a concurrent allocation attempt triggers the necessary reclaim. Convert all callsites that don't implement their own fallback strategy to __GFP_THISNODE. This restricts the allocation a single node too, but at the same time allows the allocator to enter the slowpath, wake kswapd, and invoke direct reclaim if necessary, to make the allocation happen when memory is full. Signed-off-by: Johannes Weiner Acked-by: Rik van Riel Cc: Jan Stancek Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/uncached.c | 2 +- arch/powerpc/platforms/cell/ras.c | 3 ++- drivers/misc/sgi-xp/xpc_uv.c | 2 +- include/linux/gfp.h | 4 ++++ include/linux/mmzone.h | 4 ++-- include/linux/slab.h | 2 +- kernel/profile.c | 4 ++-- mm/migrate.c | 11 ++++++----- 8 files changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index a96bcf83a73..20e8a9b21d7 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -98,7 +98,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) /* attempt to allocate a granule's worth of cached memory pages */ page = alloc_pages_exact_node(nid, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, IA64_GRANULE_SHIFT-PAGE_SHIFT); if (!page) { mutex_unlock(&uc_pool->add_chunk_mutex); diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c index 5ec1e47a0d7..e865d748179 100644 --- a/arch/powerpc/platforms/cell/ras.c +++ b/arch/powerpc/platforms/cell/ras.c @@ -123,7 +123,8 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order) area->nid = nid; area->order = order; - area->pages = alloc_pages_exact_node(area->nid, GFP_KERNEL|GFP_THISNODE, + area->pages = alloc_pages_exact_node(area->nid, + GFP_KERNEL|__GFP_THISNODE, area->order); if (!area->pages) { diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index b9e2000969f..95c894482fd 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -240,7 +240,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name, nid = cpu_to_node(cpu); page = alloc_pages_exact_node(nid, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, pg_order); if (page == NULL) { dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d " diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0437439bc04..39b81dc7d01 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -123,6 +123,10 @@ struct vm_area_struct; __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ __GFP_NO_KSWAPD) +/* + * GFP_THISNODE does not perform any reclaim, you most likely want to + * use __GFP_THISNODE to allocate from a given node without fallback! + */ #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) #else diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5f2052c8315..9b61b9bf81a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -590,10 +590,10 @@ static inline bool zone_is_empty(struct zone *zone) /* * The NUMA zonelists are doubled because we need zonelists that restrict the - * allocations to a single node for GFP_THISNODE. + * allocations to a single node for __GFP_THISNODE. * * [0] : Zonelist with fallback - * [1] : No fallback (GFP_THISNODE) + * [1] : No fallback (__GFP_THISNODE) */ #define MAX_ZONELISTS 2 diff --git a/include/linux/slab.h b/include/linux/slab.h index 9260abdd67d..b5b2df60299 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -410,7 +410,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * * %GFP_NOWAIT - Allocation will not sleep. * - * %GFP_THISNODE - Allocate node-local memory only. + * %__GFP_THISNODE - Allocate node-local memory only. * * %GFP_DMA - Allocation suitable for DMA. * Should only be used for kmalloc() caches. Otherwise, use a diff --git a/kernel/profile.c b/kernel/profile.c index 6631e1ef55a..ebdd9c1a86b 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -549,14 +549,14 @@ static int create_hash_tables(void) struct page *page; page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; diff --git a/mm/migrate.c b/mm/migrate.c index 482a33d8913..b494fdb9a63 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1158,7 +1158,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, pm->node); else return alloc_pages_exact_node(pm->node, - GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); + GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } /* @@ -1544,9 +1544,9 @@ static struct page *alloc_misplaced_dst_page(struct page *page, struct page *newpage; newpage = alloc_pages_exact_node(nid, - (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | - __GFP_NOMEMALLOC | __GFP_NORETRY | - __GFP_NOWARN) & + (GFP_HIGHUSER_MOVABLE | + __GFP_THISNODE | __GFP_NOMEMALLOC | + __GFP_NORETRY | __GFP_NOWARN) & ~GFP_IOFS, 0); return newpage; @@ -1747,7 +1747,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, + HPAGE_PMD_ORDER); if (!new_page) goto out_fail; -- cgit v1.2.3-70-g09d2 From d44753b843e093f9e1f2f14806fbe106fff74898 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 3 Mar 2014 12:09:21 +0100 Subject: sched/deadline: Deny unprivileged users to set/change SCHED_DEADLINE policy Deny the use of SCHED_DEADLINE policy to unprivileged users. Even if root users can set the policy for normal users, we don't want the latter to be able to change their parameters (safest behavior). Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1393844961-18097-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6edbef296ec..f5c6635b806 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3338,6 +3338,15 @@ recheck: return -EPERM; } + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + return -EPERM; + /* * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. -- cgit v1.2.3-70-g09d2 From 177c53d943368fc97644ebc0a250dc8e2d124250 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Feb 2014 13:39:05 +0100 Subject: stop_machine: Fix^2 race between stop_two_cpus() and stop_cpus() We must use smp_call_function_single(.wait=1) for the irq_cpu_stop_queue_work() to ensure the queueing is actually done under stop_cpus_lock. Without this we could have dropped the lock by the time we do the queueing and get the race we tried to fix. Fixes: 7053ea1a34fa ("stop_machine: Fix race between stop_two_cpus() and stop_cpus()") Signed-off-by: Peter Zijlstra Cc: Prarit Bhargava Cc: Rik van Riel Cc: Mel Gorman Cc: Christoph Hellwig Cc: Andrew Morton Link: http://lkml.kernel.org/r/20140228123905.GK3104@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 84571e09c90..01fbae5b97b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -293,7 +293,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * */ smp_call_function_single(min(cpu1, cpu2), &irq_cpu_stop_queue_work, - &call_args, 0); + &call_args, 1); lg_local_unlock(&stop_cpus_lock); preempt_enable(); -- cgit v1.2.3-70-g09d2 From 96b3d28bf4b00f62fc8386ff5d487d1830793a3d Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Thu, 6 Mar 2014 14:25:28 +0900 Subject: sched/clock: Prevent tracing recursion in sched_clock_cpu() Prevent tracing of preempt_disable/enable() in sched_clock_cpu(). When CONFIG_DEBUG_PREEMPT is enabled, preempt_disable/enable() are traced and this causes trace_clock() users (and probably others) to go into an infinite recursion. Systems with a stable sched_clock() are not affected. This problem is similar to that fixed by upstream commit 95ef1e52922 ("KVM guest: prevent tracing recursion with kvmclock"). Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1394083528.4524.3.camel@nexus Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 43c2bcc3576..b30a2924ef1 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -301,14 +301,14 @@ u64 sched_clock_cpu(int cpu) if (unlikely(!sched_clock_running)) return 0ull; - preempt_disable(); + preempt_disable_notrace(); scd = cpu_sdc(cpu); if (cpu != smp_processor_id()) clock = sched_clock_remote(scd); else clock = sched_clock_local(scd); - preempt_enable(); + preempt_enable_notrace(); return clock; } -- cgit v1.2.3-70-g09d2 From 3eb59ec64fc7a3f4576da23f811b39331b830ba2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 18 Mar 2014 17:02:36 +0800 Subject: cgroup: fix a failure path in create_css() If online_css() fails, we should remove cgroup files belonging to css->ss. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 105f273b6f8..0c753ddd223 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4112,17 +4112,17 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) err = percpu_ref_init(&css->refcnt, css_release); if (err) - goto err_free; + goto err_free_css; init_css(css, ss, cgrp); err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); if (err) - goto err_free; + goto err_free_percpu_ref; err = online_css(css); if (err) - goto err_free; + goto err_clear_dir; dget(cgrp->dentry); css_get(css->parent); @@ -4138,8 +4138,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) return 0; -err_free: +err_clear_dir: + cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); +err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); +err_free_css: ss->css_free(css); return err; } -- cgit v1.2.3-70-g09d2 From 87291347c49dc40aa339f587b209618201c2e527 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Thu, 13 Feb 2014 19:51:48 -0800 Subject: tracing: Fix array size mismatch in format string In event format strings, the array size is reported in two locations. One in array subscript and then via the "size:" attribute. The values reported there have a mismatch. For e.g., in sched:sched_switch the prev_comm and next_comm character arrays have subscript values as [32] where as the actual field size is 16. name: sched_switch ID: 301 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1;signed:0; field:int common_pid; offset:4; size:4; signed:1; field:char prev_comm[32]; offset:8; size:16; signed:1; field:pid_t prev_pid; offset:24; size:4; signed:1; field:int prev_prio; offset:28; size:4; signed:1; field:long prev_state; offset:32; size:8; signed:1; field:char next_comm[32]; offset:40; size:16; signed:1; field:pid_t next_pid; offset:56; size:4; signed:1; field:int next_prio; offset:60; size:4; signed:1; After bisection, the following commit was blamed: 92edca0 tracing: Use direct field, type and system names This commit removes the duplication of strings for field->name and field->type assuming that all the strings passed in __trace_define_field() are immutable. This is not true for arrays, where the type string is created in event_storage variable and field->type for all array fields points to event_storage. Use __stringify() to create a string constant for the type string. Also, get rid of event_storage and event_storage_mutex that are not needed anymore. also, an added benefit is that this reduces the overhead of events a bit more: text data bss dec hex filename 8424787 2036472 1302528 11763787 b3804b vmlinux 8420814 2036408 1302528 11759750 b37086 vmlinux.patched Link: http://lkml.kernel.org/r/1392349908-29685-1-git-send-email-vnagarnaik@google.com Cc: Laurent Chavey Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 4 ---- include/trace/ftrace.h | 7 ++----- kernel/trace/trace_events.c | 6 ------ kernel/trace/trace_export.c | 7 ++----- 4 files changed, 4 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 4e4cc28623a..4cdb3a17bcb 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -495,10 +495,6 @@ enum { FILTER_TRACE_FN, }; -#define EVENT_STORAGE_SIZE 128 -extern struct mutex event_storage_mutex; -extern char event_storage[EVENT_STORAGE_SIZE]; - extern int trace_event_raw_init(struct ftrace_event_call *call); extern int trace_define_field(struct ftrace_event_call *call, const char *type, const char *name, int offset, int size, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1a8b28db377..1ee19a24cc5 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -310,15 +310,12 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = { \ #undef __array #define __array(type, item, len) \ do { \ - mutex_lock(&event_storage_mutex); \ + char *type_str = #type"["__stringify(len)"]"; \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ + ret = trace_define_field(event_call, type_str, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), FILTER_OTHER); \ - mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ } while (0); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f3989ceb5cd..7b16d40bd64 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,12 +27,6 @@ DEFINE_MUTEX(event_mutex); -DEFINE_MUTEX(event_storage_mutex); -EXPORT_SYMBOL_GPL(event_storage_mutex); - -char event_storage[EVENT_STORAGE_SIZE]; -EXPORT_SYMBOL_GPL(event_storage); - LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_common_fields); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7c3e3e72e2b..ee0a5098ac4 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __array #define __array(type, item, len) \ do { \ + char *type_str = #type"["__stringify(len)"]"; \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - mutex_lock(&event_storage_mutex); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ + ret = trace_define_field(event_call, type_str, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), filter_type); \ - mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ } while (0); -- cgit v1.2.3-70-g09d2 From 11d4616bd07f38d496bd489ed8fad1dc4d928823 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 20 Mar 2014 22:11:17 -0700 Subject: futex: revert back to the explicit waiter counting code Srikar Dronamraju reports that commit b0c29f79ecea ("futexes: Avoid taking the hb->lock if there's nothing to wake up") causes java threads getting stuck on futexes when runing specjbb on a power7 numa box. The cause appears to be that the powerpc spinlocks aren't using the same ticket lock model that we use on x86 (and other) architectures, which in turn result in the "spin_is_locked()" test in hb_waiters_pending() occasionally reporting an unlocked spinlock even when there are pending waiters. So this reinstates Davidlohr Bueso's original explicit waiter counting code, which I had convinced Davidlohr to drop in favor of figuring out the pending waiters by just using the existing state of the spinlock and the wait queue. Reported-and-tested-by: Srikar Dronamraju Original-code-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- kernel/futex.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 44a1261cb9f..08ec814ad9d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -234,6 +234,7 @@ static const struct futex_q futex_q_init = { * waiting on a futex. */ struct futex_hash_bucket { + atomic_t waiters; spinlock_t lock; struct plist_head chain; } ____cacheline_aligned_in_smp; @@ -253,22 +254,37 @@ static inline void futex_get_mm(union futex_key *key) smp_mb__after_atomic_inc(); } -static inline bool hb_waiters_pending(struct futex_hash_bucket *hb) +/* + * Reflects a new waiter being added to the waitqueue. + */ +static inline void hb_waiters_inc(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP + atomic_inc(&hb->waiters); /* - * Tasks trying to enter the critical region are most likely - * potential waiters that will be added to the plist. Ensure - * that wakers won't miss to-be-slept tasks in the window between - * the wait call and the actual plist_add. + * Full barrier (A), see the ordering comment above. */ - if (spin_is_locked(&hb->lock)) - return true; - smp_rmb(); /* Make sure we check the lock state first */ + smp_mb__after_atomic_inc(); +#endif +} + +/* + * Reflects a waiter being removed from the waitqueue by wakeup + * paths. + */ +static inline void hb_waiters_dec(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + atomic_dec(&hb->waiters); +#endif +} - return !plist_head_empty(&hb->chain); +static inline int hb_waiters_pending(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + return atomic_read(&hb->waiters); #else - return true; + return 1; #endif } @@ -954,6 +970,7 @@ static void __unqueue_futex(struct futex_q *q) hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); } /* @@ -1257,7 +1274,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, */ if (likely(&hb1->chain != &hb2->chain)) { plist_del(&q->list, &hb1->chain); + hb_waiters_dec(hb1); plist_add(&q->list, &hb2->chain); + hb_waiters_inc(hb2); q->lock_ptr = &hb2->lock; } get_futex_key_refs(key2); @@ -1600,6 +1619,17 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) struct futex_hash_bucket *hb; hb = hash_futex(&q->key); + + /* + * Increment the counter before taking the lock so that + * a potential waker won't miss a to-be-slept task that is + * waiting for the spinlock. This is safe as all queue_lock() + * users end up calling queue_me(). Similarly, for housekeeping, + * decrement the counter at queue_unlock() when some error has + * occurred and we don't end up adding the task to the list. + */ + hb_waiters_inc(hb); + q->lock_ptr = &hb->lock; spin_lock(&hb->lock); /* implies MB (A) */ @@ -1611,6 +1641,7 @@ queue_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { spin_unlock(&hb->lock); + hb_waiters_dec(hb); } /** @@ -2342,6 +2373,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * Unqueue the futex_q and determine which it was. */ plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; @@ -2875,6 +2907,7 @@ static int __init futex_init(void) futex_cmpxchg_enabled = 1; for (i = 0; i < futex_hashsize; i++) { + atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } -- cgit v1.2.3-70-g09d2