aboutsummaryrefslogtreecommitdiff
path: root/kernel/rcu/rcutorture.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/rcu/rcutorture.c')
-rw-r--r--kernel/rcu/rcutorture.c1707
1 files changed, 1707 insertions, 0 deletions
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
new file mode 100644
index 00000000000..948a7693748
--- /dev/null
+++ b/kernel/rcu/rcutorture.c
@@ -0,0 +1,1707 @@
+/*
+ * Read-Copy Update module-based torture test facility
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2005, 2006
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Josh Triplett <josh@joshtriplett.org>
+ *
+ * See also: Documentation/RCU/torture.txt
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/trace_clock.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
+
+
+torture_param(int, fqs_duration, 0,
+ "Duration of fqs bursts (us), 0 to disable");
+torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
+torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
+torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
+torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
+torture_param(bool, gp_normal, false,
+ "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
+torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
+torture_param(int, n_barrier_cbs, 0,
+ "# of callbacks/kthreads for barrier testing");
+torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, object_debug, 0,
+ "Enable debug-object double call_rcu() testing");
+torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
+torture_param(int, onoff_interval, 0,
+ "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
+torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
+torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
+torture_param(int, stall_cpu_holdoff, 10,
+ "Time to wait before starting stall (s).");
+torture_param(int, stat_interval, 60,
+ "Number of seconds between stats printk()s");
+torture_param(int, stutter, 5, "Number of seconds to run/halt test");
+torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
+torture_param(int, test_boost_duration, 4,
+ "Duration of each boost test, seconds.");
+torture_param(int, test_boost_interval, 7,
+ "Interval between boost tests, seconds.");
+torture_param(bool, test_no_idle_hz, true,
+ "Test support for tickless idle CPUs");
+torture_param(bool, verbose, true,
+ "Enable verbose debugging printk()s");
+
+static char *torture_type = "rcu";
+module_param(torture_type, charp, 0444);
+MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
+
+static int nrealreaders;
+static struct task_struct *writer_task;
+static struct task_struct **fakewriter_tasks;
+static struct task_struct **reader_tasks;
+static struct task_struct *stats_task;
+static struct task_struct *fqs_task;
+static struct task_struct *boost_tasks[NR_CPUS];
+static struct task_struct *stall_task;
+static struct task_struct **barrier_cbs_tasks;
+static struct task_struct *barrier_task;
+
+#define RCU_TORTURE_PIPE_LEN 10
+
+struct rcu_torture {
+ struct rcu_head rtort_rcu;
+ int rtort_pipe_count;
+ struct list_head rtort_free;
+ int rtort_mbtest;
+};
+
+static LIST_HEAD(rcu_torture_freelist);
+static struct rcu_torture __rcu *rcu_torture_current;
+static unsigned long rcu_torture_current_version;
+static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
+static DEFINE_SPINLOCK(rcu_torture_lock);
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
+ rcu_torture_count) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
+ rcu_torture_batch) = { 0 };
+static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
+static atomic_t n_rcu_torture_alloc;
+static atomic_t n_rcu_torture_alloc_fail;
+static atomic_t n_rcu_torture_free;
+static atomic_t n_rcu_torture_mberror;
+static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_barrier_error;
+static long n_rcu_torture_boost_ktrerror;
+static long n_rcu_torture_boost_rterror;
+static long n_rcu_torture_boost_failure;
+static long n_rcu_torture_boosts;
+static long n_rcu_torture_timers;
+static long n_barrier_attempts;
+static long n_barrier_successes;
+static struct list_head rcu_torture_removed;
+
+static int rcu_torture_writer_state;
+#define RTWS_FIXED_DELAY 0
+#define RTWS_DELAY 1
+#define RTWS_REPLACE 2
+#define RTWS_DEF_FREE 3
+#define RTWS_EXP_SYNC 4
+#define RTWS_COND_GET 5
+#define RTWS_COND_SYNC 6
+#define RTWS_SYNC 7
+#define RTWS_STUTTER 8
+#define RTWS_STOPPING 9
+
+#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
+#define RCUTORTURE_RUNNABLE_INIT 1
+#else
+#define RCUTORTURE_RUNNABLE_INIT 0
+#endif
+int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
+module_param(rcutorture_runnable, int, 0444);
+MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
+
+#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
+#define rcu_can_boost() 1
+#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
+#define rcu_can_boost() 0
+#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
+
+#ifdef CONFIG_RCU_TRACE
+static u64 notrace rcu_trace_clock_local(void)
+{
+ u64 ts = trace_clock_local();
+ unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+ return ts;
+}
+#else /* #ifdef CONFIG_RCU_TRACE */
+static u64 notrace rcu_trace_clock_local(void)
+{
+ return 0ULL;
+}
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+static unsigned long boost_starttime; /* jiffies of next boost test start. */
+DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
+ /* and boost task create/destroy. */
+static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
+static bool barrier_phase; /* Test phase. */
+static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
+static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
+static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
+
+/*
+ * Allocate an element from the rcu_tortures pool.
+ */
+static struct rcu_torture *
+rcu_torture_alloc(void)
+{
+ struct list_head *p;
+
+ spin_lock_bh(&rcu_torture_lock);
+ if (list_empty(&rcu_torture_freelist)) {
+ atomic_inc(&n_rcu_torture_alloc_fail);
+ spin_unlock_bh(&rcu_torture_lock);
+ return NULL;
+ }
+ atomic_inc(&n_rcu_torture_alloc);
+ p = rcu_torture_freelist.next;
+ list_del_init(p);
+ spin_unlock_bh(&rcu_torture_lock);
+ return container_of(p, struct rcu_torture, rtort_free);
+}
+
+/*
+ * Free an element to the rcu_tortures pool.
+ */
+static void
+rcu_torture_free(struct rcu_torture *p)
+{
+ atomic_inc(&n_rcu_torture_free);
+ spin_lock_bh(&rcu_torture_lock);
+ list_add_tail(&p->rtort_free, &rcu_torture_freelist);
+ spin_unlock_bh(&rcu_torture_lock);
+}
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+
+struct rcu_torture_ops {
+ int ttype;
+ void (*init)(void);
+ int (*readlock)(void);
+ void (*read_delay)(struct torture_random_state *rrsp);
+ void (*readunlock)(int idx);
+ int (*completed)(void);
+ void (*deferred_free)(struct rcu_torture *p);
+ void (*sync)(void);
+ void (*exp_sync)(void);
+ unsigned long (*get_state)(void);
+ void (*cond_sync)(unsigned long oldstate);
+ void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+ void (*cb_barrier)(void);
+ void (*fqs)(void);
+ void (*stats)(char *page);
+ int irq_capable;
+ int can_boost;
+ const char *name;
+};
+
+static struct rcu_torture_ops *cur_ops;
+
+/*
+ * Definitions for rcu torture testing.
+ */
+
+static int rcu_torture_read_lock(void) __acquires(RCU)
+{
+ rcu_read_lock();
+ return 0;
+}
+
+static void rcu_read_delay(struct torture_random_state *rrsp)
+{
+ const unsigned long shortdelay_us = 200;
+ const unsigned long longdelay_ms = 50;
+
+ /* We want a short delay sometimes to make a reader delay the grace
+ * period, and we want a long delay occasionally to trigger
+ * force_quiescent_state. */
+
+ if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
+ udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!preempt_count() &&
+ !(torture_random(rrsp) % (nrealreaders * 20000)))
+ preempt_schedule(); /* No QS if preempt_disable() in effect */
+#endif
+}
+
+static void rcu_torture_read_unlock(int idx) __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int rcu_torture_completed(void)
+{
+ return rcu_batches_completed();
+}
+
+/*
+ * Update callback in the pipe. This should be invoked after a grace period.
+ */
+static bool
+rcu_torture_pipe_update_one(struct rcu_torture *rp)
+{
+ int i;
+
+ i = rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ rp->rtort_mbtest = 0;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Update all callbacks in the pipe. Suitable for synchronous grace-period
+ * primitives.
+ */
+static void
+rcu_torture_pipe_update(struct rcu_torture *old_rp)
+{
+ struct rcu_torture *rp;
+ struct rcu_torture *rp1;
+
+ if (old_rp)
+ list_add(&old_rp->rtort_free, &rcu_torture_removed);
+ list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
+ if (rcu_torture_pipe_update_one(rp)) {
+ list_del(&rp->rtort_free);
+ rcu_torture_free(rp);
+ }
+ }
+}
+
+static void
+rcu_torture_cb(struct rcu_head *p)
+{
+ struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
+
+ if (torture_must_stop_irq()) {
+ /* Test is ending, just drop callbacks on the floor. */
+ /* The next initialization will pick up the pieces. */
+ return;
+ }
+ if (rcu_torture_pipe_update_one(rp))
+ rcu_torture_free(rp);
+ else
+ cur_ops->deferred_free(rp);
+}
+
+static int rcu_no_completed(void)
+{
+ return 0;
+}
+
+static void rcu_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static void rcu_sync_torture_init(void)
+{
+ INIT_LIST_HEAD(&rcu_torture_removed);
+}
+
+static struct rcu_torture_ops rcu_ops = {
+ .ttype = RCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_torture_completed,
+ .deferred_free = rcu_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .exp_sync = synchronize_rcu_expedited,
+ .get_state = get_state_synchronize_rcu,
+ .cond_sync = cond_synchronize_rcu,
+ .call = call_rcu,
+ .cb_barrier = rcu_barrier,
+ .fqs = rcu_force_quiescent_state,
+ .stats = NULL,
+ .irq_capable = 1,
+ .can_boost = rcu_can_boost(),
+ .name = "rcu"
+};
+
+/*
+ * Definitions for rcu_bh torture testing.
+ */
+
+static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
+{
+ rcu_read_lock_bh();
+ return 0;
+}
+
+static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
+{
+ rcu_read_unlock_bh();
+}
+
+static int rcu_bh_torture_completed(void)
+{
+ return rcu_batches_completed_bh();
+}
+
+static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops rcu_bh_ops = {
+ .ttype = RCU_BH_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_bh_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_bh_torture_read_unlock,
+ .completed = rcu_bh_torture_completed,
+ .deferred_free = rcu_bh_torture_deferred_free,
+ .sync = synchronize_rcu_bh,
+ .exp_sync = synchronize_rcu_bh_expedited,
+ .call = call_rcu_bh,
+ .cb_barrier = rcu_barrier_bh,
+ .fqs = rcu_bh_force_quiescent_state,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_bh"
+};
+
+/*
+ * Don't even think about trying any of these in real life!!!
+ * The names includes "busted", and they really means it!
+ * The only purpose of these functions is to provide a buggy RCU
+ * implementation to make sure that rcutorture correctly emits
+ * buggy-RCU error messages.
+ */
+static void rcu_busted_torture_deferred_free(struct rcu_torture *p)
+{
+ /* This is a deliberate bug for testing purposes only! */
+ rcu_torture_cb(&p->rtort_rcu);
+}
+
+static void synchronize_rcu_busted(void)
+{
+ /* This is a deliberate bug for testing purposes only! */
+}
+
+static void
+call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ /* This is a deliberate bug for testing purposes only! */
+ func(head);
+}
+
+static struct rcu_torture_ops rcu_busted_ops = {
+ .ttype = INVALID_RCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_no_completed,
+ .deferred_free = rcu_busted_torture_deferred_free,
+ .sync = synchronize_rcu_busted,
+ .exp_sync = synchronize_rcu_busted,
+ .call = call_rcu_busted,
+ .cb_barrier = NULL,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_busted"
+};
+
+/*
+ * Definitions for srcu torture testing.
+ */
+
+DEFINE_STATIC_SRCU(srcu_ctl);
+
+static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
+{
+ return srcu_read_lock(&srcu_ctl);
+}
+
+static void srcu_read_delay(struct torture_random_state *rrsp)
+{
+ long delay;
+ const long uspertick = 1000000 / HZ;
+ const long longdelay = 10;
+
+ /* We want there to be long-running readers, but not all the time. */
+
+ delay = torture_random(rrsp) %
+ (nrealreaders * 2 * longdelay * uspertick);
+ if (!delay)
+ schedule_timeout_interruptible(longdelay);
+ else
+ rcu_read_delay(rrsp);
+}
+
+static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
+{
+ srcu_read_unlock(&srcu_ctl, idx);
+}
+
+static int srcu_torture_completed(void)
+{
+ return srcu_batches_completed(&srcu_ctl);
+}
+
+static void srcu_torture_deferred_free(struct rcu_torture *rp)
+{
+ call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+}
+
+static void srcu_torture_synchronize(void)
+{
+ synchronize_srcu(&srcu_ctl);
+}
+
+static void srcu_torture_call(struct rcu_head *head,
+ void (*func)(struct rcu_head *head))
+{
+ call_srcu(&srcu_ctl, head, func);
+}
+
+static void srcu_torture_barrier(void)
+{
+ srcu_barrier(&srcu_ctl);
+}
+
+static void srcu_torture_stats(char *page)
+{
+ int cpu;
+ int idx = srcu_ctl.completed & 0x1;
+
+ page += sprintf(page, "%s%s per-CPU(idx=%d):",
+ torture_type, TORTURE_FLAG, idx);
+ for_each_possible_cpu(cpu) {
+ long c0, c1;
+
+ c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
+ c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
+ page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1);
+ }
+ sprintf(page, "\n");
+}
+
+static void srcu_torture_synchronize_expedited(void)
+{
+ synchronize_srcu_expedited(&srcu_ctl);
+}
+
+static struct rcu_torture_ops srcu_ops = {
+ .ttype = SRCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = srcu_torture_read_lock,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .completed = srcu_torture_completed,
+ .deferred_free = srcu_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .exp_sync = srcu_torture_synchronize_expedited,
+ .call = srcu_torture_call,
+ .cb_barrier = srcu_torture_barrier,
+ .stats = srcu_torture_stats,
+ .name = "srcu"
+};
+
+/*
+ * Definitions for sched torture testing.
+ */
+
+static int sched_torture_read_lock(void)
+{
+ preempt_disable();
+ return 0;
+}
+
+static void sched_torture_read_unlock(int idx)
+{
+ preempt_enable();
+}
+
+static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops sched_ops = {
+ .ttype = RCU_SCHED_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = sched_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = sched_torture_read_unlock,
+ .completed = rcu_no_completed,
+ .deferred_free = rcu_sched_torture_deferred_free,
+ .sync = synchronize_sched,
+ .exp_sync = synchronize_sched_expedited,
+ .call = call_rcu_sched,
+ .cb_barrier = rcu_barrier_sched,
+ .fqs = rcu_sched_force_quiescent_state,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "sched"
+};
+
+/*
+ * RCU torture priority-boost testing. Runs one real-time thread per
+ * CPU for moderate bursts, repeatedly registering RCU callbacks and
+ * spinning waiting for them to be invoked. If a given callback takes
+ * too long to be invoked, we assume that priority inversion has occurred.
+ */
+
+struct rcu_boost_inflight {
+ struct rcu_head rcu;
+ int inflight;
+};
+
+static void rcu_torture_boost_cb(struct rcu_head *head)
+{
+ struct rcu_boost_inflight *rbip =
+ container_of(head, struct rcu_boost_inflight, rcu);
+
+ smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
+ rbip->inflight = 0;
+}
+
+static int rcu_torture_boost(void *arg)
+{
+ unsigned long call_rcu_time;
+ unsigned long endtime;
+ unsigned long oldstarttime;
+ struct rcu_boost_inflight rbi = { .inflight = 0 };
+ struct sched_param sp;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started");
+
+ /* Set real-time priority. */
+ sp.sched_priority = 1;
+ if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
+ n_rcu_torture_boost_rterror++;
+ }
+
+ init_rcu_head_on_stack(&rbi.rcu);
+ /* Each pass through the following loop does one boost-test cycle. */
+ do {
+ /* Wait for the next test interval. */
+ oldstarttime = boost_starttime;
+ while (ULONG_CMP_LT(jiffies, oldstarttime)) {
+ schedule_timeout_interruptible(oldstarttime - jiffies);
+ stutter_wait("rcu_torture_boost");
+ if (torture_must_stop())
+ goto checkwait;
+ }
+
+ /* Do one boost-test interval. */
+ endtime = oldstarttime + test_boost_duration * HZ;
+ call_rcu_time = jiffies;
+ while (ULONG_CMP_LT(jiffies, endtime)) {
+ /* If we don't have a callback in flight, post one. */
+ if (!rbi.inflight) {
+ smp_mb(); /* RCU core before ->inflight = 1. */
+ rbi.inflight = 1;
+ call_rcu(&rbi.rcu, rcu_torture_boost_cb);
+ if (jiffies - call_rcu_time >
+ test_boost_duration * HZ - HZ / 2) {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
+ n_rcu_torture_boost_failure++;
+ }
+ call_rcu_time = jiffies;
+ }
+ cond_resched();
+ stutter_wait("rcu_torture_boost");
+ if (torture_must_stop())
+ goto checkwait;
+ }
+
+ /*
+ * Set the start time of the next test interval.
+ * Yes, this is vulnerable to long delays, but such
+ * delays simply cause a false negative for the next
+ * interval. Besides, we are running at RT priority,
+ * so delays should be relatively rare.
+ */
+ while (oldstarttime == boost_starttime &&
+ !kthread_should_stop()) {
+ if (mutex_trylock(&boost_mutex)) {
+ boost_starttime = jiffies +
+ test_boost_interval * HZ;
+ n_rcu_torture_boosts++;
+ mutex_unlock(&boost_mutex);
+ break;
+ }
+ schedule_timeout_uninterruptible(1);
+ }
+
+ /* Go do the stutter. */
+checkwait: stutter_wait("rcu_torture_boost");
+ } while (!torture_must_stop());
+
+ /* Clean up and exit. */
+ while (!kthread_should_stop() || rbi.inflight) {
+ torture_shutdown_absorb("rcu_torture_boost");
+ schedule_timeout_uninterruptible(1);
+ }
+ smp_mb(); /* order accesses to ->inflight before stack-frame death. */
+ destroy_rcu_head_on_stack(&rbi.rcu);
+ torture_kthread_stopping("rcu_torture_boost");
+ return 0;
+}
+
+/*
+ * RCU torture force-quiescent-state kthread. Repeatedly induces
+ * bursts of calls to force_quiescent_state(), increasing the probability
+ * of occurrence of some important types of race conditions.
+ */
+static int
+rcu_torture_fqs(void *arg)
+{
+ unsigned long fqs_resume_time;
+ int fqs_burst_remaining;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_fqs task started");
+ do {
+ fqs_resume_time = jiffies + fqs_stutter * HZ;
+ while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
+ !kthread_should_stop()) {
+ schedule_timeout_interruptible(1);
+ }
+ fqs_burst_remaining = fqs_duration;
+ while (fqs_burst_remaining > 0 &&
+ !kthread_should_stop()) {
+ cur_ops->fqs();
+ udelay(fqs_holdoff);
+ fqs_burst_remaining -= fqs_holdoff;
+ }
+ stutter_wait("rcu_torture_fqs");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_fqs");
+ return 0;
+}
+
+/*
+ * RCU torture writer kthread. Repeatedly substitutes a new structure
+ * for that pointed to by rcu_torture_current, freeing the old structure
+ * after a series of grace periods (the "pipeline").
+ */
+static int
+rcu_torture_writer(void *arg)
+{
+ unsigned long gp_snap;
+ bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
+ bool gp_sync1 = gp_sync;
+ int i;
+ struct rcu_torture *rp;
+ struct rcu_torture *old_rp;
+ static DEFINE_TORTURE_RANDOM(rand);
+ int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
+ RTWS_COND_GET, RTWS_SYNC };
+ int nsynctypes = 0;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
+
+ /* Initialize synctype[] array. If none set, take default. */
+ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync)
+ gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
+ if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
+ synctype[nsynctypes++] = RTWS_COND_GET;
+ else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
+ pr_alert("rcu_torture_writer: gp_cond without primitives.\n");
+ if (gp_exp1 && cur_ops->exp_sync)
+ synctype[nsynctypes++] = RTWS_EXP_SYNC;
+ else if (gp_exp && !cur_ops->exp_sync)
+ pr_alert("rcu_torture_writer: gp_exp without primitives.\n");
+ if (gp_normal1 && cur_ops->deferred_free)
+ synctype[nsynctypes++] = RTWS_DEF_FREE;
+ else if (gp_normal && !cur_ops->deferred_free)
+ pr_alert("rcu_torture_writer: gp_normal without primitives.\n");
+ if (gp_sync1 && cur_ops->sync)
+ synctype[nsynctypes++] = RTWS_SYNC;
+ else if (gp_sync && !cur_ops->sync)
+ pr_alert("rcu_torture_writer: gp_sync without primitives.\n");
+ if (WARN_ONCE(nsynctypes == 0,
+ "rcu_torture_writer: No update-side primitives.\n")) {
+ /*
+ * No updates primitives, so don't try updating.
+ * The resulting test won't be testing much, hence the
+ * above WARN_ONCE().
+ */
+ rcu_torture_writer_state = RTWS_STOPPING;
+ torture_kthread_stopping("rcu_torture_writer");
+ }
+
+ do {
+ rcu_torture_writer_state = RTWS_FIXED_DELAY;
+ schedule_timeout_uninterruptible(1);
+ rp = rcu_torture_alloc();
+ if (rp == NULL)
+ continue;
+ rp->rtort_pipe_count = 0;
+ rcu_torture_writer_state = RTWS_DELAY;
+ udelay(torture_random(&rand) & 0x3ff);
+ rcu_torture_writer_state = RTWS_REPLACE;
+ old_rp = rcu_dereference_check(rcu_torture_current,
+ current == writer_task);
+ rp->rtort_mbtest = 1;
+ rcu_assign_pointer(rcu_torture_current, rp);
+ smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
+ if (old_rp) {
+ i = old_rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ old_rp->rtort_pipe_count++;
+ switch (synctype[torture_random(&rand) % nsynctypes]) {
+ case RTWS_DEF_FREE:
+ rcu_torture_writer_state = RTWS_DEF_FREE;
+ cur_ops->deferred_free(old_rp);
+ break;
+ case RTWS_EXP_SYNC:
+ rcu_torture_writer_state = RTWS_EXP_SYNC;
+ cur_ops->exp_sync();
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET:
+ rcu_torture_writer_state = RTWS_COND_GET;
+ gp_snap = cur_ops->get_state();
+ i = torture_random(&rand) % 16;
+ if (i != 0)
+ schedule_timeout_interruptible(i);
+ udelay(torture_random(&rand) % 1000);
+ rcu_torture_writer_state = RTWS_COND_SYNC;
+ cur_ops->cond_sync(gp_snap);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_SYNC:
+ rcu_torture_writer_state = RTWS_SYNC;
+ cur_ops->sync();
+ rcu_torture_pipe_update(old_rp);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+ }
+ rcutorture_record_progress(++rcu_torture_current_version);
+ rcu_torture_writer_state = RTWS_STUTTER;
+ stutter_wait("rcu_torture_writer");
+ } while (!torture_must_stop());
+ rcu_torture_writer_state = RTWS_STOPPING;
+ torture_kthread_stopping("rcu_torture_writer");
+ return 0;
+}
+
+/*
+ * RCU torture fake writer kthread. Repeatedly calls sync, with a random
+ * delay between calls.
+ */
+static int
+rcu_torture_fakewriter(void *arg)
+{
+ DEFINE_TORTURE_RANDOM(rand);
+
+ VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
+ set_user_nice(current, MAX_NICE);
+
+ do {
+ schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
+ udelay(torture_random(&rand) & 0x3ff);
+ if (cur_ops->cb_barrier != NULL &&
+ torture_random(&rand) % (nfakewriters * 8) == 0) {
+ cur_ops->cb_barrier();
+ } else if (gp_normal == gp_exp) {
+ if (torture_random(&rand) & 0x80)
+ cur_ops->sync();
+ else
+ cur_ops->exp_sync();
+ } else if (gp_normal) {
+ cur_ops->sync();
+ } else {
+ cur_ops->exp_sync();
+ }
+ stutter_wait("rcu_torture_fakewriter");
+ } while (!torture_must_stop());
+
+ torture_kthread_stopping("rcu_torture_fakewriter");
+ return 0;
+}
+
+static void rcutorture_trace_dump(void)
+{
+ static atomic_t beenhere = ATOMIC_INIT(0);
+
+ if (atomic_read(&beenhere))
+ return;
+ if (atomic_xchg(&beenhere, 1) != 0)
+ return;
+ ftrace_dump(DUMP_ALL);
+}
+
+/*
+ * RCU torture reader from timer handler. Dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array. The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ */
+static void rcu_torture_timer(unsigned long unused)
+{
+ int idx;
+ int completed;
+ int completed_end;
+ static DEFINE_TORTURE_RANDOM(rand);
+ static DEFINE_SPINLOCK(rand_lock);
+ struct rcu_torture *p;
+ int pipe_count;
+ unsigned long long ts;
+
+ idx = cur_ops->readlock();
+ completed = cur_ops->completed();
+ ts = rcu_trace_clock_local();
+ p = rcu_dereference_check(rcu_torture_current,
+ rcu_read_lock_bh_held() ||
+ rcu_read_lock_sched_held() ||
+ srcu_read_lock_held(&srcu_ctl));
+ if (p == NULL) {
+ /* Leave because rcu_torture_writer is not yet underway */
+ cur_ops->readunlock(idx);
+ return;
+ }
+ if (p->rtort_mbtest == 0)
+ atomic_inc(&n_rcu_torture_mberror);
+ spin_lock(&rand_lock);
+ cur_ops->read_delay(&rand);
+ n_rcu_torture_timers++;
+ spin_unlock(&rand_lock);
+ preempt_disable();
+ pipe_count = p->rtort_pipe_count;
+ if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ pipe_count = RCU_TORTURE_PIPE_LEN;
+ }
+ completed_end = cur_ops->completed();
+ if (pipe_count > 1) {
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
+ completed, completed_end);
+ rcutorture_trace_dump();
+ }
+ __this_cpu_inc(rcu_torture_count[pipe_count]);
+ completed = completed_end - completed;
+ if (completed > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ completed = RCU_TORTURE_PIPE_LEN;
+ }
+ __this_cpu_inc(rcu_torture_batch[completed]);
+ preempt_enable();
+ cur_ops->readunlock(idx);
+}
+
+/*
+ * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array. The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ */
+static int
+rcu_torture_reader(void *arg)
+{
+ int completed;
+ int completed_end;
+ int idx;
+ DEFINE_TORTURE_RANDOM(rand);
+ struct rcu_torture *p;
+ int pipe_count;
+ struct timer_list t;
+ unsigned long long ts;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
+ set_user_nice(current, MAX_NICE);
+ if (irqreader && cur_ops->irq_capable)
+ setup_timer_on_stack(&t, rcu_torture_timer, 0);
+
+ do {
+ if (irqreader && cur_ops->irq_capable) {
+ if (!timer_pending(&t))
+ mod_timer(&t, jiffies + 1);
+ }
+ idx = cur_ops->readlock();
+ completed = cur_ops->completed();
+ ts = rcu_trace_clock_local();
+ p = rcu_dereference_check(rcu_torture_current,
+ rcu_read_lock_bh_held() ||
+ rcu_read_lock_sched_held() ||
+ srcu_read_lock_held(&srcu_ctl));
+ if (p == NULL) {
+ /* Wait for rcu_torture_writer to get underway */
+ cur_ops->readunlock(idx);
+ schedule_timeout_interruptible(HZ);
+ continue;
+ }
+ if (p->rtort_mbtest == 0)
+ atomic_inc(&n_rcu_torture_mberror);
+ cur_ops->read_delay(&rand);
+ preempt_disable();
+ pipe_count = p->rtort_pipe_count;
+ if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ pipe_count = RCU_TORTURE_PIPE_LEN;
+ }
+ completed_end = cur_ops->completed();
+ if (pipe_count > 1) {
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
+ ts, completed, completed_end);
+ rcutorture_trace_dump();
+ }
+ __this_cpu_inc(rcu_torture_count[pipe_count]);
+ completed = completed_end - completed;
+ if (completed > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ completed = RCU_TORTURE_PIPE_LEN;
+ }
+ __this_cpu_inc(rcu_torture_batch[completed]);
+ preempt_enable();
+ cur_ops->readunlock(idx);
+ cond_resched();
+ stutter_wait("rcu_torture_reader");
+ } while (!torture_must_stop());
+ if (irqreader && cur_ops->irq_capable) {
+ del_timer_sync(&t);
+ destroy_timer_on_stack(&t);
+ }
+ torture_kthread_stopping("rcu_torture_reader");
+ return 0;
+}
+
+/*
+ * Create an RCU-torture statistics message in the specified buffer.
+ */
+static void
+rcu_torture_printk(char *page)
+{
+ int cpu;
+ int i;
+ long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ static unsigned long rtcv_snap = ULONG_MAX;
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+ pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
+ batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
+ }
+ }
+ for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+ if (pipesummary[i] != 0)
+ break;
+ }
+ page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
+ page += sprintf(page,
+ "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
+ rcu_torture_current,
+ rcu_torture_current_version,
+ list_empty(&rcu_torture_freelist),
+ atomic_read(&n_rcu_torture_alloc),
+ atomic_read(&n_rcu_torture_alloc_fail),
+ atomic_read(&n_rcu_torture_free));
+ page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ",
+ atomic_read(&n_rcu_torture_mberror),
+ n_rcu_torture_boost_ktrerror,
+ n_rcu_torture_boost_rterror);
+ page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ",
+ n_rcu_torture_boost_failure,
+ n_rcu_torture_boosts,
+ n_rcu_torture_timers);
+ page = torture_onoff_stats(page);
+ page += sprintf(page, "barrier: %ld/%ld:%ld",
+ n_barrier_successes,
+ n_barrier_attempts,
+ n_rcu_torture_barrier_error);
+ page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
+ if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+ n_rcu_torture_barrier_error != 0 ||
+ n_rcu_torture_boost_ktrerror != 0 ||
+ n_rcu_torture_boost_rterror != 0 ||
+ n_rcu_torture_boost_failure != 0 ||
+ i > 1) {
+ page += sprintf(page, "!!! ");
+ atomic_inc(&n_rcu_torture_error);
+ WARN_ON_ONCE(1);
+ }
+ page += sprintf(page, "Reader Pipe: ");
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+ page += sprintf(page, " %ld", pipesummary[i]);
+ page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
+ page += sprintf(page, "Reader Batch: ");
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+ page += sprintf(page, " %ld", batchsummary[i]);
+ page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
+ page += sprintf(page, "Free-Block Circulation: ");
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+ page += sprintf(page, " %d",
+ atomic_read(&rcu_torture_wcount[i]));
+ }
+ page += sprintf(page, "\n");
+ if (cur_ops->stats)
+ cur_ops->stats(page);
+ if (rtcv_snap == rcu_torture_current_version &&
+ rcu_torture_current != NULL) {
+ int __maybe_unused flags;
+ unsigned long __maybe_unused gpnum;
+ unsigned long __maybe_unused completed;
+
+ rcutorture_get_gp_data(cur_ops->ttype,
+ &flags, &gpnum, &completed);
+ page += sprintf(page,
+ "??? Writer stall state %d g%lu c%lu f%#x\n",
+ rcu_torture_writer_state,
+ gpnum, completed, flags);
+ show_rcu_gp_kthreads();
+ rcutorture_trace_dump();
+ }
+ rtcv_snap = rcu_torture_current_version;
+}
+
+/*
+ * Print torture statistics. Caller must ensure that there is only
+ * one call to this function at a given time!!! This is normally
+ * accomplished by relying on the module system to only have one copy
+ * of the module loaded, and then by giving the rcu_torture_stats
+ * kthread full control (or the init/cleanup functions when rcu_torture_stats
+ * thread is not running).
+ */
+static void
+rcu_torture_stats_print(void)
+{
+ int size = nr_cpu_ids * 200 + 8192;
+ char *buf;
+
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("rcu-torture: Out of memory, need: %d", size);
+ return;
+ }
+ rcu_torture_printk(buf);
+ pr_alert("%s", buf);
+ kfree(buf);
+}
+
+/*
+ * Periodically prints torture statistics, if periodic statistics printing
+ * was specified via the stat_interval module parameter.
+ */
+static int
+rcu_torture_stats(void *arg)
+{
+ VERBOSE_TOROUT_STRING("rcu_torture_stats task started");
+ do {
+ schedule_timeout_interruptible(stat_interval * HZ);
+ rcu_torture_stats_print();
+ torture_shutdown_absorb("rcu_torture_stats");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_stats");
+ return 0;
+}
+
+static inline void
+rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
+{
+ pr_alert("%s" TORTURE_FLAG
+ "--- %s: nreaders=%d nfakewriters=%d "
+ "stat_interval=%d verbose=%d test_no_idle_hz=%d "
+ "shuffle_interval=%d stutter=%d irqreader=%d "
+ "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
+ "test_boost=%d/%d test_boost_interval=%d "
+ "test_boost_duration=%d shutdown_secs=%d "
+ "stall_cpu=%d stall_cpu_holdoff=%d "
+ "n_barrier_cbs=%d "
+ "onoff_interval=%d onoff_holdoff=%d\n",
+ torture_type, tag, nrealreaders, nfakewriters,
+ stat_interval, verbose, test_no_idle_hz, shuffle_interval,
+ stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
+ test_boost, cur_ops->can_boost,
+ test_boost_interval, test_boost_duration, shutdown_secs,
+ stall_cpu, stall_cpu_holdoff,
+ n_barrier_cbs,
+ onoff_interval, onoff_holdoff);
+}
+
+static void rcutorture_booster_cleanup(int cpu)
+{
+ struct task_struct *t;
+
+ if (boost_tasks[cpu] == NULL)
+ return;
+ mutex_lock(&boost_mutex);
+ t = boost_tasks[cpu];
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+
+ /* This must be outside of the mutex, otherwise deadlock! */
+ torture_stop_kthread(rcu_torture_boost, t);
+}
+
+static int rcutorture_booster_init(int cpu)
+{
+ int retval;
+
+ if (boost_tasks[cpu] != NULL)
+ return 0; /* Already created, nothing more to do. */
+
+ /* Don't allow time recalculation while creating a new task. */
+ mutex_lock(&boost_mutex);
+ VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
+ boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
+ cpu_to_node(cpu),
+ "rcu_torture_boost");
+ if (IS_ERR(boost_tasks[cpu])) {
+ retval = PTR_ERR(boost_tasks[cpu]);
+ VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
+ n_rcu_torture_boost_ktrerror++;
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+ return retval;
+ }
+ kthread_bind(boost_tasks[cpu], cpu);
+ wake_up_process(boost_tasks[cpu]);
+ mutex_unlock(&boost_mutex);
+ return 0;
+}
+
+/*
+ * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
+ * induces a CPU stall for the time specified by stall_cpu.
+ */
+static int rcu_torture_stall(void *args)
+{
+ unsigned long stop_at;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
+ if (stall_cpu_holdoff > 0) {
+ VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
+ schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
+ VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff");
+ }
+ if (!kthread_should_stop()) {
+ stop_at = get_seconds() + stall_cpu;
+ /* RCU CPU stall is expected behavior in following code. */
+ pr_alert("rcu_torture_stall start.\n");
+ rcu_read_lock();
+ preempt_disable();
+ while (ULONG_CMP_LT(get_seconds(), stop_at))
+ continue; /* Induce RCU CPU stall warning. */
+ preempt_enable();
+ rcu_read_unlock();
+ pr_alert("rcu_torture_stall end.\n");
+ }
+ torture_shutdown_absorb("rcu_torture_stall");
+ while (!kthread_should_stop())
+ schedule_timeout_interruptible(10 * HZ);
+ return 0;
+}
+
+/* Spawn CPU-stall kthread, if stall_cpu specified. */
+static int __init rcu_torture_stall_init(void)
+{
+ if (stall_cpu <= 0)
+ return 0;
+ return torture_create_kthread(rcu_torture_stall, NULL, stall_task);
+}
+
+/* Callback function for RCU barrier testing. */
+static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
+{
+ atomic_inc(&barrier_cbs_invoked);
+}
+
+/* kthread function to register callbacks used to test RCU barriers. */
+static int rcu_torture_barrier_cbs(void *arg)
+{
+ long myid = (long)arg;
+ bool lastphase = 0;
+ bool newphase;
+ struct rcu_head rcu;
+
+ init_rcu_head_on_stack(&rcu);
+ VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started");
+ set_user_nice(current, MAX_NICE);
+ do {
+ wait_event(barrier_cbs_wq[myid],
+ (newphase =
+ ACCESS_ONCE(barrier_phase)) != lastphase ||
+ torture_must_stop());
+ lastphase = newphase;
+ smp_mb(); /* ensure barrier_phase load before ->call(). */
+ if (torture_must_stop())
+ break;
+ cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+ if (atomic_dec_and_test(&barrier_cbs_count))
+ wake_up(&barrier_wq);
+ } while (!torture_must_stop());
+ cur_ops->cb_barrier();
+ destroy_rcu_head_on_stack(&rcu);
+ torture_kthread_stopping("rcu_torture_barrier_cbs");
+ return 0;
+}
+
+/* kthread function to drive and coordinate RCU barrier testing. */
+static int rcu_torture_barrier(void *arg)
+{
+ int i;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_barrier task starting");
+ do {
+ atomic_set(&barrier_cbs_invoked, 0);
+ atomic_set(&barrier_cbs_count, n_barrier_cbs);
+ smp_mb(); /* Ensure barrier_phase after prior assignments. */
+ barrier_phase = !barrier_phase;
+ for (i = 0; i < n_barrier_cbs; i++)
+ wake_up(&barrier_cbs_wq[i]);
+ wait_event(barrier_wq,
+ atomic_read(&barrier_cbs_count) == 0 ||
+ torture_must_stop());
+ if (torture_must_stop())
+ break;
+ n_barrier_attempts++;
+ cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
+ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
+ n_rcu_torture_barrier_error++;
+ WARN_ON_ONCE(1);
+ }
+ n_barrier_successes++;
+ schedule_timeout_interruptible(HZ / 10);
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_barrier");
+ return 0;
+}
+
+/* Initialize RCU barrier testing. */
+static int rcu_torture_barrier_init(void)
+{
+ int i;
+ int ret;
+
+ if (n_barrier_cbs == 0)
+ return 0;
+ if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
+ pr_alert("%s" TORTURE_FLAG
+ " Call or barrier ops missing for %s,\n",
+ torture_type, cur_ops->name);
+ pr_alert("%s" TORTURE_FLAG
+ " RCU barrier testing omitted from run.\n",
+ torture_type);
+ return 0;
+ }
+ atomic_set(&barrier_cbs_count, 0);
+ atomic_set(&barrier_cbs_invoked, 0);
+ barrier_cbs_tasks =
+ kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
+ GFP_KERNEL);
+ barrier_cbs_wq =
+ kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
+ GFP_KERNEL);
+ if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
+ return -ENOMEM;
+ for (i = 0; i < n_barrier_cbs; i++) {
+ init_waitqueue_head(&barrier_cbs_wq[i]);
+ ret = torture_create_kthread(rcu_torture_barrier_cbs,
+ (void *)(long)i,
+ barrier_cbs_tasks[i]);
+ if (ret)
+ return ret;
+ }
+ return torture_create_kthread(rcu_torture_barrier, NULL, barrier_task);
+}
+
+/* Clean up after RCU barrier testing. */
+static void rcu_torture_barrier_cleanup(void)
+{
+ int i;
+
+ torture_stop_kthread(rcu_torture_barrier, barrier_task);
+ if (barrier_cbs_tasks != NULL) {
+ for (i = 0; i < n_barrier_cbs; i++)
+ torture_stop_kthread(rcu_torture_barrier_cbs,
+ barrier_cbs_tasks[i]);
+ kfree(barrier_cbs_tasks);
+ barrier_cbs_tasks = NULL;
+ }
+ if (barrier_cbs_wq != NULL) {
+ kfree(barrier_cbs_wq);
+ barrier_cbs_wq = NULL;
+ }
+}
+
+static int rcutorture_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ (void)rcutorture_booster_init(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ rcutorture_booster_cleanup(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcutorture_cpu_nb = {
+ .notifier_call = rcutorture_cpu_notify,
+};
+
+static void
+rcu_torture_cleanup(void)
+{
+ int i;
+
+ rcutorture_record_test_transition();
+ if (torture_cleanup()) {
+ if (cur_ops->cb_barrier != NULL)
+ cur_ops->cb_barrier();
+ return;
+ }
+
+ rcu_torture_barrier_cleanup();
+ torture_stop_kthread(rcu_torture_stall, stall_task);
+ torture_stop_kthread(rcu_torture_writer, writer_task);
+
+ if (reader_tasks) {
+ for (i = 0; i < nrealreaders; i++)
+ torture_stop_kthread(rcu_torture_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ }
+ rcu_torture_current = NULL;
+
+ if (fakewriter_tasks) {
+ for (i = 0; i < nfakewriters; i++) {
+ torture_stop_kthread(rcu_torture_fakewriter,
+ fakewriter_tasks[i]);
+ }
+ kfree(fakewriter_tasks);
+ fakewriter_tasks = NULL;
+ }
+
+ torture_stop_kthread(rcu_torture_stats, stats_task);
+ torture_stop_kthread(rcu_torture_fqs, fqs_task);
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+ unregister_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i)
+ rcutorture_booster_cleanup(i);
+ }
+
+ /* Wait for all RCU callbacks to fire. */
+
+ if (cur_ops->cb_barrier != NULL)
+ cur_ops->cb_barrier();
+
+ rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
+
+ if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
+ rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
+ else if (torture_onoff_failures())
+ rcu_torture_print_module_parms(cur_ops,
+ "End of test: RCU_HOTPLUG");
+ else
+ rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
+}
+
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+static void rcu_torture_leak_cb(struct rcu_head *rhp)
+{
+}
+
+static void rcu_torture_err_cb(struct rcu_head *rhp)
+{
+ /*
+ * This -might- happen due to race conditions, but is unlikely.
+ * The scenario that leads to this happening is that the
+ * first of the pair of duplicate callbacks is queued,
+ * someone else starts a grace period that includes that
+ * callback, then the second of the pair must wait for the
+ * next grace period. Unlikely, but can happen. If it
+ * does happen, the debug-objects subsystem won't have splatted.
+ */
+ pr_alert("rcutorture: duplicated callback was invoked.\n");
+}
+#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+
+/*
+ * Verify that double-free causes debug-objects to complain, but only
+ * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test
+ * cannot be carried out.
+ */
+static void rcu_test_debug_objects(void)
+{
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+ struct rcu_head rh1;
+ struct rcu_head rh2;
+
+ init_rcu_head_on_stack(&rh1);
+ init_rcu_head_on_stack(&rh2);
+ pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n");
+
+ /* Try to queue the rh2 pair of callbacks for the same grace period. */
+ preempt_disable(); /* Prevent preemption from interrupting test. */
+ rcu_read_lock(); /* Make it impossible to finish a grace period. */
+ call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
+ local_irq_disable(); /* Make it harder to start a new grace period. */
+ call_rcu(&rh2, rcu_torture_leak_cb);
+ call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ local_irq_enable();
+ rcu_read_unlock();
+ preempt_enable();
+
+ /* Wait for them all to get done so we can safely return. */
+ rcu_barrier();
+ pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n");
+ destroy_rcu_head_on_stack(&rh1);
+ destroy_rcu_head_on_stack(&rh2);
+#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+ pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n");
+#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+}
+
+static int __init
+rcu_torture_init(void)
+{
+ int i;
+ int cpu;
+ int firsterr = 0;
+ static struct rcu_torture_ops *torture_ops[] = {
+ &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
+ };
+
+ if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable))
+ return -EBUSY;
+
+ /* Process args and tell the world that the torturer is on the job. */
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
+ cur_ops = torture_ops[i];
+ if (strcmp(torture_type, cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(torture_ops)) {
+ pr_alert("rcu-torture: invalid torture type: \"%s\"\n",
+ torture_type);
+ pr_alert("rcu-torture types:");
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
+ pr_alert(" %s", torture_ops[i]->name);
+ pr_alert("\n");
+ torture_init_end();
+ return -EINVAL;
+ }
+ if (cur_ops->fqs == NULL && fqs_duration != 0) {
+ pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
+ fqs_duration = 0;
+ }
+ if (cur_ops->init)
+ cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+
+ if (nreaders >= 0) {
+ nrealreaders = nreaders;
+ } else {
+ nrealreaders = num_online_cpus() - 1;
+ if (nrealreaders <= 0)
+ nrealreaders = 1;
+ }
+ rcu_torture_print_module_parms(cur_ops, "Start of test");
+
+ /* Set up the freelist. */
+
+ INIT_LIST_HEAD(&rcu_torture_freelist);
+ for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) {
+ rcu_tortures[i].rtort_mbtest = 0;
+ list_add_tail(&rcu_tortures[i].rtort_free,
+ &rcu_torture_freelist);
+ }
+
+ /* Initialize the statistics so that each run gets its own numbers. */
+
+ rcu_torture_current = NULL;
+ rcu_torture_current_version = 0;
+ atomic_set(&n_rcu_torture_alloc, 0);
+ atomic_set(&n_rcu_torture_alloc_fail, 0);
+ atomic_set(&n_rcu_torture_free, 0);
+ atomic_set(&n_rcu_torture_mberror, 0);
+ atomic_set(&n_rcu_torture_error, 0);
+ n_rcu_torture_barrier_error = 0;
+ n_rcu_torture_boost_ktrerror = 0;
+ n_rcu_torture_boost_rterror = 0;
+ n_rcu_torture_boost_failure = 0;
+ n_rcu_torture_boosts = 0;
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+ atomic_set(&rcu_torture_wcount[i], 0);
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+ per_cpu(rcu_torture_count, cpu)[i] = 0;
+ per_cpu(rcu_torture_batch, cpu)[i] = 0;
+ }
+ }
+
+ /* Start up the kthreads. */
+
+ firsterr = torture_create_kthread(rcu_torture_writer, NULL,
+ writer_task);
+ if (firsterr)
+ goto unwind;
+ fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
+ GFP_KERNEL);
+ if (fakewriter_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nfakewriters; i++) {
+ firsterr = torture_create_kthread(rcu_torture_fakewriter,
+ NULL, fakewriter_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
+ reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (reader_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealreaders; i++) {
+ firsterr = torture_create_kthread(rcu_torture_reader, NULL,
+ reader_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stat_interval > 0) {
+ firsterr = torture_create_kthread(rcu_torture_stats, NULL,
+ stats_task);
+ if (firsterr)
+ goto unwind;
+ }
+ if (test_no_idle_hz) {
+ firsterr = torture_shuffle_init(shuffle_interval * HZ);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stutter < 0)
+ stutter = 0;
+ if (stutter) {
+ firsterr = torture_stutter_init(stutter * HZ);
+ if (firsterr)
+ goto unwind;
+ }
+ if (fqs_duration < 0)
+ fqs_duration = 0;
+ if (fqs_duration) {
+ /* Create the fqs thread */
+ firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
+ fqs_task);
+ if (firsterr)
+ goto unwind;
+ }
+ if (test_boost_interval < 1)
+ test_boost_interval = 1;
+ if (test_boost_duration < 2)
+ test_boost_duration = 2;
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+
+ boost_starttime = jiffies + test_boost_interval * HZ;
+ register_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i) {
+ if (cpu_is_offline(i))
+ continue; /* Heuristic: CPU can go offline. */
+ firsterr = rcutorture_booster_init(i);
+ if (firsterr)
+ goto unwind;
+ }
+ }
+ firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
+ if (firsterr)
+ goto unwind;
+ firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ);
+ if (firsterr)
+ goto unwind;
+ firsterr = rcu_torture_stall_init();
+ if (firsterr)
+ goto unwind;
+ firsterr = rcu_torture_barrier_init();
+ if (firsterr)
+ goto unwind;
+ if (object_debug)
+ rcu_test_debug_objects();
+ rcutorture_record_test_transition();
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ rcu_torture_cleanup();
+ return firsterr;
+}
+
+module_init(rcu_torture_init);
+module_exit(rcu_torture_cleanup);
e='width: 19.4%;'/> -rw-r--r--kernel/sched_features.h118
-rw-r--r--kernel/sched_stats.h375
-rw-r--r--kernel/seccomp.c473
-rw-r--r--kernel/signal.c2032
-rw-r--r--kernel/slow-work-debugfs.c227
-rw-r--r--kernel/slow-work.c1068
-rw-r--r--kernel/slow-work.h72
-rw-r--r--kernel/smp.c611
-rw-r--r--kernel/smpboot.c313
-rw-r--r--kernel/smpboot.h20
-rw-r--r--kernel/softirq.c649
-rw-r--r--kernel/softlockup.c293
-rw-r--r--kernel/srcu.c285
-rw-r--r--kernel/stacktrace.c14
-rw-r--r--kernel/stop_machine.c709
-rw-r--r--kernel/sys.c1671
-rw-r--r--kernel/sys_ni.c37
-rw-r--r--kernel/sysctl.c1797
-rw-r--r--kernel/sysctl_binary.c100
-rw-r--r--kernel/sysctl_check.c175
-rw-r--r--kernel/system_certificates.S20
-rw-r--r--kernel/system_keyring.c105
-rw-r--r--kernel/task_work.c128
-rw-r--r--kernel/taskstats.c321
-rw-r--r--kernel/test_kprobes.c14
-rw-r--r--kernel/time.c90
-rw-r--r--kernel/time/Kconfig199
-rw-r--r--kernel/time/Makefile10
-rw-r--r--kernel/time/alarmtimer.c861
-rw-r--r--kernel/time/clockevents.c578
-rw-r--r--kernel/time/clocksource.c549
-rw-r--r--kernel/time/jiffies.c64
-rw-r--r--kernel/time/ntp.c701
-rw-r--r--kernel/time/ntp_internal.h12
-rw-r--r--kernel/time/posix-clock.c446
-rw-r--r--kernel/time/sched_clock.c217
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c106
-rw-r--r--kernel/time/tick-broadcast.c546
-rw-r--r--kernel/time/tick-common.c257
-rw-r--r--kernel/time/tick-internal.h51
-rw-r--r--kernel/time/tick-oneshot.c54
-rw-r--r--kernel/time/tick-sched.c879
-rw-r--r--kernel/time/timecompare.c191
-rw-r--r--kernel/time/timekeeping.c1526
-rw-r--r--kernel/time/timekeeping_debug.c74
-rw-r--r--kernel/time/timekeeping_internal.h14
-rw-r--r--kernel/time/timer_list.c127
-rw-r--r--kernel/time/timer_stats.c16
-rw-r--r--kernel/timeconst.bc108
-rw-r--r--kernel/timeconst.pl378
-rw-r--r--kernel/timer.c775
-rw-r--r--kernel/torture.c733
-rw-r--r--kernel/trace/Kconfig260
-rw-r--r--kernel/trace/Makefile34
-rw-r--r--kernel/trace/blktrace.c421
-rw-r--r--kernel/trace/ftrace.c3368
-rw-r--r--kernel/trace/kmemtrace.c511
-rw-r--r--kernel/trace/power-traces.c3
-rw-r--r--kernel/trace/ring_buffer.c2233
-rw-r--r--kernel/trace/ring_buffer_benchmark.c14
-rw-r--r--kernel/trace/rpm-traces.c20
-rw-r--r--kernel/trace/trace.c4915
-rw-r--r--kernel/trace/trace.h877
-rw-r--r--kernel/trace/trace_benchmark.c198
-rw-r--r--kernel/trace/trace_benchmark.h41
-rw-r--r--kernel/trace/trace_boot.c185
-rw-r--r--kernel/trace/trace_branch.c41
-rw-r--r--kernel/trace/trace_clock.c42
-rw-r--r--kernel/trace/trace_entries.h202
-rw-r--r--kernel/trace/trace_event_perf.c370
-rw-r--r--kernel/trace/trace_event_profile.c122
-rw-r--r--kernel/trace/trace_events.c2276
-rw-r--r--kernel/trace/trace_events_filter.c1670
-rw-r--r--kernel/trace/trace_events_filter_test.h50
-rw-r--r--kernel/trace/trace_events_trigger.c1437
-rw-r--r--kernel/trace/trace_export.c185
-rw-r--r--kernel/trace/trace_functions.c531
-rw-r--r--kernel/trace/trace_functions_graph.c844
-rw-r--r--kernel/trace/trace_hw_branches.c312
-rw-r--r--kernel/trace/trace_irqsoff.c424
-rw-r--r--kernel/trace/trace_kdb.c135
-rw-r--r--kernel/trace/trace_kprobe.c1817
-rw-r--r--kernel/trace/trace_ksym.c519
-rw-r--r--kernel/trace/trace_mmiotrace.c27
-rw-r--r--kernel/trace/trace_nop.c6
-rw-r--r--kernel/trace/trace_output.c605
-rw-r--r--kernel/trace/trace_output.h6
-rw-r--r--kernel/trace/trace_printk.c145
-rw-r--r--kernel/trace/trace_probe.c726
-rw-r--r--kernel/trace/trace_probe.h400
-rw-r--r--kernel/trace/trace_sched_switch.c85
-rw-r--r--kernel/trace/trace_sched_wakeup.c489
-rw-r--r--kernel/trace/trace_selftest.c779
-rw-r--r--kernel/trace/trace_selftest_dynamic.c6
-rw-r--r--kernel/trace/trace_stack.c200
-rw-r--r--kernel/trace/trace_stat.c44
-rw-r--r--kernel/trace/trace_syscalls.c597
-rw-r--r--kernel/trace/trace_sysprof.c329
-rw-r--r--kernel/trace/trace_uprobe.c1340
-rw-r--r--kernel/trace/trace_workqueue.c295
-rw-r--r--kernel/tracepoint.c667
-rw-r--r--kernel/tsacct.c76
-rw-r--r--kernel/uid16.c105
-rw-r--r--kernel/up.c71
-rw-r--r--kernel/user-return-notifier.c8
-rw-r--r--kernel/user.c396
-rw-r--r--kernel/user_namespace.c911
-rw-r--r--kernel/utsname.c75
-rw-r--r--kernel/utsname_sysctl.c38
-rw-r--r--kernel/watchdog.c648
-rw-r--r--kernel/workqueue.c5085
-rw-r--r--kernel/workqueue_internal.h74
338 files changed, 137282 insertions, 64980 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index ab4f1090f43..790d83c7d16 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -4,3 +4,5 @@
config_data.h
config_data.gz
timeconst.h
+hz.bc
+x509_certificate_list
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 94fabd534b0..2a202a84675 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
default 1000 if HZ_1000
config SCHED_HRTICK
- def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
+ def_bool HIGH_RES_TIMERS
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 88c92fb4461..76768ee812b 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -87,6 +87,9 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQ
config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
bool
+config UNINLINE_SPIN_UNLOCK
+ bool
+
#
# lock_* functions are inlined when:
# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
@@ -103,100 +106,134 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
#
+if !DEBUG_SPINLOCK
+
config INLINE_SPIN_TRYLOCK
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_SPIN_TRYLOCK
config INLINE_SPIN_TRYLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_SPIN_TRYLOCK_BH
config INLINE_SPIN_LOCK
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
config INLINE_SPIN_LOCK_BH
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_SPIN_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH
config INLINE_SPIN_LOCK_IRQ
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_SPIN_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ
config INLINE_SPIN_LOCK_IRQSAVE
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_SPIN_LOCK_IRQSAVE
-
-config INLINE_SPIN_UNLOCK
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE
config INLINE_SPIN_UNLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_SPIN_UNLOCK_BH
config INLINE_SPIN_UNLOCK_IRQ
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
config INLINE_SPIN_UNLOCK_IRQRESTORE
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
config INLINE_READ_TRYLOCK
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_READ_TRYLOCK
config INLINE_READ_LOCK
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
config INLINE_READ_LOCK_BH
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_READ_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH
config INLINE_READ_LOCK_IRQ
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_READ_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ
config INLINE_READ_LOCK_IRQSAVE
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_READ_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE
config INLINE_READ_UNLOCK
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
config INLINE_READ_UNLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_READ_UNLOCK_BH
config INLINE_READ_UNLOCK_IRQ
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
config INLINE_READ_UNLOCK_IRQRESTORE
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE
config INLINE_WRITE_TRYLOCK
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_WRITE_TRYLOCK
config INLINE_WRITE_LOCK
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
config INLINE_WRITE_LOCK_BH
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_WRITE_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH
config INLINE_WRITE_LOCK_IRQ
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_WRITE_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ
config INLINE_WRITE_LOCK_IRQSAVE
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_WRITE_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE
config INLINE_WRITE_UNLOCK
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
config INLINE_WRITE_UNLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_WRITE_UNLOCK_BH
config INLINE_WRITE_UNLOCK_IRQ
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
config INLINE_WRITE_UNLOCK_IRQRESTORE
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+
+endif
+
+config ARCH_SUPPORTS_ATOMIC_RMW
+ bool
config MUTEX_SPIN_ON_OWNER
- def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
+ def_bool y
+ depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+
+config RWSEM_SPIN_ON_OWNER
+ def_bool y
+ depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+
+config ARCH_USE_QUEUE_RWLOCK
+ bool
+
+config QUEUE_RWLOCK
+ def_bool y if ARCH_USE_QUEUE_RWLOCK
+ depends on SMP
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b35..3f9c97419f0 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,6 +35,8 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
+ select PREEMPT_COUNT
+ select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -52,3 +54,5 @@ config PREEMPT
endchoice
+config PREEMPT_COUNT
+ bool \ No newline at end of file
diff --git a/kernel/Makefile b/kernel/Makefile
index 1292b863d66..f2a8b6246ce 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,59 +2,51 @@
# Makefile for the linux kernel.
#
-obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
+obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o itimer.o time.o softirq.o resource.o \
sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
- signal.o sys.o kmod.o workqueue.o pid.o \
- rcupdate.o extable.o params.o posix-timers.o \
- kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
- notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
- async.o range.o
-obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
-obj-y += groups.o
+ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
+ extable.o params.o posix-timers.o \
+ kthread.o sys_ni.o posix-cpu-timers.o \
+ hrtimer.o nsproxy.o \
+ notifier.o ksysfs.o cred.o reboot.o \
+ async.o range.o groups.o smpboot.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_lockdep.o = -pg
-CFLAGS_REMOVE_lockdep_proc.o = -pg
-CFLAGS_REMOVE_mutex-debug.o = -pg
-CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
-CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_perf_event.o = -pg
+CFLAGS_REMOVE_irq_work.o = -pg
endif
+# cond_syscall is currently not LTO compatible
+CFLAGS_sys_ni.o = $(DISABLE_LTO)
+
+obj-y += sched/
+obj-y += locking/
+obj-y += power/
+obj-y += printk/
+obj-y += irq/
+obj-y += rcu/
+
+obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
-obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
-obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
-obj-$(CONFIG_LOCKDEP) += lockdep.o
-ifeq ($(CONFIG_PROC_FS),y)
-obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
-endif
obj-$(CONFIG_FUTEX) += futex.o
ifeq ($(CONFIG_COMPAT),y)
obj-$(CONFIG_FUTEX) += futex_compat.o
endif
-obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
-obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
-obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
-obj-$(CONFIG_SMP) += spinlock.o
-obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
-obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
+obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
-obj-$(CONFIG_PM) += power/
-obj-$(CONFIG_FREEZER) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
@@ -62,73 +54,170 @@ obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
-obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
-obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
-obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
+obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_KGDB) += kgdb.o
-obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
-obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
+obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
obj-$(CONFIG_SECCOMP) += seccomp.o
-obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
-obj-$(CONFIG_TREE_RCU) += rcutree.o
-obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
-obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
-obj-$(CONFIG_TINY_RCU) += rcutiny.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
-obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_TRACE_CLOCK) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
-obj-$(CONFIG_SMP) += sched_cpupri.o
-obj-$(CONFIG_SLOW_WORK) += slow-work.o
-obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
-obj-$(CONFIG_PERF_EVENTS) += perf_event.o
-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
-obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
+obj-$(CONFIG_TRACEPOINTS) += trace/
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
+obj-$(CONFIG_CPU_PM) += cpu_pm.o
-ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
-# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
-# needed for x86 only. Why this used to be enabled for all architectures is beyond
-# me. I suspect most platforms don't need this, but until we know that for sure
-# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
-# to get a correct value for the wait-channel (WCHAN in ps). --davidm
-CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
-endif
+obj-$(CONFIG_PERF_EVENTS) += events/
+
+obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
+obj-$(CONFIG_PADATA) += padata.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
+obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
+obj-$(CONFIG_TORTURE_TEST) += torture.o
$(obj)/configs.o: $(obj)/config_data.h
# config_data.h contains the same information as ikconfig.h but gzipped.
# Info from config_data can be extracted from /proc/config*
targets += config_data.gz
-$(obj)/config_data.gz: .config FORCE
+$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
-quiet_cmd_ikconfiggz = IKCFG $@
- cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
+ filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
- $(call if_changed,ikconfiggz)
+ $(call filechk,ikconfiggz)
$(obj)/time.o: $(obj)/timeconst.h
-quiet_cmd_timeconst = TIMEC $@
- cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+quiet_cmd_hzfile = HZFILE $@
+ cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+ $(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+ cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
targets += timeconst.h
-$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
- $(call if_changed,timeconst)
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+ $(call if_changed,bc)
+
+###############################################################################
+#
+# Roll all the X.509 certificates that we can find together and pull them into
+# the kernel so that they get loaded into the system trusted keyring during
+# boot.
+#
+# We look in the source root and the build root for all files whose name ends
+# in ".x509". Unfortunately, this will generate duplicate filenames, so we
+# have make canonicalise the pathnames and then sort them to discard the
+# duplicates.
+#
+###############################################################################
+ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
+X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
+X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
+X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
+ $(or $(realpath $(CERT)),$(CERT))))
+X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
+
+ifeq ($(X509_CERTIFICATES),)
+$(warning *** No X.509 certificates found ***)
+endif
+
+ifneq ($(wildcard $(obj)/.x509.list),)
+ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
+$(info X.509 certificate list changed)
+$(shell rm $(obj)/.x509.list)
+endif
+endif
+
+kernel/system_certificates.o: $(obj)/x509_certificate_list
+
+quiet_cmd_x509certs = CERTS $@
+ cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)")
+
+targets += $(obj)/x509_certificate_list
+$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
+ $(call if_changed,x509certs)
+
+targets += $(obj)/.x509.list
+$(obj)/.x509.list:
+ @echo $(X509_CERTIFICATES) >$@
+endif
+
+clean-files := x509_certificate_list .x509.list
+
+ifeq ($(CONFIG_MODULE_SIG),y)
+###############################################################################
+#
+# If module signing is requested, say by allyesconfig, but a key has not been
+# supplied, then one will need to be generated to make sure the build does not
+# fail and that the kernel may be used afterwards.
+#
+###############################################################################
+ifndef CONFIG_MODULE_SIG_HASH
+$(error Could not determine digest type to use from kernel config)
+endif
+
+signing_key.priv signing_key.x509: x509.genkey
+ @echo "###"
+ @echo "### Now generating an X.509 key pair to be used for signing modules."
+ @echo "###"
+ @echo "### If this takes a long time, you might wish to run rngd in the"
+ @echo "### background to keep the supply of entropy topped up. It"
+ @echo "### needs to be run as root, and uses a hardware random"
+ @echo "### number generator if one is available."
+ @echo "###"
+ openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
+ -batch -x509 -config x509.genkey \
+ -outform DER -out signing_key.x509 \
+ -keyout signing_key.priv 2>&1
+ @echo "###"
+ @echo "### Key pair generated."
+ @echo "###"
+
+x509.genkey:
+ @echo Generating X.509 key generation config
+ @echo >x509.genkey "[ req ]"
+ @echo >>x509.genkey "default_bits = 4096"
+ @echo >>x509.genkey "distinguished_name = req_distinguished_name"
+ @echo >>x509.genkey "prompt = no"
+ @echo >>x509.genkey "string_mask = utf8only"
+ @echo >>x509.genkey "x509_extensions = myexts"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ req_distinguished_name ]"
+ @echo >>x509.genkey "O = Magrathea"
+ @echo >>x509.genkey "CN = Glacier signing key"
+ @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ myexts ]"
+ @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
+ @echo >>x509.genkey "keyUsage=digitalSignature"
+ @echo >>x509.genkey "subjectKeyIdentifier=hash"
+ @echo >>x509.genkey "authorityKeyIdentifier=keyid"
+endif
diff --git a/kernel/acct.c b/kernel/acct.c
index a6605ca921b..808a86ff229 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -55,7 +55,7 @@
#include <linux/times.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
@@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct,
* the cache line to have the data after getting the lock.
*/
struct bsd_acct_struct {
- volatile int active;
- volatile int needcheck;
+ int active;
+ unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
- struct timer_list timer;
struct list_head list;
};
@@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock);
static LIST_HEAD(acct_list);
/*
- * Called whenever the timer says to check the free space.
- */
-static void acct_timeout(unsigned long x)
-{
- struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
- acct->needcheck = 1;
-}
-
-/*
* Check the amount of free space and suspend/resume accordingly.
*/
static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
@@ -112,23 +102,23 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
struct kstatfs sbuf;
int res;
int act;
- sector_t resume;
- sector_t suspend;
+ u64 resume;
+ u64 suspend;
spin_lock(&acct_lock);
res = acct->active;
- if (!file || !acct->needcheck)
+ if (!file || time_is_before_jiffies(acct->needcheck))
goto out;
spin_unlock(&acct_lock);
/* May block */
- if (vfs_statfs(file->f_path.dentry, &sbuf))
+ if (vfs_statfs(&file->f_path, &sbuf))
return res;
suspend = sbuf.f_blocks * SUSPEND;
resume = sbuf.f_blocks * RESUME;
- sector_div(suspend, 100);
- sector_div(resume, 100);
+ do_div(suspend, 100);
+ do_div(resume, 100);
if (sbuf.f_bavail <= suspend)
act = -1;
@@ -144,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
spin_lock(&acct_lock);
if (file != acct->file) {
if (act)
- res = act>0;
+ res = act > 0;
goto out;
}
@@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
}
}
- del_timer(&acct->timer);
- acct->needcheck = 0;
- acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
- add_timer(&acct->timer);
+ acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
res = acct->active;
out:
spin_unlock(&acct_lock);
@@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
if (acct->file) {
old_acct = acct->file;
old_ns = acct->ns;
- del_timer(&acct->timer);
acct->active = 0;
- acct->needcheck = 0;
acct->file = NULL;
acct->ns = NULL;
list_del(&acct->list);
@@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
if (file) {
acct->file = file;
acct->ns = ns;
- acct->needcheck = 0;
+ acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
acct->active = 1;
list_add(&acct->list, &acct_list);
- /* It's been deleted if it was used before so this is safe */
- setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
- acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
- add_timer(&acct->timer);
}
if (old_acct) {
mnt_unpin(old_acct->f_path.mnt);
@@ -212,20 +193,19 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
}
}
-static int acct_on(char *name)
+static int acct_on(struct filename *pathname)
{
struct file *file;
struct vfsmount *mnt;
- int error;
struct pid_namespace *ns;
struct bsd_acct_struct *acct = NULL;
/* Difference from BSD - they don't do O_APPEND */
- file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+ file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
if (IS_ERR(file))
return PTR_ERR(file);
- if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
+ if (!S_ISREG(file_inode(file)->i_mode)) {
filp_close(file, NULL);
return -EACCES;
}
@@ -244,13 +224,6 @@ static int acct_on(char *name)
}
}
- error = security_acct(file);
- if (error) {
- kfree(acct);
- filp_close(file, NULL);
- return error;
- }
-
spin_lock(&acct_lock);
if (ns->bacct == NULL) {
ns->bacct = acct;
@@ -281,15 +254,15 @@ static int acct_on(char *name)
*/
SYSCALL_DEFINE1(acct, const char __user *, name)
{
- int error;
+ int error = 0;
if (!capable(CAP_SYS_PACCT))
return -EPERM;
if (name) {
- char *tmp = getname(name);
+ struct filename *tmp = getname(name);
if (IS_ERR(tmp))
- return (PTR_ERR(tmp));
+ return PTR_ERR(tmp);
error = acct_on(tmp);
putname(tmp);
} else {
@@ -299,13 +272,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
if (acct == NULL)
return 0;
- error = security_acct(NULL);
- if (!error) {
- spin_lock(&acct_lock);
- acct_file_reopen(acct, NULL, NULL);
- spin_unlock(&acct_lock);
- }
+ spin_lock(&acct_lock);
+ acct_file_reopen(acct, NULL, NULL);
+ spin_unlock(&acct_lock);
}
+
return error;
}
@@ -344,7 +315,7 @@ void acct_auto_close(struct super_block *sb)
spin_lock(&acct_lock);
restart:
list_for_each_entry(acct, &acct_list, list)
- if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
+ if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
acct_file_reopen(acct, NULL, NULL);
goto restart;
}
@@ -353,17 +324,17 @@ restart:
void acct_exit_ns(struct pid_namespace *ns)
{
- struct bsd_acct_struct *acct;
+ struct bsd_acct_struct *acct = ns->bacct;
- spin_lock(&acct_lock);
- acct = ns->bacct;
- if (acct != NULL) {
- if (acct->file != NULL)
- acct_file_reopen(acct, NULL, NULL);
+ if (acct == NULL)
+ return;
- kfree(acct);
- }
+ spin_lock(&acct_lock);
+ if (acct->file != NULL)
+ acct_file_reopen(acct, NULL, NULL);
spin_unlock(&acct_lock);
+
+ kfree(acct);
}
/*
@@ -507,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
*/
- memset((caddr_t)&ac, 0, sizeof(acct_t));
+ memset(&ac, 0, sizeof(acct_t));
ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
@@ -536,8 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
do_div(elapsed, AHZ);
ac.ac_btime = get_seconds() - elapsed;
/* we really need to bite the bullet and change layout */
- ac.ac_uid = orig_cred->uid;
- ac.ac_gid = orig_cred->gid;
+ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
+ ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
#if ACCT_VERSION==2
ac.ac_ahz = AHZ;
#endif
@@ -569,6 +540,12 @@ static void do_acct_process(struct bsd_acct_struct *acct,
ac.ac_swaps = encode_comp_t(0);
/*
+ * Get freeze protection. If the fs is frozen, just skip the write
+ * as we could deadlock the system otherwise.
+ */
+ if (!file_start_write_trylock(file))
+ goto out;
+ /*
* Kernel segment override to datasegment and write it
* to the accounting file.
*/
@@ -583,21 +560,12 @@ static void do_acct_process(struct bsd_acct_struct *acct,
sizeof(acct_t), &file->f_pos);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
+ file_end_write(file);
out:
revert_creds(orig_cred);
}
/**
- * acct_init_pacct - initialize a new pacct_struct
- * @pacct: per-process accounting info struct to initialize
- */
-void acct_init_pacct(struct pacct_struct *pacct)
-{
- memset(pacct, 0, sizeof(struct pacct_struct));
- pacct->ac_utime = pacct->ac_stime = cputime_zero;
-}
-
-/**
* acct_collect - collect accounting information into pacct_struct
* @exitcode: task exit code
* @group_dead: not 0, if this thread is the last one in the process.
@@ -605,6 +573,7 @@ void acct_init_pacct(struct pacct_struct *pacct)
void acct_collect(long exitcode, int group_dead)
{
struct pacct_struct *pacct = &current->signal->pacct;
+ cputime_t utime, stime;
unsigned long vsize = 0;
if (group_dead && current->mm) {
@@ -632,8 +601,9 @@ void acct_collect(long exitcode, int group_dead)
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
- pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
- pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+ task_cputime(current, &utime, &stime);
+ pacct->ac_utime += utime;
+ pacct->ac_stime += stime;
pacct->ac_minflt += current->min_flt;
pacct->ac_majflt += current->maj_flt;
spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index 27235f5de19..61f023ce022 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,96 +49,74 @@ asynchronous and synchronous parts of the kernel.
*/
#include <linux/async.h>
-#include <linux/bug.h>
-#include <linux/module.h>
+#include <linux/atomic.h>
+#include <linux/ktime.h>
+#include <linux/export.h>
#include <linux/wait.h>
#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/kthread.h>
-#include <linux/delay.h>
-#include <asm/atomic.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "workqueue_internal.h"
static async_cookie_t next_cookie = 1;
-#define MAX_THREADS 256
-#define MAX_WORK 32768
+#define MAX_WORK 32768
+#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
-static LIST_HEAD(async_pending);
-static LIST_HEAD(async_running);
+static LIST_HEAD(async_global_pending); /* pending from all registered doms */
+static ASYNC_DOMAIN(async_dfl_domain);
static DEFINE_SPINLOCK(async_lock);
-static int async_enabled = 0;
-
struct async_entry {
- struct list_head list;
- async_cookie_t cookie;
- async_func_ptr *func;
- void *data;
- struct list_head *running;
+ struct list_head domain_list;
+ struct list_head global_list;
+ struct work_struct work;
+ async_cookie_t cookie;
+ async_func_t func;
+ void *data;
+ struct async_domain *domain;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
-static DECLARE_WAIT_QUEUE_HEAD(async_new);
static atomic_t entry_count;
-static atomic_t thread_count;
-
-extern int initcall_debug;
-
-/*
- * MUST be called with the lock held!
- */
-static async_cookie_t __lowest_in_progress(struct list_head *running)
+static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
- struct async_entry *entry;
-
- if (!list_empty(running)) {
- entry = list_first_entry(running,
- struct async_entry, list);
- return entry->cookie;
- }
+ struct list_head *pending;
+ async_cookie_t ret = ASYNC_COOKIE_MAX;
+ unsigned long flags;
- list_for_each_entry(entry, &async_pending, list)
- if (entry->running == running)
- return entry->cookie;
+ spin_lock_irqsave(&async_lock, flags);
- return next_cookie; /* "infinity" value */
-}
+ if (domain)
+ pending = &domain->pending;
+ else
+ pending = &async_global_pending;
-static async_cookie_t lowest_in_progress(struct list_head *running)
-{
- unsigned long flags;
- async_cookie_t ret;
+ if (!list_empty(pending))
+ ret = list_first_entry(pending, struct async_entry,
+ domain_list)->cookie;
- spin_lock_irqsave(&async_lock, flags);
- ret = __lowest_in_progress(running);
spin_unlock_irqrestore(&async_lock, flags);
return ret;
}
+
/*
* pick the first pending entry and run it
*/
-static void run_one_entry(void)
+static void async_run_entry_fn(struct work_struct *work)
{
+ struct async_entry *entry =
+ container_of(work, struct async_entry, work);
unsigned long flags;
- struct async_entry *entry;
- ktime_t calltime, delta, rettime;
-
- /* 1) pick one task from the pending queue */
-
- spin_lock_irqsave(&async_lock, flags);
- if (list_empty(&async_pending))
- goto out;
- entry = list_first_entry(&async_pending, struct async_entry, list);
+ ktime_t uninitialized_var(calltime), delta, rettime;
- /* 2) move it to the running queue */
- list_move_tail(&entry->list, entry->running);
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* 3) run it (and print duration)*/
+ /* 1) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
+ printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
+ (long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
}
@@ -146,37 +124,32 @@ static void run_one_entry(void)
if (initcall_debug && system_state == SYSTEM_BOOTING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
- printk("initcall %lli_%pF returned 0 after %lld usecs\n",
+ printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
(long long)entry->cookie,
entry->func,
(long long)ktime_to_ns(delta) >> 10);
}
- /* 4) remove it from the running queue */
+ /* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
- list_del(&entry->list);
+ list_del_init(&entry->domain_list);
+ list_del_init(&entry->global_list);
- /* 5) free the entry */
+ /* 3) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* 6) wake up any waiters. */
+ /* 4) wake up any waiters */
wake_up(&async_done);
- return;
-
-out:
- spin_unlock_irqrestore(&async_lock, flags);
}
-
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
+static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
async_cookie_t newcookie;
-
/* allow irq-off callers */
entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
@@ -185,59 +158,74 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
* If we're out of memory or if there's too much work
* pending already, we execute synchronously.
*/
- if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
+ if (!entry || atomic_read(&entry_count) > MAX_WORK) {
kfree(entry);
spin_lock_irqsave(&async_lock, flags);
newcookie = next_cookie++;
spin_unlock_irqrestore(&async_lock, flags);
/* low on memory.. run synchronously */
- ptr(data, newcookie);
+ func(data, newcookie);
return newcookie;
}
- entry->func = ptr;
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
+ INIT_WORK(&entry->work, async_run_entry_fn);
+ entry->func = func;
entry->data = data;
- entry->running = running;
+ entry->domain = domain;
spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
newcookie = entry->cookie = next_cookie++;
- list_add_tail(&entry->list, &async_pending);
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- wake_up(&async_new);
+
+ /* mark that this task has queued an async job, used by module init */
+ current->flags |= PF_USED_ASYNC;
+
+ /* schedule for execution */
+ queue_work(system_unbound_wq, &entry->work);
+
return newcookie;
}
/**
* async_schedule - schedule a function for asynchronous execution
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
* @data: data pointer to pass to the function
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
*/
-async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+async_cookie_t async_schedule(async_func_t func, void *data)
{
- return __async_schedule(ptr, data, &async_running);
+ return __async_schedule(func, data, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_schedule);
/**
* async_schedule_domain - schedule a function for asynchronous execution within a certain domain
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
* @data: data pointer to pass to the function
- * @running: running list for the domain
+ * @domain: the domain
*
* Returns an async_cookie_t that may be used for checkpointing later.
- * @running may be used in the async_synchronize_*_domain() functions
- * to wait within a certain synchronization domain rather than globally.
- * A synchronization domain is specified via the running queue @running to use.
- * Note: This function may be called from atomic or non-atomic contexts.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally. A
+ * synchronization domain is specified via @domain. Note: This function
+ * may be called from atomic or non-atomic contexts.
*/
-async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
- struct list_head *running)
+async_cookie_t async_schedule_domain(async_func_t func, void *data,
+ struct async_domain *domain)
{
- return __async_schedule(ptr, data, running);
+ return __async_schedule(func, data, domain);
}
EXPORT_SYMBOL_GPL(async_schedule_domain);
@@ -248,51 +236,66 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
*/
void async_synchronize_full(void)
{
- do {
- async_synchronize_cookie(next_cookie);
- } while (!list_empty(&async_running) || !list_empty(&async_pending));
+ async_synchronize_full_domain(NULL);
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
+ * async_unregister_domain - ensure no more anonymous waiters on this domain
+ * @domain: idle domain to flush out of any async_synchronize_full instances
+ *
+ * async_synchronize_{cookie|full}_domain() are not flushed since callers
+ * of these routines should know the lifetime of @domain
+ *
+ * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
+ */
+void async_unregister_domain(struct async_domain *domain)
+{
+ spin_lock_irq(&async_lock);
+ WARN_ON(!domain->registered || !list_empty(&domain->pending));
+ domain->registered = 0;
+ spin_unlock_irq(&async_lock);
+}
+EXPORT_SYMBOL_GPL(async_unregister_domain);
+
+/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @list: running list to synchronize on
+ * @domain: the domain to synchronize
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list have been done.
+ * synchronization domain specified by @domain have been done.
*/
-void async_synchronize_full_domain(struct list_head *list)
+void async_synchronize_full_domain(struct async_domain *domain)
{
- async_synchronize_cookie_domain(next_cookie, list);
+ async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
/**
* async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
- * @running: running list to synchronize on
+ * @domain: the domain to synchronize (%NULL for all registered domains)
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list submitted
- * prior to @cookie have been done.
+ * synchronization domain specified by @domain submitted prior to @cookie
+ * have been done.
*/
-void async_synchronize_cookie_domain(async_cookie_t cookie,
- struct list_head *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
- ktime_t starttime, delta, endtime;
+ ktime_t uninitialized_var(starttime), delta, endtime;
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk("async_waiting @ %i\n", task_pid_nr(current));
+ printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
- wait_event(async_done, lowest_in_progress(running) >= cookie);
+ wait_event(async_done, lowest_in_progress(domain) >= cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
- printk("async_continuing @ %i after %lli usec\n",
+ printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
task_pid_nr(current),
(long long)ktime_to_ns(delta) >> 10);
}
@@ -308,90 +311,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
*/
void async_synchronize_cookie(async_cookie_t cookie)
{
- async_synchronize_cookie_domain(cookie, &async_running);
+ async_synchronize_cookie_domain(cookie, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
-
-static int async_thread(void *unused)
-{
- DECLARE_WAITQUEUE(wq, current);
- add_wait_queue(&async_new, &wq);
-
- while (!kthread_should_stop()) {
- int ret = HZ;
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * check the list head without lock.. false positives
- * are dealt with inside run_one_entry() while holding
- * the lock.
- */
- rmb();
- if (!list_empty(&async_pending))
- run_one_entry();
- else
- ret = schedule_timeout(HZ);
-
- if (ret == 0) {
- /*
- * we timed out, this means we as thread are redundant.
- * we sign off and die, but we to avoid any races there
- * is a last-straw check to see if work snuck in.
- */
- atomic_dec(&thread_count);
- wmb(); /* manager must see our departure first */
- if (list_empty(&async_pending))
- break;
- /*
- * woops work came in between us timing out and us
- * signing off; we need to stay alive and keep working.
- */
- atomic_inc(&thread_count);
- }
- }
- remove_wait_queue(&async_new, &wq);
-
- return 0;
-}
-
-static int async_manager_thread(void *unused)
-{
- DECLARE_WAITQUEUE(wq, current);
- add_wait_queue(&async_new, &wq);
-
- while (!kthread_should_stop()) {
- int tc, ec;
-
- set_current_state(TASK_INTERRUPTIBLE);
-
- tc = atomic_read(&thread_count);
- rmb();
- ec = atomic_read(&entry_count);
-
- while (tc < ec && tc < MAX_THREADS) {
- if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
- tc))) {
- msleep(100);
- continue;
- }
- atomic_inc(&thread_count);
- tc++;
- }
-
- schedule();
- }
- remove_wait_queue(&async_new, &wq);
-
- return 0;
-}
-
-static int __init async_init(void)
+/**
+ * current_is_async - is %current an async worker task?
+ *
+ * Returns %true if %current is an async worker task.
+ */
+bool current_is_async(void)
{
- async_enabled =
- !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
+ struct worker *worker = current_wq_worker();
- WARN_ON(!async_enabled);
- return 0;
+ return worker && worker->current_func == async_run_entry_fn;
}
-
-core_initcall(async_init);
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9..3ef2e0e797e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -41,23 +41,31 @@
* Example user-space utilities: http://people.redhat.com/sgrubb/audit/
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
-#include <asm/types.h>
-#include <asm/atomic.h>
+#include <linux/types.h>
+#include <linux/atomic.h>
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/slab.h>
#include <linux/err.h>
#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
#include <linux/audit.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/inotify.h>
+#ifdef CONFIG_SECURITY
+#include <linux/security.h>
+#endif
#include <linux/freezer.h>
#include <linux/tty.h>
+#include <linux/pid_namespace.h>
+#include <net/netns/generic.h>
#include "audit.h"
@@ -71,35 +79,39 @@ static int audit_initialized;
#define AUDIT_OFF 0
#define AUDIT_ON 1
#define AUDIT_LOCKED 2
-int audit_enabled;
-int audit_ever_enabled;
+u32 audit_enabled;
+u32 audit_ever_enabled;
+
+EXPORT_SYMBOL_GPL(audit_enabled);
/* Default state when kernel boots without any parameters. */
-static int audit_default;
+static u32 audit_default;
/* If auditing cannot proceed, audit_failure selects what happens. */
-static int audit_failure = AUDIT_FAIL_PRINTK;
+static u32 audit_failure = AUDIT_FAIL_PRINTK;
/*
* If audit records are to be written to the netlink socket, audit_pid
- * contains the pid of the auditd process and audit_nlk_pid contains
- * the pid to use to send netlink messages to that process.
+ * contains the pid of the auditd process and audit_nlk_portid contains
+ * the portid to use to send netlink messages to that process.
*/
int audit_pid;
-static int audit_nlk_pid;
+static __u32 audit_nlk_portid;
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
* audit records being dropped. */
-static int audit_rate_limit;
+static u32 audit_rate_limit;
-/* Number of outstanding audit_buffers allowed. */
-static int audit_backlog_limit = 64;
-static int audit_backlog_wait_time = 60 * HZ;
-static int audit_backlog_wait_overflow = 0;
+/* Number of outstanding audit_buffers allowed.
+ * When set to zero, this means unlimited. */
+static u32 audit_backlog_limit = 64;
+#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
+static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
+static u32 audit_backlog_wait_overflow = 0;
/* The identity of the user shutting down the audit system. */
-uid_t audit_sig_uid = -1;
+kuid_t audit_sig_uid = INVALID_UID;
pid_t audit_sig_pid = -1;
u32 audit_sig_sid = 0;
@@ -114,6 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
+int audit_net_id;
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -132,6 +145,17 @@ static struct task_struct *kauditd_task;
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
+static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
+ .mask = -1,
+ .features = 0,
+ .lock = 0,};
+
+static char *audit_feature_names[2] = {
+ "only_unset_loginuid",
+ "loginuid_immutable",
+};
+
+
/* Serialize requests from userspace. */
DEFINE_MUTEX(audit_cmd_mutex);
@@ -157,27 +181,27 @@ struct audit_buffer {
};
struct audit_reply {
- int pid;
+ __u32 portid;
+ struct net *net;
struct sk_buff *skb;
};
-static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
+static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
{
if (ab) {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- nlh->nlmsg_pid = pid;
+ nlh->nlmsg_pid = portid;
}
}
void audit_panic(const char *message)
{
- switch (audit_failure)
- {
+ switch (audit_failure) {
case AUDIT_FAIL_SILENT:
break;
case AUDIT_FAIL_PRINTK:
if (printk_ratelimit())
- printk(KERN_ERR "audit: %s\n", message);
+ pr_err("%s\n", message);
break;
case AUDIT_FAIL_PANIC:
/* test audit_pid since printk is always losey, why bother? */
@@ -248,9 +272,7 @@ void audit_log_lost(const char *message)
if (print) {
if (printk_ratelimit())
- printk(KERN_WARNING
- "audit: audit_lost=%d audit_rate_limit=%d "
- "audit_backlog_limit=%d\n",
+ pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n",
atomic_read(&audit_lost),
audit_rate_limit,
audit_backlog_limit);
@@ -258,39 +280,29 @@ void audit_log_lost(const char *message)
}
}
-static int audit_log_config_change(char *function_name, int new, int old,
- uid_t loginuid, u32 sessionid, u32 sid,
+static int audit_log_config_change(char *function_name, u32 new, u32 old,
int allow_changes)
{
struct audit_buffer *ab;
int rc = 0;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
- old, loginuid, sessionid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
-
- rc = security_secid_to_secctx(sid, &ctx, &len);
- if (rc) {
- audit_log_format(ab, " sid=%u", sid);
- allow_changes = 0; /* Something weird, deny request */
- } else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
+ if (unlikely(!ab))
+ return rc;
+ audit_log_format(ab, "%s=%u old=%u", function_name, new, old);
+ audit_log_session_info(ab);
+ rc = audit_log_task_context(ab);
+ if (rc)
+ allow_changes = 0; /* Something weird, deny request */
audit_log_format(ab, " res=%d", allow_changes);
audit_log_end(ab);
return rc;
}
-static int audit_do_config_change(char *function_name, int *to_change,
- int new, uid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_do_config_change(char *function_name, u32 *to_change, u32 new)
{
- int allow_changes, rc = 0, old = *to_change;
+ int allow_changes, rc = 0;
+ u32 old = *to_change;
/* check if we are locked */
if (audit_enabled == AUDIT_LOCKED)
@@ -299,8 +311,7 @@ static int audit_do_config_change(char *function_name, int *to_change,
allow_changes = 1;
if (audit_enabled != AUDIT_OFF) {
- rc = audit_log_config_change(function_name, new, old, loginuid,
- sessionid, sid, allow_changes);
+ rc = audit_log_config_change(function_name, new, old, allow_changes);
if (rc)
allow_changes = 0;
}
@@ -314,44 +325,43 @@ static int audit_do_config_change(char *function_name, int *to_change,
return rc;
}
-static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_set_rate_limit(u32 limit)
+{
+ return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit);
+}
+
+static int audit_set_backlog_limit(u32 limit)
{
- return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
- limit, loginuid, sessionid, sid);
+ return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit);
}
-static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_set_backlog_wait_time(u32 timeout)
{
- return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
- limit, loginuid, sessionid, sid);
+ return audit_do_config_change("audit_backlog_wait_time",
+ &audit_backlog_wait_time, timeout);
}
-static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_enabled(u32 state)
{
int rc;
if (state < AUDIT_OFF || state > AUDIT_LOCKED)
return -EINVAL;
- rc = audit_do_config_change("audit_enabled", &audit_enabled, state,
- loginuid, sessionid, sid);
-
+ rc = audit_do_config_change("audit_enabled", &audit_enabled, state);
if (!rc)
audit_ever_enabled |= !!state;
return rc;
}
-static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_failure(u32 state)
{
if (state != AUDIT_FAIL_SILENT
&& state != AUDIT_FAIL_PRINTK
&& state != AUDIT_FAIL_PANIC)
return -EINVAL;
- return audit_do_config_change("audit_failure", &audit_failure, state,
- loginuid, sessionid, sid);
+ return audit_do_config_change("audit_failure", &audit_failure, state);
}
/*
@@ -366,7 +376,8 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
static void audit_hold_skb(struct sk_buff *skb)
{
if (audit_default &&
- skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)
+ (!audit_backlog_limit ||
+ skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit))
skb_queue_tail(&audit_skb_hold_queue, skb);
else
kfree_skb(skb);
@@ -379,13 +390,13 @@ static void audit_hold_skb(struct sk_buff *skb)
static void audit_printk_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
- char *data = NLMSG_DATA(nlh);
+ char *data = nlmsg_data(nlh);
if (nlh->nlmsg_type != AUDIT_EOE) {
if (printk_ratelimit())
- printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
+ pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
else
- audit_log_lost("printk limit exceeded\n");
+ audit_log_lost("printk limit exceeded");
}
audit_hold_skb(skb);
@@ -396,117 +407,149 @@ static void kauditd_send_skb(struct sk_buff *skb)
int err;
/* take a reference in case we can't send it and we want to hold it */
skb_get(skb);
- err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
+ err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
if (err < 0) {
- BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
- printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
- audit_log_lost("auditd dissapeared\n");
- audit_pid = 0;
+ BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
+ if (audit_pid) {
+ pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
+ audit_log_lost("auditd disappeared");
+ audit_pid = 0;
+ audit_sock = NULL;
+ }
/* we might get lucky and get this in the next auditd */
audit_hold_skb(skb);
} else
/* drop the extra reference if sent ok */
- kfree_skb(skb);
+ consume_skb(skb);
}
-static int kauditd_thread(void *dummy)
+/*
+ * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
+ *
+ * This function doesn't consume an skb as might be expected since it has to
+ * copy it anyways.
+ */
+static void kauditd_send_multicast_skb(struct sk_buff *skb)
+{
+ struct sk_buff *copy;
+ struct audit_net *aunet = net_generic(&init_net, audit_net_id);
+ struct sock *sock = aunet->nlsk;
+
+ if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
+ return;
+
+ /*
+ * The seemingly wasteful skb_copy() rather than bumping the refcount
+ * using skb_get() is necessary because non-standard mods are made to
+ * the skb by the original kaudit unicast socket send routine. The
+ * existing auditd daemon assumes this breakage. Fixing this would
+ * require co-ordinating a change in the established protocol between
+ * the kaudit kernel subsystem and the auditd userspace code. There is
+ * no reason for new multicast clients to continue with this
+ * non-compliance.
+ */
+ copy = skb_copy(skb, GFP_KERNEL);
+ if (!copy)
+ return;
+
+ nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
+}
+
+/*
+ * flush_hold_queue - empty the hold queue if auditd appears
+ *
+ * If auditd just started, drain the queue of messages already
+ * sent to syslog/printk. Remember loss here is ok. We already
+ * called audit_log_lost() if it didn't go out normally. so the
+ * race between the skb_dequeue and the next check for audit_pid
+ * doesn't matter.
+ *
+ * If you ever find kauditd to be too slow we can get a perf win
+ * by doing our own locking and keeping better track if there
+ * are messages in this queue. I don't see the need now, but
+ * in 5 years when I want to play with this again I'll see this
+ * note and still have no friggin idea what i'm thinking today.
+ */
+static void flush_hold_queue(void)
{
struct sk_buff *skb;
+ if (!audit_default || !audit_pid)
+ return;
+
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ if (likely(!skb))
+ return;
+
+ while (skb && audit_pid) {
+ kauditd_send_skb(skb);
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ }
+
+ /*
+ * if auditd just disappeared but we
+ * dequeued an skb we need to drop ref
+ */
+ if (skb)
+ consume_skb(skb);
+}
+
+static int kauditd_thread(void *dummy)
+{
set_freezable();
while (!kthread_should_stop()) {
- /*
- * if auditd just started drain the queue of messages already
- * sent to syslog/printk. remember loss here is ok. we already
- * called audit_log_lost() if it didn't go out normally. so the
- * race between the skb_dequeue and the next check for audit_pid
- * doesn't matter.
- *
- * if you ever find kauditd to be too slow we can get a perf win
- * by doing our own locking and keeping better track if there
- * are messages in this queue. I don't see the need now, but
- * in 5 years when I want to play with this again I'll see this
- * note and still have no friggin idea what i'm thinking today.
- */
- if (audit_default && audit_pid) {
- skb = skb_dequeue(&audit_skb_hold_queue);
- if (unlikely(skb)) {
- while (skb && audit_pid) {
- kauditd_send_skb(skb);
- skb = skb_dequeue(&audit_skb_hold_queue);
- }
- }
- }
+ struct sk_buff *skb;
+ DECLARE_WAITQUEUE(wait, current);
+
+ flush_hold_queue();
skb = skb_dequeue(&audit_skb_queue);
- wake_up(&audit_backlog_wait);
+
if (skb) {
+ if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)
+ wake_up(&audit_backlog_wait);
if (audit_pid)
kauditd_send_skb(skb);
else
audit_printk_skb(skb);
- } else {
- DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kauditd_wait, &wait);
-
- if (!skb_queue_len(&audit_skb_queue)) {
- try_to_freeze();
- schedule();
- }
+ continue;
+ }
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kauditd_wait, &wait);
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&kauditd_wait, &wait);
+ if (!skb_queue_len(&audit_skb_queue)) {
+ try_to_freeze();
+ schedule();
}
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kauditd_wait, &wait);
}
return 0;
}
-static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
-{
- struct task_struct *tsk;
- int err;
-
- read_lock(&tasklist_lock);
- tsk = find_task_by_vpid(pid);
- err = -ESRCH;
- if (!tsk)
- goto out;
- err = 0;
-
- spin_lock_irq(&tsk->sighand->siglock);
- if (!tsk->signal->audit_tty)
- err = -EPERM;
- spin_unlock_irq(&tsk->sighand->siglock);
- if (err)
- goto out;
-
- tty_audit_push_task(tsk, loginuid, sessionid);
-out:
- read_unlock(&tasklist_lock);
- return err;
-}
-
int audit_send_list(void *_dest)
{
struct audit_netlink_list *dest = _dest;
- int pid = dest->pid;
struct sk_buff *skb;
+ struct net *net = dest->net;
+ struct audit_net *aunet = net_generic(net, audit_net_id);
/* wait for parent to finish and send an ACK */
mutex_lock(&audit_cmd_mutex);
mutex_unlock(&audit_cmd_mutex);
while ((skb = __skb_dequeue(&dest->q)) != NULL)
- netlink_unicast(audit_sock, skb, pid, 0);
+ netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
+ put_net(net);
kfree(dest);
return 0;
}
-struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
- int multi, void *payload, int size)
+struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
+ int multi, const void *payload, int size)
{
struct sk_buff *skb;
struct nlmsghdr *nlh;
@@ -518,33 +561,37 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
if (!skb)
return NULL;
- nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
- data = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, portid, seq, t, size, flags);
+ if (!nlh)
+ goto out_kfree_skb;
+ data = nlmsg_data(nlh);
memcpy(data, payload, size);
return skb;
-nlmsg_failure: /* Used by NLMSG_NEW */
- if (skb)
- kfree_skb(skb);
+out_kfree_skb:
+ kfree_skb(skb);
return NULL;
}
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
+ struct net *net = reply->net;
+ struct audit_net *aunet = net_generic(net, audit_net_id);
mutex_lock(&audit_cmd_mutex);
mutex_unlock(&audit_cmd_mutex);
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
- netlink_unicast(audit_sock, reply->skb, reply->pid, 0);
+ netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
+ put_net(net);
kfree(reply);
return 0;
}
/**
* audit_send_reply - send an audit reply message via netlink
- * @pid: process id to send reply to
+ * @request_skb: skb of request we are replying to (used to target the reply)
* @seq: sequence number
* @type: audit message type
* @done: done (last) flag
@@ -552,12 +599,14 @@ static int audit_send_reply_thread(void *arg)
* @payload: payload data
* @size: payload size
*
- * Allocates an skb, builds the netlink message, and sends it to the pid.
+ * Allocates an skb, builds the netlink message, and sends it to the port id.
* No failure notifications.
*/
-void audit_send_reply(int pid, int seq, int type, int done, int multi,
- void *payload, int size)
+static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
+ int multi, const void *payload, int size)
{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct sk_buff *skb;
struct task_struct *tsk;
struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
@@ -566,11 +615,12 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
if (!reply)
return;
- skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
+ skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
if (!skb)
goto out;
- reply->pid = pid;
+ reply->net = get_net(net);
+ reply->portid = portid;
reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -589,27 +639,49 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
{
int err = 0;
+ /* Only support initial user namespace for now. */
+ /*
+ * We return ECONNREFUSED because it tricks userspace into thinking
+ * that audit was not configured into the kernel. Lots of users
+ * configure their PAM stack (because that's what the distro does)
+ * to reject login if unable to send messages to audit. If we return
+ * ECONNREFUSED the PAM stack thinks the kernel does not have audit
+ * configured in and will let login proceed. If we return EPERM
+ * userspace will reject all logins. This should be removed when we
+ * support non init namespaces!!
+ */
+ if (current_user_ns() != &init_user_ns)
+ return -ECONNREFUSED;
+
switch (msg_type) {
- case AUDIT_GET:
case AUDIT_LIST:
- case AUDIT_LIST_RULES:
- case AUDIT_SET:
case AUDIT_ADD:
- case AUDIT_ADD_RULE:
case AUDIT_DEL:
+ return -EOPNOTSUPP;
+ case AUDIT_GET:
+ case AUDIT_SET:
+ case AUDIT_GET_FEATURE:
+ case AUDIT_SET_FEATURE:
+ case AUDIT_LIST_RULES:
+ case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
case AUDIT_SIGNAL_INFO:
case AUDIT_TTY_GET:
case AUDIT_TTY_SET:
case AUDIT_TRIM:
case AUDIT_MAKE_EQUIV:
- if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
+ /* Only support auditd and auditctl in initial pid namespace
+ * for now. */
+ if ((task_active_pid_ns(current) != &init_pid_ns))
+ return -EPERM;
+
+ if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
- if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
+ if (!netlink_capable(skb, CAP_AUDIT_WRITE))
err = -EPERM;
break;
default: /* bad msg */
@@ -619,45 +691,125 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
return err;
}
-static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
- u32 pid, u32 uid, uid_t auid, u32 ses,
- u32 sid)
+static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
{
int rc = 0;
- char *ctx = NULL;
- u32 len;
+ uid_t uid = from_kuid(&init_user_ns, current_uid());
+ pid_t pid = task_tgid_nr(current);
- if (!audit_enabled) {
+ if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
*ab = NULL;
return rc;
}
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
- audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
- pid, uid, auid, ses);
- if (sid) {
- rc = security_secid_to_secctx(sid, &ctx, &len);
- if (rc)
- audit_log_format(*ab, " ssid=%u", sid);
- else {
- audit_log_format(*ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
+ if (unlikely(!*ab))
+ return rc;
+ audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
+ audit_log_session_info(*ab);
+ audit_log_task_context(*ab);
+
+ return rc;
+}
+
+int is_audit_feature_set(int i)
+{
+ return af.features & AUDIT_FEATURE_TO_MASK(i);
+}
+
+
+static int audit_get_feature(struct sk_buff *skb)
+{
+ u32 seq;
+
+ seq = nlmsg_hdr(skb)->nlmsg_seq;
+
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af));
+
+ return 0;
+}
+
+static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature,
+ u32 old_lock, u32 new_lock, int res)
+{
+ struct audit_buffer *ab;
+
+ if (audit_enabled == AUDIT_OFF)
+ return;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
+ audit_log_task_info(ab, current);
+ audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
+ audit_feature_names[which], !!old_feature, !!new_feature,
+ !!old_lock, !!new_lock, res);
+ audit_log_end(ab);
+}
+
+static int audit_set_feature(struct sk_buff *skb)
+{
+ struct audit_features *uaf;
+ int i;
+
+ BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0]));
+ uaf = nlmsg_data(nlmsg_hdr(skb));
+
+ /* if there is ever a version 2 we should handle that here */
+
+ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
+ u32 feature = AUDIT_FEATURE_TO_MASK(i);
+ u32 old_feature, new_feature, old_lock, new_lock;
+
+ /* if we are not changing this feature, move along */
+ if (!(feature & uaf->mask))
+ continue;
+
+ old_feature = af.features & feature;
+ new_feature = uaf->features & feature;
+ new_lock = (uaf->lock | af.lock) & feature;
+ old_lock = af.lock & feature;
+
+ /* are we changing a locked feature? */
+ if (old_lock && (new_feature != old_feature)) {
+ audit_log_feature_change(i, old_feature, new_feature,
+ old_lock, new_lock, 0);
+ return -EPERM;
}
}
+ /* nothing invalid, do the changes */
+ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
+ u32 feature = AUDIT_FEATURE_TO_MASK(i);
+ u32 old_feature, new_feature, old_lock, new_lock;
- return rc;
+ /* if we are not changing this feature, move along */
+ if (!(feature & uaf->mask))
+ continue;
+
+ old_feature = af.features & feature;
+ new_feature = uaf->features & feature;
+ old_lock = af.lock & feature;
+ new_lock = (uaf->lock | af.lock) & feature;
+
+ if (new_feature != old_feature)
+ audit_log_feature_change(i, old_feature, new_feature,
+ old_lock, new_lock, 1);
+
+ if (new_feature)
+ af.features |= feature;
+ else
+ af.features &= ~feature;
+ af.lock |= new_lock;
+ }
+
+ return 0;
}
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- u32 uid, pid, seq, sid;
+ u32 seq;
void *data;
- struct audit_status *status_get, status_set;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
- uid_t loginuid; /* loginuid of sender */
- u32 sessionid;
struct audit_sig_info *sig_data;
char *ctx = NULL;
u32 len;
@@ -668,70 +820,90 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* As soon as there's any sign of userspace auditd,
* start kauditd to talk to it */
- if (!kauditd_task)
+ if (!kauditd_task) {
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
- if (IS_ERR(kauditd_task)) {
- err = PTR_ERR(kauditd_task);
- kauditd_task = NULL;
- return err;
+ if (IS_ERR(kauditd_task)) {
+ err = PTR_ERR(kauditd_task);
+ kauditd_task = NULL;
+ return err;
+ }
}
-
- pid = NETLINK_CREDS(skb)->pid;
- uid = NETLINK_CREDS(skb)->uid;
- loginuid = NETLINK_CB(skb).loginuid;
- sessionid = NETLINK_CB(skb).sessionid;
- sid = NETLINK_CB(skb).sid;
seq = nlh->nlmsg_seq;
- data = NLMSG_DATA(nlh);
+ data = nlmsg_data(nlh);
switch (msg_type) {
- case AUDIT_GET:
- status_set.enabled = audit_enabled;
- status_set.failure = audit_failure;
- status_set.pid = audit_pid;
- status_set.rate_limit = audit_rate_limit;
- status_set.backlog_limit = audit_backlog_limit;
- status_set.lost = atomic_read(&audit_lost);
- status_set.backlog = skb_queue_len(&audit_skb_queue);
- audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
- &status_set, sizeof(status_set));
+ case AUDIT_GET: {
+ struct audit_status s;
+ memset(&s, 0, sizeof(s));
+ s.enabled = audit_enabled;
+ s.failure = audit_failure;
+ s.pid = audit_pid;
+ s.rate_limit = audit_rate_limit;
+ s.backlog_limit = audit_backlog_limit;
+ s.lost = atomic_read(&audit_lost);
+ s.backlog = skb_queue_len(&audit_skb_queue);
+ s.version = AUDIT_VERSION_LATEST;
+ s.backlog_wait_time = audit_backlog_wait_time;
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
- case AUDIT_SET:
- if (nlh->nlmsg_len < sizeof(struct audit_status))
- return -EINVAL;
- status_get = (struct audit_status *)data;
- if (status_get->mask & AUDIT_STATUS_ENABLED) {
- err = audit_set_enabled(status_get->enabled,
- loginuid, sessionid, sid);
+ }
+ case AUDIT_SET: {
+ struct audit_status s;
+ memset(&s, 0, sizeof(s));
+ /* guard against past and future API changes */
+ memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ if (s.mask & AUDIT_STATUS_ENABLED) {
+ err = audit_set_enabled(s.enabled);
if (err < 0)
return err;
}
- if (status_get->mask & AUDIT_STATUS_FAILURE) {
- err = audit_set_failure(status_get->failure,
- loginuid, sessionid, sid);
+ if (s.mask & AUDIT_STATUS_FAILURE) {
+ err = audit_set_failure(s.failure);
if (err < 0)
return err;
}
- if (status_get->mask & AUDIT_STATUS_PID) {
- int new_pid = status_get->pid;
+ if (s.mask & AUDIT_STATUS_PID) {
+ int new_pid = s.pid;
+ if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
+ return -EACCES;
if (audit_enabled != AUDIT_OFF)
- audit_log_config_change("audit_pid", new_pid,
- audit_pid, loginuid,
- sessionid, sid, 1);
-
+ audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
audit_pid = new_pid;
- audit_nlk_pid = NETLINK_CB(skb).pid;
+ audit_nlk_portid = NETLINK_CB(skb).portid;
+ audit_sock = skb->sk;
+ }
+ if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
+ err = audit_set_rate_limit(s.rate_limit);
+ if (err < 0)
+ return err;
+ }
+ if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) {
+ err = audit_set_backlog_limit(s.backlog_limit);
+ if (err < 0)
+ return err;
}
- if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
- err = audit_set_rate_limit(status_get->rate_limit,
- loginuid, sessionid, sid);
+ if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) {
+ if (sizeof(s) > (size_t)nlh->nlmsg_len)
+ return -EINVAL;
+ if (s.backlog_wait_time < 0 ||
+ s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
+ return -EINVAL;
+ err = audit_set_backlog_wait_time(s.backlog_wait_time);
if (err < 0)
return err;
}
- if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
- err = audit_set_backlog_limit(status_get->backlog_limit,
- loginuid, sessionid, sid);
+ break;
+ }
+ case AUDIT_GET_FEATURE:
+ err = audit_get_feature(skb);
+ if (err)
+ return err;
+ break;
+ case AUDIT_SET_FEATURE:
+ err = audit_set_feature(skb);
+ if (err)
+ return err;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
@@ -739,79 +911,54 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
- err = audit_filter_user(&NETLINK_CB(skb));
- if (err == 1) {
+ err = audit_filter_user(msg_type);
+ if (err == 1) { /* match or error */
err = 0;
if (msg_type == AUDIT_USER_TTY) {
- err = audit_prepare_user_tty(pid, loginuid,
- sessionid);
+ err = tty_audit_push_current();
if (err)
break;
}
- audit_log_common_recv_msg(&ab, msg_type, pid, uid,
- loginuid, sessionid, sid);
-
+ mutex_unlock(&audit_cmd_mutex);
+ audit_log_common_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY)
- audit_log_format(ab, " msg='%.1024s'",
+ audit_log_format(ab, " msg='%.*s'",
+ AUDIT_MESSAGE_TEXT_MAX,
(char *)data);
else {
int size;
- audit_log_format(ab, " msg=");
+ audit_log_format(ab, " data=");
size = nlmsg_len(nlh);
if (size > 0 &&
((unsigned char *)data)[size - 1] == '\0')
size--;
audit_log_n_untrustedstring(ab, data, size);
}
- audit_set_pid(ab, pid);
- audit_log_end(ab);
- }
- break;
- case AUDIT_ADD:
- case AUDIT_DEL:
- if (nlmsg_len(nlh) < sizeof(struct audit_rule))
- return -EINVAL;
- if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
-
- audit_log_format(ab, " audit_enabled=%d res=0",
- audit_enabled);
+ audit_set_portid(ab, NETLINK_CB(skb).portid);
audit_log_end(ab);
- return -EPERM;
+ mutex_lock(&audit_cmd_mutex);
}
- /* fallthrough */
- case AUDIT_LIST:
- err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
- uid, seq, data, nlmsg_len(nlh),
- loginuid, sessionid, sid);
break;
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
-
- audit_log_format(ab, " audit_enabled=%d res=0",
- audit_enabled);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled);
audit_log_end(ab);
return -EPERM;
}
- /* fallthrough */
+ err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
+ seq, data, nlmsg_len(nlh));
+ break;
case AUDIT_LIST_RULES:
- err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
- uid, seq, data, nlmsg_len(nlh),
- loginuid, sessionid, sid);
+ err = audit_list_rules_send(skb, seq);
break;
case AUDIT_TRIM:
audit_trim_trees();
-
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
-
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=trim res=1");
audit_log_end(ab);
break;
@@ -841,8 +988,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* OK, here comes... */
err = audit_tag_tree(old, new);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=make_equiv old=");
audit_log_untrustedstring(ab, old);
@@ -867,53 +1013,56 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_release_secctx(ctx, len);
return -ENOMEM;
}
- sig_data->uid = audit_sig_uid;
+ sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
sig_data->pid = audit_sig_pid;
if (audit_sig_sid) {
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
- audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
- 0, 0, sig_data, sizeof(*sig_data) + len);
+ audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
+ sig_data, sizeof(*sig_data) + len);
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
struct audit_tty_status s;
- struct task_struct *tsk;
-
- read_lock(&tasklist_lock);
- tsk = find_task_by_vpid(pid);
- if (!tsk)
- err = -ESRCH;
- else {
- spin_lock_irq(&tsk->sighand->siglock);
- s.enabled = tsk->signal->audit_tty != 0;
- spin_unlock_irq(&tsk->sighand->siglock);
- }
- read_unlock(&tasklist_lock);
- audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0,
- &s, sizeof(s));
+ struct task_struct *tsk = current;
+
+ spin_lock(&tsk->sighand->siglock);
+ s.enabled = tsk->signal->audit_tty;
+ s.log_passwd = tsk->signal->audit_tty_log_passwd;
+ spin_unlock(&tsk->sighand->siglock);
+
+ audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
- struct audit_tty_status *s;
- struct task_struct *tsk;
-
- if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
- return -EINVAL;
- s = data;
- if (s->enabled != 0 && s->enabled != 1)
- return -EINVAL;
- read_lock(&tasklist_lock);
- tsk = find_task_by_vpid(pid);
- if (!tsk)
- err = -ESRCH;
- else {
- spin_lock_irq(&tsk->sighand->siglock);
- tsk->signal->audit_tty = s->enabled != 0;
- spin_unlock_irq(&tsk->sighand->siglock);
+ struct audit_tty_status s, old;
+ struct task_struct *tsk = current;
+ struct audit_buffer *ab;
+
+ memset(&s, 0, sizeof(s));
+ /* guard against past and future API changes */
+ memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ /* check if new data is valid */
+ if ((s.enabled != 0 && s.enabled != 1) ||
+ (s.log_passwd != 0 && s.log_passwd != 1))
+ err = -EINVAL;
+
+ spin_lock(&tsk->sighand->siglock);
+ old.enabled = tsk->signal->audit_tty;
+ old.log_passwd = tsk->signal->audit_tty_log_passwd;
+ if (!err) {
+ tsk->signal->audit_tty = s.enabled;
+ tsk->signal->audit_tty_log_passwd = s.log_passwd;
}
- read_unlock(&tasklist_lock);
+ spin_unlock(&tsk->sighand->siglock);
+
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
+ " old-log_passwd=%d new-log_passwd=%d res=%d",
+ old.enabled, s.enabled, old.log_passwd,
+ s.log_passwd, !err);
+ audit_log_end(ab);
break;
}
default:
@@ -932,7 +1081,7 @@ static void audit_receive_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
/*
- * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
+ * len MUST be signed for nlmsg_next to be able to dec it below 0
* if the nlmsg_len was not aligned
*/
int len;
@@ -941,13 +1090,13 @@ static void audit_receive_skb(struct sk_buff *skb)
nlh = nlmsg_hdr(skb);
len = skb->len;
- while (NLMSG_OK(nlh, len)) {
+ while (nlmsg_ok(nlh, len)) {
err = audit_receive_msg(skb, nlh);
/* if err or if this message says it wants a response */
if (err || (nlh->nlmsg_flags & NLM_F_ACK))
netlink_ack(skb, nlh, err);
- nlh = NLMSG_NEXT(nlh, len);
+ nlh = nlmsg_next(nlh, &len);
}
}
@@ -959,6 +1108,56 @@ static void audit_receive(struct sk_buff *skb)
mutex_unlock(&audit_cmd_mutex);
}
+/* Run custom bind function on netlink socket group connect or bind requests. */
+static int audit_bind(int group)
+{
+ if (!capable(CAP_AUDIT_READ))
+ return -EPERM;
+
+ return 0;
+}
+
+static int __net_init audit_net_init(struct net *net)
+{
+ struct netlink_kernel_cfg cfg = {
+ .input = audit_receive,
+ .bind = audit_bind,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ .groups = AUDIT_NLGRP_MAX,
+ };
+
+ struct audit_net *aunet = net_generic(net, audit_net_id);
+
+ aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
+ if (aunet->nlsk == NULL) {
+ audit_panic("cannot initialize netlink socket in namespace");
+ return -ENOMEM;
+ }
+ aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ return 0;
+}
+
+static void __net_exit audit_net_exit(struct net *net)
+{
+ struct audit_net *aunet = net_generic(net, audit_net_id);
+ struct sock *sock = aunet->nlsk;
+ if (sock == audit_sock) {
+ audit_pid = 0;
+ audit_sock = NULL;
+ }
+
+ RCU_INIT_POINTER(aunet->nlsk, NULL);
+ synchronize_net();
+ netlink_kernel_release(sock);
+}
+
+static struct pernet_operations audit_net_ops __net_initdata = {
+ .init = audit_net_init,
+ .exit = audit_net_exit,
+ .id = &audit_net_id,
+ .size = sizeof(struct audit_net),
+};
+
/* Initialize audit support at boot time. */
static int __init audit_init(void)
{
@@ -967,14 +1166,9 @@ static int __init audit_init(void)
if (audit_initialized == AUDIT_DISABLED)
return 0;
- printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
- audit_default ? "enabled" : "disabled");
- audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
- audit_receive, NULL, THIS_MODULE);
- if (!audit_sock)
- audit_panic("cannot initialize netlink socket");
- else
- audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ pr_info("initializing netlink subsys (%s)\n",
+ audit_default ? "enabled" : "disabled");
+ register_pernet_subsys(&audit_net_ops);
skb_queue_head_init(&audit_skb_queue);
skb_queue_head_init(&audit_skb_hold_queue);
@@ -998,22 +1192,32 @@ static int __init audit_enable(char *str)
if (!audit_default)
audit_initialized = AUDIT_DISABLED;
- printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled");
+ pr_info("%s\n", audit_default ?
+ "enabled (after initialization)" : "disabled (until reboot)");
- if (audit_initialized == AUDIT_INITIALIZED) {
- audit_enabled = audit_default;
- audit_ever_enabled |= !!audit_default;
- } else if (audit_initialized == AUDIT_UNINITIALIZED) {
- printk(" (after initialization)");
- } else {
- printk(" (until reboot)");
+ return 1;
+}
+__setup("audit=", audit_enable);
+
+/* Process kernel command-line parameter at boot time.
+ * audit_backlog_limit=<n> */
+static int __init audit_backlog_limit_set(char *str)
+{
+ u32 audit_backlog_limit_arg;
+
+ pr_info("audit_backlog_limit: ");
+ if (kstrtouint(str, 0, &audit_backlog_limit_arg)) {
+ pr_cont("using default of %u, unable to parse %s\n",
+ audit_backlog_limit, str);
+ return 1;
}
- printk("\n");
+
+ audit_backlog_limit = audit_backlog_limit_arg;
+ pr_cont("%d\n", audit_backlog_limit);
return 1;
}
-
-__setup("audit=", audit_enable);
+__setup("audit_backlog_limit=", audit_backlog_limit_set);
static void audit_buffer_free(struct audit_buffer *ab)
{
@@ -1062,13 +1266,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
if (!ab->skb)
- goto nlmsg_failure;
+ goto err;
- nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
+ nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
+ if (!nlh)
+ goto out_kfree_skb;
return ab;
-nlmsg_failure: /* Used by NLMSG_NEW */
+out_kfree_skb:
kfree_skb(ab->skb);
ab->skb = NULL;
err:
@@ -1119,12 +1325,24 @@ static inline void audit_get_stamp(struct audit_context *ctx,
}
}
-/* Obtain an audit buffer. This routine does locking to obtain the
- * audit buffer, but then no locking is required for calls to
- * audit_log_*format. If the tsk is a task that is currently in a
- * syscall, then the syscall is marked as auditable and an audit record
- * will be written at syscall exit. If there is no associated task, tsk
- * should be NULL. */
+/*
+ * Wait for auditd to drain the queue a little
+ */
+static long wait_for_auditd(long sleep_time)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+
+ if (audit_backlog_limit &&
+ skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+ sleep_time = schedule_timeout(sleep_time);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+
+ return sleep_time;
+}
/**
* audit_log_start - obtain an audit buffer
@@ -1147,7 +1365,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
struct audit_buffer *ab = NULL;
struct timespec t;
unsigned int uninitialized_var(serial);
- int reserve;
+ int reserve = 5; /* Allow atomic callers to go up to five
+ entries over the normal backlog limit */
unsigned long timeout_start = jiffies;
if (audit_initialized != AUDIT_INITIALIZED)
@@ -1156,42 +1375,37 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
if (unlikely(audit_filter_type(type)))
return NULL;
- if (gfp_mask & __GFP_WAIT)
- reserve = 0;
- else
- reserve = 5; /* Allow atomic callers to go up to five
- entries over the normal backlog limit */
+ if (gfp_mask & __GFP_WAIT) {
+ if (audit_pid && audit_pid == current->pid)
+ gfp_mask &= ~__GFP_WAIT;
+ else
+ reserve = 0;
+ }
while (audit_backlog_limit
&& skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
- if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
- && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
-
- /* Wait for auditd to drain the queue a little */
- DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&audit_backlog_wait, &wait);
-
- if (audit_backlog_limit &&
- skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
- schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&audit_backlog_wait, &wait);
- continue;
+ if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+ long sleep_time;
+
+ sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
+ if (sleep_time > 0) {
+ sleep_time = wait_for_auditd(sleep_time);
+ if (sleep_time > 0)
+ continue;
+ }
}
if (audit_rate_check() && printk_ratelimit())
- printk(KERN_WARNING
- "audit: audit_backlog=%d > "
- "audit_backlog_limit=%d\n",
- skb_queue_len(&audit_skb_queue),
- audit_backlog_limit);
+ pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
+ skb_queue_len(&audit_skb_queue),
+ audit_backlog_limit);
audit_log_lost("backlog limit exceeded");
audit_backlog_wait_time = audit_backlog_wait_overflow;
wake_up(&audit_backlog_wait);
return NULL;
}
+ audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
+
ab = audit_buffer_alloc(ctx, gfp_mask, type);
if (!ab) {
audit_log_lost("out of memory in audit_log_start");
@@ -1262,12 +1476,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
avail = audit_expand(ab,
max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
if (!avail)
- goto out;
+ goto out_va_end;
len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
}
- va_end(args2);
if (len > 0)
skb_put(skb, len);
+out_va_end:
+ va_end(args2);
out:
return;
}
@@ -1308,7 +1523,6 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
int i, avail, new_len;
unsigned char *ptr;
struct sk_buff *skb;
- static const unsigned char *hex = "0123456789ABCDEF";
if (!ab)
return;
@@ -1326,10 +1540,8 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
}
ptr = skb_tail_pointer(skb);
- for (i=0; i<len; i++) {
- *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
- *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */
- }
+ for (i = 0; i < len; i++)
+ ptr = hex_byte_pack_upper(ptr, buf[i]);
*ptr = 0;
skb_put(skb, len << 1); /* new string is twice the old string */
}
@@ -1419,12 +1631,12 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
/* This is a helper-function to print the escaped d_path */
void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
- struct path *path)
+ const struct path *path)
{
char *p, *pathname;
if (prefix)
- audit_log_format(ab, " %s", prefix);
+ audit_log_format(ab, "%s", prefix);
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
@@ -1441,6 +1653,14 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
kfree(pathname);
}
+void audit_log_session_info(struct audit_buffer *ab)
+{
+ unsigned int sessionid = audit_get_sessionid(current);
+ uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
+
+ audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
+}
+
void audit_log_key(struct audit_buffer *ab, char *key)
{
audit_log_format(ab, " key=");
@@ -1450,14 +1670,285 @@ void audit_log_key(struct audit_buffer *ab, char *key)
audit_log_format(ab, "(null)");
}
+void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
+{
+ int i;
+
+ audit_log_format(ab, " %s=", prefix);
+ CAP_FOR_EACH_U32(i) {
+ audit_log_format(ab, "%08x",
+ cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
+ }
+}
+
+void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+{
+ kernel_cap_t *perm = &name->fcap.permitted;
+ kernel_cap_t *inh = &name->fcap.inheritable;
+ int log = 0;
+
+ if (!cap_isclear(*perm)) {
+ audit_log_cap(ab, "cap_fp", perm);
+ log = 1;
+ }
+ if (!cap_isclear(*inh)) {
+ audit_log_cap(ab, "cap_fi", inh);
+ log = 1;
+ }
+
+ if (log)
+ audit_log_format(ab, " cap_fe=%d cap_fver=%x",
+ name->fcap.fE, name->fcap_ver);
+}
+
+static inline int audit_copy_fcaps(struct audit_names *name,
+ const struct dentry *dentry)
+{
+ struct cpu_vfs_cap_data caps;
+ int rc;
+
+ if (!dentry)
+ return 0;
+
+ rc = get_vfs_caps_from_disk(dentry, &caps);
+ if (rc)
+ return rc;
+
+ name->fcap.permitted = caps.permitted;
+ name->fcap.inheritable = caps.inheritable;
+ name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
+ VFS_CAP_REVISION_SHIFT;
+
+ return 0;
+}
+
+/* Copy inode data into an audit_names. */
+void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
+ const struct inode *inode)
+{
+ name->ino = inode->i_ino;
+ name->dev = inode->i_sb->s_dev;
+ name->mode = inode->i_mode;
+ name->uid = inode->i_uid;
+ name->gid = inode->i_gid;
+ name->rdev = inode->i_rdev;
+ security_inode_getsecid(inode, &name->osid);
+ audit_copy_fcaps(name, dentry);
+}
+
+/**
+ * audit_log_name - produce AUDIT_PATH record from struct audit_names
+ * @context: audit_context for the task
+ * @n: audit_names structure with reportable details
+ * @path: optional path to report instead of audit_names->name
+ * @record_num: record number to report when handling a list of names
+ * @call_panic: optional pointer to int that will be updated if secid fails
+ */
+void audit_log_name(struct audit_context *context, struct audit_names *n,
+ struct path *path, int record_num, int *call_panic)
+{
+ struct audit_buffer *ab;
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+ if (!ab)
+ return;
+
+ audit_log_format(ab, "item=%d", record_num);
+
+ if (path)
+ audit_log_d_path(ab, " name=", path);
+ else if (n->name) {
+ switch (n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd */
+ audit_log_d_path(ab, " name=", &context->pwd);
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name->name,
+ n->name_len);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != (unsigned long)-1) {
+ audit_log_format(ab, " inode=%lu"
+ " dev=%02x:%02x mode=%#ho"
+ " ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ from_kuid(&init_user_ns, n->uid),
+ from_kgid(&init_user_ns, n->gid),
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ }
+ if (n->osid != 0) {
+ char *ctx = NULL;
+ u32 len;
+ if (security_secid_to_secctx(
+ n->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", n->osid);
+ if (call_panic)
+ *call_panic = 2;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+
+ /* log the audit_names record type */
+ audit_log_format(ab, " nametype=");
+ switch(n->type) {
+ case AUDIT_TYPE_NORMAL:
+ audit_log_format(ab, "NORMAL");
+ break;
+ case AUDIT_TYPE_PARENT:
+ audit_log_format(ab, "PARENT");
+ break;
+ case AUDIT_TYPE_CHILD_DELETE:
+ audit_log_format(ab, "DELETE");
+ break;
+ case AUDIT_TYPE_CHILD_CREATE:
+ audit_log_format(ab, "CREATE");
+ break;
+ default:
+ audit_log_format(ab, "UNKNOWN");
+ break;
+ }
+
+ audit_log_fcaps(ab, n);
+ audit_log_end(ab);
+}
+
+int audit_log_task_context(struct audit_buffer *ab)
+{
+ char *ctx = NULL;
+ unsigned len;
+ int error;
+ u32 sid;
+
+ security_task_getsecid(current, &sid);
+ if (!sid)
+ return 0;
+
+ error = security_secid_to_secctx(sid, &ctx, &len);
+ if (error) {
+ if (error != -EINVAL)
+ goto error_path;
+ return 0;
+ }
+
+ audit_log_format(ab, " subj=%s", ctx);
+ security_release_secctx(ctx, len);
+ return 0;
+
+error_path:
+ audit_panic("error in audit_log_task_context");
+ return error;
+}
+EXPORT_SYMBOL(audit_log_task_context);
+
+void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
+{
+ const struct cred *cred;
+ char name[sizeof(tsk->comm)];
+ struct mm_struct *mm = tsk->mm;
+ char *tty;
+
+ if (!ab)
+ return;
+
+ /* tsk == current */
+ cred = current_cred();
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
+ tty = tsk->signal->tty->name;
+ else
+ tty = "(none)";
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ audit_log_format(ab,
+ " ppid=%d pid=%d auid=%u uid=%u gid=%u"
+ " euid=%u suid=%u fsuid=%u"
+ " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
+ task_ppid_nr(tsk),
+ task_pid_nr(tsk),
+ from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kgid(&init_user_ns, cred->gid),
+ from_kuid(&init_user_ns, cred->euid),
+ from_kuid(&init_user_ns, cred->suid),
+ from_kuid(&init_user_ns, cred->fsuid),
+ from_kgid(&init_user_ns, cred->egid),
+ from_kgid(&init_user_ns, cred->sgid),
+ from_kgid(&init_user_ns, cred->fsgid),
+ tty, audit_get_sessionid(tsk));
+
+ get_task_comm(name, tsk);
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, name);
+
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ if (mm->exe_file)
+ audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
+ up_read(&mm->mmap_sem);
+ } else
+ audit_log_format(ab, " exe=(null)");
+ audit_log_task_context(ab);
+}
+EXPORT_SYMBOL(audit_log_task_info);
+
+/**
+ * audit_log_link_denied - report a link restriction denial
+ * @operation: specific link opreation
+ * @link: the path that triggered the restriction
+ */
+void audit_log_link_denied(const char *operation, struct path *link)
+{
+ struct audit_buffer *ab;
+ struct audit_names *name;
+
+ name = kzalloc(sizeof(*name), GFP_NOFS);
+ if (!name)
+ return;
+
+ /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
+ ab = audit_log_start(current->audit_context, GFP_KERNEL,
+ AUDIT_ANOM_LINK);
+ if (!ab)
+ goto out;
+ audit_log_format(ab, "op=%s", operation);
+ audit_log_task_info(ab, current);
+ audit_log_format(ab, " res=0");
+ audit_log_end(ab);
+
+ /* Generate AUDIT_PATH record with object. */
+ name->type = AUDIT_TYPE_NORMAL;
+ audit_copy_inode(name, link->dentry, link->dentry->d_inode);
+ audit_log_name(current->audit_context, name, link, 0, NULL);
+out:
+ kfree(name);
+}
+
/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
- * The netlink_* functions cannot be called inside an irq context, so
- * the audit buffer is placed on a queue and a tasklet is scheduled to
- * remove them from the queue outside the irq context. May be called in
- * any context.
+ * netlink_unicast() cannot be called inside an irq context because it blocks
+ * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
+ * on a queue and a tasklet is scheduled to remove them from the queue outside
+ * the irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
{
@@ -1467,7 +1958,19 @@ void audit_log_end(struct audit_buffer *ab)
audit_log_lost("rate limit exceeded");
} else {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+
+ kauditd_send_multicast_skb(ab->skb);
+
+ /*
+ * The original kaudit unicast socket sends up messages with
+ * nlmsg_len set to the payload length rather than the entire
+ * message length. This breaks the standard set by netlink.
+ * The existing auditd daemon assumes this breakage. Fixing
+ * this would require co-ordinating a change in the established
+ * protocol between the kaudit kernel subsystem and the auditd
+ * userspace code.
+ */
+ nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
if (audit_pid) {
skb_queue_tail(&audit_skb_queue, ab->skb);
@@ -1507,6 +2010,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
}
}
+#ifdef CONFIG_SECURITY
+/**
+ * audit_log_secctx - Converts and logs SELinux context
+ * @ab: audit_buffer
+ * @secid: security number
+ *
+ * This is a helper function that calls security_secid_to_secctx to convert
+ * secid to secctx and then adds the (converted) SELinux context to the audit
+ * log by calling audit_log_format, thus also preventing leak of internal secid
+ * to userspace. If secid cannot be converted audit_panic is called.
+ */
+void audit_log_secctx(struct audit_buffer *ab, u32 secid)
+{
+ u32 len;
+ char *secctx;
+
+ if (security_secid_to_secctx(secid, &secctx, &len)) {
+ audit_panic("Cannot convert secid to context");
+ } else {
+ audit_log_format(ab, " obj=%s", secctx);
+ security_release_secctx(secctx, len);
+ }
+}
+EXPORT_SYMBOL(audit_log_secctx);
+#endif
+
EXPORT_SYMBOL(audit_log_start);
EXPORT_SYMBOL(audit_log_end);
EXPORT_SYMBOL(audit_log_format);
diff --git a/kernel/audit.h b/kernel/audit.h
index 208687be4f3..7bb65730c89 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/audit.h>
#include <linux/skbuff.h>
+#include <uapi/linux/mqueue.h>
/* 0 = no checking
1 = put_count checking
@@ -29,6 +30,11 @@
*/
#define AUDIT_DEBUG 0
+/* AUDIT_NAMES is the number of slots we reserve in the audit_context
+ * for saving names from getname(). If we get more names we will allocate
+ * a name dynamically and also add those to the list anchored by names_list. */
+#define AUDIT_NAMES 5
+
/* At task start time, the audit_state is set in the audit_context using
a per-task filter. At syscall entry, the audit_state is augmented by
the syscall filter. */
@@ -36,12 +42,8 @@ enum audit_state {
AUDIT_DISABLED, /* Do not create per-task audit_context.
* No syscall-specific audit records can
* be generated. */
- AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
- * but don't necessarily fill it in at
- * syscall entry time (i.e., filter
- * instead). */
AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
- * and always fill it in at syscall
+ * and fill it in at syscall
* entry time. This makes a full
* syscall record available if some
* other part of the kernel decides it
@@ -63,10 +65,167 @@ struct audit_entry {
struct audit_krule rule;
};
-#ifdef CONFIG_AUDIT
-extern int audit_enabled;
-extern int audit_ever_enabled;
+struct audit_cap_data {
+ kernel_cap_t permitted;
+ kernel_cap_t inheritable;
+ union {
+ unsigned int fE; /* effective bit of file cap */
+ kernel_cap_t effective; /* effective set of process */
+ };
+};
+
+/* When fs/namei.c:getname() is called, we store the pointer in name and
+ * we don't let putname() free it (instead we free all of the saved
+ * pointers at syscall exit time).
+ *
+ * Further, in fs/namei.c:path_lookup() we store the inode and device.
+ */
+struct audit_names {
+ struct list_head list; /* audit_context->names_list */
+
+ struct filename *name;
+ int name_len; /* number of chars to log */
+ bool hidden; /* don't log this record */
+ bool name_put; /* call __putname()? */
+
+ unsigned long ino;
+ dev_t dev;
+ umode_t mode;
+ kuid_t uid;
+ kgid_t gid;
+ dev_t rdev;
+ u32 osid;
+ struct audit_cap_data fcap;
+ unsigned int fcap_ver;
+ unsigned char type; /* record type */
+ /*
+ * This was an allocated audit_names and not from the array of
+ * names allocated in the task audit context. Thus this name
+ * should be freed on syscall exit.
+ */
+ bool should_free;
+};
+
+struct audit_proctitle {
+ int len; /* length of the cmdline field. */
+ char *value; /* the cmdline field */
+};
+
+/* The per-task audit context. */
+struct audit_context {
+ int dummy; /* must be the first element */
+ int in_syscall; /* 1 if task is in a syscall */
+ enum audit_state state, current_state;
+ unsigned int serial; /* serial number for record */
+ int major; /* syscall number */
+ struct timespec ctime; /* time of syscall entry */
+ unsigned long argv[4]; /* syscall arguments */
+ long return_code;/* syscall return code */
+ u64 prio;
+ int return_valid; /* return code is valid */
+ /*
+ * The names_list is the list of all audit_names collected during this
+ * syscall. The first AUDIT_NAMES entries in the names_list will
+ * actually be from the preallocated_names array for performance
+ * reasons. Except during allocation they should never be referenced
+ * through the preallocated_names array and should only be found/used
+ * by running the names_list.
+ */
+ struct audit_names preallocated_names[AUDIT_NAMES];
+ int name_count; /* total records in names_list */
+ struct list_head names_list; /* struct audit_names->list anchor */
+ char *filterkey; /* key for rule that triggered record */
+ struct path pwd;
+ struct audit_aux_data *aux;
+ struct audit_aux_data *aux_pids;
+ struct sockaddr_storage *sockaddr;
+ size_t sockaddr_len;
+ /* Save things to print about task_struct */
+ pid_t pid, ppid;
+ kuid_t uid, euid, suid, fsuid;
+ kgid_t gid, egid, sgid, fsgid;
+ unsigned long personality;
+ int arch;
+
+ pid_t target_pid;
+ kuid_t target_auid;
+ kuid_t target_uid;
+ unsigned int target_sessionid;
+ u32 target_sid;
+ char target_comm[TASK_COMM_LEN];
+
+ struct audit_tree_refs *trees, *first_trees;
+ struct list_head killed_trees;
+ int tree_count;
+
+ int type;
+ union {
+ struct {
+ int nargs;
+ long args[6];
+ } socketcall;
+ struct {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+ u32 osid;
+ int has_perm;
+ uid_t perm_uid;
+ gid_t perm_gid;
+ umode_t perm_mode;
+ unsigned long qbytes;
+ } ipc;
+ struct {
+ mqd_t mqdes;
+ struct mq_attr mqstat;
+ } mq_getsetattr;
+ struct {
+ mqd_t mqdes;
+ int sigev_signo;
+ } mq_notify;
+ struct {
+ mqd_t mqdes;
+ size_t msg_len;
+ unsigned int msg_prio;
+ struct timespec abs_timeout;
+ } mq_sendrecv;
+ struct {
+ int oflag;
+ umode_t mode;
+ struct mq_attr attr;
+ } mq_open;
+ struct {
+ pid_t pid;
+ struct audit_cap_data cap;
+ } capset;
+ struct {
+ int fd;
+ int flags;
+ } mmap;
+ struct {
+ int argc;
+ } execve;
+ };
+ int fds[2];
+ struct audit_proctitle proctitle;
+
+#if AUDIT_DEBUG
+ int put_count;
+ int ino_count;
#endif
+};
+
+extern u32 audit_ever_enabled;
+
+extern void audit_copy_inode(struct audit_names *name,
+ const struct dentry *dentry,
+ const struct inode *inode);
+extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
+ kernel_cap_t *cap);
+extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name);
+extern void audit_log_name(struct audit_context *context,
+ struct audit_names *n, struct path *path,
+ int record_num, int *call_panic);
extern int audit_pid;
@@ -78,46 +237,59 @@ static inline int audit_hash_ino(u32 ino)
return (ino & (AUDIT_INODE_BUCKETS-1));
}
+/* Indicates that audit should log the full pathname. */
+#define AUDIT_NAME_FULL -1
+
extern int audit_match_class(int class, unsigned syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
-extern int audit_compare_dname_path(const char *dname, const char *path,
- int *dirlen);
-extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
- int done, int multi,
- void *payload, int size);
-extern void audit_send_reply(int pid, int seq, int type,
- int done, int multi,
- void *payload, int size);
+extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
+extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
+extern int parent_len(const char *path);
+extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
+extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
+ int done, int multi,
+ const void *payload, int size);
extern void audit_panic(const char *message);
struct audit_netlink_list {
- int pid;
+ __u32 portid;
+ struct net *net;
struct sk_buff_head q;
};
int audit_send_list(void *);
+struct audit_net {
+ struct sock *nlsk;
+};
+
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
extern void audit_free_rule_rcu(struct rcu_head *);
extern struct list_head audit_filter_list[];
+extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
+
/* audit watch functions */
-extern unsigned long audit_watch_inode(struct audit_watch *watch);
-extern dev_t audit_watch_dev(struct audit_watch *watch);
+#ifdef CONFIG_AUDIT_WATCH
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
-extern int audit_add_watch(struct audit_krule *krule);
-extern void audit_remove_watch(struct audit_watch *watch);
-extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
-extern void audit_inotify_unregister(struct list_head *in_list);
+extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
+extern void audit_remove_watch_rule(struct audit_krule *krule);
extern char *audit_watch_path(struct audit_watch *watch);
-extern struct list_head *audit_watch_rules(struct audit_watch *watch);
+extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+#else
+#define audit_put_watch(w) {}
+#define audit_get_watch(w) {}
+#define audit_to_watch(k, p, l, o) (-EINVAL)
+#define audit_add_watch(k, l) (-EINVAL)
+#define audit_remove_watch_rule(k) BUG()
+#define audit_watch_path(w) ""
+#define audit_watch_compare(w, i, d) 0
-extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
- struct audit_watch *watch);
+#endif /* CONFIG_AUDIT_WATCH */
#ifdef CONFIG_AUDIT_TREE
extern struct audit_chunk *audit_tree_lookup(const struct inode *);
@@ -145,7 +317,7 @@ extern void audit_kill_trees(struct list_head *);
extern char *audit_unpack_string(void **, size_t *, size_t);
extern pid_t audit_sig_pid;
-extern uid_t audit_sig_uid;
+extern kuid_t audit_sig_uid;
extern u32 audit_sig_sid;
#ifdef CONFIG_AUDITSYSCALL
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 4b05bd9479d..135944a7b28 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,8 +1,9 @@
#include "audit.h"
-#include <linux/inotify.h>
+#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/kthread.h>
+#include <linux/slab.h>
struct audit_tree;
struct audit_chunk;
@@ -21,7 +22,7 @@ struct audit_tree {
struct audit_chunk {
struct list_head hash;
- struct inotify_watch watch;
+ struct fsnotify_mark mark;
struct list_head trees; /* with root here */
int dead;
int count;
@@ -58,7 +59,7 @@ static LIST_HEAD(prune_list);
* tree is refcounted; one reference for "some rules on rules_list refer to
* it", one for each chunk with pointer to it.
*
- * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
+ * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
* of watch contributes 1 to .refs).
*
* node.index allows to get from node.list to containing chunk.
@@ -67,7 +68,7 @@ static LIST_HEAD(prune_list);
* that makes a difference. Some.
*/
-static struct inotify_handle *rtree_ih;
+static struct fsnotify_group *audit_tree_group;
static struct audit_tree *alloc_tree(const char *s)
{
@@ -92,16 +93,10 @@ static inline void get_tree(struct audit_tree *tree)
atomic_inc(&tree->count);
}
-static void __put_tree(struct rcu_head *rcu)
-{
- struct audit_tree *tree = container_of(rcu, struct audit_tree, head);
- kfree(tree);
-}
-
static inline void put_tree(struct audit_tree *tree)
{
if (atomic_dec_and_test(&tree->count))
- call_rcu(&tree->head, __put_tree);
+ kfree_rcu(tree, head);
}
/* to avoid bringing the entire thing in audit.h */
@@ -110,29 +105,6 @@ const char *audit_tree_path(struct audit_tree *tree)
return tree->pathname;
}
-static struct audit_chunk *alloc_chunk(int count)
-{
- struct audit_chunk *chunk;
- size_t size;
- int i;
-
- size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
- chunk = kzalloc(size, GFP_KERNEL);
- if (!chunk)
- return NULL;
-
- INIT_LIST_HEAD(&chunk->hash);
- INIT_LIST_HEAD(&chunk->trees);
- chunk->count = count;
- atomic_long_set(&chunk->refs, 1);
- for (i = 0; i < count; i++) {
- INIT_LIST_HEAD(&chunk->owners[i].list);
- chunk->owners[i].index = i;
- }
- inotify_init_watch(&chunk->watch);
- return chunk;
-}
-
static void free_chunk(struct audit_chunk *chunk)
{
int i;
@@ -156,6 +128,35 @@ static void __put_chunk(struct rcu_head *rcu)
audit_put_chunk(chunk);
}
+static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
+{
+ struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+ call_rcu(&chunk->head, __put_chunk);
+}
+
+static struct audit_chunk *alloc_chunk(int count)
+{
+ struct audit_chunk *chunk;
+ size_t size;
+ int i;
+
+ size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
+ chunk = kzalloc(size, GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ INIT_LIST_HEAD(&chunk->hash);
+ INIT_LIST_HEAD(&chunk->trees);
+ chunk->count = count;
+ atomic_long_set(&chunk->refs, 1);
+ for (i = 0; i < count; i++) {
+ INIT_LIST_HEAD(&chunk->owners[i].list);
+ chunk->owners[i].index = i;
+ }
+ fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
+ return chunk;
+}
+
enum {HASH_SIZE = 128};
static struct list_head chunk_hash_heads[HASH_SIZE];
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
@@ -166,10 +167,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
return chunk_hash_heads + n % HASH_SIZE;
}
-/* hash_lock is held by caller */
+/* hash_lock & entry->lock is held by caller */
static void insert_hash(struct audit_chunk *chunk)
{
- struct list_head *list = chunk_hash(chunk->watch.inode);
+ struct fsnotify_mark *entry = &chunk->mark;
+ struct list_head *list;
+
+ if (!entry->i.inode)
+ return;
+ list = chunk_hash(entry->i.inode);
list_add_rcu(&chunk->hash, list);
}
@@ -180,7 +186,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
struct audit_chunk *p;
list_for_each_entry_rcu(p, list, hash) {
- if (p->watch.inode == inode) {
+ /* mark.inode may have gone NULL, but who cares? */
+ if (p->mark.i.inode == inode) {
atomic_long_inc(&p->refs);
return p;
}
@@ -209,38 +216,24 @@ static struct audit_chunk *find_chunk(struct node *p)
static void untag_chunk(struct node *p)
{
struct audit_chunk *chunk = find_chunk(p);
- struct audit_chunk *new;
+ struct fsnotify_mark *entry = &chunk->mark;
+ struct audit_chunk *new = NULL;
struct audit_tree *owner;
int size = chunk->count - 1;
int i, j;
- if (!pin_inotify_watch(&chunk->watch)) {
- /*
- * Filesystem is shutting down; all watches are getting
- * evicted, just take it off the node list for this
- * tree and let the eviction logics take care of the
- * rest.
- */
- owner = p->owner;
- if (owner->root == chunk) {
- list_del_init(&owner->same_root);
- owner->root = NULL;
- }
- list_del_init(&p->list);
- p->owner = NULL;
- put_tree(owner);
- return;
- }
+ fsnotify_get_mark(entry);
spin_unlock(&hash_lock);
- /*
- * pin_inotify_watch() succeeded, so the watch won't go away
- * from under us.
- */
- mutex_lock(&chunk->watch.inode->inotify_mutex);
- if (chunk->dead) {
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
+ if (size)
+ new = alloc_chunk(size);
+
+ spin_lock(&entry->lock);
+ if (chunk->dead || !entry->i.inode) {
+ spin_unlock(&entry->lock);
+ if (new)
+ free_chunk(new);
goto out;
}
@@ -255,17 +248,17 @@ static void untag_chunk(struct node *p)
list_del_init(&p->list);
list_del_rcu(&chunk->hash);
spin_unlock(&hash_lock);
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry, audit_tree_group);
goto out;
}
- new = alloc_chunk(size);
if (!new)
goto Fallback;
- if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) {
- free_chunk(new);
+
+ fsnotify_duplicate_mark(&new->mark, entry);
+ if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
+ fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -297,9 +290,9 @@ static void untag_chunk(struct node *p)
list_for_each_entry(owner, &new->trees, same_root)
owner->root = new;
spin_unlock(&hash_lock);
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry, audit_tree_group);
+ fsnotify_put_mark(&new->mark); /* drop initial reference */
goto out;
Fallback:
@@ -313,31 +306,33 @@ Fallback:
p->owner = NULL;
put_tree(owner);
spin_unlock(&hash_lock);
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
+ spin_unlock(&entry->lock);
out:
- unpin_inotify_watch(&chunk->watch);
+ fsnotify_put_mark(entry);
spin_lock(&hash_lock);
}
static int create_chunk(struct inode *inode, struct audit_tree *tree)
{
+ struct fsnotify_mark *entry;
struct audit_chunk *chunk = alloc_chunk(1);
if (!chunk)
return -ENOMEM;
- if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) {
- free_chunk(chunk);
+ entry = &chunk->mark;
+ if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
+ fsnotify_put_mark(entry);
return -ENOSPC;
}
- mutex_lock(&inode->inotify_mutex);
+ spin_lock(&entry->lock);
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
chunk->dead = 1;
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry, audit_tree_group);
+ fsnotify_put_mark(entry);
return 0;
}
chunk->owners[0].index = (1U << 31);
@@ -350,30 +345,32 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
}
insert_hash(chunk);
spin_unlock(&hash_lock);
- mutex_unlock(&inode->inotify_mutex);
+ spin_unlock(&entry->lock);
+ fsnotify_put_mark(entry); /* drop initial reference */
return 0;
}
/* the first tagged inode becomes root of tree */
static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
- struct inotify_watch *watch;
+ struct fsnotify_mark *old_entry, *chunk_entry;
struct audit_tree *owner;
struct audit_chunk *chunk, *old;
struct node *p;
int n;
- if (inotify_find_watch(rtree_ih, inode, &watch) < 0)
+ old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
+ if (!old_entry)
return create_chunk(inode, tree);
- old = container_of(watch, struct audit_chunk, watch);
+ old = container_of(old_entry, struct audit_chunk, mark);
/* are we already there? */
spin_lock(&hash_lock);
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
- put_inotify_watch(&old->watch);
+ fsnotify_put_mark(old_entry);
return 0;
}
}
@@ -381,25 +378,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk = alloc_chunk(old->count + 1);
if (!chunk) {
- put_inotify_watch(&old->watch);
+ fsnotify_put_mark(old_entry);
return -ENOMEM;
}
- mutex_lock(&inode->inotify_mutex);
- if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&old->watch);
+ chunk_entry = &chunk->mark;
+
+ spin_lock(&old_entry->lock);
+ if (!old_entry->i.inode) {
+ /* old_entry is being shot, lets just lie */
+ spin_unlock(&old_entry->lock);
+ fsnotify_put_mark(old_entry);
free_chunk(chunk);
+ return -ENOENT;
+ }
+
+ fsnotify_duplicate_mark(chunk_entry, old_entry);
+ if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
+ spin_unlock(&old_entry->lock);
+ fsnotify_put_mark(chunk_entry);
+ fsnotify_put_mark(old_entry);
return -ENOSPC;
}
+
+ /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
+ spin_lock(&chunk_entry->lock);
spin_lock(&hash_lock);
+
+ /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
if (tree->goner) {
spin_unlock(&hash_lock);
chunk->dead = 1;
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&old->watch);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&chunk_entry->lock);
+ spin_unlock(&old_entry->lock);
+
+ fsnotify_destroy_mark(chunk_entry, audit_tree_group);
+
+ fsnotify_put_mark(chunk_entry);
+ fsnotify_put_mark(old_entry);
return 0;
}
list_replace_init(&old->trees, &chunk->trees);
@@ -425,18 +441,34 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
list_add(&tree->same_root, &chunk->trees);
}
spin_unlock(&hash_lock);
- inotify_evict_watch(&old->watch);
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
- put_inotify_watch(&old->watch); /* and kill it */
+ spin_unlock(&chunk_entry->lock);
+ spin_unlock(&old_entry->lock);
+ fsnotify_destroy_mark(old_entry, audit_tree_group);
+ fsnotify_put_mark(chunk_entry); /* drop initial reference */
+ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
return 0;
}
+static void audit_log_remove_rule(struct audit_krule *rule)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "op=");
+ audit_log_string(ab, "remove rule");
+ audit_log_format(ab, " dir=");
+ audit_log_untrustedstring(ab, rule->tree->pathname);
+ audit_log_key(ab, rule->filterkey);
+ audit_log_format(ab, " list=%d res=1", rule->listnr);
+ audit_log_end(ab);
+}
+
static void kill_rules(struct audit_tree *tree)
{
struct audit_krule *rule, *next;
struct audit_entry *entry;
- struct audit_buffer *ab;
list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
entry = container_of(rule, struct audit_entry, rule);
@@ -444,14 +476,7 @@ static void kill_rules(struct audit_tree *tree)
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "op=");
- audit_log_string(ab, "remove rule");
- audit_log_format(ab, " dir=");
- audit_log_untrustedstring(ab, rule->tree->pathname);
- audit_log_key(ab, rule->filterkey);
- audit_log_format(ab, " list=%d res=1", rule->listnr);
- audit_log_end(ab);
+ audit_log_remove_rule(rule);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
@@ -548,6 +573,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
return 0;
}
+static int compare_root(struct vfsmount *mnt, void *arg)
+{
+ return mnt->mnt_root->d_inode == arg;
+}
+
void audit_trim_trees(void)
{
struct list_head cursor;
@@ -559,7 +589,6 @@ void audit_trim_trees(void)
struct path path;
struct vfsmount *root_mnt;
struct node *node;
- struct list_head list;
int err;
tree = container_of(cursor.next, struct audit_tree, list);
@@ -574,51 +603,29 @@ void audit_trim_trees(void)
root_mnt = collect_mounts(&path);
path_put(&path);
- if (!root_mnt)
+ if (IS_ERR(root_mnt))
goto skip_it;
- list_add_tail(&list, &root_mnt->mnt_list);
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list) {
struct audit_chunk *chunk = find_chunk(node);
- struct inode *inode = chunk->watch.inode;
- struct vfsmount *mnt;
+ /* this could be NULL if the watch is dying else where... */
+ struct inode *inode = chunk->mark.i.inode;
node->index |= 1U<<31;
- list_for_each_entry(mnt, &list, mnt_list) {
- if (mnt->mnt_root->d_inode == inode) {
- node->index &= ~(1U<<31);
- break;
- }
- }
+ if (iterate_mounts(compare_root, inode, root_mnt))
+ node->index &= ~(1U<<31);
}
spin_unlock(&hash_lock);
trim_marked(tree);
- put_tree(tree);
- list_del_init(&list);
drop_collected_mounts(root_mnt);
skip_it:
+ put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&cursor);
mutex_unlock(&audit_filter_mutex);
}
-static int is_under(struct vfsmount *mnt, struct dentry *dentry,
- struct path *path)
-{
- if (mnt != path->mnt) {
- for (;;) {
- if (mnt->mnt_parent == mnt)
- return 0;
- if (mnt->mnt_parent == path->mnt)
- break;
- mnt = mnt->mnt_parent;
- }
- dentry = mnt->mnt_mountpoint;
- }
- return is_subdir(dentry, path->dentry);
-}
-
int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
{
@@ -638,15 +645,20 @@ void audit_put_tree(struct audit_tree *tree)
put_tree(tree);
}
+static int tag_mount(struct vfsmount *mnt, void *arg)
+{
+ return tag_chunk(mnt->mnt_root->d_inode, arg);
+}
+
/* called with audit_filter_mutex */
int audit_add_tree_rule(struct audit_krule *rule)
{
struct audit_tree *seed = rule->tree, *tree;
struct path path;
- struct vfsmount *mnt, *p;
- struct list_head list;
+ struct vfsmount *mnt;
int err;
+ rule->tree = NULL;
list_for_each_entry(tree, &tree_list, list) {
if (!strcmp(seed->pathname, tree->pathname)) {
put_tree(seed);
@@ -666,20 +678,13 @@ int audit_add_tree_rule(struct audit_krule *rule)
goto Err;
mnt = collect_mounts(&path);
path_put(&path);
- if (!mnt) {
- err = -ENOMEM;
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
goto Err;
}
- list_add_tail(&list, &mnt->mnt_list);
get_tree(tree);
- list_for_each_entry(p, &list, mnt_list) {
- err = tag_chunk(p->mnt_root->d_inode, tree);
- if (err)
- break;
- }
-
- list_del(&list);
+ err = iterate_mounts(tag_mount, tree, mnt);
drop_collected_mounts(mnt);
if (!err) {
@@ -714,31 +719,23 @@ int audit_tag_tree(char *old, char *new)
{
struct list_head cursor, barrier;
int failed = 0;
- struct path path;
+ struct path path1, path2;
struct vfsmount *tagged;
- struct list_head list;
- struct vfsmount *mnt;
- struct dentry *dentry;
int err;
- err = kern_path(new, 0, &path);
+ err = kern_path(new, 0, &path2);
if (err)
return err;
- tagged = collect_mounts(&path);
- path_put(&path);
- if (!tagged)
- return -ENOMEM;
+ tagged = collect_mounts(&path2);
+ path_put(&path2);
+ if (IS_ERR(tagged))
+ return PTR_ERR(tagged);
- err = kern_path(old, 0, &path);
+ err = kern_path(old, 0, &path1);
if (err) {
drop_collected_mounts(tagged);
return err;
}
- mnt = mntget(path.mnt);
- dentry = dget(path.dentry);
- path_put(&path);
-
- list_add_tail(&list, &tagged->mnt_list);
mutex_lock(&audit_filter_mutex);
list_add(&barrier, &tree_list);
@@ -746,7 +743,7 @@ int audit_tag_tree(char *old, char *new)
while (cursor.next != &tree_list) {
struct audit_tree *tree;
- struct vfsmount *p;
+ int good_one = 0;
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
@@ -754,30 +751,19 @@ int audit_tag_tree(char *old, char *new)
list_add(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
- err = kern_path(tree->pathname, 0, &path);
- if (err) {
- put_tree(tree);
- mutex_lock(&audit_filter_mutex);
- continue;
+ err = kern_path(tree->pathname, 0, &path2);
+ if (!err) {
+ good_one = path_is_under(&path1, &path2);
+ path_put(&path2);
}
- spin_lock(&vfsmount_lock);
- if (!is_under(mnt, dentry, &path)) {
- spin_unlock(&vfsmount_lock);
- path_put(&path);
+ if (!good_one) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
continue;
}
- spin_unlock(&vfsmount_lock);
- path_put(&path);
-
- list_for_each_entry(p, &list, mnt_list) {
- failed = tag_chunk(p->mnt_root->d_inode, tree);
- if (failed)
- break;
- }
+ failed = iterate_mounts(tag_mount, tree, tagged);
if (failed) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
@@ -818,10 +804,8 @@ int audit_tag_tree(char *old, char *new)
}
list_del(&barrier);
list_del(&cursor);
- list_del(&list);
mutex_unlock(&audit_filter_mutex);
- dput(dentry);
- mntput(mnt);
+ path_put(&path1);
drop_collected_mounts(tagged);
return failed;
}
@@ -889,7 +873,6 @@ void audit_kill_trees(struct list_head *list)
* Here comes the stuff asynchronous to auditctl operations
*/
-/* inode->inotify_mutex is locked */
static void evict_chunk(struct audit_chunk *chunk)
{
struct audit_tree *owner;
@@ -928,35 +911,41 @@ static void evict_chunk(struct audit_chunk *chunk)
mutex_unlock(&audit_filter_mutex);
}
-static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
- u32 cookie, const char *dname, struct inode *inode)
+static int audit_tree_handle_event(struct fsnotify_group *group,
+ struct inode *to_tell,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name, u32 cookie)
{
- struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
-
- if (mask & IN_IGNORED) {
- evict_chunk(chunk);
- put_inotify_watch(watch);
- }
+ return 0;
}
-static void destroy_watch(struct inotify_watch *watch)
+static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
{
- struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
- call_rcu(&chunk->head, __put_chunk);
+ struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+
+ evict_chunk(chunk);
+
+ /*
+ * We are guaranteed to have at least one reference to the mark from
+ * either the inode or the caller of fsnotify_destroy_mark().
+ */
+ BUG_ON(atomic_read(&entry->refcnt) < 1);
}
-static const struct inotify_operations rtree_inotify_ops = {
- .handle_event = handle_event,
- .destroy_watch = destroy_watch,
+static const struct fsnotify_ops audit_tree_ops = {
+ .handle_event = audit_tree_handle_event,
+ .freeing_mark = audit_tree_freeing_mark,
};
static int __init audit_tree_init(void)
{
int i;
- rtree_ih = inotify_init(&rtree_inotify_ops);
- if (IS_ERR(rtree_ih))
- audit_panic("cannot initialize inotify handle for rectree watches");
+ audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
+ if (IS_ERR(audit_tree_group))
+ audit_panic("cannot initialize fsnotify group for rectree watches");
for (i = 0; i < HASH_SIZE; i++)
INIT_LIST_HEAD(&chunk_hash_heads[i]);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index cc7e87936cb..70b4554d2fb 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -24,17 +24,18 @@
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/sched.h>
-#include <linux/inotify.h>
+#include <linux/slab.h>
#include <linux/security.h>
#include "audit.h"
/*
* Reference counting:
*
- * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
* event. Each audit_watch holds a reference to its associated parent.
*
* audit_watch: if added to lists, lifetime is from audit_init_watch() to
@@ -50,40 +51,61 @@ struct audit_watch {
unsigned long ino; /* associated inode number */
struct audit_parent *parent; /* associated parent */
struct list_head wlist; /* entry in parent->watches list */
- struct list_head rules; /* associated rules */
+ struct list_head rules; /* anchor for krule->rlist */
};
struct audit_parent {
- struct list_head ilist; /* entry in inotify registration list */
- struct list_head watches; /* associated watches */
- struct inotify_watch wdata; /* inotify watch data */
- unsigned flags; /* status flags */
+ struct list_head watches; /* anchor for audit_watch->wlist */
+ struct fsnotify_mark mark; /* fsnotify mark on the inode */
};
-/* Inotify handle. */
-struct inotify_handle *audit_ih;
+/* fsnotify handle. */
+static struct fsnotify_group *audit_watch_group;
-/*
- * audit_parent status flags:
- *
- * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
- * a filesystem event to ensure we're adding audit watches to a valid parent.
- * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
- * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
- * we can receive while holding nameidata.
- */
-#define AUDIT_PARENT_INVALID 0x001
+/* fsnotify events we care about. */
+#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
+ FS_MOVE_SELF | FS_EVENT_ON_CHILD)
-/* Inotify events we care about. */
-#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+static void audit_free_parent(struct audit_parent *parent)
+{
+ WARN_ON(!list_empty(&parent->watches));
+ kfree(parent);
+}
-static void audit_free_parent(struct inotify_watch *i_watch)
+static void audit_watch_free_mark(struct fsnotify_mark *entry)
{
struct audit_parent *parent;
- parent = container_of(i_watch, struct audit_parent, wdata);
- WARN_ON(!list_empty(&parent->watches));
- kfree(parent);
+ parent = container_of(entry, struct audit_parent, mark);
+ audit_free_parent(parent);
+}
+
+static void audit_get_parent(struct audit_parent *parent)
+{
+ if (likely(parent))
+ fsnotify_get_mark(&parent->mark);
+}
+
+static void audit_put_parent(struct audit_parent *parent)
+{
+ if (likely(parent))
+ fsnotify_put_mark(&parent->mark);
+}
+
+/*
+ * Find and return the audit_parent on the given inode. If found a reference
+ * is taken on this parent.
+ */
+static inline struct audit_parent *audit_find_parent(struct inode *inode)
+{
+ struct audit_parent *parent = NULL;
+ struct fsnotify_mark *entry;
+
+ entry = fsnotify_find_inode_mark(audit_watch_group, inode);
+ if (entry)
+ parent = container_of(entry, struct audit_parent, mark);
+
+ return parent;
}
void audit_get_watch(struct audit_watch *watch)
@@ -101,10 +123,10 @@ void audit_put_watch(struct audit_watch *watch)
}
}
-void audit_remove_watch(struct audit_watch *watch)
+static void audit_remove_watch(struct audit_watch *watch)
{
list_del(&watch->wlist);
- put_inotify_watch(&watch->parent->wdata);
+ audit_put_parent(watch->parent);
watch->parent = NULL;
audit_put_watch(watch); /* match initial get */
}
@@ -114,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch)
return watch->path;
}
-struct list_head *audit_watch_rules(struct audit_watch *watch)
-{
- return &watch->rules;
-}
-
-unsigned long audit_watch_inode(struct audit_watch *watch)
+int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
{
- return watch->ino;
-}
-
-dev_t audit_watch_dev(struct audit_watch *watch)
-{
- return watch->dev;
+ return (watch->ino != (unsigned long)-1) &&
+ (watch->ino == ino) &&
+ (watch->dev == dev);
}
/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+static struct audit_parent *audit_init_parent(struct path *path)
{
+ struct inode *inode = path->dentry->d_inode;
struct audit_parent *parent;
- s32 wd;
+ int ret;
parent = kzalloc(sizeof(*parent), GFP_KERNEL);
if (unlikely(!parent))
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&parent->watches);
- parent->flags = 0;
-
- inotify_init_watch(&parent->wdata);
- /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
- get_inotify_watch(&parent->wdata);
- wd = inotify_add_watch(audit_ih, &parent->wdata,
- ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
- if (wd < 0) {
- audit_free_parent(&parent->wdata);
- return ERR_PTR(wd);
+
+ fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
+ parent->mark.mask = AUDIT_FS_WATCH;
+ ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
+ if (ret < 0) {
+ audit_free_parent(parent);
+ return ERR_PTR(ret);
}
return parent;
@@ -178,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
{
struct audit_watch *watch;
- if (!audit_ih)
+ if (!audit_watch_group)
return -EOPNOTSUPP;
if (path[0] != '/' || path[len-1] == '/' ||
@@ -216,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
new->dev = old->dev;
new->ino = old->ino;
- get_inotify_watch(&old->parent->wdata);
+ audit_get_parent(old->parent);
new->parent = old->parent;
out:
@@ -228,8 +240,10 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
if (audit_enabled) {
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
audit_log_format(ab, "auid=%u ses=%u op=",
- audit_get_loginuid(current),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
audit_get_sessionid(current));
audit_log_string(ab, op);
audit_log_format(ab, " path=");
@@ -250,15 +264,20 @@ static void audit_update_watch(struct audit_parent *parent,
struct audit_entry *oentry, *nentry;
mutex_lock(&audit_filter_mutex);
+ /* Run all of the watches on this parent looking for the one that
+ * matches the given dname */
list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
- if (audit_compare_dname_path(dname, owatch->path, NULL))
+ if (audit_compare_dname_path(dname, owatch->path,
+ AUDIT_NAME_FULL))
continue;
/* If the update involves invalidating rules, do the inode-based
* filtering now, so we don't omit records. */
- if (invalidating && current->audit_context)
+ if (invalidating && !audit_dummy_context())
audit_filter_inodes(current, current->audit_context);
+ /* updating ino will likely change which audit_hash_list we
+ * are on so we need a new watch for the new list */
nwatch = audit_dupe_watch(owatch);
if (IS_ERR(nwatch)) {
mutex_unlock(&audit_filter_mutex);
@@ -274,12 +293,21 @@ static void audit_update_watch(struct audit_parent *parent,
list_del(&oentry->rule.rlist);
list_del_rcu(&oentry->list);
- nentry = audit_dupe_rule(&oentry->rule, nwatch);
+ nentry = audit_dupe_rule(&oentry->rule);
if (IS_ERR(nentry)) {
list_del(&oentry->rule.list);
audit_panic("error updating watch, removing");
} else {
int h = audit_hash_ino((u32)ino);
+
+ /*
+ * nentry->rule.watch == oentry->rule.watch so
+ * we must drop that reference and set it to our
+ * new watch.
+ */
+ audit_put_watch(nentry->rule.watch);
+ audit_get_watch(nwatch);
+ nentry->rule.watch = nwatch;
list_add(&nentry->rule.rlist, &nwatch->rules);
list_add_rcu(&nentry->list, &audit_inode_hash[h]);
list_replace(&oentry->rule.list,
@@ -311,7 +339,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
struct audit_entry *e;
mutex_lock(&audit_filter_mutex);
- parent->flags |= AUDIT_PARENT_INVALID;
list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
e = container_of(r, struct audit_entry, rule);
@@ -324,71 +351,27 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
audit_remove_watch(w);
}
mutex_unlock(&audit_filter_mutex);
-}
-/* Unregister inotify watches for parents on in_list.
- * Generates an IN_IGNORED event. */
-void audit_inotify_unregister(struct list_head *in_list)
-{
- struct audit_parent *p, *n;
-
- list_for_each_entry_safe(p, n, in_list, ilist) {
- list_del(&p->ilist);
- inotify_rm_watch(audit_ih, &p->wdata);
- /* the unpin matching the pin in audit_do_del_rule() */
- unpin_inotify_watch(&p->wdata);
- }
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
}
/* Get path information necessary for adding watches. */
-static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
+static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
- struct nameidata *ndparent, *ndwatch;
- int err;
-
- ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
- if (unlikely(!ndparent))
- return -ENOMEM;
-
- ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
- if (unlikely(!ndwatch)) {
- kfree(ndparent);
- return -ENOMEM;
+ struct dentry *d = kern_path_locked(watch->path, parent);
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+ mutex_unlock(&parent->dentry->d_inode->i_mutex);
+ if (d->d_inode) {
+ /* update watch filter fields */
+ watch->dev = d->d_inode->i_sb->s_dev;
+ watch->ino = d->d_inode->i_ino;
}
-
- err = path_lookup(path, LOOKUP_PARENT, ndparent);
- if (err) {
- kfree(ndparent);
- kfree(ndwatch);
- return err;
- }
-
- err = path_lookup(path, 0, ndwatch);
- if (err) {
- kfree(ndwatch);
- ndwatch = NULL;
- }
-
- *ndp = ndparent;
- *ndw = ndwatch;
-
+ dput(d);
return 0;
}
-/* Release resources used for watch path information. */
-static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
-{
- if (ndp) {
- path_put(&ndp->path);
- kfree(ndp);
- }
- if (ndw) {
- path_put(&ndw->path);
- kfree(ndw);
- }
-}
-
-/* Associate the given rule with an existing parent inotify_watch.
+/* Associate the given rule with an existing parent.
* Caller must hold audit_filter_mutex. */
static void audit_add_to_parent(struct audit_krule *krule,
struct audit_parent *parent)
@@ -396,6 +379,8 @@ static void audit_add_to_parent(struct audit_krule *krule,
struct audit_watch *w, *watch = krule->watch;
int watch_found = 0;
+ BUG_ON(!mutex_is_locked(&audit_filter_mutex));
+
list_for_each_entry(w, &parent->watches, wlist) {
if (strcmp(watch->path, w->path))
continue;
@@ -412,7 +397,7 @@ static void audit_add_to_parent(struct audit_krule *krule,
}
if (!watch_found) {
- get_inotify_watch(&parent->wdata);
+ audit_get_parent(parent);
watch->parent = parent;
list_add(&watch->wlist, &parent->watches);
@@ -422,65 +407,47 @@ static void audit_add_to_parent(struct audit_krule *krule,
/* Find a matching watch entry, or add this one.
* Caller must hold audit_filter_mutex. */
-int audit_add_watch(struct audit_krule *krule)
+int audit_add_watch(struct audit_krule *krule, struct list_head **list)
{
struct audit_watch *watch = krule->watch;
- struct inotify_watch *i_watch;
struct audit_parent *parent;
- struct nameidata *ndp = NULL, *ndw = NULL;
- int ret = 0;
+ struct path parent_path;
+ int h, ret = 0;
mutex_unlock(&audit_filter_mutex);
/* Avoid calling path_lookup under audit_filter_mutex. */
- ret = audit_get_nd(watch->path, &ndp, &ndw);
- if (ret) {
- /* caller expects mutex locked */
- mutex_lock(&audit_filter_mutex);
- goto error;
- }
+ ret = audit_get_nd(watch, &parent_path);
- /* update watch filter fields */
- if (ndw) {
- watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
- watch->ino = ndw->path.dentry->d_inode->i_ino;
- }
+ /* caller expects mutex locked */
+ mutex_lock(&audit_filter_mutex);
- /* The audit_filter_mutex must not be held during inotify calls because
- * we hold it during inotify event callback processing. If an existing
- * inotify watch is found, inotify_find_watch() grabs a reference before
- * returning.
- */
- if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
- &i_watch) < 0) {
- parent = audit_init_parent(ndp);
+ if (ret)
+ return ret;
+
+ /* either find an old parent or attach a new one */
+ parent = audit_find_parent(parent_path.dentry->d_inode);
+ if (!parent) {
+ parent = audit_init_parent(&parent_path);
if (IS_ERR(parent)) {
- /* caller expects mutex locked */
- mutex_lock(&audit_filter_mutex);
ret = PTR_ERR(parent);
goto error;
}
- } else
- parent = container_of(i_watch, struct audit_parent, wdata);
-
- mutex_lock(&audit_filter_mutex);
+ }
- /* parent was moved before we took audit_filter_mutex */
- if (parent->flags & AUDIT_PARENT_INVALID)
- ret = -ENOENT;
- else
- audit_add_to_parent(krule, parent);
+ audit_add_to_parent(krule, parent);
- /* match get in audit_init_parent or inotify_find_watch */
- put_inotify_watch(&parent->wdata);
+ /* match get in audit_find_parent or audit_init_parent */
+ audit_put_parent(parent);
+ h = audit_hash_ino((u32)watch->ino);
+ *list = &audit_inode_hash[h];
error:
- audit_put_nd(ndp, ndw); /* NULL args OK */
+ path_put(&parent_path);
return ret;
-
}
-void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
+void audit_remove_watch_rule(struct audit_krule *krule)
{
struct audit_watch *watch = krule->watch;
struct audit_parent *parent = watch->parent;
@@ -491,53 +458,62 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
audit_remove_watch(watch);
if (list_empty(&parent->watches)) {
- /* Put parent on the inotify un-registration
- * list. Grab a reference before releasing
- * audit_filter_mutex, to be released in
- * audit_inotify_unregister().
- * If filesystem is going away, just leave
- * the sucker alone, eviction will take
- * care of it. */
- if (pin_inotify_watch(&parent->wdata))
- list_add(&parent->ilist, list);
+ audit_get_parent(parent);
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
+ audit_put_parent(parent);
}
}
}
-/* Update watch data in audit rules based on inotify events. */
-static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
- u32 cookie, const char *dname, struct inode *inode)
+/* Update watch data in audit rules based on fsnotify events. */
+static int audit_watch_handle_event(struct fsnotify_group *group,
+ struct inode *to_tell,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *dname, u32 cookie)
{
+ struct inode *inode;
struct audit_parent *parent;
- parent = container_of(i_watch, struct audit_parent, wdata);
+ parent = container_of(inode_mark, struct audit_parent, mark);
+
+ BUG_ON(group != audit_watch_group);
- if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
- audit_update_watch(parent, dname, inode->i_sb->s_dev,
- inode->i_ino, 0);
- else if (mask & (IN_DELETE|IN_MOVED_FROM))
+ switch (data_type) {
+ case (FSNOTIFY_EVENT_PATH):
+ inode = ((struct path *)data)->dentry->d_inode;
+ break;
+ case (FSNOTIFY_EVENT_INODE):
+ inode = (struct inode *)data;
+ break;
+ default:
+ BUG();
+ inode = NULL;
+ break;
+ };
+
+ if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
+ audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
+ else if (mask & (FS_DELETE|FS_MOVED_FROM))
audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
- /* inotify automatically removes the watch and sends IN_IGNORED */
- else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
+ else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
audit_remove_parent_watches(parent);
- /* inotify does not remove the watch, so remove it manually */
- else if(mask & IN_MOVE_SELF) {
- audit_remove_parent_watches(parent);
- inotify_remove_watch_locked(audit_ih, i_watch);
- } else if (mask & IN_IGNORED)
- put_inotify_watch(i_watch);
+
+ return 0;
}
-static const struct inotify_operations audit_inotify_ops = {
- .handle_event = audit_handle_ievent,
- .destroy_watch = audit_free_parent,
+static const struct fsnotify_ops audit_watch_fsnotify_ops = {
+ .handle_event = audit_watch_handle_event,
};
static int __init audit_watch_init(void)
{
- audit_ih = inotify_init(&audit_inotify_ops);
- if (IS_ERR(audit_ih))
- audit_panic("cannot initialize inotify handle");
+ audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
+ if (IS_ERR(audit_watch_group)) {
+ audit_watch_group = NULL;
+ audit_panic("cannot create audit fsnotify group");
+ }
return 0;
}
-subsys_initcall(audit_watch_init);
+device_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a70604047f3..8e9bc9c3dbb 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -19,6 +19,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
@@ -27,7 +29,10 @@
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/sched.h>
+#include <linux/slab.h>
#include <linux/security.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
#include "audit.h"
/*
@@ -70,6 +75,7 @@ static inline void audit_free_rule(struct audit_entry *e)
{
int i;
struct audit_krule *erule = &e->rule;
+
/* some rules don't have associated watches */
if (erule->watch)
audit_put_watch(erule->watch);
@@ -222,7 +228,7 @@ static int audit_match_signal(struct audit_entry *entry)
#endif
/* Common user-space to kernel rule translation. */
-static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
+static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule)
{
unsigned listnr;
struct audit_entry *entry;
@@ -233,17 +239,19 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
switch(listnr) {
default:
goto exit_err;
- case AUDIT_FILTER_USER:
- case AUDIT_FILTER_TYPE:
#ifdef CONFIG_AUDITSYSCALL
case AUDIT_FILTER_ENTRY:
+ if (rule->action == AUDIT_ALWAYS)
+ goto exit_err;
case AUDIT_FILTER_EXIT:
case AUDIT_FILTER_TASK:
#endif
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_TYPE:
;
}
if (unlikely(rule->action == AUDIT_POSSIBLE)) {
- printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
+ pr_err("AUDIT_POSSIBLE is deprecated\n");
goto exit_err;
}
if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
@@ -306,103 +314,84 @@ static u32 audit_to_op(u32 op)
return n;
}
-
-/* Translate struct audit_rule to kernel's rule respresentation.
- * Exists for backward compatibility with userspace. */
-static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
+/* check if an audit field is valid */
+static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
{
- struct audit_entry *entry;
- int err = 0;
- int i;
-
- entry = audit_to_entry_common(rule);
- if (IS_ERR(entry))
- goto exit_nofree;
-
- for (i = 0; i < rule->field_count; i++) {
- struct audit_field *f = &entry->rule.fields[i];
- u32 n;
-
- n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
-
- /* Support for legacy operators where
- * AUDIT_NEGATE bit signifies != and otherwise assumes == */
- if (n & AUDIT_NEGATE)
- f->op = Audit_not_equal;
- else if (!n)
- f->op = Audit_equal;
- else
- f->op = audit_to_op(n);
-
- entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1;
-
- f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
- f->val = rule->values[i];
-
- err = -EINVAL;
- if (f->op == Audit_bad)
- goto exit_free;
-
- switch(f->type) {
- default:
- goto exit_free;
- case AUDIT_PID:
- case AUDIT_UID:
- case AUDIT_EUID:
- case AUDIT_SUID:
- case AUDIT_FSUID:
- case AUDIT_GID:
- case AUDIT_EGID:
- case AUDIT_SGID:
- case AUDIT_FSGID:
- case AUDIT_LOGINUID:
- case AUDIT_PERS:
- case AUDIT_MSGTYPE:
- case AUDIT_PPID:
- case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
- case AUDIT_EXIT:
- case AUDIT_SUCCESS:
- /* bit ops are only useful on syscall args */
- if (f->op == Audit_bitmask || f->op == Audit_bittest)
- goto exit_free;
- break;
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
- break;
- /* arch is only allowed to be = or != */
- case AUDIT_ARCH:
- if (f->op != Audit_not_equal && f->op != Audit_equal)
- goto exit_free;
- entry->rule.arch_f = f;
- break;
- case AUDIT_PERM:
- if (f->val & ~15)
- goto exit_free;
- break;
- case AUDIT_FILETYPE:
- if ((f->val & ~S_IFMT) > S_IFMT)
- goto exit_free;
- break;
- case AUDIT_INODE:
- err = audit_to_inode(&entry->rule, f);
- if (err)
- goto exit_free;
- break;
- }
- }
-
- if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
- entry->rule.inode_f = NULL;
-
-exit_nofree:
- return entry;
+ switch(f->type) {
+ case AUDIT_MSGTYPE:
+ if (entry->rule.listnr != AUDIT_FILTER_TYPE &&
+ entry->rule.listnr != AUDIT_FILTER_USER)
+ return -EINVAL;
+ break;
+ };
-exit_free:
- audit_free_rule(entry);
- return ERR_PTR(err);
+ switch(f->type) {
+ default:
+ return -EINVAL;
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ case AUDIT_OBJ_UID:
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_OBJ_GID:
+ case AUDIT_PID:
+ case AUDIT_PERS:
+ case AUDIT_MSGTYPE:
+ case AUDIT_PPID:
+ case AUDIT_DEVMAJOR:
+ case AUDIT_DEVMINOR:
+ case AUDIT_EXIT:
+ case AUDIT_SUCCESS:
+ case AUDIT_INODE:
+ /* bit ops are only useful on syscall args */
+ if (f->op == Audit_bitmask || f->op == Audit_bittest)
+ return -EINVAL;
+ break;
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ case AUDIT_WATCH:
+ case AUDIT_DIR:
+ case AUDIT_FILTERKEY:
+ break;
+ case AUDIT_LOGINUID_SET:
+ if ((f->val != 0) && (f->val != 1))
+ return -EINVAL;
+ /* FALL THROUGH */
+ case AUDIT_ARCH:
+ if (f->op != Audit_not_equal && f->op != Audit_equal)
+ return -EINVAL;
+ break;
+ case AUDIT_PERM:
+ if (f->val & ~15)
+ return -EINVAL;
+ break;
+ case AUDIT_FILETYPE:
+ if (f->val & ~S_IFMT)
+ return -EINVAL;
+ break;
+ case AUDIT_FIELD_COMPARE:
+ if (f->val > AUDIT_MAX_FIELD_COMPARE)
+ return -EINVAL;
+ break;
+ };
+ return 0;
}
/* Translate struct audit_rule_data to kernel's rule respresentation. */
@@ -416,7 +405,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
int i;
char *str;
- entry = audit_to_entry_common((struct audit_rule *)data);
+ entry = audit_to_entry_common(data);
if (IS_ERR(entry))
goto exit_nofree;
@@ -433,30 +422,54 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
f->type = data->fields[i];
f->val = data->values[i];
+ f->uid = INVALID_UID;
+ f->gid = INVALID_GID;
f->lsm_str = NULL;
f->lsm_rule = NULL;
- switch(f->type) {
- case AUDIT_PID:
+
+ /* Support legacy tests for a valid loginuid */
+ if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
+ f->type = AUDIT_LOGINUID_SET;
+ f->val = 0;
+ }
+
+ if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
+ struct pid *pid;
+ rcu_read_lock();
+ pid = find_vpid(f->val);
+ if (!pid) {
+ rcu_read_unlock();
+ err = -ESRCH;
+ goto exit_free;
+ }
+ f->val = pid_nr(pid);
+ rcu_read_unlock();
+ }
+
+ err = audit_field_valid(entry, f);
+ if (err)
+ goto exit_free;
+
+ err = -EINVAL;
+ switch (f->type) {
+ case AUDIT_LOGINUID:
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
case AUDIT_FSUID:
+ case AUDIT_OBJ_UID:
+ f->uid = make_kuid(current_user_ns(), f->val);
+ if (!uid_valid(f->uid))
+ goto exit_free;
+ break;
case AUDIT_GID:
case AUDIT_EGID:
case AUDIT_SGID:
case AUDIT_FSGID:
- case AUDIT_LOGINUID:
- case AUDIT_PERS:
- case AUDIT_MSGTYPE:
- case AUDIT_PPID:
- case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
- case AUDIT_EXIT:
- case AUDIT_SUCCESS:
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
+ case AUDIT_OBJ_GID:
+ f->gid = make_kgid(current_user_ns(), f->val);
+ if (!gid_valid(f->gid))
+ goto exit_free;
break;
case AUDIT_ARCH:
entry->rule.arch_f = f;
@@ -481,8 +494,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (err == -EINVAL) {
- printk(KERN_WARNING "audit rule for LSM "
- "\'%s\' is invalid\n", str);
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ str);
err = 0;
}
if (err) {
@@ -520,7 +533,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
goto exit_free;
break;
case AUDIT_FILTERKEY:
- err = -EINVAL;
if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
goto exit_free;
str = audit_unpack_string(&bufp, &remain, f->val);
@@ -529,16 +541,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
entry->rule.buflen += f->val;
entry->rule.filterkey = str;
break;
- case AUDIT_PERM:
- if (f->val & ~15)
- goto exit_free;
- break;
- case AUDIT_FILETYPE:
- if ((f->val & ~S_IFMT) > S_IFMT)
- goto exit_free;
- break;
- default:
- goto exit_free;
}
}
@@ -549,6 +551,10 @@ exit_nofree:
return entry;
exit_free:
+ if (entry->rule.watch)
+ audit_put_watch(entry->rule.watch); /* matches initial get */
+ if (entry->rule.tree)
+ audit_put_tree(entry->rule.tree); /* that's the temporary one */
audit_free_rule(entry);
return ERR_PTR(err);
}
@@ -564,36 +570,6 @@ static inline size_t audit_pack_string(void **bufp, const char *str)
return len;
}
-/* Translate kernel rule respresentation to struct audit_rule.
- * Exists for backward compatibility with userspace. */
-static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
-{
- struct audit_rule *rule;
- int i;
-
- rule = kzalloc(sizeof(*rule), GFP_KERNEL);
- if (unlikely(!rule))
- return NULL;
-
- rule->flags = krule->flags | krule->listnr;
- rule->action = krule->action;
- rule->field_count = krule->field_count;
- for (i = 0; i < rule->field_count; i++) {
- rule->values[i] = krule->fields[i].val;
- rule->fields[i] = krule->fields[i].type;
-
- if (krule->vers_ops == 1) {
- if (krule->fields[i].op == Audit_not_equal)
- rule->fields[i] |= AUDIT_NEGATE;
- } else {
- rule->fields[i] |= audit_ops[krule->fields[i].op];
- }
- }
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
-
- return rule;
-}
-
/* Translate kernel rule respresentation to struct audit_rule_data. */
static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
{
@@ -698,6 +674,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
if (strcmp(a->filterkey, b->filterkey))
return 1;
break;
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ case AUDIT_OBJ_UID:
+ if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
+ return 1;
+ break;
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_OBJ_GID:
+ if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
+ return 1;
+ break;
default:
if (a->fields[i].val != b->fields[i].val)
return 1;
@@ -731,8 +724,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (ret == -EINVAL) {
- printk(KERN_WARNING "audit rule for LSM \'%s\' is "
- "invalid\n", df->lsm_str);
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ df->lsm_str);
ret = 0;
}
@@ -745,8 +738,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
* rule with the new rule in the filterlist, then free the old rule.
* The rlist element is undefined; list manipulations are handled apart from
* the initial copy. */
-struct audit_entry *audit_dupe_rule(struct audit_krule *old,
- struct audit_watch *watch)
+struct audit_entry *audit_dupe_rule(struct audit_krule *old)
{
u32 fcount = old->field_count;
struct audit_entry *entry;
@@ -768,8 +760,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
new->prio = old->prio;
new->buflen = old->buflen;
new->inode_f = old->inode_f;
- new->watch = NULL;
new->field_count = old->field_count;
+
/*
* note that we are OK with not refcounting here; audit_match_tree()
* never dereferences tree and we can't get false positives there
@@ -810,9 +802,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
}
}
- if (watch) {
- audit_get_watch(watch);
- new->watch = watch;
+ if (old->watch) {
+ audit_get_watch(old->watch);
+ new->watch = old->watch;
}
return entry;
@@ -865,7 +857,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
- int h, err;
+ int err;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
@@ -888,15 +880,17 @@ static inline int audit_add_rule(struct audit_entry *entry)
if (watch) {
/* audit_filter_mutex is dropped and re-taken during this call */
- err = audit_add_watch(&entry->rule);
+ err = audit_add_watch(&entry->rule, &list);
if (err) {
mutex_unlock(&audit_filter_mutex);
+ /*
+ * normally audit_add_tree_rule() will free it
+ * on failure
+ */
+ if (tree)
+ audit_put_tree(tree);
goto error;
}
- /* entry->rule.watch may have changed during audit_add_watch() */
- watch = entry->rule.watch;
- h = audit_hash_ino((u32)audit_watch_inode(watch));
- list = &audit_inode_hash[h];
}
if (tree) {
err = audit_add_tree_rule(&entry->rule);
@@ -948,7 +942,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
- LIST_HEAD(inotify_list);
int ret = 0;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
@@ -968,7 +961,7 @@ static inline int audit_del_rule(struct audit_entry *entry)
}
if (e->rule.watch)
- audit_remove_watch_rule(&e->rule, &inotify_list);
+ audit_remove_watch_rule(&e->rule);
if (e->rule.tree)
audit_remove_tree_rule(&e->rule);
@@ -986,9 +979,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
#endif
mutex_unlock(&audit_filter_mutex);
- if (!list_empty(&inotify_list))
- audit_inotify_unregister(&inotify_list);
-
out:
if (watch)
audit_put_watch(watch); /* match initial get */
@@ -998,37 +988,8 @@ out:
return ret;
}
-/* List rules using struct audit_rule. Exists for backward
- * compatibility with userspace. */
-static void audit_list(int pid, int seq, struct sk_buff_head *q)
-{
- struct sk_buff *skb;
- struct audit_krule *r;
- int i;
-
- /* This is a blocking read, so use audit_filter_mutex instead of rcu
- * iterator to sync with list writers. */
- for (i=0; i<AUDIT_NR_FILTERS; i++) {
- list_for_each_entry(r, &audit_rules_list[i], list) {
- struct audit_rule *rule;
-
- rule = audit_krule_to_rule(r);
- if (unlikely(!rule))
- break;
- skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
- rule, sizeof(*rule));
- if (skb)
- skb_queue_tail(q, skb);
- kfree(rule);
- }
- }
- skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
- if (skb)
- skb_queue_tail(q, skb);
-}
-
/* List rules using struct audit_rule_data. */
-static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
+static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
{
struct sk_buff *skb;
struct audit_krule *r;
@@ -1043,24 +1004,25 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
data = audit_krule_to_data(r);
if (unlikely(!data))
break;
- skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
- data, sizeof(*data) + data->buflen);
+ skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
+ 0, 1, data,
+ sizeof(*data) + data->buflen);
if (skb)
skb_queue_tail(q, skb);
kfree(data);
}
}
- skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+ skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
if (skb)
skb_queue_tail(q, skb);
}
/* Log rule additions and removals */
-static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
- char *action, struct audit_krule *rule,
- int res)
+static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
{
struct audit_buffer *ab;
+ uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
+ unsigned int sessionid = audit_get_sessionid(current);
if (!audit_enabled)
return;
@@ -1068,17 +1030,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
- audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(sid, &ctx, &len))
- audit_log_format(ab, " ssid=%u", sid);
- else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
+ audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid);
+ audit_log_task_context(ab);
audit_log_format(ab, " op=");
audit_log_string(ab, action);
audit_log_key(ab, rule->filterkey);
@@ -1087,83 +1040,37 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
}
/**
- * audit_receive_filter - apply all rules to the specified message type
+ * audit_rule_change - apply all rules to the specified message type
* @type: audit message type
- * @pid: target pid for netlink audit messages
- * @uid: target uid for netlink audit messages
+ * @portid: target port id for netlink audit messages
* @seq: netlink audit message sequence (serial) number
* @data: payload data
* @datasz: size of payload data
- * @loginuid: loginuid of sender
- * @sessionid: sessionid for netlink audit message
- * @sid: SE Linux Security ID of sender
*/
-int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
- size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
+int audit_rule_change(int type, __u32 portid, int seq, void *data,
+ size_t datasz)
{
- struct task_struct *tsk;
- struct audit_netlink_list *dest;
int err = 0;
struct audit_entry *entry;
switch (type) {
- case AUDIT_LIST:
- case AUDIT_LIST_RULES:
- /* We can't just spew out the rules here because we might fill
- * the available socket buffer space and deadlock waiting for
- * auditctl to read from it... which isn't ever going to
- * happen if we're actually running in the context of auditctl
- * trying to _send_ the stuff */
-
- dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
- if (!dest)
- return -ENOMEM;
- dest->pid = pid;
- skb_queue_head_init(&dest->q);
-
- mutex_lock(&audit_filter_mutex);
- if (type == AUDIT_LIST)
- audit_list(pid, seq, &dest->q);
- else
- audit_list_rules(pid, seq, &dest->q);
- mutex_unlock(&audit_filter_mutex);
-
- tsk = kthread_run(audit_send_list, dest, "audit_send_list");
- if (IS_ERR(tsk)) {
- skb_queue_purge(&dest->q);
- kfree(dest);
- err = PTR_ERR(tsk);
- }
- break;
- case AUDIT_ADD:
case AUDIT_ADD_RULE:
- if (type == AUDIT_ADD)
- entry = audit_rule_to_entry(data);
- else
- entry = audit_data_to_entry(data, datasz);
+ entry = audit_data_to_entry(data, datasz);
if (IS_ERR(entry))
return PTR_ERR(entry);
err = audit_add_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "add rule",
- &entry->rule, !err);
-
+ audit_log_rule_change("add rule", &entry->rule, !err);
if (err)
audit_free_rule(entry);
break;
- case AUDIT_DEL:
case AUDIT_DEL_RULE:
- if (type == AUDIT_DEL)
- entry = audit_rule_to_entry(data);
- else
- entry = audit_data_to_entry(data, datasz);
+ entry = audit_data_to_entry(data, datasz);
if (IS_ERR(entry))
return PTR_ERR(entry);
err = audit_del_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
- &entry->rule, !err);
-
+ audit_log_rule_change("remove rule", &entry->rule, !err);
audit_free_rule(entry);
break;
default:
@@ -1173,6 +1080,46 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
return err;
}
+/**
+ * audit_list_rules_send - list the audit rules
+ * @request_skb: skb of request we are replying to (used to target the reply)
+ * @seq: netlink audit message sequence (serial) number
+ */
+int audit_list_rules_send(struct sk_buff *request_skb, int seq)
+{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
+ struct task_struct *tsk;
+ struct audit_netlink_list *dest;
+ int err = 0;
+
+ /* We can't just spew out the rules here because we might fill
+ * the available socket buffer space and deadlock waiting for
+ * auditctl to read from it... which isn't ever going to
+ * happen if we're actually running in the context of auditctl
+ * trying to _send_ the stuff */
+
+ dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
+ if (!dest)
+ return -ENOMEM;
+ dest->net = get_net(net);
+ dest->portid = portid;
+ skb_queue_head_init(&dest->q);
+
+ mutex_lock(&audit_filter_mutex);
+ audit_list_rules(portid, seq, &dest->q);
+ mutex_unlock(&audit_filter_mutex);
+
+ tsk = kthread_run(audit_send_list, dest, "audit_send_list");
+ if (IS_ERR(tsk)) {
+ skb_queue_purge(&dest->q);
+ kfree(dest);
+ err = PTR_ERR(tsk);
+ }
+
+ return err;
+}
+
int audit_comparator(u32 left, u32 op, u32 right)
{
switch (op) {
@@ -1198,66 +1145,155 @@ int audit_comparator(u32 left, u32 op, u32 right)
}
}
-/* Compare given dentry name with last component in given path,
- * return of 0 indicates a match. */
-int audit_compare_dname_path(const char *dname, const char *path,
- int *dirlen)
+int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
{
- int dlen, plen;
- const char *p;
+ switch (op) {
+ case Audit_equal:
+ return uid_eq(left, right);
+ case Audit_not_equal:
+ return !uid_eq(left, right);
+ case Audit_lt:
+ return uid_lt(left, right);
+ case Audit_le:
+ return uid_lte(left, right);
+ case Audit_gt:
+ return uid_gt(left, right);
+ case Audit_ge:
+ return uid_gte(left, right);
+ case Audit_bitmask:
+ case Audit_bittest:
+ default:
+ BUG();
+ return 0;
+ }
+}
- if (!dname || !path)
- return 1;
+int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
+{
+ switch (op) {
+ case Audit_equal:
+ return gid_eq(left, right);
+ case Audit_not_equal:
+ return !gid_eq(left, right);
+ case Audit_lt:
+ return gid_lt(left, right);
+ case Audit_le:
+ return gid_lte(left, right);
+ case Audit_gt:
+ return gid_gt(left, right);
+ case Audit_ge:
+ return gid_gte(left, right);
+ case Audit_bitmask:
+ case Audit_bittest:
+ default:
+ BUG();
+ return 0;
+ }
+}
+
+/**
+ * parent_len - find the length of the parent portion of a pathname
+ * @path: pathname of which to determine length
+ */
+int parent_len(const char *path)
+{
+ int plen;
+ const char *p;
- dlen = strlen(dname);
plen = strlen(path);
- if (plen < dlen)
- return 1;
+
+ if (plen == 0)
+ return plen;
/* disregard trailing slashes */
p = path + plen - 1;
while ((*p == '/') && (p > path))
p--;
- /* find last path component */
- p = p - dlen + 1;
- if (p < path)
+ /* walk backward until we find the next slash or hit beginning */
+ while ((*p != '/') && (p > path))
+ p--;
+
+ /* did we find a slash? Then increment to include it in path */
+ if (*p == '/')
+ p++;
+
+ return p - path;
+}
+
+/**
+ * audit_compare_dname_path - compare given dentry name with last component in
+ * given path. Return of 0 indicates a match.
+ * @dname: dentry name that we're comparing
+ * @path: full pathname that we're comparing
+ * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
+ * here indicates that we must compute this value.
+ */
+int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
+{
+ int dlen, pathlen;
+ const char *p;
+
+ dlen = strlen(dname);
+ pathlen = strlen(path);
+ if (pathlen < dlen)
return 1;
- else if (p > path) {
- if (*--p != '/')
- return 1;
- else
- p++;
- }
- /* return length of path's directory component */
- if (dirlen)
- *dirlen = p - path;
+ parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
+ if (pathlen - parentlen != dlen)
+ return 1;
+
+ p = path + parentlen;
+
return strncmp(p, dname, dlen);
}
-static int audit_filter_user_rules(struct netlink_skb_parms *cb,
- struct audit_krule *rule,
+static int audit_filter_user_rules(struct audit_krule *rule, int type,
enum audit_state *state)
{
int i;
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
+ pid_t pid;
int result = 0;
+ u32 sid;
switch (f->type) {
case AUDIT_PID:
- result = audit_comparator(cb->creds.pid, f->op, f->val);
+ pid = task_pid_nr(current);
+ result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_UID:
- result = audit_comparator(cb->creds.uid, f->op, f->val);
+ result = audit_uid_comparator(current_uid(), f->op, f->uid);
break;
case AUDIT_GID:
- result = audit_comparator(cb->creds.gid, f->op, f->val);
+ result = audit_gid_comparator(current_gid(), f->op, f->gid);
break;
case AUDIT_LOGINUID:
- result = audit_comparator(cb->loginuid, f->op, f->val);
+ result = audit_uid_comparator(audit_get_loginuid(current),
+ f->op, f->uid);
+ break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(current),
+ f->op, f->val);
+ break;
+ case AUDIT_MSGTYPE:
+ result = audit_comparator(type, f->op, f->val);
+ break;
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ if (f->lsm_rule) {
+ security_task_getsecid(current, &sid);
+ result = security_audit_rule_match(sid,
+ f->type,
+ f->op,
+ f->lsm_rule,
+ NULL);
+ }
break;
}
@@ -1271,23 +1307,26 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
return 1;
}
-int audit_filter_user(struct netlink_skb_parms *cb)
+int audit_filter_user(int type)
{
enum audit_state state = AUDIT_DISABLED;
struct audit_entry *e;
- int ret = 1;
+ int rc, ret;
+
+ ret = 1; /* Audit by default */
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
- if (audit_filter_user_rules(cb, &e->rule, &state)) {
- if (state == AUDIT_DISABLED)
+ rc = audit_filter_user_rules(&e->rule, type, &state);
+ if (rc) {
+ if (rc > 0 && state == AUDIT_DISABLED)
ret = 0;
break;
}
}
rcu_read_unlock();
- return ret; /* Audit by default */
+ return ret;
}
int audit_filter_type(int type)
@@ -1322,30 +1361,23 @@ static int update_lsm_rule(struct audit_krule *r)
{
struct audit_entry *entry = container_of(r, struct audit_entry, rule);
struct audit_entry *nentry;
- struct audit_watch *watch;
- struct audit_tree *tree;
int err = 0;
if (!security_audit_rule_known(r))
return 0;
- watch = r->watch;
- tree = r->tree;
- nentry = audit_dupe_rule(r, watch);
+ nentry = audit_dupe_rule(r);
if (IS_ERR(nentry)) {
/* save the first error encountered for the
* return value */
err = PTR_ERR(nentry);
audit_panic("error updating LSM filters");
- if (watch)
+ if (r->watch)
list_del(&r->rlist);
list_del_rcu(&entry->list);
list_del(&r->list);
} else {
- if (watch) {
- list_add(&nentry->rule.rlist, audit_watch_rules(watch));
- list_del(&r->rlist);
- } else if (tree)
+ if (r->watch || r->tree)
list_replace_init(&r->rlist, &nentry->rule.rlist);
list_replace_rcu(&entry->list, &nentry->list);
list_replace(&r->list, &nentry->rule.list);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0f928167e..21eae3c05ec 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -42,13 +42,16 @@
* and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
#include <asm/types.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/socket.h>
#include <linux/mqueue.h>
@@ -64,57 +67,30 @@
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
-#include <linux/inotify.h>
#include <linux/capability.h>
#include <linux/fs_struct.h>
+#include <linux/compat.h>
+#include <linux/ctype.h>
#include "audit.h"
-/* AUDIT_NAMES is the number of slots we reserve in the audit_context
- * for saving names from getname(). */
-#define AUDIT_NAMES 20
-
-/* Indicates that audit should log the full pathname. */
-#define AUDIT_NAME_FULL -1
+/* flags stating the success for a syscall */
+#define AUDITSC_INVALID 0
+#define AUDITSC_SUCCESS 1
+#define AUDITSC_FAILURE 2
/* no execve audit message should be longer than this (userspace limits) */
#define MAX_EXECVE_AUDIT_LEN 7500
+/* max length to print of cmdline/proctitle value during audit */
+#define MAX_PROCTITLE_AUDIT_LEN 128
+
/* number of audit rules */
int audit_n_rules;
/* determines whether we collect data for signals sent */
int audit_signals;
-struct audit_cap_data {
- kernel_cap_t permitted;
- kernel_cap_t inheritable;
- union {
- unsigned int fE; /* effective bit of a file capability */
- kernel_cap_t effective; /* effective set of a process */
- };
-};
-
-/* When fs/namei.c:getname() is called, we store the pointer in name and
- * we don't let putname() free it (instead we free all of the saved
- * pointers at syscall exit time).
- *
- * Further, in fs/namei.c:path_lookup() we store the inode and device. */
-struct audit_names {
- const char *name;
- int name_len; /* number of name's characters to log */
- unsigned name_put; /* call __putname() for this name */
- unsigned long ino;
- dev_t dev;
- umode_t mode;
- uid_t uid;
- gid_t gid;
- dev_t rdev;
- u32 osid;
- struct audit_cap_data fcap;
- unsigned int fcap_ver;
-};
-
struct audit_aux_data {
struct audit_aux_data *next;
int type;
@@ -125,18 +101,11 @@ struct audit_aux_data {
/* Number of target pids per aux struct. */
#define AUDIT_AUX_PIDS 16
-struct audit_aux_data_execve {
- struct audit_aux_data d;
- int argc;
- int envc;
- struct mm_struct *mm;
-};
-
struct audit_aux_data_pids {
struct audit_aux_data d;
pid_t target_pid[AUDIT_AUX_PIDS];
- uid_t target_auid[AUDIT_AUX_PIDS];
- uid_t target_uid[AUDIT_AUX_PIDS];
+ kuid_t target_auid[AUDIT_AUX_PIDS];
+ kuid_t target_uid[AUDIT_AUX_PIDS];
unsigned int target_sessionid[AUDIT_AUX_PIDS];
u32 target_sid[AUDIT_AUX_PIDS];
char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
@@ -151,105 +120,11 @@ struct audit_aux_data_bprm_fcaps {
struct audit_cap_data new_pcap;
};
-struct audit_aux_data_capset {
- struct audit_aux_data d;
- pid_t pid;
- struct audit_cap_data cap;
-};
-
struct audit_tree_refs {
struct audit_tree_refs *next;
struct audit_chunk *c[31];
};
-/* The per-task audit context. */
-struct audit_context {
- int dummy; /* must be the first element */
- int in_syscall; /* 1 if task is in a syscall */
- enum audit_state state, current_state;
- unsigned int serial; /* serial number for record */
- int major; /* syscall number */
- struct timespec ctime; /* time of syscall entry */
- unsigned long argv[4]; /* syscall arguments */
- long return_code;/* syscall return code */
- u64 prio;
- int return_valid; /* return code is valid */
- int name_count;
- struct audit_names names[AUDIT_NAMES];
- char * filterkey; /* key for rule that triggered record */
- struct path pwd;
- struct audit_context *previous; /* For nested syscalls */
- struct audit_aux_data *aux;
- struct audit_aux_data *aux_pids;
- struct sockaddr_storage *sockaddr;
- size_t sockaddr_len;
- /* Save things to print about task_struct */
- pid_t pid, ppid;
- uid_t uid, euid, suid, fsuid;
- gid_t gid, egid, sgid, fsgid;
- unsigned long personality;
- int arch;
-
- pid_t target_pid;
- uid_t target_auid;
- uid_t target_uid;
- unsigned int target_sessionid;
- u32 target_sid;
- char target_comm[TASK_COMM_LEN];
-
- struct audit_tree_refs *trees, *first_trees;
- struct list_head killed_trees;
- int tree_count;
-
- int type;
- union {
- struct {
- int nargs;
- long args[6];
- } socketcall;
- struct {
- uid_t uid;
- gid_t gid;
- mode_t mode;
- u32 osid;
- int has_perm;
- uid_t perm_uid;
- gid_t perm_gid;
- mode_t perm_mode;
- unsigned long qbytes;
- } ipc;
- struct {
- mqd_t mqdes;
- struct mq_attr mqstat;
- } mq_getsetattr;
- struct {
- mqd_t mqdes;
- int sigev_signo;
- } mq_notify;
- struct {
- mqd_t mqdes;
- size_t msg_len;
- unsigned int msg_prio;
- struct timespec abs_timeout;
- } mq_sendrecv;
- struct {
- int oflag;
- mode_t mode;
- struct mq_attr attr;
- } mq_open;
- struct {
- pid_t pid;
- struct audit_cap_data cap;
- } capset;
- };
- int fds[2];
-
-#if AUDIT_DEBUG
- int put_count;
- int ino_count;
-#endif
-};
-
static inline int open_arg(int flags, int mask)
{
int n = ACC_MODE(flags);
@@ -301,21 +176,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
}
}
-static int audit_match_filetype(struct audit_context *ctx, int which)
+static int audit_match_filetype(struct audit_context *ctx, int val)
{
- unsigned index = which & ~S_IFMT;
- mode_t mode = which & S_IFMT;
+ struct audit_names *n;
+ umode_t mode = (umode_t)val;
if (unlikely(!ctx))
return 0;
- if (index >= ctx->name_count)
- return 0;
- if (ctx->names[index].ino == -1)
- return 0;
- if ((ctx->names[index].mode ^ mode) & S_IFMT)
- return 0;
- return 1;
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if ((n->ino != -1) &&
+ ((n->mode & S_IFMT) == mode))
+ return 1;
+ }
+
+ return 0;
}
/*
@@ -437,57 +312,202 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
return 0;
}
+static int audit_compare_uid(kuid_t uid,
+ struct audit_names *name,
+ struct audit_field *f,
+ struct audit_context *ctx)
+{
+ struct audit_names *n;
+ int rc;
+
+ if (name) {
+ rc = audit_uid_comparator(uid, f->op, name->uid);
+ if (rc)
+ return rc;
+ }
+
+ if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ rc = audit_uid_comparator(uid, f->op, n->uid);
+ if (rc)
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static int audit_compare_gid(kgid_t gid,
+ struct audit_names *name,
+ struct audit_field *f,
+ struct audit_context *ctx)
+{
+ struct audit_names *n;
+ int rc;
+
+ if (name) {
+ rc = audit_gid_comparator(gid, f->op, name->gid);
+ if (rc)
+ return rc;
+ }
+
+ if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ rc = audit_gid_comparator(gid, f->op, n->gid);
+ if (rc)
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static int audit_field_compare(struct task_struct *tsk,
+ const struct cred *cred,
+ struct audit_field *f,
+ struct audit_context *ctx,
+ struct audit_names *name)
+{
+ switch (f->val) {
+ /* process to file object comparisons */
+ case AUDIT_COMPARE_UID_TO_OBJ_UID:
+ return audit_compare_uid(cred->uid, name, f, ctx);
+ case AUDIT_COMPARE_GID_TO_OBJ_GID:
+ return audit_compare_gid(cred->gid, name, f, ctx);
+ case AUDIT_COMPARE_EUID_TO_OBJ_UID:
+ return audit_compare_uid(cred->euid, name, f, ctx);
+ case AUDIT_COMPARE_EGID_TO_OBJ_GID:
+ return audit_compare_gid(cred->egid, name, f, ctx);
+ case AUDIT_COMPARE_AUID_TO_OBJ_UID:
+ return audit_compare_uid(tsk->loginuid, name, f, ctx);
+ case AUDIT_COMPARE_SUID_TO_OBJ_UID:
+ return audit_compare_uid(cred->suid, name, f, ctx);
+ case AUDIT_COMPARE_SGID_TO_OBJ_GID:
+ return audit_compare_gid(cred->sgid, name, f, ctx);
+ case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
+ return audit_compare_uid(cred->fsuid, name, f, ctx);
+ case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
+ return audit_compare_gid(cred->fsgid, name, f, ctx);
+ /* uid comparisons */
+ case AUDIT_COMPARE_UID_TO_AUID:
+ return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
+ case AUDIT_COMPARE_UID_TO_EUID:
+ return audit_uid_comparator(cred->uid, f->op, cred->euid);
+ case AUDIT_COMPARE_UID_TO_SUID:
+ return audit_uid_comparator(cred->uid, f->op, cred->suid);
+ case AUDIT_COMPARE_UID_TO_FSUID:
+ return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
+ /* auid comparisons */
+ case AUDIT_COMPARE_AUID_TO_EUID:
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
+ case AUDIT_COMPARE_AUID_TO_SUID:
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
+ case AUDIT_COMPARE_AUID_TO_FSUID:
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
+ /* euid comparisons */
+ case AUDIT_COMPARE_EUID_TO_SUID:
+ return audit_uid_comparator(cred->euid, f->op, cred->suid);
+ case AUDIT_COMPARE_EUID_TO_FSUID:
+ return audit_uid_comparator(cred->euid, f->op, cred->fsuid);
+ /* suid comparisons */
+ case AUDIT_COMPARE_SUID_TO_FSUID:
+ return audit_uid_comparator(cred->suid, f->op, cred->fsuid);
+ /* gid comparisons */
+ case AUDIT_COMPARE_GID_TO_EGID:
+ return audit_gid_comparator(cred->gid, f->op, cred->egid);
+ case AUDIT_COMPARE_GID_TO_SGID:
+ return audit_gid_comparator(cred->gid, f->op, cred->sgid);
+ case AUDIT_COMPARE_GID_TO_FSGID:
+ return audit_gid_comparator(cred->gid, f->op, cred->fsgid);
+ /* egid comparisons */
+ case AUDIT_COMPARE_EGID_TO_SGID:
+ return audit_gid_comparator(cred->egid, f->op, cred->sgid);
+ case AUDIT_COMPARE_EGID_TO_FSGID:
+ return audit_gid_comparator(cred->egid, f->op, cred->fsgid);
+ /* sgid comparison */
+ case AUDIT_COMPARE_SGID_TO_FSGID:
+ return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);
+ default:
+ WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
+ return 0;
+ }
+ return 0;
+}
+
/* Determine if any context name data matches a rule's watch data */
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
- * otherwise. */
+ * otherwise.
+ *
+ * If task_creation is true, this is an explicit indication that we are
+ * filtering a task rule at task creation time. This and tsk == current are
+ * the only situations where tsk->cred may be accessed without an rcu read lock.
+ */
static int audit_filter_rules(struct task_struct *tsk,
struct audit_krule *rule,
struct audit_context *ctx,
struct audit_names *name,
- enum audit_state *state)
+ enum audit_state *state,
+ bool task_creation)
{
- const struct cred *cred = get_task_cred(tsk);
- int i, j, need_sid = 1;
+ const struct cred *cred;
+ int i, need_sid = 1;
u32 sid;
+ cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
+
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
+ struct audit_names *n;
int result = 0;
+ pid_t pid;
switch (f->type) {
case AUDIT_PID:
- result = audit_comparator(tsk->pid, f->op, f->val);
+ pid = task_pid_nr(tsk);
+ result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_PPID:
if (ctx) {
if (!ctx->ppid)
- ctx->ppid = sys_getppid();
+ ctx->ppid = task_ppid_nr(tsk);
result = audit_comparator(ctx->ppid, f->op, f->val);
}
break;
case AUDIT_UID:
- result = audit_comparator(cred->uid, f->op, f->val);
+ result = audit_uid_comparator(cred->uid, f->op, f->uid);
break;
case AUDIT_EUID:
- result = audit_comparator(cred->euid, f->op, f->val);
+ result = audit_uid_comparator(cred->euid, f->op, f->uid);
break;
case AUDIT_SUID:
- result = audit_comparator(cred->suid, f->op, f->val);
+ result = audit_uid_comparator(cred->suid, f->op, f->uid);
break;
case AUDIT_FSUID:
- result = audit_comparator(cred->fsuid, f->op, f->val);
+ result = audit_uid_comparator(cred->fsuid, f->op, f->uid);
break;
case AUDIT_GID:
- result = audit_comparator(cred->gid, f->op, f->val);
+ result = audit_gid_comparator(cred->gid, f->op, f->gid);
+ if (f->op == Audit_equal) {
+ if (!result)
+ result = in_group_p(f->gid);
+ } else if (f->op == Audit_not_equal) {
+ if (result)
+ result = !in_group_p(f->gid);
+ }
break;
case AUDIT_EGID:
- result = audit_comparator(cred->egid, f->op, f->val);
+ result = audit_gid_comparator(cred->egid, f->op, f->gid);
+ if (f->op == Audit_equal) {
+ if (!result)
+ result = in_egroup_p(f->gid);
+ } else if (f->op == Audit_not_equal) {
+ if (result)
+ result = !in_egroup_p(f->gid);
+ }
break;
case AUDIT_SGID:
- result = audit_comparator(cred->sgid, f->op, f->val);
+ result = audit_gid_comparator(cred->sgid, f->op, f->gid);
break;
case AUDIT_FSGID:
- result = audit_comparator(cred->fsgid, f->op, f->val);
+ result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
break;
case AUDIT_PERS:
result = audit_comparator(tsk->personality, f->op, f->val);
@@ -510,12 +530,14 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_DEVMAJOR:
- if (name)
- result = audit_comparator(MAJOR(name->dev),
- f->op, f->val);
- else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
+ if (name) {
+ if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
+ audit_comparator(MAJOR(name->rdev), f->op, f->val))
+ ++result;
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
+ audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
++result;
break;
}
@@ -523,12 +545,14 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_DEVMINOR:
- if (name)
- result = audit_comparator(MINOR(name->dev),
- f->op, f->val);
- else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
+ if (name) {
+ if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
+ audit_comparator(MINOR(name->rdev), f->op, f->val))
+ ++result;
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
+ audit_comparator(MINOR(n->rdev), f->op, f->val)) {
++result;
break;
}
@@ -537,10 +561,34 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_INODE:
if (name)
- result = (name->ino == f->val);
+ result = audit_comparator(name->ino, f->op, f->val);
else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(n->ino, f->op, f->val)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_OBJ_UID:
+ if (name) {
+ result = audit_uid_comparator(name->uid, f->op, f->uid);
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_uid_comparator(n->uid, f->op, f->uid)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_OBJ_GID:
+ if (name) {
+ result = audit_gid_comparator(name->gid, f->op, f->gid);
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_gid_comparator(n->gid, f->op, f->gid)) {
++result;
break;
}
@@ -548,9 +596,8 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_WATCH:
- if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
- result = (name->dev == audit_watch_dev(rule->watch) &&
- name->ino == audit_watch_inode(rule->watch));
+ if (name)
+ result = audit_watch_compare(rule->watch, name->ino, name->dev);
break;
case AUDIT_DIR:
if (ctx)
@@ -559,7 +606,10 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_LOGINUID:
result = 0;
if (ctx)
- result = audit_comparator(tsk->loginuid, f->op, f->val);
+ result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
+ break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
@@ -596,11 +646,10 @@ static int audit_filter_rules(struct task_struct *tsk,
name->osid, f->type, f->op,
f->lsm_rule, ctx);
} else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (security_audit_rule_match(
- ctx->names[j].osid,
- f->type, f->op,
- f->lsm_rule, ctx)) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (security_audit_rule_match(n->osid, f->type,
+ f->op, f->lsm_rule,
+ ctx)) {
++result;
break;
}
@@ -632,12 +681,12 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_FILETYPE:
result = audit_match_filetype(ctx, f->val);
break;
+ case AUDIT_FIELD_COMPARE:
+ result = audit_field_compare(tsk, cred, f, ctx, name);
+ break;
}
-
- if (!result) {
- put_cred(cred);
+ if (!result)
return 0;
- }
}
if (ctx) {
@@ -653,7 +702,6 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
}
- put_cred(cred);
return 1;
}
@@ -668,7 +716,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
- if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
+ if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
+ &state, true)) {
if (state == AUDIT_RECORD_CONTEXT)
*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
rcu_read_unlock();
@@ -679,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
return AUDIT_BUILD_CONTEXT;
}
+static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
+{
+ int word, bit;
+
+ if (val > 0xffffffff)
+ return false;
+
+ word = AUDIT_WORD(val);
+ if (word >= AUDIT_BITMASK_SIZE)
+ return false;
+
+ bit = AUDIT_BIT(val);
+
+ return rule->mask[word] & bit;
+}
+
/* At syscall entry and exit time, this filter is called if the
* audit_state is not low enough that auditing cannot take place, but is
* also not high enough that we already know we have to write an audit
@@ -696,13 +761,10 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
rcu_read_lock();
if (!list_empty(list)) {
- int word = AUDIT_WORD(ctx->major);
- int bit = AUDIT_BIT(ctx->major);
-
list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit &&
+ if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state)) {
+ &state, false)) {
rcu_read_unlock();
ctx->current_state = state;
return state;
@@ -713,50 +775,61 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
return AUDIT_BUILD_CONTEXT;
}
-/* At syscall exit time, this filter is called if any audit_names[] have been
+/*
+ * Given an audit_name check the inode hash table to see if they match.
+ * Called holding the rcu read lock to protect the use of audit_inode_hash
+ */
+static int audit_filter_inode_name(struct task_struct *tsk,
+ struct audit_names *n,
+ struct audit_context *ctx) {
+ int h = audit_hash_ino((u32)n->ino);
+ struct list_head *list = &audit_inode_hash[h];
+ struct audit_entry *e;
+ enum audit_state state;
+
+ if (list_empty(list))
+ return 0;
+
+ list_for_each_entry_rcu(e, list, list) {
+ if (audit_in_mask(&e->rule, ctx->major) &&
+ audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
+ ctx->current_state = state;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/* At syscall exit time, this filter is called if any audit_names have been
* collected during syscall processing. We only check rules in sublists at hash
- * buckets applicable to the inode numbers in audit_names[].
+ * buckets applicable to the inode numbers in audit_names.
* Regarding audit_state, same rules apply as for audit_filter_syscall().
*/
void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
{
- int i;
- struct audit_entry *e;
- enum audit_state state;
+ struct audit_names *n;
if (audit_pid && tsk->tgid == audit_pid)
return;
rcu_read_lock();
- for (i = 0; i < ctx->name_count; i++) {
- int word = AUDIT_WORD(ctx->major);
- int bit = AUDIT_BIT(ctx->major);
- struct audit_names *n = &ctx->names[i];
- int h = audit_hash_ino((u32)n->ino);
- struct list_head *list = &audit_inode_hash[h];
-
- if (list_empty(list))
- continue;
- list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit &&
- audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return;
- }
- }
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_filter_inode_name(tsk, n, ctx))
+ break;
}
rcu_read_unlock();
}
-static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */
+static inline struct audit_context *audit_take_context(struct task_struct *tsk,
int return_valid,
long return_code)
{
struct audit_context *context = tsk->audit_context;
- if (likely(!context))
+ if (!context)
return NULL;
context->return_valid = return_valid;
@@ -787,23 +860,30 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
return context;
}
+static inline void audit_proctitle_free(struct audit_context *context)
+{
+ kfree(context->proctitle.value);
+ context->proctitle.value = NULL;
+ context->proctitle.len = 0;
+}
+
static inline void audit_free_names(struct audit_context *context)
{
- int i;
+ struct audit_names *n, *next;
#if AUDIT_DEBUG == 2
if (context->put_count + context->ino_count != context->name_count) {
- printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
- " name_count=%d put_count=%d"
- " ino_count=%d [NOT freeing]\n",
- __FILE__, __LINE__,
+ int i = 0;
+
+ pr_err("%s:%d(:%d): major=%d in_syscall=%d"
+ " name_count=%d put_count=%d ino_count=%d"
+ " [NOT freeing]\n", __FILE__, __LINE__,
context->serial, context->major, context->in_syscall,
context->name_count, context->put_count,
context->ino_count);
- for (i = 0; i < context->name_count; i++) {
- printk(KERN_ERR "names[%d] = %p = %s\n", i,
- context->names[i].name,
- context->names[i].name ?: "(null)");
+ list_for_each_entry(n, &context->names_list, list) {
+ pr_err("names[%d] = %p = %s\n", i++, n->name,
+ n->name->name ?: "(null)");
}
dump_stack();
return;
@@ -814,9 +894,12 @@ static inline void audit_free_names(struct audit_context *context)
context->ino_count = 0;
#endif
- for (i = 0; i < context->name_count; i++) {
- if (context->names[i].name && context->names[i].name_put)
- __putname(context->names[i].name);
+ list_for_each_entry_safe(n, next, &context->names_list, list) {
+ list_del(&n->list);
+ if (n->name && n->name_put)
+ final_putname(n->name);
+ if (n->should_free)
+ kfree(n);
}
context->name_count = 0;
path_put(&context->pwd);
@@ -838,22 +921,17 @@ static inline void audit_free_aux(struct audit_context *context)
}
}
-static inline void audit_zero_context(struct audit_context *context,
- enum audit_state state)
-{
- memset(context, 0, sizeof(*context));
- context->state = state;
- context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
-}
-
static inline struct audit_context *audit_alloc_context(enum audit_state state)
{
struct audit_context *context;
- if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (!context)
return NULL;
- audit_zero_context(context, state);
+ context->state = state;
+ context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
+ INIT_LIST_HEAD(&context->names_list);
return context;
}
@@ -876,8 +954,10 @@ int audit_alloc(struct task_struct *tsk)
return 0; /* Return if not auditing. */
state = audit_filter_task(tsk, &key);
- if (likely(state == AUDIT_DISABLED))
+ if (state == AUDIT_DISABLED) {
+ clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
return 0;
+ }
if (!(context = audit_alloc_context(state))) {
kfree(key);
@@ -893,91 +973,18 @@ int audit_alloc(struct task_struct *tsk)
static inline void audit_free_context(struct audit_context *context)
{
- struct audit_context *previous;
- int count = 0;
-
- do {
- previous = context->previous;
- if (previous || (count && count < 10)) {
- ++count;
- printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
- " freeing multiple contexts (%d)\n",
- context->serial, context->major,
- context->name_count, count);
- }
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
- free_tree_refs(context);
- audit_free_aux(context);
- kfree(context->filterkey);
- kfree(context->sockaddr);
- kfree(context);
- context = previous;
- } while (context);
- if (count >= 10)
- printk(KERN_ERR "audit: freed %d contexts\n", count);
-}
-
-void audit_log_task_context(struct audit_buffer *ab)
-{
- char *ctx = NULL;
- unsigned len;
- int error;
- u32 sid;
-
- security_task_getsecid(current, &sid);
- if (!sid)
- return;
-
- error = security_secid_to_secctx(sid, &ctx, &len);
- if (error) {
- if (error != -EINVAL)
- goto error_path;
- return;
- }
-
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- return;
-
-error_path:
- audit_panic("error in audit_log_task_context");
- return;
-}
-
-EXPORT_SYMBOL(audit_log_task_context);
-
-static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
-{
- char name[sizeof(tsk->comm)];
- struct mm_struct *mm = tsk->mm;
- struct vm_area_struct *vma;
-
- /* tsk == current */
-
- get_task_comm(name, tsk);
- audit_log_format(ab, " comm=");
- audit_log_untrustedstring(ab, name);
-
- if (mm) {
- down_read(&mm->mmap_sem);
- vma = mm->mmap;
- while (vma) {
- if ((vma->vm_flags & VM_EXECUTABLE) &&
- vma->vm_file) {
- audit_log_d_path(ab, "exe=",
- &vma->vm_file->f_path);
- break;
- }
- vma = vma->vm_next;
- }
- up_read(&mm->mmap_sem);
- }
- audit_log_task_context(ab);
+ audit_free_names(context);
+ unroll_tree_refs(context, NULL, 0);
+ free_tree_refs(context);
+ audit_free_aux(context);
+ kfree(context->filterkey);
+ kfree(context->sockaddr);
+ audit_proctitle_free(context);
+ kfree(context);
}
static int audit_log_pid_context(struct audit_context *context, pid_t pid,
- uid_t auid, uid_t uid, unsigned int sessionid,
+ kuid_t auid, kuid_t uid, unsigned int sessionid,
u32 sid, char *comm)
{
struct audit_buffer *ab;
@@ -989,14 +996,17 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
if (!ab)
return rc;
- audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid,
- uid, sessionid);
- if (security_secid_to_secctx(sid, &ctx, &len)) {
- audit_log_format(ab, " obj=(none)");
- rc = 1;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
+ audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
+ from_kuid(&init_user_ns, auid),
+ from_kuid(&init_user_ns, uid), sessionid);
+ if (sid) {
+ if (security_secid_to_secctx(sid, &ctx, &len)) {
+ audit_log_format(ab, " obj=(none)");
+ rc = 1;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
}
audit_log_format(ab, " ocomm=");
audit_log_untrustedstring(ab, comm);
@@ -1008,7 +1018,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
/*
* to_send and len_sent accounting are very loose estimates. We aren't
* really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundry)
+ * within about 500 bytes (next page boundary)
*
* why snprintf? an int is up to 12 digits long. if we just assumed when
* logging that a[%d]= was going to be 16 characters long we would be wasting
@@ -1153,20 +1163,16 @@ static int audit_log_single_execve_arg(struct audit_context *context,
}
static void audit_log_execve_info(struct audit_context *context,
- struct audit_buffer **ab,
- struct audit_aux_data_execve *axi)
+ struct audit_buffer **ab)
{
- int i;
- size_t len, len_sent = 0;
+ int i, len;
+ size_t len_sent = 0;
const char __user *p;
char *buf;
- if (axi->mm != current->mm)
- return; /* execve failed, no additional info */
-
- p = (const char __user *)axi->mm->arg_start;
+ p = (const char __user *)current->mm->arg_start;
- audit_log_format(*ab, "argc=%d", axi->argc);
+ audit_log_format(*ab, "argc=%d", context->execve.argc);
/*
* we need some kernel buffer to hold the userspace args. Just
@@ -1176,11 +1182,11 @@ static void audit_log_execve_info(struct audit_context *context,
*/
buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
if (!buf) {
- audit_panic("out of memory for argv string\n");
+ audit_panic("out of memory for argv string");
return;
}
- for (i = 0; i < axi->argc; i++) {
+ for (i = 0; i < context->execve.argc; i++) {
len = audit_log_single_execve_arg(context, ab, i,
&len_sent, p, buf);
if (len <= 0)
@@ -1190,35 +1196,6 @@ static void audit_log_execve_info(struct audit_context *context,
kfree(buf);
}
-static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
-{
- int i;
-
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i) {
- audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
- }
-}
-
-static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
-{
- kernel_cap_t *perm = &name->fcap.permitted;
- kernel_cap_t *inh = &name->fcap.inheritable;
- int log = 0;
-
- if (!cap_isclear(*perm)) {
- audit_log_cap(ab, "cap_fp", perm);
- log = 1;
- }
- if (!cap_isclear(*inh)) {
- audit_log_cap(ab, "cap_fi", inh);
- log = 1;
- }
-
- if (log)
- audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
-}
-
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1239,8 +1216,10 @@ static void show_special(struct audit_context *context, int *call_panic)
case AUDIT_IPC: {
u32 osid = context->ipc.osid;
- audit_log_format(ab, "ouid=%u ogid=%u mode=%#o",
- context->ipc.uid, context->ipc.gid, context->ipc.mode);
+ audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
+ from_kuid(&init_user_ns, context->ipc.uid),
+ from_kgid(&init_user_ns, context->ipc.gid),
+ context->ipc.mode);
if (osid) {
char *ctx = NULL;
u32 len;
@@ -1256,19 +1235,19 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_end(ab);
ab = audit_log_start(context, GFP_KERNEL,
AUDIT_IPC_SET_PERM);
+ if (unlikely(!ab))
+ return;
audit_log_format(ab,
- "qbytes=%lx ouid=%u ogid=%u mode=%#o",
+ "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
context->ipc.qbytes,
context->ipc.perm_uid,
context->ipc.perm_gid,
context->ipc.perm_mode);
- if (!ab)
- return;
}
break; }
case AUDIT_MQ_OPEN: {
audit_log_format(ab,
- "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
+ "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
"mq_msgsize=%ld mq_curmsgs=%ld",
context->mq_open.oflag, context->mq_open.mode,
context->mq_open.attr.mq_flags,
@@ -1306,31 +1285,78 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
break; }
+ case AUDIT_MMAP: {
+ audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
+ context->mmap.flags);
+ break; }
+ case AUDIT_EXECVE: {
+ audit_log_execve_info(context, &ab);
+ break; }
}
audit_log_end(ab);
}
+static inline int audit_proctitle_rtrim(char *proctitle, int len)
+{
+ char *end = proctitle + len - 1;
+ while (end > proctitle && !isprint(*end))
+ end--;
+
+ /* catch the case where proctitle is only 1 non-print character */
+ len = end - proctitle + 1;
+ len -= isprint(proctitle[len-1]) == 0;
+ return len;
+}
+
+static void audit_log_proctitle(struct task_struct *tsk,
+ struct audit_context *context)
+{
+ int res;
+ char *buf;
+ char *msg = "(null)";
+ int len = strlen(msg);
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
+ if (!ab)
+ return; /* audit_panic or being filtered */
+
+ audit_log_format(ab, "proctitle=");
+
+ /* Not cached */
+ if (!context->proctitle.value) {
+ buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL);
+ if (!buf)
+ goto out;
+ /* Historically called this from procfs naming */
+ res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
+ }
+ res = audit_proctitle_rtrim(buf, res);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
+ }
+ context->proctitle.value = buf;
+ context->proctitle.len = res;
+ }
+ msg = context->proctitle.value;
+ len = context->proctitle.len;
+out:
+ audit_log_n_untrustedstring(ab, msg, len);
+ audit_log_end(ab);
+}
+
static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
{
- const struct cred *cred;
int i, call_panic = 0;
struct audit_buffer *ab;
struct audit_aux_data *aux;
- const char *tty;
+ struct audit_names *n;
/* tsk == current */
- context->pid = tsk->pid;
- if (!context->ppid)
- context->ppid = sys_getppid();
- cred = current_cred();
- context->uid = cred->uid;
- context->gid = cred->gid;
- context->euid = cred->euid;
- context->suid = cred->suid;
- context->fsuid = cred->fsuid;
- context->egid = cred->egid;
- context->sgid = cred->sgid;
- context->fsgid = cred->fsgid;
context->personality = tsk->personality;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1345,32 +1371,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
context->return_code);
- spin_lock_irq(&tsk->sighand->siglock);
- if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
- tty = tsk->signal->tty->name;
- else
- tty = "(none)";
- spin_unlock_irq(&tsk->sighand->siglock);
-
audit_log_format(ab,
- " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
- " ppid=%d pid=%d auid=%u uid=%u gid=%u"
- " euid=%u suid=%u fsuid=%u"
- " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
- context->argv[0],
- context->argv[1],
- context->argv[2],
- context->argv[3],
- context->name_count,
- context->ppid,
- context->pid,
- tsk->loginuid,
- context->uid,
- context->gid,
- context->euid, context->suid, context->fsuid,
- context->egid, context->sgid, context->fsgid, tty,
- tsk->sessionid);
-
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count);
audit_log_task_info(ab, tsk);
audit_log_key(ab, context->filterkey);
@@ -1384,11 +1391,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
switch (aux->type) {
- case AUDIT_EXECVE: {
- struct audit_aux_data_execve *axi = (void *)aux;
- audit_log_execve_info(context, &ab, axi);
- break; }
-
case AUDIT_BPRM_FCAPS: {
struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
audit_log_format(ab, "fver=%x", axs->fcap_ver);
@@ -1452,71 +1454,20 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
if (ab) {
- audit_log_d_path(ab, "cwd=", &context->pwd);
+ audit_log_d_path(ab, " cwd=", &context->pwd);
audit_log_end(ab);
}
}
- for (i = 0; i < context->name_count; i++) {
- struct audit_names *n = &context->names[i];
-
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
- if (!ab)
- continue; /* audit_panic has been called */
-
- audit_log_format(ab, "item=%d", i);
-
- if (n->name) {
- switch(n->name_len) {
- case AUDIT_NAME_FULL:
- /* log the full path */
- audit_log_format(ab, " name=");
- audit_log_untrustedstring(ab, n->name);
- break;
- case 0:
- /* name was specified as a relative path and the
- * directory component is the cwd */
- audit_log_d_path(ab, "name=", &context->pwd);
- break;
- default:
- /* log the name's directory component */
- audit_log_format(ab, " name=");
- audit_log_n_untrustedstring(ab, n->name,
- n->name_len);
- }
- } else
- audit_log_format(ab, " name=(null)");
-
- if (n->ino != (unsigned long)-1) {
- audit_log_format(ab, " inode=%lu"
- " dev=%02x:%02x mode=%#o"
- " ouid=%u ogid=%u rdev=%02x:%02x",
- n->ino,
- MAJOR(n->dev),
- MINOR(n->dev),
- n->mode,
- n->uid,
- n->gid,
- MAJOR(n->rdev),
- MINOR(n->rdev));
- }
- if (n->osid != 0) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
- call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
-
- audit_log_fcaps(ab, n);
- audit_log_end(ab);
+ i = 0;
+ list_for_each_entry(n, &context->names_list, list) {
+ if (n->hidden)
+ continue;
+ audit_log_name(context, n, NULL, i++, &call_panic);
}
+ audit_log_proctitle(tsk, context);
+
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
if (ab)
@@ -1531,12 +1482,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
*
* Called from copy_process and do_exit
*/
-void audit_free(struct task_struct *tsk)
+void __audit_free(struct task_struct *tsk)
{
struct audit_context *context;
- context = audit_get_context(tsk, 0, 0);
- if (likely(!context))
+ context = audit_take_context(tsk, 0, 0);
+ if (!context)
return;
/* Check for system calls that do not go through the exit
@@ -1569,7 +1520,7 @@ void audit_free(struct task_struct *tsk)
* will only be written if another part of the kernel requests that it
* be written).
*/
-void audit_syscall_entry(int arch, int major,
+void __audit_syscall_entry(int arch, int major,
unsigned long a1, unsigned long a2,
unsigned long a3, unsigned long a4)
{
@@ -1577,45 +1528,9 @@ void audit_syscall_entry(int arch, int major,
struct audit_context *context = tsk->audit_context;
enum audit_state state;
- if (unlikely(!context))
+ if (!context)
return;
- /*
- * This happens only on certain architectures that make system
- * calls in kernel_thread via the entry.S interface, instead of
- * with direct calls. (If you are porting to a new
- * architecture, hitting this condition can indicate that you
- * got the _exit/_leave calls backward in entry.S.)
- *
- * i386 no
- * x86_64 no
- * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
- *
- * This also happens with vm86 emulation in a non-nested manner
- * (entries without exits), so this case must be caught.
- */
- if (context->in_syscall) {
- struct audit_context *newctx;
-
-#if AUDIT_DEBUG
- printk(KERN_ERR
- "audit(:%d) pid=%d in syscall=%d;"
- " entering syscall=%d\n",
- context->serial, tsk->pid, context->major, major);
-#endif
- newctx = audit_alloc_context(context->state);
- if (newctx) {
- newctx->previous = context;
- context = newctx;
- tsk->audit_context = newctx;
- } else {
- /* If we can't alloc a new context, the best we
- * can do is to leak memory (any pending putname
- * will be lost). The only other alternative is
- * to abandon auditing. */
- audit_zero_context(context, context->state);
- }
- }
BUG_ON(context->in_syscall || context->name_count);
if (!audit_enabled)
@@ -1634,7 +1549,7 @@ void audit_syscall_entry(int arch, int major,
context->prio = 0;
state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
}
- if (likely(state == AUDIT_DISABLED))
+ if (state == AUDIT_DISABLED)
return;
context->serial = 0;
@@ -1644,45 +1559,29 @@ void audit_syscall_entry(int arch, int major,
context->ppid = 0;
}
-void audit_finish_fork(struct task_struct *child)
-{
- struct audit_context *ctx = current->audit_context;
- struct audit_context *p = child->audit_context;
- if (!p || !ctx)
- return;
- if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
- return;
- p->arch = ctx->arch;
- p->major = ctx->major;
- memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
- p->ctime = ctx->ctime;
- p->dummy = ctx->dummy;
- p->in_syscall = ctx->in_syscall;
- p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
- p->ppid = current->pid;
- p->prio = ctx->prio;
- p->current_state = ctx->current_state;
-}
-
/**
* audit_syscall_exit - deallocate audit context after a system call
- * @valid: success/failure flag
- * @return_code: syscall return value
+ * @success: success value of the syscall
+ * @return_code: return value of the syscall
*
* Tear down after system call. If the audit context has been marked as
* auditable (either because of the AUDIT_RECORD_CONTEXT state from
- * filtering, or because some other part of the kernel write an audit
+ * filtering, or because some other part of the kernel wrote an audit
* message), then write out the syscall information. In call cases,
* free the names stored from getname().
*/
-void audit_syscall_exit(int valid, long return_code)
+void __audit_syscall_exit(int success, long return_code)
{
struct task_struct *tsk = current;
struct audit_context *context;
- context = audit_get_context(tsk, valid, return_code);
+ if (success)
+ success = AUDITSC_SUCCESS;
+ else
+ success = AUDITSC_FAILURE;
- if (likely(!context))
+ context = audit_take_context(tsk, success, return_code);
+ if (!context)
return;
if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
@@ -1694,28 +1593,21 @@ void audit_syscall_exit(int valid, long return_code)
if (!list_empty(&context->killed_trees))
audit_kill_trees(&context->killed_trees);
- if (context->previous) {
- struct audit_context *new_context = context->previous;
- context->previous = NULL;
- audit_free_context(context);
- tsk->audit_context = new_context;
- } else {
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
- audit_free_aux(context);
- context->aux = NULL;
- context->aux_pids = NULL;
- context->target_pid = 0;
- context->target_sid = 0;
- context->sockaddr_len = 0;
- context->type = 0;
- context->fds[0] = -1;
- if (context->state != AUDIT_RECORD_CONTEXT) {
- kfree(context->filterkey);
- context->filterkey = NULL;
- }
- tsk->audit_context = context;
+ audit_free_names(context);
+ unroll_tree_refs(context, NULL, 0);
+ audit_free_aux(context);
+ context->aux = NULL;
+ context->aux_pids = NULL;
+ context->target_pid = 0;
+ context->target_sid = 0;
+ context->sockaddr_len = 0;
+ context->type = 0;
+ context->fds[0] = -1;
+ if (context->state != AUDIT_RECORD_CONTEXT) {
+ kfree(context->filterkey);
+ context->filterkey = NULL;
}
+ tsk->audit_context = context;
}
static inline void handle_one(const struct inode *inode)
@@ -1725,7 +1617,7 @@ static inline void handle_one(const struct inode *inode)
struct audit_tree_refs *p;
struct audit_chunk *chunk;
int count;
- if (likely(list_empty(&inode->inotify_watches)))
+ if (likely(hlist_empty(&inode->i_fsnotify_marks)))
return;
context = current->audit_context;
p = context->trees;
@@ -1738,7 +1630,7 @@ static inline void handle_one(const struct inode *inode)
if (likely(put_tree_ref(context, chunk)))
return;
if (unlikely(!grow_tree_refs(context))) {
- printk(KERN_WARNING "out of memory, audit has lost a tree reference\n");
+ pr_warn("out of memory, audit has lost a tree reference\n");
audit_set_auditable(context);
audit_put_chunk(chunk);
unroll_tree_refs(context, p, count);
@@ -1768,7 +1660,7 @@ retry:
seq = read_seqbegin(&rename_lock);
for(;;) {
struct inode *inode = d->d_inode;
- if (inode && unlikely(!list_empty(&inode->inotify_watches))) {
+ if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
struct audit_chunk *chunk;
chunk = audit_tree_lookup(inode);
if (chunk) {
@@ -1797,8 +1689,7 @@ retry:
goto retry;
}
/* too bad */
- printk(KERN_WARNING
- "out of memory, audit has lost a tree reference\n");
+ pr_warn("out of memory, audit has lost a tree reference\n");
unroll_tree_refs(context, p, count);
audit_set_auditable(context);
return;
@@ -1807,6 +1698,55 @@ retry:
#endif
}
+static struct audit_names *audit_alloc_name(struct audit_context *context,
+ unsigned char type)
+{
+ struct audit_names *aname;
+
+ if (context->name_count < AUDIT_NAMES) {
+ aname = &context->preallocated_names[context->name_count];
+ memset(aname, 0, sizeof(*aname));
+ } else {
+ aname = kzalloc(sizeof(*aname), GFP_NOFS);
+ if (!aname)
+ return NULL;
+ aname->should_free = true;
+ }
+
+ aname->ino = (unsigned long)-1;
+ aname->type = type;
+ list_add_tail(&aname->list, &context->names_list);
+
+ context->name_count++;
+#if AUDIT_DEBUG
+ context->ino_count++;
+#endif
+ return aname;
+}
+
+/**
+ * audit_reusename - fill out filename with info from existing entry
+ * @uptr: userland ptr to pathname
+ *
+ * Search the audit_names list for the current audit context. If there is an
+ * existing entry with a matching "uptr" then return the filename
+ * associated with that audit_name. If not, return NULL.
+ */
+struct filename *
+__audit_reusename(const __user char *uptr)
+{
+ struct audit_context *context = current->audit_context;
+ struct audit_names *n;
+
+ list_for_each_entry(n, &context->names_list, list) {
+ if (!n->name)
+ continue;
+ if (n->name->uptr == uptr)
+ return n->name;
+ }
+ return NULL;
+}
+
/**
* audit_getname - add a name to the list
* @name: name to add
@@ -1814,35 +1754,36 @@ retry:
* Add a name to the list of audit names for this context.
* Called from fs/namei.c:getname().
*/
-void __audit_getname(const char *name)
+void __audit_getname(struct filename *name)
{
struct audit_context *context = current->audit_context;
-
- if (IS_ERR(name) || !name)
- return;
+ struct audit_names *n;
if (!context->in_syscall) {
#if AUDIT_DEBUG == 2
- printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+ pr_err("%s:%d(:%d): ignoring getname(%p)\n",
__FILE__, __LINE__, context->serial, name);
dump_stack();
#endif
return;
}
- BUG_ON(context->name_count >= AUDIT_NAMES);
- context->names[context->name_count].name = name;
- context->names[context->name_count].name_len = AUDIT_NAME_FULL;
- context->names[context->name_count].name_put = 1;
- context->names[context->name_count].ino = (unsigned long)-1;
- context->names[context->name_count].osid = 0;
- ++context->name_count;
- if (!context->pwd.dentry) {
- read_lock(&current->fs->lock);
- context->pwd = current->fs->pwd;
- path_get(&current->fs->pwd);
- read_unlock(&current->fs->lock);
- }
+#if AUDIT_DEBUG
+ /* The filename _must_ have a populated ->name */
+ BUG_ON(!name->name);
+#endif
+
+ n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
+ if (!n)
+ return;
+
+ n->name = name;
+ n->name_len = AUDIT_NAME_FULL;
+ n->name_put = true;
+ name->aname = n;
+
+ if (!context->pwd.dentry)
+ get_fs_pwd(current->fs, &context->pwd);
}
/* audit_putname - intercept a putname request
@@ -1852,145 +1793,124 @@ void __audit_getname(const char *name)
* then we delay the putname until syscall exit.
* Called from include/linux/fs.h:putname().
*/
-void audit_putname(const char *name)
+void audit_putname(struct filename *name)
{
struct audit_context *context = current->audit_context;
BUG_ON(!context);
- if (!context->in_syscall) {
+ if (!name->aname || !context->in_syscall) {
#if AUDIT_DEBUG == 2
- printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
+ pr_err("%s:%d(:%d): final_putname(%p)\n",
__FILE__, __LINE__, context->serial, name);
if (context->name_count) {
- int i;
- for (i = 0; i < context->name_count; i++)
- printk(KERN_ERR "name[%d] = %p = %s\n", i,
- context->names[i].name,
- context->names[i].name ?: "(null)");
- }
+ struct audit_names *n;
+ int i = 0;
+
+ list_for_each_entry(n, &context->names_list, list)
+ pr_err("name[%d] = %p = %s\n", i++, n->name,
+ n->name->name ?: "(null)");
+ }
#endif
- __putname(name);
+ final_putname(name);
}
#if AUDIT_DEBUG
else {
++context->put_count;
if (context->put_count > context->name_count) {
- printk(KERN_ERR "%s:%d(:%d): major=%d"
- " in_syscall=%d putname(%p) name_count=%d"
- " put_count=%d\n",
+ pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
+ " name_count=%d put_count=%d\n",
__FILE__, __LINE__,
context->serial, context->major,
- context->in_syscall, name, context->name_count,
- context->put_count);
+ context->in_syscall, name->name,
+ context->name_count, context->put_count);
dump_stack();
}
}
#endif
}
-static int audit_inc_name_count(struct audit_context *context,
- const struct inode *inode)
-{
- if (context->name_count >= AUDIT_NAMES) {
- if (inode)
- printk(KERN_DEBUG "name_count maxed, losing inode data: "
- "dev=%02x:%02x, inode=%lu\n",
- MAJOR(inode->i_sb->s_dev),
- MINOR(inode->i_sb->s_dev),
- inode->i_ino);
-
- else
- printk(KERN_DEBUG "name_count maxed, losing inode data\n");
- return 1;
- }
- context->name_count++;
-#if AUDIT_DEBUG
- context->ino_count++;
-#endif
- return 0;
-}
-
-
-static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
-{
- struct cpu_vfs_cap_data caps;
- int rc;
-
- memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
- memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
- name->fcap.fE = 0;
- name->fcap_ver = 0;
-
- if (!dentry)
- return 0;
-
- rc = get_vfs_caps_from_disk(dentry, &caps);
- if (rc)
- return rc;
-
- name->fcap.permitted = caps.permitted;
- name->fcap.inheritable = caps.inheritable;
- name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
- name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
-
- return 0;
-}
-
-
-/* Copy inode data into an audit_names. */
-static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- const struct inode *inode)
-{
- name->ino = inode->i_ino;
- name->dev = inode->i_sb->s_dev;
- name->mode = inode->i_mode;
- name->uid = inode->i_uid;
- name->gid = inode->i_gid;
- name->rdev = inode->i_rdev;
- security_inode_getsecid(inode, &name->osid);
- audit_copy_fcaps(name, dentry);
-}
-
/**
- * audit_inode - store the inode and device from a lookup
+ * __audit_inode - store the inode and device from a lookup
* @name: name being audited
* @dentry: dentry being audited
- *
- * Called from fs/namei.c:path_lookup().
+ * @flags: attributes for this particular entry
*/
-void __audit_inode(const char *name, const struct dentry *dentry)
+void __audit_inode(struct filename *name, const struct dentry *dentry,
+ unsigned int flags)
{
- int idx;
struct audit_context *context = current->audit_context;
const struct inode *inode = dentry->d_inode;
+ struct audit_names *n;
+ bool parent = flags & AUDIT_INODE_PARENT;
if (!context->in_syscall)
return;
- if (context->name_count
- && context->names[context->name_count-1].name
- && context->names[context->name_count-1].name == name)
- idx = context->name_count - 1;
- else if (context->name_count > 1
- && context->names[context->name_count-2].name
- && context->names[context->name_count-2].name == name)
- idx = context->name_count - 2;
- else {
- /* FIXME: how much do we care about inodes that have no
- * associated name? */
- if (audit_inc_name_count(context, inode))
- return;
- idx = context->name_count - 1;
- context->names[idx].name = NULL;
+
+ if (!name)
+ goto out_alloc;
+
+#if AUDIT_DEBUG
+ /* The struct filename _must_ have a populated ->name */
+ BUG_ON(!name->name);
+#endif
+ /*
+ * If we have a pointer to an audit_names entry already, then we can
+ * just use it directly if the type is correct.
+ */
+ n = name->aname;
+ if (n) {
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
+ }
+
+ list_for_each_entry_reverse(n, &context->names_list, list) {
+ /* does the name pointer match? */
+ if (!n->name || n->name->name != name->name)
+ continue;
+
+ /* match the correct record type */
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
+ }
+
+out_alloc:
+ /* unable to find the name from a previous getname(). Allocate a new
+ * anonymous entry.
+ */
+ n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
+ if (!n)
+ return;
+out:
+ if (parent) {
+ n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_PARENT;
+ if (flags & AUDIT_INODE_HIDDEN)
+ n->hidden = true;
+ } else {
+ n->name_len = AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_NORMAL;
}
handle_path(dentry);
- audit_copy_inode(&context->names[idx], dentry, inode);
+ audit_copy_inode(n, dentry, inode);
}
/**
- * audit_inode_child - collect inode info for created/removed objects
- * @dname: inode's dentry name
- * @dentry: dentry being audited
+ * __audit_inode_child - collect inode info for created/removed objects
* @parent: inode of dentry parent
+ * @dentry: dentry being audited
+ * @type: AUDIT_TYPE_* value that we're looking for
*
* For syscalls that create or remove filesystem objects, audit_inode
* can only collect information for the filesystem object's parent.
@@ -2000,89 +1920,80 @@ void __audit_inode(const char *name, const struct dentry *dentry)
* must be hooked prior, in order to capture the target inode during
* unsuccessful attempts.
*/
-void __audit_inode_child(const char *dname, const struct dentry *dentry,
- const struct inode *parent)
+void __audit_inode_child(const struct inode *parent,
+ const struct dentry *dentry,
+ const unsigned char type)
{
- int idx;
struct audit_context *context = current->audit_context;
- const char *found_parent = NULL, *found_child = NULL;
const struct inode *inode = dentry->d_inode;
- int dirlen = 0;
+ const char *dname = dentry->d_name.name;
+ struct audit_names *n, *found_parent = NULL, *found_child = NULL;
if (!context->in_syscall)
return;
if (inode)
handle_one(inode);
- /* determine matching parent */
- if (!dname)
- goto add_names;
- /* parent is more likely, look for it first */
- for (idx = 0; idx < context->name_count; idx++) {
- struct audit_names *n = &context->names[idx];
-
- if (!n->name)
+ /* look for a parent entry first */
+ list_for_each_entry(n, &context->names_list, list) {
+ if (!n->name || n->type != AUDIT_TYPE_PARENT)
continue;
if (n->ino == parent->i_ino &&
- !audit_compare_dname_path(dname, n->name, &dirlen)) {
- n->name_len = dirlen; /* update parent data in place */
- found_parent = n->name;
- goto add_names;
+ !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
+ found_parent = n;
+ break;
}
}
- /* no matching parent, look for matching child */
- for (idx = 0; idx < context->name_count; idx++) {
- struct audit_names *n = &context->names[idx];
+ /* is there a matching child entry? */
+ list_for_each_entry(n, &context->names_list, list) {
+ /* can only match entries that have a name */
+ if (!n->name || n->type != type)
+ continue;
- if (!n->name)
+ /* if we found a parent, make sure this one is a child of it */
+ if (found_parent && (n->name != found_parent->name))
continue;
- /* strcmp() is the more likely scenario */
- if (!strcmp(dname, n->name) ||
- !audit_compare_dname_path(dname, n->name, &dirlen)) {
- if (inode)
- audit_copy_inode(n, NULL, inode);
- else
- n->ino = (unsigned long)-1;
- found_child = n->name;
- goto add_names;
+ if (!strcmp(dname, n->name->name) ||
+ !audit_compare_dname_path(dname, n->name->name,
+ found_parent ?
+ found_parent->name_len :
+ AUDIT_NAME_FULL)) {
+ found_child = n;
+ break;
}
}
-add_names:
if (!found_parent) {
- if (audit_inc_name_count(context, parent))
+ /* create a new, "anonymous" parent record */
+ n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
+ if (!n)
return;
- idx = context->name_count - 1;
- context->names[idx].name = NULL;
- audit_copy_inode(&context->names[idx], NULL, parent);
+ audit_copy_inode(n, NULL, parent);
}
if (!found_child) {
- if (audit_inc_name_count(context, inode))
+ found_child = audit_alloc_name(context, type);
+ if (!found_child)
return;
- idx = context->name_count - 1;
/* Re-use the name belonging to the slot for a matching parent
* directory. All names for this context are relinquished in
* audit_free_names() */
if (found_parent) {
- context->names[idx].name = found_parent;
- context->names[idx].name_len = AUDIT_NAME_FULL;
+ found_child->name = found_parent->name;
+ found_child->name_len = AUDIT_NAME_FULL;
/* don't call __putname() */
- context->names[idx].name_put = 0;
- } else {
- context->names[idx].name = NULL;
+ found_child->name_put = false;
}
-
- if (inode)
- audit_copy_inode(&context->names[idx], NULL, inode);
- else
- context->names[idx].ino = (unsigned long)-1;
}
+ if (inode)
+ audit_copy_inode(found_child, dentry, inode);
+ else
+ found_child->ino = (unsigned long)-1;
}
EXPORT_SYMBOL_GPL(__audit_inode_child);
@@ -2114,37 +2025,78 @@ int auditsc_get_stamp(struct audit_context *ctx,
/* global counter which is incremented every time something logs in */
static atomic_t session_id = ATOMIC_INIT(0);
+static int audit_set_loginuid_perm(kuid_t loginuid)
+{
+ /* if we are unset, we don't need privs */
+ if (!audit_loginuid_set(current))
+ return 0;
+ /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
+ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
+ return -EPERM;
+ /* it is set, you need permission */
+ if (!capable(CAP_AUDIT_CONTROL))
+ return -EPERM;
+ /* reject if this is not an unset and we don't allow that */
+ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
+ return -EPERM;
+ return 0;
+}
+
+static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
+ unsigned int oldsessionid, unsigned int sessionid,
+ int rc)
+{
+ struct audit_buffer *ab;
+ uid_t uid, oldloginuid, loginuid;
+
+ if (!audit_enabled)
+ return;
+
+ uid = from_kuid(&init_user_ns, task_uid(current));
+ oldloginuid = from_kuid(&init_user_ns, koldloginuid);
+ loginuid = from_kuid(&init_user_ns, kloginuid),
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+ if (!ab)
+ return;
+ audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
+ oldloginuid, loginuid, oldsessionid, sessionid, !rc);
+ audit_log_end(ab);
+}
+
/**
- * audit_set_loginuid - set a task's audit_context loginuid
- * @task: task whose audit context is being modified
+ * audit_set_loginuid - set current task's audit_context loginuid
* @loginuid: loginuid value
*
* Returns 0.
*
* Called (set) from fs/proc/base.c::proc_loginuid_write().
*/
-int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
+int audit_set_loginuid(kuid_t loginuid)
{
- unsigned int sessionid = atomic_inc_return(&session_id);
- struct audit_context *context = task->audit_context;
+ struct task_struct *task = current;
+ unsigned int oldsessionid, sessionid = (unsigned int)-1;
+ kuid_t oldloginuid;
+ int rc;
- if (context && context->in_syscall) {
- struct audit_buffer *ab;
+ oldloginuid = audit_get_loginuid(current);
+ oldsessionid = audit_get_sessionid(current);
+
+ rc = audit_set_loginuid_perm(loginuid);
+ if (rc)
+ goto out;
+
+ /* are we setting or clearing? */
+ if (uid_valid(loginuid))
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
- if (ab) {
- audit_log_format(ab, "login pid=%d uid=%u "
- "old auid=%u new auid=%u"
- " old ses=%u new ses=%u",
- task->pid, task_uid(task),
- task->loginuid, loginuid,
- task->sessionid, sessionid);
- audit_log_end(ab);
- }
- }
task->sessionid = sessionid;
task->loginuid = loginuid;
- return 0;
+out:
+ audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
+ return rc;
}
/**
@@ -2154,7 +2106,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
* @attr: queue attributes
*
*/
-void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
+void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
struct audit_context *context = current->audit_context;
@@ -2254,7 +2206,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
*
* Called only after audit_ipc_obj().
*/
-void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
struct audit_context *context = current->audit_context;
@@ -2265,44 +2217,31 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mod
context->ipc.has_perm = 1;
}
-int audit_bprm(struct linux_binprm *bprm)
+void __audit_bprm(struct linux_binprm *bprm)
{
- struct audit_aux_data_execve *ax;
struct audit_context *context = current->audit_context;
- if (likely(!audit_enabled || !context || context->dummy))
- return 0;
-
- ax = kmalloc(sizeof(*ax), GFP_KERNEL);
- if (!ax)
- return -ENOMEM;
-
- ax->argc = bprm->argc;
- ax->envc = bprm->envc;
- ax->mm = bprm->mm;
- ax->d.type = AUDIT_EXECVE;
- ax->d.next = context->aux;
- context->aux = (void *)ax;
- return 0;
+ context->type = AUDIT_EXECVE;
+ context->execve.argc = bprm->argc;
}
/**
* audit_socketcall - record audit data for sys_socketcall
- * @nargs: number of args
+ * @nargs: number of args, which should not be more than AUDITSC_ARGS.
* @args: args array
*
*/
-void audit_socketcall(int nargs, unsigned long *args)
+int __audit_socketcall(int nargs, unsigned long *args)
{
struct audit_context *context = current->audit_context;
- if (likely(!context || context->dummy))
- return;
-
+ if (nargs <= 0 || nargs > AUDITSC_ARGS || !args)
+ return -EINVAL;
context->type = AUDIT_SOCKETCALL;
context->socketcall.nargs = nargs;
memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
+ return 0;
}
/**
@@ -2325,13 +2264,10 @@ void __audit_fd_pair(int fd1, int fd2)
*
* Returns 0 for success or NULL context or < 0 on error.
*/
-int audit_sockaddr(int len, void *a)
+int __audit_sockaddr(int len, void *a)
{
struct audit_context *context = current->audit_context;
- if (likely(!context || context->dummy))
- return 0;
-
if (!context->sockaddr) {
void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
if (!p)
@@ -2348,7 +2284,7 @@ void __audit_ptrace(struct task_struct *t)
{
struct audit_context *context = current->audit_context;
- context->target_pid = t->pid;
+ context->target_pid = task_pid_nr(t);
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
@@ -2369,12 +2305,12 @@ int __audit_signal_info(int sig, struct task_struct *t)
struct audit_aux_data_pids *axp;
struct task_struct *tsk = current;
struct audit_context *ctx = tsk->audit_context;
- uid_t uid = current_uid(), t_uid = task_uid(t);
+ kuid_t uid = current_uid(), t_uid = task_uid(t);
if (audit_pid && t->tgid == audit_pid) {
if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = tsk->pid;
- if (tsk->loginuid != -1)
+ audit_sig_pid = task_pid_nr(tsk);
+ if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
audit_sig_uid = uid;
@@ -2387,7 +2323,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
/* optimize the common case by putting first signal recipient directly
* in audit_context */
if (!ctx->target_pid) {
- ctx->target_pid = t->tgid;
+ ctx->target_pid = task_tgid_nr(t);
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
@@ -2408,7 +2344,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
}
BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
- axp->target_pid[axp->pid_count] = t->tgid;
+ axp->target_pid[axp->pid_count] = task_tgid_nr(t);
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
@@ -2467,24 +2403,58 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
/**
* __audit_log_capset - store information about the arguments to the capset syscall
- * @pid: target pid of the capset call
* @new: the new credentials
* @old: the old (current) credentials
*
* Record the aguments userspace sent to sys_capset for later printing by the
* audit system if applicable
*/
-void __audit_log_capset(pid_t pid,
- const struct cred *new, const struct cred *old)
+void __audit_log_capset(const struct cred *new, const struct cred *old)
{
struct audit_context *context = current->audit_context;
- context->capset.pid = pid;
+ context->capset.pid = task_pid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
context->type = AUDIT_CAPSET;
}
+void __audit_mmap_fd(int fd, int flags)
+{
+ struct audit_context *context = current->audit_context;
+ context->mmap.fd = fd;
+ context->mmap.flags = flags;
+ context->type = AUDIT_MMAP;
+}
+
+static void audit_log_task(struct audit_buffer *ab)
+{
+ kuid_t auid, uid;
+ kgid_t gid;
+ unsigned int sessionid;
+ struct mm_struct *mm = current->mm;
+
+ auid = audit_get_loginuid(current);
+ sessionid = audit_get_sessionid(current);
+ current_uid_gid(&uid, &gid);
+
+ audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
+ from_kuid(&init_user_ns, auid),
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid),
+ sessionid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
+ audit_log_untrustedstring(ab, current->comm);
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ if (mm->exe_file)
+ audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
+ up_read(&mm->mmap_sem);
+ } else
+ audit_log_format(ab, " exe=(null)");
+}
+
/**
* audit_core_dumps - record information about processes that end abnormally
* @signr: signal value
@@ -2495,10 +2465,6 @@ void __audit_log_capset(pid_t pid,
void audit_core_dumps(long signr)
{
struct audit_buffer *ab;
- u32 sid;
- uid_t auid = audit_get_loginuid(current), uid;
- gid_t gid;
- unsigned int sessionid = audit_get_sessionid(current);
if (!audit_enabled)
return;
@@ -2507,24 +2473,26 @@ void audit_core_dumps(long signr)
return;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
- current_uid_gid(&uid, &gid);
- audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
- auid, uid, gid, sessionid);
- security_task_getsecid(current, &sid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
+ if (unlikely(!ab))
+ return;
+ audit_log_task(ab);
+ audit_log_format(ab, " sig=%ld", signr);
+ audit_log_end(ab);
+}
- if (security_secid_to_secctx(sid, &ctx, &len))
- audit_log_format(ab, " ssid=%u", sid);
- else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
- audit_log_format(ab, " pid=%d comm=", current->pid);
- audit_log_untrustedstring(ab, current->comm);
+void __audit_seccomp(unsigned long syscall, long signr, int code)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
+ if (unlikely(!ab))
+ return;
+ audit_log_task(ab);
audit_log_format(ab, " sig=%ld", signr);
+ audit_log_format(ab, " syscall=%ld", syscall);
+ audit_log_format(ab, " compat=%d", is_compat_task());
+ audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
+ audit_log_format(ab, " code=0x%x", code);
audit_log_end(ab);
}
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a5e026bc45c..1323360d90e 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -19,8 +19,8 @@
static void backtrace_test_normal(void)
{
- printk("Testing a backtrace from process context.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a backtrace from process context.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
dump_stack();
}
@@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
static void backtrace_test_irq(void)
{
- printk("Testing a backtrace from irq context.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a backtrace from irq context.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
init_completion(&backtrace_work);
tasklet_schedule(&backtrace_tasklet);
@@ -51,8 +51,8 @@ static void backtrace_test_saved(void)
struct stack_trace trace;
unsigned long entries[8];
- printk("Testing a saved backtrace.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a saved backtrace.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
trace.nr_entries = 0;
trace.max_entries = ARRAY_SIZE(entries);
@@ -65,19 +65,19 @@ static void backtrace_test_saved(void)
#else
static void backtrace_test_saved(void)
{
- printk("Saved backtrace test skipped.\n");
+ pr_info("Saved backtrace test skipped.\n");
}
#endif
static int backtrace_regression_test(void)
{
- printk("====[ backtrace testing ]===========\n");
+ pr_info("====[ backtrace testing ]===========\n");
backtrace_test_normal();
backtrace_test_irq();
backtrace_test_saved();
- printk("====[ end of backtrace testing ]====\n");
+ pr_info("====[ end of backtrace testing ]====\n");
return 0;
}
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c13..9fd4246b04b 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,19 @@
#include <linux/page-flags.h>
#include <linux/mmzone.h>
#include <linux/kbuild.h>
+#include <linux/page_cgroup.h>
+#include <linux/log2.h>
+#include <linux/spinlock_types.h>
void foo(void)
{
/* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+ DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
+#ifdef CONFIG_SMP
+ DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+#endif
+ DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
/* End of constants */
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e60521..a5cf13c018c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,27 +7,24 @@
* 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
-#include "cred-internals.h"
/*
* Leveraged for setting/resetting capabilities
*/
const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-const kernel_cap_t __cap_full_set = CAP_FULL_SET;
-const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
-
EXPORT_SYMBOL(__cap_empty_set);
-EXPORT_SYMBOL(__cap_full_set);
-EXPORT_SYMBOL(__cap_init_eff_set);
int file_caps_enabled = 1;
@@ -46,15 +43,10 @@ __setup("no_file_caps", file_caps_disable);
static void warn_legacy_capability_use(void)
{
- static int warned;
- if (!warned) {
- char name[sizeof(current->comm)];
-
- printk(KERN_INFO "warning: `%s' uses 32-bit capabilities"
- " (legacy support in use)\n",
- get_task_comm(name, current));
- warned = 1;
- }
+ char name[sizeof(current->comm)];
+
+ pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
+ get_task_comm(name, current));
}
/*
@@ -75,16 +67,10 @@ static void warn_legacy_capability_use(void)
static void warn_deprecated_v2(void)
{
- static int warned;
-
- if (!warned) {
- char name[sizeof(current->comm)];
+ char name[sizeof(current->comm)];
- printk(KERN_INFO "warning: `%s' uses deprecated v2"
- " capabilities in a way that may be insecure.\n",
- get_task_comm(name, current));
- warned = 1;
- }
+ pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
+ get_task_comm(name, current));
}
/*
@@ -135,7 +121,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
if (pid && (pid != task_pid_vnr(current))) {
struct task_struct *target;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
target = find_task_by_vpid(pid);
if (!target)
@@ -143,7 +129,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
else
ret = security_capget(target, pEp, pIp, pPp);
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
} else
ret = security_capget(current, pEp, pIp, pPp);
@@ -202,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
*
* An alternative would be to return an error here
* (-ERANGE), but that causes legacy applications to
- * unexpectidly fail; the capget/modify/capset aborts
+ * unexpectedly fail; the capget/modify/capset aborts
* before modification is attempted and the application
* fails.
*/
@@ -281,7 +267,7 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
if (ret < 0)
goto error;
- audit_log_capset(pid, new, current_cred());
+ audit_log_capset(new, current_cred());
return commit_creds(new);
@@ -291,7 +277,88 @@ error:
}
/**
- * capable - Determine if the current task has a superior capability in effect
+ * has_ns_capability - Does a task have a capability in a specific user ns
+ * @t: The task in question
+ * @ns: target user namespace
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the specified user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_ns_capability(struct task_struct *t,
+ struct user_namespace *ns, int cap)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = security_capable(__task_cred(t), ns, cap);
+ rcu_read_unlock();
+
+ return (ret == 0);
+}
+
+/**
+ * has_capability - Does a task have a capability in init_user_ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the initial user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability(struct task_struct *t, int cap)
+{
+ return has_ns_capability(t, &init_user_ns, cap);
+}
+
+/**
+ * has_ns_capability_noaudit - Does a task have a capability (unaudited)
+ * in a specific user ns.
+ * @t: The task in question
+ * @ns: target user namespace
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the specified user namespace, false if not.
+ * Do not write an audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_ns_capability_noaudit(struct task_struct *t,
+ struct user_namespace *ns, int cap)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = security_capable_noaudit(__task_cred(t), ns, cap);
+ rcu_read_unlock();
+
+ return (ret == 0);
+}
+
+/**
+ * has_capability_noaudit - Does a task have a capability (unaudited) in the
+ * initial user ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to init_user_ns, false if not. Don't write an
+ * audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability_noaudit(struct task_struct *t, int cap)
+{
+ return has_ns_capability_noaudit(t, &init_user_ns, cap);
+}
+
+/**
+ * ns_capable - Determine if the current task has a superior capability in effect
+ * @ns: The usernamespace we want the capability in
* @cap: The capability to be tested for
*
* Return true if the current task has the given superior capability currently
@@ -300,17 +367,76 @@ error:
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
-int capable(int cap)
+bool ns_capable(struct user_namespace *ns, int cap)
{
if (unlikely(!cap_valid(cap))) {
- printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
+ pr_crit("capable() called with invalid cap=%u\n", cap);
BUG();
}
- if (security_capable(cap) == 0) {
+ if (security_capable(current_cred(), ns, cap) == 0) {
current->flags |= PF_SUPERPRIV;
- return 1;
+ return true;
}
- return 0;
+ return false;
+}
+EXPORT_SYMBOL(ns_capable);
+
+/**
+ * file_ns_capable - Determine if the file's opener had a capability in effect
+ * @file: The file we want to check
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if task that opened the file had a capability in effect
+ * when the file was opened.
+ *
+ * This does not set PF_SUPERPRIV because the caller may not
+ * actually be privileged.
+ */
+bool file_ns_capable(const struct file *file, struct user_namespace *ns,
+ int cap)
+{
+ if (WARN_ON_ONCE(!cap_valid(cap)))
+ return false;
+
+ if (security_capable(file->f_cred, ns, cap) == 0)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(file_ns_capable);
+
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool capable(int cap)
+{
+ return ns_capable(&init_user_ns, cap);
}
EXPORT_SYMBOL(capable);
+
+/**
+ * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
+ * @inode: The inode in question
+ * @cap: The capability in question
+ *
+ * Return true if the current task has the given capability targeted at
+ * its own user namespace and that the given inode's uid and gid are
+ * mapped into the current user namespace.
+ */
+bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
+{
+ struct user_namespace *ns = current_user_ns();
+
+ return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
+ kgid_has_mapping(ns, inode->i_gid);
+}
+EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index aa3bee56644..70776aec256 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
* Based originally on the cpuset system, extracted by Paul Menage
* Copyright (C) 2006 Google, Inc
*
+ * Notifications support
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
* Copyright notices from the original cpuset code:
* --------------------------------------------------
* Copyright (C) 2003 BULL SA.
@@ -22,12 +26,16 @@
* distribution for more details.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/cgroup.h>
+#include <linux/cred.h>
#include <linux/ctype.h>
#include <linux/errno.h>
-#include <linux/fs.h>
+#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
+#include <linux/magic.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
@@ -35,129 +43,135 @@
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
-#include <linux/backing-dev.h>
-#include <linux/seq_file.h>
#include <linux/slab.h>
-#include <linux/magic.h>
#include <linux/spinlock.h>
+#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
-#include <linux/hash.h>
-#include <linux/namei.h>
-#include <linux/smp_lock.h>
+#include <linux/hashtable.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+#include <linux/kthread.h>
+#include <linux/delay.h>
-#include <asm/atomic.h>
-
-static DEFINE_MUTEX(cgroup_mutex);
-
-/* Generate an array of cgroup subsystem pointers */
-#define SUBSYS(_x) &_x ## _subsys,
-
-static struct cgroup_subsys *subsys[] = {
-#include <linux/cgroup_subsys.h>
-};
-
-#define MAX_CGROUP_ROOT_NAMELEN 64
+#include <linux/atomic.h>
/*
- * A cgroupfs_root represents the root of a cgroup hierarchy,
- * and may be associated with a superblock to form an active
- * hierarchy
+ * pidlists linger the following amount before being destroyed. The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
*/
-struct cgroupfs_root {
- struct super_block *sb;
-
- /*
- * The bitmask of subsystems intended to be attached to this
- * hierarchy
- */
- unsigned long subsys_bits;
+#define CGROUP_PIDLIST_DESTROY_DELAY HZ
- /* Unique id for this hierarchy. */
- int hierarchy_id;
+#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
+ MAX_CFTYPE_NAME + 2)
- /* The bitmask of subsystems currently attached to this hierarchy */
- unsigned long actual_subsys_bits;
+/*
+ * cgroup_mutex is the master lock. Any modification to cgroup or its
+ * hierarchy must be performed while holding it.
+ *
+ * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * objects, and the chain of tasks off each css_set.
+ *
+ * These locks are exported if CONFIG_PROVE_RCU so that accessors in
+ * cgroup.h can use them for lockdep annotations.
+ */
+#ifdef CONFIG_PROVE_RCU
+DEFINE_MUTEX(cgroup_mutex);
+DECLARE_RWSEM(css_set_rwsem);
+EXPORT_SYMBOL_GPL(cgroup_mutex);
+EXPORT_SYMBOL_GPL(css_set_rwsem);
+#else
+static DEFINE_MUTEX(cgroup_mutex);
+static DECLARE_RWSEM(css_set_rwsem);
+#endif
- /* A list running through the attached subsystems */
- struct list_head subsys_list;
+/*
+ * Protects cgroup_idr and css_idr so that IDs can be released without
+ * grabbing cgroup_mutex.
+ */
+static DEFINE_SPINLOCK(cgroup_idr_lock);
- /* The root cgroup for this hierarchy */
- struct cgroup top_cgroup;
+/*
+ * Protects cgroup_subsys->release_agent_path. Modifying it also requires
+ * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
- /* Tracks how many cgroups are currently defined in hierarchy.*/
- int number_of_cgroups;
+#define cgroup_assert_mutex_or_rcu_locked() \
+ rcu_lockdep_assert(rcu_read_lock_held() || \
+ lockdep_is_held(&cgroup_mutex), \
+ "cgroup_mutex or RCU read lock required");
- /* A list running through the active hierarchies */
- struct list_head root_list;
+/*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
- /* Hierarchy-specific flags */
- unsigned long flags;
+/*
+ * pidlist destructions need to be flushed on cgroup destruction. Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
- /* The path to use for release notifications. */
- char release_agent_path[PATH_MAX];
+/* generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
+static struct cgroup_subsys *cgroup_subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
- /* The name for this hierarchy - may be empty */
- char name[MAX_CGROUP_ROOT_NAMELEN];
+/* array of cgroup subsystem names */
+#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
+static const char *cgroup_subsys_name[] = {
+#include <linux/cgroup_subsys.h>
};
+#undef SUBSYS
/*
- * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
- * subsystems that are otherwise unattached - it never has more than a
- * single cgroup, and all tasks are part of that cgroup.
+ * The default hierarchy, reserved for the subsystems that are otherwise
+ * unattached - it never has more than a single cgroup, and all tasks are
+ * part of that cgroup.
*/
-static struct cgroupfs_root rootnode;
+struct cgroup_root cgrp_dfl_root;
/*
- * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
- * cgroup_subsys->use_id != 0.
+ * The default hierarchy always exists but is hidden until mounted for the
+ * first time. This is for backward compatibility.
*/
-#define CSS_ID_MAX (65535)
-struct css_id {
- /*
- * The css to which this ID points. This pointer is set to valid value
- * after cgroup is populated. If cgroup is removed, this will be NULL.
- * This pointer is expected to be RCU-safe because destroy()
- * is called after synchronize_rcu(). But for safe use, css_is_removed()
- * css_tryget() should be used for avoiding race.
- */
- struct cgroup_subsys_state *css;
- /*
- * ID of this css.
- */
- unsigned short id;
- /*
- * Depth in hierarchy which this ID belongs to.
- */
- unsigned short depth;
- /*
- * ID is freed by RCU. (and lookup routine is RCU safe.)
- */
- struct rcu_head rcu_head;
- /*
- * Hierarchy of CSS ID belongs to.
- */
- unsigned short stack[0]; /* Array of Length (depth+1) */
-};
+static bool cgrp_dfl_root_visible;
+/* some controllers are not supported in the default hierarchy */
+static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
+#ifdef CONFIG_CGROUP_DEBUG
+ | (1 << debug_cgrp_id)
+#endif
+ ;
/* The list of hierarchy roots */
-static LIST_HEAD(roots);
-static int root_count;
+static LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
-static DEFINE_IDA(hierarchy_ida);
-static int next_hierarchy_id;
-static DEFINE_SPINLOCK(hierarchy_id_lock);
+/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
+static DEFINE_IDR(cgroup_hierarchy_idr);
-/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
-#define dummytop (&rootnode.top_cgroup)
+/*
+ * Assign a monotonically increasing serial number to csses. It guarantees
+ * cgroups with bigger numbers are newer than those with smaller numbers.
+ * Also, as csses are always appended to the parent's ->children list, it
+ * guarantees that sibling csses are always sorted in the ascending serial
+ * number order on the list. Protected by cgroup_mutex.
+ */
+static u64 css_serial_nr_next = 1;
/* This flag indicates whether tasks in the fork and exit paths should
* check for fork/exit handlers to call. This avoids us having to do
@@ -166,16 +180,152 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
*/
static int need_forkexit_callback __read_mostly;
+static struct cftype cgroup_base_files[];
+
+static void cgroup_put(struct cgroup *cgrp);
+static int rebind_subsystems(struct cgroup_root *dst_root,
+ unsigned int ss_mask);
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+static void css_release(struct percpu_ref *ref);
+static void kill_css(struct cgroup_subsys_state *css);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add);
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
+
+/* IDR wrappers which synchronize using cgroup_idr_lock */
+static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
+ gfp_t gfp_mask)
+{
+ int ret;
+
+ idr_preload(gfp_mask);
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+ spin_unlock_bh(&cgroup_idr_lock);
+ idr_preload_end();
+ return ret;
+}
+
+static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
+{
+ void *ret;
+
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_replace(idr, ptr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+ return ret;
+}
+
+static void cgroup_idr_remove(struct idr *idr, int id)
+{
+ spin_lock_bh(&cgroup_idr_lock);
+ idr_remove(idr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+}
+
+static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+
+ if (parent_css)
+ return container_of(parent_css, struct cgroup, self);
+ return NULL;
+}
+
+/**
+ * cgroup_css - obtain a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
+ * function must be called either under cgroup_mutex or rcu_read_lock() and
+ * the caller is responsible for pinning the returned css if it wants to
+ * keep accessing it outside the said locks. This function may return
+ * %NULL if @cgrp doesn't have @subsys_id enabled.
+ */
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ if (ss)
+ return rcu_dereference_check(cgrp->subsys[ss->id],
+ lockdep_is_held(&cgroup_mutex));
+ else
+ return &cgrp->self;
+}
+
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Similar to cgroup_css() but returns the effctive css, which is defined
+ * as the matching css of the nearest ancestor including self which has @ss
+ * enabled. If @ss is associated with the hierarchy @cgrp is on, this
+ * function is guaranteed to return non-NULL css.
+ */
+static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!ss)
+ return &cgrp->self;
+
+ if (!(cgrp->root->subsys_mask & (1 << ss->id)))
+ return NULL;
+
+ while (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
+ cgrp = cgroup_parent(cgrp);
+
+ return cgroup_css(cgrp, ss);
+}
+
/* convenient tests for these bits */
-inline int cgroup_is_removed(const struct cgroup *cgrp)
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
- return test_bit(CGRP_REMOVED, &cgrp->flags);
+ return !(cgrp->self.flags & CSS_ONLINE);
}
-/* bits in struct cgroupfs_root flags field */
-enum {
- ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
-};
+struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
+{
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of_cft(of);
+
+ /*
+ * This is open and unprotected implementation of cgroup_css().
+ * seq_css() is only called from a kernfs file operation which has
+ * an active reference on the file. Because all the subsystem
+ * files are drained before a css is disassociated with a cgroup,
+ * the matching css from the cgroup's subsys table is guaranteed to
+ * be and stay valid until the enclosing operation is complete.
+ */
+ if (cft->ss)
+ return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
+ else
+ return &cgrp->self;
+}
+EXPORT_SYMBOL_GPL(of_css);
+
+/**
+ * cgroup_is_descendant - test ancestry
+ * @cgrp: the cgroup to be tested
+ * @ancestor: possible ancestor of @cgrp
+ *
+ * Test whether @cgrp is a descendant of @ancestor. It also returns %true
+ * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
+ * and @ancestor are accessible.
+ */
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
+{
+ while (cgrp) {
+ if (cgrp == ancestor)
+ return true;
+ cgrp = cgroup_parent(cgrp);
+ }
+ return false;
+}
static int cgroup_is_releasable(const struct cgroup *cgrp)
{
@@ -190,58 +340,138 @@ static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
-/*
- * for_each_subsys() allows you to iterate on each subsystem attached to
- * an active hierarchy
+/**
+ * for_each_css - iterate all css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = rcu_dereference_check( \
+ (cgrp)->subsys[(ssid)], \
+ lockdep_is_held(&cgroup_mutex)))) { } \
+ else
+
+/**
+ * for_each_e_css - iterate all effective css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
*/
-#define for_each_subsys(_root, _ss) \
-list_for_each_entry(_ss, &_root->subsys_list, sibling)
+#define for_each_e_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
+ ; \
+ else
-/* for_each_active_root() allows you to iterate across the active hierarchies */
-#define for_each_active_root(_root) \
-list_for_each_entry(_root, &roots, root_list)
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
+ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+/* iterate across the hierarchies */
+#define for_each_root(root) \
+ list_for_each_entry((root), &cgroup_roots, root_list)
+
+/* iterate over child cgrps, lock should be held throughout iteration */
+#define cgroup_for_each_live_child(child, cgrp) \
+ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ cgroup_is_dead(child); })) \
+ ; \
+ else
/* the list of cgroups eligible for automatic release. Protected by
* release_list_lock */
static LIST_HEAD(release_list);
-static DEFINE_SPINLOCK(release_list_lock);
+static DEFINE_RAW_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
-/* Link structure for associating css_set objects with cgroups */
-struct cg_cgroup_link {
- /*
- * List running through cg_cgroup_links associated with a
- * cgroup, anchored on cgroup->css_sets
- */
- struct list_head cgrp_link_list;
- struct cgroup *cgrp;
- /*
- * List running through cg_cgroup_links pointing at a
- * single css_set object, anchored on css_set->cg_links
- */
- struct list_head cg_link_list;
- struct css_set *cg;
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies. In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+ /* the cgroup and css_set this link associates */
+ struct cgroup *cgrp;
+ struct css_set *cset;
+
+ /* list of cgrp_cset_links anchored at cgrp->cset_links */
+ struct list_head cset_link;
+
+ /* list of cgrp_cset_links anchored at css_set->cgrp_links */
+ struct list_head cgrp_link;
};
-/* The default css_set - used by init and its children prior to any
+/*
+ * The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
* for each subsystem. Also used to anchor the list of css_sets. Not
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
+struct css_set init_css_set = {
+ .refcount = ATOMIC_INIT(1),
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
+ .tasks = LIST_HEAD_INIT(init_css_set.tasks),
+ .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+};
+
+static int css_set_count = 1; /* 1 for init_css_set */
+
+/**
+ * cgroup_update_populated - updated populated count of a cgroup
+ * @cgrp: the target cgroup
+ * @populated: inc or dec populated count
+ *
+ * @cgrp is either getting the first task (css_set) or losing the last.
+ * Update @cgrp->populated_cnt accordingly. The count is propagated
+ * towards root so that a given cgroup's populated_cnt is zero iff the
+ * cgroup and all its descendants are empty.
+ *
+ * @cgrp's interface file "cgroup.populated" is zero if
+ * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
+ * changes from or to zero, userland is notified that the content of the
+ * interface file has changed. This can be used to detect when @cgrp and
+ * its descendants become populated or empty.
+ */
+static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+{
+ lockdep_assert_held(&css_set_rwsem);
-static struct css_set init_css_set;
-static struct cg_cgroup_link init_css_set_link;
+ do {
+ bool trigger;
+
+ if (populated)
+ trigger = !cgrp->populated_cnt++;
+ else
+ trigger = !--cgrp->populated_cnt;
-static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+ if (!trigger)
+ break;
-/* css_set_lock protects the list of css_set objects, and the
- * chain of tasks off each css_set. Nests outside task->alloc_lock
- * due to cgroup_iter_start() */
-static DEFINE_RWLOCK(css_set_lock);
-static int css_set_count;
+ if (cgrp->populated_kn)
+ kernfs_notify(cgrp->populated_kn);
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+}
/*
* hash table for cgroup groups. This improves the performance to find
@@ -249,147 +479,136 @@ static int css_set_count;
* account cgroups in empty hierarchies.
*/
#define CSS_SET_HASH_BITS 7
-#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
+ unsigned long key = 0UL;
+ struct cgroup_subsys *ss;
int i;
- int index;
- unsigned long tmp = 0UL;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
- tmp += (unsigned long)css[i];
- tmp = (tmp >> 16) ^ tmp;
+ for_each_subsys(ss, i)
+ key += (unsigned long)css[i];
+ key = (key >> 16) ^ key;
- index = hash_long(tmp, CSS_SET_HASH_BITS);
-
- return &css_set_table[index];
+ return key;
}
-static void free_css_set_rcu(struct rcu_head *obj)
+static void put_css_set_locked(struct css_set *cset, bool taskexit)
{
- struct css_set *cg = container_of(obj, struct css_set, rcu_head);
- kfree(cg);
-}
+ struct cgrp_cset_link *link, *tmp_link;
+ struct cgroup_subsys *ss;
+ int ssid;
-/* We don't maintain the lists running through each css_set to its
- * task until after the first call to cgroup_iter_start(). This
- * reduces the fork()/exit() overhead for people who have cgroups
- * compiled into their kernel but not actually in use */
-static int use_task_css_set_links __read_mostly;
+ lockdep_assert_held(&css_set_rwsem);
-static void __put_css_set(struct css_set *cg, int taskexit)
-{
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
- /*
- * Ensure that the refcount doesn't hit zero while any readers
- * can see it. Similar to atomic_dec_and_lock(), but for an
- * rwlock
- */
- if (atomic_add_unless(&cg->refcount, -1, 1))
- return;
- write_lock(&css_set_lock);
- if (!atomic_dec_and_test(&cg->refcount)) {
- write_unlock(&css_set_lock);
+ if (!atomic_dec_and_test(&cset->refcount))
return;
- }
/* This css_set is dead. unlink it and release cgroup refcounts */
- hlist_del(&cg->hlist);
+ for_each_subsys(ss, ssid)
+ list_del(&cset->e_cset_node[ssid]);
+ hash_del(&cset->hlist);
css_set_count--;
- list_for_each_entry_safe(link, saved_link, &cg->cg_links,
- cg_link_list) {
+ list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
struct cgroup *cgrp = link->cgrp;
- list_del(&link->cg_link_list);
- list_del(&link->cgrp_link_list);
- if (atomic_dec_and_test(&cgrp->count) &&
- notify_on_release(cgrp)) {
- if (taskexit)
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
+
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+
+ /* @cgrp can't go away while we're holding css_set_rwsem */
+ if (list_empty(&cgrp->cset_links)) {
+ cgroup_update_populated(cgrp, false);
+ if (notify_on_release(cgrp)) {
+ if (taskexit)
+ set_bit(CGRP_RELEASABLE, &cgrp->flags);
+ check_for_release(cgrp);
+ }
}
kfree(link);
}
- write_unlock(&css_set_lock);
- call_rcu(&cg->rcu_head, free_css_set_rcu);
+ kfree_rcu(cset, rcu_head);
}
-/*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cg)
+static void put_css_set(struct css_set *cset, bool taskexit)
{
- atomic_inc(&cg->refcount);
-}
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for an
+ * rwlock
+ */
+ if (atomic_add_unless(&cset->refcount, -1, 1))
+ return;
-static inline void put_css_set(struct css_set *cg)
-{
- __put_css_set(cg, 0);
+ down_write(&css_set_rwsem);
+ put_css_set_locked(cset, taskexit);
+ up_write(&css_set_rwsem);
}
-static inline void put_css_set_taskexit(struct css_set *cg)
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
{
- __put_css_set(cg, 1);
+ atomic_inc(&cset->refcount);
}
-/*
+/**
* compare_css_sets - helper function for find_existing_css_set().
- * @cg: candidate css_set being tested
- * @old_cg: existing css_set for a task
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
* @new_cgrp: cgroup that's being entered by the task
* @template: desired set of css pointers in css_set (pre-calculated)
*
- * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
* which "new_cgrp" belongs to, for which it should match "new_cgrp".
*/
-static bool compare_css_sets(struct css_set *cg,
- struct css_set *old_cg,
+static bool compare_css_sets(struct css_set *cset,
+ struct css_set *old_cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
struct list_head *l1, *l2;
- if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
- /* Not all subsystems matched */
+ /*
+ * On the default hierarchy, there can be csets which are
+ * associated with the same set of cgroups but different csses.
+ * Let's first ensure that csses match.
+ */
+ if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
- }
/*
* Compare cgroup pointers in order to distinguish between
- * different cgroups in heirarchies with no subsystems. We
- * could get by with just this check alone (and skip the
- * memcmp above) but on most setups the memcmp check will
- * avoid the need for this more expensive check on almost all
- * candidates.
+ * different cgroups in hierarchies. As different cgroups may
+ * share the same effective css, this comparison is always
+ * necessary.
*/
-
- l1 = &cg->cg_links;
- l2 = &old_cg->cg_links;
+ l1 = &cset->cgrp_links;
+ l2 = &old_cset->cgrp_links;
while (1) {
- struct cg_cgroup_link *cgl1, *cgl2;
- struct cgroup *cg1, *cg2;
+ struct cgrp_cset_link *link1, *link2;
+ struct cgroup *cgrp1, *cgrp2;
l1 = l1->next;
l2 = l2->next;
/* See if we reached the end - both lists are equal length. */
- if (l1 == &cg->cg_links) {
- BUG_ON(l2 != &old_cg->cg_links);
+ if (l1 == &cset->cgrp_links) {
+ BUG_ON(l2 != &old_cset->cgrp_links);
break;
} else {
- BUG_ON(l2 == &old_cg->cg_links);
+ BUG_ON(l2 == &old_cset->cgrp_links);
}
/* Locate the cgroups associated with these links. */
- cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
- cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
- cg1 = cgl1->cgrp;
- cg2 = cgl2->cgrp;
+ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+ link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+ cgrp1 = link1->cgrp;
+ cgrp2 = link2->cgrp;
/* Hierarchies should be linked in the same order. */
- BUG_ON(cg1->root != cg2->root);
+ BUG_ON(cgrp1->root != cgrp2->root);
/*
* If this hierarchy is the hierarchy of the cgroup
@@ -398,236 +617,340 @@ static bool compare_css_sets(struct css_set *cg,
* hierarchy, then this css_set should point to the
* same cgroup as the old css_set.
*/
- if (cg1->root == new_cgrp->root) {
- if (cg1 != new_cgrp)
+ if (cgrp1->root == new_cgrp->root) {
+ if (cgrp1 != new_cgrp)
return false;
} else {
- if (cg1 != cg2)
+ if (cgrp1 != cgrp2)
return false;
}
}
return true;
}
-/*
- * find_existing_css_set() is a helper for
- * find_css_set(), and checks to see whether an existing
- * css_set is suitable.
- *
- * oldcg: the cgroup group that we're using before the cgroup
- * transition
- *
- * cgrp: the cgroup that we're moving into
- *
- * template: location in which to build the desired set of subsystem
- * state objects for the new cgroup group
+/**
+ * find_existing_css_set - init css array and find the matching css_set
+ * @old_cset: the css_set that we're using before the cgroup transition
+ * @cgrp: the cgroup that we're moving into
+ * @template: out param for the new set of csses, should be clear on entry
*/
-static struct css_set *find_existing_css_set(
- struct css_set *oldcg,
- struct cgroup *cgrp,
- struct cgroup_subsys_state *template[])
+static struct css_set *find_existing_css_set(struct css_set *old_cset,
+ struct cgroup *cgrp,
+ struct cgroup_subsys_state *template[])
{
+ struct cgroup_root *root = cgrp->root;
+ struct cgroup_subsys *ss;
+ struct css_set *cset;
+ unsigned long key;
int i;
- struct cgroupfs_root *root = cgrp->root;
- struct hlist_head *hhead;
- struct hlist_node *node;
- struct css_set *cg;
-
- /* Built the set of subsystem state objects that we want to
- * see in the new css_set */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- if (root->subsys_bits & (1UL << i)) {
- /* Subsystem is in this hierarchy. So we want
- * the subsystem state from the new
- * cgroup */
- template[i] = cgrp->subsys[i];
+
+ /*
+ * Build the set of subsystem state objects that we want to see in the
+ * new css_set. while subsystems can change globally, the entries here
+ * won't change, so no need for locking.
+ */
+ for_each_subsys(ss, i) {
+ if (root->subsys_mask & (1UL << i)) {
+ /*
+ * @ss is in this hierarchy, so we want the
+ * effective css from @cgrp.
+ */
+ template[i] = cgroup_e_css(cgrp, ss);
} else {
- /* Subsystem is not in this hierarchy, so we
- * don't want to change the subsystem state */
- template[i] = oldcg->subsys[i];
+ /*
+ * @ss is not in this hierarchy, so we don't want
+ * to change the css.
+ */
+ template[i] = old_cset->subsys[i];
}
}
- hhead = css_set_hash(template);
- hlist_for_each_entry(cg, node, hhead, hlist) {
- if (!compare_css_sets(cg, oldcg, cgrp, template))
+ key = css_set_hash(template);
+ hash_for_each_possible(css_set_table, cset, hlist, key) {
+ if (!compare_css_sets(cset, old_cset, cgrp, template))
continue;
/* This css_set matches what we need */
- return cg;
+ return cset;
}
/* No existing cgroup group matched */
return NULL;
}
-static void free_cg_links(struct list_head *tmp)
+static void free_cgrp_cset_links(struct list_head *links_to_free)
{
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
+ struct cgrp_cset_link *link, *tmp_link;
- list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
- list_del(&link->cgrp_link_list);
+ list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+ list_del(&link->cset_link);
kfree(link);
}
}
-/*
- * allocate_cg_links() allocates "count" cg_cgroup_link structures
- * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
- * success or a negative error
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link. Returns 0 on success or -errno.
*/
-static int allocate_cg_links(int count, struct list_head *tmp)
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
- struct cg_cgroup_link *link;
+ struct cgrp_cset_link *link;
int i;
- INIT_LIST_HEAD(tmp);
+
+ INIT_LIST_HEAD(tmp_links);
+
for (i = 0; i < count; i++) {
- link = kmalloc(sizeof(*link), GFP_KERNEL);
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
- free_cg_links(tmp);
+ free_cgrp_cset_links(tmp_links);
return -ENOMEM;
}
- list_add(&link->cgrp_link_list, tmp);
+ list_add(&link->cset_link, tmp_links);
}
return 0;
}
/**
* link_css_set - a helper function to link a css_set to a cgroup
- * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
- * @cg: the css_set to be linked
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
* @cgrp: the destination cgroup
*/
-static void link_css_set(struct list_head *tmp_cg_links,
- struct css_set *cg, struct cgroup *cgrp)
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+ struct cgroup *cgrp)
{
- struct cg_cgroup_link *link;
+ struct cgrp_cset_link *link;
+
+ BUG_ON(list_empty(tmp_links));
- BUG_ON(list_empty(tmp_cg_links));
- link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
- cgrp_link_list);
- link->cg = cg;
+ if (cgroup_on_dfl(cgrp))
+ cset->dfl_cgrp = cgrp;
+
+ link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+ link->cset = cset;
link->cgrp = cgrp;
- atomic_inc(&cgrp->count);
- list_move(&link->cgrp_link_list, &cgrp->css_sets);
+
+ if (list_empty(&cgrp->cset_links))
+ cgroup_update_populated(cgrp, true);
+ list_move(&link->cset_link, &cgrp->cset_links);
+
/*
* Always add links to the tail of the list so that the list
* is sorted by order of hierarchy creation
*/
- list_add_tail(&link->cg_link_list, &cg->cg_links);
+ list_add_tail(&link->cgrp_link, &cset->cgrp_links);
}
-/*
- * find_css_set() takes an existing cgroup group and a
- * cgroup object, and returns a css_set object that's
- * equivalent to the old group, but with the given cgroup
- * substituted into the appropriate hierarchy. Must be called with
- * cgroup_mutex held
+/**
+ * find_css_set - return a new css_set with one cgroup updated
+ * @old_cset: the baseline css_set
+ * @cgrp: the cgroup to be updated
+ *
+ * Return a new css_set that's equivalent to @old_cset, but with @cgrp
+ * substituted into the appropriate hierarchy.
*/
-static struct css_set *find_css_set(
- struct css_set *oldcg, struct cgroup *cgrp)
+static struct css_set *find_css_set(struct css_set *old_cset,
+ struct cgroup *cgrp)
{
- struct css_set *res;
- struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-
- struct list_head tmp_cg_links;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
+ struct css_set *cset;
+ struct list_head tmp_links;
+ struct cgrp_cset_link *link;
+ struct cgroup_subsys *ss;
+ unsigned long key;
+ int ssid;
- struct hlist_head *hhead;
- struct cg_cgroup_link *link;
+ lockdep_assert_held(&cgroup_mutex);
/* First see if we already have a cgroup group that matches
* the desired set */
- read_lock(&css_set_lock);
- res = find_existing_css_set(oldcg, cgrp, template);
- if (res)
- get_css_set(res);
- read_unlock(&css_set_lock);
+ down_read(&css_set_rwsem);
+ cset = find_existing_css_set(old_cset, cgrp, template);
+ if (cset)
+ get_css_set(cset);
+ up_read(&css_set_rwsem);
- if (res)
- return res;
+ if (cset)
+ return cset;
- res = kmalloc(sizeof(*res), GFP_KERNEL);
- if (!res)
+ cset = kzalloc(sizeof(*cset), GFP_KERNEL);
+ if (!cset)
return NULL;
- /* Allocate all the cg_cgroup_link objects that we'll need */
- if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
- kfree(res);
+ /* Allocate all the cgrp_cset_link objects that we'll need */
+ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
+ kfree(cset);
return NULL;
}
- atomic_set(&res->refcount, 1);
- INIT_LIST_HEAD(&res->cg_links);
- INIT_LIST_HEAD(&res->tasks);
- INIT_HLIST_NODE(&res->hlist);
+ atomic_set(&cset->refcount, 1);
+ INIT_LIST_HEAD(&cset->cgrp_links);
+ INIT_LIST_HEAD(&cset->tasks);
+ INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_node);
+ INIT_HLIST_NODE(&cset->hlist);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
- memcpy(res->subsys, template, sizeof(res->subsys));
+ memcpy(cset->subsys, template, sizeof(cset->subsys));
- write_lock(&css_set_lock);
+ down_write(&css_set_rwsem);
/* Add reference counts and links from the new css_set. */
- list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
+
if (c->root == cgrp->root)
c = cgrp;
- link_css_set(&tmp_cg_links, res, c);
+ link_css_set(&tmp_links, cset, c);
}
- BUG_ON(!list_empty(&tmp_cg_links));
+ BUG_ON(!list_empty(&tmp_links));
css_set_count++;
- /* Add this cgroup group to the hash table */
- hhead = css_set_hash(res->subsys);
- hlist_add_head(&res->hlist, hhead);
+ /* Add @cset to the hash table */
+ key = css_set_hash(cset->subsys);
+ hash_add(css_set_table, &cset->hlist, key);
- write_unlock(&css_set_lock);
+ for_each_subsys(ss, ssid)
+ list_add_tail(&cset->e_cset_node[ssid],
+ &cset->subsys[ssid]->cgroup->e_csets[ssid]);
- return res;
+ up_write(&css_set_rwsem);
+
+ return cset;
}
-/*
- * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex held.
- */
-static struct cgroup *task_cgroup_from_root(struct task_struct *task,
- struct cgroupfs_root *root)
+static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
- struct css_set *css;
- struct cgroup *res = NULL;
+ struct cgroup *root_cgrp = kf_root->kn->priv;
+
+ return root_cgrp->root;
+}
+
+static int cgroup_init_root_id(struct cgroup_root *root)
+{
+ int id;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ return id;
+
+ root->hierarchy_id = id;
+ return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroup_root *root)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (root->hierarchy_id) {
+ idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+ root->hierarchy_id = 0;
+ }
+}
+
+static void cgroup_free_root(struct cgroup_root *root)
+{
+ if (root) {
+ /* hierarhcy ID shoulid already have been released */
+ WARN_ON_ONCE(root->hierarchy_id);
+
+ idr_destroy(&root->cgroup_idr);
+ kfree(root);
+ }
+}
+
+static void cgroup_destroy_root(struct cgroup_root *root)
+{
+ struct cgroup *cgrp = &root->cgrp;
+ struct cgrp_cset_link *link, *tmp_link;
+
+ mutex_lock(&cgroup_mutex);
+
+ BUG_ON(atomic_read(&root->nr_cgrps));
+ BUG_ON(!list_empty(&cgrp->self.children));
+
+ /* Rebind all subsystems back to the default hierarchy */
+ rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
- read_lock(&css_set_lock);
/*
- * No need to lock the task - since we hold cgroup_mutex the
- * task can't change groups, so the only thing that can happen
- * is that it exits and its css is set back to init_css_set.
+ * Release all the links from cset_links to this hierarchy's
+ * root cgroup
*/
- css = task->cgroups;
- if (css == &init_css_set) {
- res = &root->top_cgroup;
+ down_write(&css_set_rwsem);
+
+ list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+ kfree(link);
+ }
+ up_write(&css_set_rwsem);
+
+ if (!list_empty(&root->root_list)) {
+ list_del(&root->root_list);
+ cgroup_root_count--;
+ }
+
+ cgroup_exit_root_id(root);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_destroy_root(root->kf_root);
+ cgroup_free_root(root);
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ struct cgroup *res = NULL;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
+ if (cset == &init_css_set) {
+ res = &root->cgrp;
} else {
- struct cg_cgroup_link *link;
- list_for_each_entry(link, &css->cg_links, cg_link_list) {
+ struct cgrp_cset_link *link;
+
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
+
if (c->root == root) {
res = c;
break;
}
}
}
- read_unlock(&css_set_lock);
+
BUG_ON(!res);
return res;
}
/*
- * There is one global cgroup mutex. We also require taking
- * task_lock() when dereferencing a task's cgroup subsys pointers.
- * See "The task_lock() exception", at the end of this comment.
- *
+ * Return the cgroup for "task" from the given hierarchy. Must be
+ * called with cgroup_mutex and css_set_rwsem held.
+ */
+static struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root)
+{
+ /*
+ * No need to lock the task - since we hold cgroup_mutex the
+ * task can't change groups, so the only thing that can happen
+ * is that it exits and its css is set back to init_css_set.
+ */
+ return cset_cgroup_from_root(task_css_set(task), root);
+}
+
+/*
* A task must hold cgroup_mutex to modify cgroups.
*
* Any task can increment and decrement the count field without lock.
@@ -653,334 +976,302 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cgroup
+ * least one task in the system (init, pid == 1), therefore, root cgroup
* always has either children cgroups and/or using tasks. So we don't
- * need a special hack to ensure that top_cgroup cannot be deleted.
- *
- * The task_lock() exception
- *
- * The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
- * another. It does so using cgroup_mutex, however there are
- * several performance critical places that need to reference
- * task->cgroup without the expense of grabbing a system global
- * mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
- * task_lock(), which acts on a spinlock (task->alloc_lock) already in
- * the task_struct routinely used for such matters.
+ * need a special hack to ensure that root cgroup cannot be deleted.
*
* P.S. One more locking exception. RCU is used to guard the
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-/**
- * cgroup_lock - lock out any changes to cgroup structures
- *
- */
-void cgroup_lock(void)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
+static const struct file_operations proc_cgroupstats_operations;
+
+static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
+ char *buf)
{
- mutex_lock(&cgroup_mutex);
+ if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
+ cft->ss->name, cft->name);
+ else
+ strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ return buf;
}
/**
- * cgroup_unlock - release lock on cgroup changes
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
*
- * Undo the lock taken in a previous cgroup_lock() call.
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write hander
*/
-void cgroup_unlock(void)
+static umode_t cgroup_file_mode(const struct cftype *cft)
{
- mutex_unlock(&cgroup_mutex);
-}
+ umode_t mode = 0;
-/*
- * A couple of forward declarations required, due to cyclic reference loop:
- * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
- * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
- * -> cgroup_mkdir.
- */
+ if (cft->mode)
+ return cft->mode;
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp);
-static const struct inode_operations cgroup_dir_inode_operations;
-static const struct file_operations proc_cgroupstats_operations;
+ if (cft->read_u64 || cft->read_s64 || cft->seq_show)
+ mode |= S_IRUGO;
-static struct backing_dev_info cgroup_backing_dev_info = {
- .name = "cgroup",
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
+ if (cft->write_u64 || cft->write_s64 || cft->write)
+ mode |= S_IWUSR;
-static int alloc_css_id(struct cgroup_subsys *ss,
- struct cgroup *parent, struct cgroup *child);
+ return mode;
+}
-static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
+static void cgroup_get(struct cgroup *cgrp)
{
- struct inode *inode = new_inode(sb);
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ css_get(&cgrp->self);
+}
- if (inode) {
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
- }
- return inode;
+static void cgroup_put(struct cgroup *cgrp)
+{
+ css_put(&cgrp->self);
}
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
+/**
+ * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper undoes cgroup_kn_lock_live() and should be invoked before
+ * the method finishes if locking succeeded. Note that once this function
+ * returns the cgroup returned by cgroup_kn_lock_live() may become
+ * inaccessible any time. If the caller intends to continue to access the
+ * cgroup, it should pin it before invoking this function.
*/
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
+static void cgroup_kn_unlock(struct kernfs_node *kn)
{
- struct cgroup_subsys *ss;
- int ret = 0;
+ struct cgroup *cgrp;
- for_each_subsys(cgrp->root, ss)
- if (ss->pre_destroy) {
- ret = ss->pre_destroy(ss, cgrp);
- if (ret)
- break;
- }
- return ret;
-}
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn->parent->priv;
-static void free_cgroup_rcu(struct rcu_head *obj)
-{
- struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
+ mutex_unlock(&cgroup_mutex);
- kfree(cgrp);
+ kernfs_unbreak_active_protection(kn);
+ cgroup_put(cgrp);
}
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+/**
+ * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper is to be used by a cgroup kernfs method currently servicing
+ * @kn. It breaks the active protection, performs cgroup locking and
+ * verifies that the associated cgroup is alive. Returns the cgroup if
+ * alive; otherwise, %NULL. A successful return should be undone by a
+ * matching cgroup_kn_unlock() invocation.
+ *
+ * Any cgroup kernfs method implementation which requires locking the
+ * associated cgroup should use this helper. It avoids nesting cgroup
+ * locking under kernfs active protection and allows all kernfs operations
+ * including self-removal.
+ */
+static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
{
- /* is dentry a directory ? if so, kfree() associated cgroup */
- if (S_ISDIR(inode->i_mode)) {
- struct cgroup *cgrp = dentry->d_fsdata;
- struct cgroup_subsys *ss;
- BUG_ON(!(cgroup_is_removed(cgrp)));
- /* It's possible for external users to be holding css
- * reference counts on a cgroup; css_put() needs to
- * be able to access the cgroup after decrementing
- * the reference count in order to know if it needs to
- * queue the cgroup to be handled by the release
- * agent */
- synchronize_rcu();
+ struct cgroup *cgrp;
- mutex_lock(&cgroup_mutex);
- /*
- * Release the subsystem state objects.
- */
- for_each_subsys(cgrp->root, ss)
- ss->destroy(ss, cgrp);
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn->parent->priv;
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. cgroup liveliness check alone provides enough
+ * protection against removal. Ensure @cgrp stays accessible and
+ * break the active_ref protection.
+ */
+ cgroup_get(cgrp);
+ kernfs_break_active_protection(kn);
- /*
- * Drop the active superblock reference that we took when we
- * created the cgroup
- */
- deactivate_super(cgrp->root->sb);
+ mutex_lock(&cgroup_mutex);
- /*
- * if we're getting rid of the cgroup, refcount should ensure
- * that there are no pidlists left.
- */
- BUG_ON(!list_empty(&cgrp->pidlists));
+ if (!cgroup_is_dead(cgrp))
+ return cgrp;
- call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
- }
- iput(inode);
+ cgroup_kn_unlock(kn);
+ return NULL;
}
-static void remove_dir(struct dentry *d)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
- struct dentry *parent = dget(d->d_parent);
+ char name[CGROUP_FILE_NAME_MAX];
- d_delete(d);
- simple_rmdir(parent->d_inode, d);
- dput(parent);
+ lockdep_assert_held(&cgroup_mutex);
+ kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}
-static void cgroup_clear_directory(struct dentry *dentry)
+/**
+ * cgroup_clear_dir - remove subsys files in a cgroup directory
+ * @cgrp: target cgroup
+ * @subsys_mask: mask of the subsystem ids whose files should be removed
+ */
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
{
- struct list_head *node;
+ struct cgroup_subsys *ss;
+ int i;
- BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
- spin_lock(&dcache_lock);
- node = dentry->d_subdirs.next;
- while (node != &dentry->d_subdirs) {
- struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
- list_del_init(node);
- if (d->d_inode) {
- /* This should never be called on a cgroup
- * directory with child cgroups */
- BUG_ON(d->d_inode->i_mode & S_IFDIR);
- d = dget_locked(d);
- spin_unlock(&dcache_lock);
- d_delete(d);
- simple_unlink(dentry->d_inode, d);
- dput(d);
- spin_lock(&dcache_lock);
- }
- node = dentry->d_subdirs.next;
+ for_each_subsys(ss, i) {
+ struct cftype *cfts;
+
+ if (!(subsys_mask & (1 << i)))
+ continue;
+ list_for_each_entry(cfts, &ss->cfts, node)
+ cgroup_addrm_files(cgrp, cfts, false);
}
- spin_unlock(&dcache_lock);
}
-/*
- * NOTE : the dentry must have been dget()'ed
- */
-static void cgroup_d_remove_dir(struct dentry *dentry)
+static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
{
- cgroup_clear_directory(dentry);
-
- spin_lock(&dcache_lock);
- list_del_init(&dentry->d_u.d_child);
- spin_unlock(&dcache_lock);
- remove_dir(dentry);
-}
+ struct cgroup_subsys *ss;
+ unsigned int tmp_ss_mask;
+ int ssid, i, ret;
-/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+ lockdep_assert_held(&cgroup_mutex);
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
-}
+ for_each_subsys(ss, ssid) {
+ if (!(ss_mask & (1 << ssid)))
+ continue;
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
- css_get(css);
-}
+ /* if @ss has non-root csses attached to it, can't move */
+ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
+ return -EBUSY;
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
- cgroup_wakeup_rmdir_waiter(css->cgroup);
- css_put(css);
-}
+ /* can't move between two non-dummy roots either */
+ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
+ return -EBUSY;
+ }
+ /* skip creating root files on dfl_root for inhibited subsystems */
+ tmp_ss_mask = ss_mask;
+ if (dst_root == &cgrp_dfl_root)
+ tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
-static int rebind_subsystems(struct cgroupfs_root *root,
- unsigned long final_bits)
-{
- unsigned long added_bits, removed_bits;
- struct cgroup *cgrp = &root->top_cgroup;
- int i;
+ ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
+ if (ret) {
+ if (dst_root != &cgrp_dfl_root)
+ return ret;
- removed_bits = root->actual_subsys_bits & ~final_bits;
- added_bits = final_bits & ~root->actual_subsys_bits;
- /* Check that any added subsystems are currently free */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- unsigned long bit = 1UL << i;
- struct cgroup_subsys *ss = subsys[i];
- if (!(bit & added_bits))
- continue;
- if (ss->root != &rootnode) {
- /* Subsystem isn't free */
- return -EBUSY;
+ /*
+ * Rebinding back to the default root is not allowed to
+ * fail. Using both default and non-default roots should
+ * be rare. Moving subsystems back and forth even more so.
+ * Just warn about it and continue.
+ */
+ if (cgrp_dfl_root_visible) {
+ pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+ ret, ss_mask);
+ pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
}
}
- /* Currently we don't handle adding/removing subsystems when
- * any child cgroups exist. This is theoretically supportable
- * but involves complex error handling, so it's being left until
- * later */
- if (root->number_of_cgroups > 1)
- return -EBUSY;
+ /*
+ * Nothing can fail from this point on. Remove files for the
+ * removed subsystems and rebind each subsystem.
+ */
+ for_each_subsys(ss, ssid)
+ if (ss_mask & (1 << ssid))
+ cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
- /* Process each subsystem */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- unsigned long bit = 1UL << i;
- if (bit & added_bits) {
- /* We're binding this subsystem to this hierarchy */
- BUG_ON(cgrp->subsys[i]);
- BUG_ON(!dummytop->subsys[i]);
- BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
- mutex_lock(&ss->hierarchy_mutex);
- cgrp->subsys[i] = dummytop->subsys[i];
- cgrp->subsys[i]->cgroup = cgrp;
- list_move(&ss->sibling, &root->subsys_list);
- ss->root = root;
- if (ss->bind)
- ss->bind(ss, cgrp);
- mutex_unlock(&ss->hierarchy_mutex);
- } else if (bit & removed_bits) {
- /* We're removing this subsystem */
- BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
- BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
- mutex_lock(&ss->hierarchy_mutex);
- if (ss->bind)
- ss->bind(ss, dummytop);
- dummytop->subsys[i]->cgroup = dummytop;
- cgrp->subsys[i] = NULL;
- subsys[i]->root = &rootnode;
- list_move(&ss->sibling, &rootnode.subsys_list);
- mutex_unlock(&ss->hierarchy_mutex);
- } else if (bit & final_bits) {
- /* Subsystem state should already exist */
- BUG_ON(!cgrp->subsys[i]);
- } else {
- /* Subsystem state shouldn't exist */
- BUG_ON(cgrp->subsys[i]);
- }
+ for_each_subsys(ss, ssid) {
+ struct cgroup_root *src_root;
+ struct cgroup_subsys_state *css;
+ struct css_set *cset;
+
+ if (!(ss_mask & (1 << ssid)))
+ continue;
+
+ src_root = ss->root;
+ css = cgroup_css(&src_root->cgrp, ss);
+
+ WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
+
+ RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
+ rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
+ ss->root = dst_root;
+ css->cgroup = &dst_root->cgrp;
+
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ list_move_tail(&cset->e_cset_node[ss->id],
+ &dst_root->cgrp.e_csets[ss->id]);
+ up_write(&css_set_rwsem);
+
+ src_root->subsys_mask &= ~(1 << ssid);
+ src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+
+ /* default hierarchy doesn't enable controllers by default */
+ dst_root->subsys_mask |= 1 << ssid;
+ if (dst_root != &cgrp_dfl_root)
+ dst_root->cgrp.child_subsys_mask |= 1 << ssid;
+
+ if (ss->bind)
+ ss->bind(css);
}
- root->subsys_bits = root->actual_subsys_bits = final_bits;
- synchronize_rcu();
+ kernfs_activate(dst_root->cgrp.kn);
return 0;
}
-static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int cgroup_show_options(struct seq_file *seq,
+ struct kernfs_root *kf_root)
{
- struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_subsys *ss;
-
- mutex_lock(&cgroup_mutex);
- for_each_subsys(root, ss)
- seq_printf(seq, ",%s", ss->name);
- if (test_bit(ROOT_NOPREFIX, &root->flags))
+ int ssid;
+
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(seq, ",%s", ss->name);
+ if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
+ seq_puts(seq, ",sane_behavior");
+ if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
+ if (root->flags & CGRP_ROOT_XATTR)
+ seq_puts(seq, ",xattr");
+
+ spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
+
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
+ seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
- mutex_unlock(&cgroup_mutex);
return 0;
}
struct cgroup_sb_opts {
- unsigned long subsys_bits;
- unsigned long flags;
+ unsigned int subsys_mask;
+ unsigned int flags;
char *release_agent;
+ bool cpuset_clone_children;
char *name;
/* User explicitly requested empty subsystem */
bool none;
-
- struct cgroupfs_root *new_root;
-
};
-/* Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. */
-static int parse_cgroupfs_options(char *data,
- struct cgroup_sb_opts *opts)
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
- char *token, *o = data ?: "all";
- unsigned long mask = (unsigned long)-1;
+ char *token, *o = data;
+ bool all_ss = false, one_ss = false;
+ unsigned int mask = -1U;
+ struct cgroup_subsys *ss;
+ int i;
#ifdef CONFIG_CPUSETS
- mask = ~(1UL << cpuset_subsys_id);
+ mask = ~(1U << cpuset_cgrp_id);
#endif
memset(opts, 0, sizeof(*opts));
@@ -988,30 +1279,45 @@ static int parse_cgroupfs_options(char *data,
while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
return -EINVAL;
- if (!strcmp(token, "all")) {
- /* Add all non-disabled subsystems */
- int i;
- opts->subsys_bits = 0;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (!ss->disabled)
- opts->subsys_bits |= 1ul << i;
- }
- } else if (!strcmp(token, "none")) {
+ if (!strcmp(token, "none")) {
/* Explicitly have no subsystems */
opts->none = true;
- } else if (!strcmp(token, "noprefix")) {
- set_bit(ROOT_NOPREFIX, &opts->flags);
- } else if (!strncmp(token, "release_agent=", 14)) {
+ continue;
+ }
+ if (!strcmp(token, "all")) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (one_ss)
+ return -EINVAL;
+ all_ss = true;
+ continue;
+ }
+ if (!strcmp(token, "__DEVEL__sane_behavior")) {
+ opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
+ continue;
+ }
+ if (!strcmp(token, "noprefix")) {
+ opts->flags |= CGRP_ROOT_NOPREFIX;
+ continue;
+ }
+ if (!strcmp(token, "clone_children")) {
+ opts->cpuset_clone_children = true;
+ continue;
+ }
+ if (!strcmp(token, "xattr")) {
+ opts->flags |= CGRP_ROOT_XATTR;
+ continue;
+ }
+ if (!strncmp(token, "release_agent=", 14)) {
/* Specifying two release agents is forbidden */
if (opts->release_agent)
return -EINVAL;
opts->release_agent =
- kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
+ kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
if (!opts->release_agent)
return -ENOMEM;
- } else if (!strncmp(token, "name=", 5)) {
- int i;
+ continue;
+ }
+ if (!strncmp(token, "name=", 5)) {
const char *name = token + 5;
/* Can't specify an empty name */
if (!strlen(name))
@@ -1029,61 +1335,90 @@ static int parse_cgroupfs_options(char *data,
if (opts->name)
return -EINVAL;
opts->name = kstrndup(name,
- MAX_CGROUP_ROOT_NAMELEN,
+ MAX_CGROUP_ROOT_NAMELEN - 1,
GFP_KERNEL);
if (!opts->name)
return -ENOMEM;
- } else {
- struct cgroup_subsys *ss;
- int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- ss = subsys[i];
- if (!strcmp(token, ss->name)) {
- if (!ss->disabled)
- set_bit(i, &opts->subsys_bits);
- break;
- }
- }
- if (i == CGROUP_SUBSYS_COUNT)
- return -ENOENT;
+
+ continue;
}
+
+ for_each_subsys(ss, i) {
+ if (strcmp(token, ss->name))
+ continue;
+ if (ss->disabled)
+ continue;
+
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (all_ss)
+ return -EINVAL;
+ opts->subsys_mask |= (1 << i);
+ one_ss = true;
+
+ break;
+ }
+ if (i == CGROUP_SUBSYS_COUNT)
+ return -ENOENT;
}
/* Consistency checks */
+ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+
+ if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
+ opts->cpuset_clone_children || opts->release_agent ||
+ opts->name) {
+ pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+ return -EINVAL;
+ }
+ } else {
+ /*
+ * If the 'all' option was specified select all the
+ * subsystems, otherwise if 'none', 'name=' and a subsystem
+ * name options were not specified, let's default to 'all'
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name))
+ for_each_subsys(ss, i)
+ if (!ss->disabled)
+ opts->subsys_mask |= (1 << i);
+
+ /*
+ * We either have to specify by name or by subsystems. (So
+ * all empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
+ }
+
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
*/
- if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
- (opts->subsys_bits & mask))
+ if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
/* Can't specify "none" and some subsystems */
- if (opts->subsys_bits && opts->none)
- return -EINVAL;
-
- /*
- * We either have to specify by name or by subsystems. (So all
- * empty hierarchies must have a name).
- */
- if (!opts->subsys_bits && !opts->name)
+ if (opts->subsys_mask && opts->none)
return -EINVAL;
return 0;
}
-static int cgroup_remount(struct super_block *sb, int *flags, char *data)
+static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
int ret = 0;
- struct cgroupfs_root *root = sb->s_fs_info;
- struct cgroup *cgrp = &root->top_cgroup;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
+ unsigned int added_mask, removed_mask;
+
+ if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_err("sane_behavior: remount is not allowed\n");
+ return -EINVAL;
+ }
- lock_kernel();
- mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
/* See what subsystems are wanted */
@@ -1091,1258 +1426,2133 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto out_unlock;
- /* Don't allow flags to change at remount */
- if (opts.flags != root->flags) {
+ if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+ pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+ task_tgid_nr(current), current->comm);
+
+ added_mask = opts.subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~opts.subsys_mask;
+
+ /* Don't allow flags or name to change at remount */
+ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
+ (opts.name && strcmp(opts.name, root->name))) {
+ pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
+ opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
+ root->flags & CGRP_ROOT_OPTION_MASK, root->name);
ret = -EINVAL;
goto out_unlock;
}
- /* Don't allow name to change at remount */
- if (opts.name && strcmp(opts.name, root->name)) {
- ret = -EINVAL;
+ /* remounting is not allowed for populated hierarchies */
+ if (!list_empty(&root->cgrp.self.children)) {
+ ret = -EBUSY;
goto out_unlock;
}
- ret = rebind_subsystems(root, opts.subsys_bits);
+ ret = rebind_subsystems(root, added_mask);
if (ret)
goto out_unlock;
- /* (re)populate subsystem files */
- cgroup_populate_dir(cgrp);
+ rebind_subsystems(&cgrp_dfl_root, removed_mask);
- if (opts.release_agent)
+ if (opts.release_agent) {
+ spin_lock(&release_agent_path_lock);
strcpy(root->release_agent_path, opts.release_agent);
+ spin_unlock(&release_agent_path_lock);
+ }
out_unlock:
kfree(opts.release_agent);
kfree(opts.name);
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
- unlock_kernel();
return ret;
}
-static const struct super_operations cgroup_ops = {
- .statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
- .show_options = cgroup_show_options,
- .remount_fs = cgroup_remount,
-};
+/*
+ * To reduce the fork() overhead for systems that are not actually using
+ * their cgroups capability, we don't maintain the lists running through
+ * each css_set to its tasks until we see the list actually used - in other
+ * words after the first mount.
+ */
+static bool use_task_css_set_links __read_mostly;
-static void init_cgroup_housekeeping(struct cgroup *cgrp)
+static void cgroup_enable_task_cg_lists(void)
{
- INIT_LIST_HEAD(&cgrp->sibling);
- INIT_LIST_HEAD(&cgrp->children);
- INIT_LIST_HEAD(&cgrp->css_sets);
- INIT_LIST_HEAD(&cgrp->release_list);
- INIT_LIST_HEAD(&cgrp->pidlists);
- mutex_init(&cgrp->pidlist_mutex);
-}
+ struct task_struct *p, *g;
-static void init_cgroup_root(struct cgroupfs_root *root)
-{
- struct cgroup *cgrp = &root->top_cgroup;
- INIT_LIST_HEAD(&root->subsys_list);
- INIT_LIST_HEAD(&root->root_list);
- root->number_of_cgroups = 1;
- cgrp->root = root;
- cgrp->top_cgroup = cgrp;
- init_cgroup_housekeeping(cgrp);
-}
+ down_write(&css_set_rwsem);
-static bool init_root_id(struct cgroupfs_root *root)
-{
- int ret = 0;
+ if (use_task_css_set_links)
+ goto out_unlock;
- do {
- if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
- return false;
- spin_lock(&hierarchy_id_lock);
- /* Try to allocate the next unused ID */
- ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
- &root->hierarchy_id);
- if (ret == -ENOSPC)
- /* Try again starting from 0 */
- ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
- if (!ret) {
- next_hierarchy_id = root->hierarchy_id + 1;
- } else if (ret != -EAGAIN) {
- /* Can only get here if the 31-bit IDR is full ... */
- BUG_ON(ret);
+ use_task_css_set_links = true;
+
+ /*
+ * We need tasklist_lock because RCU is not safe against
+ * while_each_thread(). Besides, a forking task that has passed
+ * cgroup_post_fork() without seeing use_task_css_set_links = 1
+ * is not guaranteed to have its child immediately visible in the
+ * tasklist if we walk through it with RCU.
+ */
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ WARN_ON_ONCE(!list_empty(&p->cg_list) ||
+ task_css_set(p) != &init_css_set);
+
+ /*
+ * We should check if the process is exiting, otherwise
+ * it will race with cgroup_exit() in that the list
+ * entry won't be deleted though the process has exited.
+ * Do it while holding siglock so that we don't end up
+ * racing against cgroup_exit().
+ */
+ spin_lock_irq(&p->sighand->siglock);
+ if (!(p->flags & PF_EXITING)) {
+ struct css_set *cset = task_css_set(p);
+
+ list_add(&p->cg_list, &cset->tasks);
+ get_css_set(cset);
}
- spin_unlock(&hierarchy_id_lock);
- } while (ret);
- return true;
+ spin_unlock_irq(&p->sighand->siglock);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+out_unlock:
+ up_write(&css_set_rwsem);
}
-static int cgroup_test_super(struct super_block *sb, void *data)
+static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
- struct cgroup_sb_opts *opts = data;
- struct cgroupfs_root *root = sb->s_fs_info;
+ struct cgroup_subsys *ss;
+ int ssid;
- /* If we asked for a name then it must match */
- if (opts->name && strcmp(opts->name, root->name))
- return 0;
+ INIT_LIST_HEAD(&cgrp->self.sibling);
+ INIT_LIST_HEAD(&cgrp->self.children);
+ INIT_LIST_HEAD(&cgrp->cset_links);
+ INIT_LIST_HEAD(&cgrp->release_list);
+ INIT_LIST_HEAD(&cgrp->pidlists);
+ mutex_init(&cgrp->pidlist_mutex);
+ cgrp->self.cgroup = cgrp;
+ cgrp->self.flags |= CSS_ONLINE;
- /*
- * If we asked for subsystems (or explicitly for no
- * subsystems) then they must match
- */
- if ((opts->subsys_bits || opts->none)
- && (opts->subsys_bits != root->subsys_bits))
- return 0;
+ for_each_subsys(ss, ssid)
+ INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
- return 1;
+ init_waitqueue_head(&cgrp->offline_waitq);
}
-static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
+static void init_cgroup_root(struct cgroup_root *root,
+ struct cgroup_sb_opts *opts)
{
- struct cgroupfs_root *root;
-
- if (!opts->subsys_bits && !opts->none)
- return NULL;
+ struct cgroup *cgrp = &root->cgrp;
- root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root)
- return ERR_PTR(-ENOMEM);
-
- if (!init_root_id(root)) {
- kfree(root);
- return ERR_PTR(-ENOMEM);
- }
- init_cgroup_root(root);
+ INIT_LIST_HEAD(&root->root_list);
+ atomic_set(&root->nr_cgrps, 1);
+ cgrp->root = root;
+ init_cgroup_housekeeping(cgrp);
+ idr_init(&root->cgroup_idr);
- root->subsys_bits = opts->subsys_bits;
root->flags = opts->flags;
if (opts->release_agent)
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
strcpy(root->name, opts->name);
- return root;
+ if (opts->cpuset_clone_children)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static void cgroup_drop_root(struct cgroupfs_root *root)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
- if (!root)
- return;
+ LIST_HEAD(tmp_links);
+ struct cgroup *root_cgrp = &root->cgrp;
+ struct css_set *cset;
+ int i, ret;
- BUG_ON(!root->hierarchy_id);
- spin_lock(&hierarchy_id_lock);
- ida_remove(&hierarchy_ida, root->hierarchy_id);
- spin_unlock(&hierarchy_id_lock);
- kfree(root);
-}
+ lockdep_assert_held(&cgroup_mutex);
-static int cgroup_set_super(struct super_block *sb, void *data)
-{
- int ret;
- struct cgroup_sb_opts *opts = data;
+ ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
+ if (ret < 0)
+ goto out;
+ root_cgrp->id = ret;
- /* If we don't have a new root, we can't set up a new sb */
- if (!opts->new_root)
- return -EINVAL;
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
+ if (ret)
+ goto out;
- BUG_ON(!opts->subsys_bits && !opts->none);
+ /*
+ * We're accessing css_set_count without locking css_set_rwsem here,
+ * but that's OK - it can only be increased by someone holding
+ * cgroup_lock, and that's us. The worst that can happen is that we
+ * have some link structures left over
+ */
+ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
+ if (ret)
+ goto cancel_ref;
- ret = set_anon_super(sb, NULL);
+ ret = cgroup_init_root_id(root);
if (ret)
- return ret;
+ goto cancel_ref;
+
+ root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
+ KERNFS_ROOT_CREATE_DEACTIVATED,
+ root_cgrp);
+ if (IS_ERR(root->kf_root)) {
+ ret = PTR_ERR(root->kf_root);
+ goto exit_root_id;
+ }
+ root_cgrp->kn = root->kf_root->kn;
- sb->s_fs_info = opts->new_root;
- opts->new_root->sb = sb;
+ ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+ if (ret)
+ goto destroy_root;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
- sb->s_magic = CGROUP_SUPER_MAGIC;
- sb->s_op = &cgroup_ops;
+ ret = rebind_subsystems(root, ss_mask);
+ if (ret)
+ goto destroy_root;
- return 0;
-}
+ /*
+ * There must be no failure case after here, since rebinding takes
+ * care of subsystems' refcounts, which are explicitly dropped in
+ * the failure exit path.
+ */
+ list_add(&root->root_list, &cgroup_roots);
+ cgroup_root_count++;
-static int cgroup_get_rootdir(struct super_block *sb)
-{
- struct inode *inode =
- cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
- struct dentry *dentry;
+ /*
+ * Link the root cgroup in this hierarchy into all the css_set
+ * objects.
+ */
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ link_css_set(&tmp_links, cset, root_cgrp);
+ up_write(&css_set_rwsem);
- if (!inode)
- return -ENOMEM;
+ BUG_ON(!list_empty(&root_cgrp->self.children));
+ BUG_ON(atomic_read(&root->nr_cgrps) != 1);
- inode->i_fop = &simple_dir_operations;
- inode->i_op = &cgroup_dir_inode_operations;
- /* directories start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
- dentry = d_alloc_root(inode);
- if (!dentry) {
- iput(inode);
- return -ENOMEM;
- }
- sb->s_root = dentry;
- return 0;
+ kernfs_activate(root_cgrp->kn);
+ ret = 0;
+ goto out;
+
+destroy_root:
+ kernfs_destroy_root(root->kf_root);
+ root->kf_root = NULL;
+exit_root_id:
+ cgroup_exit_root_id(root);
+cancel_ref:
+ percpu_ref_cancel_init(&root_cgrp->self.refcnt);
+out:
+ free_cgrp_cset_links(&tmp_links);
+ return ret;
}
-static int cgroup_get_sb(struct file_system_type *fs_type,
+static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
- void *data, struct vfsmount *mnt)
+ void *data)
{
+ struct super_block *pinned_sb = NULL;
+ struct cgroup_subsys *ss;
+ struct cgroup_root *root;
struct cgroup_sb_opts opts;
- struct cgroupfs_root *root;
- int ret = 0;
- struct super_block *sb;
- struct cgroupfs_root *new_root;
+ struct dentry *dentry;
+ int ret;
+ int i;
+ bool new_sb;
+
+ /*
+ * The first time anyone tries to mount a cgroup, enable the list
+ * linking each css_set to its tasks and fix up all existing tasks.
+ */
+ if (!use_task_css_set_links)
+ cgroup_enable_task_cg_lists();
+
+ mutex_lock(&cgroup_mutex);
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
if (ret)
- goto out_err;
+ goto out_unlock;
+
+ /* look for a matching existing root */
+ if (!opts.subsys_mask && !opts.none && !opts.name) {
+ cgrp_dfl_root_visible = true;
+ root = &cgrp_dfl_root;
+ cgroup_get(&root->cgrp);
+ ret = 0;
+ goto out_unlock;
+ }
/*
- * Allocate a new cgroup root. We may not need it if we're
- * reusing an existing hierarchy.
+ * Destruction of cgroup root is asynchronous, so subsystems may
+ * still be dying after the previous unmount. Let's drain the
+ * dying subsystems. We just need to ensure that the ones
+ * unmounted previously finish dying and don't care about new ones
+ * starting. Testing ref liveliness is good enough.
*/
- new_root = cgroup_root_from_opts(&opts);
- if (IS_ERR(new_root)) {
- ret = PTR_ERR(new_root);
- goto out_err;
- }
- opts.new_root = new_root;
+ for_each_subsys(ss, i) {
+ if (!(opts.subsys_mask & (1 << i)) ||
+ ss->root == &cgrp_dfl_root)
+ continue;
- /* Locate an existing or new sb for this hierarchy */
- sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
- if (IS_ERR(sb)) {
- ret = PTR_ERR(sb);
- cgroup_drop_root(opts.new_root);
- goto out_err;
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+ cgroup_put(&ss->root->cgrp);
}
- root = sb->s_fs_info;
- BUG_ON(!root);
- if (root == opts.new_root) {
- /* We used the new root structure, so this is a new hierarchy */
- struct list_head tmp_cg_links;
- struct cgroup *root_cgrp = &root->top_cgroup;
- struct inode *inode;
- struct cgroupfs_root *existing_root;
- int i;
+ for_each_root(root) {
+ bool name_match = false;
- BUG_ON(sb->s_root != NULL);
+ if (root == &cgrp_dfl_root)
+ continue;
- ret = cgroup_get_rootdir(sb);
- if (ret)
- goto drop_new_super;
- inode = sb->s_root->d_inode;
+ /*
+ * If we asked for a name then it must match. Also, if
+ * name matches but sybsys_mask doesn't, we should fail.
+ * Remember whether name matched.
+ */
+ if (opts.name) {
+ if (strcmp(opts.name, root->name))
+ continue;
+ name_match = true;
+ }
- mutex_lock(&inode->i_mutex);
- mutex_lock(&cgroup_mutex);
+ /*
+ * If we asked for subsystems (or explicitly for no
+ * subsystems) then they must match.
+ */
+ if ((opts.subsys_mask || opts.none) &&
+ (opts.subsys_mask != root->subsys_mask)) {
+ if (!name_match)
+ continue;
+ ret = -EBUSY;
+ goto out_unlock;
+ }
- if (strlen(root->name)) {
- /* Check for name clashes with existing mounts */
- for_each_active_root(existing_root) {
- if (!strcmp(existing_root->name, root->name)) {
- ret = -EBUSY;
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- goto drop_new_super;
- }
+ if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
+ if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_err("sane_behavior: new mount options should match the existing superblock\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ } else {
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
}
}
/*
- * We're accessing css_set_count without locking
- * css_set_lock here, but that's OK - it can only be
- * increased by someone holding cgroup_lock, and
- * that's us. The worst that can happen is that we
- * have some link structures left over
+ * We want to reuse @root whose lifetime is governed by its
+ * ->cgrp. Let's check whether @root is alive and keep it
+ * that way. As cgroup_kill_sb() can happen anytime, we
+ * want to block it by pinning the sb so that @root doesn't
+ * get killed before mount is complete.
+ *
+ * With the sb pinned, tryget_live can reliably indicate
+ * whether @root can be reused. If it's being killed,
+ * drain it. We can use wait_queue for the wait but this
+ * path is super cold. Let's just sleep a bit and retry.
*/
- ret = allocate_cg_links(css_set_count, &tmp_cg_links);
- if (ret) {
+ pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+ if (IS_ERR(pinned_sb) ||
+ !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- goto drop_new_super;
+ if (!IS_ERR_OR_NULL(pinned_sb))
+ deactivate_super(pinned_sb);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
}
- ret = rebind_subsystems(root, root->subsys_bits);
- if (ret == -EBUSY) {
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- free_cg_links(&tmp_cg_links);
- goto drop_new_super;
- }
+ ret = 0;
+ goto out_unlock;
+ }
- /* EBUSY should be the only error here */
- BUG_ON(ret);
+ /*
+ * No such thing, create a new one. name= matching without subsys
+ * specification is allowed for already existing hierarchies but we
+ * can't create new one without subsys specification.
+ */
+ if (!opts.subsys_mask && !opts.none) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
- list_add(&root->root_list, &roots);
- root_count++;
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
- sb->s_root->d_fsdata = root_cgrp;
- root->top_cgroup.dentry = sb->s_root;
+ init_cgroup_root(root, &opts);
- /* Link the top cgroup in this hierarchy into all
- * the css_set objects */
- write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct hlist_head *hhead = &css_set_table[i];
- struct hlist_node *node;
- struct css_set *cg;
+ ret = cgroup_setup_root(root, opts.subsys_mask);
+ if (ret)
+ cgroup_free_root(root);
- hlist_for_each_entry(cg, node, hhead, hlist)
- link_css_set(&tmp_cg_links, cg, root_cgrp);
- }
- write_unlock(&css_set_lock);
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(opts.release_agent);
+ kfree(opts.name);
- free_cg_links(&tmp_cg_links);
+ if (ret)
+ return ERR_PTR(ret);
- BUG_ON(!list_empty(&root_cgrp->sibling));
- BUG_ON(!list_empty(&root_cgrp->children));
- BUG_ON(root->number_of_cgroups != 1);
+ dentry = kernfs_mount(fs_type, flags, root->kf_root,
+ CGROUP_SUPER_MAGIC, &new_sb);
+ if (IS_ERR(dentry) || !new_sb)
+ cgroup_put(&root->cgrp);
- cgroup_populate_dir(root_cgrp);
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- } else {
- /*
- * We re-used an existing hierarchy - the new root (if
- * any) is not needed
- */
- cgroup_drop_root(opts.new_root);
+ /*
+ * If @pinned_sb, we're reusing an existing root and holding an
+ * extra ref on its sb. Mount is complete. Put the extra ref.
+ */
+ if (pinned_sb) {
+ WARN_ON(new_sb);
+ deactivate_super(pinned_sb);
}
- simple_set_mnt(mnt, sb);
- kfree(opts.release_agent);
- kfree(opts.name);
- return 0;
+ return dentry;
+}
- drop_new_super:
- deactivate_locked_super(sb);
- out_err:
- kfree(opts.release_agent);
- kfree(opts.name);
+static void cgroup_kill_sb(struct super_block *sb)
+{
+ struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- return ret;
+ /*
+ * If @root doesn't have any mounts or children, start killing it.
+ * This prevents new mounts by disabling percpu_ref_tryget_live().
+ * cgroup_mount() may wait for @root's release.
+ *
+ * And don't kill the default root.
+ */
+ if (css_has_online_children(&root->cgrp.self) ||
+ root == &cgrp_dfl_root)
+ cgroup_put(&root->cgrp);
+ else
+ percpu_ref_kill(&root->cgrp.self.refcnt);
+
+ kernfs_kill_sb(sb);
}
-static void cgroup_kill_sb(struct super_block *sb) {
- struct cgroupfs_root *root = sb->s_fs_info;
- struct cgroup *cgrp = &root->top_cgroup;
- int ret;
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
+static struct file_system_type cgroup_fs_type = {
+ .name = "cgroup",
+ .mount = cgroup_mount,
+ .kill_sb = cgroup_kill_sb,
+};
- BUG_ON(!root);
+static struct kobject *cgroup_kobj;
- BUG_ON(root->number_of_cgroups != 1);
- BUG_ON(!list_empty(&cgrp->children));
- BUG_ON(!list_empty(&cgrp->sibling));
+/**
+ * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
+ * @task: target task
+ * @buf: the buffer to write the path into
+ * @buflen: the length of the buffer
+ *
+ * Determine @task's cgroup on the first (the one with the lowest non-zero
+ * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
+ * function grabs cgroup_mutex and shouldn't be used inside locks used by
+ * cgroup controller callbacks.
+ *
+ * Return value is the same as kernfs_path().
+ */
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+{
+ struct cgroup_root *root;
+ struct cgroup *cgrp;
+ int hierarchy_id = 1;
+ char *path = NULL;
mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
- /* Rebind all subsystems back to the default hierarchy */
- ret = rebind_subsystems(root, 0);
- /* Shouldn't be able to fail ... */
- BUG_ON(ret);
+ root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
+
+ if (root) {
+ cgrp = task_cgroup_from_root(task, root);
+ path = cgroup_path(cgrp, buf, buflen);
+ } else {
+ /* if no hierarchy exists, everyone is in "/" */
+ if (strlcpy(buf, "/", buflen) < buflen)
+ path = buf;
+ }
+
+ up_read(&css_set_rwsem);
+ mutex_unlock(&cgroup_mutex);
+ return path;
+}
+EXPORT_SYMBOL_GPL(task_cgroup_path);
+
+/* used to track tasks and other necessary states during migration */
+struct cgroup_taskset {
+ /* the src and dst cset list running through cset->mg_node */
+ struct list_head src_csets;
+ struct list_head dst_csets;
/*
- * Release all the links from css_sets to this hierarchy's
- * root cgroup
+ * Fields for cgroup_taskset_*() iteration.
+ *
+ * Before migration is committed, the target migration tasks are on
+ * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
+ * the csets on ->dst_csets. ->csets point to either ->src_csets
+ * or ->dst_csets depending on whether migration is committed.
+ *
+ * ->cur_csets and ->cur_task point to the current task position
+ * during iteration.
*/
- write_lock(&css_set_lock);
+ struct list_head *csets;
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
+};
- list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
- cgrp_link_list) {
- list_del(&link->cg_link_list);
- list_del(&link->cgrp_link_list);
- kfree(link);
- }
- write_unlock(&css_set_lock);
+/**
+ * cgroup_taskset_first - reset taskset and return the first task
+ * @tset: taskset of interest
+ *
+ * @tset iteration is initialized and the first task is returned.
+ */
+struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
+{
+ tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
+ tset->cur_task = NULL;
- if (!list_empty(&root->root_list)) {
- list_del(&root->root_list);
- root_count--;
- }
+ return cgroup_taskset_next(tset);
+}
- mutex_unlock(&cgroup_mutex);
+/**
+ * cgroup_taskset_next - iterate to the next task in taskset
+ * @tset: taskset of interest
+ *
+ * Return the next task in @tset. Iteration must have been initialized
+ * with cgroup_taskset_first().
+ */
+struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
+{
+ struct css_set *cset = tset->cur_cset;
+ struct task_struct *task = tset->cur_task;
+
+ while (&cset->mg_node != tset->csets) {
+ if (!task)
+ task = list_first_entry(&cset->mg_tasks,
+ struct task_struct, cg_list);
+ else
+ task = list_next_entry(task, cg_list);
+
+ if (&task->cg_list != &cset->mg_tasks) {
+ tset->cur_cset = cset;
+ tset->cur_task = task;
+ return task;
+ }
- kill_litter_super(sb);
- cgroup_drop_root(root);
+ cset = list_next_entry(cset, mg_node);
+ task = NULL;
+ }
+
+ return NULL;
}
-static struct file_system_type cgroup_fs_type = {
- .name = "cgroup",
- .get_sb = cgroup_get_sb,
- .kill_sb = cgroup_kill_sb,
-};
+/**
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ * @old_cgrp: the cgroup @tsk is being migrated from
+ * @tsk: the task being migrated
+ * @new_cset: the new css_set @tsk is being attached to
+ *
+ * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
+ */
+static void cgroup_task_migrate(struct cgroup *old_cgrp,
+ struct task_struct *tsk,
+ struct css_set *new_cset)
+{
+ struct css_set *old_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
+ /*
+ * We are synchronized through threadgroup_lock() against PF_EXITING
+ * setting such that we can't race against cgroup_exit() changing the
+ * css_set to init_css_set and dropping the old one.
+ */
+ WARN_ON_ONCE(tsk->flags & PF_EXITING);
+ old_cset = task_css_set(tsk);
+
+ get_css_set(new_cset);
+ rcu_assign_pointer(tsk->cgroups, new_cset);
+
+ /*
+ * Use move_tail so that cgroup_taskset_first() still returns the
+ * leader after migration. This works because cgroup_migrate()
+ * ensures that the dst_cset of the leader is the first on the
+ * tset's dst_csets list.
+ */
+ list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
+
+ /*
+ * We just gained a reference on old_cset by taking it from the
+ * task. As trading it for new_cset is protected by cgroup_mutex,
+ * we're safe to drop it here; it will be freed under RCU.
+ */
+ set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
+ put_css_set_locked(old_cset, false);
+}
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
+/**
+ * cgroup_migrate_finish - cleanup after attach
+ * @preloaded_csets: list of preloaded css_sets
+ *
+ * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
+ * those functions for details.
+ */
+static void cgroup_migrate_finish(struct list_head *preloaded_csets)
{
- return dentry->d_fsdata;
+ struct css_set *cset, *tmp_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ down_write(&css_set_rwsem);
+ list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_preload_node);
+ put_css_set_locked(cset, false);
+ }
+ up_write(&css_set_rwsem);
}
-static inline struct cftype *__d_cft(struct dentry *dentry)
+/**
+ * cgroup_migrate_add_src - add a migration source css_set
+ * @src_cset: the source css_set to add
+ * @dst_cgrp: the destination cgroup
+ * @preloaded_csets: list of preloaded css_sets
+ *
+ * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
+ * @src_cset and add it to @preloaded_csets, which should later be cleaned
+ * up by cgroup_migrate_finish().
+ *
+ * This function may be called without holding threadgroup_lock even if the
+ * target is a process. Threads may be created and destroyed but as long
+ * as cgroup_mutex is not dropped, no new css_set can be put into play and
+ * the preloaded css_sets are guaranteed to cover all migrations.
+ */
+static void cgroup_migrate_add_src(struct css_set *src_cset,
+ struct cgroup *dst_cgrp,
+ struct list_head *preloaded_csets)
{
- return dentry->d_fsdata;
+ struct cgroup *src_cgrp;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
+
+ if (!list_empty(&src_cset->mg_preload_node))
+ return;
+
+ WARN_ON(src_cset->mg_src_cgrp);
+ WARN_ON(!list_empty(&src_cset->mg_tasks));
+ WARN_ON(!list_empty(&src_cset->mg_node));
+
+ src_cset->mg_src_cgrp = src_cgrp;
+ get_css_set(src_cset);
+ list_add(&src_cset->mg_preload_node, preloaded_csets);
}
/**
- * cgroup_path - generate the path of a cgroup
- * @cgrp: the cgroup in question
- * @buf: the buffer to write the path into
- * @buflen: the length of the buffer
+ * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
+ * @dst_cgrp: the destination cgroup (may be %NULL)
+ * @preloaded_csets: list of preloaded source css_sets
*
- * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * reference. Writes path of cgroup into buf. Returns 0 on success,
- * -errno on error.
+ * Tasks are about to be moved to @dst_cgrp and all the source css_sets
+ * have been preloaded to @preloaded_csets. This function looks up and
+ * pins all destination css_sets, links each to its source, and append them
+ * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
+ * source css_set is assumed to be its cgroup on the default hierarchy.
+ *
+ * This function must be called after cgroup_migrate_add_src() has been
+ * called on each migration source css_set. After migration is performed
+ * using cgroup_migrate(), cgroup_migrate_finish() must be called on
+ * @preloaded_csets.
*/
-int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
+static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
+ struct list_head *preloaded_csets)
{
- char *start;
- struct dentry *dentry = rcu_dereference(cgrp->dentry);
+ LIST_HEAD(csets);
+ struct css_set *src_cset, *tmp_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
+ dst_cgrp->child_subsys_mask)
+ return -EBUSY;
+
+ /* look up the dst cset for each src cset and link it to src */
+ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ struct css_set *dst_cset;
+
+ dst_cset = find_css_set(src_cset,
+ dst_cgrp ?: src_cset->dfl_cgrp);
+ if (!dst_cset)
+ goto err;
+
+ WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
- if (!dentry || cgrp == dummytop) {
/*
- * Inactive subsystems have no dentry for their root
- * cgroup
+ * If src cset equals dst, it's noop. Drop the src.
+ * cgroup_migrate() will skip the cset too. Note that we
+ * can't handle src == dst as some nodes are used by both.
*/
- strcpy(buf, "/");
- return 0;
- }
+ if (src_cset == dst_cset) {
+ src_cset->mg_src_cgrp = NULL;
+ list_del_init(&src_cset->mg_preload_node);
+ put_css_set(src_cset, false);
+ put_css_set(dst_cset, false);
+ continue;
+ }
- start = buf + buflen;
+ src_cset->mg_dst_cset = dst_cset;
- *--start = '\0';
- for (;;) {
- int len = dentry->d_name.len;
- if ((start -= len) < buf)
- return -ENAMETOOLONG;
- memcpy(start, cgrp->dentry->d_name.name, len);
- cgrp = cgrp->parent;
- if (!cgrp)
- break;
- dentry = rcu_dereference(cgrp->dentry);
- if (!cgrp->parent)
- continue;
- if (--start < buf)
- return -ENAMETOOLONG;
- *start = '/';
+ if (list_empty(&dst_cset->mg_preload_node))
+ list_add(&dst_cset->mg_preload_node, &csets);
+ else
+ put_css_set(dst_cset, false);
}
- memmove(buf, start, buf + buflen - start);
+
+ list_splice_tail(&csets, preloaded_csets);
return 0;
+err:
+ cgroup_migrate_finish(&csets);
+ return -ENOMEM;
}
/**
- * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
- * @cgrp: the cgroup the task is attaching to
- * @tsk: the task to be attached
+ * cgroup_migrate - migrate a process or task to a cgroup
+ * @cgrp: the destination cgroup
+ * @leader: the leader of the process or the task to migrate
+ * @threadgroup: whether @leader points to the whole process or a single task
+ *
+ * Migrate a process or task denoted by @leader to @cgrp. If migrating a
+ * process, the caller must be holding threadgroup_lock of @leader. The
+ * caller is also responsible for invoking cgroup_migrate_add_src() and
+ * cgroup_migrate_prepare_dst() on the targets before invoking this
+ * function and following up with cgroup_migrate_finish().
*
- * Call holding cgroup_mutex. May take task_lock of
- * the task 'tsk' during call.
+ * As long as a controller's ->can_attach() doesn't fail, this function is
+ * guaranteed to succeed. This means that, excluding ->can_attach()
+ * failure, when migrating multiple targets, the success or failure can be
+ * decided for all targets by invoking group_migrate_prepare_dst() before
+ * actually starting migrating.
*/
-int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
+ bool threadgroup)
{
- int retval = 0;
- struct cgroup_subsys *ss;
- struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
- struct cgroupfs_root *root = cgrp->root;
-
- /* Nothing to do if the task is already in that cgroup */
- oldcgrp = task_cgroup_from_root(tsk, root);
- if (cgrp == oldcgrp)
+ struct cgroup_taskset tset = {
+ .src_csets = LIST_HEAD_INIT(tset.src_csets),
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
+ .csets = &tset.src_csets,
+ };
+ struct cgroup_subsys_state *css, *failed_css = NULL;
+ struct css_set *cset, *tmp_cset;
+ struct task_struct *task, *tmp_task;
+ int i, ret;
+
+ /*
+ * Prevent freeing of tasks while we take a snapshot. Tasks that are
+ * already PF_EXITING could be freed from underneath us unless we
+ * take an rcu_read_lock.
+ */
+ down_write(&css_set_rwsem);
+ rcu_read_lock();
+ task = leader;
+ do {
+ /* @task either already exited or can't exit until the end */
+ if (task->flags & PF_EXITING)
+ goto next;
+
+ /* leave @task alone if post_fork() hasn't linked it yet */
+ if (list_empty(&task->cg_list))
+ goto next;
+
+ cset = task_css_set(task);
+ if (!cset->mg_src_cgrp)
+ goto next;
+
+ /*
+ * cgroup_taskset_first() must always return the leader.
+ * Take care to avoid disturbing the ordering.
+ */
+ list_move_tail(&task->cg_list, &cset->mg_tasks);
+ if (list_empty(&cset->mg_node))
+ list_add_tail(&cset->mg_node, &tset.src_csets);
+ if (list_empty(&cset->mg_dst_cset->mg_node))
+ list_move_tail(&cset->mg_dst_cset->mg_node,
+ &tset.dst_csets);
+ next:
+ if (!threadgroup)
+ break;
+ } while_each_thread(leader, task);
+ rcu_read_unlock();
+ up_write(&css_set_rwsem);
+
+ /* methods shouldn't be called if no task is actually migrating */
+ if (list_empty(&tset.src_csets))
return 0;
- for_each_subsys(root, ss) {
- if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, tsk, false);
- if (retval)
- return retval;
+ /* check that we can legitimately attach to the cgroup */
+ for_each_e_css(css, i, cgrp) {
+ if (css->ss->can_attach) {
+ ret = css->ss->can_attach(css, &tset);
+ if (ret) {
+ failed_css = css;
+ goto out_cancel_attach;
+ }
}
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
/*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
+ * Now that we're guaranteed success, proceed to move all tasks to
+ * the new cgroup. There are no failure cases after here, so this
+ * is the commit point.
*/
- newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
- if (!newcg)
- return -ENOMEM;
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- return -ESRCH;
+ down_write(&css_set_rwsem);
+ list_for_each_entry(cset, &tset.src_csets, mg_node) {
+ list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
+ cgroup_task_migrate(cset->mg_src_cgrp, task,
+ cset->mg_dst_cset);
}
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
+ up_write(&css_set_rwsem);
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
- }
- write_unlock(&css_set_lock);
+ /*
+ * Migration is committed, all target tasks are now on dst_csets.
+ * Nothing is sensitive to fork() after this point. Notify
+ * controllers that migration is complete.
+ */
+ tset.csets = &tset.dst_csets;
- for_each_subsys(root, ss) {
- if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk, false);
+ for_each_e_css(css, i, cgrp)
+ if (css->ss->attach)
+ css->ss->attach(css, &tset);
+
+ ret = 0;
+ goto out_release_tset;
+
+out_cancel_attach:
+ for_each_e_css(css, i, cgrp) {
+ if (css == failed_css)
+ break;
+ if (css->ss->cancel_attach)
+ css->ss->cancel_attach(css, &tset);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
- synchronize_rcu();
- put_css_set(cg);
+out_release_tset:
+ down_write(&css_set_rwsem);
+ list_splice_init(&tset.dst_csets, &tset.src_csets);
+ list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
+ list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
+ list_del_init(&cset->mg_node);
+ }
+ up_write(&css_set_rwsem);
+ return ret;
+}
- /*
- * wake up rmdir() waiter. the rmdir should fail since the cgroup
- * is no longer empty.
- */
- cgroup_wakeup_rmdir_waiter(cgrp);
- return 0;
+/**
+ * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
+ * @dst_cgrp: the cgroup to attach to
+ * @leader: the task or the leader of the threadgroup to be attached
+ * @threadgroup: attach the whole threadgroup?
+ *
+ * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ */
+static int cgroup_attach_task(struct cgroup *dst_cgrp,
+ struct task_struct *leader, bool threadgroup)
+{
+ LIST_HEAD(preloaded_csets);
+ struct task_struct *task;
+ int ret;
+
+ /* look up all src csets */
+ down_read(&css_set_rwsem);
+ rcu_read_lock();
+ task = leader;
+ do {
+ cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
+ &preloaded_csets);
+ if (!threadgroup)
+ break;
+ } while_each_thread(leader, task);
+ rcu_read_unlock();
+ up_read(&css_set_rwsem);
+
+ /* prepare dst csets and commit */
+ ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
+ if (!ret)
+ ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
+
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
}
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will lock
+ * cgroup_mutex and threadgroup.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
+ struct cgroup *cgrp;
+ pid_t pid;
int ret;
+ if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+ return -EINVAL;
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
+ return -ENODEV;
+
+retry_find_task:
+ rcu_read_lock();
if (pid) {
- rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
+ if (!tsk) {
rcu_read_unlock();
- return -ESRCH;
+ ret = -ESRCH;
+ goto out_unlock_cgroup;
}
-
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
tcred = __task_cred(tsk);
- if (cred->euid &&
- cred->euid != tcred->uid &&
- cred->euid != tcred->suid) {
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid)) {
rcu_read_unlock();
- return -EACCES;
+ ret = -EACCES;
+ goto out_unlock_cgroup;
}
- get_task_struct(tsk);
- rcu_read_unlock();
- } else {
+ } else
tsk = current;
- get_task_struct(tsk);
+
+ if (threadgroup)
+ tsk = tsk->group_leader;
+
+ /*
+ * Workqueue threads may acquire PF_NO_SETAFFINITY and become
+ * trapped in a cpuset, or RT worker may be born in a cgroup
+ * with no rt_runtime allocated. Just say no.
+ */
+ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ rcu_read_unlock();
+ goto out_unlock_cgroup;
}
- ret = cgroup_attach_task(cgrp, tsk);
- put_task_struct(tsk);
- return ret;
-}
+ get_task_struct(tsk);
+ rcu_read_unlock();
-static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
-{
- int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
- return ret;
+ threadgroup_lock(tsk);
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * a race with de_thread from another thread's exec()
+ * may strip us of our leadership, if this happens,
+ * there is no choice but to throw this task away and
+ * try again; this is
+ * "double-double-toil-and-trouble-check locking".
+ */
+ threadgroup_unlock(tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+ }
+
+ ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+
+ threadgroup_unlock(tsk);
+
+ put_task_struct(tsk);
+out_unlock_cgroup:
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
}
/**
- * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
- * @cgrp: the cgroup to be checked for liveness
- *
- * On success, returns true; the lock should be later released with
- * cgroup_unlock(). On failure returns false with no lock held.
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
*/
-bool cgroup_lock_live_group(struct cgroup *cgrp)
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
+ struct cgroup_root *root;
+ int retval = 0;
+
mutex_lock(&cgroup_mutex);
- if (cgroup_is_removed(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- return false;
+ for_each_root(root) {
+ struct cgroup *from_cgrp;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ down_read(&css_set_rwsem);
+ from_cgrp = task_cgroup_from_root(from, root);
+ up_read(&css_set_rwsem);
+
+ retval = cgroup_attach_task(from_cgrp, tsk, false);
+ if (retval)
+ break;
}
- return true;
+ mutex_unlock(&cgroup_mutex);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, nbytes, off, false);
}
-static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
- const char *buffer)
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
+ return __cgroup_procs_write(of, buf, nbytes, off, true);
+}
+
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
- if (!cgroup_lock_live_group(cgrp))
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
return -ENODEV;
- strcpy(cgrp->root->release_agent_path, buffer);
- cgroup_unlock();
- return 0;
+ spin_lock(&release_agent_path_lock);
+ strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+ sizeof(cgrp->root->release_agent_path));
+ spin_unlock(&release_agent_path_lock);
+ cgroup_kn_unlock(of->kn);
+ return nbytes;
}
-static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *seq)
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ spin_lock(&release_agent_path_lock);
seq_puts(seq, cgrp->root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
seq_putc(seq, '\n');
- cgroup_unlock();
return 0;
}
-/* A buffer size big enough for numbers or short strings */
-#define CGROUP_LOCAL_BUFFER_SIZE 64
-
-static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *unused_ppos)
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
- char buffer[CGROUP_LOCAL_BUFFER_SIZE];
- int retval = 0;
- char *end;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- if (!nbytes)
- return -EINVAL;
- if (nbytes >= sizeof(buffer))
- return -E2BIG;
- if (copy_from_user(buffer, userbuf, nbytes))
- return -EFAULT;
+ seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+ return 0;
+}
- buffer[nbytes] = 0; /* nul-terminate */
- if (cft->write_u64) {
- u64 val = simple_strtoull(strstrip(buffer), &end, 0);
- if (*end)
- return -EINVAL;
- retval = cft->write_u64(cgrp, cft, val);
- } else {
- s64 val = simple_strtoll(strstrip(buffer), &end, 0);
- if (*end)
- return -EINVAL;
- retval = cft->write_s64(cgrp, cft, val);
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+{
+ struct cgroup_subsys *ss;
+ bool printed = false;
+ int ssid;
+
+ for_each_subsys(ss, ssid) {
+ if (ss_mask & (1 << ssid)) {
+ if (printed)
+ seq_putc(seq, ' ');
+ seq_printf(seq, "%s", ss->name);
+ printed = true;
+ }
}
- if (!retval)
- retval = nbytes;
- return retval;
+ if (printed)
+ seq_putc(seq, '\n');
}
-static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *unused_ppos)
+/* show controllers which are currently attached to the default hierarchy */
+static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
{
- char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
- int retval = 0;
- size_t max_bytes = cft->max_write_len;
- char *buffer = local_buffer;
-
- if (!max_bytes)
- max_bytes = sizeof(local_buffer) - 1;
- if (nbytes >= max_bytes)
- return -E2BIG;
- /* Allocate a dynamic buffer if we need one */
- if (nbytes >= sizeof(local_buffer)) {
- buffer = kmalloc(nbytes + 1, GFP_KERNEL);
- if (buffer == NULL)
- return -ENOMEM;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
+ ~cgrp_dfl_root_inhibit_ss_mask);
+ return 0;
+}
+
+/* show controllers which are enabled from the parent */
+static int cgroup_controllers_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
+ return 0;
+}
+
+/* show controllers which are enabled for a given cgroup's children */
+static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+ return 0;
+}
+
+/**
+ * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
+ * @cgrp: root of the subtree to update csses for
+ *
+ * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
+ * css associations need to be updated accordingly. This function looks up
+ * all css_sets which are attached to the subtree, creates the matching
+ * updated css_sets and migrates the tasks to the new ones.
+ */
+static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+{
+ LIST_HEAD(preloaded_csets);
+ struct cgroup_subsys_state *css;
+ struct css_set *src_cset;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* look up all csses currently attached to @cgrp's subtree */
+ down_read(&css_set_rwsem);
+ css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+ struct cgrp_cset_link *link;
+
+ /* self is not affected by child_subsys_mask change */
+ if (css->cgroup == cgrp)
+ continue;
+
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, cgrp,
+ &preloaded_csets);
}
- if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
- retval = -EFAULT;
- goto out;
+ up_read(&css_set_rwsem);
+
+ /* NULL dst indicates self on default hierarchy */
+ ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+ if (ret)
+ goto out_finish;
+
+ list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+ struct task_struct *last_task = NULL, *task;
+
+ /* src_csets precede dst_csets, break on the first dst_cset */
+ if (!src_cset->mg_src_cgrp)
+ break;
+
+ /*
+ * All tasks in src_cset need to be migrated to the
+ * matching dst_cset. Empty it process by process. We
+ * walk tasks but migrate processes. The leader might even
+ * belong to a different cset but such src_cset would also
+ * be among the target src_csets because the default
+ * hierarchy enforces per-process membership.
+ */
+ while (true) {
+ down_read(&css_set_rwsem);
+ task = list_first_entry_or_null(&src_cset->tasks,
+ struct task_struct, cg_list);
+ if (task) {
+ task = task->group_leader;
+ WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
+ get_task_struct(task);
+ }
+ up_read(&css_set_rwsem);
+
+ if (!task)
+ break;
+
+ /* guard against possible infinite loop */
+ if (WARN(last_task == task,
+ "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
+ goto out_finish;
+ last_task = task;
+
+ threadgroup_lock(task);
+ /* raced against de_thread() from another thread? */
+ if (!thread_group_leader(task)) {
+ threadgroup_unlock(task);
+ put_task_struct(task);
+ continue;
+ }
+
+ ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
+
+ threadgroup_unlock(task);
+ put_task_struct(task);
+
+ if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
+ goto out_finish;
+ }
}
- buffer[nbytes] = 0; /* nul-terminate */
- retval = cft->write_string(cgrp, cft, strstrip(buffer));
- if (!retval)
- retval = nbytes;
-out:
- if (buffer != local_buffer)
- kfree(buffer);
- return retval;
+out_finish:
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
}
-static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
- size_t nbytes, loff_t *ppos)
+/* change the enabled child controllers for a cgroup in the default hierarchy */
+static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ unsigned int enable = 0, disable = 0;
+ struct cgroup *cgrp, *child;
+ struct cgroup_subsys *ss;
+ char *tok;
+ int ssid, ret;
- if (cgroup_is_removed(cgrp))
+ /*
+ * Parse input - space separated list of subsystem names prefixed
+ * with either + or -.
+ */
+ buf = strstrip(buf);
+ while ((tok = strsep(&buf, " "))) {
+ if (tok[0] == '\0')
+ continue;
+ for_each_subsys(ss, ssid) {
+ if (ss->disabled || strcmp(tok + 1, ss->name) ||
+ ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+ continue;
+
+ if (*tok == '+') {
+ enable |= 1 << ssid;
+ disable &= ~(1 << ssid);
+ } else if (*tok == '-') {
+ disable |= 1 << ssid;
+ enable &= ~(1 << ssid);
+ } else {
+ return -EINVAL;
+ }
+ break;
+ }
+ if (ssid == CGROUP_SUBSYS_COUNT)
+ return -EINVAL;
+ }
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
return -ENODEV;
- if (cft->write)
- return cft->write(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->write_u64 || cft->write_s64)
- return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->write_string)
- return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->trigger) {
- int ret = cft->trigger(cgrp, (unsigned int)cft->private);
- return ret ? ret : nbytes;
+
+ for_each_subsys(ss, ssid) {
+ if (enable & (1 << ssid)) {
+ if (cgrp->child_subsys_mask & (1 << ssid)) {
+ enable &= ~(1 << ssid);
+ continue;
+ }
+
+ /*
+ * Because css offlining is asynchronous, userland
+ * might try to re-enable the same controller while
+ * the previous instance is still around. In such
+ * cases, wait till it's gone using offline_waitq.
+ */
+ cgroup_for_each_live_child(child, cgrp) {
+ DEFINE_WAIT(wait);
+
+ if (!cgroup_css(child, ss))
+ continue;
+
+ cgroup_get(child);
+ prepare_to_wait(&child->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ cgroup_kn_unlock(of->kn);
+ schedule();
+ finish_wait(&child->offline_waitq, &wait);
+ cgroup_put(child);
+
+ return restart_syscall();
+ }
+
+ /* unavailable or not enabled on the parent? */
+ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+ (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+ } else if (disable & (1 << ssid)) {
+ if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+ disable &= ~(1 << ssid);
+ continue;
+ }
+
+ /* a child has it enabled? */
+ cgroup_for_each_live_child(child, cgrp) {
+ if (child->child_subsys_mask & (1 << ssid)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ }
+ }
+
+ if (!enable && !disable) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /*
+ * Create csses for enables and update child_subsys_mask. This
+ * changes cgroup_e_css() results which in turn makes the
+ * subsequent cgroup_update_dfl_csses() associate all tasks in the
+ * subtree to the updated csses.
+ */
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ ret = create_css(child, ss);
+ if (ret)
+ goto err_undo_css;
+ }
+ }
+
+ cgrp->child_subsys_mask |= enable;
+ cgrp->child_subsys_mask &= ~disable;
+
+ ret = cgroup_update_dfl_csses(cgrp);
+ if (ret)
+ goto err_undo_css;
+
+ /* all tasks are now migrated away from the old csses, kill them */
+ for_each_subsys(ss, ssid) {
+ if (!(disable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp)
+ kill_css(cgroup_css(child, ss));
+ }
+
+ kernfs_activate(cgrp->kn);
+ ret = 0;
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+
+err_undo_css:
+ cgrp->child_subsys_mask &= ~enable;
+ cgrp->child_subsys_mask |= disable;
+
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+ if (css)
+ kill_css(css);
+ }
}
- return -EINVAL;
+ goto out_unlock;
+}
+
+static int cgroup_populated_show(struct seq_file *seq, void *v)
+{
+ seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
+ return 0;
}
-static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- char __user *buf, size_t nbytes,
- loff_t *ppos)
+static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
{
- char tmp[CGROUP_LOCAL_BUFFER_SIZE];
- u64 val = cft->read_u64(cgrp, cft);
- int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of->kn->priv;
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (cft->write)
+ return cft->write(of, buf, nbytes, off);
+
+ /*
+ * kernfs guarantees that a file isn't deleted with operations in
+ * flight, which means that the matching css is and stays alive and
+ * doesn't need to be pinned. The RCU locking is not necessary
+ * either. It's just for the convenience of using cgroup_css().
+ */
+ rcu_read_lock();
+ css = cgroup_css(cgrp, cft->ss);
+ rcu_read_unlock();
+
+ if (cft->write_u64) {
+ unsigned long long v;
+ ret = kstrtoull(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_u64(css, cft, v);
+ } else if (cft->write_s64) {
+ long long v;
+ ret = kstrtoll(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_s64(css, cft, v);
+ } else {
+ ret = -EINVAL;
+ }
- return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+ return ret ?: nbytes;
}
-static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- char __user *buf, size_t nbytes,
- loff_t *ppos)
+static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
- char tmp[CGROUP_LOCAL_BUFFER_SIZE];
- s64 val = cft->read_s64(cgrp, cft);
- int len = sprintf(tmp, "%lld\n", (long long) val);
+ return seq_cft(seq)->seq_start(seq, ppos);
+}
- return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+ return seq_cft(seq)->seq_next(seq, v, ppos);
}
-static ssize_t cgroup_file_read(struct file *file, char __user *buf,
- size_t nbytes, loff_t *ppos)
+static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ seq_cft(seq)->seq_stop(seq, v);
+}
- if (cgroup_is_removed(cgrp))
- return -ENODEV;
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+ struct cftype *cft = seq_cft(m);
+ struct cgroup_subsys_state *css = seq_css(m);
+
+ if (cft->seq_show)
+ return cft->seq_show(m, arg);
- if (cft->read)
- return cft->read(cgrp, cft, file, buf, nbytes, ppos);
if (cft->read_u64)
- return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
- if (cft->read_s64)
- return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
- return -EINVAL;
+ seq_printf(m, "%llu\n", cft->read_u64(css, cft));
+ else if (cft->read_s64)
+ seq_printf(m, "%lld\n", cft->read_s64(css, cft));
+ else
+ return -EINVAL;
+ return 0;
}
+static struct kernfs_ops cgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_show = cgroup_seqfile_show,
+};
+
+static struct kernfs_ops cgroup_kf_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_start = cgroup_seqfile_start,
+ .seq_next = cgroup_seqfile_next,
+ .seq_stop = cgroup_seqfile_stop,
+ .seq_show = cgroup_seqfile_show,
+};
+
/*
- * seqfile ops/methods for returning structured data. Currently just
- * supports string->u64 maps, but can be extended in future.
+ * cgroup_rename - Only allow simple rename of directories in place.
*/
+static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+ const char *new_name_str)
+{
+ struct cgroup *cgrp = kn->priv;
+ int ret;
-struct cgroup_seqfile_state {
- struct cftype *cft;
- struct cgroup *cgroup;
-};
+ if (kernfs_type(kn) != KERNFS_DIR)
+ return -ENOTDIR;
+ if (kn->parent != new_parent)
+ return -EIO;
-static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
+ /*
+ * This isn't a proper migration and its usefulness is very
+ * limited. Disallow if sane_behavior.
+ */
+ if (cgroup_sane_behavior(cgrp))
+ return -EPERM;
+
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. kernfs_rename() doesn't require active_ref
+ * protection. Break them before grabbing cgroup_mutex.
+ */
+ kernfs_break_active_protection(new_parent);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&cgroup_mutex);
+
+ ret = kernfs_rename(kn, new_parent, new_name_str);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_unbreak_active_protection(kn);
+ kernfs_unbreak_active_protection(new_parent);
+ return ret;
+}
+
+/* set uid and gid of cgroup dirs and files to that of the creator */
+static int cgroup_kn_set_ugid(struct kernfs_node *kn)
{
- struct seq_file *sf = cb->state;
- return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
+ struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = current_fsuid(),
+ .ia_gid = current_fsgid(), };
+
+ if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+ gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+ return 0;
+
+ return kernfs_setattr(kn, &iattr);
}
-static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
{
- struct cgroup_seqfile_state *state = m->private;
- struct cftype *cft = state->cft;
- if (cft->read_map) {
- struct cgroup_map_cb cb = {
- .fill = cgroup_map_add,
- .state = m,
- };
- return cft->read_map(state->cgroup, cft, &cb);
+ char name[CGROUP_FILE_NAME_MAX];
+ struct kernfs_node *kn;
+ struct lock_class_key *key = NULL;
+ int ret;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ key = &cft->lockdep_key;
+#endif
+ kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
+ cgroup_file_mode(cft), 0, cft->kf_ops, cft,
+ NULL, false, key);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
}
- return cft->read_seq_string(state->cgroup, cft, m);
+
+ if (cft->seq_show == cgroup_populated_show)
+ cgrp->populated_kn = kn;
+ return 0;
}
-static int cgroup_seqfile_release(struct inode *inode, struct file *file)
+/**
+ * cgroup_addrm_files - add or remove files to a cgroup directory
+ * @cgrp: the target cgroup
+ * @cfts: array of cftypes to be added
+ * @is_add: whether to add or remove
+ *
+ * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
+ * For removals, this function never fails. If addition fails, this
+ * function doesn't remove files already added. The caller is responsible
+ * for cleaning up.
+ */
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add)
{
- struct seq_file *seq = file->private_data;
- kfree(seq->private);
- return single_release(inode, file);
+ struct cftype *cft;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* does cft->flags tell us to skip this file on @cgrp? */
+ if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
+ continue;
+
+ if (is_add) {
+ ret = cgroup_add_file(cgrp, cft);
+ if (ret) {
+ pr_warn("%s: failed to add %s, err=%d\n",
+ __func__, cft->name, ret);
+ return ret;
+ }
+ } else {
+ cgroup_rm_file(cgrp, cft);
+ }
+ }
+ return 0;
}
-static const struct file_operations cgroup_seqfile_operations = {
- .read = seq_read,
- .write = cgroup_file_write,
- .llseek = seq_lseek,
- .release = cgroup_seqfile_release,
-};
+static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
+{
+ LIST_HEAD(pending);
+ struct cgroup_subsys *ss = cfts[0].ss;
+ struct cgroup *root = &ss->root->cgrp;
+ struct cgroup_subsys_state *css;
+ int ret = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* add/rm files for all cgroups created before */
+ css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
+ struct cgroup *cgrp = css->cgroup;
+
+ if (cgroup_is_dead(cgrp))
+ continue;
+
+ ret = cgroup_addrm_files(cgrp, cfts, is_add);
+ if (ret)
+ break;
+ }
+
+ if (is_add && !ret)
+ kernfs_activate(root->kn);
+ return ret;
+}
-static int cgroup_file_open(struct inode *inode, struct file *file)
+static void cgroup_exit_cftypes(struct cftype *cfts)
{
- int err;
struct cftype *cft;
- err = generic_file_open(inode, file);
- if (err)
- return err;
- cft = __d_cft(file->f_dentry);
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* free copy for custom atomic_write_len, see init_cftypes() */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
+ kfree(cft->kf_ops);
+ cft->kf_ops = NULL;
+ cft->ss = NULL;
+ }
+}
- if (cft->read_map || cft->read_seq_string) {
- struct cgroup_seqfile_state *state =
- kzalloc(sizeof(*state), GFP_USER);
- if (!state)
- return -ENOMEM;
- state->cft = cft;
- state->cgroup = __d_cgrp(file->f_dentry->d_parent);
- file->f_op = &cgroup_seqfile_operations;
- err = single_open(file, cgroup_seqfile_show, state);
- if (err < 0)
- kfree(state);
- } else if (cft->open)
- err = cft->open(inode, file);
- else
- err = 0;
+static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
- return err;
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ struct kernfs_ops *kf_ops;
+
+ WARN_ON(cft->ss || cft->kf_ops);
+
+ if (cft->seq_start)
+ kf_ops = &cgroup_kf_ops;
+ else
+ kf_ops = &cgroup_kf_single_ops;
+
+ /*
+ * Ugh... if @cft wants a custom max_write_len, we need to
+ * make a copy of kf_ops to set its atomic_write_len.
+ */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
+ kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
+ if (!kf_ops) {
+ cgroup_exit_cftypes(cfts);
+ return -ENOMEM;
+ }
+ kf_ops->atomic_write_len = cft->max_write_len;
+ }
+
+ cft->kf_ops = kf_ops;
+ cft->ss = ss;
+ }
+
+ return 0;
}
-static int cgroup_file_release(struct inode *inode, struct file *file)
+static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- if (cft->release)
- return cft->release(inode, file);
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!cfts || !cfts[0].ss)
+ return -ENOENT;
+
+ list_del(&cfts->node);
+ cgroup_apply_cftypes(cfts, false);
+ cgroup_exit_cftypes(cfts);
return 0;
}
-/*
- * cgroup_rename - Only allow simple rename of directories in place.
+/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts. Files described by @cfts are removed from all
+ * existing cgroups and all future cgroups won't have them either. This
+ * function can be called anytime whether @cfts' subsys is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered.
*/
-static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+int cgroup_rm_cftypes(struct cftype *cfts)
{
- if (!S_ISDIR(old_dentry->d_inode->i_mode))
- return -ENOTDIR;
- if (new_dentry->d_inode)
- return -EEXIST;
- if (old_dir != new_dir)
- return -EIO;
- return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = cgroup_rm_cftypes_locked(cfts);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
}
-static const struct file_operations cgroup_file_operations = {
- .read = cgroup_file_read,
- .write = cgroup_file_write,
- .llseek = generic_file_llseek,
- .open = cgroup_file_open,
- .release = cgroup_file_release,
-};
+/**
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Register @cfts to @ss. Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too. This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure. Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
+ */
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ int ret;
-static const struct inode_operations cgroup_dir_inode_operations = {
- .lookup = simple_lookup,
- .mkdir = cgroup_mkdir,
- .rmdir = cgroup_rmdir,
- .rename = cgroup_rename,
-};
+ if (ss->disabled)
+ return 0;
-static int cgroup_create_file(struct dentry *dentry, mode_t mode,
- struct super_block *sb)
-{
- static const struct dentry_operations cgroup_dops = {
- .d_iput = cgroup_diput,
- };
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
- struct inode *inode;
+ ret = cgroup_init_cftypes(ss, cfts);
+ if (ret)
+ return ret;
- if (!dentry)
- return -ENOENT;
- if (dentry->d_inode)
- return -EEXIST;
+ mutex_lock(&cgroup_mutex);
- inode = cgroup_new_inode(mode, sb);
- if (!inode)
- return -ENOMEM;
+ list_add_tail(&cfts->node, &ss->cfts);
+ ret = cgroup_apply_cftypes(cfts, true);
+ if (ret)
+ cgroup_rm_cftypes_locked(cfts);
- if (S_ISDIR(mode)) {
- inode->i_op = &cgroup_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
- /* start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup.
+ */
+static int cgroup_task_count(const struct cgroup *cgrp)
+{
+ int count = 0;
+ struct cgrp_cset_link *link;
- /* start with the directory inode held, so that we can
- * populate it without racing with another mkdir */
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- } else if (S_ISREG(mode)) {
- inode->i_size = 0;
- inode->i_fop = &cgroup_file_operations;
- }
- dentry->d_op = &cgroup_dops;
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
- return 0;
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ count += atomic_read(&link->cset->refcount);
+ up_read(&css_set_rwsem);
+ return count;
}
-/*
- * cgroup_create_dir - create a directory for an object.
- * @cgrp: the cgroup we create the directory for. It must have a valid
- * ->parent field. And we are going to fill its ->dentry field.
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new directory.
+/**
+ * css_next_child - find the next child of a given css
+ * @pos: the current position (%NULL to initiate traversal)
+ * @parent: css whose children to walk
+ *
+ * This function returns the next child of @parent and should be called
+ * under either cgroup_mutex or RCU read lock. The only requirement is
+ * that @parent and @pos are accessible. The next sibling is guaranteed to
+ * be returned regardless of their states.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
*/
-static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
- mode_t mode)
-{
- struct dentry *parent;
- int error = 0;
-
- parent = cgrp->parent->dentry;
- error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
- if (!error) {
- dentry->d_fsdata = cgrp;
- inc_nlink(parent->d_inode);
- rcu_assign_pointer(cgrp->dentry, dentry);
- dget(dentry);
+struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *parent)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /*
+ * @pos could already have been unlinked from the sibling list.
+ * Once a cgroup is removed, its ->sibling.next is no longer
+ * updated when its next sibling changes. CSS_RELEASED is set when
+ * @pos is taken off list, at which time its next pointer is valid,
+ * and, as releases are serialized, the one pointed to by the next
+ * pointer is guaranteed to not have started release yet. This
+ * implies that if we observe !CSS_RELEASED on @pos in this RCU
+ * critical section, the one pointed to by its next pointer is
+ * guaranteed to not have finished its RCU grace period even if we
+ * have dropped rcu_read_lock() inbetween iterations.
+ *
+ * If @pos has CSS_RELEASED set, its next pointer can't be
+ * dereferenced; however, as each css is given a monotonically
+ * increasing unique serial number and always appended to the
+ * sibling list, the next one can be found by walking the parent's
+ * children until the first css with higher serial number than
+ * @pos's. While this path can be slower, it happens iff iteration
+ * races against release and the race window is very small.
+ */
+ if (!pos) {
+ next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
+ } else if (likely(!(pos->flags & CSS_RELEASED))) {
+ next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
+ } else {
+ list_for_each_entry_rcu(next, &parent->children, sibling)
+ if (next->serial_nr > pos->serial_nr)
+ break;
}
- dput(dentry);
- return error;
+ /*
+ * @next, if not pointing to the head, can be dereferenced and is
+ * the next sibling.
+ */
+ if (&next->sibling != &parent->children)
+ return next;
+ return NULL;
}
/**
- * cgroup_file_mode - deduce file mode of a control file
- * @cft: the control file in question
+ * css_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: css whose descendants to walk
*
- * returns cft->mode if ->mode is not 0
- * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
- * returns S_IRUGO if it has only a read handler
- * returns S_IWUSR if it has only a write hander
+ * To be used by css_for_each_descendant_pre(). Find the next descendant
+ * to visit for pre-order traversal of @root's descendants. @root is
+ * included in the iteration and the first node to be visited.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct next descendant as long
+ * as both @pos and @root are accessible and @pos is a descendant of @root.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
*/
-static mode_t cgroup_file_mode(const struct cftype *cft)
+struct cgroup_subsys_state *
+css_next_descendant_pre(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
{
- mode_t mode = 0;
+ struct cgroup_subsys_state *next;
- if (cft->mode)
- return cft->mode;
+ cgroup_assert_mutex_or_rcu_locked();
- if (cft->read || cft->read_u64 || cft->read_s64 ||
- cft->read_map || cft->read_seq_string)
- mode |= S_IRUGO;
+ /* if first iteration, visit @root */
+ if (!pos)
+ return root;
- if (cft->write || cft->write_u64 || cft->write_s64 ||
- cft->write_string || cft->trigger)
- mode |= S_IWUSR;
+ /* visit the first child if exists */
+ next = css_next_child(NULL, pos);
+ if (next)
+ return next;
- return mode;
+ /* no child, visit my or the closest ancestor's next sibling */
+ while (pos != root) {
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return next;
+ pos = pos->parent;
+ }
+
+ return NULL;
}
-int cgroup_add_file(struct cgroup *cgrp,
- struct cgroup_subsys *subsys,
- const struct cftype *cft)
+/**
+ * css_rightmost_descendant - return the rightmost descendant of a css
+ * @pos: css of interest
+ *
+ * Return the rightmost descendant of @pos. If there's no descendant, @pos
+ * is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct rightmost descendant as
+ * long as @pos is accessible.
+ */
+struct cgroup_subsys_state *
+css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
- struct dentry *dir = cgrp->dentry;
- struct dentry *dentry;
- int error;
- mode_t mode;
+ struct cgroup_subsys_state *last, *tmp;
- char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
- if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
- strcpy(name, subsys->name);
- strcat(name, ".");
- }
- strcat(name, cft->name);
- BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
- dentry = lookup_one_len(name, dir, strlen(name));
- if (!IS_ERR(dentry)) {
- mode = cgroup_file_mode(cft);
- error = cgroup_create_file(dentry, mode | S_IFREG,
- cgrp->root->sb);
- if (!error)
- dentry->d_fsdata = (void *)cft;
- dput(dentry);
- } else
- error = PTR_ERR(dentry);
- return error;
+ cgroup_assert_mutex_or_rcu_locked();
+
+ do {
+ last = pos;
+ /* ->prev isn't RCU safe, walk ->next till the end */
+ pos = NULL;
+ css_for_each_child(tmp, last)
+ pos = tmp;
+ } while (pos);
+
+ return last;
}
-int cgroup_add_files(struct cgroup *cgrp,
- struct cgroup_subsys *subsys,
- const struct cftype cft[],
- int count)
+static struct cgroup_subsys_state *
+css_leftmost_descendant(struct cgroup_subsys_state *pos)
{
- int i, err;
- for (i = 0; i < count; i++) {
- err = cgroup_add_file(cgrp, subsys, &cft[i]);
- if (err)
- return err;
- }
- return 0;
+ struct cgroup_subsys_state *last;
+
+ do {
+ last = pos;
+ pos = css_next_child(NULL, pos);
+ } while (pos);
+
+ return last;
}
/**
- * cgroup_task_count - count the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
+ * css_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: css whose descendants to walk
*
- * Return the number of tasks in the cgroup.
+ * To be used by css_for_each_descendant_post(). Find the next descendant
+ * to visit for post-order traversal of @root's descendants. @root is
+ * included in the iteration and the last node to be visited.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct next descendant as long
+ * as both @pos and @cgroup are accessible and @pos is a descendant of
+ * @cgroup.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
*/
-int cgroup_task_count(const struct cgroup *cgrp)
+struct cgroup_subsys_state *
+css_next_descendant_post(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
{
- int count = 0;
- struct cg_cgroup_link *link;
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
- read_lock(&css_set_lock);
- list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
- count += atomic_read(&link->cg->refcount);
+ /* if first iteration, visit leftmost descendant which may be @root */
+ if (!pos)
+ return css_leftmost_descendant(root);
+
+ /* if we visited @root, we're done */
+ if (pos == root)
+ return NULL;
+
+ /* if there's an unvisited sibling, visit its leftmost descendant */
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return css_leftmost_descendant(next);
+
+ /* no sibling left, visit parent */
+ return pos->parent;
+}
+
+/**
+ * css_has_online_children - does a css have online children
+ * @css: the target css
+ *
+ * Returns %true if @css has any online children; otherwise, %false. This
+ * function can be called from any context but the caller is responsible
+ * for synchronizing against on/offlining as necessary.
+ */
+bool css_has_online_children(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *child;
+ bool ret = false;
+
+ rcu_read_lock();
+ css_for_each_child(child, css) {
+ if (child->flags & CSS_ONLINE) {
+ ret = true;
+ break;
+ }
}
- read_unlock(&css_set_lock);
- return count;
+ rcu_read_unlock();
+ return ret;
}
-/*
- * Advance a list_head iterator. The iterator should be positioned at
- * the start of a css_set
+/**
+ * css_advance_task_iter - advance a task itererator to the next css_set
+ * @it: the iterator to advance
+ *
+ * Advance @it to the next css_set to walk.
*/
-static void cgroup_advance_iter(struct cgroup *cgrp,
- struct cgroup_iter *it)
+static void css_advance_task_iter(struct css_task_iter *it)
{
- struct list_head *l = it->cg_link;
- struct cg_cgroup_link *link;
- struct css_set *cg;
+ struct list_head *l = it->cset_pos;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
/* Advance to the next non-empty css_set */
do {
l = l->next;
- if (l == &cgrp->css_sets) {
- it->cg_link = NULL;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
return;
}
- link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
- cg = link->cg;
- } while (list_empty(&cg->tasks));
- it->cg_link = l;
- it->task = cg->tasks.next;
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set,
+ e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+ } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
+
+ it->cset_pos = l;
+
+ if (!list_empty(&cset->tasks))
+ it->task_pos = cset->tasks.next;
+ else
+ it->task_pos = cset->mg_tasks.next;
+
+ it->tasks_head = &cset->tasks;
+ it->mg_tasks_head = &cset->mg_tasks;
}
-/*
- * To reduce the fork() overhead for systems that are not actually
- * using their cgroups capability, we don't maintain the lists running
- * through each css_set to its tasks until we see the list actually
- * used - in other words after the first call to cgroup_iter_start().
+/**
+ * css_task_iter_start - initiate task iteration
+ * @css: the css to walk tasks of
+ * @it: the task iterator to use
+ *
+ * Initiate iteration through the tasks of @css. The caller can call
+ * css_task_iter_next() to walk through the tasks until the function
+ * returns NULL. On completion of iteration, css_task_iter_end() must be
+ * called.
*
- * The tasklist_lock is not held here, as do_each_thread() and
- * while_each_thread() are protected by RCU.
+ * Note that this function acquires a lock which is released when the
+ * iteration finishes. The caller can't sleep while iteration is in
+ * progress.
*/
-static void cgroup_enable_task_cg_lists(void)
+void css_task_iter_start(struct cgroup_subsys_state *css,
+ struct css_task_iter *it)
+ __acquires(css_set_rwsem)
{
- struct task_struct *p, *g;
- write_lock(&css_set_lock);
- use_task_css_set_links = 1;
- do_each_thread(g, p) {
- task_lock(p);
- /*
- * We should check if the process is exiting, otherwise
- * it will race with cgroup_exit() in that the list
- * entry won't be deleted though the process has exited.
- */
- if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
- list_add(&p->cg_list, &p->cgroups->tasks);
- task_unlock(p);
- } while_each_thread(g, p);
- write_unlock(&css_set_lock);
-}
+ /* no one should try to iterate before mounting cgroups */
+ WARN_ON_ONCE(!use_task_css_set_links);
-void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
-{
- /*
- * The first time anyone tries to iterate across a cgroup,
- * we need to enable the list linking each css_set to its
- * tasks, and fix up all existing tasks.
- */
- if (!use_task_css_set_links)
- cgroup_enable_task_cg_lists();
+ down_read(&css_set_rwsem);
- read_lock(&css_set_lock);
- it->cg_link = &cgrp->css_sets;
- cgroup_advance_iter(cgrp, it);
+ it->ss = css->ss;
+
+ if (it->ss)
+ it->cset_pos = &css->cgroup->e_csets[css->ss->id];
+ else
+ it->cset_pos = &css->cgroup->cset_links;
+
+ it->cset_head = it->cset_pos;
+
+ css_advance_task_iter(it);
}
-struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
- struct cgroup_iter *it)
+/**
+ * css_task_iter_next - return the next task for the iterator
+ * @it: the task iterator being iterated
+ *
+ * The "next" function for task iteration. @it should have been
+ * initialized via css_task_iter_start(). Returns NULL when the iteration
+ * reaches the end.
+ */
+struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
struct task_struct *res;
- struct list_head *l = it->task;
- struct cg_cgroup_link *link;
+ struct list_head *l = it->task_pos;
/* If the iterator cg is NULL, we have no tasks */
- if (!it->cg_link)
+ if (!it->cset_pos)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
- /* Advance iterator to find next entry */
+
+ /*
+ * Advance iterator to find next entry. cset->tasks is consumed
+ * first and then ->mg_tasks. After ->mg_tasks, we move onto the
+ * next cset.
+ */
l = l->next;
- link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
- if (l == &link->cg->tasks) {
- /* We reached the end of this task list - move on to
- * the next cg_cgroup_link */
- cgroup_advance_iter(cgrp, it);
- } else {
- it->task = l;
- }
- return res;
-}
-void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
-{
- read_unlock(&css_set_lock);
-}
+ if (l == it->tasks_head)
+ l = it->mg_tasks_head->next;
-static inline int started_after_time(struct task_struct *t1,
- struct timespec *time,
- struct task_struct *t2)
-{
- int start_diff = timespec_compare(&t1->start_time, time);
- if (start_diff > 0) {
- return 1;
- } else if (start_diff < 0) {
- return 0;
- } else {
- /*
- * Arbitrarily, if two processes started at the same
- * time, we'll say that the lower pointer value
- * started first. Note that t2 may have exited by now
- * so this may not be a valid pointer any longer, but
- * that's fine - it still serves to distinguish
- * between two tasks started (effectively) simultaneously.
- */
- return t1 > t2;
- }
+ if (l == it->mg_tasks_head)
+ css_advance_task_iter(it);
+ else
+ it->task_pos = l;
+
+ return res;
}
-/*
- * This function is a callback from heap_insert() and is used to order
- * the heap.
- * In this case we order the heap in descending task start time.
+/**
+ * css_task_iter_end - finish task iteration
+ * @it: the task iterator to finish
+ *
+ * Finish task iteration started by css_task_iter_start().
*/
-static inline int started_after(void *p1, void *p2)
+void css_task_iter_end(struct css_task_iter *it)
+ __releases(css_set_rwsem)
{
- struct task_struct *t1 = p1;
- struct task_struct *t2 = p2;
- return started_after_time(t1, &t2->start_time, t2);
+ up_read(&css_set_rwsem);
}
/**
- * cgroup_scan_tasks - iterate though all the tasks in a cgroup
- * @scan: struct cgroup_scanner containing arguments for the scan
+ * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
*
- * Arguments include pointers to callback functions test_task() and
- * process_task().
- * Iterate through all the tasks in a cgroup, calling test_task() for each,
- * and if it returns true, call process_task() for it also.
- * The test_task pointer may be NULL, meaning always true (select all tasks).
- * Effectively duplicates cgroup_iter_{start,next,end}()
- * but does not lock css_set_lock for the call to process_task().
- * The struct cgroup_scanner may be embedded in any structure of the caller's
- * creation.
- * It is guaranteed that process_task() will act on every task that
- * is a member of the cgroup for the duration of this call. This
- * function may or may not call process_task() for tasks that exit
- * or move to a different cgroup during the call, or are forked or
- * move into the cgroup during the call.
- *
- * Note that test_task() may be called with locks held, and may in some
- * situations be called multiple times for the same task, so it should
- * be cheap.
- * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
- * pre-allocated and will be used for heap operations (and its "gt" member will
- * be overwritten), else a temporary heap will be used (allocation of which
- * may cause this function to fail).
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup. No task
+ * can slip out of migration through forking.
*/
-int cgroup_scan_tasks(struct cgroup_scanner *scan)
-{
- int retval, i;
- struct cgroup_iter it;
- struct task_struct *p, *dropped;
- /* Never dereference latest_task, since it's not refcounted */
- struct task_struct *latest_task = NULL;
- struct ptr_heap tmp_heap;
- struct ptr_heap *heap;
- struct timespec latest_time = { 0, 0 };
-
- if (scan->heap) {
- /* The caller supplied our heap and pre-allocated its memory */
- heap = scan->heap;
- heap->gt = &started_after;
- } else {
- /* We need to allocate our own heap memory */
- heap = &tmp_heap;
- retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
- if (retval)
- /* cannot allocate the heap */
- return retval;
- }
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+ LIST_HEAD(preloaded_csets);
+ struct cgrp_cset_link *link;
+ struct css_task_iter it;
+ struct task_struct *task;
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+
+ /* all tasks in @from are being moved, all csets are source */
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &from->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
+ up_read(&css_set_rwsem);
+
+ ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
+ if (ret)
+ goto out_err;
- again:
/*
- * Scan tasks in the cgroup, using the scanner's "test_task" callback
- * to determine which are of interest, and using the scanner's
- * "process_task" callback to process any of them that need an update.
- * Since we don't want to hold any locks during the task updates,
- * gather tasks to be processed in a heap structure.
- * The heap is sorted by descending task start time.
- * If the statically-sized heap fills up, we overflow tasks that
- * started later, and in future iterations only consider tasks that
- * started after the latest task in the previous pass. This
- * guarantees forward progress and that we don't miss any tasks.
+ * Migrate tasks one-by-one until @form is empty. This fails iff
+ * ->can_attach() fails.
*/
- heap->size = 0;
- cgroup_iter_start(scan->cg, &it);
- while ((p = cgroup_iter_next(scan->cg, &it))) {
- /*
- * Only affect tasks that qualify per the caller's callback,
- * if he provided one
- */
- if (scan->test_task && !scan->test_task(p, scan))
- continue;
- /*
- * Only process tasks that started after the last task
- * we processed
- */
- if (!started_after_time(p, &latest_time, latest_task))
- continue;
- dropped = heap_insert(heap, p);
- if (dropped == NULL) {
- /*
- * The new task was inserted; the heap wasn't
- * previously full
- */
- get_task_struct(p);
- } else if (dropped != p) {
- /*
- * The new task was inserted, and pushed out a
- * different task
- */
- get_task_struct(p);
- put_task_struct(dropped);
- }
- /*
- * Else the new task was newer than anything already in
- * the heap and wasn't inserted
- */
- }
- cgroup_iter_end(scan->cg, &it);
-
- if (heap->size) {
- for (i = 0; i < heap->size; i++) {
- struct task_struct *q = heap->ptrs[i];
- if (i == 0) {
- latest_time = q->start_time;
- latest_task = q;
- }
- /* Process the task per the caller's callback */
- scan->process_task(q, scan);
- put_task_struct(q);
+ do {
+ css_task_iter_start(&from->self, &it);
+ task = css_task_iter_next(&it);
+ if (task)
+ get_task_struct(task);
+ css_task_iter_end(&it);
+
+ if (task) {
+ ret = cgroup_migrate(to, task, false);
+ put_task_struct(task);
}
- /*
- * If we had to process any tasks at all, scan again
- * in case some of them were in the middle of forking
- * children that didn't get processed.
- * Not the most efficient way to do it, but it avoids
- * having to take callback_mutex in the fork path
- */
- goto again;
- }
- if (heap == &tmp_heap)
- heap_free(&tmp_heap);
- return 0;
+ } while (task && !ret);
+out_err:
+ cgroup_migrate_finish(&preloaded_csets);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
}
/*
@@ -2355,6 +3565,36 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
*
*/
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+ CGROUP_FILE_PROCS,
+ CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+ /*
+ * used to find which pidlist is wanted. doesn't change as long as
+ * this particular list stays in the list.
+ */
+ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+ /* array of xids */
+ pid_t *list;
+ /* how many elements the above list has */
+ int length;
+ /* each of these stored in a list by its cgroup */
+ struct list_head links;
+ /* pointer to the cgroup we belong to, for list removal purposes */
+ struct cgroup *owner;
+ /* for delayed destruction */
+ struct delayed_work destroy_dwork;
+};
+
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
@@ -2368,6 +3608,7 @@ static void *pidlist_allocate(int count)
else
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
+
static void pidlist_free(void *p)
{
if (is_vmalloc_addr(p))
@@ -2375,35 +3616,55 @@ static void pidlist_free(void *p)
else
kfree(p);
}
-static void *pidlist_resize(void *p, int newcount)
+
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer. None
+ * should be left afterwards.
+ */
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
{
- void *newlist;
- /* note: if new alloc fails, old p will still be valid either way */
- if (is_vmalloc_addr(p)) {
- newlist = vmalloc(newcount * sizeof(pid_t));
- if (!newlist)
- return NULL;
- memcpy(newlist, p, newcount * sizeof(pid_t));
- vfree(p);
- } else {
- newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
+ struct cgroup_pidlist *l, *tmp_l;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+ list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+ mutex_unlock(&cgrp->pidlist_mutex);
+
+ flush_workqueue(cgroup_pidlist_destroy_wq);
+ BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+ destroy_dwork);
+ struct cgroup_pidlist *tofree = NULL;
+
+ mutex_lock(&l->owner->pidlist_mutex);
+
+ /*
+ * Destroy iff we didn't get queued again. The state won't change
+ * as destroy_dwork can only be queued while locked.
+ */
+ if (!delayed_work_pending(dwork)) {
+ list_del(&l->links);
+ pidlist_free(l->list);
+ put_pid_ns(l->key.ns);
+ tofree = l;
}
- return newlist;
+
+ mutex_unlock(&l->owner->pidlist_mutex);
+ kfree(tofree);
}
/*
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * If the new stripped list is sufficiently smaller and there's enough memory
- * to allocate a new buffer, will let go of the unneeded memory. Returns the
- * number of unique elements.
+ * Returns the number of unique elements.
*/
-/* is the size difference enough that we should re-allocate the array? */
-#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
-static int pidlist_uniq(pid_t **p, int length)
+static int pidlist_uniq(pid_t *list, int length)
{
int src, dest = 1;
- pid_t *list = *p;
- pid_t *newlist;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
@@ -2424,69 +3685,95 @@ static int pidlist_uniq(pid_t **p, int length)
dest++;
}
after:
- /*
- * if the length difference is large enough, we want to allocate a
- * smaller buffer to save memory. if this fails due to out of memory,
- * we'll just stay with what we've got.
- */
- if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
- newlist = pidlist_resize(list, dest);
- if (newlist)
- *p = newlist;
- }
return dest;
}
+/*
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco. As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer. As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ *
+ * All this extra complexity was caused by the original implementation
+ * committing to an entirely unnecessary property. In the long term, we
+ * want to do away with it. Explicitly scramble sort order if
+ * sane_behavior so that no such expectation exists in the new interface.
+ *
+ * Scrambling is done by swapping every two consecutive bits, which is
+ * non-identity one-to-one mapping which disturbs sort order sufficiently.
+ */
+static pid_t pid_fry(pid_t pid)
+{
+ unsigned a = pid & 0x55555555;
+ unsigned b = pid & 0xAAAAAAAA;
+
+ return (a << 1) | (b >> 1);
+}
+
+static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
+{
+ if (cgroup_sane_behavior(cgrp))
+ return pid_fry(pid);
+ else
+ return pid;
+}
+
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
+static int fried_cmppid(const void *a, const void *b)
+{
+ return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ struct pid_namespace *ns = task_active_pid_ns(current);
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ list_for_each_entry(l, &cgrp->pidlists, links)
+ if (l->key.type == type && l->key.ns == ns)
+ return l;
+ return NULL;
+}
+
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
- enum cgroup_filetype type)
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+ enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
- /* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
- /*
- * We can't drop the pidlist_mutex before taking the l->mutex in case
- * the last ref-holder is trying to remove l from the list at the same
- * time. Holding the pidlist_mutex precludes somebody taking whichever
- * list we find out from under us - compare release_pid_array().
- */
- mutex_lock(&cgrp->pidlist_mutex);
- list_for_each_entry(l, &cgrp->pidlists, links) {
- if (l->key.type == type && l->key.ns == ns) {
- /* found a matching list - drop the extra refcount */
- put_pid_ns(ns);
- /* make sure l doesn't vanish out from under us */
- down_write(&l->mutex);
- mutex_unlock(&cgrp->pidlist_mutex);
- return l;
- }
- }
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ l = cgroup_pidlist_find(cgrp, type);
+ if (l)
+ return l;
+
/* entry not found; create a new one */
- l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
- if (!l) {
- mutex_unlock(&cgrp->pidlist_mutex);
- put_pid_ns(ns);
+ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+ if (!l)
return l;
- }
- init_rwsem(&l->mutex);
- down_write(&l->mutex);
+
+ INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
l->key.type = type;
- l->key.ns = ns;
- l->use_count = 0; /* don't increment here */
- l->list = NULL;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ l->key.ns = get_pid_ns(task_active_pid_ns(current));
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
- mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
@@ -2499,10 +3786,12 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
pid_t *array;
int length;
int pid, n = 0; /* used for populating the array */
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *tsk;
struct cgroup_pidlist *l;
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
@@ -2514,8 +3803,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
if (!array)
return -ENOMEM;
/* now, populate the array */
- cgroup_iter_start(cgrp, &it);
- while ((tsk = cgroup_iter_next(cgrp, &it))) {
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
if (unlikely(n == length))
break;
/* get tgid or pid for procs or tasks file respectively */
@@ -2526,23 +3815,27 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
}
- cgroup_iter_end(cgrp, &it);
+ css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
- sort(array, length, sizeof(pid_t), cmppid, NULL);
+ if (cgroup_sane_behavior(cgrp))
+ sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
+ else
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
if (type == CGROUP_FILE_PROCS)
- length = pidlist_uniq(&array, length);
- l = cgroup_pidlist_find(cgrp, type);
+ length = pidlist_uniq(array, length);
+
+ l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
+ mutex_unlock(&cgrp->pidlist_mutex);
pidlist_free(array);
return -ENOMEM;
}
- /* store array, freeing old if necessary - lock already held */
+
+ /* store array, freeing old if necessary */
pidlist_free(l->list);
l->list = array;
l->length = length;
- l->use_count++;
- up_write(&l->mutex);
*lp = l;
return 0;
}
@@ -2558,24 +3851,34 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
- int ret = -EINVAL;
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
struct cgroup *cgrp;
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *tsk;
+ /* it should be kernfs_node belonging to cgroupfs and is a directory */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return -EINVAL;
+
+ mutex_lock(&cgroup_mutex);
+
/*
- * Validate dentry by checking the superblock operations,
- * and make sure it's a directory.
+ * We aren't being called from kernfs and there's no guarantee on
+ * @kn->priv's validity. For this and css_tryget_online_from_dir(),
+ * @kn->priv is RCU safe. Let's do the RCU dancing.
*/
- if (dentry->d_sb->s_op != &cgroup_ops ||
- !S_ISDIR(dentry->d_inode->i_mode))
- goto err;
-
- ret = 0;
- cgrp = dentry->d_fsdata;
+ rcu_read_lock();
+ cgrp = rcu_dereference(kn->priv);
+ if (!cgrp || cgroup_is_dead(cgrp)) {
+ rcu_read_unlock();
+ mutex_unlock(&cgroup_mutex);
+ return -ENOENT;
+ }
+ rcu_read_unlock();
- cgroup_iter_start(cgrp, &it);
- while ((tsk = cgroup_iter_next(cgrp, &it))) {
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
stats->nr_running++;
@@ -2595,10 +3898,10 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
break;
}
}
- cgroup_iter_end(cgrp, &it);
+ css_task_iter_end(&it);
-err:
- return ret;
+ mutex_unlock(&cgroup_mutex);
+ return 0;
}
@@ -2616,20 +3919,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup_pidlist *l = s->private;
+ struct kernfs_open_file *of = s->private;
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+ struct cgroup_pidlist *l;
+ enum cgroup_filetype type = seq_cft(s)->private;
int index = 0, pid = *pos;
- int *iter;
+ int *iter, ret;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+
+ /*
+ * !NULL @of->priv indicates that this isn't the first start()
+ * after open. If the matching pidlist is around, we can use that.
+ * Look for it. Note that @of->priv can't be used directly. It
+ * could already have been destroyed.
+ */
+ if (of->priv)
+ of->priv = cgroup_pidlist_find(cgrp, type);
+
+ /*
+ * Either this is the first start() after open or the matching
+ * pidlist has been destroyed inbetween. Create a new one.
+ */
+ if (!of->priv) {
+ ret = pidlist_array_load(cgrp, type,
+ (struct cgroup_pidlist **)&of->priv);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ l = of->priv;
- down_read(&l->mutex);
if (pid) {
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
- if (l->list[mid] == pid) {
+ if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
index = mid;
break;
- } else if (l->list[mid] <= pid)
+ } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
index = mid + 1;
else
end = mid;
@@ -2640,19 +3968,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
return NULL;
/* Update the abstract position to be the actual pid that we found */
iter = l->list + index;
- *pos = *iter;
+ *pos = cgroup_pid_fry(cgrp, *iter);
return iter;
}
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
- struct cgroup_pidlist *l = s->private;
- up_read(&l->mutex);
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+
+ if (l)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+ CGROUP_PIDLIST_DESTROY_DELAY);
+ mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup_pidlist *l = s->private;
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
@@ -2663,7 +3997,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
if (p >= end) {
return NULL;
} else {
- *pos = *p;
+ *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
return p;
}
}
@@ -2673,530 +4007,714 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
return seq_printf(s, "%d\n", *(int *)v);
}
-/*
- * seq_operations functions for iterating on pidlists through seq_file -
- * independent of whether it's tasks or procs
- */
-static const struct seq_operations cgroup_pidlist_seq_operations = {
- .start = cgroup_pidlist_start,
- .stop = cgroup_pidlist_stop,
- .next = cgroup_pidlist_next,
- .show = cgroup_pidlist_show,
-};
-
-static void cgroup_release_pid_array(struct cgroup_pidlist *l)
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- /*
- * the case where we're the last user of this particular pidlist will
- * have us remove it from the cgroup's list, which entails taking the
- * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
- * pidlist_mutex, we have to take pidlist_mutex first.
- */
- mutex_lock(&l->owner->pidlist_mutex);
- down_write(&l->mutex);
- BUG_ON(!l->use_count);
- if (!--l->use_count) {
- /* we're the last user if refcount is 0; remove and free */
- list_del(&l->links);
- mutex_unlock(&l->owner->pidlist_mutex);
- pidlist_free(l->list);
- put_pid_ns(l->key.ns);
- up_write(&l->mutex);
- kfree(l);
- return;
- }
- mutex_unlock(&l->owner->pidlist_mutex);
- up_write(&l->mutex);
+ return notify_on_release(css->cgroup);
}
-static int cgroup_pidlist_release(struct inode *inode, struct file *file)
-{
- struct cgroup_pidlist *l;
- if (!(file->f_mode & FMODE_READ))
- return 0;
- /*
- * the seq_file will only be initialized if the file was opened for
- * reading; hence we check if it's not null only in that case.
- */
- l = ((struct seq_file *)file->private_data)->private;
- cgroup_release_pid_array(l);
- return seq_release(inode, file);
-}
-
-static const struct file_operations cgroup_pidlist_operations = {
- .read = seq_read,
- .llseek = seq_lseek,
- .write = cgroup_file_write,
- .release = cgroup_pidlist_release,
-};
-
-/*
- * The following functions handle opens on a file that displays a pidlist
- * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
- * in the cgroup.
- */
-/* helper function for the two below it */
-static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
{
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- struct cgroup_pidlist *l;
- int retval;
-
- /* Nothing to do for write-only files */
- if (!(file->f_mode & FMODE_READ))
- return 0;
-
- /* have the array populated */
- retval = pidlist_array_load(cgrp, type, &l);
- if (retval)
- return retval;
- /* configure file information */
- file->f_op = &cgroup_pidlist_operations;
-
- retval = seq_open(file, &cgroup_pidlist_seq_operations);
- if (retval) {
- cgroup_release_pid_array(l);
- return retval;
- }
- ((struct seq_file *)file->private_data)->private = l;
+ clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
+ if (val)
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
return 0;
}
-static int cgroup_tasks_open(struct inode *unused, struct file *file)
-{
- return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
-}
-static int cgroup_procs_open(struct inode *unused, struct file *file)
-{
- return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
-}
-static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
- struct cftype *cft)
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return notify_on_release(cgrp);
+ return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}
-static int cgroup_write_notify_on_release(struct cgroup *cgrp,
- struct cftype *cft,
- u64 val)
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
{
- clear_bit(CGRP_RELEASABLE, &cgrp->flags);
if (val)
- set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
else
- clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
return 0;
}
-/*
- * for the common functions, 'private' gives the type of file
- */
-/* for hysterical raisins, we can't put this on the older files */
-#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
-static struct cftype files[] = {
+static struct cftype cgroup_base_files[] = {
{
- .name = "tasks",
- .open = cgroup_tasks_open,
- .write_u64 = cgroup_tasks_write,
- .release = cgroup_pidlist_release,
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
.mode = S_IRUGO | S_IWUSR,
},
{
- .name = CGROUP_FILE_GENERIC_PREFIX "procs",
- .open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
- .release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .name = "cgroup.clone_children",
+ .flags = CFTYPE_INSANE,
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
+ {
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_sane_behavior_show,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_root_controllers_show,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_controllers_show,
+ },
+ {
+ .name = "cgroup.subtree_control",
+ .flags = CFTYPE_ONLY_ON_DFL,
+ .seq_show = cgroup_subtree_control_show,
+ .write = cgroup_subtree_control_write,
+ },
+ {
+ .name = "cgroup.populated",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_populated_show,
+ },
+
+ /*
+ * Historical crazy stuff. These don't have "cgroup." prefix and
+ * don't exist if sane_behavior. If you're depending on these, be
+ * prepared to be burned.
+ */
+ {
+ .name = "tasks",
+ .flags = CFTYPE_INSANE, /* use "procs" instead */
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_TASKS,
+ .write = cgroup_tasks_write,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
+ .flags = CFTYPE_INSANE,
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
+ {
+ .name = "release_agent",
+ .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_release_agent_show,
+ .write = cgroup_release_agent_write,
+ .max_write_len = PATH_MAX - 1,
+ },
+ { } /* terminate */
};
-static struct cftype cft_release_agent = {
- .name = "release_agent",
- .read_seq_string = cgroup_release_agent_show,
- .write_string = cgroup_release_agent_write,
- .max_write_len = PATH_MAX,
-};
-
-static int cgroup_populate_dir(struct cgroup *cgrp)
+/**
+ * cgroup_populate_dir - create subsys files in a cgroup directory
+ * @cgrp: target cgroup
+ * @subsys_mask: mask of the subsystem ids whose files should be added
+ *
+ * On failure, no file is added.
+ */
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
{
- int err;
struct cgroup_subsys *ss;
+ int i, ret = 0;
- /* First clear out any existing files */
- cgroup_clear_directory(cgrp->dentry);
+ /* process cftsets of each subsystem */
+ for_each_subsys(ss, i) {
+ struct cftype *cfts;
- err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
- if (err < 0)
- return err;
+ if (!(subsys_mask & (1 << i)))
+ continue;
- if (cgrp == cgrp->top_cgroup) {
- if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
- return err;
+ list_for_each_entry(cfts, &ss->cfts, node) {
+ ret = cgroup_addrm_files(cgrp, cfts, true);
+ if (ret < 0)
+ goto err;
+ }
}
+ return 0;
+err:
+ cgroup_clear_dir(cgrp, subsys_mask);
+ return ret;
+}
- for_each_subsys(cgrp->root, ss) {
- if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
- return err;
+/*
+ * css destruction is four-stage process.
+ *
+ * 1. Destruction starts. Killing of the percpu_ref is initiated.
+ * Implemented in kill_css().
+ *
+ * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
+ * and thus css_tryget_online() is guaranteed to fail, the css can be
+ * offlined by invoking offline_css(). After offlining, the base ref is
+ * put. Implemented in css_killed_work_fn().
+ *
+ * 3. When the percpu_ref reaches zero, the only possible remaining
+ * accessors are inside RCU read sections. css_release() schedules the
+ * RCU callback.
+ *
+ * 4. After the grace period, the css can be freed. Implemented in
+ * css_free_work_fn().
+ *
+ * It is actually hairier because both step 2 and 4 require process context
+ * and thus involve punting to css->destroy_work adding two additional
+ * steps to the already complex sequence.
+ */
+static void css_free_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup *cgrp = css->cgroup;
+
+ if (css->ss) {
+ /* css free path */
+ if (css->parent)
+ css_put(css->parent);
+
+ css->ss->css_free(css);
+ cgroup_put(cgrp);
+ } else {
+ /* cgroup free path */
+ atomic_dec(&cgrp->root->nr_cgrps);
+ cgroup_pidlist_destroy_all(cgrp);
+
+ if (cgroup_parent(cgrp)) {
+ /*
+ * We get a ref to the parent, and put the ref when
+ * this cgroup is being freed, so it's guaranteed
+ * that the parent won't be destroyed before its
+ * children.
+ */
+ cgroup_put(cgroup_parent(cgrp));
+ kernfs_put(cgrp->kn);
+ kfree(cgrp);
+ } else {
+ /*
+ * This is root cgroup's refcnt reaching zero,
+ * which indicates that the root should be
+ * released.
+ */
+ cgroup_destroy_root(cgrp->root);
+ }
}
- /* This cgroup is ready now */
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- /*
- * Update id->css pointer and make this css visible from
- * CSS ID functions. This pointer will be dereferened
- * from RCU-read-side without locks.
- */
- if (css->id)
- rcu_assign_pointer(css->id->css, css);
+}
+
+static void css_free_rcu_fn(struct rcu_head *rcu_head)
+{
+ struct cgroup_subsys_state *css =
+ container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
+
+ INIT_WORK(&css->destroy_work, css_free_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
+}
+
+static void css_release_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys *ss = css->ss;
+ struct cgroup *cgrp = css->cgroup;
+
+ mutex_lock(&cgroup_mutex);
+
+ css->flags |= CSS_RELEASED;
+ list_del_rcu(&css->sibling);
+
+ if (ss) {
+ /* css release path */
+ cgroup_idr_remove(&ss->css_idr, css->id);
+ } else {
+ /* cgroup release path */
+ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+ cgrp->id = -1;
}
- return 0;
+ mutex_unlock(&cgroup_mutex);
+
+ call_rcu(&css->rcu_head, css_free_rcu_fn);
}
-static void init_cgroup_css(struct cgroup_subsys_state *css,
- struct cgroup_subsys *ss,
- struct cgroup *cgrp)
+static void css_release(struct percpu_ref *ref)
{
- css->cgroup = cgrp;
- atomic_set(&css->refcnt, 1);
- css->flags = 0;
- css->id = NULL;
- if (cgrp == dummytop)
- set_bit(CSS_ROOT, &css->flags);
- BUG_ON(cgrp->subsys[ss->subsys_id]);
- cgrp->subsys[ss->subsys_id] = css;
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
+
+ INIT_WORK(&css->destroy_work, css_release_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
}
-static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
+static void init_and_link_css(struct cgroup_subsys_state *css,
+ struct cgroup_subsys *ss, struct cgroup *cgrp)
{
- /* We need to take each hierarchy_mutex in a consistent order */
- int i;
+ lockdep_assert_held(&cgroup_mutex);
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->root == root)
- mutex_lock(&ss->hierarchy_mutex);
+ cgroup_get(cgrp);
+
+ memset(css, 0, sizeof(*css));
+ css->cgroup = cgrp;
+ css->ss = ss;
+ INIT_LIST_HEAD(&css->sibling);
+ INIT_LIST_HEAD(&css->children);
+ css->serial_nr = css_serial_nr_next++;
+
+ if (cgroup_parent(cgrp)) {
+ css->parent = cgroup_css(cgroup_parent(cgrp), ss);
+ css_get(css->parent);
}
+
+ BUG_ON(cgroup_css(cgrp, ss));
}
-static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
+/* invoke ->css_online() on a new CSS and mark it online if successful */
+static int online_css(struct cgroup_subsys_state *css)
{
- int i;
+ struct cgroup_subsys *ss = css->ss;
+ int ret = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->root == root)
- mutex_unlock(&ss->hierarchy_mutex);
+ if (ss->css_online)
+ ret = ss->css_online(css);
+ if (!ret) {
+ css->flags |= CSS_ONLINE;
+ rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
}
+ return ret;
}
-/*
- * cgroup_create - create a cgroup
- * @parent: cgroup that will be parent of the new cgroup
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new inode
+/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
+static void offline_css(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys *ss = css->ss;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!(css->flags & CSS_ONLINE))
+ return;
+
+ if (ss->css_offline)
+ ss->css_offline(css);
+
+ css->flags &= ~CSS_ONLINE;
+ RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
+
+ wake_up_all(&css->cgroup->offline_waitq);
+}
+
+/**
+ * create_css - create a cgroup_subsys_state
+ * @cgrp: the cgroup new css will be associated with
+ * @ss: the subsys of new css
*
- * Must be called with the mutex on the parent inode held
+ * Create a new css associated with @cgrp - @ss pair. On success, the new
+ * css is online and installed in @cgrp with all interface files created.
+ * Returns 0 on success, -errno on failure.
*/
-static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
- mode_t mode)
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
- struct cgroup *cgrp;
- struct cgroupfs_root *root = parent->root;
- int err = 0;
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
+ struct cgroup_subsys_state *css;
+ int err;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ css = ss->css_alloc(parent_css);
+ if (IS_ERR(css))
+ return PTR_ERR(css);
+
+ init_and_link_css(css, ss, cgrp);
+
+ err = percpu_ref_init(&css->refcnt, css_release);
+ if (err)
+ goto err_free_css;
+
+ err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+ if (err < 0)
+ goto err_free_percpu_ref;
+ css->id = err;
+
+ err = cgroup_populate_dir(cgrp, 1 << ss->id);
+ if (err)
+ goto err_free_id;
+
+ /* @css is ready to be brought online now, make it visible */
+ list_add_tail_rcu(&css->sibling, &parent_css->children);
+ cgroup_idr_replace(&ss->css_idr, css, css->id);
+
+ err = online_css(css);
+ if (err)
+ goto err_list_del;
+
+ if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
+ cgroup_parent(parent)) {
+ pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+ current->comm, current->pid, ss->name);
+ if (!strcmp(ss->name, "memory"))
+ pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
+ ss->warned_broken_hierarchy = true;
+ }
+
+ return 0;
+
+err_list_del:
+ list_del_rcu(&css->sibling);
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+err_free_id:
+ cgroup_idr_remove(&ss->css_idr, css->id);
+err_free_percpu_ref:
+ percpu_ref_cancel_init(&css->refcnt);
+err_free_css:
+ call_rcu(&css->rcu_head, css_free_rcu_fn);
+ return err;
+}
+
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+ struct cgroup *parent, *cgrp;
+ struct cgroup_root *root;
struct cgroup_subsys *ss;
- struct super_block *sb = root->sb;
+ struct kernfs_node *kn;
+ int ssid, ret;
+ parent = cgroup_kn_lock_live(parent_kn);
+ if (!parent)
+ return -ENODEV;
+ root = parent->root;
+
+ /* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
- if (!cgrp)
- return -ENOMEM;
+ if (!cgrp) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
- /* Grab a reference on the superblock so the hierarchy doesn't
- * get deleted on unmount if there are child cgroups. This
- * can be done outside cgroup_mutex, since the sb can't
- * disappear while someone has an open control file on the
- * fs */
- atomic_inc(&sb->s_active);
+ ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
+ if (ret)
+ goto out_free_cgrp;
- mutex_lock(&cgroup_mutex);
+ /*
+ * Temporarily set the pointer to NULL, so idr_find() won't return
+ * a half-baked cgroup.
+ */
+ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
+ if (cgrp->id < 0) {
+ ret = -ENOMEM;
+ goto out_cancel_ref;
+ }
init_cgroup_housekeeping(cgrp);
- cgrp->parent = parent;
- cgrp->root = parent->root;
- cgrp->top_cgroup = parent->top_cgroup;
+ cgrp->self.parent = &parent->self;
+ cgrp->root = root;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
- for_each_subsys(root, ss) {
- struct cgroup_subsys_state *css = ss->create(ss, cgrp);
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
- if (IS_ERR(css)) {
- err = PTR_ERR(css);
- goto err_destroy;
- }
- init_cgroup_css(css, ss, cgrp);
- if (ss->use_id) {
- err = alloc_css_id(ss, parent, cgrp);
- if (err)
- goto err_destroy;
- }
- /* At error, ->destroy() callback has to free assigned ID. */
+ /* create the directory */
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ goto out_free_id;
}
+ cgrp->kn = kn;
- cgroup_lock_hierarchy(root);
- list_add(&cgrp->sibling, &cgrp->parent->children);
- cgroup_unlock_hierarchy(root);
- root->number_of_cgroups++;
-
- err = cgroup_create_dir(cgrp, dentry, mode);
- if (err < 0)
- goto err_remove;
-
- /* The cgroup directory was pre-locked for us */
- BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
-
- err = cgroup_populate_dir(cgrp);
- /* If err < 0, we have a half-filled directory - oh well ;) */
+ /*
+ * This extra ref will be put in cgroup_free_fn() and guarantees
+ * that @cgrp->kn is always accessible.
+ */
+ kernfs_get(kn);
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ cgrp->self.serial_nr = css_serial_nr_next++;
- return 0;
+ /* allocation complete, commit to creation */
+ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
+ atomic_inc(&root->nr_cgrps);
+ cgroup_get(parent);
- err_remove:
+ /*
+ * @cgrp is now fully operational. If something fails after this
+ * point, it'll be released via the normal destruction path.
+ */
+ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
- cgroup_lock_hierarchy(root);
- list_del(&cgrp->sibling);
- cgroup_unlock_hierarchy(root);
- root->number_of_cgroups--;
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
- err_destroy:
+ ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
+ if (ret)
+ goto out_destroy;
- for_each_subsys(root, ss) {
- if (cgrp->subsys[ss->subsys_id])
- ss->destroy(ss, cgrp);
+ /* let's create and online css's */
+ for_each_subsys(ss, ssid) {
+ if (parent->child_subsys_mask & (1 << ssid)) {
+ ret = create_css(cgrp, ss);
+ if (ret)
+ goto out_destroy;
+ }
}
- mutex_unlock(&cgroup_mutex);
+ /*
+ * On the default hierarchy, a child doesn't automatically inherit
+ * child_subsys_mask from the parent. Each is configured manually.
+ */
+ if (!cgroup_on_dfl(cgrp))
+ cgrp->child_subsys_mask = parent->child_subsys_mask;
+
+ kernfs_activate(kn);
- /* Release the reference count that we took on the superblock */
- deactivate_super(sb);
+ ret = 0;
+ goto out_unlock;
+out_free_id:
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_cancel_ref:
+ percpu_ref_cancel_init(&cgrp->self.refcnt);
+out_free_cgrp:
kfree(cgrp);
- return err;
+out_unlock:
+ cgroup_kn_unlock(parent_kn);
+ return ret;
+
+out_destroy:
+ cgroup_destroy_locked(cgrp);
+ goto out_unlock;
}
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+/*
+ * This is called when the refcnt of a css is confirmed to be killed.
+ * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
+ * initate destruction and put the css ref from kill_css().
+ */
+static void css_killed_work_fn(struct work_struct *work)
{
- struct cgroup *c_parent = dentry->d_parent->d_fsdata;
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+
+ mutex_lock(&cgroup_mutex);
+ offline_css(css);
+ mutex_unlock(&cgroup_mutex);
- /* the vfs holds inode->i_mutex already */
- return cgroup_create(c_parent, dentry, mode | S_IFDIR);
+ css_put(css);
}
-static int cgroup_has_css_refs(struct cgroup *cgrp)
+/* css kill confirmation processing requires process context, bounce */
+static void css_killed_ref_fn(struct percpu_ref *ref)
{
- /* Check the reference count on each subsystem. Since we
- * already established that there are no tasks in the
- * cgroup, if the css refcount is also 1, then there should
- * be no outstanding references, so the subsystem is safe to
- * destroy. We scan across all subsystems rather than using
- * the per-hierarchy linked list of mounted subsystems since
- * we can be called via check_for_release() with no
- * synchronization other than RCU, and the subsystem linked
- * list isn't RCU-safe */
- int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- struct cgroup_subsys_state *css;
- /* Skip subsystems not in this hierarchy */
- if (ss->root != cgrp->root)
- continue;
- css = cgrp->subsys[ss->subsys_id];
- /* When called from check_for_release() it's possible
- * that by this point the cgroup has been removed
- * and the css deleted. But a false-positive doesn't
- * matter, since it can only happen if the cgroup
- * has been deleted and hence no longer needs the
- * release agent to be called anyway. */
- if (css && (atomic_read(&css->refcnt) > 1))
- return 1;
- }
- return 0;
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
+
+ INIT_WORK(&css->destroy_work, css_killed_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
}
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
+/**
+ * kill_css - destroy a css
+ * @css: css to destroy
+ *
+ * This function initiates destruction of @css by removing cgroup interface
+ * files and putting its base reference. ->css_offline() will be invoked
+ * asynchronously once css_tryget_online() is guaranteed to fail and when
+ * the reference count reaches zero, @css will be released.
*/
-
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
+static void kill_css(struct cgroup_subsys_state *css)
{
- struct cgroup_subsys *ss;
- unsigned long flags;
- bool failed = false;
- local_irq_save(flags);
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- int refcnt;
- while (1) {
- /* We can only remove a CSS with a refcnt==1 */
- refcnt = atomic_read(&css->refcnt);
- if (refcnt > 1) {
- failed = true;
- goto done;
- }
- BUG_ON(!refcnt);
- /*
- * Drop the refcnt to 0 while we check other
- * subsystems. This will cause any racing
- * css_tryget() to spin until we set the
- * CSS_REMOVED bits or abort
- */
- if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
- break;
- cpu_relax();
- }
- }
- done:
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- if (failed) {
- /*
- * Restore old refcnt if we previously managed
- * to clear it from 1 to 0
- */
- if (!atomic_read(&css->refcnt))
- atomic_set(&css->refcnt, 1);
- } else {
- /* Commit the fact that the CSS is removed */
- set_bit(CSS_REMOVED, &css->flags);
- }
- }
- local_irq_restore(flags);
- return !failed;
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * This must happen before css is disassociated with its cgroup.
+ * See seq_css() for details.
+ */
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+
+ /*
+ * Killing would put the base ref, but we need to keep it alive
+ * until after ->css_offline().
+ */
+ css_get(css);
+
+ /*
+ * cgroup core guarantees that, by the time ->css_offline() is
+ * invoked, no new css reference will be given out via
+ * css_tryget_online(). We can't simply call percpu_ref_kill() and
+ * proceed to offlining css's because percpu_ref_kill() doesn't
+ * guarantee that the ref is seen as killed on all CPUs on return.
+ *
+ * Use percpu_ref_kill_and_confirm() to get notifications as each
+ * css is confirmed to be seen as killed on all CPUs.
+ */
+ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts whose killing latency shouldn't be
+ * exposed to userland and are RCU protected. Also, cgroup core needs to
+ * guarantee that css_tryget_online() won't succeed by the time
+ * ->css_offline() is invoked. To satisfy all the requirements,
+ * destruction is implemented in the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
+ * userland visible parts and start killing the percpu refcnts of
+ * css's. Set up so that the next stage will be kicked off once all
+ * the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ * rest of destruction. Once all cgroup references are gone, the
+ * cgroup is RCU-freed.
+ *
+ * This function implements s1. After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created. As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
+static int cgroup_destroy_locked(struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
- struct cgroup *cgrp = dentry->d_fsdata;
- struct dentry *d;
- struct cgroup *parent;
- DEFINE_WAIT(wait);
- int ret;
+ struct cgroup_subsys_state *css;
+ bool empty;
+ int ssid;
- /* the vfs holds both inode->i_mutex already */
-again:
- mutex_lock(&cgroup_mutex);
- if (atomic_read(&cgrp->count) != 0) {
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- if (!list_empty(&cgrp->children)) {
- mutex_unlock(&cgroup_mutex);
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * css_set_rwsem synchronizes access to ->cset_links and prevents
+ * @cgrp from being removed while put_css_set() is in progress.
+ */
+ down_read(&css_set_rwsem);
+ empty = list_empty(&cgrp->cset_links);
+ up_read(&css_set_rwsem);
+ if (!empty)
return -EBUSY;
- }
- mutex_unlock(&cgroup_mutex);
/*
- * In general, subsystem has no css->refcnt after pre_destroy(). But
- * in racy cases, subsystem may have to get css->refcnt after
- * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
- * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
- * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
- * and subsystem's reference count handling. Please see css_get/put
- * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
+ * Make sure there's no live children. We can't test emptiness of
+ * ->self.children as dead children linger on it while being
+ * drained; otherwise, "rmdir parent/child parent" may fail.
*/
- set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ if (css_has_online_children(&cgrp->self))
+ return -EBUSY;
/*
- * Call pre_destroy handlers of subsys. Notify subsystems
- * that rmdir() request comes.
+ * Mark @cgrp dead. This prevents further task migration and child
+ * creation by disabling cgroup_lock_live_group().
*/
- ret = cgroup_call_pre_destroy(cgrp);
- if (ret) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- return ret;
- }
+ cgrp->self.flags &= ~CSS_ONLINE;
- mutex_lock(&cgroup_mutex);
- parent = cgrp->parent;
- if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
- if (!cgroup_clear_css_refs(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- /*
- * Because someone may call cgroup_wakeup_rmdir_waiter() before
- * prepare_to_wait(), we need to check this flag.
- */
- if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
- schedule();
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- if (signal_pending(current))
- return -EINTR;
- goto again;
- }
- /* NO css_tryget() can success after here. */
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ /* initiate massacre of all css's */
+ for_each_css(css, ssid, cgrp)
+ kill_css(css);
- spin_lock(&release_list_lock);
- set_bit(CGRP_REMOVED, &cgrp->flags);
+ /* CSS_ONLINE is clear, remove from ->release_list for the last time */
+ raw_spin_lock(&release_list_lock);
if (!list_empty(&cgrp->release_list))
- list_del(&cgrp->release_list);
- spin_unlock(&release_list_lock);
-
- cgroup_lock_hierarchy(cgrp->root);
- /* delete this cgroup from parent->children */
- list_del(&cgrp->sibling);
- cgroup_unlock_hierarchy(cgrp->root);
+ list_del_init(&cgrp->release_list);
+ raw_spin_unlock(&release_list_lock);
- spin_lock(&cgrp->dentry->d_lock);
- d = dget(cgrp->dentry);
- spin_unlock(&d->d_lock);
+ /*
+ * Remove @cgrp directory along with the base files. @cgrp has an
+ * extra ref on its kn.
+ */
+ kernfs_remove(cgrp->kn);
- cgroup_d_remove_dir(d);
- dput(d);
+ set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
+ check_for_release(cgroup_parent(cgrp));
- set_bit(CGRP_RELEASABLE, &parent->flags);
- check_for_release(parent);
+ /* put the base reference */
+ percpu_ref_kill(&cgrp->self.refcnt);
- mutex_unlock(&cgroup_mutex);
return 0;
+};
+
+static int cgroup_rmdir(struct kernfs_node *kn)
+{
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ cgrp = cgroup_kn_lock_live(kn);
+ if (!cgrp)
+ return 0;
+ cgroup_get(cgrp); /* for @kn->priv clearing */
+
+ ret = cgroup_destroy_locked(cgrp);
+
+ cgroup_kn_unlock(kn);
+
+ /*
+ * There are two control paths which try to determine cgroup from
+ * dentry without going through kernfs - cgroupstats_build() and
+ * css_tryget_online_from_dir(). Those are supported by RCU
+ * protecting clearing of cgrp->kn->priv backpointer, which should
+ * happen after all files under it have been removed.
+ */
+ if (!ret)
+ RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
+
+ cgroup_put(cgrp);
+ return ret;
}
-static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+ .remount_fs = cgroup_remount,
+ .show_options = cgroup_show_options,
+ .mkdir = cgroup_mkdir,
+ .rmdir = cgroup_rmdir,
+ .rename = cgroup_rename,
+};
+
+static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
struct cgroup_subsys_state *css;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
- /* Create the top cgroup state for this subsystem */
- list_add(&ss->sibling, &rootnode.subsys_list);
- ss->root = &rootnode;
- css = ss->create(ss, dummytop);
+ mutex_lock(&cgroup_mutex);
+
+ idr_init(&ss->css_idr);
+ INIT_LIST_HEAD(&ss->cfts);
+
+ /* Create the root cgroup state for this subsystem */
+ ss->root = &cgrp_dfl_root;
+ css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
- init_cgroup_css(css, ss, dummytop);
+ init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
+
+ /*
+ * Root csses are never destroyed and we can't initialize
+ * percpu_ref during early init. Disable refcnting.
+ */
+ css->flags |= CSS_NO_REF;
+
+ if (early) {
+ /* allocation can't be done safely during early init */
+ css->id = 1;
+ } else {
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ }
/* Update the init_css_set to contain a subsys
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
- * init_css_set is in the subsystem's top cgroup. */
- init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
+ * init_css_set is in the subsystem's root cgroup. */
+ init_css_set.subsys[ss->id] = css;
need_forkexit_callback |= ss->fork || ss->exit;
@@ -3205,9 +4723,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
- mutex_init(&ss->hierarchy_mutex);
- lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
- ss->active = 1;
+ BUG_ON(online_css(css));
+
+ mutex_unlock(&cgroup_mutex);
}
/**
@@ -3218,41 +4736,29 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
*/
int __init cgroup_init_early(void)
{
+ static struct cgroup_sb_opts __initdata opts =
+ { .flags = CGRP_ROOT_SANE_BEHAVIOR };
+ struct cgroup_subsys *ss;
int i;
- atomic_set(&init_css_set.refcount, 1);
- INIT_LIST_HEAD(&init_css_set.cg_links);
- INIT_LIST_HEAD(&init_css_set.tasks);
- INIT_HLIST_NODE(&init_css_set.hlist);
- css_set_count = 1;
- init_cgroup_root(&rootnode);
- root_count = 1;
- init_task.cgroups = &init_css_set;
-
- init_css_set_link.cg = &init_css_set;
- init_css_set_link.cgrp = dummytop;
- list_add(&init_css_set_link.cgrp_link_list,
- &rootnode.top_cgroup.css_sets);
- list_add(&init_css_set_link.cg_link_list,
- &init_css_set.cg_links);
-
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
- INIT_HLIST_HEAD(&css_set_table[i]);
-
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
-
- BUG_ON(!ss->name);
- BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
- BUG_ON(!ss->create);
- BUG_ON(!ss->destroy);
- if (ss->subsys_id != i) {
- printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
- ss->name, ss->subsys_id);
- BUG();
- }
+
+ init_cgroup_root(&cgrp_dfl_root, &opts);
+ cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
+
+ RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
+
+ for_each_subsys(ss, i) {
+ WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
+ "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
+ i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
+ ss->id, ss->name);
+ WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
+ "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
+
+ ss->id = i;
+ ss->name = cgroup_subsys_name[i];
if (ss->early_init)
- cgroup_init_subsys(ss);
+ cgroup_init_subsys(ss, true);
}
return 0;
}
@@ -3265,62 +4771,104 @@ int __init cgroup_init_early(void)
*/
int __init cgroup_init(void)
{
- int err;
- int i;
- struct hlist_head *hhead;
+ struct cgroup_subsys *ss;
+ unsigned long key;
+ int ssid, err;
- err = bdi_init(&cgroup_backing_dev_info);
- if (err)
- return err;
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (!ss->early_init)
- cgroup_init_subsys(ss);
- if (ss->use_id)
- cgroup_subsys_init_idr(ss);
- }
+ mutex_lock(&cgroup_mutex);
/* Add init_css_set to the hash table */
- hhead = css_set_hash(init_css_set.subsys);
- hlist_add_head(&init_css_set.hlist, hhead);
- BUG_ON(!init_root_id(&rootnode));
+ key = css_set_hash(init_css_set.subsys);
+ hash_add(css_set_table, &init_css_set.hlist, key);
+
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
+
+ mutex_unlock(&cgroup_mutex);
+
+ for_each_subsys(ss, ssid) {
+ if (ss->early_init) {
+ struct cgroup_subsys_state *css =
+ init_css_set.subsys[ss->id];
+
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
+ GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ } else {
+ cgroup_init_subsys(ss, false);
+ }
+
+ list_add_tail(&init_css_set.e_cset_node[ssid],
+ &cgrp_dfl_root.cgrp.e_csets[ssid]);
+
+ /*
+ * Setting dfl_root subsys_mask needs to consider the
+ * disabled flag and cftype registration needs kmalloc,
+ * both of which aren't available during early_init.
+ */
+ if (!ss->disabled) {
+ cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+ WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
+ }
+ }
+
+ cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
+ if (!cgroup_kobj)
+ return -ENOMEM;
+
err = register_filesystem(&cgroup_fs_type);
- if (err < 0)
- goto out;
+ if (err < 0) {
+ kobject_put(cgroup_kobj);
+ return err;
+ }
proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
+ return 0;
+}
-out:
- if (err)
- bdi_destroy(&cgroup_backing_dev_info);
+static int __init cgroup_wq_init(void)
+{
+ /*
+ * There isn't much point in executing destruction path in
+ * parallel. Good chunk is serialized with cgroup_mutex anyway.
+ * Use 1 for @max_active.
+ *
+ * We would prefer to do this in cgroup_init() above, but that
+ * is called before init_workqueues(): so leave this until after.
+ */
+ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+ BUG_ON(!cgroup_destroy_wq);
- return err;
+ /*
+ * Used to destroy pidlists and separate to serve as flush domain.
+ * Cap @max_active to 1 too.
+ */
+ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+ 0, 1);
+ BUG_ON(!cgroup_pidlist_destroy_wq);
+
+ return 0;
}
+core_initcall(cgroup_wq_init);
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
- * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
- * doesn't really matter if tsk->cgroup changes after we read it,
- * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
- * anyway. No need to check that tsk->cgroup != NULL, thanks to
- * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
- * cgroup to top_cgroup.
*/
/* TODO: Use a proper seq_file iterator */
-static int proc_cgroup_show(struct seq_file *m, void *v)
+int proc_cgroup_show(struct seq_file *m, void *v)
{
struct pid *pid;
struct task_struct *tsk;
- char *buf;
+ char *buf, *path;
int retval;
- struct cgroupfs_root *root;
+ struct cgroup_root *root;
retval = -ENOMEM;
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
@@ -3333,28 +4881,36 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
retval = 0;
mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
- for_each_active_root(root) {
+ for_each_root(root) {
struct cgroup_subsys *ss;
struct cgroup *cgrp;
- int count = 0;
+ int ssid, count = 0;
+
+ if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
+ continue;
seq_printf(m, "%d:", root->hierarchy_id);
- for_each_subsys(root, ss)
- seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
cgrp = task_cgroup_from_root(tsk, root);
- retval = cgroup_path(cgrp, buf, PAGE_SIZE);
- if (retval < 0)
+ path = cgroup_path(cgrp, buf, PATH_MAX);
+ if (!path) {
+ retval = -ENAMETOOLONG;
goto out_unlock;
- seq_puts(m, buf);
+ }
+ seq_puts(m, path);
seq_putc(m, '\n');
}
out_unlock:
+ up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
put_task_struct(tsk);
out_free:
@@ -3363,32 +4919,25 @@ out:
return retval;
}
-static int cgroup_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cgroup_show, pid);
-}
-
-const struct file_operations proc_cgroup_operations = {
- .open = cgroup_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
+ struct cgroup_subsys *ss;
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+ /*
+ * ideally we don't want subsystems moving around while we do this.
+ * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+ * subsys/hierarchy state.
+ */
mutex_lock(&cgroup_mutex);
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
+
+ for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->name, ss->root->hierarchy_id,
- ss->root->number_of_cgroups, !ss->disabled);
- }
+ atomic_read(&ss->root->nr_cgrps), !ss->disabled);
+
mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -3406,74 +4955,83 @@ static const struct file_operations proc_cgroupstats_operations = {
};
/**
- * cgroup_fork - attach newly forked task to its parents cgroup.
+ * cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
*
- * Description: A task inherits its parent's cgroup at fork().
- *
- * A pointer to the shared css_set was automatically copied in
- * fork.c by dup_task_struct(). However, we ignore that copy, since
- * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer. cgroup_attach_task() might
- * have already changed current->cgroups, allowing the previously
- * referenced cgroup group to be removed and freed.
- *
- * At the point that cgroup_fork() is called, 'current' is the parent
- * task, and the passed argument 'child' points to the child task.
+ * A task is associated with the init_css_set until cgroup_post_fork()
+ * attaches it to the parent's css_set. Empty cg_list indicates that
+ * @child isn't holding reference to its css_set.
*/
void cgroup_fork(struct task_struct *child)
{
- task_lock(current);
- child->cgroups = current->cgroups;
- get_css_set(child->cgroups);
- task_unlock(current);
+ RCU_INIT_POINTER(child->cgroups, &init_css_set);
INIT_LIST_HEAD(&child->cg_list);
}
/**
- * cgroup_fork_callbacks - run fork callbacks
- * @child: the new task
- *
- * Called on a new task very soon before adding it to the
- * tasklist. No need to take any locks since no-one can
- * be operating on this task.
- */
-void cgroup_fork_callbacks(struct task_struct *child)
-{
- if (need_forkexit_callback) {
- int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->fork)
- ss->fork(ss, child);
- }
- }
-}
-
-/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
- * Adds the task to the list running through its css_set if necessary.
- * Has to be after the task is visible on the task list in case we race
- * with the first call to cgroup_iter_start() - to guarantee that the
- * new task ends up on its list.
+ * Adds the task to the list running through its css_set if necessary and
+ * call the subsystem fork() callbacks. Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_task_iter_start() - to guarantee that the new task ends up on its
+ * list.
*/
void cgroup_post_fork(struct task_struct *child)
{
+ struct cgroup_subsys *ss;
+ int i;
+
+ /*
+ * This may race against cgroup_enable_task_cg_links(). As that
+ * function sets use_task_css_set_links before grabbing
+ * tasklist_lock and we just went through tasklist_lock to add
+ * @child, it's guaranteed that either we see the set
+ * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
+ * @child during its iteration.
+ *
+ * If we won the race, @child is associated with %current's
+ * css_set. Grabbing css_set_rwsem guarantees both that the
+ * association is stable, and, on completion of the parent's
+ * migration, @child is visible in the source of migration or
+ * already in the destination cgroup. This guarantee is necessary
+ * when implementing operations which need to migrate all tasks of
+ * a cgroup to another.
+ *
+ * Note that if we lose to cgroup_enable_task_cg_links(), @child
+ * will remain in init_css_set. This is safe because all tasks are
+ * in the init_css_set before cg_links is enabled and there's no
+ * operation which transfers all tasks out of init_css_set.
+ */
if (use_task_css_set_links) {
- write_lock(&css_set_lock);
- task_lock(child);
- if (list_empty(&child->cg_list))
- list_add(&child->cg_list, &child->cgroups->tasks);
- task_unlock(child);
- write_unlock(&css_set_lock);
+ struct css_set *cset;
+
+ down_write(&css_set_rwsem);
+ cset = task_css_set(current);
+ if (list_empty(&child->cg_list)) {
+ rcu_assign_pointer(child->cgroups, cset);
+ list_add(&child->cg_list, &cset->tasks);
+ get_css_set(cset);
+ }
+ up_write(&css_set_rwsem);
+ }
+
+ /*
+ * Call ss->fork(). This must happen after @child is linked on
+ * css_set; otherwise, @child might change state between ->fork()
+ * and addition to css_set.
+ */
+ if (need_forkexit_callback) {
+ for_each_subsys(ss, i)
+ if (ss->fork)
+ ss->fork(child);
}
}
+
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
- * @run_callback: run exit callbacks?
*
* Description: Detach cgroup from @tsk and release it.
*
@@ -3483,245 +5041,74 @@ void cgroup_post_fork(struct task_struct *child)
* use notify_on_release cgroups where very high task exit scaling
* is required on large systems.
*
- * the_top_cgroup_hack:
- *
- * Set the exiting tasks cgroup to the root cgroup (top_cgroup).
- *
- * We call cgroup_exit() while the task is still competent to
- * handle notify_on_release(), then leave the task attached to the
- * root cgroup in each hierarchy for the remainder of its exit.
- *
- * To do this properly, we would increment the reference count on
- * top_cgroup, and near the very end of the kernel/exit.c do_exit()
- * code we would add a second cgroup function call, to drop that
- * reference. This would just create an unnecessary hot spot on
- * the top_cgroup reference count, to no avail.
- *
- * Normally, holding a reference to a cgroup without bumping its
- * count is unsafe. The cgroup could go away, or someone could
- * attach us to a different cgroup, decrementing the count on
- * the first cgroup that we never incremented. But in this case,
- * top_cgroup isn't going away, and either task has PF_EXITING set,
- * which wards off any cgroup_attach_task() attempts, or task is a failed
- * fork, never visible to cgroup_attach_task.
+ * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
+ * call cgroup_exit() while the task is still competent to handle
+ * notify_on_release(), then leave the task attached to the root cgroup in
+ * each hierarchy for the remainder of its exit. No need to bother with
+ * init_css_set refcnting. init_css_set never goes away and we can't race
+ * with migration path - PF_EXITING is visible to migration path.
*/
-void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+void cgroup_exit(struct task_struct *tsk)
{
+ struct cgroup_subsys *ss;
+ struct css_set *cset;
+ bool put_cset = false;
int i;
- struct css_set *cg;
-
- if (run_callbacks && need_forkexit_callback) {
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->exit)
- ss->exit(ss, tsk);
- }
- }
/*
- * Unlink from the css_set task list if necessary.
- * Optimistically check cg_list before taking
- * css_set_lock
+ * Unlink from @tsk from its css_set. As migration path can't race
+ * with us, we can check cg_list without grabbing css_set_rwsem.
*/
if (!list_empty(&tsk->cg_list)) {
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list))
- list_del(&tsk->cg_list);
- write_unlock(&css_set_lock);
+ down_write(&css_set_rwsem);
+ list_del_init(&tsk->cg_list);
+ up_write(&css_set_rwsem);
+ put_cset = true;
}
/* Reassign the task to the init_css_set. */
- task_lock(tsk);
- cg = tsk->cgroups;
- tsk->cgroups = &init_css_set;
- task_unlock(tsk);
- if (cg)
- put_css_set_taskexit(cg);
-}
+ cset = task_css_set(tsk);
+ RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
-/**
- * cgroup_clone - clone the cgroup the given subsystem is attached to
- * @tsk: the task to be moved
- * @subsys: the given subsystem
- * @nodename: the name for the new cgroup
- *
- * Duplicate the current cgroup in the hierarchy that the given
- * subsystem is attached to, and move this task into the new
- * child.
- */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
- char *nodename)
-{
- struct dentry *dentry;
- int ret = 0;
- struct cgroup *parent, *child;
- struct inode *inode;
- struct css_set *cg;
- struct cgroupfs_root *root;
- struct cgroup_subsys *ss;
-
- /* We shouldn't be called by an unregistered subsystem */
- BUG_ON(!subsys->active);
-
- /* First figure out what hierarchy and cgroup we're dealing
- * with, and pin them so we can drop cgroup_mutex */
- mutex_lock(&cgroup_mutex);
- again:
- root = subsys->root;
- if (root == &rootnode) {
- mutex_unlock(&cgroup_mutex);
- return 0;
- }
-
- /* Pin the hierarchy */
- if (!atomic_inc_not_zero(&root->sb->s_active)) {
- /* We race with the final deactivate_super() */
- mutex_unlock(&cgroup_mutex);
- return 0;
- }
-
- /* Keep the cgroup alive */
- task_lock(tsk);
- parent = task_cgroup(tsk, subsys->subsys_id);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
-
- mutex_unlock(&cgroup_mutex);
-
- /* Now do the VFS work to create a cgroup */
- inode = parent->dentry->d_inode;
-
- /* Hold the parent directory mutex across this operation to
- * stop anyone else deleting the new cgroup */
- mutex_lock(&inode->i_mutex);
- dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
- if (IS_ERR(dentry)) {
- printk(KERN_INFO
- "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
- PTR_ERR(dentry));
- ret = PTR_ERR(dentry);
- goto out_release;
- }
-
- /* Create the cgroup directory, which also creates the cgroup */
- ret = vfs_mkdir(inode, dentry, 0755);
- child = __d_cgrp(dentry);
- dput(dentry);
- if (ret) {
- printk(KERN_INFO
- "Failed to create cgroup %s: %d\n", nodename,
- ret);
- goto out_release;
- }
-
- /* The cgroup now exists. Retake cgroup_mutex and check
- * that we're still in the same state that we thought we
- * were. */
- mutex_lock(&cgroup_mutex);
- if ((root != subsys->root) ||
- (parent != task_cgroup(tsk, subsys->subsys_id))) {
- /* Aargh, we raced ... */
- mutex_unlock(&inode->i_mutex);
- put_css_set(cg);
-
- deactivate_super(root->sb);
- /* The cgroup is still accessible in the VFS, but
- * we're not going to try to rmdir() it at this
- * point. */
- printk(KERN_INFO
- "Race in cgroup_clone() - leaking cgroup %s\n",
- nodename);
- goto again;
- }
+ if (need_forkexit_callback) {
+ /* see cgroup_post_fork() for details */
+ for_each_subsys(ss, i) {
+ if (ss->exit) {
+ struct cgroup_subsys_state *old_css = cset->subsys[i];
+ struct cgroup_subsys_state *css = task_css(tsk, i);
- /* do any required auto-setup */
- for_each_subsys(root, ss) {
- if (ss->post_clone)
- ss->post_clone(ss, child);
+ ss->exit(css, old_css, tsk);
+ }
+ }
}
- /* All seems fine. Finish by moving the task into the new cgroup */
- ret = cgroup_attach_task(child, tsk);
- mutex_unlock(&cgroup_mutex);
-
- out_release:
- mutex_unlock(&inode->i_mutex);
-
- mutex_lock(&cgroup_mutex);
- put_css_set(cg);
- mutex_unlock(&cgroup_mutex);
- deactivate_super(root->sb);
- return ret;
-}
-
-/**
- * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
- * @cgrp: the cgroup in question
- * @task: the task in question
- *
- * See if @cgrp is a descendant of @task's cgroup in the appropriate
- * hierarchy.
- *
- * If we are sending in dummytop, then presumably we are creating
- * the top cgroup in the subsystem.
- *
- * Called only by the ns (nsproxy) cgroup.
- */
-int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
-{
- int ret;
- struct cgroup *target;
-
- if (cgrp == dummytop)
- return 1;
-
- target = task_cgroup_from_root(task, cgrp->root);
- while (cgrp != target && cgrp!= cgrp->top_cgroup)
- cgrp = cgrp->parent;
- ret = (cgrp == target);
- return ret;
+ if (put_cset)
+ put_css_set(cset, true);
}
static void check_for_release(struct cgroup *cgrp)
{
- /* All of these checks rely on RCU to keep the cgroup
- * structure alive */
- if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
- && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
- /* Control Group is currently removeable. If it's not
+ if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
+ !css_has_online_children(&cgrp->self)) {
+ /*
+ * Control Group is currently removeable. If it's not
* already queued for a userspace notification, queue
- * it now */
+ * it now
+ */
int need_schedule_work = 0;
- spin_lock(&release_list_lock);
- if (!cgroup_is_removed(cgrp) &&
+
+ raw_spin_lock(&release_list_lock);
+ if (!cgroup_is_dead(cgrp) &&
list_empty(&cgrp->release_list)) {
list_add(&cgrp->release_list, &release_list);
need_schedule_work = 1;
}
- spin_unlock(&release_list_lock);
+ raw_spin_unlock(&release_list_lock);
if (need_schedule_work)
schedule_work(&release_agent_work);
}
}
-void __css_put(struct cgroup_subsys_state *css)
-{
- struct cgroup *cgrp = css->cgroup;
- int val;
- rcu_read_lock();
- val = atomic_dec_return(&css->refcnt);
- if (val == 1) {
- if (notify_on_release(cgrp)) {
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
- cgroup_wakeup_rmdir_waiter(cgrp);
- }
- rcu_read_unlock();
- WARN_ON_ONCE(val < 1);
-}
-
/*
* Notify userspace when a cgroup is released, by running the
* configured release agent with the name of the cgroup (path
@@ -3749,20 +5136,21 @@ static void cgroup_release_agent(struct work_struct *work)
{
BUG_ON(work != &release_agent_work);
mutex_lock(&cgroup_mutex);
- spin_lock(&release_list_lock);
+ raw_spin_lock(&release_list_lock);
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
int i;
- char *pathbuf = NULL, *agentbuf = NULL;
+ char *pathbuf = NULL, *agentbuf = NULL, *path;
struct cgroup *cgrp = list_entry(release_list.next,
struct cgroup,
release_list);
list_del_init(&cgrp->release_list);
- spin_unlock(&release_list_lock);
- pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ raw_spin_unlock(&release_list_lock);
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!pathbuf)
goto continue_free;
- if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+ path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ if (!path)
goto continue_free;
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
if (!agentbuf)
@@ -3770,7 +5158,7 @@ static void cgroup_release_agent(struct work_struct *work)
i = 0;
argv[i++] = agentbuf;
- argv[i++] = pathbuf;
+ argv[i++] = path;
argv[i] = NULL;
i = 0;
@@ -3788,24 +5176,23 @@ static void cgroup_release_agent(struct work_struct *work)
continue_free:
kfree(pathbuf);
kfree(agentbuf);
- spin_lock(&release_list_lock);
+ raw_spin_lock(&release_list_lock);
}
- spin_unlock(&release_list_lock);
+ raw_spin_unlock(&release_list_lock);
mutex_unlock(&cgroup_mutex);
}
static int __init cgroup_disable(char *str)
{
- int i;
+ struct cgroup_subsys *ss;
char *token;
+ int i;
while ((token = strsep(&str, ",")) != NULL) {
if (!*token)
continue;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
-
+ for_each_subsys(ss, i) {
if (!strcmp(token, ss->name)) {
ss->disabled = 1;
printk(KERN_INFO "Disabling %s control group"
@@ -3818,237 +5205,62 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
-/*
- * Functons for CSS ID.
- */
-
-/*
- *To get ID other than 0, this should be called when !cgroup_is_removed().
- */
-unsigned short css_id(struct cgroup_subsys_state *css)
-{
- struct css_id *cssid = rcu_dereference(css->id);
-
- if (cssid)
- return cssid->id;
- return 0;
-}
-
-unsigned short css_depth(struct cgroup_subsys_state *css)
-{
- struct css_id *cssid = rcu_dereference(css->id);
-
- if (cssid)
- return cssid->depth;
- return 0;
-}
-
-bool css_is_ancestor(struct cgroup_subsys_state *child,
- const struct cgroup_subsys_state *root)
-{
- struct css_id *child_id = rcu_dereference(child->id);
- struct css_id *root_id = rcu_dereference(root->id);
-
- if (!child_id || !root_id || (child_id->depth < root_id->depth))
- return false;
- return child_id->stack[root_id->depth] == root_id->id;
-}
-
-static void __free_css_id_cb(struct rcu_head *head)
-{
- struct css_id *id;
-
- id = container_of(head, struct css_id, rcu_head);
- kfree(id);
-}
-
-void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
-{
- struct css_id *id = css->id;
- /* When this is called before css_id initialization, id can be NULL */
- if (!id)
- return;
-
- BUG_ON(!ss->use_id);
-
- rcu_assign_pointer(id->css, NULL);
- rcu_assign_pointer(css->id, NULL);
- spin_lock(&ss->id_lock);
- idr_remove(&ss->idr, id->id);
- spin_unlock(&ss->id_lock);
- call_rcu(&id->rcu_head, __free_css_id_cb);
-}
-
-/*
- * This is called by init or create(). Then, calls to this function are
- * always serialized (By cgroup_mutex() at create()).
+/**
+ * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
+ * @dentry: directory dentry of interest
+ * @ss: subsystem of interest
+ *
+ * If @dentry is a directory for a cgroup which has @ss enabled on it, try
+ * to get the corresponding css and return it. If such css doesn't exist
+ * or can't be pinned, an ERR_PTR value is returned.
*/
-
-static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
+struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
+ struct cgroup_subsys *ss)
{
- struct css_id *newid;
- int myid, error, size;
-
- BUG_ON(!ss->use_id);
-
- size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
- newid = kzalloc(size, GFP_KERNEL);
- if (!newid)
- return ERR_PTR(-ENOMEM);
- /* get id */
- if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
- error = -ENOMEM;
- goto err_out;
- }
- spin_lock(&ss->id_lock);
- /* Don't use 0. allocates an ID of 1-65535 */
- error = idr_get_new_above(&ss->idr, newid, 1, &myid);
- spin_unlock(&ss->id_lock);
-
- /* Returns error when there are no free spaces for new ID.*/
- if (error) {
- error = -ENOSPC;
- goto err_out;
- }
- if (myid > CSS_ID_MAX)
- goto remove_idr;
-
- newid->id = myid;
- newid->depth = depth;
- return newid;
-remove_idr:
- error = -ENOSPC;
- spin_lock(&ss->id_lock);
- idr_remove(&ss->idr, myid);
- spin_unlock(&ss->id_lock);
-err_out:
- kfree(newid);
- return ERR_PTR(error);
-
-}
-
-static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
-{
- struct css_id *newid;
- struct cgroup_subsys_state *rootcss;
-
- spin_lock_init(&ss->id_lock);
- idr_init(&ss->idr);
-
- rootcss = init_css_set.subsys[ss->subsys_id];
- newid = get_new_cssid(ss, 0);
- if (IS_ERR(newid))
- return PTR_ERR(newid);
-
- newid->stack[0] = newid->id;
- newid->css = rootcss;
- rootcss->id = newid;
- return 0;
-}
-
-static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
- struct cgroup *child)
-{
- int subsys_id, i, depth = 0;
- struct cgroup_subsys_state *parent_css, *child_css;
- struct css_id *child_id, *parent_id = NULL;
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct cgroup_subsys_state *css = NULL;
+ struct cgroup *cgrp;
- subsys_id = ss->subsys_id;
- parent_css = parent->subsys[subsys_id];
- child_css = child->subsys[subsys_id];
- depth = css_depth(parent_css) + 1;
- parent_id = parent_css->id;
+ /* is @dentry a cgroup dir? */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return ERR_PTR(-EBADF);
- child_id = get_new_cssid(ss, depth);
- if (IS_ERR(child_id))
- return PTR_ERR(child_id);
+ rcu_read_lock();
- for (i = 0; i < depth; i++)
- child_id->stack[i] = parent_id->stack[i];
- child_id->stack[depth] = child_id->id;
/*
- * child_id->css pointer will be set after this cgroup is available
- * see cgroup_populate_dir()
+ * This path doesn't originate from kernfs and @kn could already
+ * have been or be removed at any point. @kn->priv is RCU
+ * protected for this access. See cgroup_rmdir() for details.
*/
- rcu_assign_pointer(child_css->id, child_id);
-
- return 0;
-}
-
-/**
- * css_lookup - lookup css by id
- * @ss: cgroup subsys to be looked into.
- * @id: the id
- *
- * Returns pointer to cgroup_subsys_state if there is valid one with id.
- * NULL if not. Should be called under rcu_read_lock()
- */
-struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
-{
- struct css_id *cssid = NULL;
+ cgrp = rcu_dereference(kn->priv);
+ if (cgrp)
+ css = cgroup_css(cgrp, ss);
- BUG_ON(!ss->use_id);
- cssid = idr_find(&ss->idr, id);
+ if (!css || !css_tryget_online(css))
+ css = ERR_PTR(-ENOENT);
- if (unlikely(!cssid))
- return NULL;
-
- return rcu_dereference(cssid->css);
+ rcu_read_unlock();
+ return css;
}
/**
- * css_get_next - lookup next cgroup under specified hierarchy.
- * @ss: pointer to subsystem
- * @id: current position of iteration.
- * @root: pointer to css. search tree under this.
- * @foundid: position of found object.
+ * css_from_id - lookup css by id
+ * @id: the cgroup id
+ * @ss: cgroup subsys to be looked into
*
- * Search next css under the specified hierarchy of rootid. Calling under
- * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
+ * Returns the css if there's valid one with @id, otherwise returns NULL.
+ * Should be called under rcu_read_lock().
*/
-struct cgroup_subsys_state *
-css_get_next(struct cgroup_subsys *ss, int id,
- struct cgroup_subsys_state *root, int *foundid)
+struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
- struct cgroup_subsys_state *ret = NULL;
- struct css_id *tmp;
- int tmpid;
- int rootid = css_id(root);
- int depth = css_depth(root);
-
- if (!rootid)
- return NULL;
-
- BUG_ON(!ss->use_id);
- /* fill start point for scan */
- tmpid = id;
- while (1) {
- /*
- * scan next entry from bitmap(tree), tmpid is updated after
- * idr_get_next().
- */
- spin_lock(&ss->id_lock);
- tmp = idr_get_next(&ss->idr, &tmpid);
- spin_unlock(&ss->id_lock);
-
- if (!tmp)
- break;
- if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
- ret = rcu_dereference(tmp->css);
- if (ret) {
- *foundid = tmpid;
- break;
- }
- }
- /* continue to scan from next id */
- tmpid = tmpid + 1;
- }
- return ret;
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return idr_find(&ss->css_idr, id);
}
#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
- struct cgroup *cont)
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
@@ -4058,101 +5270,100 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
return css;
}
-static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+static void debug_css_free(struct cgroup_subsys_state *css)
{
- kfree(cont->subsys[debug_subsys_id]);
+ kfree(css);
}
-static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return atomic_read(&cont->count);
+ return cgroup_task_count(css->cgroup);
}
-static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
-{
- return cgroup_task_count(cont);
-}
-
-static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
return (u64)(unsigned long)current->cgroups;
}
-static u64 current_css_set_refcount_read(struct cgroup *cont,
- struct cftype *cft)
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
u64 count;
rcu_read_lock();
- count = atomic_read(&current->cgroups->refcount);
+ count = atomic_read(&task_css_set(current)->refcount);
rcu_read_unlock();
return count;
}
-static int current_css_set_cg_links_read(struct cgroup *cont,
- struct cftype *cft,
- struct seq_file *seq)
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
- struct cg_cgroup_link *link;
- struct css_set *cg;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+ char *name_buf;
+
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
- read_lock(&css_set_lock);
+ down_read(&css_set_rwsem);
rcu_read_lock();
- cg = rcu_dereference(current->cgroups);
- list_for_each_entry(link, &cg->cg_links, cg_link_list) {
+ cset = rcu_dereference(current->cgroups);
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
- const char *name;
- if (c->dentry)
- name = c->dentry->d_name.name;
- else
- name = "?";
+ cgroup_name(c, name_buf, NAME_MAX + 1);
seq_printf(seq, "Root %d group %s\n",
- c->root->hierarchy_id, name);
+ c->root->hierarchy_id, name_buf);
}
rcu_read_unlock();
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
+ kfree(name_buf);
return 0;
}
#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct cgroup *cont,
- struct cftype *cft,
- struct seq_file *seq)
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
- struct cg_cgroup_link *link;
+ struct cgroup_subsys_state *css = seq_css(seq);
+ struct cgrp_cset_link *link;
- read_lock(&css_set_lock);
- list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
- struct css_set *cg = link->cg;
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+ struct css_set *cset = link->cset;
struct task_struct *task;
int count = 0;
- seq_printf(seq, "css_set %p\n", cg);
- list_for_each_entry(task, &cg->tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
- seq_puts(seq, " ...\n");
- break;
- } else {
- seq_printf(seq, " task %d\n",
- task_pid_vnr(task));
- }
+
+ seq_printf(seq, "css_set %p\n", cset);
+
+ list_for_each_entry(task, &cset->tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
}
+
+ list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+ continue;
+ overflow:
+ seq_puts(seq, " ...\n");
}
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
return 0;
}
-static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return test_bit(CGRP_RELEASABLE, &cgrp->flags);
+ return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
}
static struct cftype debug_files[] = {
{
- .name = "cgroup_refcount",
- .read_u64 = cgroup_refcount_read,
- },
- {
.name = "taskcount",
.read_u64 = debug_taskcount_read,
},
@@ -4169,31 +5380,25 @@ static struct cftype debug_files[] = {
{
.name = "current_css_set_cg_links",
- .read_seq_string = current_css_set_cg_links_read,
+ .seq_show = current_css_set_cg_links_read,
},
{
.name = "cgroup_css_links",
- .read_seq_string = cgroup_css_links_read,
+ .seq_show = cgroup_css_links_read,
},
{
.name = "releasable",
.read_u64 = releasable_read,
},
-};
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- return cgroup_add_files(cont, ss, debug_files,
- ARRAY_SIZE(debug_files));
-}
+ { } /* terminate */
+};
-struct cgroup_subsys debug_subsys = {
- .name = "debug",
- .create = debug_create,
- .destroy = debug_destroy,
- .populate = debug_populate,
- .subsys_id = debug_subsys_id,
+struct cgroup_subsys debug_cgrp_subsys = {
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
+ .base_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 59e9ef6aab4..a79e40f9d70 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -14,119 +14,76 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*/
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/slab.h>
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
+#include <linux/mutex.h>
-enum freezer_state {
- CGROUP_THAWED = 0,
- CGROUP_FREEZING,
- CGROUP_FROZEN,
+/*
+ * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
+ * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
+ * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
+ * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
+ * its ancestors has FREEZING_SELF set.
+ */
+enum freezer_state_flags {
+ CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
+ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
+ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
+ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
+
+ /* mask for all FREEZING flags */
+ CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
};
struct freezer {
- struct cgroup_subsys_state css;
- enum freezer_state state;
- spinlock_t lock; /* protects _writes_ to state */
+ struct cgroup_subsys_state css;
+ unsigned int state;
};
-static inline struct freezer *cgroup_freezer(
- struct cgroup *cgroup)
+static DEFINE_MUTEX(freezer_mutex);
+
+static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
{
- return container_of(
- cgroup_subsys_state(cgroup, freezer_subsys_id),
- struct freezer, css);
+ return css ? container_of(css, struct freezer, css) : NULL;
}
static inline struct freezer *task_freezer(struct task_struct *task)
{
- return container_of(task_subsys_state(task, freezer_subsys_id),
- struct freezer, css);
+ return css_freezer(task_css(task, freezer_cgrp_id));
}
-int cgroup_frozen(struct task_struct *task)
+static struct freezer *parent_freezer(struct freezer *freezer)
{
- struct freezer *freezer;
- enum freezer_state state;
+ return css_freezer(freezer->css.parent);
+}
- task_lock(task);
- freezer = task_freezer(task);
- state = freezer->state;
- task_unlock(task);
+bool cgroup_freezing(struct task_struct *task)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = task_freezer(task)->state & CGROUP_FREEZING;
+ rcu_read_unlock();
- return state == CGROUP_FROZEN;
+ return ret;
}
-/*
- * cgroups_write_string() limits the size of freezer state strings to
- * CGROUP_LOCAL_BUFFER_SIZE
- */
-static const char *freezer_state_strs[] = {
- "THAWED",
- "FREEZING",
- "FROZEN",
+static const char *freezer_state_strs(unsigned int state)
+{
+ if (state & CGROUP_FROZEN)
+ return "FROZEN";
+ if (state & CGROUP_FREEZING)
+ return "FREEZING";
+ return "THAWED";
};
-/*
- * State diagram
- * Transitions are caused by userspace writes to the freezer.state file.
- * The values in parenthesis are state labels. The rest are edge labels.
- *
- * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
- * ^ ^ | |
- * | \_______THAWED_______/ |
- * \__________________________THAWED____________/
- */
-
-struct cgroup_subsys freezer_subsys;
-
-/* Locks taken and their ordering
- * ------------------------------
- * css_set_lock
- * cgroup_mutex (AKA cgroup_lock)
- * task->alloc_lock (AKA task_lock)
- * freezer->lock
- * task->sighand->siglock
- *
- * cgroup code forces css_set_lock to be taken before task->alloc_lock
- *
- * freezer_create(), freezer_destroy():
- * cgroup_mutex [ by cgroup core ]
- *
- * can_attach():
- * cgroup_mutex
- *
- * cgroup_frozen():
- * task->alloc_lock (to get task's cgroup)
- *
- * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
- * task->alloc_lock (to get task's cgroup)
- * freezer->lock
- * sighand->siglock (if the cgroup is freezing)
- *
- * freezer_read():
- * cgroup_mutex
- * freezer->lock
- * read_lock css_set_lock (cgroup iterator start)
- *
- * freezer_write() (freeze):
- * cgroup_mutex
- * freezer->lock
- * read_lock css_set_lock (cgroup iterator start)
- * sighand->siglock
- *
- * freezer_write() (unfreeze):
- * cgroup_mutex
- * freezer->lock
- * read_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock (to prevent races with freeze_task())
- * sighand->siglock
- */
-static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
+static struct cgroup_subsys_state *
+freezer_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct freezer *freezer;
@@ -134,259 +91,394 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
if (!freezer)
return ERR_PTR(-ENOMEM);
- spin_lock_init(&freezer->lock);
- freezer->state = CGROUP_THAWED;
return &freezer->css;
}
-static void freezer_destroy(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
+/**
+ * freezer_css_online - commit creation of a freezer css
+ * @css: css being created
+ *
+ * We're committing to creation of @css. Mark it online and inherit
+ * parent's freezing state while holding both parent's and our
+ * freezer->lock.
+ */
+static int freezer_css_online(struct cgroup_subsys_state *css)
{
- kfree(cgroup_freezer(cgroup));
+ struct freezer *freezer = css_freezer(css);
+ struct freezer *parent = parent_freezer(freezer);
+
+ mutex_lock(&freezer_mutex);
+
+ freezer->state |= CGROUP_FREEZER_ONLINE;
+
+ if (parent && (parent->state & CGROUP_FREEZING)) {
+ freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
+ atomic_inc(&system_freezing_cnt);
+ }
+
+ mutex_unlock(&freezer_mutex);
+ return 0;
+}
+
+/**
+ * freezer_css_offline - initiate destruction of a freezer css
+ * @css: css being destroyed
+ *
+ * @css is going away. Mark it dead and decrement system_freezing_count if
+ * it was holding one.
+ */
+static void freezer_css_offline(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ mutex_lock(&freezer_mutex);
+
+ if (freezer->state & CGROUP_FREEZING)
+ atomic_dec(&system_freezing_cnt);
+
+ freezer->state = 0;
+
+ mutex_unlock(&freezer_mutex);
}
-/* Task is frozen or will freeze immediately when next it gets woken */
-static bool is_task_frozen_enough(struct task_struct *task)
+static void freezer_css_free(struct cgroup_subsys_state *css)
{
- return frozen(task) ||
- (task_is_stopped_or_traced(task) && freezing(task));
+ kfree(css_freezer(css));
}
/*
- * The call to cgroup_lock() in the freezer.state write method prevents
- * a write to that file racing against an attach, and hence the
- * can_attach() result will remain valid until the attach completes.
+ * Tasks can be migrated into a different freezer anytime regardless of its
+ * current state. freezer_attach() is responsible for making new tasks
+ * conform to the current state.
+ *
+ * Freezer state changes and task migration are synchronized via
+ * @freezer->lock. freezer_attach() makes the new tasks conform to the
+ * current state and all following state changes can see the new tasks.
*/
-static int freezer_can_attach(struct cgroup_subsys *ss,
- struct cgroup *new_cgroup,
- struct task_struct *task, bool threadgroup)
+static void freezer_attach(struct cgroup_subsys_state *new_css,
+ struct cgroup_taskset *tset)
{
- struct freezer *freezer;
+ struct freezer *freezer = css_freezer(new_css);
+ struct task_struct *task;
+ bool clear_frozen = false;
+
+ mutex_lock(&freezer_mutex);
/*
- * Anything frozen can't move or be moved to/from.
+ * Make the new tasks conform to the current state of @new_css.
+ * For simplicity, when migrating any task to a FROZEN cgroup, we
+ * revert it to FREEZING and let update_if_frozen() determine the
+ * correct state later.
*
- * Since orig_freezer->state == FROZEN means that @task has been
- * frozen, so it's sufficient to check the latter condition.
+ * Tasks in @tset are on @new_css but may not conform to its
+ * current state before executing the following - !frozen tasks may
+ * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
*/
-
- if (is_task_frozen_enough(task))
- return -EBUSY;
-
- freezer = cgroup_freezer(new_cgroup);
- if (freezer->state == CGROUP_FROZEN)
- return -EBUSY;
-
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (is_task_frozen_enough(c)) {
- rcu_read_unlock();
- return -EBUSY;
- }
+ cgroup_taskset_for_each(task, tset) {
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ __thaw_task(task);
+ } else {
+ freeze_task(task);
+ freezer->state &= ~CGROUP_FROZEN;
+ clear_frozen = true;
}
- rcu_read_unlock();
}
- return 0;
+ /* propagate FROZEN clearing upwards */
+ while (clear_frozen && (freezer = parent_freezer(freezer))) {
+ freezer->state &= ~CGROUP_FROZEN;
+ clear_frozen = freezer->state & CGROUP_FREEZING;
+ }
+
+ mutex_unlock(&freezer_mutex);
}
-static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
+/**
+ * freezer_fork - cgroup post fork callback
+ * @task: a task which has just been forked
+ *
+ * @task has just been created and should conform to the current state of
+ * the cgroup_freezer it belongs to. This function may race against
+ * freezer_attach(). Losing to freezer_attach() means that we don't have
+ * to do anything as freezer_attach() will put @task into the appropriate
+ * state.
+ */
+static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
/*
- * No lock is needed, since the task isn't on tasklist yet,
- * so it can't be moved to another cgroup, which means the
- * freezer won't be removed and will be valid during this
- * function call.
+ * The root cgroup is non-freezable, so we can skip locking the
+ * freezer. This is safe regardless of race with task migration.
+ * If we didn't race or won, skipping is obviously the right thing
+ * to do. If we lost and root is the new cgroup, noop is still the
+ * right thing to do.
*/
- freezer = task_freezer(task);
-
- /*
- * The root cgroup is non-freezable, so we can skip the
- * following check.
- */
- if (!freezer->css.cgroup->parent)
+ if (task_css_is_root(task, freezer_cgrp_id))
return;
- spin_lock_irq(&freezer->lock);
- BUG_ON(freezer->state == CGROUP_FROZEN);
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ freezer = task_freezer(task);
+ if (freezer->state & CGROUP_FREEZING)
+ freeze_task(task);
- /* Locking avoids race with FREEZING -> THAWED transitions. */
- if (freezer->state == CGROUP_FREEZING)
- freeze_task(task, true);
- spin_unlock_irq(&freezer->lock);
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
-/*
- * caller must hold freezer->lock
+/**
+ * update_if_frozen - update whether a cgroup finished freezing
+ * @css: css of interest
+ *
+ * Once FREEZING is initiated, transition to FROZEN is lazily updated by
+ * calling this function. If the current state is FREEZING but not FROZEN,
+ * this function checks whether all tasks of this cgroup and the descendant
+ * cgroups finished freezing and, if so, sets FROZEN.
+ *
+ * The caller is responsible for grabbing RCU read lock and calling
+ * update_if_frozen() on all descendants prior to invoking this function.
+ *
+ * Task states and freezer state might disagree while tasks are being
+ * migrated into or out of @css, so we can't verify task states against
+ * @freezer state here. See freezer_attach() for details.
*/
-static void update_freezer_state(struct cgroup *cgroup,
- struct freezer *freezer)
+static void update_if_frozen(struct cgroup_subsys_state *css)
{
- struct cgroup_iter it;
+ struct freezer *freezer = css_freezer(css);
+ struct cgroup_subsys_state *pos;
+ struct css_task_iter it;
struct task_struct *task;
- unsigned int nfrozen = 0, ntotal = 0;
- cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- ntotal++;
- if (is_task_frozen_enough(task))
- nfrozen++;
+ lockdep_assert_held(&freezer_mutex);
+
+ if (!(freezer->state & CGROUP_FREEZING) ||
+ (freezer->state & CGROUP_FROZEN))
+ return;
+
+ /* are all (live) children frozen? */
+ rcu_read_lock();
+ css_for_each_child(pos, css) {
+ struct freezer *child = css_freezer(pos);
+
+ if ((child->state & CGROUP_FREEZER_ONLINE) &&
+ !(child->state & CGROUP_FROZEN)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
+ /* are all tasks frozen? */
+ css_task_iter_start(css, &it);
+
+ while ((task = css_task_iter_next(&it))) {
+ if (freezing(task)) {
+ /*
+ * freezer_should_skip() indicates that the task
+ * should be skipped when determining freezing
+ * completion. Consider it frozen in addition to
+ * the usual frozen condition.
+ */
+ if (!frozen(task) && !freezer_should_skip(task))
+ goto out_iter_end;
+ }
}
- /*
- * Transition to FROZEN when no new tasks can be added ensures
- * that we never exist in the FROZEN state while there are unfrozen
- * tasks.
- */
- if (nfrozen == ntotal)
- freezer->state = CGROUP_FROZEN;
- else if (nfrozen > 0)
- freezer->state = CGROUP_FREEZING;
- else
- freezer->state = CGROUP_THAWED;
- cgroup_iter_end(cgroup, &it);
+ freezer->state |= CGROUP_FROZEN;
+out_iter_end:
+ css_task_iter_end(&it);
}
-static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
- struct seq_file *m)
+static int freezer_read(struct seq_file *m, void *v)
{
- struct freezer *freezer;
- enum freezer_state state;
-
- if (!cgroup_lock_live_group(cgroup))
- return -ENODEV;
-
- freezer = cgroup_freezer(cgroup);
- spin_lock_irq(&freezer->lock);
- state = freezer->state;
- if (state == CGROUP_FREEZING) {
- /* We change from FREEZING to FROZEN lazily if the cgroup was
- * only partially frozen when we exitted write. */
- update_freezer_state(cgroup, freezer);
- state = freezer->state;
+ struct cgroup_subsys_state *css = seq_css(m), *pos;
+
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ /* update states bottom-up */
+ css_for_each_descendant_post(pos, css) {
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
+ update_if_frozen(pos);
+
+ rcu_read_lock();
+ css_put(pos);
}
- spin_unlock_irq(&freezer->lock);
- cgroup_unlock();
- seq_puts(m, freezer_state_strs[state]);
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
+
+ seq_puts(m, freezer_state_strs(css_freezer(css)->state));
seq_putc(m, '\n');
return 0;
}
-static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void freeze_cgroup(struct freezer *freezer)
{
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *task;
- unsigned int num_cant_freeze_now = 0;
- freezer->state = CGROUP_FREEZING;
- cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- if (!freeze_task(task, true))
- continue;
- if (is_task_frozen_enough(task))
- continue;
- if (!freezing(task) && !freezer_should_skip(task))
- num_cant_freeze_now++;
- }
- cgroup_iter_end(cgroup, &it);
-
- return num_cant_freeze_now ? -EBUSY : 0;
+ css_task_iter_start(&freezer->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ freeze_task(task);
+ css_task_iter_end(&it);
}
-static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void unfreeze_cgroup(struct freezer *freezer)
{
- struct cgroup_iter it;
+ struct css_task_iter it;
struct task_struct *task;
- cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- thaw_process(task);
- }
- cgroup_iter_end(cgroup, &it);
+ css_task_iter_start(&freezer->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ __thaw_task(task);
+ css_task_iter_end(&it);
+}
+
+/**
+ * freezer_apply_state - apply state change to a single cgroup_freezer
+ * @freezer: freezer to apply state change to
+ * @freeze: whether to freeze or unfreeze
+ * @state: CGROUP_FREEZING_* flag to set or clear
+ *
+ * Set or clear @state on @cgroup according to @freeze, and perform
+ * freezing or thawing as necessary.
+ */
+static void freezer_apply_state(struct freezer *freezer, bool freeze,
+ unsigned int state)
+{
+ /* also synchronizes against task migration, see freezer_attach() */
+ lockdep_assert_held(&freezer_mutex);
+
+ if (!(freezer->state & CGROUP_FREEZER_ONLINE))
+ return;
- freezer->state = CGROUP_THAWED;
+ if (freeze) {
+ if (!(freezer->state & CGROUP_FREEZING))
+ atomic_inc(&system_freezing_cnt);
+ freezer->state |= state;
+ freeze_cgroup(freezer);
+ } else {
+ bool was_freezing = freezer->state & CGROUP_FREEZING;
+
+ freezer->state &= ~state;
+
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ if (was_freezing)
+ atomic_dec(&system_freezing_cnt);
+ freezer->state &= ~CGROUP_FROZEN;
+ unfreeze_cgroup(freezer);
+ }
+ }
}
-static int freezer_change_state(struct cgroup *cgroup,
- enum freezer_state goal_state)
+/**
+ * freezer_change_state - change the freezing state of a cgroup_freezer
+ * @freezer: freezer of interest
+ * @freeze: whether to freeze or thaw
+ *
+ * Freeze or thaw @freezer according to @freeze. The operations are
+ * recursive - all descendants of @freezer will be affected.
+ */
+static void freezer_change_state(struct freezer *freezer, bool freeze)
{
- struct freezer *freezer;
- int retval = 0;
+ struct cgroup_subsys_state *pos;
- freezer = cgroup_freezer(cgroup);
+ /*
+ * Update all its descendants in pre-order traversal. Each
+ * descendant will try to inherit its parent's FREEZING state as
+ * CGROUP_FREEZING_PARENT.
+ */
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+ css_for_each_descendant_pre(pos, &freezer->css) {
+ struct freezer *pos_f = css_freezer(pos);
+ struct freezer *parent = parent_freezer(pos_f);
- spin_lock_irq(&freezer->lock);
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
- update_freezer_state(cgroup, freezer);
- if (goal_state == freezer->state)
- goto out;
+ if (pos_f == freezer)
+ freezer_apply_state(pos_f, freeze,
+ CGROUP_FREEZING_SELF);
+ else
+ freezer_apply_state(pos_f,
+ parent->state & CGROUP_FREEZING,
+ CGROUP_FREEZING_PARENT);
- switch (goal_state) {
- case CGROUP_THAWED:
- unfreeze_cgroup(cgroup, freezer);
- break;
- case CGROUP_FROZEN:
- retval = try_to_freeze_cgroup(cgroup, freezer);
- break;
- default:
- BUG();
+ rcu_read_lock();
+ css_put(pos);
}
-out:
- spin_unlock_irq(&freezer->lock);
-
- return retval;
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
-static int freezer_write(struct cgroup *cgroup,
- struct cftype *cft,
- const char *buffer)
+static ssize_t freezer_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- int retval;
- enum freezer_state goal_state;
+ bool freeze;
+
+ buf = strstrip(buf);
- if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
- goal_state = CGROUP_THAWED;
- else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
- goal_state = CGROUP_FROZEN;
+ if (strcmp(buf, freezer_state_strs(0)) == 0)
+ freeze = false;
+ else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ freeze = true;
else
return -EINVAL;
- if (!cgroup_lock_live_group(cgroup))
- return -ENODEV;
- retval = freezer_change_state(cgroup, goal_state);
- cgroup_unlock();
- return retval;
+ freezer_change_state(css_freezer(of_css(of)), freeze);
+ return nbytes;
+}
+
+static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_SELF);
+}
+
+static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
}
static struct cftype files[] = {
{
.name = "state",
- .read_seq_string = freezer_read,
- .write_string = freezer_write,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = freezer_read,
+ .write = freezer_write,
},
+ {
+ .name = "self_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_self_freezing_read,
+ },
+ {
+ .name = "parent_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_parent_freezing_read,
+ },
+ { } /* terminate */
};
-static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-{
- if (!cgroup->parent)
- return 0;
- return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-}
-
-struct cgroup_subsys freezer_subsys = {
- .name = "freezer",
- .create = freezer_create,
- .destroy = freezer_destroy,
- .populate = freezer_populate,
- .subsys_id = freezer_subsys_id,
- .can_attach = freezer_can_attach,
- .attach = NULL,
+struct cgroup_subsys freezer_cgrp_subsys = {
+ .css_alloc = freezer_css_alloc,
+ .css_online = freezer_css_online,
+ .css_offline = freezer_css_offline,
+ .css_free = freezer_css_free,
+ .attach = freezer_attach,
.fork = freezer_fork,
- .exit = NULL,
+ .base_cftypes = files,
};
diff --git a/kernel/compat.c b/kernel/compat.c
index f6c204f07ea..633394f442f 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,43 +21,80 @@
#include <linux/unistd.h>
#include <linux/security.h>
#include <linux/timex.h>
+#include <linux/export.h>
#include <linux/migrate.h>
#include <linux/posix-timers.h>
#include <linux/times.h>
#include <linux/ptrace.h>
+#include <linux/gfp.h>
#include <asm/uaccess.h>
-/*
- * Note that the native side is already converted to a timespec, because
- * that's what we want anyway.
- */
-static int compat_get_timeval(struct timespec *o,
- struct compat_timeval __user *i)
+static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
{
- long usec;
+ memset(txc, 0, sizeof(struct timex));
- if (get_user(o->tv_sec, &i->tv_sec) ||
- get_user(usec, &i->tv_usec))
+ if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
+ __get_user(txc->modes, &utp->modes) ||
+ __get_user(txc->offset, &utp->offset) ||
+ __get_user(txc->freq, &utp->freq) ||
+ __get_user(txc->maxerror, &utp->maxerror) ||
+ __get_user(txc->esterror, &utp->esterror) ||
+ __get_user(txc->status, &utp->status) ||
+ __get_user(txc->constant, &utp->constant) ||
+ __get_user(txc->precision, &utp->precision) ||
+ __get_user(txc->tolerance, &utp->tolerance) ||
+ __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+ __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+ __get_user(txc->tick, &utp->tick) ||
+ __get_user(txc->ppsfreq, &utp->ppsfreq) ||
+ __get_user(txc->jitter, &utp->jitter) ||
+ __get_user(txc->shift, &utp->shift) ||
+ __get_user(txc->stabil, &utp->stabil) ||
+ __get_user(txc->jitcnt, &utp->jitcnt) ||
+ __get_user(txc->calcnt, &utp->calcnt) ||
+ __get_user(txc->errcnt, &utp->errcnt) ||
+ __get_user(txc->stbcnt, &utp->stbcnt))
return -EFAULT;
- o->tv_nsec = usec * 1000;
+
return 0;
}
-static int compat_put_timeval(struct compat_timeval __user *o,
- struct timeval *i)
+static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
{
- return (put_user(i->tv_sec, &o->tv_sec) ||
- put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
+ if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
+ __put_user(txc->modes, &utp->modes) ||
+ __put_user(txc->offset, &utp->offset) ||
+ __put_user(txc->freq, &utp->freq) ||
+ __put_user(txc->maxerror, &utp->maxerror) ||
+ __put_user(txc->esterror, &utp->esterror) ||
+ __put_user(txc->status, &utp->status) ||
+ __put_user(txc->constant, &utp->constant) ||
+ __put_user(txc->precision, &utp->precision) ||
+ __put_user(txc->tolerance, &utp->tolerance) ||
+ __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+ __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+ __put_user(txc->tick, &utp->tick) ||
+ __put_user(txc->ppsfreq, &utp->ppsfreq) ||
+ __put_user(txc->jitter, &utp->jitter) ||
+ __put_user(txc->shift, &utp->shift) ||
+ __put_user(txc->stabil, &utp->stabil) ||
+ __put_user(txc->jitcnt, &utp->jitcnt) ||
+ __put_user(txc->calcnt, &utp->calcnt) ||
+ __put_user(txc->errcnt, &utp->errcnt) ||
+ __put_user(txc->stbcnt, &utp->stbcnt) ||
+ __put_user(txc->tai, &utp->tai))
+ return -EFAULT;
+ return 0;
}
-asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
+ struct timezone __user *, tz)
{
if (tv) {
struct timeval ktv;
do_gettimeofday(&ktv);
- if (compat_put_timeval(tv, &ktv))
+ if (compat_put_timeval(&ktv, tv))
return -EFAULT;
}
if (tz) {
@@ -68,38 +105,114 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
return 0;
}
-asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
+ struct timezone __user *, tz)
{
- struct timespec kts;
- struct timezone ktz;
+ struct timeval user_tv;
+ struct timespec new_ts;
+ struct timezone new_tz;
if (tv) {
- if (compat_get_timeval(&kts, tv))
+ if (compat_get_timeval(&user_tv, tv))
return -EFAULT;
+ new_ts.tv_sec = user_tv.tv_sec;
+ new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
}
if (tz) {
- if (copy_from_user(&ktz, tz, sizeof(ktz)))
+ if (copy_from_user(&new_tz, tz, sizeof(*tz)))
return -EFAULT;
}
- return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+ return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
-int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
+static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
+{
+ return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
+ __get_user(tv->tv_sec, &ctv->tv_sec) ||
+ __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+
+static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
+{
+ return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
+ __put_user(tv->tv_sec, &ctv->tv_sec) ||
+ __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+
+static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
__get_user(ts->tv_sec, &cts->tv_sec) ||
__get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
-int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
+static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
__put_user(ts->tv_sec, &cts->tv_sec) ||
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
+int compat_get_timeval(struct timeval *tv, const void __user *utv)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
+ else
+ return __compat_get_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_get_timeval);
+
+int compat_put_timeval(const struct timeval *tv, void __user *utv)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
+ else
+ return __compat_put_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_put_timeval);
+
+int compat_get_timespec(struct timespec *ts, const void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_get_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec);
+
+int compat_put_timespec(const struct timespec *ts, void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_put_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec);
+
+int compat_convert_timespec(struct timespec __user **kts,
+ const void __user *cts)
+{
+ struct timespec ts;
+ struct timespec __user *uts;
+
+ if (!cts || COMPAT_USE_64BIT_TIME) {
+ *kts = (struct timespec __user *)cts;
+ return 0;
+ }
+
+ uts = compat_alloc_user_space(sizeof(ts));
+ if (!uts)
+ return -EFAULT;
+ if (compat_get_timespec(&ts, cts))
+ return -EFAULT;
+ if (copy_to_user(uts, &ts, sizeof(ts)))
+ return -EFAULT;
+
+ *kts = uts;
+ return 0;
+}
+
static long compat_nanosleep_restart(struct restart_block *restart)
{
struct compat_timespec __user *rmtp;
@@ -116,21 +229,21 @@ static long compat_nanosleep_restart(struct restart_block *restart)
if (ret) {
rmtp = restart->nanosleep.compat_rmtp;
- if (rmtp && put_compat_timespec(&rmt, rmtp))
+ if (rmtp && compat_put_timespec(&rmt, rmtp))
return -EFAULT;
}
return ret;
}
-asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
- struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
+ struct compat_timespec __user *, rmtp)
{
struct timespec tu, rmt;
mm_segment_t oldfs;
long ret;
- if (get_compat_timespec(&tu, rqtp))
+ if (compat_get_timespec(&tu, rqtp))
return -EFAULT;
if (!timespec_valid(&tu))
@@ -150,7 +263,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
restart->fn = compat_nanosleep_restart;
restart->nanosleep.compat_rmtp = rmtp;
- if (rmtp && put_compat_timespec(&rmt, rmtp))
+ if (rmtp && compat_put_timespec(&rmt, rmtp))
return -EFAULT;
}
@@ -177,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
__put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
}
-asmlinkage long compat_sys_getitimer(int which,
- struct compat_itimerval __user *it)
+COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
+ struct compat_itimerval __user *, it)
{
struct itimerval kit;
int error;
@@ -189,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,
return error;
}
-asmlinkage long compat_sys_setitimer(int which,
- struct compat_itimerval __user *in,
- struct compat_itimerval __user *out)
+COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
+ struct compat_itimerval __user *, in,
+ struct compat_itimerval __user *, out)
{
struct itimerval kin, kout;
int error;
@@ -215,7 +328,7 @@ static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
}
-asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
+COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
{
if (tbuf) {
struct tms tms;
@@ -234,12 +347,14 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
return compat_jiffies_to_clock_t(jiffies);
}
+#ifdef __ARCH_WANT_SYS_SIGPENDING
+
/*
* Assumption: old_sigset_t and compat_old_sigset_t are both
* types that can be passed to put_user()/get_user().
*/
-asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
+COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
{
old_sigset_t s;
long ret;
@@ -253,36 +368,66 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
return ret;
}
-asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
- compat_old_sigset_t __user *oset)
+#endif
+
+#ifdef __ARCH_WANT_SYS_SIGPROCMASK
+
+/*
+ * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
+ * blocked set of signals to the supplied signal set
+ */
+static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
+{
+ memcpy(blocked->sig, &set, sizeof(set));
+}
+
+COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
+ compat_old_sigset_t __user *, nset,
+ compat_old_sigset_t __user *, oset)
{
- old_sigset_t s;
- long ret;
- mm_segment_t old_fs;
+ old_sigset_t old_set, new_set;
+ sigset_t new_blocked;
- if (set && get_user(s, set))
- return -EFAULT;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_sigprocmask(how,
- set ? (old_sigset_t __user *) &s : NULL,
- oset ? (old_sigset_t __user *) &s : NULL);
- set_fs(old_fs);
- if (ret == 0)
- if (oset)
- ret = put_user(s, oset);
- return ret;
+ old_set = current->blocked.sig[0];
+
+ if (nset) {
+ if (get_user(new_set, nset))
+ return -EFAULT;
+ new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+ new_blocked = current->blocked;
+
+ switch (how) {
+ case SIG_BLOCK:
+ sigaddsetmask(&new_blocked, new_set);
+ break;
+ case SIG_UNBLOCK:
+ sigdelsetmask(&new_blocked, new_set);
+ break;
+ case SIG_SETMASK:
+ compat_sig_setmask(&new_blocked, new_set);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ set_current_blocked(&new_blocked);
+ }
+
+ if (oset) {
+ if (put_user(old_set, oset))
+ return -EFAULT;
+ }
+
+ return 0;
}
-asmlinkage long compat_sys_setrlimit(unsigned int resource,
- struct compat_rlimit __user *rlim)
+#endif
+
+COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
- int ret;
- mm_segment_t old_fs = get_fs ();
-
- if (resource >= RLIM_NLIMITS)
- return -EINVAL;
if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
__get_user(r.rlim_cur, &rlim->rlim_cur) ||
@@ -293,23 +438,20 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
r.rlim_cur = RLIM_INFINITY;
if (r.rlim_max == COMPAT_RLIM_INFINITY)
r.rlim_max = RLIM_INFINITY;
- set_fs(KERNEL_DS);
- ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
- set_fs(old_fs);
- return ret;
+ return do_prlimit(current, resource, &r, NULL);
}
#ifdef COMPAT_RLIM_OLD_INFINITY
-asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
- struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
int ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- ret = sys_old_getrlimit(resource, &r);
+ ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
set_fs(old_fs);
if (!ret) {
@@ -328,16 +470,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
#endif
-asmlinkage long compat_sys_getrlimit (unsigned int resource,
- struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
int ret;
- mm_segment_t old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
- set_fs(old_fs);
+ ret = do_prlimit(current, resource, NULL, &r);
if (!ret) {
if (r.rlim_cur > COMPAT_RLIM_INFINITY)
r.rlim_cur = COMPAT_RLIM_INFINITY;
@@ -377,28 +516,11 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
return 0;
}
-asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
-{
- struct rusage r;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_getrusage(who, (struct rusage __user *) &r);
- set_fs(old_fs);
-
- if (ret)
- return ret;
-
- if (put_compat_rusage(&r, ru))
- return -EFAULT;
-
- return 0;
-}
-
-asmlinkage long
-compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
- struct compat_rusage __user *ru)
+COMPAT_SYSCALL_DEFINE4(wait4,
+ compat_pid_t, pid,
+ compat_uint_t __user *, stat_addr,
+ int, options,
+ struct compat_rusage __user *, ru)
{
if (!ru) {
return sys_wait4(pid, stat_addr, options, NULL);
@@ -425,9 +547,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
}
}
-asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
- struct compat_siginfo __user *uinfo, int options,
- struct compat_rusage __user *uru)
+COMPAT_SYSCALL_DEFINE5(waitid,
+ int, which, compat_pid_t, pid,
+ struct compat_siginfo __user *, uinfo, int, options,
+ struct compat_rusage __user *, uru)
{
siginfo_t info;
struct rusage ru;
@@ -445,9 +568,13 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
return ret;
if (uru) {
- ret = put_compat_rusage(&ru, uru);
+ /* sys_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ ret = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ ret = put_compat_rusage(&ru, uru);
if (ret)
- return ret;
+ return -EFAULT;
}
BUG_ON(info.si_code & __SI_MASK);
@@ -469,9 +596,9 @@ static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
return compat_get_bitmap(k, user_mask_ptr, len * 8);
}
-asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
- unsigned int len,
- compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid,
+ unsigned int, len,
+ compat_ulong_t __user *, user_mask_ptr)
{
cpumask_var_t new_mask;
int retval;
@@ -489,42 +616,39 @@ out:
return retval;
}
-asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
- compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
+ compat_ulong_t __user *, user_mask_ptr)
{
int ret;
cpumask_var_t mask;
- unsigned long *k;
- unsigned int min_length = cpumask_size();
-
- if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
- min_length = sizeof(compat_ulong_t);
- if (len < min_length)
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(compat_ulong_t)-1))
return -EINVAL;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
- if (ret < 0)
- goto out;
+ if (ret == 0) {
+ size_t retlen = min_t(size_t, len, cpumask_size());
- k = cpumask_bits(mask);
- ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
- if (ret == 0)
- ret = min_length;
-
-out:
+ if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
+ ret = -EFAULT;
+ else
+ ret = retlen;
+ }
free_cpumask_var(mask);
+
return ret;
}
int get_compat_itimerspec(struct itimerspec *dst,
const struct compat_itimerspec __user *src)
{
- if (get_compat_timespec(&dst->it_interval, &src->it_interval) ||
- get_compat_timespec(&dst->it_value, &src->it_value))
+ if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
+ __compat_get_timespec(&dst->it_value, &src->it_value))
return -EFAULT;
return 0;
}
@@ -532,15 +656,15 @@ int get_compat_itimerspec(struct itimerspec *dst,
int put_compat_itimerspec(struct compat_itimerspec __user *dst,
const struct itimerspec *src)
{
- if (put_compat_timespec(&src->it_interval, &dst->it_interval) ||
- put_compat_timespec(&src->it_value, &dst->it_value))
+ if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
+ __compat_put_timespec(&src->it_value, &dst->it_value))
return -EFAULT;
return 0;
}
-long compat_sys_timer_create(clockid_t which_clock,
- struct compat_sigevent __user *timer_event_spec,
- timer_t __user *created_timer_id)
+COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
+ struct compat_sigevent __user *, timer_event_spec,
+ timer_t __user *, created_timer_id)
{
struct sigevent __user *event = NULL;
@@ -556,9 +680,9 @@ long compat_sys_timer_create(clockid_t which_clock,
return sys_timer_create(which_clock, event, created_timer_id);
}
-long compat_sys_timer_settime(timer_t timer_id, int flags,
- struct compat_itimerspec __user *new,
- struct compat_itimerspec __user *old)
+COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
+ struct compat_itimerspec __user *, new,
+ struct compat_itimerspec __user *, old)
{
long err;
mm_segment_t oldfs;
@@ -579,8 +703,8 @@ long compat_sys_timer_settime(timer_t timer_id, int flags,
return err;
}
-long compat_sys_timer_gettime(timer_t timer_id,
- struct compat_itimerspec __user *setting)
+COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
+ struct compat_itimerspec __user *, setting)
{
long err;
mm_segment_t oldfs;
@@ -596,14 +720,14 @@ long compat_sys_timer_gettime(timer_t timer_id,
return err;
}
-long compat_sys_clock_settime(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
struct timespec ts;
- if (get_compat_timespec(&ts, tp))
+ if (compat_get_timespec(&ts, tp))
return -EFAULT;
oldfs = get_fs();
set_fs(KERNEL_DS);
@@ -613,8 +737,8 @@ long compat_sys_clock_settime(clockid_t which_clock,
return err;
}
-long compat_sys_clock_gettime(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
@@ -625,13 +749,36 @@ long compat_sys_clock_gettime(clockid_t which_clock,
err = sys_clock_gettime(which_clock,
(struct timespec __user *) &ts);
set_fs(oldfs);
- if (!err && put_compat_timespec(&ts, tp))
+ if (!err && compat_put_timespec(&ts, tp))
return -EFAULT;
return err;
}
-long compat_sys_clock_getres(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
+ struct compat_timex __user *, utp)
+{
+ struct timex txc;
+ mm_segment_t oldfs;
+ int err, ret;
+
+ err = compat_get_timex(&txc, utp);
+ if (err)
+ return err;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
+ set_fs(oldfs);
+
+ err = compat_put_timex(utp, &txc);
+ if (err)
+ return err;
+
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
@@ -642,7 +789,7 @@ long compat_sys_clock_getres(clockid_t which_clock,
err = sys_clock_getres(which_clock,
(struct timespec __user *) &ts);
set_fs(oldfs);
- if (!err && tp && put_compat_timespec(&ts, tp))
+ if (!err && tp && compat_put_timespec(&ts, tp))
return -EFAULT;
return err;
}
@@ -652,7 +799,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
long err;
mm_segment_t oldfs;
struct timespec tu;
- struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp;
+ struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp;
restart->nanosleep.rmtp = (struct timespec __user *) &tu;
oldfs = get_fs();
@@ -661,7 +808,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
set_fs(oldfs);
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
- put_compat_timespec(&tu, rmtp))
+ compat_put_timespec(&tu, rmtp))
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
@@ -671,16 +818,16 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
return err;
}
-long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
- struct compat_timespec __user *rqtp,
- struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
+ struct compat_timespec __user *, rqtp,
+ struct compat_timespec __user *, rmtp)
{
long err;
mm_segment_t oldfs;
struct timespec in, out;
struct restart_block *restart;
- if (get_compat_timespec(&in, rqtp))
+ if (compat_get_timespec(&in, rqtp))
return -EFAULT;
oldfs = get_fs();
@@ -691,7 +838,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
set_fs(oldfs);
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
- put_compat_timespec(&out, rmtp))
+ compat_put_timespec(&out, rmtp))
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
@@ -805,7 +952,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
}
void
-sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
+sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
{
switch (_NSIG_WORDS) {
case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
@@ -814,18 +961,28 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
}
}
+EXPORT_SYMBOL_GPL(sigset_from_compat);
-asmlinkage long
-compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
- struct compat_siginfo __user *uinfo,
- struct compat_timespec __user *uts, compat_size_t sigsetsize)
+void
+sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+{
+ switch (_NSIG_WORDS) {
+ case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
+ case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
+ case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
+ case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+ }
+}
+
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+ struct compat_siginfo __user *, uinfo,
+ struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
{
compat_sigset_t s32;
sigset_t s;
- int sig;
struct timespec t;
siginfo_t info;
- long ret, timeout = 0;
+ long ret;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
@@ -833,71 +990,27 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
return -EFAULT;
sigset_from_compat(&s, &s32);
- sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
- signotset(&s);
if (uts) {
- if (get_compat_timespec (&t, uts))
+ if (compat_get_timespec(&t, uts))
return -EFAULT;
- if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
- || t.tv_sec < 0)
- return -EINVAL;
}
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &s, &info);
- if (!sig) {
- timeout = MAX_SCHEDULE_TIMEOUT;
- if (uts)
- timeout = timespec_to_jiffies(&t)
- +(t.tv_sec || t.tv_nsec);
- if (timeout) {
- current->real_blocked = current->blocked;
- sigandsets(&current->blocked, &current->blocked, &s);
-
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- timeout = schedule_timeout_interruptible(timeout);
-
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &s, &info);
- current->blocked = current->real_blocked;
- siginitset(&current->real_blocked, 0);
- recalc_sigpending();
- }
- }
- spin_unlock_irq(&current->sighand->siglock);
+ ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
- if (sig) {
- ret = sig;
- if (uinfo) {
- if (copy_siginfo_to_user32(uinfo, &info))
- ret = -EFAULT;
- }
- }else {
- ret = timeout?-EINTR:-EAGAIN;
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user32(uinfo, &info))
+ ret = -EFAULT;
}
- return ret;
-
-}
-asmlinkage long
-compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
- struct compat_siginfo __user *uinfo)
-{
- siginfo_t info;
-
- if (copy_siginfo_from_user32(&info, uinfo))
- return -EFAULT;
- return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+ return ret;
}
#ifdef __ARCH_WANT_COMPAT_SYS_TIME
/* compat_time_t is a 32 bit "long" and needs to get converted. */
-asmlinkage long compat_sys_time(compat_time_t __user * tloc)
+COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
{
compat_time_t i;
struct timeval tv;
@@ -913,7 +1026,7 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
return i;
}
-asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
+COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
{
struct timespec tv;
int err;
@@ -933,99 +1046,30 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
-#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
-asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
-{
- sigset_t newset;
- compat_sigset_t newset32;
-
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
-
- if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
- return -EFAULT;
- sigset_from_compat(&newset, &newset32);
- sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
-
- spin_lock_irq(&current->sighand->siglock);
- current->saved_sigmask = current->blocked;
- current->blocked = newset;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- set_restore_sigmask();
- return -ERESTARTNOHAND;
-}
-#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
-
-asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
+COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
{
struct timex txc;
- int ret;
-
- memset(&txc, 0, sizeof(struct timex));
+ int err, ret;
- if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
- __get_user(txc.modes, &utp->modes) ||
- __get_user(txc.offset, &utp->offset) ||
- __get_user(txc.freq, &utp->freq) ||
- __get_user(txc.maxerror, &utp->maxerror) ||
- __get_user(txc.esterror, &utp->esterror) ||
- __get_user(txc.status, &utp->status) ||
- __get_user(txc.constant, &utp->constant) ||
- __get_user(txc.precision, &utp->precision) ||
- __get_user(txc.tolerance, &utp->tolerance) ||
- __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
- __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
- __get_user(txc.tick, &utp->tick) ||
- __get_user(txc.ppsfreq, &utp->ppsfreq) ||
- __get_user(txc.jitter, &utp->jitter) ||
- __get_user(txc.shift, &utp->shift) ||
- __get_user(txc.stabil, &utp->stabil) ||
- __get_user(txc.jitcnt, &utp->jitcnt) ||
- __get_user(txc.calcnt, &utp->calcnt) ||
- __get_user(txc.errcnt, &utp->errcnt) ||
- __get_user(txc.stbcnt, &utp->stbcnt))
- return -EFAULT;
+ err = compat_get_timex(&txc, utp);
+ if (err)
+ return err;
ret = do_adjtimex(&txc);
- if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
- __put_user(txc.modes, &utp->modes) ||
- __put_user(txc.offset, &utp->offset) ||
- __put_user(txc.freq, &utp->freq) ||
- __put_user(txc.maxerror, &utp->maxerror) ||
- __put_user(txc.esterror, &utp->esterror) ||
- __put_user(txc.status, &utp->status) ||
- __put_user(txc.constant, &utp->constant) ||
- __put_user(txc.precision, &utp->precision) ||
- __put_user(txc.tolerance, &utp->tolerance) ||
- __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
- __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
- __put_user(txc.tick, &utp->tick) ||
- __put_user(txc.ppsfreq, &utp->ppsfreq) ||
- __put_user(txc.jitter, &utp->jitter) ||
- __put_user(txc.shift, &utp->shift) ||
- __put_user(txc.stabil, &utp->stabil) ||
- __put_user(txc.jitcnt, &utp->jitcnt) ||
- __put_user(txc.calcnt, &utp->calcnt) ||
- __put_user(txc.errcnt, &utp->errcnt) ||
- __put_user(txc.stbcnt, &utp->stbcnt) ||
- __put_user(txc.tai, &utp->tai))
- ret = -EFAULT;
+ err = compat_put_timex(utp, &txc);
+ if (err)
+ return err;
return ret;
}
#ifdef CONFIG_NUMA
-asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
- compat_uptr_t __user *pages32,
- const int __user *nodes,
- int __user *status,
- int flags)
+COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
+ compat_uptr_t __user *, pages32,
+ const int __user *, nodes,
+ int __user *, status,
+ int, flags)
{
const void __user * __user *pages;
int i;
@@ -1041,10 +1085,10 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
}
-asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
- compat_ulong_t maxnode,
- const compat_ulong_t __user *old_nodes,
- const compat_ulong_t __user *new_nodes)
+COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
+ compat_ulong_t, maxnode,
+ const compat_ulong_t __user *, old_nodes,
+ const compat_ulong_t __user *, new_nodes)
{
unsigned long __user *old = NULL;
unsigned long __user *new = NULL;
@@ -1075,67 +1119,39 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
}
#endif
-struct compat_sysinfo {
- s32 uptime;
- u32 loads[3];
- u32 totalram;
- u32 freeram;
- u32 sharedram;
- u32 bufferram;
- u32 totalswap;
- u32 freeswap;
- u16 procs;
- u16 pad;
- u32 totalhigh;
- u32 freehigh;
- u32 mem_unit;
- char _f[20-2*sizeof(u32)-sizeof(int)];
-};
-
-asmlinkage long
-compat_sys_sysinfo(struct compat_sysinfo __user *info)
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+ compat_pid_t, pid,
+ struct compat_timespec __user *, interval)
{
- struct sysinfo s;
+ struct timespec t;
+ int ret;
+ mm_segment_t old_fs = get_fs();
- do_sysinfo(&s);
+ set_fs(KERNEL_DS);
+ ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
+ set_fs(old_fs);
+ if (compat_put_timespec(&t, interval))
+ return -EFAULT;
+ return ret;
+}
- /* Check to see if any memory value is too large for 32-bit and scale
- * down if needed
- */
- if ((s.totalram >> 32) || (s.totalswap >> 32)) {
- int bitcount = 0;
+/*
+ * Allocate user-space memory for the duration of a single system call,
+ * in order to marshall parameters inside a compat thunk.
+ */
+void __user *compat_alloc_user_space(unsigned long len)
+{
+ void __user *ptr;
- while (s.mem_unit < PAGE_SIZE) {
- s.mem_unit <<= 1;
- bitcount++;
- }
+ /* If len would occupy more than half of the entire compat space... */
+ if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
+ return NULL;
- s.totalram >>= bitcount;
- s.freeram >>= bitcount;
- s.sharedram >>= bitcount;
- s.bufferram >>= bitcount;
- s.totalswap >>= bitcount;
- s.freeswap >>= bitcount;
- s.totalhigh >>= bitcount;
- s.freehigh >>= bitcount;
- }
+ ptr = arch_compat_alloc_user_space(len);
- if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
- __put_user (s.uptime, &info->uptime) ||
- __put_user (s.loads[0], &info->loads[0]) ||
- __put_user (s.loads[1], &info->loads[1]) ||
- __put_user (s.loads[2], &info->loads[2]) ||
- __put_user (s.totalram, &info->totalram) ||
- __put_user (s.freeram, &info->freeram) ||
- __put_user (s.sharedram, &info->sharedram) ||
- __put_user (s.bufferram, &info->bufferram) ||
- __put_user (s.totalswap, &info->totalswap) ||
- __put_user (s.freeswap, &info->freeswap) ||
- __put_user (s.procs, &info->procs) ||
- __put_user (s.totalhigh, &info->totalhigh) ||
- __put_user (s.freehigh, &info->freehigh) ||
- __put_user (s.mem_unit, &info->mem_unit))
- return -EFAULT;
+ if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))
+ return NULL;
- return 0;
+ return ptr;
}
+EXPORT_SYMBOL_GPL(compat_alloc_user_space);
diff --git a/kernel/configs.c b/kernel/configs.c
index abaee684ecb..c18b1f1ae51 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
static const struct file_operations ikconfig_file_ops = {
.owner = THIS_MODULE,
.read = ikconfig_read_current,
+ .llseek = default_llseek,
};
static int __init ikconfig_init(void)
@@ -78,7 +79,7 @@ static int __init ikconfig_init(void)
if (!entry)
return -ENOMEM;
- entry->size = kernel_config_data_size;
+ proc_set_size(entry, kernel_config_data_size);
return 0;
}
@@ -91,8 +92,8 @@ static void __exit ikconfig_cleanup(void)
module_init(ikconfig_init);
module_exit(ikconfig_cleanup);
+#endif /* CONFIG_IKCONFIG_PROC */
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Randy Dunlap");
MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
-
-#endif /* CONFIG_IKCONFIG_PROC */
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
new file mode 100644
index 00000000000..5664985c46a
--- /dev/null
+++ b/kernel/context_tracking.c
@@ -0,0 +1,216 @@
+/*
+ * Context tracking: Probe on high level context boundaries such as kernel
+ * and userspace. This includes syscalls and exceptions entry/exit.
+ *
+ * This is used by RCU to remove its dependency on the timer tick while a CPU
+ * runs in userspace.
+ *
+ * Started by Frederic Weisbecker:
+ *
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
+ * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
+ *
+ */
+
+#include <linux/context_tracking.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/export.h>
+#include <linux/kprobes.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/context_tracking.h>
+
+struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(context_tracking_enabled);
+
+DEFINE_PER_CPU(struct context_tracking, context_tracking);
+EXPORT_SYMBOL_GPL(context_tracking);
+
+void context_tracking_cpu_set(int cpu)
+{
+ if (!per_cpu(context_tracking.active, cpu)) {
+ per_cpu(context_tracking.active, cpu) = true;
+ static_key_slow_inc(&context_tracking_enabled);
+ }
+}
+
+/**
+ * context_tracking_user_enter - Inform the context tracking that the CPU is going to
+ * enter userspace mode.
+ *
+ * This function must be called right before we switch from the kernel
+ * to userspace, when it's guaranteed the remaining kernel instructions
+ * to execute won't use any RCU read side critical section because this
+ * function sets RCU in extended quiescent state.
+ */
+void context_tracking_user_enter(void)
+{
+ unsigned long flags;
+
+ /*
+ * Repeat the user_enter() check here because some archs may be calling
+ * this from asm and if no CPU needs context tracking, they shouldn't
+ * go further. Repeat the check here until they support the inline static
+ * key check.
+ */
+ if (!context_tracking_is_enabled())
+ return;
+
+ /*
+ * Some contexts may involve an exception occuring in an irq,
+ * leading to that nesting:
+ * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+ * This would mess up the dyntick_nesting count though. And rcu_irq_*()
+ * helpers are enough to protect RCU uses inside the exception. So
+ * just return immediately if we detect we are in an IRQ.
+ */
+ if (in_interrupt())
+ return;
+
+ /* Kernel threads aren't supposed to go to userspace */
+ WARN_ON_ONCE(!current->mm);
+
+ local_irq_save(flags);
+ if ( __this_cpu_read(context_tracking.state) != IN_USER) {
+ if (__this_cpu_read(context_tracking.active)) {
+ trace_user_enter(0);
+ /*
+ * At this stage, only low level arch entry code remains and
+ * then we'll run in userspace. We can assume there won't be
+ * any RCU read-side critical section until the next call to
+ * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * on the tick.
+ */
+ vtime_user_enter(current);
+ rcu_user_enter();
+ }
+ /*
+ * Even if context tracking is disabled on this CPU, because it's outside
+ * the full dynticks mask for example, we still have to keep track of the
+ * context transitions and states to prevent inconsistency on those of
+ * other CPUs.
+ * If a task triggers an exception in userspace, sleep on the exception
+ * handler and then migrate to another CPU, that new CPU must know where
+ * the exception returns by the time we call exception_exit().
+ * This information can only be provided by the previous CPU when it called
+ * exception_enter().
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+ * is false because we know that CPU is not tickless.
+ */
+ __this_cpu_write(context_tracking.state, IN_USER);
+ }
+ local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(context_tracking_user_enter);
+
+#ifdef CONFIG_PREEMPT
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
+ /*
+ * Need to disable preemption in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ preempt_disable_notrace();
+ prev_ctx = exception_enter();
+ preempt_enable_no_resched_notrace();
+
+ preempt_schedule();
+
+ preempt_disable_notrace();
+ exception_exit(prev_ctx);
+ preempt_enable_notrace();
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_PREEMPT */
+
+/**
+ * context_tracking_user_exit - Inform the context tracking that the CPU is
+ * exiting userspace mode and entering the kernel.
+ *
+ * This function must be called after we entered the kernel from userspace
+ * before any use of RCU read side critical section. This potentially include
+ * any high level kernel code like syscalls, exceptions, signal handling, etc...
+ *
+ * This call supports re-entrancy. This way it can be called from any exception
+ * handler without needing to know if we came from userspace or not.
+ */
+void context_tracking_user_exit(void)
+{
+ unsigned long flags;
+
+ if (!context_tracking_is_enabled())
+ return;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+ if (__this_cpu_read(context_tracking.state) == IN_USER) {
+ if (__this_cpu_read(context_tracking.active)) {
+ /*
+ * We are going to run code that may use RCU. Inform
+ * RCU core about that (ie: we may need the tick again).
+ */
+ rcu_user_exit();
+ vtime_user_exit(current);
+ trace_user_exit(0);
+ }
+ __this_cpu_write(context_tracking.state, IN_KERNEL);
+ }
+ local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(context_tracking_user_exit);
+
+/**
+ * __context_tracking_task_switch - context switch the syscall callbacks
+ * @prev: the task that is being switched out
+ * @next: the task that is being switched in
+ *
+ * The context tracking uses the syscall slow path to implement its user-kernel
+ * boundaries probes on syscalls. This way it doesn't impact the syscall fast
+ * path on CPUs that don't do context tracking.
+ *
+ * But we need to clear the flag on the previous task because it may later
+ * migrate to some CPU that doesn't do the context tracking. As such the TIF
+ * flag may not be desired there.
+ */
+void __context_tracking_task_switch(struct task_struct *prev,
+ struct task_struct *next)
+{
+ clear_tsk_thread_flag(prev, TIF_NOHZ);
+ set_tsk_thread_flag(next, TIF_NOHZ);
+}
+
+#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+void __init context_tracking_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ context_tracking_cpu_set(cpu);
+}
+#endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 677f25376a3..a343bde710b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,22 +10,52 @@
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
-#include <linux/module.h>
+#include <linux/oom.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
+#include <linux/gfp.h>
+#include <linux/suspend.h>
+#include <linux/lockdep.h>
+#include <trace/events/power.h>
+
+#include "smpboot.h"
#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
-static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
+/*
+ * The following two APIs (cpu_maps_update_begin/done) must be used when
+ * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
+ * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
+ * hotplug callback (un)registration performed using __register_cpu_notifier()
+ * or __unregister_cpu_notifier().
+ */
+void cpu_maps_update_begin(void)
+{
+ mutex_lock(&cpu_add_remove_lock);
+}
+EXPORT_SYMBOL(cpu_notifier_register_begin);
+
+void cpu_maps_update_done(void)
+{
+ mutex_unlock(&cpu_add_remove_lock);
+}
+EXPORT_SYMBOL(cpu_notifier_register_done);
+
+static RAW_NOTIFIER_HEAD(cpu_chain);
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
*/
static int cpu_hotplug_disabled;
+#ifdef CONFIG_HOTPLUG_CPU
+
static struct {
struct task_struct *active_writer;
struct mutex lock; /* Synchronizes accesses to refcount, */
@@ -34,19 +64,30 @@ static struct {
* an ongoing cpu hotplug operation.
*/
int refcount;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
} cpu_hotplug = {
.active_writer = NULL,
.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
.refcount = 0,
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ .dep_map = {.name = "cpu_hotplug.lock" },
+#endif
};
-#ifdef CONFIG_HOTPLUG_CPU
+/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
+#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
+#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
+#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
void get_online_cpus(void)
{
might_sleep();
if (cpu_hotplug.active_writer == current)
return;
+ cpuhp_lock_acquire_read();
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
@@ -59,29 +100,18 @@ void put_online_cpus(void)
if (cpu_hotplug.active_writer == current)
return;
mutex_lock(&cpu_hotplug.lock);
+
+ if (WARN_ON(!cpu_hotplug.refcount))
+ cpu_hotplug.refcount++; /* try to fix things up */
+
if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
wake_up_process(cpu_hotplug.active_writer);
mutex_unlock(&cpu_hotplug.lock);
+ cpuhp_lock_release();
}
EXPORT_SYMBOL_GPL(put_online_cpus);
-#endif /* CONFIG_HOTPLUG_CPU */
-
-/*
- * The following two API's must be used when attempting
- * to serialize the updates to cpu_online_mask, cpu_present_mask.
- */
-void cpu_maps_update_begin(void)
-{
- mutex_lock(&cpu_add_remove_lock);
-}
-
-void cpu_maps_update_done(void)
-{
- mutex_unlock(&cpu_add_remove_lock);
-}
-
/*
* This ensures that the hotplug operation can begin only when the
* refcount goes to zero.
@@ -104,10 +134,11 @@ void cpu_maps_update_done(void)
* get_online_cpus() not an api which is called all that often.
*
*/
-static void cpu_hotplug_begin(void)
+void cpu_hotplug_begin(void)
{
cpu_hotplug.active_writer = current;
+ cpuhp_lock_acquire();
for (;;) {
mutex_lock(&cpu_hotplug.lock);
if (likely(!cpu_hotplug.refcount))
@@ -118,11 +149,36 @@ static void cpu_hotplug_begin(void)
}
}
-static void cpu_hotplug_done(void)
+void cpu_hotplug_done(void)
{
cpu_hotplug.active_writer = NULL;
mutex_unlock(&cpu_hotplug.lock);
+ cpuhp_lock_release();
}
+
+/*
+ * Wait for currently running CPU hotplug operations to complete (if any) and
+ * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
+ * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
+ * hotplug path before performing hotplug operations. So acquiring that lock
+ * guarantees mutual exclusion from any currently running hotplug operations.
+ */
+void cpu_hotplug_disable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 1;
+ cpu_maps_update_done();
+}
+
+void cpu_hotplug_enable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 0;
+ cpu_maps_update_done();
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
{
@@ -133,9 +189,35 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
return ret;
}
+int __ref __register_cpu_notifier(struct notifier_block *nb)
+{
+ return raw_notifier_chain_register(&cpu_chain, nb);
+}
+
+static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
+ int *nr_calls)
+{
+ int ret;
+
+ ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
+ nr_calls);
+
+ return notifier_to_errno(ret);
+}
+
+static int cpu_notify(unsigned long val, void *v)
+{
+ return __cpu_notify(val, v, -1, NULL);
+}
+
#ifdef CONFIG_HOTPLUG_CPU
+static void cpu_notify_nofail(unsigned long val, void *v)
+{
+ BUG_ON(cpu_notify(val, v));
+}
EXPORT_SYMBOL(register_cpu_notifier);
+EXPORT_SYMBOL(__register_cpu_notifier);
void __ref unregister_cpu_notifier(struct notifier_block *nb)
{
@@ -145,17 +227,64 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_cpu_notifier);
+void __ref __unregister_cpu_notifier(struct notifier_block *nb)
+{
+ raw_notifier_chain_unregister(&cpu_chain, nb);
+}
+EXPORT_SYMBOL(__unregister_cpu_notifier);
+
+/**
+ * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
+ * @cpu: a CPU id
+ *
+ * This function walks all processes, finds a valid mm struct for each one and
+ * then clears a corresponding bit in mm's cpumask. While this all sounds
+ * trivial, there are various non-obvious corner cases, which this function
+ * tries to solve in a safe manner.
+ *
+ * Also note that the function uses a somewhat relaxed locking scheme, so it may
+ * be called only for an already offlined CPU.
+ */
+void clear_tasks_mm_cpumask(int cpu)
+{
+ struct task_struct *p;
+
+ /*
+ * This function is called after the cpu is taken down and marked
+ * offline, so its not like new tasks will ever get this cpu set in
+ * their mm mask. -- Peter Zijlstra
+ * Thus, we may use rcu_read_lock() here, instead of grabbing
+ * full-fledged tasklist_lock.
+ */
+ WARN_ON(cpu_online(cpu));
+ rcu_read_lock();
+ for_each_process(p) {
+ struct task_struct *t;
+
+ /*
+ * Main thread might exit, but other threads may still have
+ * a valid mm. Find one.
+ */
+ t = find_lock_task_mm(p);
+ if (!t)
+ continue;
+ cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
+ task_unlock(t);
+ }
+ rcu_read_unlock();
+}
+
static inline void check_for_tasks(int cpu)
{
struct task_struct *p;
+ cputime_t utime, stime;
write_lock_irq(&tasklist_lock);
for_each_process(p) {
+ task_cputime(p, &utime, &stime);
if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
- (!cputime_eq(p->utime, cputime_zero) ||
- !cputime_eq(p->stime, cputime_zero)))
- printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
- "(state = %ld, flags = %x)\n",
+ (utime || stime))
+ pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
p->comm, task_pid_nr(p), cpu,
p->state, p->flags);
}
@@ -178,12 +307,9 @@ static int __ref take_cpu_down(void *_param)
if (err < 0)
return err;
- raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
- param->hcpu);
-
- /* Force idle task to run as soon as we yield: it should
- immediately notice cpu is offline and die quickly. */
- sched_idle_next();
+ cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ /* Park the stopper thread */
+ kthread_park(current);
return 0;
}
@@ -191,7 +317,6 @@ static int __ref take_cpu_down(void *_param)
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
{
int err, nr_calls = 0;
- cpumask_var_t old_allowed;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct take_cpu_down_param tcd_param = {
@@ -205,65 +330,69 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
if (!cpu_online(cpu))
return -EINVAL;
- if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
- return -ENOMEM;
-
cpu_hotplug_begin();
- set_cpu_active(cpu, false);
- err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
- hcpu, -1, &nr_calls);
- if (err == NOTIFY_BAD) {
- set_cpu_active(cpu, true);
+ err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
+ if (err) {
nr_calls--;
- __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
- hcpu, nr_calls, NULL);
- printk("%s: attempt to take down CPU %u failed\n",
- __func__, cpu);
- err = -EINVAL;
+ __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
+ pr_warn("%s: attempt to take down CPU %u failed\n",
+ __func__, cpu);
goto out_release;
}
- /* Ensure that we are not runnable on dying cpu */
- cpumask_copy(old_allowed, &current->cpus_allowed);
- set_cpus_allowed_ptr(current, cpu_active_mask);
+ /*
+ * By now we've cleared cpu_active_mask, wait for all preempt-disabled
+ * and RCU users of this state to go away such that all new such users
+ * will observe it.
+ *
+ * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+ * not imply sync_sched(), so explicitly call both.
+ *
+ * Do sync before park smpboot threads to take care the rcu boost case.
+ */
+#ifdef CONFIG_PREEMPT
+ synchronize_sched();
+#endif
+ synchronize_rcu();
+
+ smpboot_park_threads(cpu);
+
+ /*
+ * So now all preempt/rcu users must observe !cpu_active().
+ */
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
- set_cpu_active(cpu, true);
/* CPU didn't die: tell everyone. Can't complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
- hcpu) == NOTIFY_BAD)
- BUG();
-
- goto out_allowed;
+ smpboot_unpark_threads(cpu);
+ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
+ goto out_release;
}
BUG_ON(cpu_online(cpu));
- /* Wait for it to sleep (leaving idle task). */
+ /*
+ * The migration_call() CPU_DYING callback will have removed all
+ * runnable tasks from the cpu, there's only the idle task left now
+ * that the migration thread is done doing the stop_machine thing.
+ *
+ * Wait for the stop thread to go away.
+ */
while (!idle_cpu(cpu))
- yield();
+ cpu_relax();
/* This actually kills the CPU. */
__cpu_die(cpu);
/* CPU is completely dead: tell everyone. Too late to complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
- hcpu) == NOTIFY_BAD)
- BUG();
+ cpu_notify_nofail(CPU_DEAD | mod, hcpu);
check_for_tasks(cpu);
-out_allowed:
- set_cpus_allowed_ptr(current, old_allowed);
out_release:
cpu_hotplug_done();
- if (!err) {
- if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
- hcpu) == NOTIFY_BAD)
- BUG();
- }
- free_cpumask_var(old_allowed);
+ if (!err)
+ cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
return err;
}
@@ -271,9 +400,6 @@ int __ref cpu_down(unsigned int cpu)
{
int err;
- err = stop_machine_create();
- if (err)
- return err;
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
@@ -285,66 +411,82 @@ int __ref cpu_down(unsigned int cpu)
out:
cpu_maps_update_done();
- stop_machine_destroy();
return err;
}
EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/
/* Requires cpu_add_remove_lock to be held */
-static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
+static int _cpu_up(unsigned int cpu, int tasks_frozen)
{
int ret, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
-
- if (cpu_online(cpu) || !cpu_present(cpu))
- return -EINVAL;
+ struct task_struct *idle;
cpu_hotplug_begin();
- ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
- -1, &nr_calls);
- if (ret == NOTIFY_BAD) {
- nr_calls--;
- printk("%s: attempt to bring up CPU %u failed\n",
- __func__, cpu);
+
+ if (cpu_online(cpu) || !cpu_present(cpu)) {
ret = -EINVAL;
+ goto out;
+ }
+
+ idle = idle_thread_get(cpu);
+ if (IS_ERR(idle)) {
+ ret = PTR_ERR(idle);
+ goto out;
+ }
+
+ ret = smpboot_create_threads(cpu);
+ if (ret)
+ goto out;
+
+ ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
+ if (ret) {
+ nr_calls--;
+ pr_warn("%s: attempt to bring up CPU %u failed\n",
+ __func__, cpu);
goto out_notify;
}
/* Arch-specific enabling code. */
- ret = __cpu_up(cpu);
+ ret = __cpu_up(cpu, idle);
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
- set_cpu_active(cpu, true);
+ /* Wake the per cpu threads */
+ smpboot_unpark_threads(cpu);
/* Now call notifier in preparation. */
- raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
+ cpu_notify(CPU_ONLINE | mod, hcpu);
out_notify:
if (ret != 0)
- __raw_notifier_call_chain(&cpu_chain,
- CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+ __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+out:
cpu_hotplug_done();
return ret;
}
-int __cpuinit cpu_up(unsigned int cpu)
+int cpu_up(unsigned int cpu)
{
int err = 0;
+
if (!cpu_possible(cpu)) {
- printk(KERN_ERR "can't online cpu %d because it is not "
- "configured as may-hotadd at boot time\n", cpu);
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
- printk(KERN_ERR "please check additional_cpus= boot "
- "parameter\n");
+ pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
+ cpu);
+#if defined(CONFIG_IA64)
+ pr_err("please check additional_cpus= boot parameter\n");
#endif
return -EINVAL;
}
+ err = try_online_node(cpu_to_node(cpu));
+ if (err)
+ return err;
+
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
@@ -358,17 +500,15 @@ out:
cpu_maps_update_done();
return err;
}
+EXPORT_SYMBOL_GPL(cpu_up);
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
int disable_nonboot_cpus(void)
{
- int cpu, first_cpu, error;
+ int cpu, first_cpu, error = 0;
- error = stop_machine_create();
- if (error)
- return error;
cpu_maps_update_begin();
first_cpu = cpumask_first(cpu_online_mask);
/*
@@ -377,16 +517,17 @@ int disable_nonboot_cpus(void)
*/
cpumask_clear(frozen_cpus);
- printk("Disabling non-boot CPUs ...\n");
+ pr_info("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
if (cpu == first_cpu)
continue;
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1);
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
if (!error)
cpumask_set_cpu(cpu, frozen_cpus);
else {
- printk(KERN_ERR "Error taking CPU%d down: %d\n",
- cpu, error);
+ pr_err("Error taking CPU%d down: %d\n", cpu, error);
break;
}
}
@@ -396,10 +537,9 @@ int disable_nonboot_cpus(void)
/* Make sure the CPUs won't be enabled by someone else */
cpu_hotplug_disabled = 1;
} else {
- printk(KERN_ERR "Non-boot CPUs are not disabled\n");
+ pr_err("Non-boot CPUs are not disabled\n");
}
cpu_maps_update_done();
- stop_machine_destroy();
return error;
}
@@ -421,17 +561,19 @@ void __ref enable_nonboot_cpus(void)
if (cpumask_empty(frozen_cpus))
goto out;
- printk("Enabling non-boot CPUs ...\n");
+ pr_info("Enabling non-boot CPUs ...\n");
arch_enable_nonboot_cpus_begin();
for_each_cpu(cpu, frozen_cpus) {
+ trace_suspend_resume(TPS("CPU_ON"), cpu, true);
error = _cpu_up(cpu, 1);
+ trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
- printk("CPU%d is up\n", cpu);
+ pr_info("CPU%d is up\n", cpu);
continue;
}
- printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
+ pr_warn("Error taking CPU%d up: %d\n", cpu, error);
}
arch_enable_nonboot_cpus_end();
@@ -441,13 +583,61 @@ out:
cpu_maps_update_done();
}
-static int alloc_frozen_cpus(void)
+static int __init alloc_frozen_cpus(void)
{
if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
return -ENOMEM;
return 0;
}
core_initcall(alloc_frozen_cpus);
+
+/*
+ * When callbacks for CPU hotplug notifications are being executed, we must
+ * ensure that the state of the system with respect to the tasks being frozen
+ * or not, as reported by the notification, remains unchanged *throughout the
+ * duration* of the execution of the callbacks.
+ * Hence we need to prevent the freezer from racing with regular CPU hotplug.
+ *
+ * This synchronization is implemented by mutually excluding regular CPU
+ * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
+ * Hibernate notifications.
+ */
+static int
+cpu_hotplug_pm_callback(struct notifier_block *nb,
+ unsigned long action, void *ptr)
+{
+ switch (action) {
+
+ case PM_SUSPEND_PREPARE:
+ case PM_HIBERNATION_PREPARE:
+ cpu_hotplug_disable();
+ break;
+
+ case PM_POST_SUSPEND:
+ case PM_POST_HIBERNATION:
+ cpu_hotplug_enable();
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+
+
+static int __init cpu_hotplug_pm_sync_init(void)
+{
+ /*
+ * cpu_hotplug_pm_callback has higher priority than x86
+ * bsp_pm_callback which depends on cpu_hotplug_pm_callback
+ * to disable cpu hotplug to avoid cpu hotplug race.
+ */
+ pm_notifier(cpu_hotplug_pm_callback, 0);
+ return 0;
+}
+core_initcall(cpu_hotplug_pm_sync_init);
+
#endif /* CONFIG_PM_SLEEP_SMP */
/**
@@ -458,7 +648,7 @@ core_initcall(alloc_frozen_cpus);
* It must be called by the arch code on the new cpu, before the new cpu
* enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
-void __cpuinit notify_cpu_starting(unsigned int cpu)
+void notify_cpu_starting(unsigned int cpu)
{
unsigned long val = CPU_STARTING;
@@ -466,7 +656,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
val = CPU_STARTING_FROZEN;
#endif /* CONFIG_PM_SLEEP_SMP */
- raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
+ cpu_notify(val, (void *)(long)cpu);
}
#endif /* CONFIG_SMP */
@@ -480,7 +670,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
*/
/* cpu_bit_bitmap[0] is empty - so we can back into it */
-#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x)
+#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
@@ -538,10 +728,12 @@ void set_cpu_present(unsigned int cpu, bool present)
void set_cpu_online(unsigned int cpu, bool online)
{
- if (online)
+ if (online) {
cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- else
+ cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+ } else {
cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+ }
}
void set_cpu_active(unsigned int cpu, bool active)
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
new file mode 100644
index 00000000000..9656a3c3650
--- /dev/null
+++ b/kernel/cpu_pm.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (C) 2011 Google, Inc.
+ *
+ * Author:
+ * Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpu_pm.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/syscore_ops.h>
+
+static DEFINE_RWLOCK(cpu_pm_notifier_lock);
+static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+
+static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
+{
+ int ret;
+
+ ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
+ nr_to_call, nr_calls);
+
+ return notifier_to_errno(ret);
+}
+
+/**
+ * cpu_pm_register_notifier - register a driver with cpu_pm
+ * @nb: notifier block to register
+ *
+ * Add a driver to a list of drivers that are notified about
+ * CPU and CPU cluster low power entry and exit.
+ *
+ * This function may sleep, and has the same return conditions as
+ * raw_notifier_chain_register.
+ */
+int cpu_pm_register_notifier(struct notifier_block *nb)
+{
+ unsigned long flags;
+ int ret;
+
+ write_lock_irqsave(&cpu_pm_notifier_lock, flags);
+ ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
+ write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
+
+/**
+ * cpu_pm_unregister_notifier - unregister a driver with cpu_pm
+ * @nb: notifier block to be unregistered
+ *
+ * Remove a driver from the CPU PM notifier list.
+ *
+ * This function may sleep, and has the same return conditions as
+ * raw_notifier_chain_unregister.
+ */
+int cpu_pm_unregister_notifier(struct notifier_block *nb)
+{
+ unsigned long flags;
+ int ret;
+
+ write_lock_irqsave(&cpu_pm_notifier_lock, flags);
+ ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
+ write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
+
+/**
+ * cpu_pm_enter - CPU low power entry notifier
+ *
+ * Notifies listeners that a single CPU is entering a low power state that may
+ * cause some blocks in the same power domain as the cpu to reset.
+ *
+ * Must be called on the affected CPU with interrupts disabled. Platform is
+ * responsible for ensuring that cpu_pm_enter is not called twice on the same
+ * CPU before cpu_pm_exit is called. Notified drivers can include VFP
+ * co-processor, interrupt controller and its PM extensions, local CPU
+ * timers context save/restore which shouldn't be interrupted. Hence it
+ * must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_pm_enter(void)
+{
+ int nr_calls;
+ int ret = 0;
+
+ read_lock(&cpu_pm_notifier_lock);
+ ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
+ if (ret)
+ /*
+ * Inform listeners (nr_calls - 1) about failure of CPU PM
+ * PM entry who are notified earlier to prepare for it.
+ */
+ cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
+ read_unlock(&cpu_pm_notifier_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_enter);
+
+/**
+ * cpu_pm_exit - CPU low power exit notifier
+ *
+ * Notifies listeners that a single CPU is exiting a low power state that may
+ * have caused some blocks in the same power domain as the cpu to reset.
+ *
+ * Notified drivers can include VFP co-processor, interrupt controller
+ * and its PM extensions, local CPU timers context save/restore which
+ * shouldn't be interrupted. Hence it must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_pm_exit(void)
+{
+ int ret;
+
+ read_lock(&cpu_pm_notifier_lock);
+ ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
+ read_unlock(&cpu_pm_notifier_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_exit);
+
+/**
+ * cpu_cluster_pm_enter - CPU cluster low power entry notifier
+ *
+ * Notifies listeners that all cpus in a power domain are entering a low power
+ * state that may cause some blocks in the same power domain to reset.
+ *
+ * Must be called after cpu_pm_enter has been called on all cpus in the power
+ * domain, and before cpu_pm_exit has been called on any cpu in the power
+ * domain. Notified drivers can include VFP co-processor, interrupt controller
+ * and its PM extensions, local CPU timers context save/restore which
+ * shouldn't be interrupted. Hence it must be called with interrupts disabled.
+ *
+ * Must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_cluster_pm_enter(void)
+{
+ int nr_calls;
+ int ret = 0;
+
+ read_lock(&cpu_pm_notifier_lock);
+ ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
+ if (ret)
+ /*
+ * Inform listeners (nr_calls - 1) about failure of CPU cluster
+ * PM entry who are notified earlier to prepare for it.
+ */
+ cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
+ read_unlock(&cpu_pm_notifier_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
+
+/**
+ * cpu_cluster_pm_exit - CPU cluster low power exit notifier
+ *
+ * Notifies listeners that all cpus in a power domain are exiting form a
+ * low power state that may have caused some blocks in the same power domain
+ * to reset.
+ *
+ * Must be called after cpu_pm_exit has been called on all cpus in the power
+ * domain, and before cpu_pm_exit has been called on any cpu in the power
+ * domain. Notified drivers can include VFP co-processor, interrupt controller
+ * and its PM extensions, local CPU timers context save/restore which
+ * shouldn't be interrupted. Hence it must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_cluster_pm_exit(void)
+{
+ int ret;
+
+ read_lock(&cpu_pm_notifier_lock);
+ ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
+ read_unlock(&cpu_pm_notifier_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
+
+#ifdef CONFIG_PM
+static int cpu_pm_suspend(void)
+{
+ int ret;
+
+ ret = cpu_pm_enter();
+ if (ret)
+ return ret;
+
+ ret = cpu_cluster_pm_enter();
+ return ret;
+}
+
+static void cpu_pm_resume(void)
+{
+ cpu_cluster_pm_exit();
+ cpu_pm_exit();
+}
+
+static struct syscore_ops cpu_pm_syscore_ops = {
+ .suspend = cpu_pm_suspend,
+ .resume = cpu_pm_resume,
+};
+
+static int cpu_pm_init(void)
+{
+ register_syscore_ops(&cpu_pm_syscore_ops);
+ return 0;
+}
+core_initcall(cpu_pm_init);
+#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba401fab459..116a4164720 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -37,7 +37,7 @@
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
@@ -55,29 +55,13 @@
#include <linux/sort.h>
#include <asm/uaccess.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
+#include <linux/wait.h>
-/*
- * Workqueue for cpuset related tasks.
- *
- * Using kevent workqueue may cause deadlock when memory_migrate
- * is set. So we create a separate workqueue thread for cpuset.
- */
-static struct workqueue_struct *cpuset_wq;
-
-/*
- * Tracks how many cpusets are currently defined in system.
- * When there is only one cpuset (the root cpuset) we can
- * short circuit some hooks.
- */
-int number_of_cpusets __read_mostly;
-
-/* Forward declare cgroup structures */
-struct cgroup_subsys cpuset_subsys;
-struct cpuset;
+struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
/* See "Frequency meter" comments, below. */
@@ -95,36 +79,65 @@ struct cpuset {
cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
- struct cpuset *parent; /* my parent */
+ /*
+ * This is old Memory Nodes tasks took on.
+ *
+ * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
+ * - A new cpuset's old_mems_allowed is initialized when some
+ * task is moved into it.
+ * - old_mems_allowed is used in cpuset_migrate_mm() when we change
+ * cpuset.mems_allowed and have tasks' nodemask updated, and
+ * then old_mems_allowed is updated to mems_allowed.
+ */
+ nodemask_t old_mems_allowed;
struct fmeter fmeter; /* memory_pressure filter */
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
/* partition number for rebuild_sched_domains() */
int pn;
/* for custom sched domain */
int relax_domain_level;
-
- /* used for walking a cpuset heirarchy */
- struct list_head stack_list;
};
-/* Retrieve the cpuset for a cgroup */
-static inline struct cpuset *cgroup_cs(struct cgroup *cont)
+static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
- return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
- struct cpuset, css);
+ return css ? container_of(css, struct cpuset, css) : NULL;
}
/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
- return container_of(task_subsys_state(task, cpuset_subsys_id),
- struct cpuset, css);
+ return css_cs(task_css(task, cpuset_cgrp_id));
}
+static inline struct cpuset *parent_cs(struct cpuset *cs)
+{
+ return css_cs(cs->css.parent);
+}
+
+#ifdef CONFIG_NUMA
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return task->mempolicy;
+}
+#else
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return false;
+}
+#endif
+
+
/* bits in struct cpuset flags field */
typedef enum {
+ CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
@@ -135,6 +148,11 @@ typedef enum {
} cpuset_flagbits_t;
/* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+ return test_bit(CS_ONLINE, &cs->flags);
+}
+
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -171,27 +189,53 @@ static inline int is_spread_slab(const struct cpuset *cs)
}
static struct cpuset top_cpuset = {
- .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+ .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+ (1 << CS_MEM_EXCLUSIVE)),
};
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_css: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
+ css_for_each_child((pos_css), &(parent_cs)->css) \
+ if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @root_cs: target cpuset to walk ancestor of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_css by calling
+ * css_rightmost_descendant() to skip subtree. @root_cs is included in the
+ * iteration and the first node to be visited.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
+ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
+ if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
+
/*
- * There are two global mutexes guarding cpuset structures. The first
- * is the main control groups cgroup_mutex, accessed via
- * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
- * callback_mutex, below. They can nest. It is ok to first take
- * cgroup_mutex, then nest callback_mutex. We also require taking
- * task_lock() when dereferencing a task's cpuset pointer. See "The
- * task_lock() exception", at the end of this comment.
- *
- * A task must hold both mutexes to modify cpusets. If a task
- * holds cgroup_mutex, then it blocks others wanting that mutex,
- * ensuring that it is the only task able to also acquire callback_mutex
- * and be able to modify cpusets. It can perform various checks on
- * the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding cgroup_mutex. While it is
- * performing these checks, various callback routines can briefly
- * acquire callback_mutex to query cpusets. Once it is ready to make
- * the changes, it takes callback_mutex, blocking everyone else.
+ * There are two global mutexes guarding cpuset structures - cpuset_mutex
+ * and callback_mutex. The latter may nest inside the former. We also
+ * require taking task_lock() when dereferencing a task's cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both mutexes to modify cpusets. If a task holds
+ * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * is the only task able to also acquire callback_mutex and be able to
+ * modify cpusets. It can perform various checks on the cpuset structure
+ * first, knowing nothing will change. It can also allocate memory while
+ * just holding cpuset_mutex. While it is performing these checks, various
+ * callback routines can briefly acquire callback_mutex to query cpusets.
+ * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_mutex, as that would risk double tripping on callback_mutex
@@ -213,36 +257,33 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
+static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
/*
- * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
- * buffers. They are statically allocated to prevent using excess stack
- * when calling cpuset_print_task_mems_allowed().
+ * CPU / memory hotplug is handled asynchronously.
*/
-#define CPUSET_NAME_LEN (128)
-#define CPUSET_NODELIST_LEN (256)
-static char cpuset_name[CPUSET_NAME_LEN];
-static char cpuset_nodelist[CPUSET_NODELIST_LEN];
-static DEFINE_SPINLOCK(cpuset_buffer_lock);
+static void cpuset_hotplug_workfn(struct work_struct *work);
+static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
+
+static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
*/
-static int cpuset_get_sb(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data, struct vfsmount *mnt)
+static struct dentry *cpuset_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name, void *data)
{
struct file_system_type *cgroup_fs = get_fs_type("cgroup");
- int ret = -ENODEV;
+ struct dentry *ret = ERR_PTR(-ENODEV);
if (cgroup_fs) {
char mountopts[] =
"cpuset,noprefix,"
"release_agent=/sbin/cpuset_release_agent";
- ret = cgroup_fs->get_sb(cgroup_fs, flags,
- unused_dev_name, mountopts, mnt);
+ ret = cgroup_fs->mount(cgroup_fs, flags,
+ unused_dev_name, mountopts);
put_filesystem(cgroup_fs);
}
return ret;
@@ -250,65 +291,49 @@ static int cpuset_get_sb(struct file_system_type *fs_type,
static struct file_system_type cpuset_fs_type = {
.name = "cpuset",
- .get_sb = cpuset_get_sb,
+ .mount = cpuset_mount,
};
/*
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus. If we get
- * all the way to the top and still haven't found any online cpus,
- * return cpu_online_map. Or if passed a NULL cs from an exit'ing
- * task, return cpu_online_map.
+ * until we find one that does have some online cpus. The top
+ * cpuset always has some cpus online.
*
* One way or another, we guarantee to return some non-empty subset
- * of cpu_online_map.
+ * of cpu_online_mask.
*
* Call with callback_mutex held.
*/
-
-static void guarantee_online_cpus(const struct cpuset *cs,
- struct cpumask *pmask)
+static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
- cs = cs->parent;
- if (cs)
- cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
- else
- cpumask_copy(pmask, cpu_online_mask);
- BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
+ while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+ cs = parent_cs(cs);
+ cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
}
/*
* Return in *pmask the portion of a cpusets's mems_allowed that
* are online, with memory. If none are online with memory, walk
* up the cpuset hierarchy until we find one that does have some
- * online mems. If we get all the way to the top and still haven't
- * found any online mems, return node_states[N_HIGH_MEMORY].
+ * online mems. The top cpuset always has some mems online.
*
* One way or another, we guarantee to return some non-empty subset
- * of node_states[N_HIGH_MEMORY].
+ * of node_states[N_MEMORY].
*
* Call with callback_mutex held.
*/
-
-static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
+static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (cs && !nodes_intersects(cs->mems_allowed,
- node_states[N_HIGH_MEMORY]))
- cs = cs->parent;
- if (cs)
- nodes_and(*pmask, cs->mems_allowed,
- node_states[N_HIGH_MEMORY]);
- else
- *pmask = node_states[N_HIGH_MEMORY];
- BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
+ while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+ cs = parent_cs(cs);
+ nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
}
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Called with callback_mutex/cgroup_mutex held
+ * Called with callback_mutex/cpuset_mutex held
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -328,7 +353,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cgroup_mutex.
+ * are only set if the other's are set. Call holding cpuset_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -343,7 +368,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
* alloc_trial_cpuset - allocate a trial cpuset
* @cs: the cpuset that the trial cpuset duplicates
*/
-static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
+static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
struct cpuset *trial;
@@ -377,7 +402,7 @@ static void free_trial_cpuset(struct cpuset *trial)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cgroup_mutex held.
+ * cpuset_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -390,52 +415,66 @@ static void free_trial_cpuset(struct cpuset *trial)
* Return 0 if valid, -errno if not.
*/
-static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
+static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
- struct cgroup *cont;
+ struct cgroup_subsys_state *css;
struct cpuset *c, *par;
+ int ret;
+
+ rcu_read_lock();
/* Each of our child cpusets must be a subset of us */
- list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
- if (!is_cpuset_subset(cgroup_cs(cont), trial))
- return -EBUSY;
- }
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
/* Remaining checks don't apply to root cpuset */
+ ret = 0;
if (cur == &top_cpuset)
- return 0;
+ goto out;
- par = cur->parent;
+ par = parent_cs(cur);
/* We must be a subset of our parent cpuset */
+ ret = -EACCES;
if (!is_cpuset_subset(trial, par))
- return -EACCES;
+ goto out;
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
*/
- list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
- c = cgroup_cs(cont);
+ ret = -EINVAL;
+ cpuset_for_each_child(c, css, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- return -EINVAL;
+ goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
- return -EINVAL;
+ goto out;
}
- /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
- if (cgroup_task_count(cur->css.cgroup)) {
- if (cpumask_empty(trial->cpus_allowed) ||
- nodes_empty(trial->mems_allowed)) {
- return -ENOSPC;
- }
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * be changed to have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
+ if (!cpumask_empty(cur->cpus_allowed) &&
+ cpumask_empty(trial->cpus_allowed))
+ goto out;
+ if (!nodes_empty(cur->mems_allowed) &&
+ nodes_empty(trial->mems_allowed))
+ goto out;
}
- return 0;
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
#ifdef CONFIG_SMP
@@ -456,31 +495,27 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
return;
}
-static void
-update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
{
- LIST_HEAD(q);
-
- list_add(&c->stack_list, &q);
- while (!list_empty(&q)) {
- struct cpuset *cp;
- struct cgroup *cont;
- struct cpuset *child;
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ if (cp == root_cs)
+ continue;
- if (cpumask_empty(cp->cpus_allowed))
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
continue;
+ }
if (is_sched_load_balance(cp))
update_domain_attr(dattr, cp);
-
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
}
+ rcu_read_unlock();
}
/*
@@ -489,7 +524,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* This function builds a partial partition of the systems CPUs
* A 'partial partition' is a set of non-overlapping subsets whose
* union is a subset of that set.
- * The output of this function needs to be passed to kernel/sched.c
+ * The output of this function needs to be passed to kernel/sched/core.c
* partition_sched_domains() routine, which will rebuild the scheduler's
* load balancing domains (sched domains) as specified by that partial
* partition.
@@ -502,7 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cgroup_lock held.
+ * Must be called with cpuset_mutex held.
*
* The three key local variables below are:
* q - a linked-list queue of cpuset pointers, used to implement a
@@ -518,7 +553,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* is a subset of one of these domains, while there are as
* many such domains as possible, each as small as possible.
* doms - Conversion of 'csa' to an array of cpumasks, for passing to
- * the kernel/sched.c routine partition_sched_domains() in a
+ * the kernel/sched/core.c routine partition_sched_domains() in a
* convenient format, that can be easily compared to the prior
* value to determine what partition elements (sched domains)
* were changed (added or removed.)
@@ -540,7 +575,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
- LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
@@ -549,6 +583,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup_subsys_state *pos_css;
doms = NULL;
dattr = NULL;
@@ -571,38 +606,34 @@ static int generate_sched_domains(cpumask_var_t **domains,
goto done;
}
- csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
+ csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
csn = 0;
- list_add(&top_cpuset.stack_list, &q);
- while (!list_empty(&q)) {
- struct cgroup *cont;
- struct cpuset *child; /* scans child cpusets of cp */
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
-
- if (cpumask_empty(cp->cpus_allowed))
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
+ if (cp == &top_cpuset)
continue;
-
/*
- * All child cpusets contain a subset of the parent's cpus, so
- * just skip them, and then we call update_domain_attr_tree()
- * to calc relax_domain_level of the corresponding sched
- * domain.
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
*/
- if (is_sched_load_balance(cp)) {
- csa[csn++] = cp;
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !is_sched_load_balance(cp))
continue;
- }
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
- }
+ if (is_sched_load_balance(cp))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ }
+ rcu_read_unlock();
for (i = 0; i < csn; i++)
csa[i]->pn = i;
@@ -660,11 +691,8 @@ restart:
if (nslot == ndoms) {
static int warnings = 10;
if (warnings) {
- printk(KERN_WARNING
- "rebuild_sched_domains confused:"
- " nslot %d, ndoms %d, csn %d, i %d,"
- " apn %d\n",
- nslot, ndoms, csn, i, apn);
+ pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
+ nslot, ndoms, csn, i, apn);
warnings--;
}
continue;
@@ -707,155 +735,163 @@ done:
/*
* Rebuild scheduler domains.
*
- * Call with neither cgroup_mutex held nor within get_online_cpus().
- * Takes both cgroup_mutex and get_online_cpus().
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
*
- * Cannot be directly called from cpuset code handling changes
- * to the cpuset pseudo-filesystem, because it cannot be called
- * from code that already holds cgroup_mutex.
+ * Call with cpuset_mutex held. Takes get_online_cpus().
*/
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
+ lockdep_assert_held(&cpuset_mutex);
get_online_cpus();
+ /*
+ * We have raced with CPU hotplug. Don't do anything to avoid
+ * passing doms with offlined cpu to partition_sched_domains().
+ * Anyways, hotplug work item will rebuild sched domains.
+ */
+ if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+ goto out;
+
/* Generate domain masks and attrs */
- cgroup_lock();
ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-
+out:
put_online_cpus();
}
#else /* !CONFIG_SMP */
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
}
+#endif /* CONFIG_SMP */
-static int generate_sched_domains(cpumask_var_t **domains,
- struct sched_domain_attr **attributes)
+void rebuild_sched_domains(void)
{
- *domains = NULL;
- return 1;
+ mutex_lock(&cpuset_mutex);
+ rebuild_sched_domains_locked();
+ mutex_unlock(&cpuset_mutex);
}
-#endif /* CONFIG_SMP */
-
-static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
/*
- * Rebuild scheduler domains, asynchronously via workqueue.
- *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
+ * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
+ * @cs: the cpuset in interest
*
- * The rebuild_sched_domains() and partition_sched_domains()
- * routines must nest cgroup_lock() inside get_online_cpus(),
- * but such cpuset changes as these must nest that locking the
- * other way, holding cgroup_lock() for much of the code.
+ * A cpuset's effective cpumask is the cpumask of the nearest ancestor
+ * with non-empty cpus. We use effective cpumask whenever:
+ * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
+ * if the cpuset they reside in has no cpus)
+ * - we want to retrieve task_cs(tsk)'s cpus_allowed.
*
- * So in order to avoid an ABBA deadlock, the cpuset code handling
- * these user changes delegates the actual sched domain rebuilding
- * to a separate workqueue thread, which ends up processing the
- * above do_rebuild_sched_domains() function.
+ * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
+ * exception. See comments there.
*/
-static void async_rebuild_sched_domains(void)
+static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
{
- queue_work(cpuset_wq, &rebuild_sched_domains_work);
+ while (cpumask_empty(cs->cpus_allowed))
+ cs = parent_cs(cs);
+ return cs;
}
/*
- * Accomplishes the same scheduler domain rebuild as the above
- * async_rebuild_sched_domains(), however it directly calls the
- * rebuild routine synchronously rather than calling it via an
- * asynchronous work thread.
+ * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
+ * @cs: the cpuset in interest
*
- * This can only be called from code that is not holding
- * cgroup_mutex (not nested in a cgroup_lock() call.)
- */
-void rebuild_sched_domains(void)
-{
- do_rebuild_sched_domains(NULL);
-}
-
-/**
- * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
- * @tsk: task to test
- * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
- *
- * Call with cgroup_mutex held. May take callback_mutex during call.
- * Called for each task in a cgroup by cgroup_scan_tasks().
- * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
- * words, if its mask is not equal to its cpuset's mask).
+ * A cpuset's effective nodemask is the nodemask of the nearest ancestor
+ * with non-empty memss. We use effective nodemask whenever:
+ * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
+ * if the cpuset they reside in has no mems)
+ * - we want to retrieve task_cs(tsk)'s mems_allowed.
+ *
+ * Called with cpuset_mutex held.
*/
-static int cpuset_test_cpumask(struct task_struct *tsk,
- struct cgroup_scanner *scan)
+static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
{
- return !cpumask_equal(&tsk->cpus_allowed,
- (cgroup_cs(scan->cg))->cpus_allowed);
+ while (nodes_empty(cs->mems_allowed))
+ cs = parent_cs(cs);
+ return cs;
}
/**
- * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
- * @tsk: task to test
- * @scan: struct cgroup_scanner containing the cgroup of the task
- *
- * Called by cgroup_scan_tasks() for each task in a cgroup whose
- * cpus_allowed mask needs to be changed.
+ * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
*
- * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * Iterate through each task of @cs updating its cpus_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
*/
-static void cpuset_change_cpumask(struct task_struct *tsk,
- struct cgroup_scanner *scan)
+static void update_tasks_cpumask(struct cpuset *cs)
{
- set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
+ struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+ css_task_iter_end(&it);
}
-/**
- * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
- * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
- * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
- *
- * Called with cgroup_mutex held
+/*
+ * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update root cpuset or not?
*
- * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
+ * This will update cpumasks of tasks in @root_cs and all other empty cpusets
+ * which take on cpumask of @root_cs.
*
- * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Called with cpuset_mutex held
*/
-static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
-{
- struct cgroup_scanner scan;
+static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ if (cp == root_cs) {
+ if (!update_root)
+ continue;
+ } else {
+ /* skip the whole subtree if @cp have some CPU */
+ if (!cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ }
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
+
+ update_tasks_cpumask(cp);
- scan.cg = cs->css.cgroup;
- scan.test_task = cpuset_test_cpumask;
- scan.process_task = cpuset_change_cpumask;
- scan.heap = heap;
- cgroup_scan_tasks(&scan);
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
}
/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
+ * @trialcs: trial cpuset
* @buf: buffer of cpu numbers written to this cpuset
*/
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- struct ptr_heap heap;
int retval;
int is_load_balanced;
- /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+ /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
return -EACCES;
@@ -875,16 +911,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
return -EINVAL;
}
- retval = validate_change(cs, trialcs);
- if (retval < 0)
- return retval;
/* Nothing to do if the cpus didn't change */
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (retval)
+ retval = validate_change(cs, trialcs);
+ if (retval < 0)
return retval;
is_load_balanced = is_sched_load_balance(trialcs);
@@ -893,16 +926,10 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
- /*
- * Scan tasks in the cpuset, and update the cpumasks of any
- * that need an update.
- */
- update_tasks_cpumask(cs, &heap);
-
- heap_free(&heap);
+ update_tasks_cpumask_hier(cs, true);
if (is_load_balanced)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
return 0;
}
@@ -914,15 +941,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Temporarilly set tasks mems_allowed to target nodes of migration,
* so that the migration code can allocate pages on these nodes.
*
- * Call holding cgroup_mutex, so current's cpuset won't change
- * during this call, as manage_mutex holds off any cpuset_attach()
- * calls. Therefore we don't need to take task_lock around the
- * call to guarantee_online_mems(), as we know no one is changing
- * our task's cpuset.
- *
- * Hold callback_mutex around the two modifications of our tasks
- * mems_allowed to synchronize with cpuset_mems_allowed().
- *
* While the mm_struct we are migrating is typically from some
* other task, the task_struct mems_allowed that we are hacking
* is for our current task, which must allocate new pages for that
@@ -933,12 +951,16 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
struct task_struct *tsk = current;
+ struct cpuset *mems_cs;
tsk->mems_allowed = *to;
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
- guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
+ rcu_read_lock();
+ mems_cs = effective_nodemask_cpuset(task_cs(tsk));
+ guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+ rcu_read_unlock();
}
/*
@@ -949,49 +971,48 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
* In order to avoid seeing no nodes if the old and new nodes are disjoint,
* we structure updates as setting all new allowed nodes, then clearing newly
* disallowed ones.
- *
- * Called with task's alloc_lock held
*/
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
- nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, &tsk->mems_allowed);
- mpol_rebind_task(tsk, newmems);
- tsk->mems_allowed = *newmems;
-}
+ bool need_loop;
-/*
- * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
- * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
- * memory_migrate flag is set. Called with cgroup_mutex held.
- */
-static void cpuset_change_nodemask(struct task_struct *p,
- struct cgroup_scanner *scan)
-{
- struct mm_struct *mm;
- struct cpuset *cs;
- int migrate;
- const nodemask_t *oldmem = scan->data;
- nodemask_t newmems;
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ return;
+ if (current->flags & PF_EXITING) /* Let dying task have memory */
+ return;
- cs = cgroup_cs(scan->cg);
- guarantee_online_mems(cs, &newmems);
+ task_lock(tsk);
+ /*
+ * Determine if a loop is necessary if another thread is doing
+ * read_mems_allowed_begin(). If at least one node remains unchanged and
+ * tsk does not have a mempolicy, then an empty nodemask will not be
+ * possible when mems_allowed is larger than a word.
+ */
+ need_loop = task_has_mempolicy(tsk) ||
+ !nodes_intersects(*newmems, tsk->mems_allowed);
- task_lock(p);
- cpuset_change_task_nodemask(p, &newmems);
- task_unlock(p);
+ if (need_loop) {
+ local_irq_disable();
+ write_seqcount_begin(&tsk->mems_allowed_seq);
+ }
- mm = get_task_mm(p);
- if (!mm)
- return;
+ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
- migrate = is_memory_migrate(cs);
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
+ tsk->mems_allowed = *newmems;
+
+ if (need_loop) {
+ write_seqcount_end(&tsk->mems_allowed_seq);
+ local_irq_enable();
+ }
- mpol_rebind_mm(mm, &cs->mems_allowed);
- if (migrate)
- cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
- mmput(mm);
+ task_unlock(tsk);
}
static void *cpuset_being_rebound;
@@ -999,43 +1020,102 @@ static void *cpuset_being_rebound;
/**
* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
- * @oldmem: old mems_allowed of cpuset cs
- * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
- * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Iterate through each task of @cs updating its mems_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
*/
-static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
- struct ptr_heap *heap)
+static void update_tasks_nodemask(struct cpuset *cs)
{
- struct cgroup_scanner scan;
+ static nodemask_t newmems; /* protected by cpuset_mutex */
+ struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
+ struct css_task_iter it;
+ struct task_struct *task;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
- scan.cg = cs->css.cgroup;
- scan.test_task = NULL;
- scan.process_task = cpuset_change_nodemask;
- scan.heap = heap;
- scan.data = (nodemask_t *)oldmem;
+ guarantee_online_mems(mems_cs, &newmems);
/*
* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cgroup_mutex, we know that no other rebind effort
+ * the global cpuset_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- cgroup_scan_tasks(&scan);
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it))) {
+ struct mm_struct *mm;
+ bool migrate;
+
+ cpuset_change_task_nodemask(task, &newmems);
+
+ mm = get_task_mm(task);
+ if (!mm)
+ continue;
+
+ migrate = is_memory_migrate(cs);
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
+ mmput(mm);
+ }
+ css_task_iter_end(&it);
+
+ /*
+ * All the tasks' nodemasks have been updated, update
+ * cs->old_mems_allowed.
+ */
+ cs->old_mems_allowed = newmems;
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
cpuset_being_rebound = NULL;
}
/*
+ * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
+ * @cs: the root cpuset of the hierarchy
+ * @update_root: update the root cpuset or not?
+ *
+ * This will update nodemasks of tasks in @root_cs and all other empty cpusets
+ * which take on nodemask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ if (cp == root_cs) {
+ if (!update_root)
+ continue;
+ } else {
+ /* skip the whole subtree if @cp have some CPU */
+ if (!nodes_empty(cp->mems_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ }
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
+
+ update_tasks_nodemask(cp);
+
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
+}
+
+/*
* Handle user request to change the 'mems' memory placement
* of a cpuset. Needs to validate the request, update the
* cpusets mems_allowed, and for each task in the cpuset,
@@ -1043,7 +1123,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1051,16 +1131,16 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- nodemask_t oldmem;
int retval;
- struct ptr_heap heap;
/*
- * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
+ * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
* it's read-only
*/
- if (cs == &top_cpuset)
- return -EACCES;
+ if (cs == &top_cpuset) {
+ retval = -EACCES;
+ goto done;
+ }
/*
* An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1076,11 +1156,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
goto done;
if (!nodes_subset(trialcs->mems_allowed,
- node_states[N_HIGH_MEMORY]))
- return -EINVAL;
+ node_states[N_MEMORY])) {
+ retval = -EINVAL;
+ goto done;
+ }
}
- oldmem = cs->mems_allowed;
- if (nodes_equal(oldmem, trialcs->mems_allowed)) {
+
+ if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
retval = 0; /* Too easy - nothing to do */
goto done;
}
@@ -1088,30 +1170,30 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
- retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (retval < 0)
- goto done;
-
mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs->mems_allowed;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(cs, &oldmem, &heap);
-
- heap_free(&heap);
+ update_tasks_nodemask_hier(cs, true);
done:
return retval;
}
int current_cpuset_is_being_rebound(void)
{
- return task_cs(current) == cpuset_being_rebound;
+ int ret;
+
+ rcu_read_lock();
+ ret = task_cs(current) == cpuset_being_rebound;
+ rcu_read_unlock();
+
+ return ret;
}
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
- if (val < -1 || val >= SD_LV_MAX)
+ if (val < -1 || val >= sched_domain_level_max)
return -EINVAL;
#endif
@@ -1119,50 +1201,29 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
}
return 0;
}
-/*
- * cpuset_change_flag - make a task's spread flags the same as its cpuset's
- * @tsk: task to be updated
- * @scan: struct cgroup_scanner containing the cgroup of the task
- *
- * Called by cgroup_scan_tasks() for each task in a cgroup.
- *
- * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
- */
-static void cpuset_change_flag(struct task_struct *tsk,
- struct cgroup_scanner *scan)
-{
- cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
-}
-
-/*
+/**
* update_tasks_flags - update the spread flags of tasks in the cpuset.
* @cs: the cpuset in which each task's spread flags needs to be changed
- * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
- *
- * Called with cgroup_mutex held
- *
- * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
*
- * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Iterate through each task of @cs updating its spread flags. As this
+ * function is called with cpuset_mutex held, cpuset membership stays
+ * stable.
*/
-static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
+static void update_tasks_flags(struct cpuset *cs)
{
- struct cgroup_scanner scan;
+ struct css_task_iter it;
+ struct task_struct *task;
- scan.cg = cs->css.cgroup;
- scan.test_task = NULL;
- scan.process_task = cpuset_change_flag;
- scan.heap = heap;
- cgroup_scan_tasks(&scan);
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ cpuset_update_task_spread_flag(cs, task);
+ css_task_iter_end(&it);
}
/*
@@ -1171,7 +1232,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cgroup_mutex held.
+ * Call with cpuset_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1180,7 +1241,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
struct cpuset *trialcs;
int balance_flag_changed;
int spread_flag_changed;
- struct ptr_heap heap;
int err;
trialcs = alloc_trial_cpuset(cs);
@@ -1196,10 +1256,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (err < 0)
goto out;
- err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (err < 0)
- goto out;
-
balance_flag_changed = (is_sched_load_balance(cs) !=
is_sched_load_balance(trialcs));
@@ -1211,11 +1267,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
mutex_unlock(&callback_mutex);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
if (spread_flag_changed)
- update_tasks_flags(cs, &heap);
- heap_free(&heap);
+ update_tasks_flags(cs);
out:
free_trial_cpuset(trialcs);
return err;
@@ -1319,105 +1374,140 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
+static struct cpuset *cpuset_attach_old_cs;
-/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct task_struct *tsk, bool threadgroup)
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
+static int cpuset_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
{
+ struct cpuset *cs = css_cs(css);
+ struct task_struct *task;
int ret;
- struct cpuset *cs = cgroup_cs(cont);
- if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
- return -ENOSPC;
+ /* used later by cpuset_attach() */
+ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
+
+ mutex_lock(&cpuset_mutex);
/*
- * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
- * cannot change their cpu affinity and isolating such threads by their
- * set of allowed nodes is unnecessary. Thus, cpusets are not
- * applicable for such threads. This prevents checking for success of
- * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
- * be changed.
+ * We allow to move tasks into an empty cpuset if sane_behavior
+ * flag is set.
*/
- if (tsk->flags & PF_THREAD_BOUND)
- return -EINVAL;
-
- ret = security_task_setscheduler(tsk, 0, NULL);
- if (ret)
- return ret;
- if (threadgroup) {
- struct task_struct *c;
+ ret = -ENOSPC;
+ if (!cgroup_sane_behavior(css->cgroup) &&
+ (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
+ goto out_unlock;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- ret = security_task_setscheduler(c, 0, NULL);
- if (ret) {
- rcu_read_unlock();
- return ret;
- }
- }
- rcu_read_unlock();
+ cgroup_taskset_for_each(task, tset) {
+ /*
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
+ */
+ ret = -EINVAL;
+ if (task->flags & PF_NO_SETAFFINITY)
+ goto out_unlock;
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
}
- return 0;
-}
-static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
- struct cpuset *cs)
-{
- int err;
/*
- * can_attach beforehand should guarantee that this doesn't fail.
- * TODO: have a better way to handle failure here
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
*/
- err = set_cpus_allowed_ptr(tsk, cpus_attach);
- WARN_ON_ONCE(err);
-
- task_lock(tsk);
- cpuset_change_task_nodemask(tsk, to);
- task_unlock(tsk);
- cpuset_update_task_spread_flag(cs, tsk);
+ cs->attach_in_progress++;
+ ret = 0;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
+}
+static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ mutex_lock(&cpuset_mutex);
+ css_cs(css)->attach_in_progress--;
+ mutex_unlock(&cpuset_mutex);
}
-static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct cgroup *oldcont, struct task_struct *tsk,
- bool threadgroup)
+/*
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * but we can't allocate it dynamically there. Define it global and
+ * allocate from cpuset_init().
+ */
+static cpumask_var_t cpus_attach;
+
+static void cpuset_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
{
- nodemask_t from, to;
+ /* static buf protected by cpuset_mutex */
+ static nodemask_t cpuset_attach_nodemask_to;
struct mm_struct *mm;
- struct cpuset *cs = cgroup_cs(cont);
- struct cpuset *oldcs = cgroup_cs(oldcont);
+ struct task_struct *task;
+ struct task_struct *leader = cgroup_taskset_first(tset);
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *oldcs = cpuset_attach_old_cs;
+ struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
+ struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
- if (cs == &top_cpuset) {
+ mutex_lock(&cpuset_mutex);
+
+ /* prepare for attach */
+ if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
- to = node_possible_map;
- } else {
- guarantee_online_cpus(cs, cpus_attach);
- guarantee_online_mems(cs, &to);
- }
+ else
+ guarantee_online_cpus(cpus_cs, cpus_attach);
- /* do per-task migration stuff possibly for each in the threadgroup */
- cpuset_attach_task(tsk, &to, cs);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- cpuset_attach_task(c, &to, cs);
- }
- rcu_read_unlock();
+ guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
+
+ cgroup_taskset_for_each(task, tset) {
+ /*
+ * can_attach beforehand should guarantee that this doesn't
+ * fail. TODO: have a better way to handle failure here
+ */
+ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+ cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+ cpuset_update_task_spread_flag(cs, task);
}
- /* change mm; only needs to be done once even if threadgroup */
- from = oldcs->mems_allowed;
- to = cs->mems_allowed;
- mm = get_task_mm(tsk);
+ /*
+ * Change mm, possibly for multiple threads in a threadgroup. This is
+ * expensive and may sleep.
+ */
+ cpuset_attach_nodemask_to = cs->mems_allowed;
+ mm = get_task_mm(leader);
if (mm) {
- mpol_rebind_mm(mm, &to);
- if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, &from, &to);
+ struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
+
+ mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
+
+ /*
+ * old_mems_allowed is the same with mems_allowed here, except
+ * if this task is being moved automatically due to hotplug.
+ * In that case @mems_allowed has been updated and is empty,
+ * so @old_mems_allowed is the right nodesets that we migrate
+ * mm from.
+ */
+ if (is_memory_migrate(cs)) {
+ cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
+ &cpuset_attach_nodemask_to);
+ }
mmput(mm);
}
+
+ cs->old_mems_allowed = cpuset_attach_nodemask_to;
+
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+
+ mutex_unlock(&cpuset_mutex);
}
/* The various types of files and directories in a cpuset file system */
@@ -1437,14 +1527,18 @@ typedef enum {
FILE_SPREAD_SLAB,
} cpuset_filetype_t;
-static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val)
{
- int retval = 0;
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
+ int retval = 0;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs)) {
+ retval = -ENODEV;
+ goto out_unlock;
+ }
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1478,18 +1572,21 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
-static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
+static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
+ s64 val)
{
- int retval = 0;
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1499,28 +1596,57 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
/*
* Common handling for a write to a "cpus" or "mems" file.
*/
-static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
- const char *buf)
+static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- int retval = 0;
- struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *cs = css_cs(of_css(of));
struct cpuset *trialcs;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ buf = strstrip(buf);
+
+ /*
+ * CPU or memory hotunplug may leave @cs w/o any execution
+ * resources, in which case the hotplug code asynchronously updates
+ * configuration and transfers all tasks to the nearest ancestor
+ * which can execute.
+ *
+ * As writes to "cpus" or "mems" may restore @cs's execution
+ * resources, wait for the previously scheduled operations before
+ * proceeding, so that we don't end up keep removing tasks added
+ * after execution capability is restored.
+ *
+ * cpuset_hotplug_work calls back into cgroup core via
+ * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+ * operation like this one can lead to a deadlock through kernfs
+ * active_ref protection. Let's break the protection. Losing the
+ * protection is okay as we check whether @cs is online after
+ * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * hierarchies.
+ */
+ css_get(&cs->css);
+ kernfs_break_active_protection(of->kn);
+ flush_work(&cpuset_hotplug_work);
+
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
trialcs = alloc_trial_cpuset(cs);
- if (!trialcs)
- return -ENOMEM;
+ if (!trialcs) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
- switch (cft->private) {
+ switch (of_cft(of)->private) {
case FILE_CPULIST:
retval = update_cpumask(cs, trialcs, buf);
break;
@@ -1533,8 +1659,11 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
}
free_trial_cpuset(trialcs);
- cgroup_unlock();
- return retval;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ kernfs_unbreak_active_protection(of->kn);
+ css_put(&cs->css);
+ return retval ?: nbytes;
}
/*
@@ -1544,72 +1673,46 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
* used, list of ranges of sequential numbers, is variable length,
* and since these maps can change value dynamically, one could read
* gibberish by doing partial reads while a list was changing.
- * A single large read to a buffer that crosses a page boundary is
- * ok, because the result being copied to user land is not recomputed
- * across a page fault.
*/
-
-static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
- int ret;
-
- mutex_lock(&callback_mutex);
- ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
- mutex_unlock(&callback_mutex);
+ struct cpuset *cs = css_cs(seq_css(sf));
+ cpuset_filetype_t type = seq_cft(sf)->private;
+ ssize_t count;
+ char *buf, *s;
+ int ret = 0;
- return ret;
-}
-
-static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
-{
- nodemask_t mask;
+ count = seq_get_buf(sf, &buf);
+ s = buf;
mutex_lock(&callback_mutex);
- mask = cs->mems_allowed;
- mutex_unlock(&callback_mutex);
-
- return nodelist_scnprintf(page, PAGE_SIZE, mask);
-}
-
-static ssize_t cpuset_common_file_read(struct cgroup *cont,
- struct cftype *cft,
- struct file *file,
- char __user *buf,
- size_t nbytes, loff_t *ppos)
-{
- struct cpuset *cs = cgroup_cs(cont);
- cpuset_filetype_t type = cft->private;
- char *page;
- ssize_t retval = 0;
- char *s;
-
- if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
- return -ENOMEM;
-
- s = page;
switch (type) {
case FILE_CPULIST:
- s += cpuset_sprintf_cpulist(s, cs);
+ s += cpulist_scnprintf(s, count, cs->cpus_allowed);
break;
case FILE_MEMLIST:
- s += cpuset_sprintf_memlist(s, cs);
+ s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
default:
- retval = -EINVAL;
- goto out;
+ ret = -EINVAL;
+ goto out_unlock;
}
- *s++ = '\n';
- retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
-out:
- free_page((unsigned long)page);
- return retval;
+ if (s < buf + count - 1) {
+ *s++ = '\n';
+ seq_commit(sf, s - buf);
+ } else {
+ seq_commit(sf, -1);
+ }
+out_unlock:
+ mutex_unlock(&callback_mutex);
+ return ret;
}
-static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
+static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1638,9 +1741,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
return 0;
}
-static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
+static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1661,16 +1764,16 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
static struct cftype files[] = {
{
.name = "cpus",
- .read = cpuset_common_file_read,
- .write_string = cpuset_write_resmask,
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
},
{
.name = "mems",
- .read = cpuset_common_file_read,
- .write_string = cpuset_write_resmask,
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
.max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
},
@@ -1738,84 +1841,32 @@ static struct cftype files[] = {
.write_u64 = cpuset_write_u64,
.private = FILE_SPREAD_SLAB,
},
-};
-
-static struct cftype cft_memory_pressure_enabled = {
- .name = "memory_pressure_enabled",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_PRESSURE_ENABLED,
-};
-
-static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- int err;
-
- err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
- if (err)
- return err;
- /* memory_pressure_enabled is in root cpuset only */
- if (!cont->parent)
- err = cgroup_add_file(cont, ss,
- &cft_memory_pressure_enabled);
- return err;
-}
-
-/*
- * post_clone() is called at the end of cgroup_clone().
- * 'cgroup' was just created automatically as a result of
- * a cgroup_clone(), and the current task is about to
- * be moved into 'cgroup'.
- *
- * Currently we refuse to set up the cgroup - thereby
- * refusing the task to be entered, and as a result refusing
- * the sys_unshare() or clone() which initiated it - if any
- * sibling cpusets have exclusive cpus or mem.
- *
- * If this becomes a problem for some users who wish to
- * allow that scenario, then cpuset_post_clone() could be
- * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
- * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
- * held.
- */
-static void cpuset_post_clone(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
-{
- struct cgroup *parent, *child;
- struct cpuset *cs, *parent_cs;
- parent = cgroup->parent;
- list_for_each_entry(child, &parent->children, sibling) {
- cs = cgroup_cs(child);
- if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
- return;
- }
- cs = cgroup_cs(cgroup);
- parent_cs = cgroup_cs(parent);
+ {
+ .name = "memory_pressure_enabled",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_PRESSURE_ENABLED,
+ },
- cs->mems_allowed = parent_cs->mems_allowed;
- cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
- return;
-}
+ { } /* terminate */
+};
/*
- * cpuset_create - create a cpuset
- * ss: cpuset cgroup subsystem
- * cont: control group that the new cpuset will be part of
+ * cpuset_css_alloc - allocate a cpuset css
+ * cgrp: control group that the new cpuset will be part of
*/
-static struct cgroup_subsys_state *cpuset_create(
- struct cgroup_subsys *ss,
- struct cgroup *cont)
+static struct cgroup_subsys_state *
+cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cpuset *cs;
- struct cpuset *parent;
- if (!cont->parent) {
+ if (!parent_css)
return &top_cpuset.css;
- }
- parent = cgroup_cs(cont->parent);
- cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1823,49 +1874,107 @@ static struct cgroup_subsys_state *cpuset_create(
return ERR_PTR(-ENOMEM);
}
- cs->flags = 0;
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
- cs->parent = parent;
- number_of_cpusets++;
- return &cs->css ;
+ return &cs->css;
+}
+
+static int cpuset_css_online(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *parent = parent_cs(cs);
+ struct cpuset *tmp_cs;
+ struct cgroup_subsys_state *pos_css;
+
+ if (!parent)
+ return 0;
+
+ mutex_lock(&cpuset_mutex);
+
+ set_bit(CS_ONLINE, &cs->flags);
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
+ cpuset_inc();
+
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
+ goto out_unlock;
+
+ /*
+ * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
+ * set. This flag handling is implemented in cgroup core for
+ * histrical reasons - the flag may be specified during mount.
+ *
+ * Currently, if any sibling cpusets have exclusive cpus or mem, we
+ * refuse to clone the configuration - thereby refusing the task to
+ * be entered, and as a result refusing the sys_unshare() or
+ * clone() which initiated it. If this becomes a problem for some
+ * users who wish to allow that scenario, then this could be
+ * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
+ * (and likewise for mems) to the new cgroup.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_css, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ }
+ rcu_read_unlock();
+
+ mutex_lock(&callback_mutex);
+ cs->mems_allowed = parent->mems_allowed;
+ cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ mutex_unlock(&callback_mutex);
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return 0;
}
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call async_rebuild_sched_domains().
+ * will call rebuild_sched_domains_locked().
*/
-static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = css_cs(css);
+
+ mutex_lock(&cpuset_mutex);
if (is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
- number_of_cpusets--;
+ cpuset_dec();
+ clear_bit(CS_ONLINE, &cs->flags);
+
+ mutex_unlock(&cpuset_mutex);
+}
+
+static void cpuset_css_free(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
-struct cgroup_subsys cpuset_subsys = {
- .name = "cpuset",
- .create = cpuset_create,
- .destroy = cpuset_destroy,
+struct cgroup_subsys cpuset_cgrp_subsys = {
+ .css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
+ .css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
- .populate = cpuset_populate,
- .post_clone = cpuset_post_clone,
- .subsys_id = cpuset_subsys_id,
+ .base_cftypes = files,
.early_init = 1,
};
@@ -1896,234 +2005,230 @@ int __init cpuset_init(void)
if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
BUG();
- number_of_cpusets = 1;
return 0;
}
-/**
- * cpuset_do_move_task - move a given task to another cpuset
- * @tsk: pointer to task_struct the task to move
- * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
- *
- * Called by cgroup_scan_tasks() for each task in a cgroup.
- * Return nonzero to stop the walk through the tasks.
- */
-static void cpuset_do_move_task(struct task_struct *tsk,
- struct cgroup_scanner *scan)
-{
- struct cgroup *new_cgroup = scan->data;
-
- cgroup_attach_task(new_cgroup, tsk);
-}
-
-/**
- * move_member_tasks_to_cpuset - move tasks from one cpuset to another
- * @from: cpuset in which the tasks currently reside
- * @to: cpuset to which the tasks will be moved
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
- *
- * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
- */
-static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
-{
- struct cgroup_scanner scan;
-
- scan.cg = from->css.cgroup;
- scan.test_task = NULL; /* select all tasks in cgroup */
- scan.process_task = cpuset_do_move_task;
- scan.heap = NULL;
- scan.data = to->css.cgroup;
-
- if (cgroup_scan_tasks(&scan))
- printk(KERN_ERR "move_member_tasks_to_cpuset: "
- "cgroup_scan_tasks failed\n");
-}
-
/*
* If CPU and/or memory hotplug handlers, below, unplug any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then move the tasks in the empty
* cpuset to its next-highest non-empty parent.
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
*/
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
struct cpuset *parent;
/*
- * The cgroup's css_sets list is in use if there are tasks
- * in the cpuset; the list is empty if there are none;
- * the cs->css.refcnt seems always 0.
- */
- if (list_empty(&cs->css.cgroup->css_sets))
- return;
-
- /*
* Find its next-highest non-empty parent, (top cpuset
* has online cpus, so can't be empty).
*/
- parent = cs->parent;
+ parent = parent_cs(cs);
while (cpumask_empty(parent->cpus_allowed) ||
nodes_empty(parent->mems_allowed))
- parent = parent->parent;
+ parent = parent_cs(parent);
- move_member_tasks_to_cpuset(cs, parent);
+ if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
+ pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
+ pr_cont_cgroup_name(cs->css.cgroup);
+ pr_cont("\n");
+ }
}
-/*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
- *
- * Called with cgroup_mutex held. We take callback_mutex to modify
- * cpus_allowed and mems_allowed.
- *
- * This walk processes the tree from top to bottom, completing one layer
- * before dropping down to the next. It always processes a node before
- * any of its children.
+/**
+ * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
+ * @cs: cpuset in interest
*
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'. But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
+ * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
+ * all its tasks are moved to the nearest ancestor with both resources.
*/
-static void scan_for_empty_cpusets(struct cpuset *root)
-{
- LIST_HEAD(queue);
- struct cpuset *cp; /* scans cpusets being updated */
- struct cpuset *child; /* scans child cpusets of cp */
- struct cgroup *cont;
- nodemask_t oldmems;
-
- list_add_tail((struct list_head *)&root->stack_list, &queue);
-
- while (!list_empty(&queue)) {
- cp = list_first_entry(&queue, struct cpuset, stack_list);
- list_del(queue.next);
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &queue);
- }
+static void cpuset_hotplug_update_tasks(struct cpuset *cs)
+{
+ static cpumask_t off_cpus;
+ static nodemask_t off_mems;
+ bool is_empty;
+ bool sane = cgroup_sane_behavior(cs->css.cgroup);
- /* Continue past cpusets with all cpus, mems online */
- if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
- nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
- continue;
+retry:
+ wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
- oldmems = cp->mems_allowed;
+ mutex_lock(&cpuset_mutex);
- /* Remove offline cpus and mems from this cpuset. */
- mutex_lock(&callback_mutex);
- cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
- cpu_active_mask);
- nodes_and(cp->mems_allowed, cp->mems_allowed,
- node_states[N_HIGH_MEMORY]);
- mutex_unlock(&callback_mutex);
-
- /* Move tasks from the empty cpuset to a parent */
- if (cpumask_empty(cp->cpus_allowed) ||
- nodes_empty(cp->mems_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else {
- update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, &oldmems, NULL);
- }
+ /*
+ * We have raced with task attaching. We wait until attaching
+ * is finished, so we won't attach a task to an empty cpuset.
+ */
+ if (cs->attach_in_progress) {
+ mutex_unlock(&cpuset_mutex);
+ goto retry;
}
+
+ cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
+ nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
+
+ mutex_lock(&callback_mutex);
+ cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+ mutex_unlock(&callback_mutex);
+
+ /*
+ * If sane_behavior flag is set, we need to update tasks' cpumask
+ * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
+ * call update_tasks_cpumask() if the cpuset becomes empty, as
+ * the tasks in it will be migrated to an ancestor.
+ */
+ if ((sane && cpumask_empty(cs->cpus_allowed)) ||
+ (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
+ update_tasks_cpumask(cs);
+
+ mutex_lock(&callback_mutex);
+ nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+ mutex_unlock(&callback_mutex);
+
+ /*
+ * If sane_behavior flag is set, we need to update tasks' nodemask
+ * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
+ * call update_tasks_nodemask() if the cpuset becomes empty, as
+ * the tasks in it will be migratd to an ancestor.
+ */
+ if ((sane && nodes_empty(cs->mems_allowed)) ||
+ (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
+ update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
+ *
+ * Otherwise move tasks to the nearest ancestor with execution
+ * resources. This is full cgroup operation which will
+ * also call back into cpuset. Should be done outside any lock.
+ */
+ if (!sane && is_empty)
+ remove_tasks_in_empty_cpuset(cs);
}
-/*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period. This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
+/**
+ * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ *
+ * This function is called after either CPU or memory configuration has
+ * changed and updates cpuset accordingly. The top_cpuset is always
+ * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
+ * order to make cpusets transparent (of no affect) on systems that are
+ * actively using CPU hotplug but making no active use of cpusets.
*
- * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_online_map on each CPU hotplug (cpuhp) event.
+ * Non-root cpusets are only affected by offlining. If any CPUs or memory
+ * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
+ * all descendants.
*
- * Called within get_online_cpus(). Needs to call cgroup_lock()
- * before calling generate_sched_domains().
+ * Note that CPU offlining during suspend is ignored. We don't modify
+ * cpusets across suspend/resume cycles at all.
*/
-static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
- unsigned long phase, void *unused_cpu)
+static void cpuset_hotplug_workfn(struct work_struct *work)
{
- struct sched_domain_attr *attr;
- cpumask_var_t *doms;
- int ndoms;
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated, mems_updated;
- switch (phase) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- break;
+ mutex_lock(&cpuset_mutex);
- default:
- return NOTIFY_DONE;
+ /* fetch the available cpus/mems and find out which changed how */
+ cpumask_copy(&new_cpus, cpu_active_mask);
+ new_mems = node_states[N_MEMORY];
+
+ cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
+ mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+
+ /* synchronize cpus_allowed to cpu_active_mask */
+ if (cpus_updated) {
+ mutex_lock(&callback_mutex);
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ mutex_unlock(&callback_mutex);
+ /* we don't mess with cpumasks of tasks in top_cpuset */
}
- cgroup_lock();
- mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- mutex_unlock(&callback_mutex);
- scan_for_empty_cpusets(&top_cpuset);
- ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
+ /* synchronize mems_allowed to N_MEMORY */
+ if (mems_updated) {
+ mutex_lock(&callback_mutex);
+ top_cpuset.mems_allowed = new_mems;
+ mutex_unlock(&callback_mutex);
+ update_tasks_nodemask(&top_cpuset);
+ }
- /* Have scheduler rebuild the domains */
- partition_sched_domains(ndoms, doms, attr);
+ mutex_unlock(&cpuset_mutex);
- return NOTIFY_OK;
+ /* if cpus or mems changed, we need to propagate to descendants */
+ if (cpus_updated || mems_updated) {
+ struct cpuset *cs;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+ if (cs == &top_cpuset || !css_tryget_online(&cs->css))
+ continue;
+ rcu_read_unlock();
+
+ cpuset_hotplug_update_tasks(cs);
+
+ rcu_read_lock();
+ css_put(&cs->css);
+ }
+ rcu_read_unlock();
+ }
+
+ /* rebuild sched domains if cpus_allowed has changed */
+ if (cpus_updated)
+ rebuild_sched_domains();
+}
+
+void cpuset_update_active_cpus(bool cpu_online)
+{
+ /*
+ * We're inside cpu hotplug critical region which usually nests
+ * inside cgroup synchronization. Bounce actual hotplug processing
+ * to a work item to avoid reverse locking order.
+ *
+ * We still need to do partition_sched_domains() synchronously;
+ * otherwise, the scheduler will get confused and put tasks to the
+ * dead CPU. Fall back to the default single domain.
+ * cpuset_hotplug_workfn() will rebuild it as necessary.
+ */
+ partition_sched_domains(1, NULL, NULL);
+ schedule_work(&cpuset_hotplug_work);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
/*
- * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
- * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
+ * Call this routine anytime after node_states[N_MEMORY] changes.
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
*/
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- cgroup_lock();
- switch (action) {
- case MEM_ONLINE:
- case MEM_OFFLINE:
- mutex_lock(&callback_mutex);
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- mutex_unlock(&callback_mutex);
- if (action == MEM_OFFLINE)
- scan_for_empty_cpusets(&top_cpuset);
- break;
- default:
- break;
- }
- cgroup_unlock();
+ schedule_work(&cpuset_hotplug_work);
return NOTIFY_OK;
}
-#endif
+
+static struct notifier_block cpuset_track_online_nodes_nb = {
+ .notifier_call = cpuset_track_online_nodes,
+ .priority = 10, /* ??! */
+};
/**
* cpuset_init_smp - initialize cpus_allowed
*
* Description: Finish top cpuset after cpu, node maps are initialized
- **/
-
+ */
void __init cpuset_init_smp(void)
{
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-
- hotcpu_notifier(cpuset_track_online_cpus, 0);
- hotplug_memory_notifier(cpuset_track_online_nodes, 10);
+ top_cpuset.mems_allowed = node_states[N_MEMORY];
+ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
- cpuset_wq = create_singlethread_workqueue("cpuset");
- BUG_ON(!cpuset_wq);
+ register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
}
/**
@@ -2133,26 +2238,48 @@ void __init cpuset_init_smp(void)
*
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_online_map, even if this means going outside the
+ * subset of cpu_online_mask, even if this means going outside the
* tasks cpuset.
**/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
+ struct cpuset *cpus_cs;
+
mutex_lock(&callback_mutex);
- cpuset_cpus_allowed_locked(tsk, pmask);
+ rcu_read_lock();
+ cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
+ guarantee_online_cpus(cpus_cs, pmask);
+ rcu_read_unlock();
mutex_unlock(&callback_mutex);
}
-/**
- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
- * Must be called with callback_mutex held.
- **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
- task_lock(tsk);
- guarantee_online_cpus(task_cs(tsk), pmask);
- task_unlock(tsk);
+ struct cpuset *cpus_cs;
+
+ rcu_read_lock();
+ cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
+ do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
+ rcu_read_unlock();
+
+ /*
+ * We own tsk->cpus_allowed, nobody can change it under us.
+ *
+ * But we used cs && cs->cpus_allowed lockless and thus can
+ * race with cgroup_attach_task() or update_cpumask() and get
+ * the wrong tsk->cpus_allowed. However, both cases imply the
+ * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+ * which takes task_rq_lock().
+ *
+ * If we are called after it dropped the lock we must see all
+ * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
+ * set any mask even if it is not right from task_cs() pov,
+ * the pending set_cpus_allowed_ptr() will fix things.
+ *
+ * select_fallback_rq() will fix things ups and set cpu_possible_mask
+ * if required.
+ */
}
void cpuset_init_current_mems_allowed(void)
@@ -2166,18 +2293,20 @@ void cpuset_init_current_mems_allowed(void)
*
* Description: Returns the nodemask_t mems_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
+ * subset of node_states[N_MEMORY], even if this means going outside the
* tasks cpuset.
**/
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
+ struct cpuset *mems_cs;
nodemask_t mask;
mutex_lock(&callback_mutex);
- task_lock(tsk);
- guarantee_online_mems(task_cs(tsk), &mask);
- task_unlock(tsk);
+ rcu_read_lock();
+ mems_cs = effective_nodemask_cpuset(task_cs(tsk));
+ guarantee_online_mems(mems_cs, &mask);
+ rcu_read_unlock();
mutex_unlock(&callback_mutex);
return mask;
@@ -2200,10 +2329,10 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
* callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
* (an unusual configuration), then returns the root cpuset.
*/
-static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
+static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
- while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
- cs = cs->parent;
+ while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
+ cs = parent_cs(cs);
return cs;
}
@@ -2270,7 +2399,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
*/
int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
- const struct cpuset *cs; /* current cpuset ancestors */
+ struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
@@ -2293,11 +2422,11 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
/* Not hardwall and node outside mems_allowed: scan up cpusets */
mutex_lock(&callback_mutex);
- task_lock(current);
+ rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
- task_unlock(current);
-
allowed = node_isset(node, cs->mems_allowed);
+ rcu_read_unlock();
+
mutex_unlock(&callback_mutex);
return allowed;
}
@@ -2341,34 +2470,8 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
}
/**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset. Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list. The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
- mutex_lock(&callback_mutex);
-}
-
-/**
- * cpuset_unlock - release lock on cpuset changes
- *
- * Undo the lock taken in a previous cpuset_lock() call.
- */
-
-void cpuset_unlock(void)
-{
- mutex_unlock(&callback_mutex);
-}
-
-/**
- * cpuset_mem_spread_node() - On which node to begin search for a page
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
* tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -2393,16 +2496,35 @@ void cpuset_unlock(void)
* See kmem_cache_alloc_node().
*/
-int cpuset_mem_spread_node(void)
+static int cpuset_spread_node(int *rotor)
{
int node;
- node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+ node = next_node(*rotor, current->mems_allowed);
if (node == MAX_NUMNODES)
node = first_node(current->mems_allowed);
- current->cpuset_mem_spread_rotor = node;
+ *rotor = node;
return node;
}
+
+int cpuset_mem_spread_node(void)
+{
+ if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
+ current->cpuset_mem_spread_rotor =
+ node_random(&current->mems_allowed);
+
+ return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
+}
+
+int cpuset_slab_spread_node(void)
+{
+ if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
+ current->cpuset_slab_spread_rotor =
+ node_random(&current->mems_allowed);
+
+ return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
+}
+
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
/**
@@ -2422,26 +2544,33 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
+#define CPUSET_NODELIST_LEN (256)
+
/**
* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
- * @task: pointer to task_struct of some task.
+ * @tsk: pointer to task_struct of some task.
*
* Description: Prints @task's name, cpuset name, and cached copy of its
- * mems_allowed to the kernel log. Must hold task_lock(task) to allow
- * dereferencing task_cs(task).
+ * mems_allowed to the kernel log.
*/
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
- struct dentry *dentry;
+ /* Statically allocated to prevent using excess stack. */
+ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
+ static DEFINE_SPINLOCK(cpuset_buffer_lock);
+ struct cgroup *cgrp;
- dentry = task_cs(tsk)->css.cgroup->dentry;
spin_lock(&cpuset_buffer_lock);
- snprintf(cpuset_name, CPUSET_NAME_LEN,
- dentry ? (const char *)dentry->d_name.name : "/");
+ rcu_read_lock();
+
+ cgrp = task_cs(tsk)->css.cgroup;
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
- printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
- tsk->comm, cpuset_name, cpuset_nodelist);
+ pr_info("%s cpuset=", tsk->comm);
+ pr_cont_cgroup_name(cgrp);
+ pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
+
+ rcu_read_unlock();
spin_unlock(&cpuset_buffer_lock);
}
@@ -2473,9 +2602,9 @@ int cpuset_memory_pressure_enabled __read_mostly;
void __cpuset_memory_pressure_bump(void)
{
- task_lock(current);
+ rcu_read_lock();
fmeter_markevent(&task_cs(current)->fmeter);
- task_unlock(current);
+ rcu_read_unlock();
}
#ifdef CONFIG_PROC_PID_CPUSET
@@ -2485,19 +2614,19 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cgroup_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
-static int proc_cpuset_show(struct seq_file *m, void *unused_v)
+int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
struct pid *pid;
struct task_struct *tsk;
- char *buf;
+ char *buf, *p;
struct cgroup_subsys_state *css;
int retval;
retval = -ENOMEM;
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
@@ -2507,44 +2636,32 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
if (!tsk)
goto out_free;
- retval = -EINVAL;
- cgroup_lock();
- css = task_subsys_state(tsk, cpuset_subsys_id);
- retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
- if (retval < 0)
- goto out_unlock;
- seq_puts(m, buf);
+ retval = -ENAMETOOLONG;
+ rcu_read_lock();
+ css = task_css(tsk, cpuset_cgrp_id);
+ p = cgroup_path(css->cgroup, buf, PATH_MAX);
+ rcu_read_unlock();
+ if (!p)
+ goto out_put_task;
+ seq_puts(m, p);
seq_putc(m, '\n');
-out_unlock:
- cgroup_unlock();
+ retval = 0;
+out_put_task:
put_task_struct(tsk);
out_free:
kfree(buf);
out:
return retval;
}
-
-static int cpuset_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cpuset_show, pid);
-}
-
-const struct file_operations proc_cpuset_operations = {
- .open = cpuset_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif /* CONFIG_PROC_PID_CPUSET */
/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
- seq_printf(m, "Mems_allowed:\t");
+ seq_puts(m, "Mems_allowed:\t");
seq_nodemask(m, &task->mems_allowed);
- seq_printf(m, "\n");
- seq_printf(m, "Mems_allowed_list:\t");
+ seq_puts(m, "\n");
+ seq_puts(m, "Mems_allowed_list:\t");
seq_nodemask_list(m, &task->mems_allowed);
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
}
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 00000000000..c766ee54c0b
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,45 @@
+#include <linux/kernel.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+
+/*
+ * stores the physical address of elf header of crash image
+ *
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence put
+ * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
+/*
+ * stores the size of elf header of crash image
+ */
+unsigned long long elfcorehdr_size;
+
+/*
+ * elfcorehdr= specifies the location of elf core header stored by the crashed
+ * kernel. This option will be passed by kexec loader to the capture kernel.
+ *
+ * Syntax: elfcorehdr=[size[KMG]@]offset[KMG]
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+ char *end;
+ if (!arg)
+ return -EINVAL;
+ elfcorehdr_addr = memparse(arg, &end);
+ if (*end == '@') {
+ elfcorehdr_size = elfcorehdr_addr;
+ elfcorehdr_addr = memparse(end + 1, &end);
+ }
+ return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
diff --git a/kernel/cred.c b/kernel/cred.c
index 1ed8ca18790..e0573a43c7d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
-/* Task credentials management - see Documentation/credentials.txt
+/* Task credentials management - see Documentation/security/credentials.txt
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -8,24 +8,21 @@
* as published by the Free Software Foundation; either version
* 2 of the Licence, or (at your option) any later version.
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/cred.h>
+#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/key.h>
#include <linux/keyctl.h>
#include <linux/init_task.h>
#include <linux/security.h>
+#include <linux/binfmts.h>
#include <linux/cn_proc.h>
-#include "cred-internals.h"
#if 0
#define kdebug(FMT, ...) \
printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
#else
-static inline __attribute__((format(printf, 1, 2)))
-void no_printk(const char *fmt, ...)
-{
-}
#define kdebug(FMT, ...) \
no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
#endif
@@ -33,17 +30,6 @@ void no_printk(const char *fmt, ...)
static struct kmem_cache *cred_jar;
/*
- * The common credentials for the initial task's thread group
- */
-#ifdef CONFIG_KEYS
-static struct thread_group_cred init_tgcred = {
- .usage = ATOMIC_INIT(2),
- .tgid = 0,
- .lock = SPIN_LOCK_UNLOCKED,
-};
-#endif
-
-/*
* The initial credentials for the initial task
*/
struct cred init_cred = {
@@ -52,16 +38,22 @@ struct cred init_cred = {
.subscribers = ATOMIC_INIT(2),
.magic = CRED_MAGIC,
#endif
+ .uid = GLOBAL_ROOT_UID,
+ .gid = GLOBAL_ROOT_GID,
+ .suid = GLOBAL_ROOT_UID,
+ .sgid = GLOBAL_ROOT_GID,
+ .euid = GLOBAL_ROOT_UID,
+ .egid = GLOBAL_ROOT_GID,
+ .fsuid = GLOBAL_ROOT_UID,
+ .fsgid = GLOBAL_ROOT_GID,
.securebits = SECUREBITS_DEFAULT,
- .cap_inheritable = CAP_INIT_INH_SET,
+ .cap_inheritable = CAP_EMPTY_SET,
.cap_permitted = CAP_FULL_SET,
- .cap_effective = CAP_INIT_EFF_SET,
- .cap_bset = CAP_INIT_BSET,
+ .cap_effective = CAP_FULL_SET,
+ .cap_bset = CAP_FULL_SET,
.user = INIT_USER,
+ .user_ns = &init_user_ns,
.group_info = &init_groups,
-#ifdef CONFIG_KEYS
- .tgcred = &init_tgcred,
-#endif
};
static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -90,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n)
}
/*
- * Dispose of the shared task group credentials
- */
-#ifdef CONFIG_KEYS
-static void release_tgcred_rcu(struct rcu_head *rcu)
-{
- struct thread_group_cred *tgcred =
- container_of(rcu, struct thread_group_cred, rcu);
-
- BUG_ON(atomic_read(&tgcred->usage) != 0);
-
- key_put(tgcred->session_keyring);
- key_put(tgcred->process_keyring);
- kfree(tgcred);
-}
-#endif
-
-/*
- * Release a set of thread group credentials.
- */
-static void release_tgcred(struct cred *cred)
-{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred = cred->tgcred;
-
- if (atomic_dec_and_test(&tgcred->usage))
- call_rcu(&tgcred->rcu, release_tgcred_rcu);
-#endif
-}
-
-/*
* The RCU callback to actually dispose of a set of credentials
*/
static void put_cred_rcu(struct rcu_head *rcu)
@@ -144,12 +106,14 @@ static void put_cred_rcu(struct rcu_head *rcu)
#endif
security_cred_free(cred);
+ key_put(cred->session_keyring);
+ key_put(cred->process_keyring);
key_put(cred->thread_keyring);
key_put(cred->request_key_auth);
- release_tgcred(cred);
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
+ put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred);
}
@@ -200,13 +164,31 @@ void exit_creds(struct task_struct *tsk)
validate_creds(cred);
alter_cred_subscribers(cred, -1);
put_cred(cred);
+}
- cred = (struct cred *) tsk->replacement_session_keyring;
- if (cred) {
- tsk->replacement_session_keyring = NULL;
- validate_creds(cred);
- put_cred(cred);
- }
+/**
+ * get_task_cred - Get another task's objective credentials
+ * @task: The task to query
+ *
+ * Get the objective credentials of a task, pinning them so that they can't go
+ * away. Accessing a task's credentials directly is not permitted.
+ *
+ * The caller must also make sure task doesn't get deleted, either by holding a
+ * ref on task or by holding tasklist_lock to prevent it from being unlinked.
+ */
+const struct cred *get_task_cred(struct task_struct *task)
+{
+ const struct cred *cred;
+
+ rcu_read_lock();
+
+ do {
+ cred = __task_cred((task));
+ BUG_ON(!cred);
+ } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
+
+ rcu_read_unlock();
+ return cred;
}
/*
@@ -221,23 +203,14 @@ struct cred *cred_alloc_blank(void)
if (!new)
return NULL;
-#ifdef CONFIG_KEYS
- new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
- if (!new->tgcred) {
- kmem_cache_free(cred_jar, new);
- return NULL;
- }
- atomic_set(&new->tgcred->usage, 1);
-#endif
-
atomic_set(&new->usage, 1);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ new->magic = CRED_MAGIC;
+#endif
if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
goto error;
-#ifdef CONFIG_DEBUG_CREDENTIALS
- new->magic = CRED_MAGIC;
-#endif
return new;
error:
@@ -280,11 +253,13 @@ struct cred *prepare_creds(void)
set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
+ get_user_ns(new->user_ns);
#ifdef CONFIG_KEYS
+ key_get(new->session_keyring);
+ key_get(new->process_keyring);
key_get(new->thread_keyring);
key_get(new->request_key_auth);
- atomic_inc(&new->tgcred->usage);
#endif
#ifdef CONFIG_SECURITY
@@ -304,103 +279,30 @@ EXPORT_SYMBOL(prepare_creds);
/*
* Prepare credentials for current to perform an execve()
- * - The caller must hold current->cred_guard_mutex
+ * - The caller must hold ->cred_guard_mutex
*/
struct cred *prepare_exec_creds(void)
{
- struct thread_group_cred *tgcred = NULL;
struct cred *new;
-#ifdef CONFIG_KEYS
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred)
- return NULL;
-#endif
-
new = prepare_creds();
- if (!new) {
- kfree(tgcred);
+ if (!new)
return new;
- }
#ifdef CONFIG_KEYS
/* newly exec'd tasks don't get a thread keyring */
key_put(new->thread_keyring);
new->thread_keyring = NULL;
- /* create a new per-thread-group creds for all this set of threads to
- * share */
- memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
-
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
-
/* inherit the session keyring; new process keyring */
- key_get(tgcred->session_keyring);
- tgcred->process_keyring = NULL;
-
- release_tgcred(new);
- new->tgcred = tgcred;
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
#endif
return new;
}
/*
- * prepare new credentials for the usermode helper dispatcher
- */
-struct cred *prepare_usermodehelper_creds(void)
-{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred = NULL;
-#endif
- struct cred *new;
-
-#ifdef CONFIG_KEYS
- tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
- if (!tgcred)
- return NULL;
-#endif
-
- new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
- if (!new)
- return NULL;
-
- kdebug("prepare_usermodehelper_creds() alloc %p", new);
-
- memcpy(new, &init_cred, sizeof(struct cred));
-
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
- get_group_info(new->group_info);
- get_uid(new->user);
-
-#ifdef CONFIG_KEYS
- new->thread_keyring = NULL;
- new->request_key_auth = NULL;
- new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
-
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- new->tgcred = tgcred;
-#endif
-
-#ifdef CONFIG_SECURITY
- new->security = NULL;
-#endif
- if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
- goto error;
- validate_creds(new);
-
- BUG_ON(atomic_read(&new->usage) != 1);
- return new;
-
-error:
- put_cred(new);
- return NULL;
-}
-
-/*
* Copy credentials for the new process created by fork()
*
* We share if we can, but under some circumstances we have to generate a new
@@ -411,14 +313,9 @@ error:
*/
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred;
-#endif
struct cred *new;
int ret;
- mutex_init(&p->cred_guard_mutex);
-
if (
#ifdef CONFIG_KEYS
!p->cred->thread_keyring &&
@@ -455,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
install_thread_keyring_to_cred(new);
}
- /* we share the process and session keyrings between all the threads in
- * a process - this is slightly icky as we violate COW credentials a
- * bit */
+ /* The process keyring is only shared between the threads in a process;
+ * anything outside of those threads doesn't inherit.
+ */
if (!(clone_flags & CLONE_THREAD)) {
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred) {
- ret = -ENOMEM;
- goto error_put;
- }
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- tgcred->process_keyring = NULL;
- tgcred->session_keyring = key_get(new->tgcred->session_keyring);
-
- release_tgcred(new);
- new->tgcred = tgcred;
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
}
#endif
@@ -485,6 +372,31 @@ error_put:
return ret;
}
+static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
+{
+ const struct user_namespace *set_ns = set->user_ns;
+ const struct user_namespace *subset_ns = subset->user_ns;
+
+ /* If the two credentials are in the same user namespace see if
+ * the capabilities of subset are a subset of set.
+ */
+ if (set_ns == subset_ns)
+ return cap_issubset(subset->cap_permitted, set->cap_permitted);
+
+ /* The credentials are in a different user namespaces
+ * therefore one is a subset of the other only if a set is an
+ * ancestor of subset and set->euid is owner of subset or one
+ * of subsets ancestors.
+ */
+ for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
+ if ((set_ns == subset_ns->parent) &&
+ uid_eq(subset_ns->owner, set->euid))
+ return true;
+ }
+
+ return false;
+}
+
/**
* commit_creds - Install new credentials upon the current task
* @new: The credentials to be assigned
@@ -516,16 +428,14 @@ int commit_creds(struct cred *new)
#endif
BUG_ON(atomic_read(&new->usage) < 1);
- security_commit_creds(new, old);
-
get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */
- if (old->euid != new->euid ||
- old->egid != new->egid ||
- old->fsuid != new->fsuid ||
- old->fsgid != new->fsgid ||
- !cap_issubset(new->cap_permitted, old->cap_permitted)) {
+ if (!uid_eq(old->euid, new->euid) ||
+ !gid_eq(old->egid, new->egid) ||
+ !uid_eq(old->fsuid, new->fsuid) ||
+ !gid_eq(old->fsgid, new->fsgid) ||
+ !cred_cap_issubset(old, new)) {
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
@@ -533,16 +443,14 @@ int commit_creds(struct cred *new)
}
/* alter the thread keyring */
- if (new->fsuid != old->fsuid)
+ if (!uid_eq(new->fsuid, old->fsuid))
key_fsuid_changed(task);
- if (new->fsgid != old->fsgid)
+ if (!gid_eq(new->fsgid, old->fsgid))
key_fsgid_changed(task);
/* do it
- * - What if a process setreuid()'s and this brings the
- * new uid over his NPROC rlimit? We can check this now
- * cheaply with the new uid cache, so if it matters
- * we should be checking for it. -DaveM
+ * RLIMIT_NPROC limits on user->processes have already been checked
+ * in set_user().
*/
alter_cred_subscribers(new, 2);
if (new->user != old->user)
@@ -553,19 +461,17 @@ int commit_creds(struct cred *new)
atomic_dec(&old->user->processes);
alter_cred_subscribers(old, -2);
- sched_switch_user(task);
-
/* send notifications */
- if (new->uid != old->uid ||
- new->euid != old->euid ||
- new->suid != old->suid ||
- new->fsuid != old->fsuid)
+ if (!uid_eq(new->uid, old->uid) ||
+ !uid_eq(new->euid, old->euid) ||
+ !uid_eq(new->suid, old->suid) ||
+ !uid_eq(new->fsuid, old->fsuid))
proc_id_connector(task, PROC_EVENT_UID);
- if (new->gid != old->gid ||
- new->egid != old->egid ||
- new->sgid != old->sgid ||
- new->fsgid != old->fsgid)
+ if (!gid_eq(new->gid, old->gid) ||
+ !gid_eq(new->egid, old->egid) ||
+ !gid_eq(new->sgid, old->sgid) ||
+ !gid_eq(new->fsgid, old->fsgid))
proc_id_connector(task, PROC_EVENT_GID);
/* release the old obj and subj refs both */
@@ -696,14 +602,17 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
validate_creds(old);
*new = *old;
+ atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
get_uid(new->user);
+ get_user_ns(new->user_ns);
get_group_info(new->group_info);
#ifdef CONFIG_KEYS
- atomic_inc(&init_tgcred.usage);
- new->tgcred = &init_tgcred;
- new->request_key_auth = NULL;
+ new->session_keyring = NULL;
+ new->process_keyring = NULL;
new->thread_keyring = NULL;
+ new->request_key_auth = NULL;
new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
#endif
@@ -713,8 +622,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
goto error;
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
put_cred(old);
validate_creds(new);
return new;
@@ -786,10 +693,12 @@ bool creds_are_invalid(const struct cred *cred)
{
if (cred->magic != CRED_MAGIC)
return true;
- if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
- return true;
#ifdef CONFIG_SECURITY_SELINUX
- if (selinux_is_enabled()) {
+ /*
+ * cred->security == NULL if security_cred_alloc_blank() or
+ * security_prepare_creds() returned an error.
+ */
+ if (selinux_is_enabled() && cred->security) {
if ((unsigned long) cred->security < PAGE_SIZE)
return true;
if ((*(u32 *)cred->security & 0xffffff00) ==
@@ -818,9 +727,15 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
atomic_read(&cred->usage),
read_cred_subscribers(cred));
printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
- cred->uid, cred->euid, cred->suid, cred->fsuid);
+ from_kuid_munged(&init_user_ns, cred->uid),
+ from_kuid_munged(&init_user_ns, cred->euid),
+ from_kuid_munged(&init_user_ns, cred->suid),
+ from_kuid_munged(&init_user_ns, cred->fsuid));
printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
- cred->gid, cred->egid, cred->sgid, cred->fsgid);
+ from_kgid_munged(&init_user_ns, cred->gid),
+ from_kgid_munged(&init_user_ns, cred->egid),
+ from_kgid_munged(&init_user_ns, cred->sgid),
+ from_kgid_munged(&init_user_ns, cred->fsgid));
#ifdef CONFIG_SECURITY
printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
if ((unsigned long) cred->security >= PAGE_SIZE &&
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
new file mode 100644
index 00000000000..a85edc33998
--- /dev/null
+++ b/kernel/debug/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the linux kernel debugger
+#
+
+obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o
+obj-$(CONFIG_KGDB_KDB) += kdb/
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
new file mode 100644
index 00000000000..1adf62b39b9
--- /dev/null
+++ b/kernel/debug/debug_core.c
@@ -0,0 +1,1067 @@
+/*
+ * Kernel Debug Core
+ *
+ * Maintainer: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (C) 2000-2001 VERITAS Software Corporation.
+ * Copyright (C) 2002-2004 Timesys Corporation
+ * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
+ * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
+ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
+ * Copyright (C) 2005-2009 Wind River Systems, Inc.
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Contributors at various stages not listed above:
+ * Jason Wessel ( jason.wessel@windriver.com )
+ * George Anzinger <george@mvista.com>
+ * Anurekh Saxena (anurekh.saxena@timesys.com)
+ * Lake Stevens Instrument Division (Glenn Engel)
+ * Jim Kingdon, Cygnus Support.
+ *
+ * Original KGDB stub: David Grothe <dave@gcom.com>,
+ * Tigran Aivazian <tigran@sco.com>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+#include <linux/pid_namespace.h>
+#include <linux/clocksource.h>
+#include <linux/serial_core.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/console.h>
+#include <linux/threads.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/sysrq.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/pid.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+#include <linux/rcupdate.h>
+
+#include <asm/cacheflush.h>
+#include <asm/byteorder.h>
+#include <linux/atomic.h>
+
+#include "debug_core.h"
+
+static int kgdb_break_asap;
+
+struct debuggerinfo_struct kgdb_info[NR_CPUS];
+
+/**
+ * kgdb_connected - Is a host GDB connected to us?
+ */
+int kgdb_connected;
+EXPORT_SYMBOL_GPL(kgdb_connected);
+
+/* All the KGDB handlers are installed */
+int kgdb_io_module_registered;
+
+/* Guard for recursive entry */
+static int exception_level;
+
+struct kgdb_io *dbg_io_ops;
+static DEFINE_SPINLOCK(kgdb_registration_lock);
+
+/* Action for the reboot notifiter, a global allow kdb to change it */
+static int kgdbreboot;
+/* kgdb console driver is loaded */
+static int kgdb_con_registered;
+/* determine if kgdb console output should be used */
+static int kgdb_use_con;
+/* Flag for alternate operations for early debugging */
+bool dbg_is_early = true;
+/* Next cpu to become the master debug core */
+int dbg_switch_cpu;
+
+/* Use kdb or gdbserver mode */
+int dbg_kdb_mode = 1;
+
+static int __init opt_kgdb_con(char *str)
+{
+ kgdb_use_con = 1;
+ return 0;
+}
+
+early_param("kgdbcon", opt_kgdb_con);
+
+module_param(kgdb_use_con, int, 0644);
+module_param(kgdbreboot, int, 0644);
+
+/*
+ * Holds information about breakpoints in a kernel. These breakpoints are
+ * added and removed by gdb.
+ */
+static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
+ [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
+};
+
+/*
+ * The CPU# of the active CPU, or -1 if none:
+ */
+atomic_t kgdb_active = ATOMIC_INIT(-1);
+EXPORT_SYMBOL_GPL(kgdb_active);
+static DEFINE_RAW_SPINLOCK(dbg_master_lock);
+static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
+
+/*
+ * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
+ * bootup code (which might not have percpu set up yet):
+ */
+static atomic_t masters_in_kgdb;
+static atomic_t slaves_in_kgdb;
+static atomic_t kgdb_break_tasklet_var;
+atomic_t kgdb_setting_breakpoint;
+
+struct task_struct *kgdb_usethread;
+struct task_struct *kgdb_contthread;
+
+int kgdb_single_step;
+static pid_t kgdb_sstep_pid;
+
+/* to keep track of the CPU which is doing the single stepping*/
+atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
+
+/*
+ * If you are debugging a problem where roundup (the collection of
+ * all other CPUs) is a problem [this should be extremely rare],
+ * then use the nokgdbroundup option to avoid roundup. In that case
+ * the other CPUs might interfere with your debugging context, so
+ * use this with care:
+ */
+static int kgdb_do_roundup = 1;
+
+static int __init opt_nokgdbroundup(char *str)
+{
+ kgdb_do_roundup = 0;
+
+ return 0;
+}
+
+early_param("nokgdbroundup", opt_nokgdbroundup);
+
+/*
+ * Finally, some KGDB code :-)
+ */
+
+/*
+ * Weak aliases for breakpoint management,
+ * can be overriden by architectures when needed:
+ */
+int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+ int err;
+
+ err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+ err = probe_kernel_write((char *)bpt->bpt_addr,
+ arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+ return err;
+}
+
+int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+ return probe_kernel_write((char *)bpt->bpt_addr,
+ (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
+}
+
+int __weak kgdb_validate_break_address(unsigned long addr)
+{
+ struct kgdb_bkpt tmp;
+ int err;
+ /* Validate setting the breakpoint and then removing it. If the
+ * remove fails, the kernel needs to emit a bad message because we
+ * are deep trouble not being able to put things back the way we
+ * found them.
+ */
+ tmp.bpt_addr = addr;
+ err = kgdb_arch_set_breakpoint(&tmp);
+ if (err)
+ return err;
+ err = kgdb_arch_remove_breakpoint(&tmp);
+ if (err)
+ printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
+ "memory destroyed at: %lx", addr);
+ return err;
+}
+
+unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
+{
+ return instruction_pointer(regs);
+}
+
+int __weak kgdb_arch_init(void)
+{
+ return 0;
+}
+
+int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+ return 0;
+}
+
+/*
+ * Some architectures need cache flushes when we set/clear a
+ * breakpoint:
+ */
+static void kgdb_flush_swbreak_addr(unsigned long addr)
+{
+ if (!CACHE_FLUSH_IS_SAFE)
+ return;
+
+ if (current->mm) {
+ int i;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ if (!current->vmacache[i])
+ continue;
+ flush_cache_range(current->vmacache[i],
+ addr, addr + BREAK_INSTR_SIZE);
+ }
+ }
+
+ /* Force flush instruction cache if it was outside the mm */
+ flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
+}
+
+/*
+ * SW breakpoint management:
+ */
+int dbg_activate_sw_breakpoints(void)
+{
+ int error;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_SET)
+ continue;
+
+ error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
+ if (error) {
+ ret = error;
+ printk(KERN_INFO "KGDB: BP install failed: %lx",
+ kgdb_break[i].bpt_addr);
+ continue;
+ }
+
+ kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
+ kgdb_break[i].state = BP_ACTIVE;
+ }
+ return ret;
+}
+
+int dbg_set_sw_break(unsigned long addr)
+{
+ int err = kgdb_validate_break_address(addr);
+ int breakno = -1;
+ int i;
+
+ if (err)
+ return err;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_SET) &&
+ (kgdb_break[i].bpt_addr == addr))
+ return -EEXIST;
+ }
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state == BP_REMOVED &&
+ kgdb_break[i].bpt_addr == addr) {
+ breakno = i;
+ break;
+ }
+ }
+
+ if (breakno == -1) {
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state == BP_UNDEFINED) {
+ breakno = i;
+ break;
+ }
+ }
+ }
+
+ if (breakno == -1)
+ return -E2BIG;
+
+ kgdb_break[breakno].state = BP_SET;
+ kgdb_break[breakno].type = BP_BREAKPOINT;
+ kgdb_break[breakno].bpt_addr = addr;
+
+ return 0;
+}
+
+int dbg_deactivate_sw_breakpoints(void)
+{
+ int error;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_ACTIVE)
+ continue;
+ error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
+ if (error) {
+ printk(KERN_INFO "KGDB: BP remove failed: %lx\n",
+ kgdb_break[i].bpt_addr);
+ ret = error;
+ }
+
+ kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
+ kgdb_break[i].state = BP_SET;
+ }
+ return ret;
+}
+
+int dbg_remove_sw_break(unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_SET) &&
+ (kgdb_break[i].bpt_addr == addr)) {
+ kgdb_break[i].state = BP_REMOVED;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+int kgdb_isremovedbreak(unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_REMOVED) &&
+ (kgdb_break[i].bpt_addr == addr))
+ return 1;
+ }
+ return 0;
+}
+
+int dbg_remove_all_break(void)
+{
+ int error;
+ int i;
+
+ /* Clear memory breakpoints. */
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_ACTIVE)
+ goto setundefined;
+ error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
+ if (error)
+ printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
+ kgdb_break[i].bpt_addr);
+setundefined:
+ kgdb_break[i].state = BP_UNDEFINED;
+ }
+
+ /* Clear hardware breakpoints. */
+ if (arch_kgdb_ops.remove_all_hw_break)
+ arch_kgdb_ops.remove_all_hw_break();
+
+ return 0;
+}
+
+/*
+ * Return true if there is a valid kgdb I/O module. Also if no
+ * debugger is attached a message can be printed to the console about
+ * waiting for the debugger to attach.
+ *
+ * The print_wait argument is only to be true when called from inside
+ * the core kgdb_handle_exception, because it will wait for the
+ * debugger to attach.
+ */
+static int kgdb_io_ready(int print_wait)
+{
+ if (!dbg_io_ops)
+ return 0;
+ if (kgdb_connected)
+ return 1;
+ if (atomic_read(&kgdb_setting_breakpoint))
+ return 1;
+ if (print_wait) {
+#ifdef CONFIG_KGDB_KDB
+ if (!dbg_kdb_mode)
+ printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
+#else
+ printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
+#endif
+ }
+ return 1;
+}
+
+static int kgdb_reenter_check(struct kgdb_state *ks)
+{
+ unsigned long addr;
+
+ if (atomic_read(&kgdb_active) != raw_smp_processor_id())
+ return 0;
+
+ /* Panic on recursive debugger calls: */
+ exception_level++;
+ addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
+ dbg_deactivate_sw_breakpoints();
+
+ /*
+ * If the break point removed ok at the place exception
+ * occurred, try to recover and print a warning to the end
+ * user because the user planted a breakpoint in a place that
+ * KGDB needs in order to function.
+ */
+ if (dbg_remove_sw_break(addr) == 0) {
+ exception_level = 0;
+ kgdb_skipexception(ks->ex_vector, ks->linux_regs);
+ dbg_activate_sw_breakpoints();
+ printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
+ addr);
+ WARN_ON_ONCE(1);
+
+ return 1;
+ }
+ dbg_remove_all_break();
+ kgdb_skipexception(ks->ex_vector, ks->linux_regs);
+
+ if (exception_level > 1) {
+ dump_stack();
+ panic("Recursive entry to debugger");
+ }
+
+ printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
+#ifdef CONFIG_KGDB_KDB
+ /* Allow kdb to debug itself one level */
+ return 0;
+#endif
+ dump_stack();
+ panic("Recursive entry to debugger");
+
+ return 1;
+}
+
+static void dbg_touch_watchdogs(void)
+{
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+}
+
+static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
+ int exception_state)
+{
+ unsigned long flags;
+ int sstep_tries = 100;
+ int error;
+ int cpu;
+ int trace_on = 0;
+ int online_cpus = num_online_cpus();
+
+ kgdb_info[ks->cpu].enter_kgdb++;
+ kgdb_info[ks->cpu].exception_state |= exception_state;
+
+ if (exception_state == DCPU_WANT_MASTER)
+ atomic_inc(&masters_in_kgdb);
+ else
+ atomic_inc(&slaves_in_kgdb);
+
+ if (arch_kgdb_ops.disable_hw_break)
+ arch_kgdb_ops.disable_hw_break(regs);
+
+acquirelock:
+ /*
+ * Interrupts will be restored by the 'trap return' code, except when
+ * single stepping.
+ */
+ local_irq_save(flags);
+
+ cpu = ks->cpu;
+ kgdb_info[cpu].debuggerinfo = regs;
+ kgdb_info[cpu].task = current;
+ kgdb_info[cpu].ret_state = 0;
+ kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
+
+ /* Make sure the above info reaches the primary CPU */
+ smp_mb();
+
+ if (exception_level == 1) {
+ if (raw_spin_trylock(&dbg_master_lock))
+ atomic_xchg(&kgdb_active, cpu);
+ goto cpu_master_loop;
+ }
+
+ /*
+ * CPU will loop if it is a slave or request to become a kgdb
+ * master cpu and acquire the kgdb_active lock:
+ */
+ while (1) {
+cpu_loop:
+ if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) {
+ kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
+ goto cpu_master_loop;
+ } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
+ if (raw_spin_trylock(&dbg_master_lock)) {
+ atomic_xchg(&kgdb_active, cpu);
+ break;
+ }
+ } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
+ if (!raw_spin_is_locked(&dbg_slave_lock))
+ goto return_normal;
+ } else {
+return_normal:
+ /* Return to normal operation by executing any
+ * hw breakpoint fixup.
+ */
+ if (arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ if (trace_on)
+ tracing_on();
+ kgdb_info[cpu].exception_state &=
+ ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
+ kgdb_info[cpu].enter_kgdb--;
+ smp_mb__before_atomic();
+ atomic_dec(&slaves_in_kgdb);
+ dbg_touch_watchdogs();
+ local_irq_restore(flags);
+ return 0;
+ }
+ cpu_relax();
+ }
+
+ /*
+ * For single stepping, try to only enter on the processor
+ * that was single stepping. To guard against a deadlock, the
+ * kernel will only try for the value of sstep_tries before
+ * giving up and continuing on.
+ */
+ if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
+ (kgdb_info[cpu].task &&
+ kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
+ atomic_set(&kgdb_active, -1);
+ raw_spin_unlock(&dbg_master_lock);
+ dbg_touch_watchdogs();
+ local_irq_restore(flags);
+
+ goto acquirelock;
+ }
+
+ if (!kgdb_io_ready(1)) {
+ kgdb_info[cpu].ret_state = 1;
+ goto kgdb_restore; /* No I/O connection, resume the system */
+ }
+
+ /*
+ * Don't enter if we have hit a removed breakpoint.
+ */
+ if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
+ goto kgdb_restore;
+
+ /* Call the I/O driver's pre_exception routine */
+ if (dbg_io_ops->pre_exception)
+ dbg_io_ops->pre_exception();
+
+ /*
+ * Get the passive CPU lock which will hold all the non-primary
+ * CPU in a spin state while the debugger is active
+ */
+ if (!kgdb_single_step)
+ raw_spin_lock(&dbg_slave_lock);
+
+#ifdef CONFIG_SMP
+ /* If send_ready set, slaves are already waiting */
+ if (ks->send_ready)
+ atomic_set(ks->send_ready, 1);
+
+ /* Signal the other CPUs to enter kgdb_wait() */
+ else if ((!kgdb_single_step) && kgdb_do_roundup)
+ kgdb_roundup_cpus(flags);
+#endif
+
+ /*
+ * Wait for the other CPUs to be notified and be waiting for us:
+ */
+ while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
+ atomic_read(&slaves_in_kgdb)) != online_cpus)
+ cpu_relax();
+
+ /*
+ * At this point the primary processor is completely
+ * in the debugger and all secondary CPUs are quiescent
+ */
+ dbg_deactivate_sw_breakpoints();
+ kgdb_single_step = 0;
+ kgdb_contthread = current;
+ exception_level = 0;
+ trace_on = tracing_is_on();
+ if (trace_on)
+ tracing_off();
+
+ while (1) {
+cpu_master_loop:
+ if (dbg_kdb_mode) {
+ kgdb_connected = 1;
+ error = kdb_stub(ks);
+ if (error == -1)
+ continue;
+ kgdb_connected = 0;
+ } else {
+ error = gdb_serial_stub(ks);
+ }
+
+ if (error == DBG_PASS_EVENT) {
+ dbg_kdb_mode = !dbg_kdb_mode;
+ } else if (error == DBG_SWITCH_CPU_EVENT) {
+ kgdb_info[dbg_switch_cpu].exception_state |=
+ DCPU_NEXT_MASTER;
+ goto cpu_loop;
+ } else {
+ kgdb_info[cpu].ret_state = error;
+ break;
+ }
+ }
+
+ /* Call the I/O driver's post_exception routine */
+ if (dbg_io_ops->post_exception)
+ dbg_io_ops->post_exception();
+
+ if (!kgdb_single_step) {
+ raw_spin_unlock(&dbg_slave_lock);
+ /* Wait till all the CPUs have quit from the debugger. */
+ while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb))
+ cpu_relax();
+ }
+
+kgdb_restore:
+ if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
+ int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
+ if (kgdb_info[sstep_cpu].task)
+ kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
+ else
+ kgdb_sstep_pid = 0;
+ }
+ if (arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ if (trace_on)
+ tracing_on();
+
+ kgdb_info[cpu].exception_state &=
+ ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
+ kgdb_info[cpu].enter_kgdb--;
+ smp_mb__before_atomic();
+ atomic_dec(&masters_in_kgdb);
+ /* Free kgdb_active */
+ atomic_set(&kgdb_active, -1);
+ raw_spin_unlock(&dbg_master_lock);
+ dbg_touch_watchdogs();
+ local_irq_restore(flags);
+
+ return kgdb_info[cpu].ret_state;
+}
+
+/*
+ * kgdb_handle_exception() - main entry point from a kernel exception
+ *
+ * Locking hierarchy:
+ * interface locks, if any (begin_session)
+ * kgdb lock (kgdb_active)
+ */
+int
+kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
+{
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+ int ret = 0;
+
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(0);
+
+ memset(ks, 0, sizeof(struct kgdb_state));
+ ks->cpu = raw_smp_processor_id();
+ ks->ex_vector = evector;
+ ks->signo = signo;
+ ks->err_code = ecode;
+ ks->linux_regs = regs;
+
+ if (kgdb_reenter_check(ks))
+ goto out; /* Ouch, double exception ! */
+ if (kgdb_info[ks->cpu].enter_kgdb != 0)
+ goto out;
+
+ ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+out:
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(1);
+ return ret;
+}
+
+/*
+ * GDB places a breakpoint at this function to know dynamically
+ * loaded objects. It's not defined static so that only one instance with this
+ * name exists in the kernel.
+ */
+
+static int module_event(struct notifier_block *self, unsigned long val,
+ void *data)
+{
+ return 0;
+}
+
+static struct notifier_block dbg_module_load_nb = {
+ .notifier_call = module_event,
+};
+
+int kgdb_nmicallback(int cpu, void *regs)
+{
+#ifdef CONFIG_SMP
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+
+ memset(ks, 0, sizeof(struct kgdb_state));
+ ks->cpu = cpu;
+ ks->linux_regs = regs;
+
+ if (kgdb_info[ks->cpu].enter_kgdb == 0 &&
+ raw_spin_is_locked(&dbg_master_lock)) {
+ kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE);
+ return 0;
+ }
+#endif
+ return 1;
+}
+
+int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code,
+ atomic_t *send_ready)
+{
+#ifdef CONFIG_SMP
+ if (!kgdb_io_ready(0) || !send_ready)
+ return 1;
+
+ if (kgdb_info[cpu].enter_kgdb == 0) {
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+
+ memset(ks, 0, sizeof(struct kgdb_state));
+ ks->cpu = cpu;
+ ks->ex_vector = trapnr;
+ ks->signo = SIGTRAP;
+ ks->err_code = err_code;
+ ks->linux_regs = regs;
+ ks->send_ready = send_ready;
+ kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+ return 0;
+ }
+#endif
+ return 1;
+}
+
+static void kgdb_console_write(struct console *co, const char *s,
+ unsigned count)
+{
+ unsigned long flags;
+
+ /* If we're debugging, or KGDB has not connected, don't try
+ * and print. */
+ if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode)
+ return;
+
+ local_irq_save(flags);
+ gdbstub_msg_write(s, count);
+ local_irq_restore(flags);
+}
+
+static struct console kgdbcons = {
+ .name = "kgdb",
+ .write = kgdb_console_write,
+ .flags = CON_PRINTBUFFER | CON_ENABLED,
+ .index = -1,
+};
+
+#ifdef CONFIG_MAGIC_SYSRQ
+static void sysrq_handle_dbg(int key)
+{
+ if (!dbg_io_ops) {
+ printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
+ return;
+ }
+ if (!kgdb_connected) {
+#ifdef CONFIG_KGDB_KDB
+ if (!dbg_kdb_mode)
+ printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
+#else
+ printk(KERN_CRIT "Entering KGDB\n");
+#endif
+ }
+
+ kgdb_breakpoint();
+}
+
+static struct sysrq_key_op sysrq_dbg_op = {
+ .handler = sysrq_handle_dbg,
+ .help_msg = "debug(g)",
+ .action_msg = "DEBUG",
+};
+#endif
+
+static int kgdb_panic_event(struct notifier_block *self,
+ unsigned long val,
+ void *data)
+{
+ if (dbg_kdb_mode)
+ kdb_printf("PANIC: %s\n", (char *)data);
+ kgdb_breakpoint();
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block kgdb_panic_event_nb = {
+ .notifier_call = kgdb_panic_event,
+ .priority = INT_MAX,
+};
+
+void __weak kgdb_arch_late(void)
+{
+}
+
+void __init dbg_late_init(void)
+{
+ dbg_is_early = false;
+ if (kgdb_io_module_registered)
+ kgdb_arch_late();
+ kdb_init(KDB_INIT_FULL);
+}
+
+static int
+dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
+{
+ /*
+ * Take the following action on reboot notify depending on value:
+ * 1 == Enter debugger
+ * 0 == [the default] detatch debug client
+ * -1 == Do nothing... and use this until the board resets
+ */
+ switch (kgdbreboot) {
+ case 1:
+ kgdb_breakpoint();
+ case -1:
+ goto done;
+ }
+ if (!dbg_kdb_mode)
+ gdbstub_exit(code);
+done:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block dbg_reboot_notifier = {
+ .notifier_call = dbg_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX,
+};
+
+static void kgdb_register_callbacks(void)
+{
+ if (!kgdb_io_module_registered) {
+ kgdb_io_module_registered = 1;
+ kgdb_arch_init();
+ if (!dbg_is_early)
+ kgdb_arch_late();
+ register_module_notifier(&dbg_module_load_nb);
+ register_reboot_notifier(&dbg_reboot_notifier);
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &kgdb_panic_event_nb);
+#ifdef CONFIG_MAGIC_SYSRQ
+ register_sysrq_key('g', &sysrq_dbg_op);
+#endif
+ if (kgdb_use_con && !kgdb_con_registered) {
+ register_console(&kgdbcons);
+ kgdb_con_registered = 1;
+ }
+ }
+}
+
+static void kgdb_unregister_callbacks(void)
+{
+ /*
+ * When this routine is called KGDB should unregister from the
+ * panic handler and clean up, making sure it is not handling any
+ * break exceptions at the time.
+ */
+ if (kgdb_io_module_registered) {
+ kgdb_io_module_registered = 0;
+ unregister_reboot_notifier(&dbg_reboot_notifier);
+ unregister_module_notifier(&dbg_module_load_nb);
+ atomic_notifier_chain_unregister(&panic_notifier_list,
+ &kgdb_panic_event_nb);
+ kgdb_arch_exit();
+#ifdef CONFIG_MAGIC_SYSRQ
+ unregister_sysrq_key('g', &sysrq_dbg_op);
+#endif
+ if (kgdb_con_registered) {
+ unregister_console(&kgdbcons);
+ kgdb_con_registered = 0;
+ }
+ }
+}
+
+/*
+ * There are times a tasklet needs to be used vs a compiled in
+ * break point so as to cause an exception outside a kgdb I/O module,
+ * such as is the case with kgdboe, where calling a breakpoint in the
+ * I/O driver itself would be fatal.
+ */
+static void kgdb_tasklet_bpt(unsigned long ing)
+{
+ kgdb_breakpoint();
+ atomic_set(&kgdb_break_tasklet_var, 0);
+}
+
+static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
+
+void kgdb_schedule_breakpoint(void)
+{
+ if (atomic_read(&kgdb_break_tasklet_var) ||
+ atomic_read(&kgdb_active) != -1 ||
+ atomic_read(&kgdb_setting_breakpoint))
+ return;
+ atomic_inc(&kgdb_break_tasklet_var);
+ tasklet_schedule(&kgdb_tasklet_breakpoint);
+}
+EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
+
+static void kgdb_initial_breakpoint(void)
+{
+ kgdb_break_asap = 0;
+
+ printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
+ kgdb_breakpoint();
+}
+
+/**
+ * kgdb_register_io_module - register KGDB IO module
+ * @new_dbg_io_ops: the io ops vector
+ *
+ * Register it with the KGDB core.
+ */
+int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
+{
+ int err;
+
+ spin_lock(&kgdb_registration_lock);
+
+ if (dbg_io_ops) {
+ spin_unlock(&kgdb_registration_lock);
+
+ printk(KERN_ERR "kgdb: Another I/O driver is already "
+ "registered with KGDB.\n");
+ return -EBUSY;
+ }
+
+ if (new_dbg_io_ops->init) {
+ err = new_dbg_io_ops->init();
+ if (err) {
+ spin_unlock(&kgdb_registration_lock);
+ return err;
+ }
+ }
+
+ dbg_io_ops = new_dbg_io_ops;
+
+ spin_unlock(&kgdb_registration_lock);
+
+ printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
+ new_dbg_io_ops->name);
+
+ /* Arm KGDB now. */
+ kgdb_register_callbacks();
+
+ if (kgdb_break_asap)
+ kgdb_initial_breakpoint();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kgdb_register_io_module);
+
+/**
+ * kkgdb_unregister_io_module - unregister KGDB IO module
+ * @old_dbg_io_ops: the io ops vector
+ *
+ * Unregister it with the KGDB core.
+ */
+void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
+{
+ BUG_ON(kgdb_connected);
+
+ /*
+ * KGDB is no longer able to communicate out, so
+ * unregister our callbacks and reset state.
+ */
+ kgdb_unregister_callbacks();
+
+ spin_lock(&kgdb_registration_lock);
+
+ WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops);
+ dbg_io_ops = NULL;
+
+ spin_unlock(&kgdb_registration_lock);
+
+ printk(KERN_INFO
+ "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
+ old_dbg_io_ops->name);
+}
+EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
+
+int dbg_io_get_char(void)
+{
+ int ret = dbg_io_ops->read_char();
+ if (ret == NO_POLL_CHAR)
+ return -1;
+ if (!dbg_kdb_mode)
+ return ret;
+ if (ret == 127)
+ return 8;
+ return ret;
+}
+
+/**
+ * kgdb_breakpoint - generate breakpoint exception
+ *
+ * This function will generate a breakpoint exception. It is used at the
+ * beginning of a program to sync up with a debugger and can be used
+ * otherwise as a quick means to stop program execution and "break" into
+ * the debugger.
+ */
+noinline void kgdb_breakpoint(void)
+{
+ atomic_inc(&kgdb_setting_breakpoint);
+ wmb(); /* Sync point before breakpoint */
+ arch_kgdb_breakpoint();
+ wmb(); /* Sync point after breakpoint */
+ atomic_dec(&kgdb_setting_breakpoint);
+}
+EXPORT_SYMBOL_GPL(kgdb_breakpoint);
+
+static int __init opt_kgdb_wait(char *str)
+{
+ kgdb_break_asap = 1;
+
+ kdb_init(KDB_INIT_EARLY);
+ if (kgdb_io_module_registered)
+ kgdb_initial_breakpoint();
+
+ return 0;
+}
+
+early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
new file mode 100644
index 00000000000..127d9bc49fb
--- /dev/null
+++ b/kernel/debug/debug_core.h
@@ -0,0 +1,85 @@
+/*
+ * Created by: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef _DEBUG_CORE_H_
+#define _DEBUG_CORE_H_
+/*
+ * These are the private implementation headers between the kernel
+ * debugger core and the debugger front end code.
+ */
+
+/* kernel debug core data structures */
+struct kgdb_state {
+ int ex_vector;
+ int signo;
+ int err_code;
+ int cpu;
+ int pass_exception;
+ unsigned long thr_query;
+ unsigned long threadid;
+ long kgdb_usethreadid;
+ struct pt_regs *linux_regs;
+ atomic_t *send_ready;
+};
+
+/* Exception state values */
+#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
+#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
+#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
+#define DCPU_SSTEP 0x8 /* CPU is single stepping */
+
+struct debuggerinfo_struct {
+ void *debuggerinfo;
+ struct task_struct *task;
+ int exception_state;
+ int ret_state;
+ int irq_depth;
+ int enter_kgdb;
+};
+
+extern struct debuggerinfo_struct kgdb_info[];
+
+/* kernel debug core break point routines */
+extern int dbg_remove_all_break(void);
+extern int dbg_set_sw_break(unsigned long addr);
+extern int dbg_remove_sw_break(unsigned long addr);
+extern int dbg_activate_sw_breakpoints(void);
+extern int dbg_deactivate_sw_breakpoints(void);
+
+/* polled character access to i/o module */
+extern int dbg_io_get_char(void);
+
+/* stub return value for switching between the gdbstub and kdb */
+#define DBG_PASS_EVENT -12345
+/* Switch from one cpu to another */
+#define DBG_SWITCH_CPU_EVENT -123456
+extern int dbg_switch_cpu;
+
+/* gdbstub interface functions */
+extern int gdb_serial_stub(struct kgdb_state *ks);
+extern void gdbstub_msg_write(const char *s, int len);
+
+/* gdbstub functions used for kdb <-> gdbstub transition */
+extern int gdbstub_state(struct kgdb_state *ks, char *cmd);
+extern int dbg_kdb_mode;
+
+#ifdef CONFIG_KGDB_KDB
+extern int kdb_stub(struct kgdb_state *ks);
+extern int kdb_parse(const char *cmdstr);
+extern int kdb_common_init_state(struct kgdb_state *ks);
+extern int kdb_common_deinit_state(void);
+#else /* ! CONFIG_KGDB_KDB */
+static inline int kdb_stub(struct kgdb_state *ks)
+{
+ return DBG_PASS_EVENT;
+}
+#endif /* CONFIG_KGDB_KDB */
+
+#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
new file mode 100644
index 00000000000..19d9a578c75
--- /dev/null
+++ b/kernel/debug/gdbstub.c
@@ -0,0 +1,1145 @@
+/*
+ * Kernel Debug Core
+ *
+ * Maintainer: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (C) 2000-2001 VERITAS Software Corporation.
+ * Copyright (C) 2002-2004 Timesys Corporation
+ * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
+ * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
+ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
+ * Copyright (C) 2005-2009 Wind River Systems, Inc.
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Contributors at various stages not listed above:
+ * Jason Wessel ( jason.wessel@windriver.com )
+ * George Anzinger <george@mvista.com>
+ * Anurekh Saxena (anurekh.saxena@timesys.com)
+ * Lake Stevens Instrument Division (Glenn Engel)
+ * Jim Kingdon, Cygnus Support.
+ *
+ * Original KGDB stub: David Grothe <dave@gcom.com>,
+ * Tigran Aivazian <tigran@sco.com>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/serial_core.h>
+#include <linux/reboot.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/unaligned.h>
+#include "debug_core.h"
+
+#define KGDB_MAX_THREAD_QUERY 17
+
+/* Our I/O buffers. */
+static char remcom_in_buffer[BUFMAX];
+static char remcom_out_buffer[BUFMAX];
+static int gdbstub_use_prev_in_buf;
+static int gdbstub_prev_in_buf_pos;
+
+/* Storage for the registers, in GDB format. */
+static unsigned long gdb_regs[(NUMREGBYTES +
+ sizeof(unsigned long) - 1) /
+ sizeof(unsigned long)];
+
+/*
+ * GDB remote protocol parser:
+ */
+
+#ifdef CONFIG_KGDB_KDB
+static int gdbstub_read_wait(void)
+{
+ int ret = -1;
+ int i;
+
+ if (unlikely(gdbstub_use_prev_in_buf)) {
+ if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf)
+ return remcom_in_buffer[gdbstub_prev_in_buf_pos++];
+ else
+ gdbstub_use_prev_in_buf = 0;
+ }
+
+ /* poll any additional I/O interfaces that are defined */
+ while (ret < 0)
+ for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
+ ret = kdb_poll_funcs[i]();
+ if (ret > 0)
+ break;
+ }
+ return ret;
+}
+#else
+static int gdbstub_read_wait(void)
+{
+ int ret = dbg_io_ops->read_char();
+ while (ret == NO_POLL_CHAR)
+ ret = dbg_io_ops->read_char();
+ return ret;
+}
+#endif
+/* scan for the sequence $<data>#<checksum> */
+static void get_packet(char *buffer)
+{
+ unsigned char checksum;
+ unsigned char xmitcsum;
+ int count;
+ char ch;
+
+ do {
+ /*
+ * Spin and wait around for the start character, ignore all
+ * other characters:
+ */
+ while ((ch = (gdbstub_read_wait())) != '$')
+ /* nothing */;
+
+ kgdb_connected = 1;
+ checksum = 0;
+ xmitcsum = -1;
+
+ count = 0;
+
+ /*
+ * now, read until a # or end of buffer is found:
+ */
+ while (count < (BUFMAX - 1)) {
+ ch = gdbstub_read_wait();
+ if (ch == '#')
+ break;
+ checksum = checksum + ch;
+ buffer[count] = ch;
+ count = count + 1;
+ }
+
+ if (ch == '#') {
+ xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
+ xmitcsum += hex_to_bin(gdbstub_read_wait());
+
+ if (checksum != xmitcsum)
+ /* failed checksum */
+ dbg_io_ops->write_char('-');
+ else
+ /* successful transfer */
+ dbg_io_ops->write_char('+');
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+ }
+ buffer[count] = 0;
+ } while (checksum != xmitcsum);
+}
+
+/*
+ * Send the packet in buffer.
+ * Check for gdb connection if asked for.
+ */
+static void put_packet(char *buffer)
+{
+ unsigned char checksum;
+ int count;
+ char ch;
+
+ /*
+ * $<packet info>#<checksum>.
+ */
+ while (1) {
+ dbg_io_ops->write_char('$');
+ checksum = 0;
+ count = 0;
+
+ while ((ch = buffer[count])) {
+ dbg_io_ops->write_char(ch);
+ checksum += ch;
+ count++;
+ }
+
+ dbg_io_ops->write_char('#');
+ dbg_io_ops->write_char(hex_asc_hi(checksum));
+ dbg_io_ops->write_char(hex_asc_lo(checksum));
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+
+ /* Now see what we get in reply. */
+ ch = gdbstub_read_wait();
+
+ if (ch == 3)
+ ch = gdbstub_read_wait();
+
+ /* If we get an ACK, we are done. */
+ if (ch == '+')
+ return;
+
+ /*
+ * If we get the start of another packet, this means
+ * that GDB is attempting to reconnect. We will NAK
+ * the packet being sent, and stop trying to send this
+ * packet.
+ */
+ if (ch == '$') {
+ dbg_io_ops->write_char('-');
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+ return;
+ }
+ }
+}
+
+static char gdbmsgbuf[BUFMAX + 1];
+
+void gdbstub_msg_write(const char *s, int len)
+{
+ char *bufptr;
+ int wcount;
+ int i;
+
+ if (len == 0)
+ len = strlen(s);
+
+ /* 'O'utput */
+ gdbmsgbuf[0] = 'O';
+
+ /* Fill and send buffers... */
+ while (len > 0) {
+ bufptr = gdbmsgbuf + 1;
+
+ /* Calculate how many this time */
+ if ((len << 1) > (BUFMAX - 2))
+ wcount = (BUFMAX - 2) >> 1;
+ else
+ wcount = len;
+
+ /* Pack in hex chars */
+ for (i = 0; i < wcount; i++)
+ bufptr = hex_byte_pack(bufptr, s[i]);
+ *bufptr = '\0';
+
+ /* Move up */
+ s += wcount;
+ len -= wcount;
+
+ /* Write packet */
+ put_packet(gdbmsgbuf);
+ }
+}
+
+/*
+ * Convert the memory pointed to by mem into hex, placing result in
+ * buf. Return a pointer to the last char put in buf (null). May
+ * return an error.
+ */
+char *kgdb_mem2hex(char *mem, char *buf, int count)
+{
+ char *tmp;
+ int err;
+
+ /*
+ * We use the upper half of buf as an intermediate buffer for the
+ * raw memory copy. Hex conversion will work against this one.
+ */
+ tmp = buf + count;
+
+ err = probe_kernel_read(tmp, mem, count);
+ if (err)
+ return NULL;
+ while (count > 0) {
+ buf = hex_byte_pack(buf, *tmp);
+ tmp++;
+ count--;
+ }
+ *buf = 0;
+
+ return buf;
+}
+
+/*
+ * Convert the hex array pointed to by buf into binary to be placed in
+ * mem. Return a pointer to the character AFTER the last byte
+ * written. May return an error.
+ */
+int kgdb_hex2mem(char *buf, char *mem, int count)
+{
+ char *tmp_raw;
+ char *tmp_hex;
+
+ /*
+ * We use the upper half of buf as an intermediate buffer for the
+ * raw memory that is converted from hex.
+ */
+ tmp_raw = buf + count * 2;
+
+ tmp_hex = tmp_raw - 1;
+ while (tmp_hex >= buf) {
+ tmp_raw--;
+ *tmp_raw = hex_to_bin(*tmp_hex--);
+ *tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
+ }
+
+ return probe_kernel_write(mem, tmp_raw, count);
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+int kgdb_hex2long(char **ptr, unsigned long *long_val)
+{
+ int hex_val;
+ int num = 0;
+ int negate = 0;
+
+ *long_val = 0;
+
+ if (**ptr == '-') {
+ negate = 1;
+ (*ptr)++;
+ }
+ while (**ptr) {
+ hex_val = hex_to_bin(**ptr);
+ if (hex_val < 0)
+ break;
+
+ *long_val = (*long_val << 4) | hex_val;
+ num++;
+ (*ptr)++;
+ }
+
+ if (negate)
+ *long_val = -*long_val;
+
+ return num;
+}
+
+/*
+ * Copy the binary array pointed to by buf into mem. Fix $, #, and
+ * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
+ * The input buf is overwitten with the result to write to mem.
+ */
+static int kgdb_ebin2mem(char *buf, char *mem, int count)
+{
+ int size = 0;
+ char *c = buf;
+
+ while (count-- > 0) {
+ c[size] = *buf++;
+ if (c[size] == 0x7d)
+ c[size] = *buf++ ^ 0x20;
+ size++;
+ }
+
+ return probe_kernel_write(mem, c, size);
+}
+
+#if DBG_MAX_REG_NUM > 0
+void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ int i;
+ int idx = 0;
+ char *ptr = (char *)gdb_regs;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ dbg_get_reg(i, ptr + idx, regs);
+ idx += dbg_reg_def[i].size;
+ }
+}
+
+void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ int i;
+ int idx = 0;
+ char *ptr = (char *)gdb_regs;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ dbg_set_reg(i, ptr + idx, regs);
+ idx += dbg_reg_def[i].size;
+ }
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
+/* Write memory due to an 'M' or 'X' packet. */
+static int write_mem_msg(int binary)
+{
+ char *ptr = &remcom_in_buffer[1];
+ unsigned long addr;
+ unsigned long length;
+ int err;
+
+ if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
+ kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
+ if (binary)
+ err = kgdb_ebin2mem(ptr, (char *)addr, length);
+ else
+ err = kgdb_hex2mem(ptr, (char *)addr, length);
+ if (err)
+ return err;
+ if (CACHE_FLUSH_IS_SAFE)
+ flush_icache_range(addr, addr + length);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static void error_packet(char *pkt, int error)
+{
+ error = -error;
+ pkt[0] = 'E';
+ pkt[1] = hex_asc[(error / 10)];
+ pkt[2] = hex_asc[(error % 10)];
+ pkt[3] = '\0';
+}
+
+/*
+ * Thread ID accessors. We represent a flat TID space to GDB, where
+ * the per CPU idle threads (which under Linux all have PID 0) are
+ * remapped to negative TIDs.
+ */
+
+#define BUF_THREAD_ID_SIZE 8
+
+static char *pack_threadid(char *pkt, unsigned char *id)
+{
+ unsigned char *limit;
+ int lzero = 1;
+
+ limit = id + (BUF_THREAD_ID_SIZE / 2);
+ while (id < limit) {
+ if (!lzero || *id != 0) {
+ pkt = hex_byte_pack(pkt, *id);
+ lzero = 0;
+ }
+ id++;
+ }
+
+ if (lzero)
+ pkt = hex_byte_pack(pkt, 0);
+
+ return pkt;
+}
+
+static void int_to_threadref(unsigned char *id, int value)
+{
+ put_unaligned_be32(value, id);
+}
+
+static struct task_struct *getthread(struct pt_regs *regs, int tid)
+{
+ /*
+ * Non-positive TIDs are remapped to the cpu shadow information
+ */
+ if (tid == 0 || tid == -1)
+ tid = -atomic_read(&kgdb_active) - 2;
+ if (tid < -1 && tid > -NR_CPUS - 2) {
+ if (kgdb_info[-tid - 2].task)
+ return kgdb_info[-tid - 2].task;
+ else
+ return idle_task(-tid - 2);
+ }
+ if (tid <= 0) {
+ printk(KERN_ERR "KGDB: Internal thread select error\n");
+ dump_stack();
+ return NULL;
+ }
+
+ /*
+ * find_task_by_pid_ns() does not take the tasklist lock anymore
+ * but is nicely RCU locked - hence is a pretty resilient
+ * thing to use:
+ */
+ return find_task_by_pid_ns(tid, &init_pid_ns);
+}
+
+
+/*
+ * Remap normal tasks to their real PID,
+ * CPU shadow threads are mapped to -CPU - 2
+ */
+static inline int shadow_pid(int realpid)
+{
+ if (realpid)
+ return realpid;
+
+ return -raw_smp_processor_id() - 2;
+}
+
+/*
+ * All the functions that start with gdb_cmd are the various
+ * operations to implement the handlers for the gdbserial protocol
+ * where KGDB is communicating with an external debugger
+ */
+
+/* Handle the '?' status packets */
+static void gdb_cmd_status(struct kgdb_state *ks)
+{
+ /*
+ * We know that this packet is only sent
+ * during initial connect. So to be safe,
+ * we clear out our breakpoints now in case
+ * GDB is reconnecting.
+ */
+ dbg_remove_all_break();
+
+ remcom_out_buffer[0] = 'S';
+ hex_byte_pack(&remcom_out_buffer[1], ks->signo);
+}
+
+static void gdb_get_regs_helper(struct kgdb_state *ks)
+{
+ struct task_struct *thread;
+ void *local_debuggerinfo;
+ int i;
+
+ thread = kgdb_usethread;
+ if (!thread) {
+ thread = kgdb_info[ks->cpu].task;
+ local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
+ } else {
+ local_debuggerinfo = NULL;
+ for_each_online_cpu(i) {
+ /*
+ * Try to find the task on some other
+ * or possibly this node if we do not
+ * find the matching task then we try
+ * to approximate the results.
+ */
+ if (thread == kgdb_info[i].task)
+ local_debuggerinfo = kgdb_info[i].debuggerinfo;
+ }
+ }
+
+ /*
+ * All threads that don't have debuggerinfo should be
+ * in schedule() sleeping, since all other CPUs
+ * are in kgdb_wait, and thus have debuggerinfo.
+ */
+ if (local_debuggerinfo) {
+ pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
+ } else {
+ /*
+ * Pull stuff saved during switch_to; nothing
+ * else is accessible (or even particularly
+ * relevant).
+ *
+ * This should be enough for a stack trace.
+ */
+ sleeping_thread_to_gdb_regs(gdb_regs, thread);
+ }
+}
+
+/* Handle the 'g' get registers request */
+static void gdb_cmd_getregs(struct kgdb_state *ks)
+{
+ gdb_get_regs_helper(ks);
+ kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
+}
+
+/* Handle the 'G' set registers request */
+static void gdb_cmd_setregs(struct kgdb_state *ks)
+{
+ kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
+
+ if (kgdb_usethread && kgdb_usethread != current) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ } else {
+ gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
+ strcpy(remcom_out_buffer, "OK");
+ }
+}
+
+/* Handle the 'm' memory read bytes */
+static void gdb_cmd_memread(struct kgdb_state *ks)
+{
+ char *ptr = &remcom_in_buffer[1];
+ unsigned long length;
+ unsigned long addr;
+ char *err;
+
+ if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
+ kgdb_hex2long(&ptr, &length) > 0) {
+ err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
+ if (!err)
+ error_packet(remcom_out_buffer, -EINVAL);
+ } else {
+ error_packet(remcom_out_buffer, -EINVAL);
+ }
+}
+
+/* Handle the 'M' memory write bytes */
+static void gdb_cmd_memwrite(struct kgdb_state *ks)
+{
+ int err = write_mem_msg(0);
+
+ if (err)
+ error_packet(remcom_out_buffer, err);
+ else
+ strcpy(remcom_out_buffer, "OK");
+}
+
+#if DBG_MAX_REG_NUM > 0
+static char *gdb_hex_reg_helper(int regnum, char *out)
+{
+ int i;
+ int offset = 0;
+
+ for (i = 0; i < regnum; i++)
+ offset += dbg_reg_def[i].size;
+ return kgdb_mem2hex((char *)gdb_regs + offset, out,
+ dbg_reg_def[i].size);
+}
+
+/* Handle the 'p' individual regster get */
+static void gdb_cmd_reg_get(struct kgdb_state *ks)
+{
+ unsigned long regnum;
+ char *ptr = &remcom_in_buffer[1];
+
+ kgdb_hex2long(&ptr, &regnum);
+ if (regnum >= DBG_MAX_REG_NUM) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ gdb_get_regs_helper(ks);
+ gdb_hex_reg_helper(regnum, remcom_out_buffer);
+}
+
+/* Handle the 'P' individual regster set */
+static void gdb_cmd_reg_set(struct kgdb_state *ks)
+{
+ unsigned long regnum;
+ char *ptr = &remcom_in_buffer[1];
+ int i = 0;
+
+ kgdb_hex2long(&ptr, &regnum);
+ if (*ptr++ != '=' ||
+ !(!kgdb_usethread || kgdb_usethread == current) ||
+ !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ memset(gdb_regs, 0, sizeof(gdb_regs));
+ while (i < sizeof(gdb_regs) * 2)
+ if (hex_to_bin(ptr[i]) >= 0)
+ i++;
+ else
+ break;
+ i = i / 2;
+ kgdb_hex2mem(ptr, (char *)gdb_regs, i);
+ dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
+ strcpy(remcom_out_buffer, "OK");
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
+/* Handle the 'X' memory binary write bytes */
+static void gdb_cmd_binwrite(struct kgdb_state *ks)
+{
+ int err = write_mem_msg(1);
+
+ if (err)
+ error_packet(remcom_out_buffer, err);
+ else
+ strcpy(remcom_out_buffer, "OK");
+}
+
+/* Handle the 'D' or 'k', detach or kill packets */
+static void gdb_cmd_detachkill(struct kgdb_state *ks)
+{
+ int error;
+
+ /* The detach case */
+ if (remcom_in_buffer[0] == 'D') {
+ error = dbg_remove_all_break();
+ if (error < 0) {
+ error_packet(remcom_out_buffer, error);
+ } else {
+ strcpy(remcom_out_buffer, "OK");
+ kgdb_connected = 0;
+ }
+ put_packet(remcom_out_buffer);
+ } else {
+ /*
+ * Assume the kill case, with no exit code checking,
+ * trying to force detach the debugger:
+ */
+ dbg_remove_all_break();
+ kgdb_connected = 0;
+ }
+}
+
+/* Handle the 'R' reboot packets */
+static int gdb_cmd_reboot(struct kgdb_state *ks)
+{
+ /* For now, only honor R0 */
+ if (strcmp(remcom_in_buffer, "R0") == 0) {
+ printk(KERN_CRIT "Executing emergency reboot\n");
+ strcpy(remcom_out_buffer, "OK");
+ put_packet(remcom_out_buffer);
+
+ /*
+ * Execution should not return from
+ * machine_emergency_restart()
+ */
+ machine_emergency_restart();
+ kgdb_connected = 0;
+
+ return 1;
+ }
+ return 0;
+}
+
+/* Handle the 'q' query packets */
+static void gdb_cmd_query(struct kgdb_state *ks)
+{
+ struct task_struct *g;
+ struct task_struct *p;
+ unsigned char thref[BUF_THREAD_ID_SIZE];
+ char *ptr;
+ int i;
+ int cpu;
+ int finished = 0;
+
+ switch (remcom_in_buffer[1]) {
+ case 's':
+ case 'f':
+ if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10))
+ break;
+
+ i = 0;
+ remcom_out_buffer[0] = 'm';
+ ptr = remcom_out_buffer + 1;
+ if (remcom_in_buffer[1] == 'f') {
+ /* Each cpu is a shadow thread */
+ for_each_online_cpu(cpu) {
+ ks->thr_query = 0;
+ int_to_threadref(thref, -cpu - 2);
+ ptr = pack_threadid(ptr, thref);
+ *(ptr++) = ',';
+ i++;
+ }
+ }
+
+ do_each_thread(g, p) {
+ if (i >= ks->thr_query && !finished) {
+ int_to_threadref(thref, p->pid);
+ ptr = pack_threadid(ptr, thref);
+ *(ptr++) = ',';
+ ks->thr_query++;
+ if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
+ finished = 1;
+ }
+ i++;
+ } while_each_thread(g, p);
+
+ *(--ptr) = '\0';
+ break;
+
+ case 'C':
+ /* Current thread id */
+ strcpy(remcom_out_buffer, "QC");
+ ks->threadid = shadow_pid(current->pid);
+ int_to_threadref(thref, ks->threadid);
+ pack_threadid(remcom_out_buffer + 2, thref);
+ break;
+ case 'T':
+ if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16))
+ break;
+
+ ks->threadid = 0;
+ ptr = remcom_in_buffer + 17;
+ kgdb_hex2long(&ptr, &ks->threadid);
+ if (!getthread(ks->linux_regs, ks->threadid)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ if ((int)ks->threadid > 0) {
+ kgdb_mem2hex(getthread(ks->linux_regs,
+ ks->threadid)->comm,
+ remcom_out_buffer, 16);
+ } else {
+ static char tmpstr[23 + BUF_THREAD_ID_SIZE];
+
+ sprintf(tmpstr, "shadowCPU%d",
+ (int)(-ks->threadid - 2));
+ kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
+ }
+ break;
+#ifdef CONFIG_KGDB_KDB
+ case 'R':
+ if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) {
+ int len = strlen(remcom_in_buffer + 6);
+
+ if ((len % 2) != 0) {
+ strcpy(remcom_out_buffer, "E01");
+ break;
+ }
+ kgdb_hex2mem(remcom_in_buffer + 6,
+ remcom_out_buffer, len);
+ len = len / 2;
+ remcom_out_buffer[len++] = 0;
+
+ kdb_common_init_state(ks);
+ kdb_parse(remcom_out_buffer);
+ kdb_common_deinit_state();
+
+ strcpy(remcom_out_buffer, "OK");
+ }
+ break;
+#endif
+ }
+}
+
+/* Handle the 'H' task query packets */
+static void gdb_cmd_task(struct kgdb_state *ks)
+{
+ struct task_struct *thread;
+ char *ptr;
+
+ switch (remcom_in_buffer[1]) {
+ case 'g':
+ ptr = &remcom_in_buffer[2];
+ kgdb_hex2long(&ptr, &ks->threadid);
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (!thread && ks->threadid > 0) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ kgdb_usethread = thread;
+ ks->kgdb_usethreadid = ks->threadid;
+ strcpy(remcom_out_buffer, "OK");
+ break;
+ case 'c':
+ ptr = &remcom_in_buffer[2];
+ kgdb_hex2long(&ptr, &ks->threadid);
+ if (!ks->threadid) {
+ kgdb_contthread = NULL;
+ } else {
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (!thread && ks->threadid > 0) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ kgdb_contthread = thread;
+ }
+ strcpy(remcom_out_buffer, "OK");
+ break;
+ }
+}
+
+/* Handle the 'T' thread query packets */
+static void gdb_cmd_thread(struct kgdb_state *ks)
+{
+ char *ptr = &remcom_in_buffer[1];
+ struct task_struct *thread;
+
+ kgdb_hex2long(&ptr, &ks->threadid);
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (thread)
+ strcpy(remcom_out_buffer, "OK");
+ else
+ error_packet(remcom_out_buffer, -EINVAL);
+}
+
+/* Handle the 'z' or 'Z' breakpoint remove or set packets */
+static void gdb_cmd_break(struct kgdb_state *ks)
+{
+ /*
+ * Since GDB-5.3, it's been drafted that '0' is a software
+ * breakpoint, '1' is a hardware breakpoint, so let's do that.
+ */
+ char *bpt_type = &remcom_in_buffer[1];
+ char *ptr = &remcom_in_buffer[2];
+ unsigned long addr;
+ unsigned long length;
+ int error = 0;
+
+ if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
+ /* Unsupported */
+ if (*bpt_type > '4')
+ return;
+ } else {
+ if (*bpt_type != '0' && *bpt_type != '1')
+ /* Unsupported. */
+ return;
+ }
+
+ /*
+ * Test if this is a hardware breakpoint, and
+ * if we support it:
+ */
+ if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
+ /* Unsupported. */
+ return;
+
+ if (*(ptr++) != ',') {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ if (!kgdb_hex2long(&ptr, &addr)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ if (*(ptr++) != ',' ||
+ !kgdb_hex2long(&ptr, &length)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+
+ if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
+ error = dbg_set_sw_break(addr);
+ else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
+ error = dbg_remove_sw_break(addr);
+ else if (remcom_in_buffer[0] == 'Z')
+ error = arch_kgdb_ops.set_hw_breakpoint(addr,
+ (int)length, *bpt_type - '0');
+ else if (remcom_in_buffer[0] == 'z')
+ error = arch_kgdb_ops.remove_hw_breakpoint(addr,
+ (int) length, *bpt_type - '0');
+
+ if (error == 0)
+ strcpy(remcom_out_buffer, "OK");
+ else
+ error_packet(remcom_out_buffer, error);
+}
+
+/* Handle the 'C' signal / exception passing packets */
+static int gdb_cmd_exception_pass(struct kgdb_state *ks)
+{
+ /* C09 == pass exception
+ * C15 == detach kgdb, pass exception
+ */
+ if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
+
+ ks->pass_exception = 1;
+ remcom_in_buffer[0] = 'c';
+
+ } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
+
+ ks->pass_exception = 1;
+ remcom_in_buffer[0] = 'D';
+ dbg_remove_all_break();
+ kgdb_connected = 0;
+ return 1;
+
+ } else {
+ gdbstub_msg_write("KGDB only knows signal 9 (pass)"
+ " and 15 (pass and disconnect)\n"
+ "Executing a continue without signal passing\n", 0);
+ remcom_in_buffer[0] = 'c';
+ }
+
+ /* Indicate fall through */
+ return -1;
+}
+
+/*
+ * This function performs all gdbserial command procesing
+ */
+int gdb_serial_stub(struct kgdb_state *ks)
+{
+ int error = 0;
+ int tmp;
+
+ /* Initialize comm buffer and globals. */
+ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+ kgdb_usethread = kgdb_info[ks->cpu].task;
+ ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
+ ks->pass_exception = 0;
+
+ if (kgdb_connected) {
+ unsigned char thref[BUF_THREAD_ID_SIZE];
+ char *ptr;
+
+ /* Reply to host that an exception has occurred */
+ ptr = remcom_out_buffer;
+ *ptr++ = 'T';
+ ptr = hex_byte_pack(ptr, ks->signo);
+ ptr += strlen(strcpy(ptr, "thread:"));
+ int_to_threadref(thref, shadow_pid(current->pid));
+ ptr = pack_threadid(ptr, thref);
+ *ptr++ = ';';
+ put_packet(remcom_out_buffer);
+ }
+
+ while (1) {
+ error = 0;
+
+ /* Clear the out buffer. */
+ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+
+ get_packet(remcom_in_buffer);
+
+ switch (remcom_in_buffer[0]) {
+ case '?': /* gdbserial status */
+ gdb_cmd_status(ks);
+ break;
+ case 'g': /* return the value of the CPU registers */
+ gdb_cmd_getregs(ks);
+ break;
+ case 'G': /* set the value of the CPU registers - return OK */
+ gdb_cmd_setregs(ks);
+ break;
+ case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
+ gdb_cmd_memread(ks);
+ break;
+ case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
+ gdb_cmd_memwrite(ks);
+ break;
+#if DBG_MAX_REG_NUM > 0
+ case 'p': /* pXX Return gdb register XX (in hex) */
+ gdb_cmd_reg_get(ks);
+ break;
+ case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */
+ gdb_cmd_reg_set(ks);
+ break;
+#endif /* DBG_MAX_REG_NUM > 0 */
+ case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
+ gdb_cmd_binwrite(ks);
+ break;
+ /* kill or detach. KGDB should treat this like a
+ * continue.
+ */
+ case 'D': /* Debugger detach */
+ case 'k': /* Debugger detach via kill */
+ gdb_cmd_detachkill(ks);
+ goto default_handle;
+ case 'R': /* Reboot */
+ if (gdb_cmd_reboot(ks))
+ goto default_handle;
+ break;
+ case 'q': /* query command */
+ gdb_cmd_query(ks);
+ break;
+ case 'H': /* task related */
+ gdb_cmd_task(ks);
+ break;
+ case 'T': /* Query thread status */
+ gdb_cmd_thread(ks);
+ break;
+ case 'z': /* Break point remove */
+ case 'Z': /* Break point set */
+ gdb_cmd_break(ks);
+ break;
+#ifdef CONFIG_KGDB_KDB
+ case '3': /* Escape into back into kdb */
+ if (remcom_in_buffer[1] == '\0') {
+ gdb_cmd_detachkill(ks);
+ return DBG_PASS_EVENT;
+ }
+#endif
+ case 'C': /* Exception passing */
+ tmp = gdb_cmd_exception_pass(ks);
+ if (tmp > 0)
+ goto default_handle;
+ if (tmp == 0)
+ break;
+ /* Fall through on tmp < 0 */
+ case 'c': /* Continue packet */
+ case 's': /* Single step packet */
+ if (kgdb_contthread && kgdb_contthread != current) {
+ /* Can't switch threads in kgdb */
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ dbg_activate_sw_breakpoints();
+ /* Fall through to default processing */
+ default:
+default_handle:
+ error = kgdb_arch_handle_exception(ks->ex_vector,
+ ks->signo,
+ ks->err_code,
+ remcom_in_buffer,
+ remcom_out_buffer,
+ ks->linux_regs);
+ /*
+ * Leave cmd processing on error, detach,
+ * kill, continue, or single step.
+ */
+ if (error >= 0 || remcom_in_buffer[0] == 'D' ||
+ remcom_in_buffer[0] == 'k') {
+ error = 0;
+ goto kgdb_exit;
+ }
+
+ }
+
+ /* reply to the request */
+ put_packet(remcom_out_buffer);
+ }
+
+kgdb_exit:
+ if (ks->pass_exception)
+ error = 1;
+ return error;
+}
+
+int gdbstub_state(struct kgdb_state *ks, char *cmd)
+{
+ int error;
+
+ switch (cmd[0]) {
+ case 'e':
+ error = kgdb_arch_handle_exception(ks->ex_vector,
+ ks->signo,
+ ks->err_code,
+ remcom_in_buffer,
+ remcom_out_buffer,
+ ks->linux_regs);
+ return error;
+ case 's':
+ case 'c':
+ strcpy(remcom_in_buffer, cmd);
+ return 0;
+ case '$':
+ strcpy(remcom_in_buffer, cmd);
+ gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
+ gdbstub_prev_in_buf_pos = 0;
+ return 0;
+ }
+ dbg_io_ops->write_char('+');
+ put_packet(remcom_out_buffer);
+ return 0;
+}
+
+/**
+ * gdbstub_exit - Send an exit message to GDB
+ * @status: The exit code to report.
+ */
+void gdbstub_exit(int status)
+{
+ unsigned char checksum, ch, buffer[3];
+ int loop;
+
+ if (!kgdb_connected)
+ return;
+ kgdb_connected = 0;
+
+ if (!dbg_io_ops || dbg_kdb_mode)
+ return;
+
+ buffer[0] = 'W';
+ buffer[1] = hex_asc_hi(status);
+ buffer[2] = hex_asc_lo(status);
+
+ dbg_io_ops->write_char('$');
+ checksum = 0;
+
+ for (loop = 0; loop < 3; loop++) {
+ ch = buffer[loop];
+ checksum += ch;
+ dbg_io_ops->write_char(ch);
+ }
+
+ dbg_io_ops->write_char('#');
+ dbg_io_ops->write_char(hex_asc_hi(checksum));
+ dbg_io_ops->write_char(hex_asc_lo(checksum));
+
+ /* make sure the output is flushed, lest the bootloader clobber it */
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+}
diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore
new file mode 100644
index 00000000000..396d12eda9e
--- /dev/null
+++ b/kernel/debug/kdb/.gitignore
@@ -0,0 +1 @@
+gen-kdb_cmds.c
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
new file mode 100644
index 00000000000..d4fc58f4b88
--- /dev/null
+++ b/kernel/debug/kdb/Makefile
@@ -0,0 +1,25 @@
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+#
+
+CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
+obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
+obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
+
+clean-files := gen-kdb_cmds.c
+
+quiet_cmd_gen-kdb = GENKDB $@
+ cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include <linux/stddef.h>"; print "\#include <linux/init.h>"} \
+ /^\#/{next} \
+ /^[ \t]*$$/{next} \
+ {gsub(/"/, "\\\"", $$0); \
+ print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \
+ END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \
+ $(filter-out %/Makefile,$^) > $@#
+
+$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile
+ $(call cmd,gen-kdb)
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
new file mode 100644
index 00000000000..70a504601dc
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -0,0 +1,553 @@
+/*
+ * Kernel Debugger Architecture Independent Breakpoint Handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kdb.h>
+#include <linux/kgdb.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include "kdb_private.h"
+
+/*
+ * Table of kdb_breakpoints
+ */
+kdb_bp_t kdb_breakpoints[KDB_MAXBPT];
+
+static void kdb_setsinglestep(struct pt_regs *regs)
+{
+ KDB_STATE_SET(DOING_SS);
+}
+
+static char *kdb_rwtypes[] = {
+ "Instruction(i)",
+ "Instruction(Register)",
+ "Data Write",
+ "I/O",
+ "Data Access"
+};
+
+static char *kdb_bptype(kdb_bp_t *bp)
+{
+ if (bp->bp_type < 0 || bp->bp_type > 4)
+ return "";
+
+ return kdb_rwtypes[bp->bp_type];
+}
+
+static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
+{
+ int nextarg = *nextargp;
+ int diag;
+
+ bp->bph_length = 1;
+ if ((argc + 1) != nextarg) {
+ if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0)
+ bp->bp_type = BP_ACCESS_WATCHPOINT;
+ else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
+ bp->bp_type = BP_WRITE_WATCHPOINT;
+ else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0)
+ bp->bp_type = BP_HARDWARE_BREAKPOINT;
+ else
+ return KDB_ARGCOUNT;
+
+ bp->bph_length = 1;
+
+ nextarg++;
+
+ if ((argc + 1) != nextarg) {
+ unsigned long len;
+
+ diag = kdbgetularg((char *)argv[nextarg],
+ &len);
+ if (diag)
+ return diag;
+
+
+ if (len > 8)
+ return KDB_BADLENGTH;
+
+ bp->bph_length = len;
+ nextarg++;
+ }
+
+ if ((argc + 1) != nextarg)
+ return KDB_ARGCOUNT;
+ }
+
+ *nextargp = nextarg;
+ return 0;
+}
+
+static int _kdb_bp_remove(kdb_bp_t *bp)
+{
+ int ret = 1;
+ if (!bp->bp_installed)
+ return ret;
+ if (!bp->bp_type)
+ ret = dbg_remove_sw_break(bp->bp_addr);
+ else
+ ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr,
+ bp->bph_length,
+ bp->bp_type);
+ if (ret == 0)
+ bp->bp_installed = 0;
+ return ret;
+}
+
+static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp)
+{
+ if (KDB_DEBUG(BP))
+ kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs));
+
+ /*
+ * Setup single step
+ */
+ kdb_setsinglestep(regs);
+
+ /*
+ * Reset delay attribute
+ */
+ bp->bp_delay = 0;
+ bp->bp_delayed = 1;
+}
+
+static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
+{
+ int ret;
+ /*
+ * Install the breakpoint, if it is not already installed.
+ */
+
+ if (KDB_DEBUG(BP))
+ kdb_printf("%s: bp_installed %d\n",
+ __func__, bp->bp_installed);
+ if (!KDB_STATE(SSBPT))
+ bp->bp_delay = 0;
+ if (bp->bp_installed)
+ return 1;
+ if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) {
+ if (KDB_DEBUG(BP))
+ kdb_printf("%s: delayed bp\n", __func__);
+ kdb_handle_bp(regs, bp);
+ return 0;
+ }
+ if (!bp->bp_type)
+ ret = dbg_set_sw_break(bp->bp_addr);
+ else
+ ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr,
+ bp->bph_length,
+ bp->bp_type);
+ if (ret == 0) {
+ bp->bp_installed = 1;
+ } else {
+ kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
+ __func__, bp->bp_addr);
+#ifdef CONFIG_DEBUG_RODATA
+ if (!bp->bp_type) {
+ kdb_printf("Software breakpoints are unavailable.\n"
+ " Change the kernel CONFIG_DEBUG_RODATA=n\n"
+ " OR use hw breaks: help bph\n");
+ }
+#endif
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * kdb_bp_install
+ *
+ * Install kdb_breakpoints prior to returning from the
+ * kernel debugger. This allows the kdb_breakpoints to be set
+ * upon functions that are used internally by kdb, such as
+ * printk(). This function is only called once per kdb session.
+ */
+void kdb_bp_install(struct pt_regs *regs)
+{
+ int i;
+
+ for (i = 0; i < KDB_MAXBPT; i++) {
+ kdb_bp_t *bp = &kdb_breakpoints[i];
+
+ if (KDB_DEBUG(BP)) {
+ kdb_printf("%s: bp %d bp_enabled %d\n",
+ __func__, i, bp->bp_enabled);
+ }
+ if (bp->bp_enabled)
+ _kdb_bp_install(regs, bp);
+ }
+}
+
+/*
+ * kdb_bp_remove
+ *
+ * Remove kdb_breakpoints upon entry to the kernel debugger.
+ *
+ * Parameters:
+ * None.
+ * Outputs:
+ * None.
+ * Returns:
+ * None.
+ * Locking:
+ * None.
+ * Remarks:
+ */
+void kdb_bp_remove(void)
+{
+ int i;
+
+ for (i = KDB_MAXBPT - 1; i >= 0; i--) {
+ kdb_bp_t *bp = &kdb_breakpoints[i];
+
+ if (KDB_DEBUG(BP)) {
+ kdb_printf("%s: bp %d bp_enabled %d\n",
+ __func__, i, bp->bp_enabled);
+ }
+ if (bp->bp_enabled)
+ _kdb_bp_remove(bp);
+ }
+}
+
+
+/*
+ * kdb_printbp
+ *
+ * Internal function to format and print a breakpoint entry.
+ *
+ * Parameters:
+ * None.
+ * Outputs:
+ * None.
+ * Returns:
+ * None.
+ * Locking:
+ * None.
+ * Remarks:
+ */
+
+static void kdb_printbp(kdb_bp_t *bp, int i)
+{
+ kdb_printf("%s ", kdb_bptype(bp));
+ kdb_printf("BP #%d at ", i);
+ kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
+
+ if (bp->bp_enabled)
+ kdb_printf("\n is enabled");
+ else
+ kdb_printf("\n is disabled");
+
+ kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
+ bp->bp_addr, bp->bp_type, bp->bp_installed);
+
+ kdb_printf("\n");
+}
+
+/*
+ * kdb_bp
+ *
+ * Handle the bp commands.
+ *
+ * [bp|bph] <addr-expression> [DATAR|DATAW]
+ *
+ * Parameters:
+ * argc Count of arguments in argv
+ * argv Space delimited command line arguments
+ * Outputs:
+ * None.
+ * Returns:
+ * Zero for success, a kdb diagnostic if failure.
+ * Locking:
+ * None.
+ * Remarks:
+ *
+ * bp Set breakpoint on all cpus. Only use hardware assist if need.
+ * bph Set breakpoint on all cpus. Force hardware register
+ */
+
+static int kdb_bp(int argc, const char **argv)
+{
+ int i, bpno;
+ kdb_bp_t *bp, *bp_check;
+ int diag;
+ char *symname = NULL;
+ long offset = 0ul;
+ int nextarg;
+ kdb_bp_t template = {0};
+
+ if (argc == 0) {
+ /*
+ * Display breakpoint table
+ */
+ for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT;
+ bpno++, bp++) {
+ if (bp->bp_free)
+ continue;
+ kdb_printbp(bp, bpno);
+ }
+
+ return 0;
+ }
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr,
+ &offset, &symname);
+ if (diag)
+ return diag;
+ if (!template.bp_addr)
+ return KDB_BADINT;
+
+ /*
+ * Find an empty bp structure to allocate
+ */
+ for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
+ if (bp->bp_free)
+ break;
+ }
+
+ if (bpno == KDB_MAXBPT)
+ return KDB_TOOMANYBPT;
+
+ if (strcmp(argv[0], "bph") == 0) {
+ template.bp_type = BP_HARDWARE_BREAKPOINT;
+ diag = kdb_parsebp(argc, argv, &nextarg, &template);
+ if (diag)
+ return diag;
+ } else {
+ template.bp_type = BP_BREAKPOINT;
+ }
+
+ /*
+ * Check for clashing breakpoints.
+ *
+ * Note, in this design we can't have hardware breakpoints
+ * enabled for both read and write on the same address.
+ */
+ for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT;
+ i++, bp_check++) {
+ if (!bp_check->bp_free &&
+ bp_check->bp_addr == template.bp_addr) {
+ kdb_printf("You already have a breakpoint at "
+ kdb_bfd_vma_fmt0 "\n", template.bp_addr);
+ return KDB_DUPBPT;
+ }
+ }
+
+ template.bp_enabled = 1;
+
+ /*
+ * Actually allocate the breakpoint found earlier
+ */
+ *bp = template;
+ bp->bp_free = 0;
+
+ kdb_printbp(bp, bpno);
+
+ return 0;
+}
+
+/*
+ * kdb_bc
+ *
+ * Handles the 'bc', 'be', and 'bd' commands
+ *
+ * [bd|bc|be] <breakpoint-number>
+ * [bd|bc|be] *
+ *
+ * Parameters:
+ * argc Count of arguments in argv
+ * argv Space delimited command line arguments
+ * Outputs:
+ * None.
+ * Returns:
+ * Zero for success, a kdb diagnostic for failure
+ * Locking:
+ * None.
+ * Remarks:
+ */
+static int kdb_bc(int argc, const char **argv)
+{
+ unsigned long addr;
+ kdb_bp_t *bp = NULL;
+ int lowbp = KDB_MAXBPT;
+ int highbp = 0;
+ int done = 0;
+ int i;
+ int diag = 0;
+
+ int cmd; /* KDBCMD_B? */
+#define KDBCMD_BC 0
+#define KDBCMD_BE 1
+#define KDBCMD_BD 2
+
+ if (strcmp(argv[0], "be") == 0)
+ cmd = KDBCMD_BE;
+ else if (strcmp(argv[0], "bd") == 0)
+ cmd = KDBCMD_BD;
+ else
+ cmd = KDBCMD_BC;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ if (strcmp(argv[1], "*") == 0) {
+ lowbp = 0;
+ highbp = KDB_MAXBPT;
+ } else {
+ diag = kdbgetularg(argv[1], &addr);
+ if (diag)
+ return diag;
+
+ /*
+ * For addresses less than the maximum breakpoint number,
+ * assume that the breakpoint number is desired.
+ */
+ if (addr < KDB_MAXBPT) {
+ bp = &kdb_breakpoints[addr];
+ lowbp = highbp = addr;
+ highbp++;
+ } else {
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT;
+ i++, bp++) {
+ if (bp->bp_addr == addr) {
+ lowbp = highbp = i;
+ highbp++;
+ break;
+ }
+ }
+ }
+ }
+
+ /*
+ * Now operate on the set of breakpoints matching the input
+ * criteria (either '*' for all, or an individual breakpoint).
+ */
+ for (bp = &kdb_breakpoints[lowbp], i = lowbp;
+ i < highbp;
+ i++, bp++) {
+ if (bp->bp_free)
+ continue;
+
+ done++;
+
+ switch (cmd) {
+ case KDBCMD_BC:
+ bp->bp_enabled = 0;
+
+ kdb_printf("Breakpoint %d at "
+ kdb_bfd_vma_fmt " cleared\n",
+ i, bp->bp_addr);
+
+ bp->bp_addr = 0;
+ bp->bp_free = 1;
+
+ break;
+ case KDBCMD_BE:
+ bp->bp_enabled = 1;
+
+ kdb_printf("Breakpoint %d at "
+ kdb_bfd_vma_fmt " enabled",
+ i, bp->bp_addr);
+
+ kdb_printf("\n");
+ break;
+ case KDBCMD_BD:
+ if (!bp->bp_enabled)
+ break;
+
+ bp->bp_enabled = 0;
+
+ kdb_printf("Breakpoint %d at "
+ kdb_bfd_vma_fmt " disabled\n",
+ i, bp->bp_addr);
+
+ break;
+ }
+ if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) {
+ bp->bp_delay = 0;
+ KDB_STATE_CLEAR(SSBPT);
+ }
+ }
+
+ return (!done) ? KDB_BPTNOTFOUND : 0;
+}
+
+/*
+ * kdb_ss
+ *
+ * Process the 'ss' (Single Step) command.
+ *
+ * ss
+ *
+ * Parameters:
+ * argc Argument count
+ * argv Argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * KDB_CMD_SS for success, a kdb error if failure.
+ * Locking:
+ * None.
+ * Remarks:
+ *
+ * Set the arch specific option to trigger a debug trap after the next
+ * instruction.
+ */
+
+static int kdb_ss(int argc, const char **argv)
+{
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+ /*
+ * Set trace flag and go.
+ */
+ KDB_STATE_SET(DOING_SS);
+ return KDB_CMD_SS;
+}
+
+/* Initialize the breakpoint table and register breakpoint commands. */
+
+void __init kdb_initbptab(void)
+{
+ int i;
+ kdb_bp_t *bp;
+
+ /*
+ * First time initialization.
+ */
+ memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints));
+
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
+ bp->bp_free = 1;
+
+ kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
+ "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
+ "Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
+ if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
+ kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
+ "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("bc", kdb_bc, "<bpnum>",
+ "Clear Breakpoint", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("be", kdb_bc, "<bpnum>",
+ "Enable Breakpoint", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bd", kdb_bc, "<bpnum>",
+ "Disable Breakpoint", 0, KDB_REPEAT_NONE);
+
+ kdb_register_repeat("ss", kdb_ss, "",
+ "Single Step", 1, KDB_REPEAT_NO_ARGS);
+ /*
+ * Architecture dependent initialization.
+ */
+}
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
new file mode 100644
index 00000000000..fe15fff5df5
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -0,0 +1,210 @@
+/*
+ * Kernel Debugger Architecture Independent Stack Traceback
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/kdb.h>
+#include <linux/nmi.h>
+#include "kdb_private.h"
+
+
+static void kdb_show_stack(struct task_struct *p, void *addr)
+{
+ int old_lvl = console_loglevel;
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
+ kdb_trap_printk++;
+ kdb_set_current_task(p);
+ if (addr) {
+ show_stack((struct task_struct *)p, addr);
+ } else if (kdb_current_regs) {
+#ifdef CONFIG_X86
+ show_stack(p, &kdb_current_regs->sp);
+#else
+ show_stack(p, NULL);
+#endif
+ } else {
+ show_stack(p, NULL);
+ }
+ console_loglevel = old_lvl;
+ kdb_trap_printk--;
+}
+
+/*
+ * kdb_bt
+ *
+ * This function implements the 'bt' command. Print a stack
+ * traceback.
+ *
+ * bt [<address-expression>] (addr-exp is for alternate stacks)
+ * btp <pid> Kernel stack for <pid>
+ * btt <address-expression> Kernel stack for task structure at
+ * <address-expression>
+ * bta [DRSTCZEUIMA] All useful processes, optionally
+ * filtered by state
+ * btc [<cpu>] The current process on one cpu,
+ * default is all cpus
+ *
+ * bt <address-expression> refers to a address on the stack, that location
+ * is assumed to contain a return address.
+ *
+ * btt <address-expression> refers to the address of a struct task.
+ *
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * Backtrack works best when the code uses frame pointers. But even
+ * without frame pointers we should get a reasonable trace.
+ *
+ * mds comes in handy when examining the stack to do a manual traceback or
+ * to get a starting point for bt <address-expression>.
+ */
+
+static int
+kdb_bt1(struct task_struct *p, unsigned long mask,
+ int argcount, int btaprompt)
+{
+ char buffer[2];
+ if (kdb_getarea(buffer[0], (unsigned long)p) ||
+ kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
+ return KDB_BADADDR;
+ if (!kdb_task_state(p, mask))
+ return 0;
+ kdb_printf("Stack traceback for pid %d\n", p->pid);
+ kdb_ps1(p);
+ kdb_show_stack(p, NULL);
+ if (btaprompt) {
+ kdb_getstr(buffer, sizeof(buffer),
+ "Enter <q> to end, <cr> to continue:");
+ if (buffer[0] == 'q') {
+ kdb_printf("\n");
+ return 1;
+ }
+ }
+ touch_nmi_watchdog();
+ return 0;
+}
+
+int
+kdb_bt(int argc, const char **argv)
+{
+ int diag;
+ int argcount = 5;
+ int btaprompt = 1;
+ int nextarg;
+ unsigned long addr;
+ long offset;
+
+ /* Prompt after each proc in bta */
+ kdbgetintenv("BTAPROMPT", &btaprompt);
+
+ if (strcmp(argv[0], "bta") == 0) {
+ struct task_struct *g, *p;
+ unsigned long cpu;
+ unsigned long mask = kdb_task_state_string(argc ? argv[1] :
+ NULL);
+ if (argc == 0)
+ kdb_ps_suppressed();
+ /* Run the active tasks first */
+ for_each_online_cpu(cpu) {
+ p = kdb_curr_task(cpu);
+ if (kdb_bt1(p, mask, argcount, btaprompt))
+ return 0;
+ }
+ /* Now the inactive tasks */
+ kdb_do_each_thread(g, p) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ if (task_curr(p))
+ continue;
+ if (kdb_bt1(p, mask, argcount, btaprompt))
+ return 0;
+ } kdb_while_each_thread(g, p);
+ } else if (strcmp(argv[0], "btp") == 0) {
+ struct task_struct *p;
+ unsigned long pid;
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+ diag = kdbgetularg((char *)argv[1], &pid);
+ if (diag)
+ return diag;
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (p) {
+ kdb_set_current_task(p);
+ return kdb_bt1(p, ~0UL, argcount, 0);
+ }
+ kdb_printf("No process with pid == %ld found\n", pid);
+ return 0;
+ } else if (strcmp(argv[0], "btt") == 0) {
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+ diag = kdbgetularg((char *)argv[1], &addr);
+ if (diag)
+ return diag;
+ kdb_set_current_task((struct task_struct *)addr);
+ return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
+ } else if (strcmp(argv[0], "btc") == 0) {
+ unsigned long cpu = ~0;
+ struct task_struct *save_current_task = kdb_current_task;
+ char buf[80];
+ if (argc > 1)
+ return KDB_ARGCOUNT;
+ if (argc == 1) {
+ diag = kdbgetularg((char *)argv[1], &cpu);
+ if (diag)
+ return diag;
+ }
+ /* Recursive use of kdb_parse, do not use argv after
+ * this point */
+ argv = NULL;
+ if (cpu != ~0) {
+ if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
+ kdb_printf("no process for cpu %ld\n", cpu);
+ return 0;
+ }
+ sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ kdb_parse(buf);
+ return 0;
+ }
+ kdb_printf("btc: cpu status: ");
+ kdb_parse("cpu\n");
+ for_each_online_cpu(cpu) {
+ sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ kdb_parse(buf);
+ touch_nmi_watchdog();
+ }
+ kdb_set_current_task(save_current_task);
+ return 0;
+ } else {
+ if (argc) {
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
+ &offset, NULL);
+ if (diag)
+ return diag;
+ kdb_show_stack(kdb_current_task, (void *)addr);
+ return 0;
+ } else {
+ return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
+ }
+ }
+
+ /* NOTREACHED */
+ return 0;
+}
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
new file mode 100644
index 00000000000..9834ad303ab
--- /dev/null
+++ b/kernel/debug/kdb/kdb_cmds
@@ -0,0 +1,31 @@
+# Initial commands for kdb, alter to suit your needs.
+# These commands are executed in kdb_init() context, no SMP, no
+# processes. Commands that require process data (including stack or
+# registers) are not reliable this early. set and bp commands should
+# be safe. Global breakpoint commands affect each cpu as it is booted.
+
+# Standard debugging information for first level support, just type archkdb
+# or archkdbcpu or archkdbshort at the kdb prompt.
+
+defcmd dumpcommon "" "Common kdb debugging"
+ set BTAPROMPT 0
+ set LINES 10000
+ -summary
+ -cpu
+ -ps
+ -dmesg 600
+ -bt
+endefcmd
+
+defcmd dumpall "" "First line debugging"
+ pid R
+ -dumpcommon
+ -bta
+endefcmd
+
+defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
+ pid R
+ -dumpcommon
+ -btc
+endefcmd
+
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
new file mode 100644
index 00000000000..8859ca34dcf
--- /dev/null
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -0,0 +1,182 @@
+/*
+ * Created by: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/kdebug.h>
+#include <linux/export.h>
+#include <linux/hardirq.h>
+#include "kdb_private.h"
+#include "../debug_core.h"
+
+/*
+ * KDB interface to KGDB internals
+ */
+get_char_func kdb_poll_funcs[] = {
+ dbg_io_get_char,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+};
+EXPORT_SYMBOL_GPL(kdb_poll_funcs);
+
+int kdb_poll_idx = 1;
+EXPORT_SYMBOL_GPL(kdb_poll_idx);
+
+static struct kgdb_state *kdb_ks;
+
+int kdb_common_init_state(struct kgdb_state *ks)
+{
+ kdb_initial_cpu = atomic_read(&kgdb_active);
+ kdb_current_task = kgdb_info[ks->cpu].task;
+ kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ return 0;
+}
+
+int kdb_common_deinit_state(void)
+{
+ kdb_initial_cpu = -1;
+ kdb_current_task = NULL;
+ kdb_current_regs = NULL;
+ return 0;
+}
+
+int kdb_stub(struct kgdb_state *ks)
+{
+ int error = 0;
+ kdb_bp_t *bp;
+ unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
+ kdb_reason_t reason = KDB_REASON_OOPS;
+ kdb_dbtrap_t db_result = KDB_DB_NOBPT;
+ int i;
+
+ kdb_ks = ks;
+ if (KDB_STATE(REENTRY)) {
+ reason = KDB_REASON_SWITCH;
+ KDB_STATE_CLEAR(REENTRY);
+ addr = instruction_pointer(ks->linux_regs);
+ }
+ ks->pass_exception = 0;
+ if (atomic_read(&kgdb_setting_breakpoint))
+ reason = KDB_REASON_KEYBOARD;
+
+ if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
+ reason = KDB_REASON_SYSTEM_NMI;
+
+ else if (in_nmi())
+ reason = KDB_REASON_NMI;
+
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
+ if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
+ reason = KDB_REASON_BREAK;
+ db_result = KDB_DB_BPT;
+ if (addr != instruction_pointer(ks->linux_regs))
+ kgdb_arch_set_pc(ks->linux_regs, addr);
+ break;
+ }
+ }
+ if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) {
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
+ if (bp->bp_free)
+ continue;
+ if (bp->bp_addr == addr) {
+ bp->bp_delay = 1;
+ bp->bp_delayed = 1;
+ /*
+ * SSBPT is set when the kernel debugger must single step a
+ * task in order to re-establish an instruction breakpoint
+ * which uses the instruction replacement mechanism. It is
+ * cleared by any action that removes the need to single-step
+ * the breakpoint.
+ */
+ reason = KDB_REASON_BREAK;
+ db_result = KDB_DB_BPT;
+ KDB_STATE_SET(SSBPT);
+ break;
+ }
+ }
+ }
+
+ if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 &&
+ ks->signo == SIGTRAP) {
+ reason = KDB_REASON_SSTEP;
+ db_result = KDB_DB_BPT;
+ }
+ /* Set initial kdb state variables */
+ KDB_STATE_CLEAR(KGDB_TRANS);
+ kdb_common_init_state(ks);
+ /* Remove any breakpoints as needed by kdb and clear single step */
+ kdb_bp_remove();
+ KDB_STATE_CLEAR(DOING_SS);
+ KDB_STATE_SET(PAGER);
+ /* zero out any offline cpu data */
+ for_each_present_cpu(i) {
+ if (!cpu_online(i)) {
+ kgdb_info[i].debuggerinfo = NULL;
+ kgdb_info[i].task = NULL;
+ }
+ }
+ if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
+ ks->pass_exception = 1;
+ KDB_FLAG_SET(CATASTROPHIC);
+ }
+ if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
+ KDB_STATE_CLEAR(SSBPT);
+ KDB_STATE_CLEAR(DOING_SS);
+ } else {
+ /* Start kdb main loop */
+ error = kdb_main_loop(KDB_REASON_ENTER, reason,
+ ks->err_code, db_result, ks->linux_regs);
+ }
+ /*
+ * Upon exit from the kdb main loop setup break points and restart
+ * the system based on the requested continue state
+ */
+ kdb_common_deinit_state();
+ KDB_STATE_CLEAR(PAGER);
+ kdbnearsym_cleanup();
+ if (error == KDB_CMD_KGDB) {
+ if (KDB_STATE(DOING_KGDB))
+ KDB_STATE_CLEAR(DOING_KGDB);
+ return DBG_PASS_EVENT;
+ }
+ kdb_bp_install(ks->linux_regs);
+ dbg_activate_sw_breakpoints();
+ /* Set the exit state to a single step or a continue */
+ if (KDB_STATE(DOING_SS))
+ gdbstub_state(ks, "s");
+ else
+ gdbstub_state(ks, "c");
+
+ KDB_FLAG_CLEAR(CATASTROPHIC);
+
+ /* Invoke arch specific exception handling prior to system resume */
+ kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e");
+ if (ks->pass_exception)
+ kgdb_info[ks->cpu].ret_state = 1;
+ if (error == KDB_CMD_CPU) {
+ KDB_STATE_SET(REENTRY);
+ /*
+ * Force clear the single step bit because kdb emulates this
+ * differently vs the gdbstub
+ */
+ kgdb_single_step = 0;
+ dbg_deactivate_sw_breakpoints();
+ return DBG_SWITCH_CPU_EVENT;
+ }
+ return kgdb_info[ks->cpu].ret_state;
+}
+
+void kdb_gdb_state_pass(char *buf)
+{
+ gdbstub_state(kdb_ks, buf);
+}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
new file mode 100644
index 00000000000..7c70812caea
--- /dev/null
+++ b/kernel/debug/kdb/kdb_io.c
@@ -0,0 +1,852 @@
+/*
+ * Kernel Debugger Architecture Independent Console I/O handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kdev_t.h>
+#include <linux/console.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <linux/delay.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/kallsyms.h>
+#include "kdb_private.h"
+
+#define CMD_BUFLEN 256
+char kdb_prompt_str[CMD_BUFLEN];
+
+int kdb_trap_printk;
+
+static int kgdb_transition_check(char *buffer)
+{
+ if (buffer[0] != '+' && buffer[0] != '$') {
+ KDB_STATE_SET(KGDB_TRANS);
+ kdb_printf("%s", buffer);
+ } else {
+ int slen = strlen(buffer);
+ if (slen > 3 && buffer[slen - 3] == '#') {
+ kdb_gdb_state_pass(buffer);
+ strcpy(buffer, "kgdb");
+ KDB_STATE_SET(DOING_KGDB);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int kdb_read_get_key(char *buffer, size_t bufsize)
+{
+#define ESCAPE_UDELAY 1000
+#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
+ char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
+ char *ped = escape_data;
+ int escape_delay = 0;
+ get_char_func *f, *f_escape = NULL;
+ int key;
+
+ for (f = &kdb_poll_funcs[0]; ; ++f) {
+ if (*f == NULL) {
+ /* Reset NMI watchdog once per poll loop */
+ touch_nmi_watchdog();
+ f = &kdb_poll_funcs[0];
+ }
+ if (escape_delay == 2) {
+ *ped = '\0';
+ ped = escape_data;
+ --escape_delay;
+ }
+ if (escape_delay == 1) {
+ key = *ped++;
+ if (!*ped)
+ --escape_delay;
+ break;
+ }
+ key = (*f)();
+ if (key == -1) {
+ if (escape_delay) {
+ udelay(ESCAPE_UDELAY);
+ --escape_delay;
+ }
+ continue;
+ }
+ if (bufsize <= 2) {
+ if (key == '\r')
+ key = '\n';
+ *buffer++ = key;
+ *buffer = '\0';
+ return -1;
+ }
+ if (escape_delay == 0 && key == '\e') {
+ escape_delay = ESCAPE_DELAY;
+ ped = escape_data;
+ f_escape = f;
+ }
+ if (escape_delay) {
+ *ped++ = key;
+ if (f_escape != f) {
+ escape_delay = 2;
+ continue;
+ }
+ if (ped - escape_data == 1) {
+ /* \e */
+ continue;
+ } else if (ped - escape_data == 2) {
+ /* \e<something> */
+ if (key != '[')
+ escape_delay = 2;
+ continue;
+ } else if (ped - escape_data == 3) {
+ /* \e[<something> */
+ int mapkey = 0;
+ switch (key) {
+ case 'A': /* \e[A, up arrow */
+ mapkey = 16;
+ break;
+ case 'B': /* \e[B, down arrow */
+ mapkey = 14;
+ break;
+ case 'C': /* \e[C, right arrow */
+ mapkey = 6;
+ break;
+ case 'D': /* \e[D, left arrow */
+ mapkey = 2;
+ break;
+ case '1': /* dropthrough */
+ case '3': /* dropthrough */
+ /* \e[<1,3,4>], may be home, del, end */
+ case '4':
+ mapkey = -1;
+ break;
+ }
+ if (mapkey != -1) {
+ if (mapkey > 0) {
+ escape_data[0] = mapkey;
+ escape_data[1] = '\0';
+ }
+ escape_delay = 2;
+ }
+ continue;
+ } else if (ped - escape_data == 4) {
+ /* \e[<1,3,4><something> */
+ int mapkey = 0;
+ if (key == '~') {
+ switch (escape_data[2]) {
+ case '1': /* \e[1~, home */
+ mapkey = 1;
+ break;
+ case '3': /* \e[3~, del */
+ mapkey = 4;
+ break;
+ case '4': /* \e[4~, end */
+ mapkey = 5;
+ break;
+ }
+ }
+ if (mapkey > 0) {
+ escape_data[0] = mapkey;
+ escape_data[1] = '\0';
+ }
+ escape_delay = 2;
+ continue;
+ }
+ }
+ break; /* A key to process */
+ }
+ return key;
+}
+
+/*
+ * kdb_read
+ *
+ * This function reads a string of characters, terminated by
+ * a newline, or by reaching the end of the supplied buffer,
+ * from the current kernel debugger console device.
+ * Parameters:
+ * buffer - Address of character buffer to receive input characters.
+ * bufsize - size, in bytes, of the character buffer
+ * Returns:
+ * Returns a pointer to the buffer containing the received
+ * character string. This string will be terminated by a
+ * newline character.
+ * Locking:
+ * No locks are required to be held upon entry to this
+ * function. It is not reentrant - it relies on the fact
+ * that while kdb is running on only one "master debug" cpu.
+ * Remarks:
+ *
+ * The buffer size must be >= 2. A buffer size of 2 means that the caller only
+ * wants a single key.
+ *
+ * An escape key could be the start of a vt100 control sequence such as \e[D
+ * (left arrow) or it could be a character in its own right. The standard
+ * method for detecting the difference is to wait for 2 seconds to see if there
+ * are any other characters. kdb is complicated by the lack of a timer service
+ * (interrupts are off), by multiple input sources and by the need to sometimes
+ * return after just one key. Escape sequence processing has to be done as
+ * states in the polling loop.
+ */
+
+static char *kdb_read(char *buffer, size_t bufsize)
+{
+ char *cp = buffer;
+ char *bufend = buffer+bufsize-2; /* Reserve space for newline
+ * and null byte */
+ char *lastchar;
+ char *p_tmp;
+ char tmp;
+ static char tmpbuffer[CMD_BUFLEN];
+ int len = strlen(buffer);
+ int len_tmp;
+ int tab = 0;
+ int count;
+ int i;
+ int diag, dtab_count;
+ int key;
+
+
+ diag = kdbgetintenv("DTABCOUNT", &dtab_count);
+ if (diag)
+ dtab_count = 30;
+
+ if (len > 0) {
+ cp += len;
+ if (*(buffer+len-1) == '\n')
+ cp--;
+ }
+
+ lastchar = cp;
+ *cp = '\0';
+ kdb_printf("%s", buffer);
+poll_again:
+ key = kdb_read_get_key(buffer, bufsize);
+ if (key == -1)
+ return buffer;
+ if (key != 9)
+ tab = 0;
+ switch (key) {
+ case 8: /* backspace */
+ if (cp > buffer) {
+ if (cp < lastchar) {
+ memcpy(tmpbuffer, cp, lastchar - cp);
+ memcpy(cp-1, tmpbuffer, lastchar - cp);
+ }
+ *(--lastchar) = '\0';
+ --cp;
+ kdb_printf("\b%s \r", cp);
+ tmp = *cp;
+ *cp = '\0';
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ *cp = tmp;
+ }
+ break;
+ case 13: /* enter */
+ *lastchar++ = '\n';
+ *lastchar++ = '\0';
+ if (!KDB_STATE(KGDB_TRANS)) {
+ KDB_STATE_SET(KGDB_TRANS);
+ kdb_printf("%s", buffer);
+ }
+ kdb_printf("\n");
+ return buffer;
+ case 4: /* Del */
+ if (cp < lastchar) {
+ memcpy(tmpbuffer, cp+1, lastchar - cp - 1);
+ memcpy(cp, tmpbuffer, lastchar - cp - 1);
+ *(--lastchar) = '\0';
+ kdb_printf("%s \r", cp);
+ tmp = *cp;
+ *cp = '\0';
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ *cp = tmp;
+ }
+ break;
+ case 1: /* Home */
+ if (cp > buffer) {
+ kdb_printf("\r");
+ kdb_printf(kdb_prompt_str);
+ cp = buffer;
+ }
+ break;
+ case 5: /* End */
+ if (cp < lastchar) {
+ kdb_printf("%s", cp);
+ cp = lastchar;
+ }
+ break;
+ case 2: /* Left */
+ if (cp > buffer) {
+ kdb_printf("\b");
+ --cp;
+ }
+ break;
+ case 14: /* Down */
+ memset(tmpbuffer, ' ',
+ strlen(kdb_prompt_str) + (lastchar-buffer));
+ *(tmpbuffer+strlen(kdb_prompt_str) +
+ (lastchar-buffer)) = '\0';
+ kdb_printf("\r%s\r", tmpbuffer);
+ *lastchar = (char)key;
+ *(lastchar+1) = '\0';
+ return lastchar;
+ case 6: /* Right */
+ if (cp < lastchar) {
+ kdb_printf("%c", *cp);
+ ++cp;
+ }
+ break;
+ case 16: /* Up */
+ memset(tmpbuffer, ' ',
+ strlen(kdb_prompt_str) + (lastchar-buffer));
+ *(tmpbuffer+strlen(kdb_prompt_str) +
+ (lastchar-buffer)) = '\0';
+ kdb_printf("\r%s\r", tmpbuffer);
+ *lastchar = (char)key;
+ *(lastchar+1) = '\0';
+ return lastchar;
+ case 9: /* Tab */
+ if (tab < 2)
+ ++tab;
+ p_tmp = buffer;
+ while (*p_tmp == ' ')
+ p_tmp++;
+ if (p_tmp > cp)
+ break;
+ memcpy(tmpbuffer, p_tmp, cp-p_tmp);
+ *(tmpbuffer + (cp-p_tmp)) = '\0';
+ p_tmp = strrchr(tmpbuffer, ' ');
+ if (p_tmp)
+ ++p_tmp;
+ else
+ p_tmp = tmpbuffer;
+ len = strlen(p_tmp);
+ count = kallsyms_symbol_complete(p_tmp,
+ sizeof(tmpbuffer) -
+ (p_tmp - tmpbuffer));
+ if (tab == 2 && count > 0) {
+ kdb_printf("\n%d symbols are found.", count);
+ if (count > dtab_count) {
+ count = dtab_count;
+ kdb_printf(" But only first %d symbols will"
+ " be printed.\nYou can change the"
+ " environment variable DTABCOUNT.",
+ count);
+ }
+ kdb_printf("\n");
+ for (i = 0; i < count; i++) {
+ if (kallsyms_symbol_next(p_tmp, i) < 0)
+ break;
+ kdb_printf("%s ", p_tmp);
+ *(p_tmp + len) = '\0';
+ }
+ if (i >= dtab_count)
+ kdb_printf("...");
+ kdb_printf("\n");
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ } else if (tab != 2 && count > 0) {
+ len_tmp = strlen(p_tmp);
+ strncpy(p_tmp+len_tmp, cp, lastchar-cp+1);
+ len_tmp = strlen(p_tmp);
+ strncpy(cp, p_tmp+len, len_tmp-len + 1);
+ len = len_tmp - len;
+ kdb_printf("%s", cp);
+ cp += len;
+ lastchar += len;
+ }
+ kdb_nextline = 1; /* reset output line number */
+ break;
+ default:
+ if (key >= 32 && lastchar < bufend) {
+ if (cp < lastchar) {
+ memcpy(tmpbuffer, cp, lastchar - cp);
+ memcpy(cp+1, tmpbuffer, lastchar - cp);
+ *++lastchar = '\0';
+ *cp = key;
+ kdb_printf("%s\r", cp);
+ ++cp;
+ tmp = *cp;
+ *cp = '\0';
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ *cp = tmp;
+ } else {
+ *++lastchar = '\0';
+ *cp++ = key;
+ /* The kgdb transition check will hide
+ * printed characters if we think that
+ * kgdb is connecting, until the check
+ * fails */
+ if (!KDB_STATE(KGDB_TRANS)) {
+ if (kgdb_transition_check(buffer))
+ return buffer;
+ } else {
+ kdb_printf("%c", key);
+ }
+ }
+ /* Special escape to kgdb */
+ if (lastchar - buffer >= 5 &&
+ strcmp(lastchar - 5, "$?#3f") == 0) {
+ kdb_gdb_state_pass(lastchar - 5);
+ strcpy(buffer, "kgdb");
+ KDB_STATE_SET(DOING_KGDB);
+ return buffer;
+ }
+ if (lastchar - buffer >= 11 &&
+ strcmp(lastchar - 11, "$qSupported") == 0) {
+ kdb_gdb_state_pass(lastchar - 11);
+ strcpy(buffer, "kgdb");
+ KDB_STATE_SET(DOING_KGDB);
+ return buffer;
+ }
+ }
+ break;
+ }
+ goto poll_again;
+}
+
+/*
+ * kdb_getstr
+ *
+ * Print the prompt string and read a command from the
+ * input device.
+ *
+ * Parameters:
+ * buffer Address of buffer to receive command
+ * bufsize Size of buffer in bytes
+ * prompt Pointer to string to use as prompt string
+ * Returns:
+ * Pointer to command buffer.
+ * Locking:
+ * None.
+ * Remarks:
+ * For SMP kernels, the processor number will be
+ * substituted for %d, %x or %o in the prompt.
+ */
+
+char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
+{
+ if (prompt && kdb_prompt_str != prompt)
+ strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
+ kdb_printf(kdb_prompt_str);
+ kdb_nextline = 1; /* Prompt and input resets line number */
+ return kdb_read(buffer, bufsize);
+}
+
+/*
+ * kdb_input_flush
+ *
+ * Get rid of any buffered console input.
+ *
+ * Parameters:
+ * none
+ * Returns:
+ * nothing
+ * Locking:
+ * none
+ * Remarks:
+ * Call this function whenever you want to flush input. If there is any
+ * outstanding input, it ignores all characters until there has been no
+ * data for approximately 1ms.
+ */
+
+static void kdb_input_flush(void)
+{
+ get_char_func *f;
+ int res;
+ int flush_delay = 1;
+ while (flush_delay) {
+ flush_delay--;
+empty:
+ touch_nmi_watchdog();
+ for (f = &kdb_poll_funcs[0]; *f; ++f) {
+ res = (*f)();
+ if (res != -1) {
+ flush_delay = 1;
+ goto empty;
+ }
+ }
+ if (flush_delay)
+ mdelay(1);
+ }
+}
+
+/*
+ * kdb_printf
+ *
+ * Print a string to the output device(s).
+ *
+ * Parameters:
+ * printf-like format and optional args.
+ * Returns:
+ * 0
+ * Locking:
+ * None.
+ * Remarks:
+ * use 'kdbcons->write()' to avoid polluting 'log_buf' with
+ * kdb output.
+ *
+ * If the user is doing a cmd args | grep srch
+ * then kdb_grepping_flag is set.
+ * In that case we need to accumulate full lines (ending in \n) before
+ * searching for the pattern.
+ */
+
+static char kdb_buffer[256]; /* A bit too big to go on stack */
+static char *next_avail = kdb_buffer;
+static int size_avail;
+static int suspend_grep;
+
+/*
+ * search arg1 to see if it contains arg2
+ * (kdmain.c provides flags for ^pat and pat$)
+ *
+ * return 1 for found, 0 for not found
+ */
+static int kdb_search_string(char *searched, char *searchfor)
+{
+ char firstchar, *cp;
+ int len1, len2;
+
+ /* not counting the newline at the end of "searched" */
+ len1 = strlen(searched)-1;
+ len2 = strlen(searchfor);
+ if (len1 < len2)
+ return 0;
+ if (kdb_grep_leading && kdb_grep_trailing && len1 != len2)
+ return 0;
+ if (kdb_grep_leading) {
+ if (!strncmp(searched, searchfor, len2))
+ return 1;
+ } else if (kdb_grep_trailing) {
+ if (!strncmp(searched+len1-len2, searchfor, len2))
+ return 1;
+ } else {
+ firstchar = *searchfor;
+ cp = searched;
+ while ((cp = strchr(cp, firstchar))) {
+ if (!strncmp(cp, searchfor, len2))
+ return 1;
+ cp++;
+ }
+ }
+ return 0;
+}
+
+int vkdb_printf(const char *fmt, va_list ap)
+{
+ int diag;
+ int linecount;
+ int colcount;
+ int logging, saved_loglevel = 0;
+ int saved_trap_printk;
+ int got_printf_lock = 0;
+ int retlen = 0;
+ int fnd, len;
+ char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
+ char *moreprompt = "more> ";
+ struct console *c = console_drivers;
+ static DEFINE_SPINLOCK(kdb_printf_lock);
+ unsigned long uninitialized_var(flags);
+
+ preempt_disable();
+ saved_trap_printk = kdb_trap_printk;
+ kdb_trap_printk = 0;
+
+ /* Serialize kdb_printf if multiple cpus try to write at once.
+ * But if any cpu goes recursive in kdb, just print the output,
+ * even if it is interleaved with any other text.
+ */
+ if (!KDB_STATE(PRINTF_LOCK)) {
+ KDB_STATE_SET(PRINTF_LOCK);
+ spin_lock_irqsave(&kdb_printf_lock, flags);
+ got_printf_lock = 1;
+ atomic_inc(&kdb_event);
+ } else {
+ __acquire(kdb_printf_lock);
+ }
+
+ diag = kdbgetintenv("LINES", &linecount);
+ if (diag || linecount <= 1)
+ linecount = 24;
+
+ diag = kdbgetintenv("COLUMNS", &colcount);
+ if (diag || colcount <= 1)
+ colcount = 80;
+
+ diag = kdbgetintenv("LOGGING", &logging);
+ if (diag)
+ logging = 0;
+
+ if (!kdb_grepping_flag || suspend_grep) {
+ /* normally, every vsnprintf starts a new buffer */
+ next_avail = kdb_buffer;
+ size_avail = sizeof(kdb_buffer);
+ }
+ vsnprintf(next_avail, size_avail, fmt, ap);
+
+ /*
+ * If kdb_parse() found that the command was cmd xxx | grep yyy
+ * then kdb_grepping_flag is set, and kdb_grep_string contains yyy
+ *
+ * Accumulate the print data up to a newline before searching it.
+ * (vsnprintf does null-terminate the string that it generates)
+ */
+
+ /* skip the search if prints are temporarily unconditional */
+ if (!suspend_grep && kdb_grepping_flag) {
+ cp = strchr(kdb_buffer, '\n');
+ if (!cp) {
+ /*
+ * Special cases that don't end with newlines
+ * but should be written without one:
+ * The "[nn]kdb> " prompt should
+ * appear at the front of the buffer.
+ *
+ * The "[nn]more " prompt should also be
+ * (MOREPROMPT -> moreprompt)
+ * written * but we print that ourselves,
+ * we set the suspend_grep flag to make
+ * it unconditional.
+ *
+ */
+ if (next_avail == kdb_buffer) {
+ /*
+ * these should occur after a newline,
+ * so they will be at the front of the
+ * buffer
+ */
+ cp2 = kdb_buffer;
+ len = strlen(kdb_prompt_str);
+ if (!strncmp(cp2, kdb_prompt_str, len)) {
+ /*
+ * We're about to start a new
+ * command, so we can go back
+ * to normal mode.
+ */
+ kdb_grepping_flag = 0;
+ goto kdb_printit;
+ }
+ }
+ /* no newline; don't search/write the buffer
+ until one is there */
+ len = strlen(kdb_buffer);
+ next_avail = kdb_buffer + len;
+ size_avail = sizeof(kdb_buffer) - len;
+ goto kdb_print_out;
+ }
+
+ /*
+ * The newline is present; print through it or discard
+ * it, depending on the results of the search.
+ */
+ cp++; /* to byte after the newline */
+ replaced_byte = *cp; /* remember what/where it was */
+ cphold = cp;
+ *cp = '\0'; /* end the string for our search */
+
+ /*
+ * We now have a newline at the end of the string
+ * Only continue with this output if it contains the
+ * search string.
+ */
+ fnd = kdb_search_string(kdb_buffer, kdb_grep_string);
+ if (!fnd) {
+ /*
+ * At this point the complete line at the start
+ * of kdb_buffer can be discarded, as it does
+ * not contain what the user is looking for.
+ * Shift the buffer left.
+ */
+ *cphold = replaced_byte;
+ strcpy(kdb_buffer, cphold);
+ len = strlen(kdb_buffer);
+ next_avail = kdb_buffer + len;
+ size_avail = sizeof(kdb_buffer) - len;
+ goto kdb_print_out;
+ }
+ /*
+ * at this point the string is a full line and
+ * should be printed, up to the null.
+ */
+ }
+kdb_printit:
+
+ /*
+ * Write to all consoles.
+ */
+ retlen = strlen(kdb_buffer);
+ if (!dbg_kdb_mode && kgdb_connected) {
+ gdbstub_msg_write(kdb_buffer, retlen);
+ } else {
+ if (dbg_io_ops && !dbg_io_ops->is_console) {
+ len = retlen;
+ cp = kdb_buffer;
+ while (len--) {
+ dbg_io_ops->write_char(*cp);
+ cp++;
+ }
+ }
+ while (c) {
+ c->write(c, kdb_buffer, retlen);
+ touch_nmi_watchdog();
+ c = c->next;
+ }
+ }
+ if (logging) {
+ saved_loglevel = console_loglevel;
+ console_loglevel = CONSOLE_LOGLEVEL_SILENT;
+ printk(KERN_INFO "%s", kdb_buffer);
+ }
+
+ if (KDB_STATE(PAGER)) {
+ /*
+ * Check printed string to decide how to bump the
+ * kdb_nextline to control when the more prompt should
+ * show up.
+ */
+ int got = 0;
+ len = retlen;
+ while (len--) {
+ if (kdb_buffer[len] == '\n') {
+ kdb_nextline++;
+ got = 0;
+ } else if (kdb_buffer[len] == '\r') {
+ got = 0;
+ } else {
+ got++;
+ }
+ }
+ kdb_nextline += got / (colcount + 1);
+ }
+
+ /* check for having reached the LINES number of printed lines */
+ if (kdb_nextline >= linecount) {
+ char buf1[16] = "";
+
+ /* Watch out for recursion here. Any routine that calls
+ * kdb_printf will come back through here. And kdb_read
+ * uses kdb_printf to echo on serial consoles ...
+ */
+ kdb_nextline = 1; /* In case of recursion */
+
+ /*
+ * Pause until cr.
+ */
+ moreprompt = kdbgetenv("MOREPROMPT");
+ if (moreprompt == NULL)
+ moreprompt = "more> ";
+
+ kdb_input_flush();
+ c = console_drivers;
+
+ if (dbg_io_ops && !dbg_io_ops->is_console) {
+ len = strlen(moreprompt);
+ cp = moreprompt;
+ while (len--) {
+ dbg_io_ops->write_char(*cp);
+ cp++;
+ }
+ }
+ while (c) {
+ c->write(c, moreprompt, strlen(moreprompt));
+ touch_nmi_watchdog();
+ c = c->next;
+ }
+
+ if (logging)
+ printk("%s", moreprompt);
+
+ kdb_read(buf1, 2); /* '2' indicates to return
+ * immediately after getting one key. */
+ kdb_nextline = 1; /* Really set output line 1 */
+
+ /* empty and reset the buffer: */
+ kdb_buffer[0] = '\0';
+ next_avail = kdb_buffer;
+ size_avail = sizeof(kdb_buffer);
+ if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
+ /* user hit q or Q */
+ KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
+ KDB_STATE_CLEAR(PAGER);
+ /* end of command output; back to normal mode */
+ kdb_grepping_flag = 0;
+ kdb_printf("\n");
+ } else if (buf1[0] == ' ') {
+ kdb_printf("\r");
+ suspend_grep = 1; /* for this recursion */
+ } else if (buf1[0] == '\n') {
+ kdb_nextline = linecount - 1;
+ kdb_printf("\r");
+ suspend_grep = 1; /* for this recursion */
+ } else if (buf1[0] && buf1[0] != '\n') {
+ /* user hit something other than enter */
+ suspend_grep = 1; /* for this recursion */
+ kdb_printf("\nOnly 'q' or 'Q' are processed at more "
+ "prompt, input ignored\n");
+ } else if (kdb_grepping_flag) {
+ /* user hit enter */
+ suspend_grep = 1; /* for this recursion */
+ kdb_printf("\n");
+ }
+ kdb_input_flush();
+ }
+
+ /*
+ * For grep searches, shift the printed string left.
+ * replaced_byte contains the character that was overwritten with
+ * the terminating null, and cphold points to the null.
+ * Then adjust the notion of available space in the buffer.
+ */
+ if (kdb_grepping_flag && !suspend_grep) {
+ *cphold = replaced_byte;
+ strcpy(kdb_buffer, cphold);
+ len = strlen(kdb_buffer);
+ next_avail = kdb_buffer + len;
+ size_avail = sizeof(kdb_buffer) - len;
+ }
+
+kdb_print_out:
+ suspend_grep = 0; /* end of what may have been a recursive call */
+ if (logging)
+ console_loglevel = saved_loglevel;
+ if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
+ got_printf_lock = 0;
+ spin_unlock_irqrestore(&kdb_printf_lock, flags);
+ KDB_STATE_CLEAR(PRINTF_LOCK);
+ atomic_dec(&kdb_event);
+ } else {
+ __release(kdb_printf_lock);
+ }
+ kdb_trap_printk = saved_trap_printk;
+ preempt_enable();
+ return retlen;
+}
+
+int kdb_printf(const char *fmt, ...)
+{
+ va_list ap;
+ int r;
+
+ va_start(ap, fmt);
+ r = vkdb_printf(fmt, ap);
+ va_end(ap);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kdb_printf);
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
new file mode 100644
index 00000000000..118527aa60e
--- /dev/null
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -0,0 +1,263 @@
+/*
+ * Kernel Debugger Architecture Dependent Console I/O handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.
+ *
+ * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/kdb.h>
+#include <linux/keyboard.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/io.h>
+
+/* Keyboard Controller Registers on normal PCs. */
+
+#define KBD_STATUS_REG 0x64 /* Status register (R) */
+#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */
+
+/* Status Register Bits */
+
+#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
+#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
+
+static int kbd_exists;
+static int kbd_last_ret;
+
+/*
+ * Check if the keyboard controller has a keypress for us.
+ * Some parts (Enter Release, LED change) are still blocking polled here,
+ * but hopefully they are all short.
+ */
+int kdb_get_kbd_char(void)
+{
+ int scancode, scanstatus;
+ static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */
+ static int shift_key; /* Shift next keypress */
+ static int ctrl_key;
+ u_short keychar;
+
+ if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) ||
+ (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) {
+ kbd_exists = 0;
+ return -1;
+ }
+ kbd_exists = 1;
+
+ if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
+ return -1;
+
+ /*
+ * Fetch the scancode
+ */
+ scancode = inb(KBD_DATA_REG);
+ scanstatus = inb(KBD_STATUS_REG);
+
+ /*
+ * Ignore mouse events.
+ */
+ if (scanstatus & KBD_STAT_MOUSE_OBF)
+ return -1;
+
+ /*
+ * Ignore release, trigger on make
+ * (except for shift keys, where we want to
+ * keep the shift state so long as the key is
+ * held down).
+ */
+
+ if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) {
+ /*
+ * Next key may use shift table
+ */
+ if ((scancode & 0x80) == 0)
+ shift_key = 1;
+ else
+ shift_key = 0;
+ return -1;
+ }
+
+ if ((scancode&0x7f) == 0x1d) {
+ /*
+ * Left ctrl key
+ */
+ if ((scancode & 0x80) == 0)
+ ctrl_key = 1;
+ else
+ ctrl_key = 0;
+ return -1;
+ }
+
+ if ((scancode & 0x80) != 0) {
+ if (scancode == 0x9c)
+ kbd_last_ret = 0;
+ return -1;
+ }
+
+ scancode &= 0x7f;
+
+ /*
+ * Translate scancode
+ */
+
+ if (scancode == 0x3a) {
+ /*
+ * Toggle caps lock
+ */
+ shift_lock ^= 1;
+
+#ifdef KDB_BLINK_LED
+ kdb_toggleled(0x4);
+#endif
+ return -1;
+ }
+
+ if (scancode == 0x0e) {
+ /*
+ * Backspace
+ */
+ return 8;
+ }
+
+ /* Special Key */
+ switch (scancode) {
+ case 0xF: /* Tab */
+ return 9;
+ case 0x53: /* Del */
+ return 4;
+ case 0x47: /* Home */
+ return 1;
+ case 0x4F: /* End */
+ return 5;
+ case 0x4B: /* Left */
+ return 2;
+ case 0x48: /* Up */
+ return 16;
+ case 0x50: /* Down */
+ return 14;
+ case 0x4D: /* Right */
+ return 6;
+ }
+
+ if (scancode == 0xe0)
+ return -1;
+
+ /*
+ * For Japanese 86/106 keyboards
+ * See comment in drivers/char/pc_keyb.c.
+ * - Masahiro Adegawa
+ */
+ if (scancode == 0x73)
+ scancode = 0x59;
+ else if (scancode == 0x7d)
+ scancode = 0x7c;
+
+ if (!shift_lock && !shift_key && !ctrl_key) {
+ keychar = plain_map[scancode];
+ } else if ((shift_lock || shift_key) && key_maps[1]) {
+ keychar = key_maps[1][scancode];
+ } else if (ctrl_key && key_maps[4]) {
+ keychar = key_maps[4][scancode];
+ } else {
+ keychar = 0x0020;
+ kdb_printf("Unknown state/scancode (%d)\n", scancode);
+ }
+ keychar &= 0x0fff;
+ if (keychar == '\t')
+ keychar = ' ';
+ switch (KTYP(keychar)) {
+ case KT_LETTER:
+ case KT_LATIN:
+ if (isprint(keychar))
+ break; /* printable characters */
+ /* drop through */
+ case KT_SPEC:
+ if (keychar == K_ENTER)
+ break;
+ /* drop through */
+ default:
+ return -1; /* ignore unprintables */
+ }
+
+ if (scancode == 0x1c) {
+ kbd_last_ret = 1;
+ return 13;
+ }
+
+ return keychar & 0xff;
+}
+EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
+
+/*
+ * Best effort cleanup of ENTER break codes on leaving KDB. Called on
+ * exiting KDB, when we know we processed an ENTER or KP ENTER scan
+ * code.
+ */
+void kdb_kbd_cleanup_state(void)
+{
+ int scancode, scanstatus;
+
+ /*
+ * Nothing to clean up, since either
+ * ENTER was never pressed, or has already
+ * gotten cleaned up.
+ */
+ if (!kbd_last_ret)
+ return;
+
+ kbd_last_ret = 0;
+ /*
+ * Enter key. Need to absorb the break code here, lest it gets
+ * leaked out if we exit KDB as the result of processing 'g'.
+ *
+ * This has several interesting implications:
+ * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
+ * + Need to handle repeat ENTER and repeat KP ENTER. Repeats
+ * only get a break code at the end of the repeated
+ * sequence. This means we can't propagate the repeated key
+ * press, and must swallow it away.
+ * + Need to handle possible PS/2 mouse input.
+ * + Need to handle mashed keys.
+ */
+
+ while (1) {
+ while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
+ cpu_relax();
+
+ /*
+ * Fetch the scancode.
+ */
+ scancode = inb(KBD_DATA_REG);
+ scanstatus = inb(KBD_STATUS_REG);
+
+ /*
+ * Skip mouse input.
+ */
+ if (scanstatus & KBD_STAT_MOUSE_OBF)
+ continue;
+
+ /*
+ * If we see 0xe0, this is either a break code for KP
+ * ENTER, or a repeat make for KP ENTER. Either way,
+ * since the second byte is equivalent to an ENTER,
+ * skip the 0xe0 and try again.
+ *
+ * If we see 0x1c, this must be a repeat ENTER or KP
+ * ENTER (and we swallowed 0xe0 before). Try again.
+ *
+ * We can also see make and break codes for other keys
+ * mashed before or after pressing ENTER. Thus, if we
+ * see anything other than 0x9c, we have to try again.
+ *
+ * Note, if you held some key as ENTER was depressed,
+ * that break code would get leaked out.
+ */
+ if (scancode != 0x9c)
+ continue;
+
+ return;
+ }
+}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
new file mode 100644
index 00000000000..2f7c760305c
--- /dev/null
+++ b/kernel/debug/kdb/kdb_main.c
@@ -0,0 +1,2879 @@
+/*
+ * Kernel Debugger Architecture Independent Main Code
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
+ * Xscale (R) modifications copyright (C) 2003 Intel Corporation.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/kmsg_dump.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/sysrq.h>
+#include <linux/smp.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/notifier.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/nmi.h>
+#include <linux/time.h>
+#include <linux/ptrace.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/kdebug.h>
+#include <linux/proc_fs.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include "kdb_private.h"
+
+#define GREP_LEN 256
+char kdb_grep_string[GREP_LEN];
+int kdb_grepping_flag;
+EXPORT_SYMBOL(kdb_grepping_flag);
+int kdb_grep_leading;
+int kdb_grep_trailing;
+
+/*
+ * Kernel debugger state flags
+ */
+int kdb_flags;
+atomic_t kdb_event;
+
+/*
+ * kdb_lock protects updates to kdb_initial_cpu. Used to
+ * single thread processors through the kernel debugger.
+ */
+int kdb_initial_cpu = -1; /* cpu number that owns kdb */
+int kdb_nextline = 1;
+int kdb_state; /* General KDB state */
+
+struct task_struct *kdb_current_task;
+EXPORT_SYMBOL(kdb_current_task);
+struct pt_regs *kdb_current_regs;
+
+const char *kdb_diemsg;
+static int kdb_go_count;
+#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC
+static unsigned int kdb_continue_catastrophic =
+ CONFIG_KDB_CONTINUE_CATASTROPHIC;
+#else
+static unsigned int kdb_continue_catastrophic;
+#endif
+
+/* kdb_commands describes the available commands. */
+static kdbtab_t *kdb_commands;
+#define KDB_BASE_CMD_MAX 50
+static int kdb_max_commands = KDB_BASE_CMD_MAX;
+static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
+#define for_each_kdbcmd(cmd, num) \
+ for ((cmd) = kdb_base_commands, (num) = 0; \
+ num < kdb_max_commands; \
+ num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
+
+typedef struct _kdbmsg {
+ int km_diag; /* kdb diagnostic */
+ char *km_msg; /* Corresponding message text */
+} kdbmsg_t;
+
+#define KDBMSG(msgnum, text) \
+ { KDB_##msgnum, text }
+
+static kdbmsg_t kdbmsgs[] = {
+ KDBMSG(NOTFOUND, "Command Not Found"),
+ KDBMSG(ARGCOUNT, "Improper argument count, see usage."),
+ KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, "
+ "8 is only allowed on 64 bit systems"),
+ KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"),
+ KDBMSG(NOTENV, "Cannot find environment variable"),
+ KDBMSG(NOENVVALUE, "Environment variable should have value"),
+ KDBMSG(NOTIMP, "Command not implemented"),
+ KDBMSG(ENVFULL, "Environment full"),
+ KDBMSG(ENVBUFFULL, "Environment buffer full"),
+ KDBMSG(TOOMANYBPT, "Too many breakpoints defined"),
+#ifdef CONFIG_CPU_XSCALE
+ KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"),
+#else
+ KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"),
+#endif
+ KDBMSG(DUPBPT, "Duplicate breakpoint address"),
+ KDBMSG(BPTNOTFOUND, "Breakpoint not found"),
+ KDBMSG(BADMODE, "Invalid IDMODE"),
+ KDBMSG(BADINT, "Illegal numeric value"),
+ KDBMSG(INVADDRFMT, "Invalid symbolic address format"),
+ KDBMSG(BADREG, "Invalid register name"),
+ KDBMSG(BADCPUNUM, "Invalid cpu number"),
+ KDBMSG(BADLENGTH, "Invalid length field"),
+ KDBMSG(NOBP, "No Breakpoint exists"),
+ KDBMSG(BADADDR, "Invalid address"),
+};
+#undef KDBMSG
+
+static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
+
+
+/*
+ * Initial environment. This is all kept static and local to
+ * this file. We don't want to rely on the memory allocation
+ * mechanisms in the kernel, so we use a very limited allocate-only
+ * heap for new and altered environment variables. The entire
+ * environment is limited to a fixed number of entries (add more
+ * to __env[] if required) and a fixed amount of heap (add more to
+ * KDB_ENVBUFSIZE if required).
+ */
+
+static char *__env[] = {
+#if defined(CONFIG_SMP)
+ "PROMPT=[%d]kdb> ",
+#else
+ "PROMPT=kdb> ",
+#endif
+ "MOREPROMPT=more> ",
+ "RADIX=16",
+ "MDCOUNT=8", /* lines of md output */
+ KDB_PLATFORM_ENV,
+ "DTABCOUNT=30",
+ "NOSECT=1",
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+};
+
+static const int __nenv = ARRAY_SIZE(__env);
+
+struct task_struct *kdb_curr_task(int cpu)
+{
+ struct task_struct *p = curr_task(cpu);
+#ifdef _TIF_MCA_INIT
+ if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
+ p = krp->p;
+#endif
+ return p;
+}
+
+/*
+ * kdbgetenv - This function will return the character string value of
+ * an environment variable.
+ * Parameters:
+ * match A character string representing an environment variable.
+ * Returns:
+ * NULL No environment variable matches 'match'
+ * char* Pointer to string value of environment variable.
+ */
+char *kdbgetenv(const char *match)
+{
+ char **ep = __env;
+ int matchlen = strlen(match);
+ int i;
+
+ for (i = 0; i < __nenv; i++) {
+ char *e = *ep++;
+
+ if (!e)
+ continue;
+
+ if ((strncmp(match, e, matchlen) == 0)
+ && ((e[matchlen] == '\0')
+ || (e[matchlen] == '='))) {
+ char *cp = strchr(e, '=');
+ return cp ? ++cp : "";
+ }
+ }
+ return NULL;
+}
+
+/*
+ * kdballocenv - This function is used to allocate bytes for
+ * environment entries.
+ * Parameters:
+ * match A character string representing a numeric value
+ * Outputs:
+ * *value the unsigned long representation of the env variable 'match'
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ * Remarks:
+ * We use a static environment buffer (envbuffer) to hold the values
+ * of dynamically generated environment variables (see kdb_set). Buffer
+ * space once allocated is never free'd, so over time, the amount of space
+ * (currently 512 bytes) will be exhausted if env variables are changed
+ * frequently.
+ */
+static char *kdballocenv(size_t bytes)
+{
+#define KDB_ENVBUFSIZE 512
+ static char envbuffer[KDB_ENVBUFSIZE];
+ static int envbufsize;
+ char *ep = NULL;
+
+ if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) {
+ ep = &envbuffer[envbufsize];
+ envbufsize += bytes;
+ }
+ return ep;
+}
+
+/*
+ * kdbgetulenv - This function will return the value of an unsigned
+ * long-valued environment variable.
+ * Parameters:
+ * match A character string representing a numeric value
+ * Outputs:
+ * *value the unsigned long represntation of the env variable 'match'
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ */
+static int kdbgetulenv(const char *match, unsigned long *value)
+{
+ char *ep;
+
+ ep = kdbgetenv(match);
+ if (!ep)
+ return KDB_NOTENV;
+ if (strlen(ep) == 0)
+ return KDB_NOENVVALUE;
+
+ *value = simple_strtoul(ep, NULL, 0);
+
+ return 0;
+}
+
+/*
+ * kdbgetintenv - This function will return the value of an
+ * integer-valued environment variable.
+ * Parameters:
+ * match A character string representing an integer-valued env variable
+ * Outputs:
+ * *value the integer representation of the environment variable 'match'
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ */
+int kdbgetintenv(const char *match, int *value)
+{
+ unsigned long val;
+ int diag;
+
+ diag = kdbgetulenv(match, &val);
+ if (!diag)
+ *value = (int) val;
+ return diag;
+}
+
+/*
+ * kdbgetularg - This function will convert a numeric string into an
+ * unsigned long value.
+ * Parameters:
+ * arg A character string representing a numeric value
+ * Outputs:
+ * *value the unsigned long represntation of arg.
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ */
+int kdbgetularg(const char *arg, unsigned long *value)
+{
+ char *endp;
+ unsigned long val;
+
+ val = simple_strtoul(arg, &endp, 0);
+
+ if (endp == arg) {
+ /*
+ * Also try base 16, for us folks too lazy to type the
+ * leading 0x...
+ */
+ val = simple_strtoul(arg, &endp, 16);
+ if (endp == arg)
+ return KDB_BADINT;
+ }
+
+ *value = val;
+
+ return 0;
+}
+
+int kdbgetu64arg(const char *arg, u64 *value)
+{
+ char *endp;
+ u64 val;
+
+ val = simple_strtoull(arg, &endp, 0);
+
+ if (endp == arg) {
+
+ val = simple_strtoull(arg, &endp, 16);
+ if (endp == arg)
+ return KDB_BADINT;
+ }
+
+ *value = val;
+
+ return 0;
+}
+
+/*
+ * kdb_set - This function implements the 'set' command. Alter an
+ * existing environment variable or create a new one.
+ */
+int kdb_set(int argc, const char **argv)
+{
+ int i;
+ char *ep;
+ size_t varlen, vallen;
+
+ /*
+ * we can be invoked two ways:
+ * set var=value argv[1]="var", argv[2]="value"
+ * set var = value argv[1]="var", argv[2]="=", argv[3]="value"
+ * - if the latter, shift 'em down.
+ */
+ if (argc == 3) {
+ argv[2] = argv[3];
+ argc--;
+ }
+
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+
+ /*
+ * Check for internal variables
+ */
+ if (strcmp(argv[1], "KDBDEBUG") == 0) {
+ unsigned int debugflags;
+ char *cp;
+
+ debugflags = simple_strtoul(argv[2], &cp, 0);
+ if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
+ kdb_printf("kdb: illegal debug flags '%s'\n",
+ argv[2]);
+ return 0;
+ }
+ kdb_flags = (kdb_flags &
+ ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
+ | (debugflags << KDB_DEBUG_FLAG_SHIFT);
+
+ return 0;
+ }
+
+ /*
+ * Tokenizer squashed the '=' sign. argv[1] is variable
+ * name, argv[2] = value.
+ */
+ varlen = strlen(argv[1]);
+ vallen = strlen(argv[2]);
+ ep = kdballocenv(varlen + vallen + 2);
+ if (ep == (char *)0)
+ return KDB_ENVBUFFULL;
+
+ sprintf(ep, "%s=%s", argv[1], argv[2]);
+
+ ep[varlen+vallen+1] = '\0';
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i]
+ && ((strncmp(__env[i], argv[1], varlen) == 0)
+ && ((__env[i][varlen] == '\0')
+ || (__env[i][varlen] == '=')))) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ /*
+ * Wasn't existing variable. Fit into slot.
+ */
+ for (i = 0; i < __nenv-1; i++) {
+ if (__env[i] == (char *)0) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ return KDB_ENVFULL;
+}
+
+static int kdb_check_regs(void)
+{
+ if (!kdb_current_regs) {
+ kdb_printf("No current kdb registers."
+ " You may need to select another task\n");
+ return KDB_BADREG;
+ }
+ return 0;
+}
+
+/*
+ * kdbgetaddrarg - This function is responsible for parsing an
+ * address-expression and returning the value of the expression,
+ * symbol name, and offset to the caller.
+ *
+ * The argument may consist of a numeric value (decimal or
+ * hexidecimal), a symbol name, a register name (preceded by the
+ * percent sign), an environment variable with a numeric value
+ * (preceded by a dollar sign) or a simple arithmetic expression
+ * consisting of a symbol name, +/-, and a numeric constant value
+ * (offset).
+ * Parameters:
+ * argc - count of arguments in argv
+ * argv - argument vector
+ * *nextarg - index to next unparsed argument in argv[]
+ * regs - Register state at time of KDB entry
+ * Outputs:
+ * *value - receives the value of the address-expression
+ * *offset - receives the offset specified, if any
+ * *name - receives the symbol name, if any
+ * *nextarg - index to next unparsed argument in argv[]
+ * Returns:
+ * zero is returned on success, a kdb diagnostic code is
+ * returned on error.
+ */
+int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
+ unsigned long *value, long *offset,
+ char **name)
+{
+ unsigned long addr;
+ unsigned long off = 0;
+ int positive;
+ int diag;
+ int found = 0;
+ char *symname;
+ char symbol = '\0';
+ char *cp;
+ kdb_symtab_t symtab;
+
+ /*
+ * Process arguments which follow the following syntax:
+ *
+ * symbol | numeric-address [+/- numeric-offset]
+ * %register
+ * $environment-variable
+ */
+
+ if (*nextarg > argc)
+ return KDB_ARGCOUNT;
+
+ symname = (char *)argv[*nextarg];
+
+ /*
+ * If there is no whitespace between the symbol
+ * or address and the '+' or '-' symbols, we
+ * remember the character and replace it with a
+ * null so the symbol/value can be properly parsed
+ */
+ cp = strpbrk(symname, "+-");
+ if (cp != NULL) {
+ symbol = *cp;
+ *cp++ = '\0';
+ }
+
+ if (symname[0] == '$') {
+ diag = kdbgetulenv(&symname[1], &addr);
+ if (diag)
+ return diag;
+ } else if (symname[0] == '%') {
+ diag = kdb_check_regs();
+ if (diag)
+ return diag;
+ /* Implement register values with % at a later time as it is
+ * arch optional.
+ */
+ return KDB_NOTIMP;
+ } else {
+ found = kdbgetsymval(symname, &symtab);
+ if (found) {
+ addr = symtab.sym_start;
+ } else {
+ diag = kdbgetularg(argv[*nextarg], &addr);
+ if (diag)
+ return diag;
+ }
+ }
+
+ if (!found)
+ found = kdbnearsym(addr, &symtab);
+
+ (*nextarg)++;
+
+ if (name)
+ *name = symname;
+ if (value)
+ *value = addr;
+ if (offset && name && *name)
+ *offset = addr - symtab.sym_start;
+
+ if ((*nextarg > argc)
+ && (symbol == '\0'))
+ return 0;
+
+ /*
+ * check for +/- and offset
+ */
+
+ if (symbol == '\0') {
+ if ((argv[*nextarg][0] != '+')
+ && (argv[*nextarg][0] != '-')) {
+ /*
+ * Not our argument. Return.
+ */
+ return 0;
+ } else {
+ positive = (argv[*nextarg][0] == '+');
+ (*nextarg)++;
+ }
+ } else
+ positive = (symbol == '+');
+
+ /*
+ * Now there must be an offset!
+ */
+ if ((*nextarg > argc)
+ && (symbol == '\0')) {
+ return KDB_INVADDRFMT;
+ }
+
+ if (!symbol) {
+ cp = (char *)argv[*nextarg];
+ (*nextarg)++;
+ }
+
+ diag = kdbgetularg(cp, &off);
+ if (diag)
+ return diag;
+
+ if (!positive)
+ off = -off;
+
+ if (offset)
+ *offset += off;
+
+ if (value)
+ *value += off;
+
+ return 0;
+}
+
+static void kdb_cmderror(int diag)
+{
+ int i;
+
+ if (diag >= 0) {
+ kdb_printf("no error detected (diagnostic is %d)\n", diag);
+ return;
+ }
+
+ for (i = 0; i < __nkdb_err; i++) {
+ if (kdbmsgs[i].km_diag == diag) {
+ kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg);
+ return;
+ }
+ }
+
+ kdb_printf("Unknown diag %d\n", -diag);
+}
+
+/*
+ * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd'
+ * command which defines one command as a set of other commands,
+ * terminated by endefcmd. kdb_defcmd processes the initial
+ * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for
+ * the following commands until 'endefcmd'.
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ */
+struct defcmd_set {
+ int count;
+ int usable;
+ char *name;
+ char *usage;
+ char *help;
+ char **command;
+};
+static struct defcmd_set *defcmd_set;
+static int defcmd_set_count;
+static int defcmd_in_progress;
+
+/* Forward references */
+static int kdb_exec_defcmd(int argc, const char **argv);
+
+static int kdb_defcmd2(const char *cmdstr, const char *argv0)
+{
+ struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
+ char **save_command = s->command;
+ if (strcmp(argv0, "endefcmd") == 0) {
+ defcmd_in_progress = 0;
+ if (!s->count)
+ s->usable = 0;
+ if (s->usable)
+ kdb_register(s->name, kdb_exec_defcmd,
+ s->usage, s->help, 0);
+ return 0;
+ }
+ if (!s->usable)
+ return KDB_NOTIMP;
+ s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
+ if (!s->command) {
+ kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
+ cmdstr);
+ s->usable = 0;
+ return KDB_NOTIMP;
+ }
+ memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
+ s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
+ kfree(save_command);
+ return 0;
+}
+
+static int kdb_defcmd(int argc, const char **argv)
+{
+ struct defcmd_set *save_defcmd_set = defcmd_set, *s;
+ if (defcmd_in_progress) {
+ kdb_printf("kdb: nested defcmd detected, assuming missing "
+ "endefcmd\n");
+ kdb_defcmd2("endefcmd", "endefcmd");
+ }
+ if (argc == 0) {
+ int i;
+ for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
+ kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
+ s->usage, s->help);
+ for (i = 0; i < s->count; ++i)
+ kdb_printf("%s", s->command[i]);
+ kdb_printf("endefcmd\n");
+ }
+ return 0;
+ }
+ if (argc != 3)
+ return KDB_ARGCOUNT;
+ if (in_dbg_master()) {
+ kdb_printf("Command only available during kdb_init()\n");
+ return KDB_NOTIMP;
+ }
+ defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
+ GFP_KDB);
+ if (!defcmd_set)
+ goto fail_defcmd;
+ memcpy(defcmd_set, save_defcmd_set,
+ defcmd_set_count * sizeof(*defcmd_set));
+ s = defcmd_set + defcmd_set_count;
+ memset(s, 0, sizeof(*s));
+ s->usable = 1;
+ s->name = kdb_strdup(argv[1], GFP_KDB);
+ if (!s->name)
+ goto fail_name;
+ s->usage = kdb_strdup(argv[2], GFP_KDB);
+ if (!s->usage)
+ goto fail_usage;
+ s->help = kdb_strdup(argv[3], GFP_KDB);
+ if (!s->help)
+ goto fail_help;
+ if (s->usage[0] == '"') {
+ strcpy(s->usage, argv[2]+1);
+ s->usage[strlen(s->usage)-1] = '\0';
+ }
+ if (s->help[0] == '"') {
+ strcpy(s->help, argv[3]+1);
+ s->help[strlen(s->help)-1] = '\0';
+ }
+ ++defcmd_set_count;
+ defcmd_in_progress = 1;
+ kfree(save_defcmd_set);
+ return 0;
+fail_help:
+ kfree(s->usage);
+fail_usage:
+ kfree(s->name);
+fail_name:
+ kfree(defcmd_set);
+fail_defcmd:
+ kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
+ defcmd_set = save_defcmd_set;
+ return KDB_NOTIMP;
+}
+
+/*
+ * kdb_exec_defcmd - Execute the set of commands associated with this
+ * defcmd name.
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ */
+static int kdb_exec_defcmd(int argc, const char **argv)
+{
+ int i, ret;
+ struct defcmd_set *s;
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+ for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
+ if (strcmp(s->name, argv[0]) == 0)
+ break;
+ }
+ if (i == defcmd_set_count) {
+ kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
+ argv[0]);
+ return KDB_NOTIMP;
+ }
+ for (i = 0; i < s->count; ++i) {
+ /* Recursive use of kdb_parse, do not use argv after
+ * this point */
+ argv = NULL;
+ kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
+ ret = kdb_parse(s->command[i]);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/* Command history */
+#define KDB_CMD_HISTORY_COUNT 32
+#define CMD_BUFLEN 200 /* kdb_printf: max printline
+ * size == 256 */
+static unsigned int cmd_head, cmd_tail;
+static unsigned int cmdptr;
+static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN];
+static char cmd_cur[CMD_BUFLEN];
+
+/*
+ * The "str" argument may point to something like | grep xyz
+ */
+static void parse_grep(const char *str)
+{
+ int len;
+ char *cp = (char *)str, *cp2;
+
+ /* sanity check: we should have been called with the \ first */
+ if (*cp != '|')
+ return;
+ cp++;
+ while (isspace(*cp))
+ cp++;
+ if (strncmp(cp, "grep ", 5)) {
+ kdb_printf("invalid 'pipe', see grephelp\n");
+ return;
+ }
+ cp += 5;
+ while (isspace(*cp))
+ cp++;
+ cp2 = strchr(cp, '\n');
+ if (cp2)
+ *cp2 = '\0'; /* remove the trailing newline */
+ len = strlen(cp);
+ if (len == 0) {
+ kdb_printf("invalid 'pipe', see grephelp\n");
+ return;
+ }
+ /* now cp points to a nonzero length search string */
+ if (*cp == '"') {
+ /* allow it be "x y z" by removing the "'s - there must
+ be two of them */
+ cp++;
+ cp2 = strchr(cp, '"');
+ if (!cp2) {
+ kdb_printf("invalid quoted string, see grephelp\n");
+ return;
+ }
+ *cp2 = '\0'; /* end the string where the 2nd " was */
+ }
+ kdb_grep_leading = 0;
+ if (*cp == '^') {
+ kdb_grep_leading = 1;
+ cp++;
+ }
+ len = strlen(cp);
+ kdb_grep_trailing = 0;
+ if (*(cp+len-1) == '$') {
+ kdb_grep_trailing = 1;
+ *(cp+len-1) = '\0';
+ }
+ len = strlen(cp);
+ if (!len)
+ return;
+ if (len >= GREP_LEN) {
+ kdb_printf("search string too long\n");
+ return;
+ }
+ strcpy(kdb_grep_string, cp);
+ kdb_grepping_flag++;
+ return;
+}
+
+/*
+ * kdb_parse - Parse the command line, search the command table for a
+ * matching command and invoke the command function. This
+ * function may be called recursively, if it is, the second call
+ * will overwrite argv and cbuf. It is the caller's
+ * responsibility to save their argv if they recursively call
+ * kdb_parse().
+ * Parameters:
+ * cmdstr The input command line to be parsed.
+ * regs The registers at the time kdb was entered.
+ * Returns:
+ * Zero for success, a kdb diagnostic if failure.
+ * Remarks:
+ * Limited to 20 tokens.
+ *
+ * Real rudimentary tokenization. Basically only whitespace
+ * is considered a token delimeter (but special consideration
+ * is taken of the '=' sign as used by the 'set' command).
+ *
+ * The algorithm used to tokenize the input string relies on
+ * there being at least one whitespace (or otherwise useless)
+ * character between tokens as the character immediately following
+ * the token is altered in-place to a null-byte to terminate the
+ * token string.
+ */
+
+#define MAXARGC 20
+
+int kdb_parse(const char *cmdstr)
+{
+ static char *argv[MAXARGC];
+ static int argc;
+ static char cbuf[CMD_BUFLEN+2];
+ char *cp;
+ char *cpp, quoted;
+ kdbtab_t *tp;
+ int i, escaped, ignore_errors = 0, check_grep;
+
+ /*
+ * First tokenize the command string.
+ */
+ cp = (char *)cmdstr;
+ kdb_grepping_flag = check_grep = 0;
+
+ if (KDB_FLAG(CMD_INTERRUPT)) {
+ /* Previous command was interrupted, newline must not
+ * repeat the command */
+ KDB_FLAG_CLEAR(CMD_INTERRUPT);
+ KDB_STATE_SET(PAGER);
+ argc = 0; /* no repeat */
+ }
+
+ if (*cp != '\n' && *cp != '\0') {
+ argc = 0;
+ cpp = cbuf;
+ while (*cp) {
+ /* skip whitespace */
+ while (isspace(*cp))
+ cp++;
+ if ((*cp == '\0') || (*cp == '\n') ||
+ (*cp == '#' && !defcmd_in_progress))
+ break;
+ /* special case: check for | grep pattern */
+ if (*cp == '|') {
+ check_grep++;
+ break;
+ }
+ if (cpp >= cbuf + CMD_BUFLEN) {
+ kdb_printf("kdb_parse: command buffer "
+ "overflow, command ignored\n%s\n",
+ cmdstr);
+ return KDB_NOTFOUND;
+ }
+ if (argc >= MAXARGC - 1) {
+ kdb_printf("kdb_parse: too many arguments, "
+ "command ignored\n%s\n", cmdstr);
+ return KDB_NOTFOUND;
+ }
+ argv[argc++] = cpp;
+ escaped = 0;
+ quoted = '\0';
+ /* Copy to next unquoted and unescaped
+ * whitespace or '=' */
+ while (*cp && *cp != '\n' &&
+ (escaped || quoted || !isspace(*cp))) {
+ if (cpp >= cbuf + CMD_BUFLEN)
+ break;
+ if (escaped) {
+ escaped = 0;
+ *cpp++ = *cp++;
+ continue;
+ }
+ if (*cp == '\\') {
+ escaped = 1;
+ ++cp;
+ continue;
+ }
+ if (*cp == quoted)
+ quoted = '\0';
+ else if (*cp == '\'' || *cp == '"')
+ quoted = *cp;
+ *cpp = *cp++;
+ if (*cpp == '=' && !quoted)
+ break;
+ ++cpp;
+ }
+ *cpp++ = '\0'; /* Squash a ws or '=' character */
+ }
+ }
+ if (!argc)
+ return 0;
+ if (check_grep)
+ parse_grep(cp);
+ if (defcmd_in_progress) {
+ int result = kdb_defcmd2(cmdstr, argv[0]);
+ if (!defcmd_in_progress) {
+ argc = 0; /* avoid repeat on endefcmd */
+ *(argv[0]) = '\0';
+ }
+ return result;
+ }
+ if (argv[0][0] == '-' && argv[0][1] &&
+ (argv[0][1] < '0' || argv[0][1] > '9')) {
+ ignore_errors = 1;
+ ++argv[0];
+ }
+
+ for_each_kdbcmd(tp, i) {
+ if (tp->cmd_name) {
+ /*
+ * If this command is allowed to be abbreviated,
+ * check to see if this is it.
+ */
+
+ if (tp->cmd_minlen
+ && (strlen(argv[0]) <= tp->cmd_minlen)) {
+ if (strncmp(argv[0],
+ tp->cmd_name,
+ tp->cmd_minlen) == 0) {
+ break;
+ }
+ }
+
+ if (strcmp(argv[0], tp->cmd_name) == 0)
+ break;
+ }
+ }
+
+ /*
+ * If we don't find a command by this name, see if the first
+ * few characters of this match any of the known commands.
+ * e.g., md1c20 should match md.
+ */
+ if (i == kdb_max_commands) {
+ for_each_kdbcmd(tp, i) {
+ if (tp->cmd_name) {
+ if (strncmp(argv[0],
+ tp->cmd_name,
+ strlen(tp->cmd_name)) == 0) {
+ break;
+ }
+ }
+ }
+ }
+
+ if (i < kdb_max_commands) {
+ int result;
+ KDB_STATE_SET(CMD);
+ result = (*tp->cmd_func)(argc-1, (const char **)argv);
+ if (result && ignore_errors && result > KDB_CMD_GO)
+ result = 0;
+ KDB_STATE_CLEAR(CMD);
+ switch (tp->cmd_repeat) {
+ case KDB_REPEAT_NONE:
+ argc = 0;
+ if (argv[0])
+ *(argv[0]) = '\0';
+ break;
+ case KDB_REPEAT_NO_ARGS:
+ argc = 1;
+ if (argv[1])
+ *(argv[1]) = '\0';
+ break;
+ case KDB_REPEAT_WITH_ARGS:
+ break;
+ }
+ return result;
+ }
+
+ /*
+ * If the input with which we were presented does not
+ * map to an existing command, attempt to parse it as an
+ * address argument and display the result. Useful for
+ * obtaining the address of a variable, or the nearest symbol
+ * to an address contained in a register.
+ */
+ {
+ unsigned long value;
+ char *name = NULL;
+ long offset;
+ int nextarg = 0;
+
+ if (kdbgetaddrarg(0, (const char **)argv, &nextarg,
+ &value, &offset, &name)) {
+ return KDB_NOTFOUND;
+ }
+
+ kdb_printf("%s = ", argv[0]);
+ kdb_symbol_print(value, NULL, KDB_SP_DEFAULT);
+ kdb_printf("\n");
+ return 0;
+ }
+}
+
+
+static int handle_ctrl_cmd(char *cmd)
+{
+#define CTRL_P 16
+#define CTRL_N 14
+
+ /* initial situation */
+ if (cmd_head == cmd_tail)
+ return 0;
+ switch (*cmd) {
+ case CTRL_P:
+ if (cmdptr != cmd_tail)
+ cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
+ strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ return 1;
+ case CTRL_N:
+ if (cmdptr != cmd_head)
+ cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
+ strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * kdb_reboot - This function implements the 'reboot' command. Reboot
+ * the system immediately, or loop for ever on failure.
+ */
+static int kdb_reboot(int argc, const char **argv)
+{
+ emergency_restart();
+ kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n");
+ while (1)
+ cpu_relax();
+ /* NOTREACHED */
+ return 0;
+}
+
+static void kdb_dumpregs(struct pt_regs *regs)
+{
+ int old_lvl = console_loglevel;
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
+ kdb_trap_printk++;
+ show_regs(regs);
+ kdb_trap_printk--;
+ kdb_printf("\n");
+ console_loglevel = old_lvl;
+}
+
+void kdb_set_current_task(struct task_struct *p)
+{
+ kdb_current_task = p;
+
+ if (kdb_task_has_cpu(p)) {
+ kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p));
+ return;
+ }
+ kdb_current_regs = NULL;
+}
+
+/*
+ * kdb_local - The main code for kdb. This routine is invoked on a
+ * specific processor, it is not global. The main kdb() routine
+ * ensures that only one processor at a time is in this routine.
+ * This code is called with the real reason code on the first
+ * entry to a kdb session, thereafter it is called with reason
+ * SWITCH, even if the user goes back to the original cpu.
+ * Inputs:
+ * reason The reason KDB was invoked
+ * error The hardware-defined error code
+ * regs The exception frame at time of fault/breakpoint.
+ * db_result Result code from the break or debug point.
+ * Returns:
+ * 0 KDB was invoked for an event which it wasn't responsible
+ * 1 KDB handled the event for which it was invoked.
+ * KDB_CMD_GO User typed 'go'.
+ * KDB_CMD_CPU User switched to another cpu.
+ * KDB_CMD_SS Single step.
+ */
+static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
+ kdb_dbtrap_t db_result)
+{
+ char *cmdbuf;
+ int diag;
+ struct task_struct *kdb_current =
+ kdb_curr_task(raw_smp_processor_id());
+
+ KDB_DEBUG_STATE("kdb_local 1", reason);
+ kdb_go_count = 0;
+ if (reason == KDB_REASON_DEBUG) {
+ /* special case below */
+ } else {
+ kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
+ kdb_current, kdb_current ? kdb_current->pid : 0);
+#if defined(CONFIG_SMP)
+ kdb_printf("on processor %d ", raw_smp_processor_id());
+#endif
+ }
+
+ switch (reason) {
+ case KDB_REASON_DEBUG:
+ {
+ /*
+ * If re-entering kdb after a single step
+ * command, don't print the message.
+ */
+ switch (db_result) {
+ case KDB_DB_BPT:
+ kdb_printf("\nEntering kdb (0x%p, pid %d) ",
+ kdb_current, kdb_current->pid);
+#if defined(CONFIG_SMP)
+ kdb_printf("on processor %d ", raw_smp_processor_id());
+#endif
+ kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ break;
+ case KDB_DB_SS:
+ break;
+ case KDB_DB_SSBPT:
+ KDB_DEBUG_STATE("kdb_local 4", reason);
+ return 1; /* kdba_db_trap did the work */
+ default:
+ kdb_printf("kdb: Bad result from kdba_db_trap: %d\n",
+ db_result);
+ break;
+ }
+
+ }
+ break;
+ case KDB_REASON_ENTER:
+ if (KDB_STATE(KEYBOARD))
+ kdb_printf("due to Keyboard Entry\n");
+ else
+ kdb_printf("due to KDB_ENTER()\n");
+ break;
+ case KDB_REASON_KEYBOARD:
+ KDB_STATE_SET(KEYBOARD);
+ kdb_printf("due to Keyboard Entry\n");
+ break;
+ case KDB_REASON_ENTER_SLAVE:
+ /* drop through, slaves only get released via cpu switch */
+ case KDB_REASON_SWITCH:
+ kdb_printf("due to cpu switch\n");
+ break;
+ case KDB_REASON_OOPS:
+ kdb_printf("Oops: %s\n", kdb_diemsg);
+ kdb_printf("due to oops @ " kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ kdb_dumpregs(regs);
+ break;
+ case KDB_REASON_SYSTEM_NMI:
+ kdb_printf("due to System NonMaskable Interrupt\n");
+ break;
+ case KDB_REASON_NMI:
+ kdb_printf("due to NonMaskable Interrupt @ "
+ kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ kdb_dumpregs(regs);
+ break;
+ case KDB_REASON_SSTEP:
+ case KDB_REASON_BREAK:
+ kdb_printf("due to %s @ " kdb_machreg_fmt "\n",
+ reason == KDB_REASON_BREAK ?
+ "Breakpoint" : "SS trap", instruction_pointer(regs));
+ /*
+ * Determine if this breakpoint is one that we
+ * are interested in.
+ */
+ if (db_result != KDB_DB_BPT) {
+ kdb_printf("kdb: error return from kdba_bp_trap: %d\n",
+ db_result);
+ KDB_DEBUG_STATE("kdb_local 6", reason);
+ return 0; /* Not for us, dismiss it */
+ }
+ break;
+ case KDB_REASON_RECURSE:
+ kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ break;
+ default:
+ kdb_printf("kdb: unexpected reason code: %d\n", reason);
+ KDB_DEBUG_STATE("kdb_local 8", reason);
+ return 0; /* Not for us, dismiss it */
+ }
+
+ while (1) {
+ /*
+ * Initialize pager context.
+ */
+ kdb_nextline = 1;
+ KDB_STATE_CLEAR(SUPPRESS);
+
+ cmdbuf = cmd_cur;
+ *cmdbuf = '\0';
+ *(cmd_hist[cmd_head]) = '\0';
+
+do_full_getstr:
+#if defined(CONFIG_SMP)
+ snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
+ raw_smp_processor_id());
+#else
+ snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"));
+#endif
+ if (defcmd_in_progress)
+ strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
+
+ /*
+ * Fetch command from keyboard
+ */
+ cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str);
+ if (*cmdbuf != '\n') {
+ if (*cmdbuf < 32) {
+ if (cmdptr == cmd_head) {
+ strncpy(cmd_hist[cmd_head], cmd_cur,
+ CMD_BUFLEN);
+ *(cmd_hist[cmd_head] +
+ strlen(cmd_hist[cmd_head])-1) = '\0';
+ }
+ if (!handle_ctrl_cmd(cmdbuf))
+ *(cmd_cur+strlen(cmd_cur)-1) = '\0';
+ cmdbuf = cmd_cur;
+ goto do_full_getstr;
+ } else {
+ strncpy(cmd_hist[cmd_head], cmd_cur,
+ CMD_BUFLEN);
+ }
+
+ cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT;
+ if (cmd_head == cmd_tail)
+ cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT;
+ }
+
+ cmdptr = cmd_head;
+ diag = kdb_parse(cmdbuf);
+ if (diag == KDB_NOTFOUND) {
+ kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
+ diag = 0;
+ }
+ if (diag == KDB_CMD_GO
+ || diag == KDB_CMD_CPU
+ || diag == KDB_CMD_SS
+ || diag == KDB_CMD_KGDB)
+ break;
+
+ if (diag)
+ kdb_cmderror(diag);
+ }
+ KDB_DEBUG_STATE("kdb_local 9", diag);
+ return diag;
+}
+
+
+/*
+ * kdb_print_state - Print the state data for the current processor
+ * for debugging.
+ * Inputs:
+ * text Identifies the debug point
+ * value Any integer value to be printed, e.g. reason code.
+ */
+void kdb_print_state(const char *text, int value)
+{
+ kdb_printf("state: %s cpu %d value %d initial %d state %x\n",
+ text, raw_smp_processor_id(), value, kdb_initial_cpu,
+ kdb_state);
+}
+
+/*
+ * kdb_main_loop - After initial setup and assignment of the
+ * controlling cpu, all cpus are in this loop. One cpu is in
+ * control and will issue the kdb prompt, the others will spin
+ * until 'go' or cpu switch.
+ *
+ * To get a consistent view of the kernel stacks for all
+ * processes, this routine is invoked from the main kdb code via
+ * an architecture specific routine. kdba_main_loop is
+ * responsible for making the kernel stacks consistent for all
+ * processes, there should be no difference between a blocked
+ * process and a running process as far as kdb is concerned.
+ * Inputs:
+ * reason The reason KDB was invoked
+ * error The hardware-defined error code
+ * reason2 kdb's current reason code.
+ * Initially error but can change
+ * according to kdb state.
+ * db_result Result code from break or debug point.
+ * regs The exception frame at time of fault/breakpoint.
+ * should always be valid.
+ * Returns:
+ * 0 KDB was invoked for an event which it wasn't responsible
+ * 1 KDB handled the event for which it was invoked.
+ */
+int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
+ kdb_dbtrap_t db_result, struct pt_regs *regs)
+{
+ int result = 1;
+ /* Stay in kdb() until 'go', 'ss[b]' or an error */
+ while (1) {
+ /*
+ * All processors except the one that is in control
+ * will spin here.
+ */
+ KDB_DEBUG_STATE("kdb_main_loop 1", reason);
+ while (KDB_STATE(HOLD_CPU)) {
+ /* state KDB is turned off by kdb_cpu to see if the
+ * other cpus are still live, each cpu in this loop
+ * turns it back on.
+ */
+ if (!KDB_STATE(KDB))
+ KDB_STATE_SET(KDB);
+ }
+
+ KDB_STATE_CLEAR(SUPPRESS);
+ KDB_DEBUG_STATE("kdb_main_loop 2", reason);
+ if (KDB_STATE(LEAVING))
+ break; /* Another cpu said 'go' */
+ /* Still using kdb, this processor is in control */
+ result = kdb_local(reason2, error, regs, db_result);
+ KDB_DEBUG_STATE("kdb_main_loop 3", result);
+
+ if (result == KDB_CMD_CPU)
+ break;
+
+ if (result == KDB_CMD_SS) {
+ KDB_STATE_SET(DOING_SS);
+ break;
+ }
+
+ if (result == KDB_CMD_KGDB) {
+ if (!KDB_STATE(DOING_KGDB))
+ kdb_printf("Entering please attach debugger "
+ "or use $D#44+ or $3#33\n");
+ break;
+ }
+ if (result && result != 1 && result != KDB_CMD_GO)
+ kdb_printf("\nUnexpected kdb_local return code %d\n",
+ result);
+ KDB_DEBUG_STATE("kdb_main_loop 4", reason);
+ break;
+ }
+ if (KDB_STATE(DOING_SS))
+ KDB_STATE_CLEAR(SSBPT);
+
+ /* Clean up any keyboard devices before leaving */
+ kdb_kbd_cleanup_state();
+
+ return result;
+}
+
+/*
+ * kdb_mdr - This function implements the guts of the 'mdr', memory
+ * read command.
+ * mdr <addr arg>,<byte count>
+ * Inputs:
+ * addr Start address
+ * count Number of bytes
+ * Returns:
+ * Always 0. Any errors are detected and printed by kdb_getarea.
+ */
+static int kdb_mdr(unsigned long addr, unsigned int count)
+{
+ unsigned char c;
+ while (count--) {
+ if (kdb_getarea(c, addr))
+ return 0;
+ kdb_printf("%02x", c);
+ addr++;
+ }
+ kdb_printf("\n");
+ return 0;
+}
+
+/*
+ * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4',
+ * 'md8' 'mdr' and 'mds' commands.
+ *
+ * md|mds [<addr arg> [<line count> [<radix>]]]
+ * mdWcN [<addr arg> [<line count> [<radix>]]]
+ * where W = is the width (1, 2, 4 or 8) and N is the count.
+ * for eg., md1c20 reads 20 bytes, 1 at a time.
+ * mdr <addr arg>,<byte count>
+ */
+static void kdb_md_line(const char *fmtstr, unsigned long addr,
+ int symbolic, int nosect, int bytesperword,
+ int num, int repeat, int phys)
+{
+ /* print just one line of data */
+ kdb_symtab_t symtab;
+ char cbuf[32];
+ char *c = cbuf;
+ int i;
+ unsigned long word;
+
+ memset(cbuf, '\0', sizeof(cbuf));
+ if (phys)
+ kdb_printf("phys " kdb_machreg_fmt0 " ", addr);
+ else
+ kdb_printf(kdb_machreg_fmt0 " ", addr);
+
+ for (i = 0; i < num && repeat--; i++) {
+ if (phys) {
+ if (kdb_getphysword(&word, addr, bytesperword))
+ break;
+ } else if (kdb_getword(&word, addr, bytesperword))
+ break;
+ kdb_printf(fmtstr, word);
+ if (symbolic)
+ kdbnearsym(word, &symtab);
+ else
+ memset(&symtab, 0, sizeof(symtab));
+ if (symtab.sym_name) {
+ kdb_symbol_print(word, &symtab, 0);
+ if (!nosect) {
+ kdb_printf("\n");
+ kdb_printf(" %s %s "
+ kdb_machreg_fmt " "
+ kdb_machreg_fmt " "
+ kdb_machreg_fmt, symtab.mod_name,
+ symtab.sec_name, symtab.sec_start,
+ symtab.sym_start, symtab.sym_end);
+ }
+ addr += bytesperword;
+ } else {
+ union {
+ u64 word;
+ unsigned char c[8];
+ } wc;
+ unsigned char *cp;
+#ifdef __BIG_ENDIAN
+ cp = wc.c + 8 - bytesperword;
+#else
+ cp = wc.c;
+#endif
+ wc.word = word;
+#define printable_char(c) \
+ ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
+ switch (bytesperword) {
+ case 8:
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ addr += 4;
+ case 4:
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ addr += 2;
+ case 2:
+ *c++ = printable_char(*cp++);
+ addr++;
+ case 1:
+ *c++ = printable_char(*cp++);
+ addr++;
+ break;
+ }
+#undef printable_char
+ }
+ }
+ kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1),
+ " ", cbuf);
+}
+
+static int kdb_md(int argc, const char **argv)
+{
+ static unsigned long last_addr;
+ static int last_radix, last_bytesperword, last_repeat;
+ int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat;
+ int nosect = 0;
+ char fmtchar, fmtstr[64];
+ unsigned long addr;
+ unsigned long word;
+ long offset = 0;
+ int symbolic = 0;
+ int valid = 0;
+ int phys = 0;
+
+ kdbgetintenv("MDCOUNT", &mdcount);
+ kdbgetintenv("RADIX", &radix);
+ kdbgetintenv("BYTESPERWORD", &bytesperword);
+
+ /* Assume 'md <addr>' and start with environment values */
+ repeat = mdcount * 16 / bytesperword;
+
+ if (strcmp(argv[0], "mdr") == 0) {
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+ valid = 1;
+ } else if (isdigit(argv[0][2])) {
+ bytesperword = (int)(argv[0][2] - '0');
+ if (bytesperword == 0) {
+ bytesperword = last_bytesperword;
+ if (bytesperword == 0)
+ bytesperword = 4;
+ }
+ last_bytesperword = bytesperword;
+ repeat = mdcount * 16 / bytesperword;
+ if (!argv[0][3])
+ valid = 1;
+ else if (argv[0][3] == 'c' && argv[0][4]) {
+ char *p;
+ repeat = simple_strtoul(argv[0] + 4, &p, 10);
+ mdcount = ((repeat * bytesperword) + 15) / 16;
+ valid = !*p;
+ }
+ last_repeat = repeat;
+ } else if (strcmp(argv[0], "md") == 0)
+ valid = 1;
+ else if (strcmp(argv[0], "mds") == 0)
+ valid = 1;
+ else if (strcmp(argv[0], "mdp") == 0) {
+ phys = valid = 1;
+ }
+ if (!valid)
+ return KDB_NOTFOUND;
+
+ if (argc == 0) {
+ if (last_addr == 0)
+ return KDB_ARGCOUNT;
+ addr = last_addr;
+ radix = last_radix;
+ bytesperword = last_bytesperword;
+ repeat = last_repeat;
+ mdcount = ((repeat * bytesperword) + 15) / 16;
+ }
+
+ if (argc) {
+ unsigned long val;
+ int diag, nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
+ &offset, NULL);
+ if (diag)
+ return diag;
+ if (argc > nextarg+2)
+ return KDB_ARGCOUNT;
+
+ if (argc >= nextarg) {
+ diag = kdbgetularg(argv[nextarg], &val);
+ if (!diag) {
+ mdcount = (int) val;
+ repeat = mdcount * 16 / bytesperword;
+ }
+ }
+ if (argc >= nextarg+1) {
+ diag = kdbgetularg(argv[nextarg+1], &val);
+ if (!diag)
+ radix = (int) val;
+ }
+ }
+
+ if (strcmp(argv[0], "mdr") == 0)
+ return kdb_mdr(addr, mdcount);
+
+ switch (radix) {
+ case 10:
+ fmtchar = 'd';
+ break;
+ case 16:
+ fmtchar = 'x';
+ break;
+ case 8:
+ fmtchar = 'o';
+ break;
+ default:
+ return KDB_BADRADIX;
+ }
+
+ last_radix = radix;
+
+ if (bytesperword > KDB_WORD_SIZE)
+ return KDB_BADWIDTH;
+
+ switch (bytesperword) {
+ case 8:
+ sprintf(fmtstr, "%%16.16l%c ", fmtchar);
+ break;
+ case 4:
+ sprintf(fmtstr, "%%8.8l%c ", fmtchar);
+ break;
+ case 2:
+ sprintf(fmtstr, "%%4.4l%c ", fmtchar);
+ break;
+ case 1:
+ sprintf(fmtstr, "%%2.2l%c ", fmtchar);
+ break;
+ default:
+ return KDB_BADWIDTH;
+ }
+
+ last_repeat = repeat;
+ last_bytesperword = bytesperword;
+
+ if (strcmp(argv[0], "mds") == 0) {
+ symbolic = 1;
+ /* Do not save these changes as last_*, they are temporary mds
+ * overrides.
+ */
+ bytesperword = KDB_WORD_SIZE;
+ repeat = mdcount;
+ kdbgetintenv("NOSECT", &nosect);
+ }
+
+ /* Round address down modulo BYTESPERWORD */
+
+ addr &= ~(bytesperword-1);
+
+ while (repeat > 0) {
+ unsigned long a;
+ int n, z, num = (symbolic ? 1 : (16 / bytesperword));
+
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) {
+ if (phys) {
+ if (kdb_getphysword(&word, a, bytesperword)
+ || word)
+ break;
+ } else if (kdb_getword(&word, a, bytesperword) || word)
+ break;
+ }
+ n = min(num, repeat);
+ kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword,
+ num, repeat, phys);
+ addr += bytesperword * n;
+ repeat -= n;
+ z = (z + num - 1) / num;
+ if (z > 2) {
+ int s = num * (z-2);
+ kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0
+ " zero suppressed\n",
+ addr, addr + bytesperword * s - 1);
+ addr += bytesperword * s;
+ repeat -= s;
+ }
+ }
+ last_addr = addr;
+
+ return 0;
+}
+
+/*
+ * kdb_mm - This function implements the 'mm' command.
+ * mm address-expression new-value
+ * Remarks:
+ * mm works on machine words, mmW works on bytes.
+ */
+static int kdb_mm(int argc, const char **argv)
+{
+ int diag;
+ unsigned long addr;
+ long offset = 0;
+ unsigned long contents;
+ int nextarg;
+ int width;
+
+ if (argv[0][2] && !isdigit(argv[0][2]))
+ return KDB_NOTFOUND;
+
+ if (argc < 2)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+
+ if (nextarg > argc)
+ return KDB_ARGCOUNT;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL);
+ if (diag)
+ return diag;
+
+ if (nextarg != argc + 1)
+ return KDB_ARGCOUNT;
+
+ width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE);
+ diag = kdb_putword(addr, contents, width);
+ if (diag)
+ return diag;
+
+ kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents);
+
+ return 0;
+}
+
+/*
+ * kdb_go - This function implements the 'go' command.
+ * go [address-expression]
+ */
+static int kdb_go(int argc, const char **argv)
+{
+ unsigned long addr;
+ int diag;
+ int nextarg;
+ long offset;
+
+ if (raw_smp_processor_id() != kdb_initial_cpu) {
+ kdb_printf("go must execute on the entry cpu, "
+ "please use \"cpu %d\" and then execute go\n",
+ kdb_initial_cpu);
+ return KDB_BADCPUNUM;
+ }
+ if (argc == 1) {
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg,
+ &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ } else if (argc) {
+ return KDB_ARGCOUNT;
+ }
+
+ diag = KDB_CMD_GO;
+ if (KDB_FLAG(CATASTROPHIC)) {
+ kdb_printf("Catastrophic error detected\n");
+ kdb_printf("kdb_continue_catastrophic=%d, ",
+ kdb_continue_catastrophic);
+ if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) {
+ kdb_printf("type go a second time if you really want "
+ "to continue\n");
+ return 0;
+ }
+ if (kdb_continue_catastrophic == 2) {
+ kdb_printf("forcing reboot\n");
+ kdb_reboot(0, NULL);
+ }
+ kdb_printf("attempting to continue\n");
+ }
+ return diag;
+}
+
+/*
+ * kdb_rd - This function implements the 'rd' command.
+ */
+static int kdb_rd(int argc, const char **argv)
+{
+ int len = kdb_check_regs();
+#if DBG_MAX_REG_NUM > 0
+ int i;
+ char *rname;
+ int rsize;
+ u64 reg64;
+ u32 reg32;
+ u16 reg16;
+ u8 reg8;
+
+ if (len)
+ return len;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ rsize = dbg_reg_def[i].size * 2;
+ if (rsize > 16)
+ rsize = 2;
+ if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) {
+ len = 0;
+ kdb_printf("\n");
+ }
+ if (len)
+ len += kdb_printf(" ");
+ switch(dbg_reg_def[i].size * 8) {
+ case 8:
+ rname = dbg_get_reg(i, &reg8, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %02x", rname, reg8);
+ break;
+ case 16:
+ rname = dbg_get_reg(i, &reg16, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %04x", rname, reg16);
+ break;
+ case 32:
+ rname = dbg_get_reg(i, &reg32, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %08x", rname, reg32);
+ break;
+ case 64:
+ rname = dbg_get_reg(i, &reg64, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %016llx", rname, reg64);
+ break;
+ default:
+ len += kdb_printf("%s: ??", dbg_reg_def[i].name);
+ }
+ }
+ kdb_printf("\n");
+#else
+ if (len)
+ return len;
+
+ kdb_dumpregs(kdb_current_regs);
+#endif
+ return 0;
+}
+
+/*
+ * kdb_rm - This function implements the 'rm' (register modify) command.
+ * rm register-name new-contents
+ * Remarks:
+ * Allows register modification with the same restrictions as gdb
+ */
+static int kdb_rm(int argc, const char **argv)
+{
+#if DBG_MAX_REG_NUM > 0
+ int diag;
+ const char *rname;
+ int i;
+ u64 reg64;
+ u32 reg32;
+ u16 reg16;
+ u8 reg8;
+
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+ /*
+ * Allow presence or absence of leading '%' symbol.
+ */
+ rname = argv[1];
+ if (*rname == '%')
+ rname++;
+
+ diag = kdbgetu64arg(argv[2], &reg64);
+ if (diag)
+ return diag;
+
+ diag = kdb_check_regs();
+ if (diag)
+ return diag;
+
+ diag = KDB_BADREG;
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ if (strcmp(rname, dbg_reg_def[i].name) == 0) {
+ diag = 0;
+ break;
+ }
+ }
+ if (!diag) {
+ switch(dbg_reg_def[i].size * 8) {
+ case 8:
+ reg8 = reg64;
+ dbg_set_reg(i, &reg8, kdb_current_regs);
+ break;
+ case 16:
+ reg16 = reg64;
+ dbg_set_reg(i, &reg16, kdb_current_regs);
+ break;
+ case 32:
+ reg32 = reg64;
+ dbg_set_reg(i, &reg32, kdb_current_regs);
+ break;
+ case 64:
+ dbg_set_reg(i, &reg64, kdb_current_regs);
+ break;
+ }
+ }
+ return diag;
+#else
+ kdb_printf("ERROR: Register set currently not implemented\n");
+ return 0;
+#endif
+}
+
+#if defined(CONFIG_MAGIC_SYSRQ)
+/*
+ * kdb_sr - This function implements the 'sr' (SYSRQ key) command
+ * which interfaces to the soi-disant MAGIC SYSRQ functionality.
+ * sr <magic-sysrq-code>
+ */
+static int kdb_sr(int argc, const char **argv)
+{
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+ kdb_trap_printk++;
+ __handle_sysrq(*argv[1], false);
+ kdb_trap_printk--;
+
+ return 0;
+}
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+/*
+ * kdb_ef - This function implements the 'regs' (display exception
+ * frame) command. This command takes an address and expects to
+ * find an exception frame at that address, formats and prints
+ * it.
+ * regs address-expression
+ * Remarks:
+ * Not done yet.
+ */
+static int kdb_ef(int argc, const char **argv)
+{
+ int diag;
+ unsigned long addr;
+ long offset;
+ int nextarg;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ show_regs((struct pt_regs *)addr);
+ return 0;
+}
+
+#if defined(CONFIG_MODULES)
+/*
+ * kdb_lsmod - This function implements the 'lsmod' command. Lists
+ * currently loaded kernel modules.
+ * Mostly taken from userland lsmod.
+ */
+static int kdb_lsmod(int argc, const char **argv)
+{
+ struct module *mod;
+
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+
+ kdb_printf("Module Size modstruct Used by\n");
+ list_for_each_entry(mod, kdb_modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ kdb_printf("%-20s%8u 0x%p ", mod->name,
+ mod->core_size, (void *)mod);
+#ifdef CONFIG_MODULE_UNLOAD
+ kdb_printf("%4ld ", module_refcount(mod));
+#endif
+ if (mod->state == MODULE_STATE_GOING)
+ kdb_printf(" (Unloading)");
+ else if (mod->state == MODULE_STATE_COMING)
+ kdb_printf(" (Loading)");
+ else
+ kdb_printf(" (Live)");
+ kdb_printf(" 0x%p", mod->module_core);
+
+#ifdef CONFIG_MODULE_UNLOAD
+ {
+ struct module_use *use;
+ kdb_printf(" [ ");
+ list_for_each_entry(use, &mod->source_list,
+ source_list)
+ kdb_printf("%s ", use->target->name);
+ kdb_printf("]\n");
+ }
+#endif
+ }
+
+ return 0;
+}
+
+#endif /* CONFIG_MODULES */
+
+/*
+ * kdb_env - This function implements the 'env' command. Display the
+ * current environment variables.
+ */
+
+static int kdb_env(int argc, const char **argv)
+{
+ int i;
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i])
+ kdb_printf("%s\n", __env[i]);
+ }
+
+ if (KDB_DEBUG(MASK))
+ kdb_printf("KDBFLAGS=0x%x\n", kdb_flags);
+
+ return 0;
+}
+
+#ifdef CONFIG_PRINTK
+/*
+ * kdb_dmesg - This function implements the 'dmesg' command to display
+ * the contents of the syslog buffer.
+ * dmesg [lines] [adjust]
+ */
+static int kdb_dmesg(int argc, const char **argv)
+{
+ int diag;
+ int logging;
+ int lines = 0;
+ int adjust = 0;
+ int n = 0;
+ int skip = 0;
+ struct kmsg_dumper dumper = { .active = 1 };
+ size_t len;
+ char buf[201];
+
+ if (argc > 2)
+ return KDB_ARGCOUNT;
+ if (argc) {
+ char *cp;
+ lines = simple_strtol(argv[1], &cp, 0);
+ if (*cp)
+ lines = 0;
+ if (argc > 1) {
+ adjust = simple_strtoul(argv[2], &cp, 0);
+ if (*cp || adjust < 0)
+ adjust = 0;
+ }
+ }
+
+ /* disable LOGGING if set */
+ diag = kdbgetintenv("LOGGING", &logging);
+ if (!diag && logging) {
+ const char *setargs[] = { "set", "LOGGING", "0" };
+ kdb_set(2, setargs);
+ }
+
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+ n++;
+
+ if (lines < 0) {
+ if (adjust >= n)
+ kdb_printf("buffer only contains %d lines, nothing "
+ "printed\n", n);
+ else if (adjust - lines >= n)
+ kdb_printf("buffer only contains %d lines, last %d "
+ "lines printed\n", n, n - adjust);
+ skip = adjust;
+ lines = abs(lines);
+ } else if (lines > 0) {
+ skip = n - lines - adjust;
+ lines = abs(lines);
+ if (adjust >= n) {
+ kdb_printf("buffer only contains %d lines, "
+ "nothing printed\n", n);
+ skip = n;
+ } else if (skip < 0) {
+ lines += skip;
+ skip = 0;
+ kdb_printf("buffer only contains %d lines, first "
+ "%d lines printed\n", n, lines);
+ }
+ } else {
+ lines = n;
+ }
+
+ if (skip >= n || skip < 0)
+ return 0;
+
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+ if (skip) {
+ skip--;
+ continue;
+ }
+ if (!lines--)
+ break;
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+
+ kdb_printf("%.*s\n", (int)len - 1, buf);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_PRINTK */
+
+/* Make sure we balance enable/disable calls, must disable first. */
+static atomic_t kdb_nmi_disabled;
+
+static int kdb_disable_nmi(int argc, const char *argv[])
+{
+ if (atomic_read(&kdb_nmi_disabled))
+ return 0;
+ atomic_set(&kdb_nmi_disabled, 1);
+ arch_kgdb_ops.enable_nmi(0);
+ return 0;
+}
+
+static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
+{
+ if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
+ return -EINVAL;
+ arch_kgdb_ops.enable_nmi(1);
+ return 0;
+}
+
+static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
+ .set = kdb_param_enable_nmi,
+};
+module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
+
+/*
+ * kdb_cpu - This function implements the 'cpu' command.
+ * cpu [<cpunum>]
+ * Returns:
+ * KDB_CMD_CPU for success, a kdb diagnostic if error
+ */
+static void kdb_cpu_status(void)
+{
+ int i, start_cpu, first_print = 1;
+ char state, prev_state = '?';
+
+ kdb_printf("Currently on cpu %d\n", raw_smp_processor_id());
+ kdb_printf("Available cpus: ");
+ for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
+ if (!cpu_online(i)) {
+ state = 'F'; /* cpu is offline */
+ } else {
+ state = ' '; /* cpu is responding to kdb */
+ if (kdb_task_state_char(KDB_TSK(i)) == 'I')
+ state = 'I'; /* idle task */
+ }
+ if (state != prev_state) {
+ if (prev_state != '?') {
+ if (!first_print)
+ kdb_printf(", ");
+ first_print = 0;
+ kdb_printf("%d", start_cpu);
+ if (start_cpu < i-1)
+ kdb_printf("-%d", i-1);
+ if (prev_state != ' ')
+ kdb_printf("(%c)", prev_state);
+ }
+ prev_state = state;
+ start_cpu = i;
+ }
+ }
+ /* print the trailing cpus, ignoring them if they are all offline */
+ if (prev_state != 'F') {
+ if (!first_print)
+ kdb_printf(", ");
+ kdb_printf("%d", start_cpu);
+ if (start_cpu < i-1)
+ kdb_printf("-%d", i-1);
+ if (prev_state != ' ')
+ kdb_printf("(%c)", prev_state);
+ }
+ kdb_printf("\n");
+}
+
+static int kdb_cpu(int argc, const char **argv)
+{
+ unsigned long cpunum;
+ int diag;
+
+ if (argc == 0) {
+ kdb_cpu_status();
+ return 0;
+ }
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ diag = kdbgetularg(argv[1], &cpunum);
+ if (diag)
+ return diag;
+
+ /*
+ * Validate cpunum
+ */
+ if ((cpunum > NR_CPUS) || !cpu_online(cpunum))
+ return KDB_BADCPUNUM;
+
+ dbg_switch_cpu = cpunum;
+
+ /*
+ * Switch to other cpu
+ */
+ return KDB_CMD_CPU;
+}
+
+/* The user may not realize that ps/bta with no parameters does not print idle
+ * or sleeping system daemon processes, so tell them how many were suppressed.
+ */
+void kdb_ps_suppressed(void)
+{
+ int idle = 0, daemon = 0;
+ unsigned long mask_I = kdb_task_state_string("I"),
+ mask_M = kdb_task_state_string("M");
+ unsigned long cpu;
+ const struct task_struct *p, *g;
+ for_each_online_cpu(cpu) {
+ p = kdb_curr_task(cpu);
+ if (kdb_task_state(p, mask_I))
+ ++idle;
+ }
+ kdb_do_each_thread(g, p) {
+ if (kdb_task_state(p, mask_M))
+ ++daemon;
+ } kdb_while_each_thread(g, p);
+ if (idle || daemon) {
+ if (idle)
+ kdb_printf("%d idle process%s (state I)%s\n",
+ idle, idle == 1 ? "" : "es",
+ daemon ? " and " : "");
+ if (daemon)
+ kdb_printf("%d sleeping system daemon (state M) "
+ "process%s", daemon,
+ daemon == 1 ? "" : "es");
+ kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
+ }
+}
+
+/*
+ * kdb_ps - This function implements the 'ps' command which shows a
+ * list of the active processes.
+ * ps [DRSTCZEUIMA] All processes, optionally filtered by state
+ */
+void kdb_ps1(const struct task_struct *p)
+{
+ int cpu;
+ unsigned long tmp;
+
+ if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ return;
+
+ cpu = kdb_process_cpu(p);
+ kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
+ (void *)p, p->pid, p->parent->pid,
+ kdb_task_has_cpu(p), kdb_process_cpu(p),
+ kdb_task_state_char(p),
+ (void *)(&p->thread),
+ p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ',
+ p->comm);
+ if (kdb_task_has_cpu(p)) {
+ if (!KDB_TSK(cpu)) {
+ kdb_printf(" Error: no saved data for this cpu\n");
+ } else {
+ if (KDB_TSK(cpu) != p)
+ kdb_printf(" Error: does not match running "
+ "process table (0x%p)\n", KDB_TSK(cpu));
+ }
+ }
+}
+
+static int kdb_ps(int argc, const char **argv)
+{
+ struct task_struct *g, *p;
+ unsigned long mask, cpu;
+
+ if (argc == 0)
+ kdb_ps_suppressed();
+ kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
+ (int)(2*sizeof(void *))+2, "Task Addr",
+ (int)(2*sizeof(void *))+2, "Thread");
+ mask = kdb_task_state_string(argc ? argv[1] : NULL);
+ /* Run the active tasks first */
+ for_each_online_cpu(cpu) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ p = kdb_curr_task(cpu);
+ if (kdb_task_state(p, mask))
+ kdb_ps1(p);
+ }
+ kdb_printf("\n");
+ /* Now the real tasks */
+ kdb_do_each_thread(g, p) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ if (kdb_task_state(p, mask))
+ kdb_ps1(p);
+ } kdb_while_each_thread(g, p);
+
+ return 0;
+}
+
+/*
+ * kdb_pid - This function implements the 'pid' command which switches
+ * the currently active process.
+ * pid [<pid> | R]
+ */
+static int kdb_pid(int argc, const char **argv)
+{
+ struct task_struct *p;
+ unsigned long val;
+ int diag;
+
+ if (argc > 1)
+ return KDB_ARGCOUNT;
+
+ if (argc) {
+ if (strcmp(argv[1], "R") == 0) {
+ p = KDB_TSK(kdb_initial_cpu);
+ } else {
+ diag = kdbgetularg(argv[1], &val);
+ if (diag)
+ return KDB_BADINT;
+
+ p = find_task_by_pid_ns((pid_t)val, &init_pid_ns);
+ if (!p) {
+ kdb_printf("No task with pid=%d\n", (pid_t)val);
+ return 0;
+ }
+ }
+ kdb_set_current_task(p);
+ }
+ kdb_printf("KDB current process is %s(pid=%d)\n",
+ kdb_current_task->comm,
+ kdb_current_task->pid);
+
+ return 0;
+}
+
+static int kdb_kgdb(int argc, const char **argv)
+{
+ return KDB_CMD_KGDB;
+}
+
+/*
+ * kdb_help - This function implements the 'help' and '?' commands.
+ */
+static int kdb_help(int argc, const char **argv)
+{
+ kdbtab_t *kt;
+ int i;
+
+ kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
+ kdb_printf("-----------------------------"
+ "-----------------------------\n");
+ for_each_kdbcmd(kt, i) {
+ char *space = "";
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ if (!kt->cmd_name)
+ continue;
+ if (strlen(kt->cmd_usage) > 20)
+ space = "\n ";
+ kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
+ kt->cmd_usage, space, kt->cmd_help);
+ }
+ return 0;
+}
+
+/*
+ * kdb_kill - This function implements the 'kill' commands.
+ */
+static int kdb_kill(int argc, const char **argv)
+{
+ long sig, pid;
+ char *endp;
+ struct task_struct *p;
+ struct siginfo info;
+
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+
+ sig = simple_strtol(argv[1], &endp, 0);
+ if (*endp)
+ return KDB_BADINT;
+ if (sig >= 0) {
+ kdb_printf("Invalid signal parameter.<-signal>\n");
+ return 0;
+ }
+ sig = -sig;
+
+ pid = simple_strtol(argv[2], &endp, 0);
+ if (*endp)
+ return KDB_BADINT;
+ if (pid <= 0) {
+ kdb_printf("Process ID must be large than 0.\n");
+ return 0;
+ }
+
+ /* Find the process. */
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (!p) {
+ kdb_printf("The specified process isn't found.\n");
+ return 0;
+ }
+ p = p->group_leader;
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_USER;
+ info.si_pid = pid; /* same capabilities as process being signalled */
+ info.si_uid = 0; /* kdb has root authority */
+ kdb_send_sig_info(p, &info);
+ return 0;
+}
+
+struct kdb_tm {
+ int tm_sec; /* seconds */
+ int tm_min; /* minutes */
+ int tm_hour; /* hours */
+ int tm_mday; /* day of the month */
+ int tm_mon; /* month */
+ int tm_year; /* year */
+};
+
+static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
+{
+ /* This will work from 1970-2099, 2100 is not a leap year */
+ static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
+ 31, 30, 31, 30, 31 };
+ memset(tm, 0, sizeof(*tm));
+ tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
+ tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
+ (2 * 365 + 1); /* shift base from 1970 to 1968 */
+ tm->tm_min = tm->tm_sec / 60 % 60;
+ tm->tm_hour = tm->tm_sec / 60 / 60;
+ tm->tm_sec = tm->tm_sec % 60;
+ tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
+ tm->tm_mday %= (4*365+1);
+ mon_day[1] = 29;
+ while (tm->tm_mday >= mon_day[tm->tm_mon]) {
+ tm->tm_mday -= mon_day[tm->tm_mon];
+ if (++tm->tm_mon == 12) {
+ tm->tm_mon = 0;
+ ++tm->tm_year;
+ mon_day[1] = 28;
+ }
+ }
+ ++tm->tm_mday;
+}
+
+/*
+ * Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
+ * I cannot call that code directly from kdb, it has an unconditional
+ * cli()/sti() and calls routines that take locks which can stop the debugger.
+ */
+static void kdb_sysinfo(struct sysinfo *val)
+{
+ struct timespec uptime;
+ do_posix_clock_monotonic_gettime(&uptime);
+ memset(val, 0, sizeof(*val));
+ val->uptime = uptime.tv_sec;
+ val->loads[0] = avenrun[0];
+ val->loads[1] = avenrun[1];
+ val->loads[2] = avenrun[2];
+ val->procs = nr_threads-1;
+ si_meminfo(val);
+
+ return;
+}
+
+/*
+ * kdb_summary - This function implements the 'summary' command.
+ */
+static int kdb_summary(int argc, const char **argv)
+{
+ struct timespec now;
+ struct kdb_tm tm;
+ struct sysinfo val;
+
+ if (argc)
+ return KDB_ARGCOUNT;
+
+ kdb_printf("sysname %s\n", init_uts_ns.name.sysname);
+ kdb_printf("release %s\n", init_uts_ns.name.release);
+ kdb_printf("version %s\n", init_uts_ns.name.version);
+ kdb_printf("machine %s\n", init_uts_ns.name.machine);
+ kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
+ kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
+ kdb_printf("ccversion %s\n", __stringify(CCVERSION));
+
+ now = __current_kernel_time();
+ kdb_gmtime(&now, &tm);
+ kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
+ "tz_minuteswest %d\n",
+ 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec,
+ sys_tz.tz_minuteswest);
+
+ kdb_sysinfo(&val);
+ kdb_printf("uptime ");
+ if (val.uptime > (24*60*60)) {
+ int days = val.uptime / (24*60*60);
+ val.uptime %= (24*60*60);
+ kdb_printf("%d day%s ", days, days == 1 ? "" : "s");
+ }
+ kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
+
+ /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+ kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
+ LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
+ LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
+ LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
+#undef LOAD_INT
+#undef LOAD_FRAC
+ /* Display in kilobytes */
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+ kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
+ "Buffers: %8lu kB\n",
+ val.totalram, val.freeram, val.bufferram);
+ return 0;
+}
+
+/*
+ * kdb_per_cpu - This function implements the 'per_cpu' command.
+ */
+static int kdb_per_cpu(int argc, const char **argv)
+{
+ char fmtstr[64];
+ int cpu, diag, nextarg = 1;
+ unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL;
+
+ if (argc < 1 || argc > 3)
+ return KDB_ARGCOUNT;
+
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL);
+ if (diag)
+ return diag;
+
+ if (argc >= 2) {
+ diag = kdbgetularg(argv[2], &bytesperword);
+ if (diag)
+ return diag;
+ }
+ if (!bytesperword)
+ bytesperword = KDB_WORD_SIZE;
+ else if (bytesperword > KDB_WORD_SIZE)
+ return KDB_BADWIDTH;
+ sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword));
+ if (argc >= 3) {
+ diag = kdbgetularg(argv[3], &whichcpu);
+ if (diag)
+ return diag;
+ if (!cpu_online(whichcpu)) {
+ kdb_printf("cpu %ld is not online\n", whichcpu);
+ return KDB_BADCPUNUM;
+ }
+ }
+
+ /* Most architectures use __per_cpu_offset[cpu], some use
+ * __per_cpu_offset(cpu), smp has no __per_cpu_offset.
+ */
+#ifdef __per_cpu_offset
+#define KDB_PCU(cpu) __per_cpu_offset(cpu)
+#else
+#ifdef CONFIG_SMP
+#define KDB_PCU(cpu) __per_cpu_offset[cpu]
+#else
+#define KDB_PCU(cpu) 0
+#endif
+#endif
+ for_each_online_cpu(cpu) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+
+ if (whichcpu != ~0UL && whichcpu != cpu)
+ continue;
+ addr = symaddr + KDB_PCU(cpu);
+ diag = kdb_getword(&val, addr, bytesperword);
+ if (diag) {
+ kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
+ "read, diag=%d\n", cpu, addr, diag);
+ continue;
+ }
+ kdb_printf("%5d ", cpu);
+ kdb_md_line(fmtstr, addr,
+ bytesperword == KDB_WORD_SIZE,
+ 1, bytesperword, 1, 1, 0);
+ }
+#undef KDB_PCU
+ return 0;
+}
+
+/*
+ * display help for the use of cmd | grep pattern
+ */
+static int kdb_grep_help(int argc, const char **argv)
+{
+ kdb_printf("Usage of cmd args | grep pattern:\n");
+ kdb_printf(" Any command's output may be filtered through an ");
+ kdb_printf("emulated 'pipe'.\n");
+ kdb_printf(" 'grep' is just a key word.\n");
+ kdb_printf(" The pattern may include a very limited set of "
+ "metacharacters:\n");
+ kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n");
+ kdb_printf(" And if there are spaces in the pattern, you may "
+ "quote it:\n");
+ kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\""
+ " or \"^pat tern$\"\n");
+ return 0;
+}
+
+/*
+ * kdb_register_repeat - This function is used to register a kernel
+ * debugger command.
+ * Inputs:
+ * cmd Command name
+ * func Function to execute the command
+ * usage A simple usage string showing arguments
+ * help A simple help string describing command
+ * repeat Does the command auto repeat on enter?
+ * Returns:
+ * zero for success, one if a duplicate command.
+ */
+#define kdb_command_extend 50 /* arbitrary */
+int kdb_register_repeat(char *cmd,
+ kdb_func_t func,
+ char *usage,
+ char *help,
+ short minlen,
+ kdb_repeat_t repeat)
+{
+ int i;
+ kdbtab_t *kp;
+
+ /*
+ * Brute force method to determine duplicates
+ */
+ for_each_kdbcmd(kp, i) {
+ if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
+ kdb_printf("Duplicate kdb command registered: "
+ "%s, func %p help %s\n", cmd, func, help);
+ return 1;
+ }
+ }
+
+ /*
+ * Insert command into first available location in table
+ */
+ for_each_kdbcmd(kp, i) {
+ if (kp->cmd_name == NULL)
+ break;
+ }
+
+ if (i >= kdb_max_commands) {
+ kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX +
+ kdb_command_extend) * sizeof(*new), GFP_KDB);
+ if (!new) {
+ kdb_printf("Could not allocate new kdb_command "
+ "table\n");
+ return 1;
+ }
+ if (kdb_commands) {
+ memcpy(new, kdb_commands,
+ (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
+ kfree(kdb_commands);
+ }
+ memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
+ kdb_command_extend * sizeof(*new));
+ kdb_commands = new;
+ kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
+ kdb_max_commands += kdb_command_extend;
+ }
+
+ kp->cmd_name = cmd;
+ kp->cmd_func = func;
+ kp->cmd_usage = usage;
+ kp->cmd_help = help;
+ kp->cmd_flags = 0;
+ kp->cmd_minlen = minlen;
+ kp->cmd_repeat = repeat;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kdb_register_repeat);
+
+
+/*
+ * kdb_register - Compatibility register function for commands that do
+ * not need to specify a repeat state. Equivalent to
+ * kdb_register_repeat with KDB_REPEAT_NONE.
+ * Inputs:
+ * cmd Command name
+ * func Function to execute the command
+ * usage A simple usage string showing arguments
+ * help A simple help string describing command
+ * Returns:
+ * zero for success, one if a duplicate command.
+ */
+int kdb_register(char *cmd,
+ kdb_func_t func,
+ char *usage,
+ char *help,
+ short minlen)
+{
+ return kdb_register_repeat(cmd, func, usage, help, minlen,
+ KDB_REPEAT_NONE);
+}
+EXPORT_SYMBOL_GPL(kdb_register);
+
+/*
+ * kdb_unregister - This function is used to unregister a kernel
+ * debugger command. It is generally called when a module which
+ * implements kdb commands is unloaded.
+ * Inputs:
+ * cmd Command name
+ * Returns:
+ * zero for success, one command not registered.
+ */
+int kdb_unregister(char *cmd)
+{
+ int i;
+ kdbtab_t *kp;
+
+ /*
+ * find the command.
+ */
+ for_each_kdbcmd(kp, i) {
+ if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
+ kp->cmd_name = NULL;
+ return 0;
+ }
+ }
+
+ /* Couldn't find it. */
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kdb_unregister);
+
+/* Initialize the kdb command table. */
+static void __init kdb_inittab(void)
+{
+ int i;
+ kdbtab_t *kp;
+
+ for_each_kdbcmd(kp, i)
+ kp->cmd_name = NULL;
+
+ kdb_register_repeat("md", kdb_md, "<vaddr>",
+ "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
+ KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>",
+ "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>",
+ "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("mds", kdb_md, "<vaddr>",
+ "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>",
+ "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_repeat("go", kdb_go, "[<vaddr>]",
+ "Continue Execution", 1, KDB_REPEAT_NONE);
+ kdb_register_repeat("rd", kdb_rd, "",
+ "Display Registers", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("rm", kdb_rm, "<reg> <contents>",
+ "Modify Registers", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("ef", kdb_ef, "<vaddr>",
+ "Display exception frame", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bt", kdb_bt, "[<vaddr>]",
+ "Stack traceback", 1, KDB_REPEAT_NONE);
+ kdb_register_repeat("btp", kdb_bt, "<pid>",
+ "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
+ "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("btc", kdb_bt, "",
+ "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("btt", kdb_bt, "<vaddr>",
+ "Backtrace process given its struct task address", 0,
+ KDB_REPEAT_NONE);
+ kdb_register_repeat("env", kdb_env, "",
+ "Show environment variables", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("set", kdb_set, "",
+ "Set environment variables", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("help", kdb_help, "",
+ "Display Help Message", 1, KDB_REPEAT_NONE);
+ kdb_register_repeat("?", kdb_help, "",
+ "Display Help Message", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("cpu", kdb_cpu, "<cpunum>",
+ "Switch to new cpu", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("kgdb", kdb_kgdb, "",
+ "Enter kgdb mode", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("ps", kdb_ps, "[<flags>|A]",
+ "Display active task list", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("pid", kdb_pid, "<pidnum>",
+ "Switch to another task", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("reboot", kdb_reboot, "",
+ "Reboot the machine immediately", 0, KDB_REPEAT_NONE);
+#if defined(CONFIG_MODULES)
+ kdb_register_repeat("lsmod", kdb_lsmod, "",
+ "List loaded kernel modules", 0, KDB_REPEAT_NONE);
+#endif
+#if defined(CONFIG_MAGIC_SYSRQ)
+ kdb_register_repeat("sr", kdb_sr, "<key>",
+ "Magic SysRq key", 0, KDB_REPEAT_NONE);
+#endif
+#if defined(CONFIG_PRINTK)
+ kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
+ "Display syslog buffer", 0, KDB_REPEAT_NONE);
+#endif
+ if (arch_kgdb_ops.enable_nmi) {
+ kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
+ "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
+ }
+ kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
+ "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
+ "Send a signal to a process", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("summary", kdb_summary, "",
+ "Summarize the system", 4, KDB_REPEAT_NONE);
+ kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
+ "Display per_cpu variables", 3, KDB_REPEAT_NONE);
+ kdb_register_repeat("grephelp", kdb_grep_help, "",
+ "Display help on | grep", 0, KDB_REPEAT_NONE);
+}
+
+/* Execute any commands defined in kdb_cmds. */
+static void __init kdb_cmd_init(void)
+{
+ int i, diag;
+ for (i = 0; kdb_cmds[i]; ++i) {
+ diag = kdb_parse(kdb_cmds[i]);
+ if (diag)
+ kdb_printf("kdb command %s failed, kdb diag %d\n",
+ kdb_cmds[i], diag);
+ }
+ if (defcmd_in_progress) {
+ kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n");
+ kdb_parse("endefcmd");
+ }
+}
+
+/* Initialize kdb_printf, breakpoint tables and kdb state */
+void __init kdb_init(int lvl)
+{
+ static int kdb_init_lvl = KDB_NOT_INITIALIZED;
+ int i;
+
+ if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl)
+ return;
+ for (i = kdb_init_lvl; i < lvl; i++) {
+ switch (i) {
+ case KDB_NOT_INITIALIZED:
+ kdb_inittab(); /* Initialize Command Table */
+ kdb_initbptab(); /* Initialize Breakpoints */
+ break;
+ case KDB_INIT_EARLY:
+ kdb_cmd_init(); /* Build kdb_cmds tables */
+ break;
+ }
+ }
+ kdb_init_lvl = lvl;
+}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
new file mode 100644
index 00000000000..7afd3c8c41d
--- /dev/null
+++ b/kernel/debug/kdb/kdb_private.h
@@ -0,0 +1,260 @@
+#ifndef _KDBPRIVATE_H
+#define _KDBPRIVATE_H
+
+/*
+ * Kernel Debugger Architecture Independent Private Headers
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/kgdb.h>
+#include "../debug_core.h"
+
+/* Kernel Debugger Command codes. Must not overlap with error codes. */
+#define KDB_CMD_GO (-1001)
+#define KDB_CMD_CPU (-1002)
+#define KDB_CMD_SS (-1003)
+#define KDB_CMD_KGDB (-1005)
+
+/* Internal debug flags */
+#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
+#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */
+#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */
+#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */
+#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */
+#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */
+#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */
+#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */
+
+#define KDB_DEBUG(flag) (kdb_flags & \
+ (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT))
+#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \
+ kdb_print_state(text, value)
+
+#if BITS_PER_LONG == 32
+
+#define KDB_PLATFORM_ENV "BYTESPERWORD=4"
+
+#define kdb_machreg_fmt "0x%lx"
+#define kdb_machreg_fmt0 "0x%08lx"
+#define kdb_bfd_vma_fmt "0x%lx"
+#define kdb_bfd_vma_fmt0 "0x%08lx"
+#define kdb_elfw_addr_fmt "0x%x"
+#define kdb_elfw_addr_fmt0 "0x%08x"
+#define kdb_f_count_fmt "%d"
+
+#elif BITS_PER_LONG == 64
+
+#define KDB_PLATFORM_ENV "BYTESPERWORD=8"
+
+#define kdb_machreg_fmt "0x%lx"
+#define kdb_machreg_fmt0 "0x%016lx"
+#define kdb_bfd_vma_fmt "0x%lx"
+#define kdb_bfd_vma_fmt0 "0x%016lx"
+#define kdb_elfw_addr_fmt "0x%x"
+#define kdb_elfw_addr_fmt0 "0x%016x"
+#define kdb_f_count_fmt "%ld"
+
+#endif
+
+/*
+ * KDB_MAXBPT describes the total number of breakpoints
+ * supported by this architecure.
+ */
+#define KDB_MAXBPT 16
+
+/* Symbol table format returned by kallsyms. */
+typedef struct __ksymtab {
+ unsigned long value; /* Address of symbol */
+ const char *mod_name; /* Module containing symbol or
+ * "kernel" */
+ unsigned long mod_start;
+ unsigned long mod_end;
+ const char *sec_name; /* Section containing symbol */
+ unsigned long sec_start;
+ unsigned long sec_end;
+ const char *sym_name; /* Full symbol name, including
+ * any version */
+ unsigned long sym_start;
+ unsigned long sym_end;
+ } kdb_symtab_t;
+extern int kallsyms_symbol_next(char *prefix_name, int flag);
+extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
+
+/* Exported Symbols for kernel loadable modules to use. */
+extern int kdb_getarea_size(void *, unsigned long, size_t);
+extern int kdb_putarea_size(unsigned long, void *, size_t);
+
+/*
+ * Like get_user and put_user, kdb_getarea and kdb_putarea take variable
+ * names, not pointers. The underlying *_size functions take pointers.
+ */
+#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x)))
+#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x)))
+
+extern int kdb_getphysword(unsigned long *word,
+ unsigned long addr, size_t size);
+extern int kdb_getword(unsigned long *, unsigned long, size_t);
+extern int kdb_putword(unsigned long, unsigned long, size_t);
+
+extern int kdbgetularg(const char *, unsigned long *);
+extern int kdbgetu64arg(const char *, u64 *);
+extern char *kdbgetenv(const char *);
+extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
+ long *, char **);
+extern int kdbgetsymval(const char *, kdb_symtab_t *);
+extern int kdbnearsym(unsigned long, kdb_symtab_t *);
+extern void kdbnearsym_cleanup(void);
+extern char *kdb_strdup(const char *str, gfp_t type);
+extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
+
+/* Routine for debugging the debugger state. */
+extern void kdb_print_state(const char *, int);
+
+extern int kdb_state;
+#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */
+#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */
+#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */
+#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under
+ * kdb control */
+#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
+#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
+#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
+ * after one ss, independent of
+ * DOING_SS */
+#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */
+#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */
+#define KDB_STATE_PAGER 0x00000400 /* pager is available */
+#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
+ * back to initial cpu */
+#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
+#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
+#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
+#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
+ * adjusted */
+#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */
+#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via
+ * keyboard on this cpu */
+#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
+#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
+#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
+#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
+ * specific use */
+
+#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag)
+#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag))
+#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag))
+
+extern int kdb_nextline; /* Current number of lines displayed */
+
+typedef struct _kdb_bp {
+ unsigned long bp_addr; /* Address breakpoint is present at */
+ unsigned int bp_free:1; /* This entry is available */
+ unsigned int bp_enabled:1; /* Breakpoint is active in register */
+ unsigned int bp_type:4; /* Uses hardware register */
+ unsigned int bp_installed:1; /* Breakpoint is installed */
+ unsigned int bp_delay:1; /* Do delayed bp handling */
+ unsigned int bp_delayed:1; /* Delayed breakpoint */
+ unsigned int bph_length; /* HW break length */
+} kdb_bp_t;
+
+#ifdef CONFIG_KGDB_KDB
+extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
+
+/* The KDB shell command table */
+typedef struct _kdbtab {
+ char *cmd_name; /* Command name */
+ kdb_func_t cmd_func; /* Function to execute command */
+ char *cmd_usage; /* Usage String for this command */
+ char *cmd_help; /* Help message for this command */
+ short cmd_flags; /* Parsing flags */
+ short cmd_minlen; /* Minimum legal # command
+ * chars required */
+ kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */
+} kdbtab_t;
+
+extern int kdb_bt(int, const char **); /* KDB display back trace */
+
+/* KDB breakpoint management functions */
+extern void kdb_initbptab(void);
+extern void kdb_bp_install(struct pt_regs *);
+extern void kdb_bp_remove(void);
+
+typedef enum {
+ KDB_DB_BPT, /* Breakpoint */
+ KDB_DB_SS, /* Single-step trap */
+ KDB_DB_SSBPT, /* Single step over breakpoint */
+ KDB_DB_NOBPT /* Spurious breakpoint */
+} kdb_dbtrap_t;
+
+extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
+ int, kdb_dbtrap_t, struct pt_regs *);
+
+/* Miscellaneous functions and data areas */
+extern int kdb_grepping_flag;
+extern char kdb_grep_string[];
+extern int kdb_grep_leading;
+extern int kdb_grep_trailing;
+extern char *kdb_cmds[];
+extern unsigned long kdb_task_state_string(const char *);
+extern char kdb_task_state_char (const struct task_struct *);
+extern unsigned long kdb_task_state(const struct task_struct *p,
+ unsigned long mask);
+extern void kdb_ps_suppressed(void);
+extern void kdb_ps1(const struct task_struct *p);
+extern void kdb_print_nameval(const char *name, unsigned long val);
+extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
+extern void kdb_meminfo_proc_show(void);
+extern char *kdb_getstr(char *, size_t, char *);
+extern void kdb_gdb_state_pass(char *buf);
+
+/* Defines for kdb_symbol_print */
+#define KDB_SP_SPACEB 0x0001 /* Space before string */
+#define KDB_SP_SPACEA 0x0002 /* Space after string */
+#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */
+#define KDB_SP_VALUE 0x0008 /* Print the value of the address */
+#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */
+#define KDB_SP_NEWLINE 0x0020 /* Newline after string */
+#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN)
+
+#define KDB_TSK(cpu) kgdb_info[cpu].task
+#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
+
+extern struct task_struct *kdb_curr_task(int);
+
+#define kdb_task_has_cpu(p) (task_curr(p))
+
+/* Simplify coexistence with NPTL */
+#define kdb_do_each_thread(g, p) do_each_thread(g, p)
+#define kdb_while_each_thread(g, p) while_each_thread(g, p)
+
+#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
+
+extern void *debug_kmalloc(size_t size, gfp_t flags);
+extern void debug_kfree(void *);
+extern void debug_kusage(void);
+
+extern void kdb_set_current_task(struct task_struct *);
+extern struct task_struct *kdb_current_task;
+
+#ifdef CONFIG_KDB_KEYBOARD
+extern void kdb_kbd_cleanup_state(void);
+#else /* ! CONFIG_KDB_KEYBOARD */
+#define kdb_kbd_cleanup_state()
+#endif /* ! CONFIG_KDB_KEYBOARD */
+
+#ifdef CONFIG_MODULES
+extern struct list_head *kdb_modules;
+#endif /* CONFIG_MODULES */
+
+extern char kdb_prompt_str[];
+
+#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
+
+#endif /* CONFIG_KGDB_KDB */
+#endif /* !_KDBPRIVATE_H */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
new file mode 100644
index 00000000000..d35cc2d3a4c
--- /dev/null
+++ b/kernel/debug/kdb/kdb_support.c
@@ -0,0 +1,927 @@
+/*
+ * Kernel Debugger Architecture Independent Support Functions
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ * 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
+ */
+
+#include <stdarg.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/kallsyms.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/ptrace.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/hardirq.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+#include <linux/kdb.h>
+#include <linux/slab.h>
+#include "kdb_private.h"
+
+/*
+ * kdbgetsymval - Return the address of the given symbol.
+ *
+ * Parameters:
+ * symname Character string containing symbol name
+ * symtab Structure to receive results
+ * Returns:
+ * 0 Symbol not found, symtab zero filled
+ * 1 Symbol mapped to module/symbol/section, data in symtab
+ */
+int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
+{
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
+ symtab);
+ memset(symtab, 0, sizeof(*symtab));
+ symtab->sym_start = kallsyms_lookup_name(symname);
+ if (symtab->sym_start) {
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbgetsymval: returns 1, "
+ "symtab->sym_start=0x%lx\n",
+ symtab->sym_start);
+ return 1;
+ }
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbgetsymval: returns 0\n");
+ return 0;
+}
+EXPORT_SYMBOL(kdbgetsymval);
+
+static char *kdb_name_table[100]; /* arbitrary size */
+
+/*
+ * kdbnearsym - Return the name of the symbol with the nearest address
+ * less than 'addr'.
+ *
+ * Parameters:
+ * addr Address to check for symbol near
+ * symtab Structure to receive results
+ * Returns:
+ * 0 No sections contain this address, symtab zero filled
+ * 1 Address mapped to module/symbol/section, data in symtab
+ * Remarks:
+ * 2.6 kallsyms has a "feature" where it unpacks the name into a
+ * string. If that string is reused before the caller expects it
+ * then the caller sees its string change without warning. To
+ * avoid cluttering up the main kdb code with lots of kdb_strdup,
+ * tests and kfree calls, kdbnearsym maintains an LRU list of the
+ * last few unique strings. The list is sized large enough to
+ * hold active strings, no kdb caller of kdbnearsym makes more
+ * than ~20 later calls before using a saved value.
+ */
+int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
+{
+ int ret = 0;
+ unsigned long symbolsize = 0;
+ unsigned long offset = 0;
+#define knt1_size 128 /* must be >= kallsyms table size */
+ char *knt1 = NULL;
+
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
+ memset(symtab, 0, sizeof(*symtab));
+
+ if (addr < 4096)
+ goto out;
+ knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
+ if (!knt1) {
+ kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
+ addr);
+ goto out;
+ }
+ symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset,
+ (char **)(&symtab->mod_name), knt1);
+ if (offset > 8*1024*1024) {
+ symtab->sym_name = NULL;
+ addr = offset = symbolsize = 0;
+ }
+ symtab->sym_start = addr - offset;
+ symtab->sym_end = symtab->sym_start + symbolsize;
+ ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
+
+ if (ret) {
+ int i;
+ /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
+ * set but the buffer passed into kallsyms_lookup is not used,
+ * so it contains garbage. The caller has to work out which
+ * buffer needs to be saved.
+ *
+ * What was Rusty smoking when he wrote that code?
+ */
+ if (symtab->sym_name != knt1) {
+ strncpy(knt1, symtab->sym_name, knt1_size);
+ knt1[knt1_size-1] = '\0';
+ }
+ for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
+ if (kdb_name_table[i] &&
+ strcmp(kdb_name_table[i], knt1) == 0)
+ break;
+ }
+ if (i >= ARRAY_SIZE(kdb_name_table)) {
+ debug_kfree(kdb_name_table[0]);
+ memcpy(kdb_name_table, kdb_name_table+1,
+ sizeof(kdb_name_table[0]) *
+ (ARRAY_SIZE(kdb_name_table)-1));
+ } else {
+ debug_kfree(knt1);
+ knt1 = kdb_name_table[i];
+ memcpy(kdb_name_table+i, kdb_name_table+i+1,
+ sizeof(kdb_name_table[0]) *
+ (ARRAY_SIZE(kdb_name_table)-i-1));
+ }
+ i = ARRAY_SIZE(kdb_name_table) - 1;
+ kdb_name_table[i] = knt1;
+ symtab->sym_name = kdb_name_table[i];
+ knt1 = NULL;
+ }
+
+ if (symtab->mod_name == NULL)
+ symtab->mod_name = "kernel";
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
+ "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
+ symtab->sym_start, symtab->mod_name, symtab->sym_name,
+ symtab->sym_name);
+
+out:
+ debug_kfree(knt1);
+ return ret;
+}
+
+void kdbnearsym_cleanup(void)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
+ if (kdb_name_table[i]) {
+ debug_kfree(kdb_name_table[i]);
+ kdb_name_table[i] = NULL;
+ }
+ }
+}
+
+static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
+
+/*
+ * kallsyms_symbol_complete
+ *
+ * Parameters:
+ * prefix_name prefix of a symbol name to lookup
+ * max_len maximum length that can be returned
+ * Returns:
+ * Number of symbols which match the given prefix.
+ * Notes:
+ * prefix_name is changed to contain the longest unique prefix that
+ * starts with this prefix (tab completion).
+ */
+int kallsyms_symbol_complete(char *prefix_name, int max_len)
+{
+ loff_t pos = 0;
+ int prefix_len = strlen(prefix_name), prev_len = 0;
+ int i, number = 0;
+ const char *name;
+
+ while ((name = kdb_walk_kallsyms(&pos))) {
+ if (strncmp(name, prefix_name, prefix_len) == 0) {
+ strcpy(ks_namebuf, name);
+ /* Work out the longest name that matches the prefix */
+ if (++number == 1) {
+ prev_len = min_t(int, max_len-1,
+ strlen(ks_namebuf));
+ memcpy(ks_namebuf_prev, ks_namebuf, prev_len);
+ ks_namebuf_prev[prev_len] = '\0';
+ continue;
+ }
+ for (i = 0; i < prev_len; i++) {
+ if (ks_namebuf[i] != ks_namebuf_prev[i]) {
+ prev_len = i;
+ ks_namebuf_prev[i] = '\0';
+ break;
+ }
+ }
+ }
+ }
+ if (prev_len > prefix_len)
+ memcpy(prefix_name, ks_namebuf_prev, prev_len+1);
+ return number;
+}
+
+/*
+ * kallsyms_symbol_next
+ *
+ * Parameters:
+ * prefix_name prefix of a symbol name to lookup
+ * flag 0 means search from the head, 1 means continue search.
+ * Returns:
+ * 1 if a symbol matches the given prefix.
+ * 0 if no string found
+ */
+int kallsyms_symbol_next(char *prefix_name, int flag)
+{
+ int prefix_len = strlen(prefix_name);
+ static loff_t pos;
+ const char *name;
+
+ if (!flag)
+ pos = 0;
+
+ while ((name = kdb_walk_kallsyms(&pos))) {
+ if (strncmp(name, prefix_name, prefix_len) == 0) {
+ strncpy(prefix_name, name, strlen(name)+1);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * kdb_symbol_print - Standard method for printing a symbol name and offset.
+ * Inputs:
+ * addr Address to be printed.
+ * symtab Address of symbol data, if NULL this routine does its
+ * own lookup.
+ * punc Punctuation for string, bit field.
+ * Remarks:
+ * The string and its punctuation is only printed if the address
+ * is inside the kernel, except that the value is always printed
+ * when requested.
+ */
+void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p,
+ unsigned int punc)
+{
+ kdb_symtab_t symtab, *symtab_p2;
+ if (symtab_p) {
+ symtab_p2 = (kdb_symtab_t *)symtab_p;
+ } else {
+ symtab_p2 = &symtab;
+ kdbnearsym(addr, symtab_p2);
+ }
+ if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE)))
+ return;
+ if (punc & KDB_SP_SPACEB)
+ kdb_printf(" ");
+ if (punc & KDB_SP_VALUE)
+ kdb_printf(kdb_machreg_fmt0, addr);
+ if (symtab_p2->sym_name) {
+ if (punc & KDB_SP_VALUE)
+ kdb_printf(" ");
+ if (punc & KDB_SP_PAREN)
+ kdb_printf("(");
+ if (strcmp(symtab_p2->mod_name, "kernel"))
+ kdb_printf("[%s]", symtab_p2->mod_name);
+ kdb_printf("%s", symtab_p2->sym_name);
+ if (addr != symtab_p2->sym_start)
+ kdb_printf("+0x%lx", addr - symtab_p2->sym_start);
+ if (punc & KDB_SP_SYMSIZE)
+ kdb_printf("/0x%lx",
+ symtab_p2->sym_end - symtab_p2->sym_start);
+ if (punc & KDB_SP_PAREN)
+ kdb_printf(")");
+ }
+ if (punc & KDB_SP_SPACEA)
+ kdb_printf(" ");
+ if (punc & KDB_SP_NEWLINE)
+ kdb_printf("\n");
+}
+
+/*
+ * kdb_strdup - kdb equivalent of strdup, for disasm code.
+ * Inputs:
+ * str The string to duplicate.
+ * type Flags to kmalloc for the new string.
+ * Returns:
+ * Address of the new string, NULL if storage could not be allocated.
+ * Remarks:
+ * This is not in lib/string.c because it uses kmalloc which is not
+ * available when string.o is used in boot loaders.
+ */
+char *kdb_strdup(const char *str, gfp_t type)
+{
+ int n = strlen(str)+1;
+ char *s = kmalloc(n, type);
+ if (!s)
+ return NULL;
+ return strcpy(s, str);
+}
+
+/*
+ * kdb_getarea_size - Read an area of data. The kdb equivalent of
+ * copy_from_user, with kdb messages for invalid addresses.
+ * Inputs:
+ * res Pointer to the area to receive the result.
+ * addr Address of the area to copy.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_getarea_size(void *res, unsigned long addr, size_t size)
+{
+ int ret = probe_kernel_read((char *)res, (char *)addr, size);
+ if (ret) {
+ if (!KDB_STATE(SUPPRESS)) {
+ kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
+ KDB_STATE_SET(SUPPRESS);
+ }
+ ret = KDB_BADADDR;
+ } else {
+ KDB_STATE_CLEAR(SUPPRESS);
+ }
+ return ret;
+}
+
+/*
+ * kdb_putarea_size - Write an area of data. The kdb equivalent of
+ * copy_to_user, with kdb messages for invalid addresses.
+ * Inputs:
+ * addr Address of the area to write to.
+ * res Pointer to the area holding the data.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_putarea_size(unsigned long addr, void *res, size_t size)
+{
+ int ret = probe_kernel_read((char *)addr, (char *)res, size);
+ if (ret) {
+ if (!KDB_STATE(SUPPRESS)) {
+ kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
+ KDB_STATE_SET(SUPPRESS);
+ }
+ ret = KDB_BADADDR;
+ } else {
+ KDB_STATE_CLEAR(SUPPRESS);
+ }
+ return ret;
+}
+
+/*
+ * kdb_getphys - Read data from a physical address. Validate the
+ * address is in range, use kmap_atomic() to get data
+ * similar to kdb_getarea() - but for phys addresses
+ * Inputs:
+ * res Pointer to the word to receive the result
+ * addr Physical address of the area to copy
+ * size Size of the area
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+static int kdb_getphys(void *res, unsigned long addr, size_t size)
+{
+ unsigned long pfn;
+ void *vaddr;
+ struct page *page;
+
+ pfn = (addr >> PAGE_SHIFT);
+ if (!pfn_valid(pfn))
+ return 1;
+ page = pfn_to_page(pfn);
+ vaddr = kmap_atomic(page);
+ memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
+ kunmap_atomic(vaddr);
+
+ return 0;
+}
+
+/*
+ * kdb_getphysword
+ * Inputs:
+ * word Pointer to the word to receive the result.
+ * addr Address of the area to copy.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
+{
+ int diag;
+ __u8 w1;
+ __u16 w2;
+ __u32 w4;
+ __u64 w8;
+ *word = 0; /* Default value if addr or size is invalid */
+
+ switch (size) {
+ case 1:
+ diag = kdb_getphys(&w1, addr, sizeof(w1));
+ if (!diag)
+ *word = w1;
+ break;
+ case 2:
+ diag = kdb_getphys(&w2, addr, sizeof(w2));
+ if (!diag)
+ *word = w2;
+ break;
+ case 4:
+ diag = kdb_getphys(&w4, addr, sizeof(w4));
+ if (!diag)
+ *word = w4;
+ break;
+ case 8:
+ if (size <= sizeof(*word)) {
+ diag = kdb_getphys(&w8, addr, sizeof(w8));
+ if (!diag)
+ *word = w8;
+ break;
+ }
+ /* drop through */
+ default:
+ diag = KDB_BADWIDTH;
+ kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
+ }
+ return diag;
+}
+
+/*
+ * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats
+ * data as numbers.
+ * Inputs:
+ * word Pointer to the word to receive the result.
+ * addr Address of the area to copy.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
+{
+ int diag;
+ __u8 w1;
+ __u16 w2;
+ __u32 w4;
+ __u64 w8;
+ *word = 0; /* Default value if addr or size is invalid */
+ switch (size) {
+ case 1:
+ diag = kdb_getarea(w1, addr);
+ if (!diag)
+ *word = w1;
+ break;
+ case 2:
+ diag = kdb_getarea(w2, addr);
+ if (!diag)
+ *word = w2;
+ break;
+ case 4:
+ diag = kdb_getarea(w4, addr);
+ if (!diag)
+ *word = w4;
+ break;
+ case 8:
+ if (size <= sizeof(*word)) {
+ diag = kdb_getarea(w8, addr);
+ if (!diag)
+ *word = w8;
+ break;
+ }
+ /* drop through */
+ default:
+ diag = KDB_BADWIDTH;
+ kdb_printf("kdb_getword: bad width %ld\n", (long) size);
+ }
+ return diag;
+}
+
+/*
+ * kdb_putword - Write a binary value. Unlike kdb_putarea, this
+ * treats data as numbers.
+ * Inputs:
+ * addr Address of the area to write to..
+ * word The value to set.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_putword(unsigned long addr, unsigned long word, size_t size)
+{
+ int diag;
+ __u8 w1;
+ __u16 w2;
+ __u32 w4;
+ __u64 w8;
+ switch (size) {
+ case 1:
+ w1 = word;
+ diag = kdb_putarea(addr, w1);
+ break;
+ case 2:
+ w2 = word;
+ diag = kdb_putarea(addr, w2);
+ break;
+ case 4:
+ w4 = word;
+ diag = kdb_putarea(addr, w4);
+ break;
+ case 8:
+ if (size <= sizeof(word)) {
+ w8 = word;
+ diag = kdb_putarea(addr, w8);
+ break;
+ }
+ /* drop through */
+ default:
+ diag = KDB_BADWIDTH;
+ kdb_printf("kdb_putword: bad width %ld\n", (long) size);
+ }
+ return diag;
+}
+
+/*
+ * kdb_task_state_string - Convert a string containing any of the
+ * letters DRSTCZEUIMA to a mask for the process state field and
+ * return the value. If no argument is supplied, return the mask
+ * that corresponds to environment variable PS, DRSTCZEU by
+ * default.
+ * Inputs:
+ * s String to convert
+ * Returns:
+ * Mask for process state.
+ * Notes:
+ * The mask folds data from several sources into a single long value, so
+ * be careful not to overlap the bits. TASK_* bits are in the LSB,
+ * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
+ * is no overlap between TASK_* and EXIT_* but that may not always be
+ * true, so EXIT_* bits are shifted left 16 bits before being stored in
+ * the mask.
+ */
+
+/* unrunnable is < 0 */
+#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
+#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
+#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
+#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
+
+unsigned long kdb_task_state_string(const char *s)
+{
+ long res = 0;
+ if (!s) {
+ s = kdbgetenv("PS");
+ if (!s)
+ s = "DRSTCZEU"; /* default value for ps */
+ }
+ while (*s) {
+ switch (*s) {
+ case 'D':
+ res |= TASK_UNINTERRUPTIBLE;
+ break;
+ case 'R':
+ res |= RUNNING;
+ break;
+ case 'S':
+ res |= TASK_INTERRUPTIBLE;
+ break;
+ case 'T':
+ res |= TASK_STOPPED;
+ break;
+ case 'C':
+ res |= TASK_TRACED;
+ break;
+ case 'Z':
+ res |= EXIT_ZOMBIE << 16;
+ break;
+ case 'E':
+ res |= EXIT_DEAD << 16;
+ break;
+ case 'U':
+ res |= UNRUNNABLE;
+ break;
+ case 'I':
+ res |= IDLE;
+ break;
+ case 'M':
+ res |= DAEMON;
+ break;
+ case 'A':
+ res = ~0UL;
+ break;
+ default:
+ kdb_printf("%s: unknown flag '%c' ignored\n",
+ __func__, *s);
+ break;
+ }
+ ++s;
+ }
+ return res;
+}
+
+/*
+ * kdb_task_state_char - Return the character that represents the task state.
+ * Inputs:
+ * p struct task for the process
+ * Returns:
+ * One character to represent the task state.
+ */
+char kdb_task_state_char (const struct task_struct *p)
+{
+ int cpu;
+ char state;
+ unsigned long tmp;
+
+ if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ return 'E';
+
+ cpu = kdb_process_cpu(p);
+ state = (p->state == 0) ? 'R' :
+ (p->state < 0) ? 'U' :
+ (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
+ (p->state & TASK_STOPPED) ? 'T' :
+ (p->state & TASK_TRACED) ? 'C' :
+ (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
+ (p->exit_state & EXIT_DEAD) ? 'E' :
+ (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
+ if (is_idle_task(p)) {
+ /* Idle task. Is it really idle, apart from the kdb
+ * interrupt? */
+ if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
+ if (cpu != kdb_initial_cpu)
+ state = 'I'; /* idle task */
+ }
+ } else if (!p->mm && state == 'S') {
+ state = 'M'; /* sleeping system daemon */
+ }
+ return state;
+}
+
+/*
+ * kdb_task_state - Return true if a process has the desired state
+ * given by the mask.
+ * Inputs:
+ * p struct task for the process
+ * mask mask from kdb_task_state_string to select processes
+ * Returns:
+ * True if the process matches at least one criteria defined by the mask.
+ */
+unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
+{
+ char state[] = { kdb_task_state_char(p), '\0' };
+ return (mask & kdb_task_state_string(state)) != 0;
+}
+
+/*
+ * kdb_print_nameval - Print a name and its value, converting the
+ * value to a symbol lookup if possible.
+ * Inputs:
+ * name field name to print
+ * val value of field
+ */
+void kdb_print_nameval(const char *name, unsigned long val)
+{
+ kdb_symtab_t symtab;
+ kdb_printf(" %-11.11s ", name);
+ if (kdbnearsym(val, &symtab))
+ kdb_symbol_print(val, &symtab,
+ KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
+ else
+ kdb_printf("0x%lx\n", val);
+}
+
+/* Last ditch allocator for debugging, so we can still debug even when
+ * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
+ * for space usage, not for speed. One smallish memory pool, the free
+ * chain is always in ascending address order to allow coalescing,
+ * allocations are done in brute force best fit.
+ */
+
+struct debug_alloc_header {
+ u32 next; /* offset of next header from start of pool */
+ u32 size;
+ void *caller;
+};
+
+/* The memory returned by this allocator must be aligned, which means
+ * so must the header size. Do not assume that sizeof(struct
+ * debug_alloc_header) is a multiple of the alignment, explicitly
+ * calculate the overhead of this header, including the alignment.
+ * The rest of this code must not use sizeof() on any header or
+ * pointer to a header.
+ */
+#define dah_align 8
+#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
+
+static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
+static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
+static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
+
+/* Locking is awkward. The debug code is called from all contexts,
+ * including non maskable interrupts. A normal spinlock is not safe
+ * in NMI context. Try to get the debug allocator lock, if it cannot
+ * be obtained after a second then give up. If the lock could not be
+ * previously obtained on this cpu then only try once.
+ *
+ * sparse has no annotation for "this function _sometimes_ acquires a
+ * lock", so fudge the acquire/release notation.
+ */
+static DEFINE_SPINLOCK(dap_lock);
+static int get_dap_lock(void)
+ __acquires(dap_lock)
+{
+ static int dap_locked = -1;
+ int count;
+ if (dap_locked == smp_processor_id())
+ count = 1;
+ else
+ count = 1000;
+ while (1) {
+ if (spin_trylock(&dap_lock)) {
+ dap_locked = -1;
+ return 1;
+ }
+ if (!count--)
+ break;
+ udelay(1000);
+ }
+ dap_locked = smp_processor_id();
+ __acquire(dap_lock);
+ return 0;
+}
+
+void *debug_kmalloc(size_t size, gfp_t flags)
+{
+ unsigned int rem, h_offset;
+ struct debug_alloc_header *best, *bestprev, *prev, *h;
+ void *p = NULL;
+ if (!get_dap_lock()) {
+ __release(dap_lock); /* we never actually got it */
+ return NULL;
+ }
+ h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
+ if (dah_first_call) {
+ h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
+ dah_first_call = 0;
+ }
+ size = ALIGN(size, dah_align);
+ prev = best = bestprev = NULL;
+ while (1) {
+ if (h->size >= size && (!best || h->size < best->size)) {
+ best = h;
+ bestprev = prev;
+ if (h->size == size)
+ break;
+ }
+ if (!h->next)
+ break;
+ prev = h;
+ h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
+ }
+ if (!best)
+ goto out;
+ rem = best->size - size;
+ /* The pool must always contain at least one header */
+ if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
+ goto out;
+ if (rem >= dah_overhead) {
+ best->size = size;
+ h_offset = ((char *)best - debug_alloc_pool) +
+ dah_overhead + best->size;
+ h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
+ h->size = rem - dah_overhead;
+ h->next = best->next;
+ } else
+ h_offset = best->next;
+ best->caller = __builtin_return_address(0);
+ dah_used += best->size;
+ dah_used_max = max(dah_used, dah_used_max);
+ if (bestprev)
+ bestprev->next = h_offset;
+ else
+ dah_first = h_offset;
+ p = (char *)best + dah_overhead;
+ memset(p, POISON_INUSE, best->size - 1);
+ *((char *)p + best->size - 1) = POISON_END;
+out:
+ spin_unlock(&dap_lock);
+ return p;
+}
+
+void debug_kfree(void *p)
+{
+ struct debug_alloc_header *h;
+ unsigned int h_offset;
+ if (!p)
+ return;
+ if ((char *)p < debug_alloc_pool ||
+ (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
+ kfree(p);
+ return;
+ }
+ if (!get_dap_lock()) {
+ __release(dap_lock); /* we never actually got it */
+ return; /* memory leak, cannot be helped */
+ }
+ h = (struct debug_alloc_header *)((char *)p - dah_overhead);
+ memset(p, POISON_FREE, h->size - 1);
+ *((char *)p + h->size - 1) = POISON_END;
+ h->caller = NULL;
+ dah_used -= h->size;
+ h_offset = (char *)h - debug_alloc_pool;
+ if (h_offset < dah_first) {
+ h->next = dah_first;
+ dah_first = h_offset;
+ } else {
+ struct debug_alloc_header *prev;
+ unsigned int prev_offset;
+ prev = (struct debug_alloc_header *)(debug_alloc_pool +
+ dah_first);
+ while (1) {
+ if (!prev->next || prev->next > h_offset)
+ break;
+ prev = (struct debug_alloc_header *)
+ (debug_alloc_pool + prev->next);
+ }
+ prev_offset = (char *)prev - debug_alloc_pool;
+ if (prev_offset + dah_overhead + prev->size == h_offset) {
+ prev->size += dah_overhead + h->size;
+ memset(h, POISON_FREE, dah_overhead - 1);
+ *((char *)h + dah_overhead - 1) = POISON_END;
+ h = prev;
+ h_offset = prev_offset;
+ } else {
+ h->next = prev->next;
+ prev->next = h_offset;
+ }
+ }
+ if (h_offset + dah_overhead + h->size == h->next) {
+ struct debug_alloc_header *next;
+ next = (struct debug_alloc_header *)
+ (debug_alloc_pool + h->next);
+ h->size += dah_overhead + next->size;
+ h->next = next->next;
+ memset(next, POISON_FREE, dah_overhead - 1);
+ *((char *)next + dah_overhead - 1) = POISON_END;
+ }
+ spin_unlock(&dap_lock);
+}
+
+void debug_kusage(void)
+{
+ struct debug_alloc_header *h_free, *h_used;
+#ifdef CONFIG_IA64
+ /* FIXME: using dah for ia64 unwind always results in a memory leak.
+ * Fix that memory leak first, then set debug_kusage_one_time = 1 for
+ * all architectures.
+ */
+ static int debug_kusage_one_time;
+#else
+ static int debug_kusage_one_time = 1;
+#endif
+ if (!get_dap_lock()) {
+ __release(dap_lock); /* we never actually got it */
+ return;
+ }
+ h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
+ if (dah_first == 0 &&
+ (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
+ dah_first_call))
+ goto out;
+ if (!debug_kusage_one_time)
+ goto out;
+ debug_kusage_one_time = 0;
+ kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
+ __func__, dah_first);
+ if (dah_first) {
+ h_used = (struct debug_alloc_header *)debug_alloc_pool;
+ kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
+ h_used->size);
+ }
+ do {
+ h_used = (struct debug_alloc_header *)
+ ((char *)h_free + dah_overhead + h_free->size);
+ kdb_printf("%s: h_used %p size %d caller %p\n",
+ __func__, h_used, h_used->size, h_used->caller);
+ h_free = (struct debug_alloc_header *)
+ (debug_alloc_pool + h_free->next);
+ } while (h_free->next);
+ h_used = (struct debug_alloc_header *)
+ ((char *)h_free + dah_overhead + h_free->size);
+ if ((char *)h_used - debug_alloc_pool !=
+ sizeof(debug_alloc_pool_aligned))
+ kdb_printf("%s: h_used %p size %d caller %p\n",
+ __func__, h_used, h_used->size, h_used->caller);
+out:
+ spin_unlock(&dap_lock);
+}
+
+/* Maintain a small stack of kdb_flags to allow recursion without disturbing
+ * the global kdb state.
+ */
+
+static int kdb_flags_stack[4], kdb_flags_index;
+
+void kdb_save_flags(void)
+{
+ BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
+ kdb_flags_stack[kdb_flags_index++] = kdb_flags;
+}
+
+void kdb_restore_flags(void)
+{
+ BUG_ON(kdb_flags_index <= 0);
+ kdb_flags = kdb_flags_stack[--kdb_flags_index];
+}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ead9b610aa7..54996b71e66 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,8 +19,10 @@
#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>
+#include <linux/module.h>
int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
+EXPORT_SYMBOL_GPL(delayacct_on);
struct kmem_cache *delayacct_cache;
static int __init delayacct_setup_disable(char *str)
@@ -104,20 +106,17 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
unsigned long long t2, t3;
unsigned long flags;
struct timespec ts;
-
- /* Though tsk->delays accessed later, early exit avoids
- * unnecessary returning of other data
- */
- if (!tsk->delays)
- goto done;
+ cputime_t utime, stime, stimescaled, utimescaled;
tmp = (s64)d->cpu_run_real_total;
- cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+ task_cputime(tsk, &utime, &stime);
+ cputime_to_timespec(utime + stime, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
tmp = (s64)d->cpu_scaled_run_real_total;
- cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
+ task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+ cputime_to_timespec(utimescaled + stimescaled, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
@@ -153,7 +152,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->freepages_count += tsk->delays->freepages_count;
spin_unlock_irqrestore(&tsk->delays->lock, flags);
-done:
return 0;
}
diff --git a/kernel/dma.c b/kernel/dma.c
index f903189c530..6c6262f86c1 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -9,7 +9,7 @@
* [It also happened to remove the sizeof(char *) == sizeof(int)
* assumption introduced because of those /proc/dma patches. -- Hennus]
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
@@ -18,7 +18,6 @@
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <asm/dma.h>
-#include <asm/system.h>
diff --git a/kernel/early_res.c b/kernel/early_res.c
deleted file mode 100644
index aa5494ac446..00000000000
--- a/kernel/early_res.c
+++ /dev/null
@@ -1,513 +0,0 @@
-/*
- * early_res, could be used to replace bootmem
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/early_res.h>
-
-/*
- * Early reserved memory areas.
- */
-/*
- * need to make sure this one is bigger enough before
- * find_fw_memmap_area could be used
- */
-#define MAX_EARLY_RES_X 32
-
-struct early_res {
- u64 start, end;
- char name[15];
- char overlap_ok;
-};
-static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
-
-static int max_early_res __initdata = MAX_EARLY_RES_X;
-static struct early_res *early_res __initdata = &early_res_x[0];
-static int early_res_count __initdata;
-
-static int __init find_overlapped_early(u64 start, u64 end)
-{
- int i;
- struct early_res *r;
-
- for (i = 0; i < max_early_res && early_res[i].end; i++) {
- r = &early_res[i];
- if (end > r->start && start < r->end)
- break;
- }
-
- return i;
-}
-
-/*
- * Drop the i-th range from the early reservation map,
- * by copying any higher ranges down one over it, and
- * clearing what had been the last slot.
- */
-static void __init drop_range(int i)
-{
- int j;
-
- for (j = i + 1; j < max_early_res && early_res[j].end; j++)
- ;
-
- memmove(&early_res[i], &early_res[i + 1],
- (j - 1 - i) * sizeof(struct early_res));
-
- early_res[j - 1].end = 0;
- early_res_count--;
-}
-
-/*
- * Split any existing ranges that:
- * 1) are marked 'overlap_ok', and
- * 2) overlap with the stated range [start, end)
- * into whatever portion (if any) of the existing range is entirely
- * below or entirely above the stated range. Drop the portion
- * of the existing range that overlaps with the stated range,
- * which will allow the caller of this routine to then add that
- * stated range without conflicting with any existing range.
- */
-static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
-{
- int i;
- struct early_res *r;
- u64 lower_start, lower_end;
- u64 upper_start, upper_end;
- char name[15];
-
- for (i = 0; i < max_early_res && early_res[i].end; i++) {
- r = &early_res[i];
-
- /* Continue past non-overlapping ranges */
- if (end <= r->start || start >= r->end)
- continue;
-
- /*
- * Leave non-ok overlaps as is; let caller
- * panic "Overlapping early reservations"
- * when it hits this overlap.
- */
- if (!r->overlap_ok)
- return;
-
- /*
- * We have an ok overlap. We will drop it from the early
- * reservation map, and add back in any non-overlapping
- * portions (lower or upper) as separate, overlap_ok,
- * non-overlapping ranges.
- */
-
- /* 1. Note any non-overlapping (lower or upper) ranges. */
- strncpy(name, r->name, sizeof(name) - 1);
-
- lower_start = lower_end = 0;
- upper_start = upper_end = 0;
- if (r->start < start) {
- lower_start = r->start;
- lower_end = start;
- }
- if (r->end > end) {
- upper_start = end;
- upper_end = r->end;
- }
-
- /* 2. Drop the original ok overlapping range */
- drop_range(i);
-
- i--; /* resume for-loop on copied down entry */
-
- /* 3. Add back in any non-overlapping ranges. */
- if (lower_end)
- reserve_early_overlap_ok(lower_start, lower_end, name);
- if (upper_end)
- reserve_early_overlap_ok(upper_start, upper_end, name);
- }
-}
-
-static void __init __reserve_early(u64 start, u64 end, char *name,
- int overlap_ok)
-{
- int i;
- struct early_res *r;
-
- i = find_overlapped_early(start, end);
- if (i >= max_early_res)
- panic("Too many early reservations");
- r = &early_res[i];
- if (r->end)
- panic("Overlapping early reservations "
- "%llx-%llx %s to %llx-%llx %s\n",
- start, end - 1, name ? name : "", r->start,
- r->end - 1, r->name);
- r->start = start;
- r->end = end;
- r->overlap_ok = overlap_ok;
- if (name)
- strncpy(r->name, name, sizeof(r->name) - 1);
- early_res_count++;
-}
-
-/*
- * A few early reservtations come here.
- *
- * The 'overlap_ok' in the name of this routine does -not- mean it
- * is ok for these reservations to overlap an earlier reservation.
- * Rather it means that it is ok for subsequent reservations to
- * overlap this one.
- *
- * Use this entry point to reserve early ranges when you are doing
- * so out of "Paranoia", reserving perhaps more memory than you need,
- * just in case, and don't mind a subsequent overlapping reservation
- * that is known to be needed.
- *
- * The drop_overlaps_that_are_ok() call here isn't really needed.
- * It would be needed if we had two colliding 'overlap_ok'
- * reservations, so that the second such would not panic on the
- * overlap with the first. We don't have any such as of this
- * writing, but might as well tolerate such if it happens in
- * the future.
- */
-void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
-{
- drop_overlaps_that_are_ok(start, end);
- __reserve_early(start, end, name, 1);
-}
-
-static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
-{
- u64 start, end, size, mem;
- struct early_res *new;
-
- /* do we have enough slots left ? */
- if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
- return;
-
- /* double it */
- mem = -1ULL;
- size = sizeof(struct early_res) * max_early_res * 2;
- if (early_res == early_res_x)
- start = 0;
- else
- start = early_res[0].end;
- end = ex_start;
- if (start + size < end)
- mem = find_fw_memmap_area(start, end, size,
- sizeof(struct early_res));
- if (mem == -1ULL) {
- start = ex_end;
- end = get_max_mapped();
- if (start + size < end)
- mem = find_fw_memmap_area(start, end, size,
- sizeof(struct early_res));
- }
- if (mem == -1ULL)
- panic("can not find more space for early_res array");
-
- new = __va(mem);
- /* save the first one for own */
- new[0].start = mem;
- new[0].end = mem + size;
- new[0].overlap_ok = 0;
- /* copy old to new */
- if (early_res == early_res_x) {
- memcpy(&new[1], &early_res[0],
- sizeof(struct early_res) * max_early_res);
- memset(&new[max_early_res+1], 0,
- sizeof(struct early_res) * (max_early_res - 1));
- early_res_count++;
- } else {
- memcpy(&new[1], &early_res[1],
- sizeof(struct early_res) * (max_early_res - 1));
- memset(&new[max_early_res], 0,
- sizeof(struct early_res) * max_early_res);
- }
- memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
- early_res = new;
- max_early_res *= 2;
- printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
- max_early_res, mem, mem + size - 1);
-}
-
-/*
- * Most early reservations come here.
- *
- * We first have drop_overlaps_that_are_ok() drop any pre-existing
- * 'overlap_ok' ranges, so that we can then reserve this memory
- * range without risk of panic'ing on an overlapping overlap_ok
- * early reservation.
- */
-void __init reserve_early(u64 start, u64 end, char *name)
-{
- if (start >= end)
- return;
-
- __check_and_double_early_res(start, end);
-
- drop_overlaps_that_are_ok(start, end);
- __reserve_early(start, end, name, 0);
-}
-
-void __init reserve_early_without_check(u64 start, u64 end, char *name)
-{
- struct early_res *r;
-
- if (start >= end)
- return;
-
- __check_and_double_early_res(start, end);
-
- r = &early_res[early_res_count];
-
- r->start = start;
- r->end = end;
- r->overlap_ok = 0;
- if (name)
- strncpy(r->name, name, sizeof(r->name) - 1);
- early_res_count++;
-}
-
-void __init free_early(u64 start, u64 end)
-{
- struct early_res *r;
- int i;
-
- i = find_overlapped_early(start, end);
- r = &early_res[i];
- if (i >= max_early_res || r->end != end || r->start != start)
- panic("free_early on not reserved area: %llx-%llx!",
- start, end - 1);
-
- drop_range(i);
-}
-
-#ifdef CONFIG_NO_BOOTMEM
-static void __init subtract_early_res(struct range *range, int az)
-{
- int i, count;
- u64 final_start, final_end;
- int idx = 0;
-
- count = 0;
- for (i = 0; i < max_early_res && early_res[i].end; i++)
- count++;
-
- /* need to skip first one ?*/
- if (early_res != early_res_x)
- idx = 1;
-
-#define DEBUG_PRINT_EARLY_RES 1
-
-#if DEBUG_PRINT_EARLY_RES
- printk(KERN_INFO "Subtract (%d early reservations)\n", count);
-#endif
- for (i = idx; i < count; i++) {
- struct early_res *r = &early_res[i];
-#if DEBUG_PRINT_EARLY_RES
- printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
- r->start, r->end, r->name);
-#endif
- final_start = PFN_DOWN(r->start);
- final_end = PFN_UP(r->end);
- if (final_start >= final_end)
- continue;
- subtract_range(range, az, final_start, final_end);
- }
-
-}
-
-int __init get_free_all_memory_range(struct range **rangep, int nodeid)
-{
- int i, count;
- u64 start = 0, end;
- u64 size;
- u64 mem;
- struct range *range;
- int nr_range;
-
- count = 0;
- for (i = 0; i < max_early_res && early_res[i].end; i++)
- count++;
-
- count *= 2;
-
- size = sizeof(struct range) * count;
- end = get_max_mapped();
-#ifdef MAX_DMA32_PFN
- if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
- start = MAX_DMA32_PFN << PAGE_SHIFT;
-#endif
- mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
- if (mem == -1ULL)
- panic("can not find more space for range free");
-
- range = __va(mem);
- /* use early_node_map[] and early_res to get range array at first */
- memset(range, 0, size);
- nr_range = 0;
-
- /* need to go over early_node_map to find out good range for node */
- nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
-#ifdef CONFIG_X86_32
- subtract_range(range, count, max_low_pfn, -1ULL);
-#endif
- subtract_early_res(range, count);
- nr_range = clean_sort_range(range, count);
-
- /* need to clear it ? */
- if (nodeid == MAX_NUMNODES) {
- memset(&early_res[0], 0,
- sizeof(struct early_res) * max_early_res);
- early_res = NULL;
- max_early_res = 0;
- }
-
- *rangep = range;
- return nr_range;
-}
-#else
-void __init early_res_to_bootmem(u64 start, u64 end)
-{
- int i, count;
- u64 final_start, final_end;
- int idx = 0;
-
- count = 0;
- for (i = 0; i < max_early_res && early_res[i].end; i++)
- count++;
-
- /* need to skip first one ?*/
- if (early_res != early_res_x)
- idx = 1;
-
- printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
- count - idx, max_early_res, start, end);
- for (i = idx; i < count; i++) {
- struct early_res *r = &early_res[i];
- printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
- r->start, r->end, r->name);
- final_start = max(start, r->start);
- final_end = min(end, r->end);
- if (final_start >= final_end) {
- printk(KERN_CONT "\n");
- continue;
- }
- printk(KERN_CONT " ==> [%010llx - %010llx]\n",
- final_start, final_end);
- reserve_bootmem_generic(final_start, final_end - final_start,
- BOOTMEM_DEFAULT);
- }
- /* clear them */
- memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
- early_res = NULL;
- max_early_res = 0;
- early_res_count = 0;
-}
-#endif
-
-/* Check for already reserved areas */
-static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
-{
- int i;
- u64 addr = *addrp;
- int changed = 0;
- struct early_res *r;
-again:
- i = find_overlapped_early(addr, addr + size);
- r = &early_res[i];
- if (i < max_early_res && r->end) {
- *addrp = addr = round_up(r->end, align);
- changed = 1;
- goto again;
- }
- return changed;
-}
-
-/* Check for already reserved areas */
-static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
-{
- int i;
- u64 addr = *addrp, last;
- u64 size = *sizep;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < max_early_res && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last > r->start && addr < r->start) {
- size = r->start - addr;
- changed = 1;
- goto again;
- }
- if (last > r->end && addr < r->end) {
- addr = round_up(r->end, align);
- size = last - addr;
- changed = 1;
- goto again;
- }
- if (last <= r->end && addr >= r->start) {
- (*sizep)++;
- return 0;
- }
- }
- if (changed) {
- *addrp = addr;
- *sizep = size;
- }
- return changed;
-}
-
-/*
- * Find a free area with specified alignment in a specific range.
- * only with the area.between start to end is active range from early_node_map
- * so they are good as RAM
- */
-u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
- u64 size, u64 align)
-{
- u64 addr, last;
-
- addr = round_up(ei_start, align);
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- goto out;
- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
- ;
- last = addr + size;
- if (last > ei_last)
- goto out;
- if (last > end)
- goto out;
-
- return addr;
-
-out:
- return -1ULL;
-}
-
-u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
- u64 *sizep, u64 align)
-{
- u64 addr, last;
-
- addr = round_up(ei_start, align);
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- goto out;
- *sizep = ei_last - addr;
- while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
- ;
- last = addr + *sizep;
- if (last > ei_last)
- goto out;
-
- return addr;
-
-out:
- return -1ULL;
-}
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 00000000000..e556751d15d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,24 @@
+#include <linux/elf.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/binfmts.h>
+
+Elf_Half __weak elf_core_extra_phdrs(void)
+{
+ return 0;
+}
+
+int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
+{
+ return 1;
+}
+
+int __weak elf_core_write_extra_data(struct coredump_params *cprm)
+{
+ return 1;
+}
+
+size_t __weak elf_core_extra_data_size(void)
+{
+ return 0;
+}
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 00000000000..103f5d147b2
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,9 @@
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_core.o = -pg
+endif
+
+obj-y := core.o ring_buffer.o callchain.o
+
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_UPROBES) += uprobes.o
+
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
new file mode 100644
index 00000000000..97b67df8fbf
--- /dev/null
+++ b/kernel/events/callchain.c
@@ -0,0 +1,209 @@
+/*
+ * Performance events callchain code, extracted from core.c:
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+struct callchain_cpus_entries {
+ struct rcu_head rcu_head;
+ struct perf_callchain_entry *cpu_entries[0];
+};
+
+static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+static atomic_t nr_callchain_events;
+static DEFINE_MUTEX(callchain_mutex);
+static struct callchain_cpus_entries *callchain_cpus_entries;
+
+
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
+{
+}
+
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
+{
+}
+
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+ struct callchain_cpus_entries *entries;
+ int cpu;
+
+ entries = container_of(head, struct callchain_cpus_entries, rcu_head);
+
+ for_each_possible_cpu(cpu)
+ kfree(entries->cpu_entries[cpu]);
+
+ kfree(entries);
+}
+
+static void release_callchain_buffers(void)
+{
+ struct callchain_cpus_entries *entries;
+
+ entries = callchain_cpus_entries;
+ rcu_assign_pointer(callchain_cpus_entries, NULL);
+ call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
+
+static int alloc_callchain_buffers(void)
+{
+ int cpu;
+ int size;
+ struct callchain_cpus_entries *entries;
+
+ /*
+ * We can't use the percpu allocation API for data that can be
+ * accessed from NMI. Use a temporary manual per cpu allocation
+ * until that gets sorted out.
+ */
+ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
+
+ entries = kzalloc(size, GFP_KERNEL);
+ if (!entries)
+ return -ENOMEM;
+
+ size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+
+ for_each_possible_cpu(cpu) {
+ entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!entries->cpu_entries[cpu])
+ goto fail;
+ }
+
+ rcu_assign_pointer(callchain_cpus_entries, entries);
+
+ return 0;
+
+fail:
+ for_each_possible_cpu(cpu)
+ kfree(entries->cpu_entries[cpu]);
+ kfree(entries);
+
+ return -ENOMEM;
+}
+
+int get_callchain_buffers(void)
+{
+ int err = 0;
+ int count;
+
+ mutex_lock(&callchain_mutex);
+
+ count = atomic_inc_return(&nr_callchain_events);
+ if (WARN_ON_ONCE(count < 1)) {
+ err = -EINVAL;
+ goto exit;
+ }
+
+ if (count > 1) {
+ /* If the allocation failed, give up */
+ if (!callchain_cpus_entries)
+ err = -ENOMEM;
+ goto exit;
+ }
+
+ err = alloc_callchain_buffers();
+exit:
+ if (err)
+ atomic_dec(&nr_callchain_events);
+
+ mutex_unlock(&callchain_mutex);
+
+ return err;
+}
+
+void put_callchain_buffers(void)
+{
+ if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
+ release_callchain_buffers();
+ mutex_unlock(&callchain_mutex);
+ }
+}
+
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+ int cpu;
+ struct callchain_cpus_entries *entries;
+
+ *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+ if (*rctx == -1)
+ return NULL;
+
+ entries = rcu_dereference(callchain_cpus_entries);
+ if (!entries)
+ return NULL;
+
+ cpu = smp_processor_id();
+
+ return &entries->cpu_entries[cpu][*rctx];
+}
+
+static void
+put_callchain_entry(int rctx)
+{
+ put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+}
+
+struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
+{
+ int rctx;
+ struct perf_callchain_entry *entry;
+
+ int kernel = !event->attr.exclude_callchain_kernel;
+ int user = !event->attr.exclude_callchain_user;
+
+ if (!kernel && !user)
+ return NULL;
+
+ entry = get_callchain_entry(&rctx);
+ if (rctx == -1)
+ return NULL;
+
+ if (!entry)
+ goto exit_put;
+
+ entry->nr = 0;
+
+ if (kernel && !user_mode(regs)) {
+ perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+ perf_callchain_kernel(entry, regs);
+ }
+
+ if (user) {
+ if (!user_mode(regs)) {
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ /*
+ * Disallow cross-task user callchains.
+ */
+ if (event->ctx->task && event->ctx->task != current)
+ goto exit_put;
+
+ perf_callchain_store(entry, PERF_CONTEXT_USER);
+ perf_callchain_user(entry, regs);
+ }
+ }
+
+exit_put:
+ put_callchain_entry(rctx);
+
+ return entry;
+}
diff --git a/kernel/events/core.c b/kernel/events/core.c
new file mode 100644
index 00000000000..6b17ac1b0c2
--- /dev/null
+++ b/kernel/events/core.c
@@ -0,0 +1,8168 @@
+/*
+ * Performance events core code:
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/tick.h>
+#include <linux/sysfs.h>
+#include <linux/dcache.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/reboot.h>
+#include <linux/vmstat.h>
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/hardirq.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
+#include <linux/perf_event.h>
+#include <linux/ftrace_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/mm_types.h>
+#include <linux/cgroup.h>
+#include <linux/module.h>
+#include <linux/mman.h>
+
+#include "internal.h"
+
+#include <asm/irq_regs.h>
+
+struct remote_function_call {
+ struct task_struct *p;
+ int (*func)(void *info);
+ void *info;
+ int ret;
+};
+
+static void remote_function(void *data)
+{
+ struct remote_function_call *tfc = data;
+ struct task_struct *p = tfc->p;
+
+ if (p) {
+ tfc->ret = -EAGAIN;
+ if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+ return;
+ }
+
+ tfc->ret = tfc->func(tfc->info);
+}
+
+/**
+ * task_function_call - call a function on the cpu on which a task runs
+ * @p: the task to evaluate
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ *
+ * returns: @func return value, or
+ * -ESRCH - when the process isn't running
+ * -EAGAIN - when the process moved away
+ */
+static int
+task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+{
+ struct remote_function_call data = {
+ .p = p,
+ .func = func,
+ .info = info,
+ .ret = -ESRCH, /* No such (running) process */
+ };
+
+ if (task_curr(p))
+ smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+
+ return data.ret;
+}
+
+/**
+ * cpu_function_call - call a function on the cpu
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func on the remote cpu.
+ *
+ * returns: @func return value or -ENXIO when the cpu is offline
+ */
+static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+{
+ struct remote_function_call data = {
+ .p = NULL,
+ .func = func,
+ .info = info,
+ .ret = -ENXIO, /* No such CPU */
+ };
+
+ smp_call_function_single(cpu, remote_function, &data, 1);
+
+ return data.ret;
+}
+
+#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
+ PERF_FLAG_FD_OUTPUT |\
+ PERF_FLAG_PID_CGROUP |\
+ PERF_FLAG_FD_CLOEXEC)
+
+/*
+ * branch priv levels that need permission checks
+ */
+#define PERF_SAMPLE_BRANCH_PERM_PLM \
+ (PERF_SAMPLE_BRANCH_KERNEL |\
+ PERF_SAMPLE_BRANCH_HV)
+
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x1,
+ EVENT_PINNED = 0x2,
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
+/*
+ * perf_sched_events : >0 events exist
+ * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
+ */
+struct static_key_deferred perf_sched_events __read_mostly;
+static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
+
+static atomic_t nr_mmap_events __read_mostly;
+static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_task_events __read_mostly;
+static atomic_t nr_freq_events __read_mostly;
+
+static LIST_HEAD(pmus);
+static DEFINE_MUTEX(pmus_lock);
+static struct srcu_struct pmus_srcu;
+
+/*
+ * perf event paranoia level:
+ * -1 - not paranoid at all
+ * 0 - disallow raw tracepoint access for unpriv
+ * 1 - disallow cpu events for unpriv
+ * 2 - disallow kernel profiling for unpriv
+ */
+int sysctl_perf_event_paranoid __read_mostly = 1;
+
+/* Minimum for 512 kiB + 1 user control page */
+int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
+
+/*
+ * max perf event sample rate
+ */
+#define DEFAULT_MAX_SAMPLE_RATE 100000
+#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
+#define DEFAULT_CPU_TIME_MAX_PERCENT 25
+
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+
+static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
+
+static int perf_sample_allowed_ns __read_mostly =
+ DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
+
+void update_perf_cpu_limits(void)
+{
+ u64 tmp = perf_sample_period_ns;
+
+ tmp *= sysctl_perf_cpu_time_max_percent;
+ do_div(tmp, 100);
+ ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
+}
+
+static int perf_rotate_context(struct perf_cpu_context *cpuctx);
+
+int perf_proc_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+ update_perf_cpu_limits();
+
+ return 0;
+}
+
+int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
+
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ update_perf_cpu_limits();
+
+ return 0;
+}
+
+/*
+ * perf samples are done in some very critical code paths (NMIs).
+ * If they take too much CPU time, the system can lock up and not
+ * get any real work done. This will drop the sample rate when
+ * we detect that events are taking too long.
+ */
+#define NR_ACCUMULATED_SAMPLES 128
+static DEFINE_PER_CPU(u64, running_sample_length);
+
+static void perf_duration_warn(struct irq_work *w)
+{
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+ u64 avg_local_sample_len;
+ u64 local_samples_len;
+
+ local_samples_len = __get_cpu_var(running_sample_length);
+ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+ printk_ratelimited(KERN_WARNING
+ "perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+}
+
+static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+ u64 avg_local_sample_len;
+ u64 local_samples_len;
+
+ if (allowed_ns == 0)
+ return;
+
+ /* decay the counter by 1 average sample */
+ local_samples_len = __get_cpu_var(running_sample_length);
+ local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
+ local_samples_len += sample_len_ns;
+ __get_cpu_var(running_sample_length) = local_samples_len;
+
+ /*
+ * note: this will be biased artifically low until we have
+ * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
+ * from having to maintain a count.
+ */
+ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+ if (avg_local_sample_len <= allowed_ns)
+ return;
+
+ if (max_samples_per_tick <= 1)
+ return;
+
+ max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
+ sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+
+ update_perf_cpu_limits();
+
+ if (!irq_work_queue(&perf_duration_work)) {
+ early_printk("perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+ }
+}
+
+static atomic64_t perf_event_id;
+
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
+
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type,
+ struct task_struct *task);
+
+static void update_context_time(struct perf_event_context *ctx);
+static u64 perf_event_time(struct perf_event *event);
+
+void __weak perf_event_print_debug(void) { }
+
+extern __weak const char *perf_pmu_name(void)
+{
+ return "pmu";
+}
+
+static inline u64 perf_clock(void)
+{
+ return local_clock();
+}
+
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+ return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ raw_spin_lock(&cpuctx->ctx.lock);
+ if (ctx)
+ raw_spin_lock(&ctx->lock);
+}
+
+static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (ctx)
+ raw_spin_unlock(&ctx->lock);
+ raw_spin_unlock(&cpuctx->ctx.lock);
+}
+
+#ifdef CONFIG_CGROUP_PERF
+
+/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+ u64 time;
+ u64 timestamp;
+};
+
+struct perf_cgroup {
+ struct cgroup_subsys_state css;
+ struct perf_cgroup_info __percpu *info;
+};
+
+/*
+ * Must ensure cgroup is pinned (css_get) before calling
+ * this function. In other words, we cannot call this function
+ * if there is no cgroup event for the current CPU context.
+ */
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+ return container_of(task_css(task, perf_event_cgrp_id),
+ struct perf_cgroup, css);
+}
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ /* @event doesn't care about cgroup */
+ if (!event->cgrp)
+ return true;
+
+ /* wants specific cgroup scope but @cpuctx isn't associated with any */
+ if (!cpuctx->cgrp)
+ return false;
+
+ /*
+ * Cgroup scoping is recursive. An event enabled for a cgroup is
+ * also enabled for all its descendant cgroups. If @cpuctx's
+ * cgroup is a descendant of @event's (the test covers identity
+ * case), it's a match.
+ */
+ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
+ event->cgrp->css.cgroup);
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{
+ css_put(&event->cgrp->css);
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{
+ perf_put_cgroup(event);
+ event->cgrp = NULL;
+}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+ return event->cgrp != NULL;
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+ struct perf_cgroup_info *t;
+
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ return t->time;
+}
+
+static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+{
+ struct perf_cgroup_info *info;
+ u64 now;
+
+ now = perf_clock();
+
+ info = this_cpu_ptr(cgrp->info);
+
+ info->time += now - info->timestamp;
+ info->timestamp = now;
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+ struct perf_cgroup *cgrp_out = cpuctx->cgrp;
+ if (cgrp_out)
+ __update_cgrp_time(cgrp_out);
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+ struct perf_cgroup *cgrp;
+
+ /*
+ * ensure we access cgroup data only when needed and
+ * when we know the cgroup is pinned (css_get)
+ */
+ if (!is_cgroup_event(event))
+ return;
+
+ cgrp = perf_cgroup_from_task(current);
+ /*
+ * Do not update time when cgroup is not active
+ */
+ if (cgrp == event->cgrp)
+ __update_cgrp_time(event->cgrp);
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task,
+ struct perf_event_context *ctx)
+{
+ struct perf_cgroup *cgrp;
+ struct perf_cgroup_info *info;
+
+ /*
+ * ctx->lock held by caller
+ * ensure we do not access cgroup data
+ * unless we have the cgroup pinned (css_get)
+ */
+ if (!task || !ctx->nr_cgroups)
+ return;
+
+ cgrp = perf_cgroup_from_task(task);
+ info = this_cpu_ptr(cgrp->info);
+ info->timestamp = ctx->timestamp;
+}
+
+#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
+#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
+
+/*
+ * reschedule events based on the cgroup constraint of task.
+ *
+ * mode SWOUT : schedule out everything
+ * mode SWIN : schedule in based on cgroup for next
+ */
+void perf_cgroup_switch(struct task_struct *task, int mode)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ /*
+ * disable interrupts to avoid geting nr_cgroup
+ * changes via __perf_event_disable(). Also
+ * avoids preemption.
+ */
+ local_irq_save(flags);
+
+ /*
+ * we reschedule only in the presence of cgroup
+ * constrained events.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->unique_pmu != pmu)
+ continue; /* ensure we process each cpuctx once */
+
+ /*
+ * perf_cgroup_events says at least one
+ * context on this CPU has cgroup events.
+ *
+ * ctx->nr_cgroups reports the number of cgroup
+ * events for a context.
+ */
+ if (cpuctx->ctx.nr_cgroups > 0) {
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
+
+ if (mode & PERF_CGROUP_SWOUT) {
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to event_filter_match() in event_sched_out()
+ */
+ cpuctx->cgrp = NULL;
+ }
+
+ if (mode & PERF_CGROUP_SWIN) {
+ WARN_ON_ONCE(cpuctx->cgrp);
+ /*
+ * set cgrp before ctxsw in to allow
+ * event_filter_match() to not have to pass
+ * task around
+ */
+ cpuctx->cgrp = perf_cgroup_from_task(task);
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+ }
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ }
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task,
+ struct task_struct *next)
+{
+ struct perf_cgroup *cgrp1;
+ struct perf_cgroup *cgrp2 = NULL;
+
+ /*
+ * we come here when we know perf_cgroup_events > 0
+ */
+ cgrp1 = perf_cgroup_from_task(task);
+
+ /*
+ * next is NULL when called from perf_event_enable_on_exec()
+ * that will systematically cause a cgroup_switch()
+ */
+ if (next)
+ cgrp2 = perf_cgroup_from_task(next);
+
+ /*
+ * only schedule out current cgroup events if we know
+ * that we are switching to a different cgroup. Otherwise,
+ * do no touch the cgroup events.
+ */
+ if (cgrp1 != cgrp2)
+ perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *prev,
+ struct task_struct *task)
+{
+ struct perf_cgroup *cgrp1;
+ struct perf_cgroup *cgrp2 = NULL;
+
+ /*
+ * we come here when we know perf_cgroup_events > 0
+ */
+ cgrp1 = perf_cgroup_from_task(task);
+
+ /* prev can never be NULL */
+ cgrp2 = perf_cgroup_from_task(prev);
+
+ /*
+ * only need to schedule in cgroup events if we are changing
+ * cgroup during ctxsw. Cgroup events were not scheduled
+ * out of ctxsw out if that was not the case.
+ */
+ if (cgrp1 != cgrp2)
+ perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+}
+
+static inline int perf_cgroup_connect(int fd, struct perf_event *event,
+ struct perf_event_attr *attr,
+ struct perf_event *group_leader)
+{
+ struct perf_cgroup *cgrp;
+ struct cgroup_subsys_state *css;
+ struct fd f = fdget(fd);
+ int ret = 0;
+
+ if (!f.file)
+ return -EBADF;
+
+ css = css_tryget_online_from_dir(f.file->f_dentry,
+ &perf_event_cgrp_subsys);
+ if (IS_ERR(css)) {
+ ret = PTR_ERR(css);
+ goto out;
+ }
+
+ cgrp = container_of(css, struct perf_cgroup, css);
+ event->cgrp = cgrp;
+
+ /*
+ * all events in a group must monitor
+ * the same cgroup because a task belongs
+ * to only one perf cgroup at a time
+ */
+ if (group_leader && group_leader->cgrp != cgrp) {
+ perf_detach_cgroup(event);
+ ret = -EINVAL;
+ }
+out:
+ fdput(f);
+ return ret;
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+ struct perf_cgroup_info *t;
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ event->shadow_ctx_time = now - t->timestamp;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+ /*
+ * when the current task's perf cgroup does not match
+ * the event's, we need to remember to call the
+ * perf_mark_enable() function the first time a task with
+ * a matching perf cgroup is scheduled in.
+ */
+ if (is_cgroup_event(event) && !perf_cgroup_match(event))
+ event->cgrp_defer_enabled = 1;
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *sub;
+ u64 tstamp = perf_event_time(event);
+
+ if (!event->cgrp_defer_enabled)
+ return;
+
+ event->cgrp_defer_enabled = 0;
+
+ event->tstamp_enabled = tstamp - event->total_time_enabled;
+ list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+ sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ sub->cgrp_defer_enabled = 0;
+ }
+ }
+}
+#else /* !CONFIG_CGROUP_PERF */
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+ return true;
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task,
+ struct task_struct *next)
+{
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *prev,
+ struct task_struct *task)
+{
+}
+
+static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
+ struct perf_event_attr *attr,
+ struct perf_event *group_leader)
+{
+ return -EINVAL;
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task,
+ struct perf_event_context *ctx)
+{
+}
+
+void
+perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+{
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+}
+#endif
+
+/*
+ * set default to be dependent on timer tick just
+ * like original code
+ */
+#define PERF_CPU_HRTIMER (1000 / HZ)
+/*
+ * function must be called with interrupts disbled
+ */
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+{
+ struct perf_cpu_context *cpuctx;
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
+ int rotations = 0;
+
+ WARN_ON(!irqs_disabled());
+
+ cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+
+ rotations = perf_rotate_context(cpuctx);
+
+ /*
+ * arm timer if needed
+ */
+ if (rotations) {
+ hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+ ret = HRTIMER_RESTART;
+ }
+
+ return ret;
+}
+
+/* CPU is going down */
+void perf_cpu_hrtimer_cancel(int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ if (WARN_ON(cpu != smp_processor_id()))
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ if (pmu->task_ctx_nr == perf_sw_context)
+ continue;
+
+ hrtimer_cancel(&cpuctx->hrtimer);
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+{
+ struct hrtimer *hr = &cpuctx->hrtimer;
+ struct pmu *pmu = cpuctx->ctx.pmu;
+ int timer;
+
+ /* no multiplexing needed for SW PMU */
+ if (pmu->task_ctx_nr == perf_sw_context)
+ return;
+
+ /*
+ * check default is sane, if not set then force to
+ * default interval (1/tick)
+ */
+ timer = pmu->hrtimer_interval_ms;
+ if (timer < 1)
+ timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ hr->function = perf_cpu_hrtimer_handler;
+}
+
+static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+{
+ struct hrtimer *hr = &cpuctx->hrtimer;
+ struct pmu *pmu = cpuctx->ctx.pmu;
+
+ /* not for SW PMU */
+ if (pmu->task_ctx_nr == perf_sw_context)
+ return;
+
+ if (hrtimer_active(hr))
+ return;
+
+ if (!hrtimer_callback_running(hr))
+ __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
+ 0, HRTIMER_MODE_REL_PINNED, 0);
+}
+
+void perf_pmu_disable(struct pmu *pmu)
+{
+ int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ if (!(*count)++)
+ pmu->pmu_disable(pmu);
+}
+
+void perf_pmu_enable(struct pmu *pmu)
+{
+ int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ if (!--(*count))
+ pmu->pmu_enable(pmu);
+}
+
+static DEFINE_PER_CPU(struct list_head, rotation_list);
+
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
+static void perf_pmu_rotate_start(struct pmu *pmu)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct list_head *head = &__get_cpu_var(rotation_list);
+
+ WARN_ON(!irqs_disabled());
+
+ if (list_empty(&cpuctx->rotation_list))
+ list_add(&cpuctx->rotation_list, head);
+}
+
+static void get_ctx(struct perf_event_context *ctx)
+{
+ WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
+}
+
+static void put_ctx(struct perf_event_context *ctx)
+{
+ if (atomic_dec_and_test(&ctx->refcount)) {
+ if (ctx->parent_ctx)
+ put_ctx(ctx->parent_ctx);
+ if (ctx->task)
+ put_task_struct(ctx->task);
+ kfree_rcu(ctx, rcu_head);
+ }
+}
+
+static void unclone_ctx(struct perf_event_context *ctx)
+{
+ if (ctx->parent_ctx) {
+ put_ctx(ctx->parent_ctx);
+ ctx->parent_ctx = NULL;
+ }
+ ctx->generation++;
+}
+
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+{
+ /*
+ * only top level events have the pid namespace they were created in
+ */
+ if (event->parent)
+ event = event->parent;
+
+ return task_tgid_nr_ns(p, event->ns);
+}
+
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+ /*
+ * only top level events have the pid namespace they were created in
+ */
+ if (event->parent)
+ event = event->parent;
+
+ return task_pid_nr_ns(p, event->ns);
+}
+
+/*
+ * If we inherit events we want to return the parent event id
+ * to userspace.
+ */
+static u64 primary_event_id(struct perf_event *event)
+{
+ u64 id = event->id;
+
+ if (event->parent)
+ id = event->parent->id;
+
+ return id;
+}
+
+/*
+ * Get the perf_event_context for a task and lock it.
+ * This has to cope with with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_event_context *
+perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
+{
+ struct perf_event_context *ctx;
+
+retry:
+ /*
+ * One of the few rules of preemptible RCU is that one cannot do
+ * rcu_read_unlock() while holding a scheduler (or nested) lock when
+ * part of the read side critical section was preemptible -- see
+ * rcu_read_unlock_special().
+ *
+ * Since ctx->lock nests under rq->lock we must ensure the entire read
+ * side critical section is non-preemptible.
+ */
+ preempt_disable();
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
+ if (ctx) {
+ /*
+ * If this context is a clone of another, it might
+ * get swapped for another underneath us by
+ * perf_event_task_sched_out, though the
+ * rcu_read_lock() protects us from any context
+ * getting freed. Lock the context and check if it
+ * got swapped before we could get the lock, and retry
+ * if so. If we locked the right context, then it
+ * can't get swapped on us any more.
+ */
+ raw_spin_lock_irqsave(&ctx->lock, *flags);
+ if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
+ raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+ rcu_read_unlock();
+ preempt_enable();
+ goto retry;
+ }
+
+ if (!atomic_inc_not_zero(&ctx->refcount)) {
+ raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+ ctx = NULL;
+ }
+ }
+ rcu_read_unlock();
+ preempt_enable();
+ return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task. This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_event_context *
+perf_pin_task_context(struct task_struct *task, int ctxn)
+{
+ struct perf_event_context *ctx;
+ unsigned long flags;
+
+ ctx = perf_lock_task_context(task, ctxn, &flags);
+ if (ctx) {
+ ++ctx->pin_count;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+ return ctx;
+}
+
+static void perf_unpin_context(struct perf_event_context *ctx)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+ --ctx->pin_count;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_event_context *ctx)
+{
+ u64 now = perf_clock();
+
+ ctx->time += now - ctx->timestamp;
+ ctx->timestamp = now;
+}
+
+static u64 perf_event_time(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+
+ if (is_cgroup_event(event))
+ return perf_cgroup_event_time(event);
+
+ return ctx ? ctx->time : 0;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a event.
+ * The caller of this function needs to hold the ctx->lock.
+ */
+static void update_event_times(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ u64 run_end;
+
+ if (event->state < PERF_EVENT_STATE_INACTIVE ||
+ event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
+ return;
+ /*
+ * in cgroup mode, time_enabled represents
+ * the time the event was enabled AND active
+ * tasks were in the monitored cgroup. This is
+ * independent of the activity of the context as
+ * there may be a mix of cgroup and non-cgroup events.
+ *
+ * That is why we treat cgroup events differently
+ * here.
+ */
+ if (is_cgroup_event(event))
+ run_end = perf_cgroup_event_time(event);
+ else if (ctx->is_active)
+ run_end = ctx->time;
+ else
+ run_end = event->tstamp_stopped;
+
+ event->total_time_enabled = run_end - event->tstamp_enabled;
+
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ run_end = event->tstamp_stopped;
+ else
+ run_end = perf_event_time(event);
+
+ event->total_time_running = run_end - event->tstamp_running;
+
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all events in a group.
+ */
+static void update_group_times(struct perf_event *leader)
+{
+ struct perf_event *event;
+
+ update_event_times(leader);
+ list_for_each_entry(event, &leader->sibling_list, group_entry)
+ update_event_times(event);
+}
+
+static struct list_head *
+ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+{
+ if (event->attr.pinned)
+ return &ctx->pinned_groups;
+ else
+ return &ctx->flexible_groups;
+}
+
+/*
+ * Add a event from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_add_event(struct perf_event *event, struct perf_event_context *ctx)
+{
+ WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
+ event->attach_state |= PERF_ATTACH_CONTEXT;
+
+ /*
+ * If we're a stand alone event or group leader, we go to the context
+ * list, group events are kept attached to the group so that
+ * perf_group_detach can, at all times, locate all siblings.
+ */
+ if (event->group_leader == event) {
+ struct list_head *list;
+
+ if (is_software_event(event))
+ event->group_flags |= PERF_GROUP_SOFTWARE;
+
+ list = ctx_group_list(event, ctx);
+ list_add_tail(&event->group_entry, list);
+ }
+
+ if (is_cgroup_event(event))
+ ctx->nr_cgroups++;
+
+ if (has_branch_stack(event))
+ ctx->nr_branch_stack++;
+
+ list_add_rcu(&event->event_entry, &ctx->event_list);
+ if (!ctx->nr_events)
+ perf_pmu_rotate_start(ctx->pmu);
+ ctx->nr_events++;
+ if (event->attr.inherit_stat)
+ ctx->nr_stat++;
+
+ ctx->generation++;
+}
+
+/*
+ * Initialize event state based on the perf_event_attr::disabled.
+ */
+static inline void perf_event__state_init(struct perf_event *event)
+{
+ event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
+ PERF_EVENT_STATE_INACTIVE;
+}
+
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__read_size(struct perf_event *event)
+{
+ int entry = sizeof(u64); /* value */
+ int size = 0;
+ int nr = 1;
+
+ if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ size += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ size += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_GROUP) {
+ nr += event->group_leader->nr_siblings;
+ size += sizeof(u64);
+ }
+
+ size += entry * nr;
+ event->read_size = size;
+}
+
+static void perf_event__header_size(struct perf_event *event)
+{
+ struct perf_sample_data *data;
+ u64 sample_type = event->attr.sample_type;
+ u16 size = 0;
+
+ perf_event__read_size(event);
+
+ if (sample_type & PERF_SAMPLE_IP)
+ size += sizeof(data->ip);
+
+ if (sample_type & PERF_SAMPLE_ADDR)
+ size += sizeof(data->addr);
+
+ if (sample_type & PERF_SAMPLE_PERIOD)
+ size += sizeof(data->period);
+
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ size += sizeof(data->weight);
+
+ if (sample_type & PERF_SAMPLE_READ)
+ size += event->read_size;
+
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ size += sizeof(data->data_src.val);
+
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ size += sizeof(data->txn);
+
+ event->header_size = size;
+}
+
+static void perf_event__id_header_size(struct perf_event *event)
+{
+ struct perf_sample_data *data;
+ u64 sample_type = event->attr.sample_type;
+ u16 size = 0;
+
+ if (sample_type & PERF_SAMPLE_TID)
+ size += sizeof(data->tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ size += sizeof(data->time);
+
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ size += sizeof(data->id);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ size += sizeof(data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ size += sizeof(data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ size += sizeof(data->cpu_entry);
+
+ event->id_header_size = size;
+}
+
+static void perf_group_attach(struct perf_event *event)
+{
+ struct perf_event *group_leader = event->group_leader, *pos;
+
+ /*
+ * We can have double attach due to group movement in perf_event_open.
+ */
+ if (event->attach_state & PERF_ATTACH_GROUP)
+ return;
+
+ event->attach_state |= PERF_ATTACH_GROUP;
+
+ if (group_leader == event)
+ return;
+
+ if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
+ !is_software_event(event))
+ group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+
+ list_add_tail(&event->group_entry, &group_leader->sibling_list);
+ group_leader->nr_siblings++;
+
+ perf_event__header_size(group_leader);
+
+ list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+ perf_event__header_size(pos);
+}
+
+/*
+ * Remove a event from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_del_event(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_cpu_context *cpuctx;
+ /*
+ * We can have double detach due to exit/hot-unplug + close.
+ */
+ if (!(event->attach_state & PERF_ATTACH_CONTEXT))
+ return;
+
+ event->attach_state &= ~PERF_ATTACH_CONTEXT;
+
+ if (is_cgroup_event(event)) {
+ ctx->nr_cgroups--;
+ cpuctx = __get_cpu_context(ctx);
+ /*
+ * if there are no more cgroup events
+ * then cler cgrp to avoid stale pointer
+ * in update_cgrp_time_from_cpuctx()
+ */
+ if (!ctx->nr_cgroups)
+ cpuctx->cgrp = NULL;
+ }
+
+ if (has_branch_stack(event))
+ ctx->nr_branch_stack--;
+
+ ctx->nr_events--;
+ if (event->attr.inherit_stat)
+ ctx->nr_stat--;
+
+ list_del_rcu(&event->event_entry);
+
+ if (event->group_leader == event)
+ list_del_init(&event->group_entry);
+
+ update_group_times(event);
+
+ /*
+ * If event was in error state, then keep it
+ * that way, otherwise bogus counts will be
+ * returned on read(). The only way to get out
+ * of error state is by explicit re-enabling
+ * of the event
+ */
+ if (event->state > PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_OFF;
+
+ ctx->generation++;
+}
+
+static void perf_group_detach(struct perf_event *event)
+{
+ struct perf_event *sibling, *tmp;
+ struct list_head *list = NULL;
+
+ /*
+ * We can have double detach due to exit/hot-unplug + close.
+ */
+ if (!(event->attach_state & PERF_ATTACH_GROUP))
+ return;
+
+ event->attach_state &= ~PERF_ATTACH_GROUP;
+
+ /*
+ * If this is a sibling, remove it from its group.
+ */
+ if (event->group_leader != event) {
+ list_del_init(&event->group_entry);
+ event->group_leader->nr_siblings--;
+ goto out;
+ }
+
+ if (!list_empty(&event->group_entry))
+ list = &event->group_entry;
+
+ /*
+ * If this was a group event with sibling events then
+ * upgrade the siblings to singleton events by adding them
+ * to whatever list we are on.
+ */
+ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+ if (list)
+ list_move_tail(&sibling->group_entry, list);
+ sibling->group_leader = sibling;
+
+ /* Inherit group flags from the previous leader */
+ sibling->group_flags = event->group_flags;
+ }
+
+out:
+ perf_event__header_size(event->group_leader);
+
+ list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+ perf_event__header_size(tmp);
+}
+
+static inline int
+event_filter_match(struct perf_event *event)
+{
+ return (event->cpu == -1 || event->cpu == smp_processor_id())
+ && perf_cgroup_match(event);
+}
+
+static void
+event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ u64 tstamp = perf_event_time(event);
+ u64 delta;
+ /*
+ * An event which could not be activated because of
+ * filter mismatch still needs to have its timings
+ * maintained, otherwise bogus information is return
+ * via read() for time_enabled, time_running:
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE
+ && !event_filter_match(event)) {
+ delta = tstamp - event->tstamp_stopped;
+ event->tstamp_running += delta;
+ event->tstamp_stopped = tstamp;
+ }
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
+ perf_pmu_disable(event->pmu);
+
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ event->state = PERF_EVENT_STATE_OFF;
+ }
+ event->tstamp_stopped = tstamp;
+ event->pmu->del(event, 0);
+ event->oncpu = -1;
+
+ if (!is_software_event(event))
+ cpuctx->active_oncpu--;
+ ctx->nr_active--;
+ if (event->attr.freq && event->attr.sample_freq)
+ ctx->nr_freq--;
+ if (event->attr.exclusive || !cpuctx->active_oncpu)
+ cpuctx->exclusive = 0;
+
+ perf_pmu_enable(event->pmu);
+}
+
+static void
+group_sched_out(struct perf_event *group_event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *event;
+ int state = group_event->state;
+
+ event_sched_out(group_event, cpuctx, ctx);
+
+ /*
+ * Schedule out siblings (if any):
+ */
+ list_for_each_entry(event, &group_event->sibling_list, group_entry)
+ event_sched_out(event, cpuctx, ctx);
+
+ if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
+ cpuctx->exclusive = 0;
+}
+
+struct remove_event {
+ struct perf_event *event;
+ bool detach_group;
+};
+
+/*
+ * Cross CPU call to remove a performance event
+ *
+ * We disable the event on the hardware level first. After that we
+ * remove it from the context list.
+ */
+static int __perf_remove_from_context(void *info)
+{
+ struct remove_event *re = info;
+ struct perf_event *event = re->event;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ raw_spin_lock(&ctx->lock);
+ event_sched_out(event, cpuctx, ctx);
+ if (re->detach_group)
+ perf_group_detach(event);
+ list_del_event(event, ctx);
+ if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
+ ctx->is_active = 0;
+ cpuctx->task_ctx = NULL;
+ }
+ raw_spin_unlock(&ctx->lock);
+
+ return 0;
+}
+
+
+/*
+ * Remove the event from a task's (or a CPU's) list of events.
+ *
+ * CPU events are removed with a smp call. For task events we only
+ * call when the task is on a CPU.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid. This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_event_exit_task, it's OK because the
+ * context has been detached from its task.
+ */
+static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = ctx->task;
+ struct remove_event re = {
+ .event = event,
+ .detach_group = detach_group,
+ };
+
+ lockdep_assert_held(&ctx->mutex);
+
+ if (!task) {
+ /*
+ * Per cpu events are removed via an smp call and
+ * the removal is always successful.
+ */
+ cpu_function_call(event->cpu, __perf_remove_from_context, &re);
+ return;
+ }
+
+retry:
+ if (!task_function_call(task, __perf_remove_from_context, &re))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * If we failed to find a running task, but find the context active now
+ * that we've acquired the ctx->lock, retry.
+ */
+ if (ctx->is_active) {
+ raw_spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * Since the task isn't running, its safe to remove the event, us
+ * holding the ctx->lock ensures the task won't get scheduled in.
+ */
+ if (detach_group)
+ perf_group_detach(event);
+ list_del_event(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Cross CPU call to disable a performance event
+ */
+int __perf_event_disable(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ /*
+ * If this is a per-task event, need to check whether this
+ * event's task is the current task on this cpu.
+ *
+ * Can trigger due to concurrent perf_event_context_sched_out()
+ * flipping contexts around.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return -EINVAL;
+
+ raw_spin_lock(&ctx->lock);
+
+ /*
+ * If the event is on, turn it off.
+ * If it is in error state, leave it in error state.
+ */
+ if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ update_group_times(event);
+ if (event == event->group_leader)
+ group_sched_out(event, cpuctx, ctx);
+ else
+ event_sched_out(event, cpuctx, ctx);
+ event->state = PERF_EVENT_STATE_OFF;
+ }
+
+ raw_spin_unlock(&ctx->lock);
+
+ return 0;
+}
+
+/*
+ * Disable a event.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid. This condition is satisifed when called through
+ * perf_event_for_each_child or perf_event_for_each because they
+ * hold the top-level event's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_event.
+ * When called from perf_pending_event it's OK because event->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_event_task_sched_out for this context.
+ */
+void perf_event_disable(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Disable the event on the cpu that it's on
+ */
+ cpu_function_call(event->cpu, __perf_event_disable, event);
+ return;
+ }
+
+retry:
+ if (!task_function_call(task, __perf_event_disable, event))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * If the event is still active, we need to retry the cross-call.
+ */
+ if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ raw_spin_unlock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
+ goto retry;
+ }
+
+ /*
+ * Since we have the lock this context can't be scheduled
+ * in, so we can change the state safely.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ update_group_times(event);
+ event->state = PERF_EVENT_STATE_OFF;
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+}
+EXPORT_SYMBOL_GPL(perf_event_disable);
+
+static void perf_set_shadow_time(struct perf_event *event,
+ struct perf_event_context *ctx,
+ u64 tstamp)
+{
+ /*
+ * use the correct time source for the time snapshot
+ *
+ * We could get by without this by leveraging the
+ * fact that to get to this function, the caller
+ * has most likely already called update_context_time()
+ * and update_cgrp_time_xx() and thus both timestamp
+ * are identical (or very close). Given that tstamp is,
+ * already adjusted for cgroup, we could say that:
+ * tstamp - ctx->timestamp
+ * is equivalent to
+ * tstamp - cgrp->timestamp.
+ *
+ * Then, in perf_output_read(), the calculation would
+ * work with no changes because:
+ * - event is guaranteed scheduled in
+ * - no scheduled out in between
+ * - thus the timestamp would be the same
+ *
+ * But this is a bit hairy.
+ *
+ * So instead, we have an explicit cgroup call to remain
+ * within the time time source all along. We believe it
+ * is cleaner and simpler to understand.
+ */
+ if (is_cgroup_event(event))
+ perf_cgroup_set_shadow_time(event, tstamp);
+ else
+ event->shadow_ctx_time = tstamp - ctx->timestamp;
+}
+
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_event *event, int enable);
+
+static int
+event_sched_in(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ u64 tstamp = perf_event_time(event);
+ int ret = 0;
+
+ lockdep_assert_held(&ctx->lock);
+
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+
+ event->state = PERF_EVENT_STATE_ACTIVE;
+ event->oncpu = smp_processor_id();
+
+ /*
+ * Unthrottle events, since we scheduled we might have missed several
+ * ticks already, also for a heavily scheduling task there is little
+ * guarantee it'll get a tick in a timely manner.
+ */
+ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
+ perf_log_throttle(event, 1);
+ event->hw.interrupts = 0;
+ }
+
+ /*
+ * The new state must be visible before we turn it on in the hardware:
+ */
+ smp_wmb();
+
+ perf_pmu_disable(event->pmu);
+
+ if (event->pmu->add(event, PERF_EF_START)) {
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ event->oncpu = -1;
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ event->tstamp_running += tstamp - event->tstamp_stopped;
+
+ perf_set_shadow_time(event, ctx, tstamp);
+
+ if (!is_software_event(event))
+ cpuctx->active_oncpu++;
+ ctx->nr_active++;
+ if (event->attr.freq && event->attr.sample_freq)
+ ctx->nr_freq++;
+
+ if (event->attr.exclusive)
+ cpuctx->exclusive = 1;
+
+out:
+ perf_pmu_enable(event->pmu);
+
+ return ret;
+}
+
+static int
+group_sched_in(struct perf_event *group_event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *event, *partial_group = NULL;
+ struct pmu *pmu = ctx->pmu;
+ u64 now = ctx->time;
+ bool simulate = false;
+
+ if (group_event->state == PERF_EVENT_STATE_OFF)
+ return 0;
+
+ pmu->start_txn(pmu);
+
+ if (event_sched_in(group_event, cpuctx, ctx)) {
+ pmu->cancel_txn(pmu);
+ perf_cpu_hrtimer_restart(cpuctx);
+ return -EAGAIN;
+ }
+
+ /*
+ * Schedule in siblings as one group (if any):
+ */
+ list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ if (event_sched_in(event, cpuctx, ctx)) {
+ partial_group = event;
+ goto group_error;
+ }
+ }
+
+ if (!pmu->commit_txn(pmu))
+ return 0;
+
+group_error:
+ /*
+ * Groups can be scheduled in as one unit only, so undo any
+ * partial group before returning:
+ * The events up to the failed event are scheduled out normally,
+ * tstamp_stopped will be updated.
+ *
+ * The failed events and the remaining siblings need to have
+ * their timings updated as if they had gone thru event_sched_in()
+ * and event_sched_out(). This is required to get consistent timings
+ * across the group. This also takes care of the case where the group
+ * could never be scheduled by ensuring tstamp_stopped is set to mark
+ * the time the event was actually stopped, such that time delta
+ * calculation in update_event_times() is correct.
+ */
+ list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ if (event == partial_group)
+ simulate = true;
+
+ if (simulate) {
+ event->tstamp_running += now - event->tstamp_stopped;
+ event->tstamp_stopped = now;
+ } else {
+ event_sched_out(event, cpuctx, ctx);
+ }
+ }
+ event_sched_out(group_event, cpuctx, ctx);
+
+ pmu->cancel_txn(pmu);
+
+ perf_cpu_hrtimer_restart(cpuctx);
+
+ return -EAGAIN;
+}
+
+/*
+ * Work out whether we can put this event group on the CPU now.
+ */
+static int group_can_go_on(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ int can_add_hw)
+{
+ /*
+ * Groups consisting entirely of software events can always go on.
+ */
+ if (event->group_flags & PERF_GROUP_SOFTWARE)
+ return 1;
+ /*
+ * If an exclusive group is already on, no other hardware
+ * events can go on.
+ */
+ if (cpuctx->exclusive)
+ return 0;
+ /*
+ * If this group is exclusive and there are already
+ * events on the CPU, it can't go on.
+ */
+ if (event->attr.exclusive && cpuctx->active_oncpu)
+ return 0;
+ /*
+ * Otherwise, try to add it if all previous groups were able
+ * to go on.
+ */
+ return can_add_hw;
+}
+
+static void add_event_to_ctx(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ u64 tstamp = perf_event_time(event);
+
+ list_add_event(event, ctx);
+ perf_group_attach(event);
+ event->tstamp_enabled = tstamp;
+ event->tstamp_running = tstamp;
+ event->tstamp_stopped = tstamp;
+}
+
+static void task_ctx_sched_out(struct perf_event_context *ctx);
+static void
+ctx_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type,
+ struct task_struct *task);
+
+static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ struct task_struct *task)
+{
+ cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+ if (ctx)
+ ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ if (ctx)
+ ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+}
+
+/*
+ * Cross CPU call to install and enable a performance event
+ *
+ * Must be called with ctx->mutex held
+ */
+static int __perf_install_in_context(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
+ struct task_struct *task = current;
+
+ perf_ctx_lock(cpuctx, task_ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
+
+ /*
+ * If there was an active task_ctx schedule it out.
+ */
+ if (task_ctx)
+ task_ctx_sched_out(task_ctx);
+
+ /*
+ * If the context we're installing events in is not the
+ * active task_ctx, flip them.
+ */
+ if (ctx->task && task_ctx != ctx) {
+ if (task_ctx)
+ raw_spin_unlock(&task_ctx->lock);
+ raw_spin_lock(&ctx->lock);
+ task_ctx = ctx;
+ }
+
+ if (task_ctx) {
+ cpuctx->task_ctx = task_ctx;
+ task = task_ctx->task;
+ }
+
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+
+ update_context_time(ctx);
+ /*
+ * update cgrp time only if current cgrp
+ * matches event->cgrp. Must be done before
+ * calling add_event_to_ctx()
+ */
+ update_cgrp_time_from_event(event);
+
+ add_event_to_ctx(event, ctx);
+
+ /*
+ * Schedule everything back in
+ */
+ perf_event_sched_in(cpuctx, task_ctx, task);
+
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, task_ctx);
+
+ return 0;
+}
+
+/*
+ * Attach a performance event to a context
+ *
+ * First we add the event to the list with the hardware enable bit
+ * in event->hw_config cleared.
+ *
+ * If the event is attached to a task which is on a CPU we use a smp
+ * call to enable it in the task context. The task might have been
+ * scheduled away, but we check this in the smp call again.
+ */
+static void
+perf_install_in_context(struct perf_event_context *ctx,
+ struct perf_event *event,
+ int cpu)
+{
+ struct task_struct *task = ctx->task;
+
+ lockdep_assert_held(&ctx->mutex);
+
+ event->ctx = ctx;
+ if (event->cpu != -1)
+ event->cpu = cpu;
+
+ if (!task) {
+ /*
+ * Per cpu events are installed via an smp call and
+ * the install is always successful.
+ */
+ cpu_function_call(cpu, __perf_install_in_context, event);
+ return;
+ }
+
+retry:
+ if (!task_function_call(task, __perf_install_in_context, event))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * If we failed to find a running task, but find the context active now
+ * that we've acquired the ctx->lock, retry.
+ */
+ if (ctx->is_active) {
+ raw_spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * Since the task isn't running, its safe to add the event, us holding
+ * the ctx->lock ensures the task won't get scheduled in.
+ */
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Put a event into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_event_mark_enabled(struct perf_event *event)
+{
+ struct perf_event *sub;
+ u64 tstamp = perf_event_time(event);
+
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ event->tstamp_enabled = tstamp - event->total_time_enabled;
+ list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+ sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ }
+}
+
+/*
+ * Cross CPU call to enable a performance event
+ */
+static int __perf_event_enable(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *leader = event->group_leader;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ int err;
+
+ /*
+ * There's a time window between 'ctx->is_active' check
+ * in perf_event_enable function and this place having:
+ * - IRQs on
+ * - ctx->lock unlocked
+ *
+ * where the task could be killed and 'ctx' deactivated
+ * by perf_event_exit_task.
+ */
+ if (!ctx->is_active)
+ return -EINVAL;
+
+ raw_spin_lock(&ctx->lock);
+ update_context_time(ctx);
+
+ if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ goto unlock;
+
+ /*
+ * set current task's cgroup time reference point
+ */
+ perf_cgroup_set_timestamp(current, ctx);
+
+ __perf_event_mark_enabled(event);
+
+ if (!event_filter_match(event)) {
+ if (is_cgroup_event(event))
+ perf_cgroup_defer_enabled(event);
+ goto unlock;
+ }
+
+ /*
+ * If the event is in a group and isn't the group leader,
+ * then don't put it on unless the group is on.
+ */
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
+ goto unlock;
+
+ if (!group_can_go_on(event, cpuctx, 1)) {
+ err = -EEXIST;
+ } else {
+ if (event == leader)
+ err = group_sched_in(event, cpuctx, ctx);
+ else
+ err = event_sched_in(event, cpuctx, ctx);
+ }
+
+ if (err) {
+ /*
+ * If this event can't go on and it's part of a
+ * group, then the whole group has to come off.
+ */
+ if (leader != event) {
+ group_sched_out(leader, cpuctx, ctx);
+ perf_cpu_hrtimer_restart(cpuctx);
+ }
+ if (leader->attr.pinned) {
+ update_group_times(leader);
+ leader->state = PERF_EVENT_STATE_ERROR;
+ }
+ }
+
+unlock:
+ raw_spin_unlock(&ctx->lock);
+
+ return 0;
+}
+
+/*
+ * Enable a event.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid. This condition is satisfied when called through
+ * perf_event_for_each_child or perf_event_for_each as described
+ * for perf_event_disable.
+ */
+void perf_event_enable(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Enable the event on the cpu that it's on
+ */
+ cpu_function_call(event->cpu, __perf_event_enable, event);
+ return;
+ }
+
+ raw_spin_lock_irq(&ctx->lock);
+ if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ goto out;
+
+ /*
+ * If the event is in error state, clear that first.
+ * That way, if we see the event in error state below, we
+ * know that it has gone back into error state, as distinct
+ * from the task having been scheduled away before the
+ * cross-call arrived.
+ */
+ if (event->state == PERF_EVENT_STATE_ERROR)
+ event->state = PERF_EVENT_STATE_OFF;
+
+retry:
+ if (!ctx->is_active) {
+ __perf_event_mark_enabled(event);
+ goto out;
+ }
+
+ raw_spin_unlock_irq(&ctx->lock);
+
+ if (!task_function_call(task, __perf_event_enable, event))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+
+ /*
+ * If the context is active and the event is still off,
+ * we need to retry the cross-call.
+ */
+ if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
+ /*
+ * task could have been flipped by a concurrent
+ * perf_event_context_sched_out()
+ */
+ task = ctx->task;
+ goto retry;
+ }
+
+out:
+ raw_spin_unlock_irq(&ctx->lock);
+}
+EXPORT_SYMBOL_GPL(perf_event_enable);
+
+int perf_event_refresh(struct perf_event *event, int refresh)
+{
+ /*
+ * not supported on inherited events
+ */
+ if (event->attr.inherit || !is_sampling_event(event))
+ return -EINVAL;
+
+ atomic_add(refresh, &event->event_limit);
+ perf_event_enable(event);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_event_refresh);
+
+static void ctx_sched_out(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type)
+{
+ struct perf_event *event;
+ int is_active = ctx->is_active;
+
+ ctx->is_active &= ~event_type;
+ if (likely(!ctx->nr_events))
+ return;
+
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx);
+ if (!ctx->nr_active)
+ return;
+
+ perf_pmu_disable(ctx->pmu);
+ if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
+ list_for_each_entry(event, &ctx->pinned_groups, group_entry)
+ group_sched_out(event, cpuctx, ctx);
+ }
+
+ if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
+ list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+ group_sched_out(event, cpuctx, ctx);
+ }
+ perf_pmu_enable(ctx->pmu);
+}
+
+/*
+ * Test whether two contexts are equivalent, i.e. whether they have both been
+ * cloned from the same version of the same context.
+ *
+ * Equivalence is measured using a generation number in the context that is
+ * incremented on each modification to it; see unclone_ctx(), list_add_event()
+ * and list_del_event().
+ */
+static int context_equiv(struct perf_event_context *ctx1,
+ struct perf_event_context *ctx2)
+{
+ /* Pinning disables the swap optimization */
+ if (ctx1->pin_count || ctx2->pin_count)
+ return 0;
+
+ /* If ctx1 is the parent of ctx2 */
+ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
+ return 1;
+
+ /* If ctx2 is the parent of ctx1 */
+ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
+ return 1;
+
+ /*
+ * If ctx1 and ctx2 have the same parent; we flatten the parent
+ * hierarchy, see perf_event_init_context().
+ */
+ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
+ ctx1->parent_gen == ctx2->parent_gen)
+ return 1;
+
+ /* Unmatched */
+ return 0;
+}
+
+static void __perf_event_sync_stat(struct perf_event *event,
+ struct perf_event *next_event)
+{
+ u64 value;
+
+ if (!event->attr.inherit_stat)
+ return;
+
+ /*
+ * Update the event value, we cannot use perf_event_read()
+ * because we're in the middle of a context switch and have IRQs
+ * disabled, which upsets smp_call_function_single(), however
+ * we know the event must be on the current CPU, therefore we
+ * don't need to use it.
+ */
+ switch (event->state) {
+ case PERF_EVENT_STATE_ACTIVE:
+ event->pmu->read(event);
+ /* fall-through */
+
+ case PERF_EVENT_STATE_INACTIVE:
+ update_event_times(event);
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * In order to keep per-task stats reliable we need to flip the event
+ * values when we flip the contexts.
+ */
+ value = local64_read(&next_event->count);
+ value = local64_xchg(&event->count, value);
+ local64_set(&next_event->count, value);
+
+ swap(event->total_time_enabled, next_event->total_time_enabled);
+ swap(event->total_time_running, next_event->total_time_running);
+
+ /*
+ * Since we swizzled the values, update the user visible data too.
+ */
+ perf_event_update_userpage(event);
+ perf_event_update_userpage(next_event);
+}
+
+static void perf_event_sync_stat(struct perf_event_context *ctx,
+ struct perf_event_context *next_ctx)
+{
+ struct perf_event *event, *next_event;
+
+ if (!ctx->nr_stat)
+ return;
+
+ update_context_time(ctx);
+
+ event = list_first_entry(&ctx->event_list,
+ struct perf_event, event_entry);
+
+ next_event = list_first_entry(&next_ctx->event_list,
+ struct perf_event, event_entry);
+
+ while (&event->event_entry != &ctx->event_list &&
+ &next_event->event_entry != &next_ctx->event_list) {
+
+ __perf_event_sync_stat(event, next_event);
+
+ event = list_next_entry(event, event_entry);
+ next_event = list_next_entry(next_event, event_entry);
+ }
+}
+
+static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+ struct task_struct *next)
+{
+ struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+ struct perf_event_context *next_ctx;
+ struct perf_event_context *parent, *next_parent;
+ struct perf_cpu_context *cpuctx;
+ int do_switch = 1;
+
+ if (likely(!ctx))
+ return;
+
+ cpuctx = __get_cpu_context(ctx);
+ if (!cpuctx->task_ctx)
+ return;
+
+ rcu_read_lock();
+ next_ctx = next->perf_event_ctxp[ctxn];
+ if (!next_ctx)
+ goto unlock;
+
+ parent = rcu_dereference(ctx->parent_ctx);
+ next_parent = rcu_dereference(next_ctx->parent_ctx);
+
+ /* If neither context have a parent context; they cannot be clones. */
+ if (!parent || !next_parent)
+ goto unlock;
+
+ if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
+ /*
+ * Looks like the two contexts are clones, so we might be
+ * able to optimize the context switch. We lock both
+ * contexts and check that they are clones under the
+ * lock (including re-checking that neither has been
+ * uncloned in the meantime). It doesn't matter which
+ * order we take the locks because no other cpu could
+ * be trying to lock both of these tasks.
+ */
+ raw_spin_lock(&ctx->lock);
+ raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+ if (context_equiv(ctx, next_ctx)) {
+ /*
+ * XXX do we need a memory barrier of sorts
+ * wrt to rcu_dereference() of perf_event_ctxp
+ */
+ task->perf_event_ctxp[ctxn] = next_ctx;
+ next->perf_event_ctxp[ctxn] = ctx;
+ ctx->task = next;
+ next_ctx->task = task;
+ do_switch = 0;
+
+ perf_event_sync_stat(ctx, next_ctx);
+ }
+ raw_spin_unlock(&next_ctx->lock);
+ raw_spin_unlock(&ctx->lock);
+ }
+unlock:
+ rcu_read_unlock();
+
+ if (do_switch) {
+ raw_spin_lock(&ctx->lock);
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+ cpuctx->task_ctx = NULL;
+ raw_spin_unlock(&ctx->lock);
+ }
+}
+
+#define for_each_task_context_nr(ctxn) \
+ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
+
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void __perf_event_task_sched_out(struct task_struct *task,
+ struct task_struct *next)
+{
+ int ctxn;
+
+ for_each_task_context_nr(ctxn)
+ perf_event_context_sched_out(task, ctxn, next);
+
+ /*
+ * if cgroup events exist on this CPU, then we need
+ * to check if we have to switch out PMU state.
+ * cgroup event are system-wide mode only
+ */
+ if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+ perf_cgroup_sched_out(task, next);
+}
+
+static void task_ctx_sched_out(struct perf_event_context *ctx)
+{
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ if (!cpuctx->task_ctx)
+ return;
+
+ if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+ return;
+
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+ cpuctx->task_ctx = NULL;
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type)
+{
+ ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+}
+
+static void
+ctx_pinned_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx)
+{
+ struct perf_event *event;
+
+ list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, ctx);
+
+ if (group_can_go_on(event, cpuctx, 1))
+ group_sched_in(event, cpuctx, ctx);
+
+ /*
+ * If this pinned group hasn't been scheduled,
+ * put it in error state.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ update_group_times(event);
+ event->state = PERF_EVENT_STATE_ERROR;
+ }
+ }
+}
+
+static void
+ctx_flexible_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx)
+{
+ struct perf_event *event;
+ int can_add_hw = 1;
+
+ list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+ /* Ignore events in OFF or ERROR state */
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ continue;
+ /*
+ * Listen to the 'cpu' scheduling filter constraint
+ * of events:
+ */
+ if (!event_filter_match(event))
+ continue;
+
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, ctx);
+
+ if (group_can_go_on(event, cpuctx, can_add_hw)) {
+ if (group_sched_in(event, cpuctx, ctx))
+ can_add_hw = 0;
+ }
+ }
+}
+
+static void
+ctx_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type,
+ struct task_struct *task)
+{
+ u64 now;
+ int is_active = ctx->is_active;
+
+ ctx->is_active |= event_type;
+ if (likely(!ctx->nr_events))
+ return;
+
+ now = perf_clock();
+ ctx->timestamp = now;
+ perf_cgroup_set_timestamp(task, ctx);
+ /*
+ * First go through the list and put on any pinned groups
+ * in order to give them the best chance of going on.
+ */
+ if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
+ ctx_pinned_sched_in(ctx, cpuctx);
+
+ /* Then walk through the lower prio flexible groups */
+ if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
+ ctx_flexible_sched_in(ctx, cpuctx);
+}
+
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type,
+ struct task_struct *task)
+{
+ struct perf_event_context *ctx = &cpuctx->ctx;
+
+ ctx_sched_in(ctx, cpuctx, event_type, task);
+}
+
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+ struct task_struct *task)
+{
+ struct perf_cpu_context *cpuctx;
+
+ cpuctx = __get_cpu_context(ctx);
+ if (cpuctx->task_ctx == ctx)
+ return;
+
+ perf_ctx_lock(cpuctx, ctx);
+ perf_pmu_disable(ctx->pmu);
+ /*
+ * We want to keep the following priority order:
+ * cpu pinned (that don't need to move), task pinned,
+ * cpu flexible, task flexible.
+ */
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
+ if (ctx->nr_events)
+ cpuctx->task_ctx = ctx;
+
+ perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
+
+ perf_pmu_enable(ctx->pmu);
+ perf_ctx_unlock(cpuctx, ctx);
+
+ /*
+ * Since these rotations are per-cpu, we need to ensure the
+ * cpu-context we got scheduled on is actually rotating.
+ */
+ perf_pmu_rotate_start(ctx->pmu);
+}
+
+/*
+ * When sampling the branck stack in system-wide, it may be necessary
+ * to flush the stack on context switch. This happens when the branch
+ * stack does not tag its entries with the pid of the current task.
+ * Otherwise it becomes impossible to associate a branch entry with a
+ * task. This ambiguity is more likely to appear when the branch stack
+ * supports priv level filtering and the user sets it to monitor only
+ * at the user level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch stack with
+ * branch from multiple tasks. Flushing may mean dropping the existing
+ * entries or stashing them somewhere in the PMU specific code layer.
+ *
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when there is at least one system-wide context
+ * with at least one active event using taken branch sampling.
+ */
+static void perf_branch_stack_sched_in(struct task_struct *prev,
+ struct task_struct *task)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ /* no need to flush branch stack if not changing task */
+ if (prev == task)
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ /*
+ * check if the context has at least one
+ * event using PERF_SAMPLE_BRANCH_STACK
+ */
+ if (cpuctx->ctx.nr_branch_stack > 0
+ && pmu->flush_branch_stack) {
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+ perf_pmu_disable(pmu);
+
+ pmu->flush_branch_stack();
+
+ perf_pmu_enable(pmu);
+
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ }
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Called from scheduler to add the events of the current task
+ * with interrupts disabled.
+ *
+ * We restore the event value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * keep the event running.
+ */
+void __perf_event_task_sched_in(struct task_struct *prev,
+ struct task_struct *task)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ for_each_task_context_nr(ctxn) {
+ ctx = task->perf_event_ctxp[ctxn];
+ if (likely(!ctx))
+ continue;
+
+ perf_event_context_sched_in(ctx, task);
+ }
+ /*
+ * if cgroup events exist on this CPU, then we need
+ * to check if we have to switch in PMU state.
+ * cgroup event are system-wide mode only
+ */
+ if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+ perf_cgroup_sched_in(prev, task);
+
+ /* check for system-wide branch_stack events */
+ if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
+ perf_branch_stack_sched_in(prev, task);
+}
+
+static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
+{
+ u64 frequency = event->attr.sample_freq;
+ u64 sec = NSEC_PER_SEC;
+ u64 divisor, dividend;
+
+ int count_fls, nsec_fls, frequency_fls, sec_fls;
+
+ count_fls = fls64(count);
+ nsec_fls = fls64(nsec);
+ frequency_fls = fls64(frequency);
+ sec_fls = 30;
+
+ /*
+ * We got @count in @nsec, with a target of sample_freq HZ
+ * the target period becomes:
+ *
+ * @count * 10^9
+ * period = -------------------
+ * @nsec * sample_freq
+ *
+ */
+
+ /*
+ * Reduce accuracy by one bit such that @a and @b converge
+ * to a similar magnitude.
+ */
+#define REDUCE_FLS(a, b) \
+do { \
+ if (a##_fls > b##_fls) { \
+ a >>= 1; \
+ a##_fls--; \
+ } else { \
+ b >>= 1; \
+ b##_fls--; \
+ } \
+} while (0)
+
+ /*
+ * Reduce accuracy until either term fits in a u64, then proceed with
+ * the other, so that finally we can do a u64/u64 division.
+ */
+ while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
+ REDUCE_FLS(nsec, frequency);
+ REDUCE_FLS(sec, count);
+ }
+
+ if (count_fls + sec_fls > 64) {
+ divisor = nsec * frequency;
+
+ while (count_fls + sec_fls > 64) {
+ REDUCE_FLS(count, sec);
+ divisor >>= 1;
+ }
+
+ dividend = count * sec;
+ } else {
+ dividend = count * sec;
+
+ while (nsec_fls + frequency_fls > 64) {
+ REDUCE_FLS(nsec, frequency);
+ dividend >>= 1;
+ }
+
+ divisor = nsec * frequency;
+ }
+
+ if (!divisor)
+ return dividend;
+
+ return div64_u64(dividend, divisor);
+}
+
+static DEFINE_PER_CPU(int, perf_throttled_count);
+static DEFINE_PER_CPU(u64, perf_throttled_seq);
+
+static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 period, sample_period;
+ s64 delta;
+
+ period = perf_calculate_period(event, nsec, count);
+
+ delta = (s64)(period - hwc->sample_period);
+ delta = (delta + 7) / 8; /* low pass filter */
+
+ sample_period = hwc->sample_period + delta;
+
+ if (!sample_period)
+ sample_period = 1;
+
+ hwc->sample_period = sample_period;
+
+ if (local64_read(&hwc->period_left) > 8*sample_period) {
+ if (disable)
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
+ local64_set(&hwc->period_left, 0);
+
+ if (disable)
+ event->pmu->start(event, PERF_EF_RELOAD);
+ }
+}
+
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure, having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
+ int needs_unthr)
+{
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ u64 now, period = TICK_NSEC;
+ s64 delta;
+
+ /*
+ * only need to iterate over all events iff:
+ * - context have events in frequency mode (needs freq adjust)
+ * - there are events to unthrottle on this cpu
+ */
+ if (!(ctx->nr_freq || needs_unthr))
+ return;
+
+ raw_spin_lock(&ctx->lock);
+ perf_pmu_disable(ctx->pmu);
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ continue;
+
+ if (!event_filter_match(event))
+ continue;
+
+ perf_pmu_disable(event->pmu);
+
+ hwc = &event->hw;
+
+ if (hwc->interrupts == MAX_INTERRUPTS) {
+ hwc->interrupts = 0;
+ perf_log_throttle(event, 1);
+ event->pmu->start(event, 0);
+ }
+
+ if (!event->attr.freq || !event->attr.sample_freq)
+ goto next;
+
+ /*
+ * stop the event and update event->count
+ */
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
+ now = local64_read(&event->count);
+ delta = now - hwc->freq_count_stamp;
+ hwc->freq_count_stamp = now;
+
+ /*
+ * restart the event
+ * reload only if value has changed
+ * we have stopped the event so tell that
+ * to perf_adjust_period() to avoid stopping it
+ * twice.
+ */
+ if (delta > 0)
+ perf_adjust_period(event, period, delta, false);
+
+ event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
+ next:
+ perf_pmu_enable(event->pmu);
+ }
+
+ perf_pmu_enable(ctx->pmu);
+ raw_spin_unlock(&ctx->lock);
+}
+
+/*
+ * Round-robin a context's events:
+ */
+static void rotate_ctx(struct perf_event_context *ctx)
+{
+ /*
+ * Rotate the first entry last of non-pinned groups. Rotation might be
+ * disabled by the inheritance code.
+ */
+ if (!ctx->rotate_disable)
+ list_rotate_left(&ctx->flexible_groups);
+}
+
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
+static int perf_rotate_context(struct perf_cpu_context *cpuctx)
+{
+ struct perf_event_context *ctx = NULL;
+ int rotate = 0, remove = 1;
+
+ if (cpuctx->ctx.nr_events) {
+ remove = 0;
+ if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+ rotate = 1;
+ }
+
+ ctx = cpuctx->task_ctx;
+ if (ctx && ctx->nr_events) {
+ remove = 0;
+ if (ctx->nr_events != ctx->nr_active)
+ rotate = 1;
+ }
+
+ if (!rotate)
+ goto done;
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
+
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (ctx)
+ ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+
+ rotate_ctx(&cpuctx->ctx);
+ if (ctx)
+ rotate_ctx(ctx);
+
+ perf_event_sched_in(cpuctx, ctx, current);
+
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+done:
+ if (remove)
+ list_del_init(&cpuctx->rotation_list);
+
+ return rotate;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+bool perf_event_can_stop_tick(void)
+{
+ if (atomic_read(&nr_freq_events) ||
+ __this_cpu_read(perf_throttled_count))
+ return false;
+ else
+ return true;
+}
+#endif
+
+void perf_event_task_tick(void)
+{
+ struct list_head *head = &__get_cpu_var(rotation_list);
+ struct perf_cpu_context *cpuctx, *tmp;
+ struct perf_event_context *ctx;
+ int throttled;
+
+ WARN_ON(!irqs_disabled());
+
+ __this_cpu_inc(perf_throttled_seq);
+ throttled = __this_cpu_xchg(perf_throttled_count, 0);
+
+ list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
+ ctx = &cpuctx->ctx;
+ perf_adjust_freq_unthr_context(ctx, throttled);
+
+ ctx = cpuctx->task_ctx;
+ if (ctx)
+ perf_adjust_freq_unthr_context(ctx, throttled);
+ }
+}
+
+static int event_enable_on_exec(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ if (!event->attr.enable_on_exec)
+ return 0;
+
+ event->attr.enable_on_exec = 0;
+ if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ return 0;
+
+ __perf_event_mark_enabled(event);
+
+ return 1;
+}
+
+/*
+ * Enable all of a task's events that have been marked enable-on-exec.
+ * This expects task == current.
+ */
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
+{
+ struct perf_event *event;
+ unsigned long flags;
+ int enabled = 0;
+ int ret;
+
+ local_irq_save(flags);
+ if (!ctx || !ctx->nr_events)
+ goto out;
+
+ /*
+ * We must ctxsw out cgroup events to avoid conflict
+ * when invoking perf_task_event_sched_in() later on
+ * in this function. Otherwise we end up trying to
+ * ctxswin cgroup events which are already scheduled
+ * in.
+ */
+ perf_cgroup_sched_out(current, NULL);
+
+ raw_spin_lock(&ctx->lock);
+ task_ctx_sched_out(ctx);
+
+ list_for_each_entry(event, &ctx->event_list, event_entry) {
+ ret = event_enable_on_exec(event, ctx);
+ if (ret)
+ enabled = 1;
+ }
+
+ /*
+ * Unclone this context if we enabled any event.
+ */
+ if (enabled)
+ unclone_ctx(ctx);
+
+ raw_spin_unlock(&ctx->lock);
+
+ /*
+ * Also calls ctxswin for cgroup events, if any:
+ */
+ perf_event_context_sched_in(ctx, ctx->task);
+out:
+ local_irq_restore(flags);
+}
+
+void perf_event_exec(void)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ rcu_read_lock();
+ for_each_task_context_nr(ctxn) {
+ ctx = current->perf_event_ctxp[ctxn];
+ if (!ctx)
+ continue;
+
+ perf_event_enable_on_exec(ctx);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Cross CPU call to read the hardware event
+ */
+static void __perf_event_read(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived. In that case
+ * event->count would have been updated to a recent sample
+ * when the event was scheduled out.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
+ raw_spin_lock(&ctx->lock);
+ if (ctx->is_active) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+ update_event_times(event);
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
+ raw_spin_unlock(&ctx->lock);
+}
+
+static inline u64 perf_event_count(struct perf_event *event)
+{
+ return local64_read(&event->count) + atomic64_read(&event->child_count);
+}
+
+static u64 perf_event_read(struct perf_event *event)
+{
+ /*
+ * If event is enabled and currently active on a CPU, update the
+ * value in the event structure:
+ */
+ if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ smp_call_function_single(event->oncpu,
+ __perf_event_read, event, 1);
+ } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ struct perf_event_context *ctx = event->ctx;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+ /*
+ * may read while context is not active
+ * (e.g., thread is blocked), in that case
+ * we cannot update context time
+ */
+ if (ctx->is_active) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+ update_event_times(event);
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+
+ return perf_event_count(event);
+}
+
+/*
+ * Initialize the perf_event context in a task_struct:
+ */
+static void __perf_event_init_context(struct perf_event_context *ctx)
+{
+ raw_spin_lock_init(&ctx->lock);
+ mutex_init(&ctx->mutex);
+ INIT_LIST_HEAD(&ctx->pinned_groups);
+ INIT_LIST_HEAD(&ctx->flexible_groups);
+ INIT_LIST_HEAD(&ctx->event_list);
+ atomic_set(&ctx->refcount, 1);
+}
+
+static struct perf_event_context *
+alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+{
+ struct perf_event_context *ctx;
+
+ ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ __perf_event_init_context(ctx);
+ if (task) {
+ ctx->task = task;
+ get_task_struct(task);
+ }
+ ctx->pmu = pmu;
+
+ return ctx;
+}
+
+static struct task_struct *
+find_lively_task_by_vpid(pid_t vpid)
+{
+ struct task_struct *task;
+ int err;
+
+ rcu_read_lock();
+ if (!vpid)
+ task = current;
+ else
+ task = find_task_by_vpid(vpid);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ if (!task)
+ return ERR_PTR(-ESRCH);
+
+ /* Reuse ptrace permission checks for now. */
+ err = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto errout;
+
+ return task;
+errout:
+ put_task_struct(task);
+ return ERR_PTR(err);
+
+}
+
+/*
+ * Returns a matching context with refcount and pincount.
+ */
+static struct perf_event_context *
+find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
+{
+ struct perf_event_context *ctx;
+ struct perf_cpu_context *cpuctx;
+ unsigned long flags;
+ int ctxn, err;
+
+ if (!task) {
+ /* Must be root to operate on a CPU event: */
+ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EACCES);
+
+ /*
+ * We could be clever and allow to attach a event to an
+ * offline CPU and activate it when the CPU comes up, but
+ * that's for later.
+ */
+ if (!cpu_online(cpu))
+ return ERR_PTR(-ENODEV);
+
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
+ get_ctx(ctx);
+ ++ctx->pin_count;
+
+ return ctx;
+ }
+
+ err = -EINVAL;
+ ctxn = pmu->task_ctx_nr;
+ if (ctxn < 0)
+ goto errout;
+
+retry:
+ ctx = perf_lock_task_context(task, ctxn, &flags);
+ if (ctx) {
+ unclone_ctx(ctx);
+ ++ctx->pin_count;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ } else {
+ ctx = alloc_perf_context(pmu, task);
+ err = -ENOMEM;
+ if (!ctx)
+ goto errout;
+
+ err = 0;
+ mutex_lock(&task->perf_event_mutex);
+ /*
+ * If it has already passed perf_event_exit_task().
+ * we must see PF_EXITING, it takes this mutex too.
+ */
+ if (task->flags & PF_EXITING)
+ err = -ESRCH;
+ else if (task->perf_event_ctxp[ctxn])
+ err = -EAGAIN;
+ else {
+ get_ctx(ctx);
+ ++ctx->pin_count;
+ rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ }
+ mutex_unlock(&task->perf_event_mutex);
+
+ if (unlikely(err)) {
+ put_ctx(ctx);
+
+ if (err == -EAGAIN)
+ goto retry;
+ goto errout;
+ }
+ }
+
+ return ctx;
+
+errout:
+ return ERR_PTR(err);
+}
+
+static void perf_event_free_filter(struct perf_event *event);
+
+static void free_event_rcu(struct rcu_head *head)
+{
+ struct perf_event *event;
+
+ event = container_of(head, struct perf_event, rcu_head);
+ if (event->ns)
+ put_pid_ns(event->ns);
+ perf_event_free_filter(event);
+ kfree(event);
+}
+
+static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_attach(struct perf_event *event,
+ struct ring_buffer *rb);
+
+static void unaccount_event_cpu(struct perf_event *event, int cpu)
+{
+ if (event->parent)
+ return;
+
+ if (has_branch_stack(event)) {
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
+ }
+ if (is_cgroup_event(event))
+ atomic_dec(&per_cpu(perf_cgroup_events, cpu));
+}
+
+static void unaccount_event(struct perf_event *event)
+{
+ if (event->parent)
+ return;
+
+ if (event->attach_state & PERF_ATTACH_TASK)
+ static_key_slow_dec_deferred(&perf_sched_events);
+ if (event->attr.mmap || event->attr.mmap_data)
+ atomic_dec(&nr_mmap_events);
+ if (event->attr.comm)
+ atomic_dec(&nr_comm_events);
+ if (event->attr.task)
+ atomic_dec(&nr_task_events);
+ if (event->attr.freq)
+ atomic_dec(&nr_freq_events);
+ if (is_cgroup_event(event))
+ static_key_slow_dec_deferred(&perf_sched_events);
+ if (has_branch_stack(event))
+ static_key_slow_dec_deferred(&perf_sched_events);
+
+ unaccount_event_cpu(event, event->cpu);
+}
+
+static void __free_event(struct perf_event *event)
+{
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
+
+ if (event->destroy)
+ event->destroy(event);
+
+ if (event->ctx)
+ put_ctx(event->ctx);
+
+ if (event->pmu)
+ module_put(event->pmu->module);
+
+ call_rcu(&event->rcu_head, free_event_rcu);
+}
+
+static void _free_event(struct perf_event *event)
+{
+ irq_work_sync(&event->pending);
+
+ unaccount_event(event);
+
+ if (event->rb) {
+ /*
+ * Can happen when we close an event with re-directed output.
+ *
+ * Since we have a 0 refcount, perf_mmap_close() will skip
+ * over us; possibly making our ring_buffer_put() the last.
+ */
+ mutex_lock(&event->mmap_mutex);
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+ }
+
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
+
+ __free_event(event);
+}
+
+/*
+ * Used to free events which have a known refcount of 1, such as in error paths
+ * where the event isn't exposed yet and inherited events.
+ */
+static void free_event(struct perf_event *event)
+{
+ if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
+ "unexpected event refcount: %ld; ptr=%p\n",
+ atomic_long_read(&event->refcount), event)) {
+ /* leak to avoid use-after-free */
+ return;
+ }
+
+ _free_event(event);
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static void put_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *owner;
+
+ if (!atomic_long_dec_and_test(&event->refcount))
+ return;
+
+ rcu_read_lock();
+ owner = ACCESS_ONCE(event->owner);
+ /*
+ * Matches the smp_wmb() in perf_event_exit_task(). If we observe
+ * !owner it means the list deletion is complete and we can indeed
+ * free this event, otherwise we need to serialize on
+ * owner->perf_event_mutex.
+ */
+ smp_read_barrier_depends();
+ if (owner) {
+ /*
+ * Since delayed_put_task_struct() also drops the last
+ * task reference we can safely take a new reference
+ * while holding the rcu_read_lock().
+ */
+ get_task_struct(owner);
+ }
+ rcu_read_unlock();
+
+ if (owner) {
+ mutex_lock(&owner->perf_event_mutex);
+ /*
+ * We have to re-check the event->owner field, if it is cleared
+ * we raced with perf_event_exit_task(), acquiring the mutex
+ * ensured they're done, and we can proceed with freeing the
+ * event.
+ */
+ if (event->owner)
+ list_del_init(&event->owner_entry);
+ mutex_unlock(&owner->perf_event_mutex);
+ put_task_struct(owner);
+ }
+
+ WARN_ON_ONCE(ctx->parent_ctx);
+ /*
+ * There are two ways this annotation is useful:
+ *
+ * 1) there is a lock recursion from perf_event_exit_task
+ * see the comment there.
+ *
+ * 2) there is a lock-inversion with mmap_sem through
+ * perf_event_read_group(), which takes faults while
+ * holding ctx->mutex, however this is called after
+ * the last filedesc died, so there is no possibility
+ * to trigger the AB-BA case.
+ */
+ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+ perf_remove_from_context(event, true);
+ mutex_unlock(&ctx->mutex);
+
+ _free_event(event);
+}
+
+int perf_event_release_kernel(struct perf_event *event)
+{
+ put_event(event);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+
+static int perf_release(struct inode *inode, struct file *file)
+{
+ put_event(file->private_data);
+ return 0;
+}
+
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+{
+ struct perf_event *child;
+ u64 total = 0;
+
+ *enabled = 0;
+ *running = 0;
+
+ mutex_lock(&event->child_mutex);
+ total += perf_event_read(event);
+ *enabled += event->total_time_enabled +
+ atomic64_read(&event->child_total_time_enabled);
+ *running += event->total_time_running +
+ atomic64_read(&event->child_total_time_running);
+
+ list_for_each_entry(child, &event->child_list, child_list) {
+ total += perf_event_read(child);
+ *enabled += child->total_time_enabled;
+ *running += child->total_time_running;
+ }
+ mutex_unlock(&event->child_mutex);
+
+ return total;
+}
+EXPORT_SYMBOL_GPL(perf_event_read_value);
+
+static int perf_event_read_group(struct perf_event *event,
+ u64 read_format, char __user *buf)
+{
+ struct perf_event *leader = event->group_leader, *sub;
+ int n = 0, size = 0, ret = -EFAULT;
+ struct perf_event_context *ctx = leader->ctx;
+ u64 values[5];
+ u64 count, enabled, running;
+
+ mutex_lock(&ctx->mutex);
+ count = perf_event_read_value(leader, &enabled, &running);
+
+ values[n++] = 1 + leader->nr_siblings;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = enabled;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = running;
+ values[n++] = count;
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(leader);
+
+ size = n * sizeof(u64);
+
+ if (copy_to_user(buf, values, size))
+ goto unlock;
+
+ ret = size;
+
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ n = 0;
+
+ values[n++] = perf_event_read_value(sub, &enabled, &running);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(sub);
+
+ size = n * sizeof(u64);
+
+ if (copy_to_user(buf + ret, values, size)) {
+ ret = -EFAULT;
+ goto unlock;
+ }
+
+ ret += size;
+ }
+unlock:
+ mutex_unlock(&ctx->mutex);
+
+ return ret;
+}
+
+static int perf_event_read_one(struct perf_event *event,
+ u64 read_format, char __user *buf)
+{
+ u64 enabled, running;
+ u64 values[4];
+ int n = 0;
+
+ values[n++] = perf_event_read_value(event, &enabled, &running);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = enabled;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = running;
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(event);
+
+ if (copy_to_user(buf, values, n * sizeof(u64)))
+ return -EFAULT;
+
+ return n * sizeof(u64);
+}
+
+/*
+ * Read the performance event - simple non blocking version for now
+ */
+static ssize_t
+perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
+{
+ u64 read_format = event->attr.read_format;
+ int ret;
+
+ /*
+ * Return end-of-file for a read on a event that is in
+ * error state (i.e. because it was pinned but it couldn't be
+ * scheduled on to the CPU at some point).
+ */
+ if (event->state == PERF_EVENT_STATE_ERROR)
+ return 0;
+
+ if (count < event->read_size)
+ return -ENOSPC;
+
+ WARN_ON_ONCE(event->ctx->parent_ctx);
+ if (read_format & PERF_FORMAT_GROUP)
+ ret = perf_event_read_group(event, read_format, buf);
+ else
+ ret = perf_event_read_one(event, read_format, buf);
+
+ return ret;
+}
+
+static ssize_t
+perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ struct perf_event *event = file->private_data;
+
+ return perf_read_hw(event, buf, count);
+}
+
+static unsigned int perf_poll(struct file *file, poll_table *wait)
+{
+ struct perf_event *event = file->private_data;
+ struct ring_buffer *rb;
+ unsigned int events = POLL_HUP;
+
+ /*
+ * Pin the event->rb by taking event->mmap_mutex; otherwise
+ * perf_event_set_output() can swizzle our rb and make us miss wakeups.
+ */
+ mutex_lock(&event->mmap_mutex);
+ rb = event->rb;
+ if (rb)
+ events = atomic_xchg(&rb->poll, 0);
+ mutex_unlock(&event->mmap_mutex);
+
+ poll_wait(file, &event->waitq, wait);
+
+ return events;
+}
+
+static void perf_event_reset(struct perf_event *event)
+{
+ (void)perf_event_read(event);
+ local64_set(&event->count, 0);
+ perf_event_update_userpage(event);
+}
+
+/*
+ * Holding the top-level event's child_mutex means that any
+ * descendant process that has inherited this event will block
+ * in sync_child_event if it goes to exit, thus satisfying the
+ * task existence requirements of perf_event_enable/disable.
+ */
+static void perf_event_for_each_child(struct perf_event *event,
+ void (*func)(struct perf_event *))
+{
+ struct perf_event *child;
+
+ WARN_ON_ONCE(event->ctx->parent_ctx);
+ mutex_lock(&event->child_mutex);
+ func(event);
+ list_for_each_entry(child, &event->child_list, child_list)
+ func(child);
+ mutex_unlock(&event->child_mutex);
+}
+
+static void perf_event_for_each(struct perf_event *event,
+ void (*func)(struct perf_event *))
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *sibling;
+
+ WARN_ON_ONCE(ctx->parent_ctx);
+ mutex_lock(&ctx->mutex);
+ event = event->group_leader;
+
+ perf_event_for_each_child(event, func);
+ list_for_each_entry(sibling, &event->sibling_list, group_entry)
+ perf_event_for_each_child(sibling, func);
+ mutex_unlock(&ctx->mutex);
+}
+
+static int perf_event_period(struct perf_event *event, u64 __user *arg)
+{
+ struct perf_event_context *ctx = event->ctx;
+ int ret = 0, active;
+ u64 value;
+
+ if (!is_sampling_event(event))
+ return -EINVAL;
+
+ if (copy_from_user(&value, arg, sizeof(value)))
+ return -EFAULT;
+
+ if (!value)
+ return -EINVAL;
+
+ raw_spin_lock_irq(&ctx->lock);
+ if (event->attr.freq) {
+ if (value > sysctl_perf_event_sample_rate) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ event->attr.sample_freq = value;
+ } else {
+ event->attr.sample_period = value;
+ event->hw.sample_period = value;
+ }
+
+ active = (event->state == PERF_EVENT_STATE_ACTIVE);
+ if (active) {
+ perf_pmu_disable(ctx->pmu);
+ event->pmu->stop(event, PERF_EF_UPDATE);
+ }
+
+ local64_set(&event->hw.period_left, 0);
+
+ if (active) {
+ event->pmu->start(event, PERF_EF_RELOAD);
+ perf_pmu_enable(ctx->pmu);
+ }
+
+unlock:
+ raw_spin_unlock_irq(&ctx->lock);
+
+ return ret;
+}
+
+static const struct file_operations perf_fops;
+
+static inline int perf_fget_light(int fd, struct fd *p)
+{
+ struct fd f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ if (f.file->f_op != &perf_fops) {
+ fdput(f);
+ return -EBADF;
+ }
+ *p = f;
+ return 0;
+}
+
+static int perf_event_set_output(struct perf_event *event,
+ struct perf_event *output_event);
+static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct perf_event *event = file->private_data;
+ void (*func)(struct perf_event *);
+ u32 flags = arg;
+
+ switch (cmd) {
+ case PERF_EVENT_IOC_ENABLE:
+ func = perf_event_enable;
+ break;
+ case PERF_EVENT_IOC_DISABLE:
+ func = perf_event_disable;
+ break;
+ case PERF_EVENT_IOC_RESET:
+ func = perf_event_reset;
+ break;
+
+ case PERF_EVENT_IOC_REFRESH:
+ return perf_event_refresh(event, arg);
+
+ case PERF_EVENT_IOC_PERIOD:
+ return perf_event_period(event, (u64 __user *)arg);
+
+ case PERF_EVENT_IOC_ID:
+ {
+ u64 id = primary_event_id(event);
+
+ if (copy_to_user((void __user *)arg, &id, sizeof(id)))
+ return -EFAULT;
+ return 0;
+ }
+
+ case PERF_EVENT_IOC_SET_OUTPUT:
+ {
+ int ret;
+ if (arg != -1) {
+ struct perf_event *output_event;
+ struct fd output;
+ ret = perf_fget_light(arg, &output);
+ if (ret)
+ return ret;
+ output_event = output.file->private_data;
+ ret = perf_event_set_output(event, output_event);
+ fdput(output);
+ } else {
+ ret = perf_event_set_output(event, NULL);
+ }
+ return ret;
+ }
+
+ case PERF_EVENT_IOC_SET_FILTER:
+ return perf_event_set_filter(event, (void __user *)arg);
+
+ default:
+ return -ENOTTY;
+ }
+
+ if (flags & PERF_IOC_FLAG_GROUP)
+ perf_event_for_each(event, func);
+ else
+ perf_event_for_each_child(event, func);
+
+ return 0;
+}
+
+int perf_event_task_enable(void)
+{
+ struct perf_event *event;
+
+ mutex_lock(&current->perf_event_mutex);
+ list_for_each_entry(event, &current->perf_event_list, owner_entry)
+ perf_event_for_each_child(event, perf_event_enable);
+ mutex_unlock(&current->perf_event_mutex);
+
+ return 0;
+}
+
+int perf_event_task_disable(void)
+{
+ struct perf_event *event;
+
+ mutex_lock(&current->perf_event_mutex);
+ list_for_each_entry(event, &current->perf_event_list, owner_entry)
+ perf_event_for_each_child(event, perf_event_disable);
+ mutex_unlock(&current->perf_event_mutex);
+
+ return 0;
+}
+
+static int perf_event_index(struct perf_event *event)
+{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 0;
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return 0;
+
+ return event->pmu->event_idx(event);
+}
+
+static void calc_timer_values(struct perf_event *event,
+ u64 *now,
+ u64 *enabled,
+ u64 *running)
+{
+ u64 ctx_time;
+
+ *now = perf_clock();
+ ctx_time = event->shadow_ctx_time + *now;
+ *enabled = ctx_time - event->tstamp_enabled;
+ *running = ctx_time - event->tstamp_running;
+}
+
+static void perf_event_init_userpage(struct perf_event *event)
+{
+ struct perf_event_mmap_page *userpg;
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto unlock;
+
+ userpg = rb->user_page;
+
+ /* Allow new userspace to detect that bit 0 is deprecated */
+ userpg->cap_bit0_is_deprecated = 1;
+ userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
+
+unlock:
+ rcu_read_unlock();
+}
+
+void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+{
+}
+
+/*
+ * Callers need to ensure there can be no nesting of this function, otherwise
+ * the seqlock logic goes bad. We can not serialize this because the arch
+ * code calls this from NMI context.
+ */
+void perf_event_update_userpage(struct perf_event *event)
+{
+ struct perf_event_mmap_page *userpg;
+ struct ring_buffer *rb;
+ u64 enabled, running, now;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto unlock;
+
+ /*
+ * compute total_time_enabled, total_time_running
+ * based on snapshot values taken when the event
+ * was last scheduled in.
+ *
+ * we cannot simply called update_context_time()
+ * because of locking issue as we can be called in
+ * NMI context
+ */
+ calc_timer_values(event, &now, &enabled, &running);
+
+ userpg = rb->user_page;
+ /*
+ * Disable preemption so as to not let the corresponding user-space
+ * spin too long if we get preempted.
+ */
+ preempt_disable();
+ ++userpg->lock;
+ barrier();
+ userpg->index = perf_event_index(event);
+ userpg->offset = perf_event_count(event);
+ if (userpg->index)
+ userpg->offset -= local64_read(&event->hw.prev_count);
+
+ userpg->time_enabled = enabled +
+ atomic64_read(&event->child_total_time_enabled);
+
+ userpg->time_running = running +
+ atomic64_read(&event->child_total_time_running);
+
+ arch_perf_update_userpage(userpg, now);
+
+ barrier();
+ ++userpg->lock;
+ preempt_enable();
+unlock:
+ rcu_read_unlock();
+}
+
+static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct perf_event *event = vma->vm_file->private_data;
+ struct ring_buffer *rb;
+ int ret = VM_FAULT_SIGBUS;
+
+ if (vmf->flags & FAULT_FLAG_MKWRITE) {
+ if (vmf->pgoff == 0)
+ ret = 0;
+ return ret;
+ }
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto unlock;
+
+ if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
+ goto unlock;
+
+ vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
+ if (!vmf->page)
+ goto unlock;
+
+ get_page(vmf->page);
+ vmf->page->mapping = vma->vm_file->f_mapping;
+ vmf->page->index = vmf->pgoff;
+
+ ret = 0;
+unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static void ring_buffer_attach(struct perf_event *event,
+ struct ring_buffer *rb)
+{
+ struct ring_buffer *old_rb = NULL;
+ unsigned long flags;
+
+ if (event->rb) {
+ /*
+ * Should be impossible, we set this when removing
+ * event->rb_entry and wait/clear when adding event->rb_entry.
+ */
+ WARN_ON_ONCE(event->rcu_pending);
+
+ old_rb = event->rb;
+ event->rcu_batches = get_state_synchronize_rcu();
+ event->rcu_pending = 1;
+
+ spin_lock_irqsave(&old_rb->event_lock, flags);
+ list_del_rcu(&event->rb_entry);
+ spin_unlock_irqrestore(&old_rb->event_lock, flags);
+ }
+
+ if (event->rcu_pending && rb) {
+ cond_synchronize_rcu(event->rcu_batches);
+ event->rcu_pending = 0;
+ }
+
+ if (rb) {
+ spin_lock_irqsave(&rb->event_lock, flags);
+ list_add_rcu(&event->rb_entry, &rb->event_list);
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+ }
+
+ rcu_assign_pointer(event->rb, rb);
+
+ if (old_rb) {
+ ring_buffer_put(old_rb);
+ /*
+ * Since we detached before setting the new rb, so that we
+ * could attach the new rb, we could have missed a wakeup.
+ * Provide it now.
+ */
+ wake_up_all(&event->waitq);
+ }
+}
+
+static void ring_buffer_wakeup(struct perf_event *event)
+{
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (rb) {
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+ wake_up_all(&event->waitq);
+ }
+ rcu_read_unlock();
+}
+
+static void rb_free_rcu(struct rcu_head *rcu_head)
+{
+ struct ring_buffer *rb;
+
+ rb = container_of(rcu_head, struct ring_buffer, rcu_head);
+ rb_free(rb);
+}
+
+static struct ring_buffer *ring_buffer_get(struct perf_event *event)
+{
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (rb) {
+ if (!atomic_inc_not_zero(&rb->refcount))
+ rb = NULL;
+ }
+ rcu_read_unlock();
+
+ return rb;
+}
+
+static void ring_buffer_put(struct ring_buffer *rb)
+{
+ if (!atomic_dec_and_test(&rb->refcount))
+ return;
+
+ WARN_ON_ONCE(!list_empty(&rb->event_list));
+
+ call_rcu(&rb->rcu_head, rb_free_rcu);
+}
+
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+ struct perf_event *event = vma->vm_file->private_data;
+
+ atomic_inc(&event->mmap_count);
+ atomic_inc(&event->rb->mmap_count);
+}
+
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+ struct perf_event *event = vma->vm_file->private_data;
+
+ struct ring_buffer *rb = ring_buffer_get(event);
+ struct user_struct *mmap_user = rb->mmap_user;
+ int mmap_locked = rb->mmap_locked;
+ unsigned long size = perf_data_size(rb);
+
+ atomic_dec(&rb->mmap_count);
+
+ if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ goto out_put;
+
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+
+ /* If there's still other mmap()s of this buffer, we're done. */
+ if (atomic_read(&rb->mmap_count))
+ goto out_put;
+
+ /*
+ * No other mmap()s, detach from all other events that might redirect
+ * into the now unreachable buffer. Somewhat complicated by the
+ * fact that rb::event_lock otherwise nests inside mmap_mutex.
+ */
+again:
+ rcu_read_lock();
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+ if (!atomic_long_inc_not_zero(&event->refcount)) {
+ /*
+ * This event is en-route to free_event() which will
+ * detach it and remove it from the list.
+ */
+ continue;
+ }
+ rcu_read_unlock();
+
+ mutex_lock(&event->mmap_mutex);
+ /*
+ * Check we didn't race with perf_event_set_output() which can
+ * swizzle the rb from under us while we were waiting to
+ * acquire mmap_mutex.
+ *
+ * If we find a different rb; ignore this event, a next
+ * iteration will no longer find it on the list. We have to
+ * still restart the iteration to make sure we're not now
+ * iterating the wrong list.
+ */
+ if (event->rb == rb)
+ ring_buffer_attach(event, NULL);
+
+ mutex_unlock(&event->mmap_mutex);
+ put_event(event);
+
+ /*
+ * Restart the iteration; either we're on the wrong list or
+ * destroyed its integrity by doing a deletion.
+ */
+ goto again;
+ }
+ rcu_read_unlock();
+
+ /*
+ * It could be there's still a few 0-ref events on the list; they'll
+ * get cleaned up by free_event() -- they'll also still have their
+ * ref on the rb and will free it whenever they are done with it.
+ *
+ * Aside from that, this buffer is 'fully' detached and unmapped,
+ * undo the VM accounting.
+ */
+
+ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+ vma->vm_mm->pinned_vm -= mmap_locked;
+ free_uid(mmap_user);
+
+out_put:
+ ring_buffer_put(rb); /* could be last */
+}
+
+static const struct vm_operations_struct perf_mmap_vmops = {
+ .open = perf_mmap_open,
+ .close = perf_mmap_close,
+ .fault = perf_mmap_fault,
+ .page_mkwrite = perf_mmap_fault,
+};
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct perf_event *event = file->private_data;
+ unsigned long user_locked, user_lock_limit;
+ struct user_struct *user = current_user();
+ unsigned long locked, lock_limit;
+ struct ring_buffer *rb;
+ unsigned long vma_size;
+ unsigned long nr_pages;
+ long user_extra, extra;
+ int ret = 0, flags = 0;
+
+ /*
+ * Don't allow mmap() of inherited per-task counters. This would
+ * create a performance issue due to all children writing to the
+ * same rb.
+ */
+ if (event->cpu == -1 && event->attr.inherit)
+ return -EINVAL;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ vma_size = vma->vm_end - vma->vm_start;
+ nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+ /*
+ * If we have rb pages ensure they're a power-of-two number, so we
+ * can do bitmasks instead of modulo.
+ */
+ if (nr_pages != 0 && !is_power_of_2(nr_pages))
+ return -EINVAL;
+
+ if (vma_size != PAGE_SIZE * (1 + nr_pages))
+ return -EINVAL;
+
+ if (vma->vm_pgoff != 0)
+ return -EINVAL;
+
+ WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
+ mutex_lock(&event->mmap_mutex);
+ if (event->rb) {
+ if (event->rb->nr_pages != nr_pages) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+ /*
+ * Raced against perf_mmap_close() through
+ * perf_event_set_output(). Try again, hope for better
+ * luck.
+ */
+ mutex_unlock(&event->mmap_mutex);
+ goto again;
+ }
+
+ goto unlock;
+ }
+
+ user_extra = nr_pages + 1;
+ user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+
+ /*
+ * Increase the limit linearly with more CPUs:
+ */
+ user_lock_limit *= num_online_cpus();
+
+ user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+
+ extra = 0;
+ if (user_locked > user_lock_limit)
+ extra = user_locked - user_lock_limit;
+
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ locked = vma->vm_mm->pinned_vm + extra;
+
+ if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+ !capable(CAP_IPC_LOCK)) {
+ ret = -EPERM;
+ goto unlock;
+ }
+
+ WARN_ON(event->rb);
+
+ if (vma->vm_flags & VM_WRITE)
+ flags |= RING_BUFFER_WRITABLE;
+
+ rb = rb_alloc(nr_pages,
+ event->attr.watermark ? event->attr.wakeup_watermark : 0,
+ event->cpu, flags);
+
+ if (!rb) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ atomic_set(&rb->mmap_count, 1);
+ rb->mmap_locked = extra;
+ rb->mmap_user = get_current_user();
+
+ atomic_long_add(user_extra, &user->locked_vm);
+ vma->vm_mm->pinned_vm += extra;
+
+ ring_buffer_attach(event, rb);
+
+ perf_event_init_userpage(event);
+ perf_event_update_userpage(event);
+
+unlock:
+ if (!ret)
+ atomic_inc(&event->mmap_count);
+ mutex_unlock(&event->mmap_mutex);
+
+ /*
+ * Since pinned accounting is per vm we cannot allow fork() to copy our
+ * vma.
+ */
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
+ vma->vm_ops = &perf_mmap_vmops;
+
+ return ret;
+}
+
+static int perf_fasync(int fd, struct file *filp, int on)
+{
+ struct inode *inode = file_inode(filp);
+ struct perf_event *event = filp->private_data;
+ int retval;
+
+ mutex_lock(&inode->i_mutex);
+ retval = fasync_helper(fd, filp, on, &event->fasync);
+ mutex_unlock(&inode->i_mutex);
+
+ if (retval < 0)
+ return retval;
+
+ return 0;
+}
+
+static const struct file_operations perf_fops = {
+ .llseek = no_llseek,
+ .release = perf_release,
+ .read = perf_read,
+ .poll = perf_poll,
+ .unlocked_ioctl = perf_ioctl,
+ .compat_ioctl = perf_ioctl,
+ .mmap = perf_mmap,
+ .fasync = perf_fasync,
+};
+
+/*
+ * Perf event wakeup
+ *
+ * If there's data, ensure we set the poll() state and publish everything
+ * to user-space before waking everybody up.
+ */
+
+void perf_event_wakeup(struct perf_event *event)
+{
+ ring_buffer_wakeup(event);
+
+ if (event->pending_kill) {
+ kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+ event->pending_kill = 0;
+ }
+}
+
+static void perf_pending_event(struct irq_work *entry)
+{
+ struct perf_event *event = container_of(entry,
+ struct perf_event, pending);
+
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ __perf_event_disable(event);
+ }
+
+ if (event->pending_wakeup) {
+ event->pending_wakeup = 0;
+ perf_event_wakeup(event);
+ }
+}
+
+/*
+ * We assume there is only KVM supporting the callbacks.
+ * Later on, we might change it to a list if there is
+ * another virtualization implementation supporting the callbacks.
+ */
+struct perf_guest_info_callbacks *perf_guest_cbs;
+
+int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+ perf_guest_cbs = cbs;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
+
+int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+ perf_guest_cbs = NULL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+
+static void
+perf_output_sample_regs(struct perf_output_handle *handle,
+ struct pt_regs *regs, u64 mask)
+{
+ int bit;
+
+ for_each_set_bit(bit, (const unsigned long *) &mask,
+ sizeof(mask) * BITS_PER_BYTE) {
+ u64 val;
+
+ val = perf_reg_value(regs, bit);
+ perf_output_put(handle, val);
+ }
+}
+
+static void perf_sample_regs_user(struct perf_regs_user *regs_user,
+ struct pt_regs *regs)
+{
+ if (!user_mode(regs)) {
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ regs_user->regs = regs;
+ regs_user->abi = perf_reg_abi(current);
+ }
+}
+
+/*
+ * Get remaining task size from user stack pointer.
+ *
+ * It'd be better to take stack vma map and limit this more
+ * precisly, but there's no way to get it safely under interrupt,
+ * so using TASK_SIZE as limit.
+ */
+static u64 perf_ustack_task_size(struct pt_regs *regs)
+{
+ unsigned long addr = perf_user_stack_pointer(regs);
+
+ if (!addr || addr >= TASK_SIZE)
+ return 0;
+
+ return TASK_SIZE - addr;
+}
+
+static u16
+perf_sample_ustack_size(u16 stack_size, u16 header_size,
+ struct pt_regs *regs)
+{
+ u64 task_size;
+
+ /* No regs, no stack pointer, no dump. */
+ if (!regs)
+ return 0;
+
+ /*
+ * Check if we fit in with the requested stack size into the:
+ * - TASK_SIZE
+ * If we don't, we limit the size to the TASK_SIZE.
+ *
+ * - remaining sample size
+ * If we don't, we customize the stack size to
+ * fit in to the remaining sample size.
+ */
+
+ task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
+ stack_size = min(stack_size, (u16) task_size);
+
+ /* Current header size plus static size and dynamic size. */
+ header_size += 2 * sizeof(u64);
+
+ /* Do we fit in with the current stack dump size? */
+ if ((u16) (header_size + stack_size) < header_size) {
+ /*
+ * If we overflow the maximum size for the sample,
+ * we customize the stack dump size to fit in.
+ */
+ stack_size = USHRT_MAX - header_size - sizeof(u64);
+ stack_size = round_up(stack_size, sizeof(u64));
+ }
+
+ return stack_size;
+}
+
+static void
+perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
+ struct pt_regs *regs)
+{
+ /* Case of a kernel thread, nothing to dump */
+ if (!regs) {
+ u64 size = 0;
+ perf_output_put(handle, size);
+ } else {
+ unsigned long sp;
+ unsigned int rem;
+ u64 dyn_size;
+
+ /*
+ * We dump:
+ * static size
+ * - the size requested by user or the best one we can fit
+ * in to the sample max size
+ * data
+ * - user stack dump data
+ * dynamic size
+ * - the actual dumped size
+ */
+
+ /* Static size. */
+ perf_output_put(handle, dump_size);
+
+ /* Data. */
+ sp = perf_user_stack_pointer(regs);
+ rem = __output_copy_user(handle, (void *) sp, dump_size);
+ dyn_size = dump_size - rem;
+
+ perf_output_skip(handle, rem);
+
+ /* Dynamic size. */
+ perf_output_put(handle, dyn_size);
+ }
+}
+
+static void __perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ u64 sample_type = event->attr.sample_type;
+
+ data->type = sample_type;
+ header->size += event->id_header_size;
+
+ if (sample_type & PERF_SAMPLE_TID) {
+ /* namespace issues */
+ data->tid_entry.pid = perf_event_pid(event, current);
+ data->tid_entry.tid = perf_event_tid(event, current);
+ }
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ data->time = perf_clock();
+
+ if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
+ data->id = primary_event_id(event);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ data->stream_id = event->id;
+
+ if (sample_type & PERF_SAMPLE_CPU) {
+ data->cpu_entry.cpu = raw_smp_processor_id();
+ data->cpu_entry.reserved = 0;
+ }
+}
+
+void perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ if (event->attr.sample_id_all)
+ __perf_event_header__init_id(header, data, event);
+}
+
+static void __perf_event__output_id_sample(struct perf_output_handle *handle,
+ struct perf_sample_data *data)
+{
+ u64 sample_type = data->type;
+
+ if (sample_type & PERF_SAMPLE_TID)
+ perf_output_put(handle, data->tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ perf_output_put(handle, data->time);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ perf_output_put(handle, data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ perf_output_put(handle, data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ perf_output_put(handle, data->cpu_entry);
+
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ perf_output_put(handle, data->id);
+}
+
+void perf_event__output_id_sample(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *sample)
+{
+ if (event->attr.sample_id_all)
+ __perf_event__output_id_sample(handle, sample);
+}
+
+static void perf_output_read_one(struct perf_output_handle *handle,
+ struct perf_event *event,
+ u64 enabled, u64 running)
+{
+ u64 read_format = event->attr.read_format;
+ u64 values[4];
+ int n = 0;
+
+ values[n++] = perf_event_count(event);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = enabled +
+ atomic64_read(&event->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = running +
+ atomic64_read(&event->child_total_time_running);
+ }
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(event);
+
+ __output_copy(handle, values, n * sizeof(u64));
+}
+
+/*
+ * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
+ */
+static void perf_output_read_group(struct perf_output_handle *handle,
+ struct perf_event *event,
+ u64 enabled, u64 running)
+{
+ struct perf_event *leader = event->group_leader, *sub;
+ u64 read_format = event->attr.read_format;
+ u64 values[5];
+ int n = 0;
+
+ values[n++] = 1 + leader->nr_siblings;
+
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = enabled;
+
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = running;
+
+ if (leader != event)
+ leader->pmu->read(leader);
+
+ values[n++] = perf_event_count(leader);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(leader);
+
+ __output_copy(handle, values, n * sizeof(u64));
+
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ n = 0;
+
+ if ((sub != event) &&
+ (sub->state == PERF_EVENT_STATE_ACTIVE))
+ sub->pmu->read(sub);
+
+ values[n++] = perf_event_count(sub);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(sub);
+
+ __output_copy(handle, values, n * sizeof(u64));
+ }
+}
+
+#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
+ PERF_FORMAT_TOTAL_TIME_RUNNING)
+
+static void perf_output_read(struct perf_output_handle *handle,
+ struct perf_event *event)
+{
+ u64 enabled = 0, running = 0, now;
+ u64 read_format = event->attr.read_format;
+
+ /*
+ * compute total_time_enabled, total_time_running
+ * based on snapshot values taken when the event
+ * was last scheduled in.
+ *
+ * we cannot simply called update_context_time()
+ * because of locking issue as we are called in
+ * NMI context
+ */
+ if (read_format & PERF_FORMAT_TOTAL_TIMES)
+ calc_timer_values(event, &now, &enabled, &running);
+
+ if (event->attr.read_format & PERF_FORMAT_GROUP)
+ perf_output_read_group(handle, event, enabled, running);
+ else
+ perf_output_read_one(handle, event, enabled, running);
+}
+
+void perf_output_sample(struct perf_output_handle *handle,
+ struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ u64 sample_type = data->type;
+
+ perf_output_put(handle, *header);
+
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ perf_output_put(handle, data->id);
+
+ if (sample_type & PERF_SAMPLE_IP)
+ perf_output_put(handle, data->ip);
+
+ if (sample_type & PERF_SAMPLE_TID)
+ perf_output_put(handle, data->tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ perf_output_put(handle, data->time);
+
+ if (sample_type & PERF_SAMPLE_ADDR)
+ perf_output_put(handle, data->addr);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ perf_output_put(handle, data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ perf_output_put(handle, data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ perf_output_put(handle, data->cpu_entry);
+
+ if (sample_type & PERF_SAMPLE_PERIOD)
+ perf_output_put(handle, data->period);
+
+ if (sample_type & PERF_SAMPLE_READ)
+ perf_output_read(handle, event);
+
+ if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+ if (data->callchain) {
+ int size = 1;
+
+ if (data->callchain)
+ size += data->callchain->nr;
+
+ size *= sizeof(u64);
+
+ __output_copy(handle, data->callchain, size);
+ } else {
+ u64 nr = 0;
+ perf_output_put(handle, nr);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_RAW) {
+ if (data->raw) {
+ perf_output_put(handle, data->raw->size);
+ __output_copy(handle, data->raw->data,
+ data->raw->size);
+ } else {
+ struct {
+ u32 size;
+ u32 data;
+ } raw = {
+ .size = sizeof(u32),
+ .data = 0,
+ };
+ perf_output_put(handle, raw);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ if (data->br_stack) {
+ size_t size;
+
+ size = data->br_stack->nr
+ * sizeof(struct perf_branch_entry);
+
+ perf_output_put(handle, data->br_stack->nr);
+ perf_output_copy(handle, data->br_stack->entries, size);
+ } else {
+ /*
+ * we always store at least the value of nr
+ */
+ u64 nr = 0;
+ perf_output_put(handle, nr);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_REGS_USER) {
+ u64 abi = data->regs_user.abi;
+
+ /*
+ * If there are no regs to dump, notice it through
+ * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
+ */
+ perf_output_put(handle, abi);
+
+ if (abi) {
+ u64 mask = event->attr.sample_regs_user;
+ perf_output_sample_regs(handle,
+ data->regs_user.regs,
+ mask);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_STACK_USER) {
+ perf_output_sample_ustack(handle,
+ data->stack_user_size,
+ data->regs_user.regs);
+ }
+
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ perf_output_put(handle, data->weight);
+
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ perf_output_put(handle, data->data_src.val);
+
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ perf_output_put(handle, data->txn);
+
+ if (!event->attr.watermark) {
+ int wakeup_events = event->attr.wakeup_events;
+
+ if (wakeup_events) {
+ struct ring_buffer *rb = handle->rb;
+ int events = local_inc_return(&rb->events);
+
+ if (events >= wakeup_events) {
+ local_sub(wakeup_events, &rb->events);
+ local_inc(&rb->wakeup);
+ }
+ }
+ }
+}
+
+void perf_prepare_sample(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event,
+ struct pt_regs *regs)
+{
+ u64 sample_type = event->attr.sample_type;
+
+ header->type = PERF_RECORD_SAMPLE;
+ header->size = sizeof(*header) + event->header_size;
+
+ header->misc = 0;
+ header->misc |= perf_misc_flags(regs);
+
+ __perf_event_header__init_id(header, data, event);
+
+ if (sample_type & PERF_SAMPLE_IP)
+ data->ip = perf_instruction_pointer(regs);
+
+ if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+ int size = 1;
+
+ data->callchain = perf_callchain(event, regs);
+
+ if (data->callchain)
+ size += data->callchain->nr;
+
+ header->size += size * sizeof(u64);
+ }
+
+ if (sample_type & PERF_SAMPLE_RAW) {
+ int size = sizeof(u32);
+
+ if (data->raw)
+ size += data->raw->size;
+ else
+ size += sizeof(u32);
+
+ WARN_ON_ONCE(size & (sizeof(u64)-1));
+ header->size += size;
+ }
+
+ if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ int size = sizeof(u64); /* nr */
+ if (data->br_stack) {
+ size += data->br_stack->nr
+ * sizeof(struct perf_branch_entry);
+ }
+ header->size += size;
+ }
+
+ if (sample_type & PERF_SAMPLE_REGS_USER) {
+ /* regs dump ABI info */
+ int size = sizeof(u64);
+
+ perf_sample_regs_user(&data->regs_user, regs);
+
+ if (data->regs_user.regs) {
+ u64 mask = event->attr.sample_regs_user;
+ size += hweight64(mask) * sizeof(u64);
+ }
+
+ header->size += size;
+ }
+
+ if (sample_type & PERF_SAMPLE_STACK_USER) {
+ /*
+ * Either we need PERF_SAMPLE_STACK_USER bit to be allways
+ * processed as the last one or have additional check added
+ * in case new sample type is added, because we could eat
+ * up the rest of the sample size.
+ */
+ struct perf_regs_user *uregs = &data->regs_user;
+ u16 stack_size = event->attr.sample_stack_user;
+ u16 size = sizeof(u64);
+
+ if (!uregs->abi)
+ perf_sample_regs_user(uregs, regs);
+
+ stack_size = perf_sample_ustack_size(stack_size, header->size,
+ uregs->regs);
+
+ /*
+ * If there is something to dump, add space for the dump
+ * itself and for the field that tells the dynamic size,
+ * which is how many have been actually dumped.
+ */
+ if (stack_size)
+ size += sizeof(u64) + stack_size;
+
+ data->stack_user_size = stack_size;
+ header->size += size;
+ }
+}
+
+static void perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+
+ /* protect the callchain buffers */
+ rcu_read_lock();
+
+ perf_prepare_sample(&header, data, event, regs);
+
+ if (perf_output_begin(&handle, event, header.size))
+ goto exit;
+
+ perf_output_sample(&handle, &header, data, event);
+
+ perf_output_end(&handle);
+
+exit:
+ rcu_read_unlock();
+}
+
+/*
+ * read event_id
+ */
+
+struct perf_read_event {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+};
+
+static void
+perf_event_read_event(struct perf_event *event,
+ struct task_struct *task)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct perf_read_event read_event = {
+ .header = {
+ .type = PERF_RECORD_READ,
+ .misc = 0,
+ .size = sizeof(read_event) + event->read_size,
+ },
+ .pid = perf_event_pid(event, task),
+ .tid = perf_event_tid(event, task),
+ };
+ int ret;
+
+ perf_event_header__init_id(&read_event.header, &sample, event);
+ ret = perf_output_begin(&handle, event, read_event.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, read_event);
+ perf_output_read(&handle, event);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+
+static void
+perf_event_aux_ctx(struct perf_event_context *ctx,
+ perf_event_aux_output_cb output,
+ void *data)
+{
+ struct perf_event *event;
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ output(event, data);
+ }
+}
+
+static void
+perf_event_aux(perf_event_aux_output_cb output, void *data,
+ struct perf_event_context *task_ctx)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+ int ctxn;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->unique_pmu != pmu)
+ goto next;
+ perf_event_aux_ctx(&cpuctx->ctx, output, data);
+ if (task_ctx)
+ goto next;
+ ctxn = pmu->task_ctx_nr;
+ if (ctxn < 0)
+ goto next;
+ ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+ if (ctx)
+ perf_event_aux_ctx(ctx, output, data);
+next:
+ put_cpu_ptr(pmu->pmu_cpu_context);
+ }
+
+ if (task_ctx) {
+ preempt_disable();
+ perf_event_aux_ctx(task_ctx, output, data);
+ preempt_enable();
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * task tracking -- fork/exit
+ *
+ * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
+ */
+
+struct perf_task_event {
+ struct task_struct *task;
+ struct perf_event_context *task_ctx;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 ppid;
+ u32 tid;
+ u32 ptid;
+ u64 time;
+ } event_id;
+};
+
+static int perf_event_task_match(struct perf_event *event)
+{
+ return event->attr.comm || event->attr.mmap ||
+ event->attr.mmap2 || event->attr.mmap_data ||
+ event->attr.task;
+}
+
+static void perf_event_task_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_task_event *task_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct task_struct *task = task_event->task;
+ int ret, size = task_event->event_id.header.size;
+
+ if (!perf_event_task_match(event))
+ return;
+
+ perf_event_header__init_id(&task_event->event_id.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event,
+ task_event->event_id.header.size);
+ if (ret)
+ goto out;
+
+ task_event->event_id.pid = perf_event_pid(event, task);
+ task_event->event_id.ppid = perf_event_pid(event, current);
+
+ task_event->event_id.tid = perf_event_tid(event, task);
+ task_event->event_id.ptid = perf_event_tid(event, current);
+
+ perf_output_put(&handle, task_event->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ task_event->event_id.header.size = size;
+}
+
+static void perf_event_task(struct task_struct *task,
+ struct perf_event_context *task_ctx,
+ int new)
+{
+ struct perf_task_event task_event;
+
+ if (!atomic_read(&nr_comm_events) &&
+ !atomic_read(&nr_mmap_events) &&
+ !atomic_read(&nr_task_events))
+ return;
+
+ task_event = (struct perf_task_event){
+ .task = task,
+ .task_ctx = task_ctx,
+ .event_id = {
+ .header = {
+ .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
+ .misc = 0,
+ .size = sizeof(task_event.event_id),
+ },
+ /* .pid */
+ /* .ppid */
+ /* .tid */
+ /* .ptid */
+ .time = perf_clock(),
+ },
+ };
+
+ perf_event_aux(perf_event_task_output,
+ &task_event,
+ task_ctx);
+}
+
+void perf_event_fork(struct task_struct *task)
+{
+ perf_event_task(task, NULL, 1);
+}
+
+/*
+ * comm tracking
+ */
+
+struct perf_comm_event {
+ struct task_struct *task;
+ char *comm;
+ int comm_size;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ } event_id;
+};
+
+static int perf_event_comm_match(struct perf_event *event)
+{
+ return event->attr.comm;
+}
+
+static void perf_event_comm_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_comm_event *comm_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int size = comm_event->event_id.header.size;
+ int ret;
+
+ if (!perf_event_comm_match(event))
+ return;
+
+ perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ comm_event->event_id.header.size);
+
+ if (ret)
+ goto out;
+
+ comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
+ comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
+
+ perf_output_put(&handle, comm_event->event_id);
+ __output_copy(&handle, comm_event->comm,
+ comm_event->comm_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ comm_event->event_id.header.size = size;
+}
+
+static void perf_event_comm_event(struct perf_comm_event *comm_event)
+{
+ char comm[TASK_COMM_LEN];
+ unsigned int size;
+
+ memset(comm, 0, sizeof(comm));
+ strlcpy(comm, comm_event->task->comm, sizeof(comm));
+ size = ALIGN(strlen(comm)+1, sizeof(u64));
+
+ comm_event->comm = comm;
+ comm_event->comm_size = size;
+
+ comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
+
+ perf_event_aux(perf_event_comm_output,
+ comm_event,
+ NULL);
+}
+
+void perf_event_comm(struct task_struct *task, bool exec)
+{
+ struct perf_comm_event comm_event;
+
+ if (!atomic_read(&nr_comm_events))
+ return;
+
+ comm_event = (struct perf_comm_event){
+ .task = task,
+ /* .comm */
+ /* .comm_size */
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_COMM,
+ .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
+ /* .size */
+ },
+ /* .pid */
+ /* .tid */
+ },
+ };
+
+ perf_event_comm_event(&comm_event);
+}
+
+/*
+ * mmap tracking
+ */
+
+struct perf_mmap_event {
+ struct vm_area_struct *vma;
+
+ const char *file_name;
+ int file_size;
+ int maj, min;
+ u64 ino;
+ u64 ino_generation;
+ u32 prot, flags;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ u64 start;
+ u64 len;
+ u64 pgoff;
+ } event_id;
+};
+
+static int perf_event_mmap_match(struct perf_event *event,
+ void *data)
+{
+ struct perf_mmap_event *mmap_event = data;
+ struct vm_area_struct *vma = mmap_event->vma;
+ int executable = vma->vm_flags & VM_EXEC;
+
+ return (!executable && event->attr.mmap_data) ||
+ (executable && (event->attr.mmap || event->attr.mmap2));
+}
+
+static void perf_event_mmap_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_mmap_event *mmap_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int size = mmap_event->event_id.header.size;
+ int ret;
+
+ if (!perf_event_mmap_match(event, data))
+ return;
+
+ if (event->attr.mmap2) {
+ mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
+ mmap_event->event_id.header.size += sizeof(mmap_event->maj);
+ mmap_event->event_id.header.size += sizeof(mmap_event->min);
+ mmap_event->event_id.header.size += sizeof(mmap_event->ino);
+ mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
+ mmap_event->event_id.header.size += sizeof(mmap_event->prot);
+ mmap_event->event_id.header.size += sizeof(mmap_event->flags);
+ }
+
+ perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ mmap_event->event_id.header.size);
+ if (ret)
+ goto out;
+
+ mmap_event->event_id.pid = perf_event_pid(event, current);
+ mmap_event->event_id.tid = perf_event_tid(event, current);
+
+ perf_output_put(&handle, mmap_event->event_id);
+
+ if (event->attr.mmap2) {
+ perf_output_put(&handle, mmap_event->maj);
+ perf_output_put(&handle, mmap_event->min);
+ perf_output_put(&handle, mmap_event->ino);
+ perf_output_put(&handle, mmap_event->ino_generation);
+ perf_output_put(&handle, mmap_event->prot);
+ perf_output_put(&handle, mmap_event->flags);
+ }
+
+ __output_copy(&handle, mmap_event->file_name,
+ mmap_event->file_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ mmap_event->event_id.header.size = size;
+}
+
+static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
+{
+ struct vm_area_struct *vma = mmap_event->vma;
+ struct file *file = vma->vm_file;
+ int maj = 0, min = 0;
+ u64 ino = 0, gen = 0;
+ u32 prot = 0, flags = 0;
+ unsigned int size;
+ char tmp[16];
+ char *buf = NULL;
+ char *name;
+
+ if (file) {
+ struct inode *inode;
+ dev_t dev;
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ name = "//enomem";
+ goto cpy_name;
+ }
+ /*
+ * d_path() works from the end of the rb backwards, so we
+ * need to add enough zero bytes after the string to handle
+ * the 64bit alignment we do later.
+ */
+ name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
+ if (IS_ERR(name)) {
+ name = "//toolong";
+ goto cpy_name;
+ }
+ inode = file_inode(vma->vm_file);
+ dev = inode->i_sb->s_dev;
+ ino = inode->i_ino;
+ gen = inode->i_generation;
+ maj = MAJOR(dev);
+ min = MINOR(dev);
+
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vma->vm_flags & VM_MAYEXEC)
+ flags |= MAP_EXECUTABLE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_HUGETLB)
+ flags |= MAP_HUGETLB;
+
+ goto got_name;
+ } else {
+ name = (char *)arch_vma_name(vma);
+ if (name)
+ goto cpy_name;
+
+ if (vma->vm_start <= vma->vm_mm->start_brk &&
+ vma->vm_end >= vma->vm_mm->brk) {
+ name = "[heap]";
+ goto cpy_name;
+ }
+ if (vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack) {
+ name = "[stack]";
+ goto cpy_name;
+ }
+
+ name = "//anon";
+ goto cpy_name;
+ }
+
+cpy_name:
+ strlcpy(tmp, name, sizeof(tmp));
+ name = tmp;
+got_name:
+ /*
+ * Since our buffer works in 8 byte units we need to align our string
+ * size to a multiple of 8. However, we must guarantee the tail end is
+ * zero'd out to avoid leaking random bits to userspace.
+ */
+ size = strlen(name)+1;
+ while (!IS_ALIGNED(size, sizeof(u64)))
+ name[size++] = '\0';
+
+ mmap_event->file_name = name;
+ mmap_event->file_size = size;
+ mmap_event->maj = maj;
+ mmap_event->min = min;
+ mmap_event->ino = ino;
+ mmap_event->ino_generation = gen;
+ mmap_event->prot = prot;
+ mmap_event->flags = flags;
+
+ if (!(vma->vm_flags & VM_EXEC))
+ mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
+
+ mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
+
+ perf_event_aux(perf_event_mmap_output,
+ mmap_event,
+ NULL);
+
+ kfree(buf);
+}
+
+void perf_event_mmap(struct vm_area_struct *vma)
+{
+ struct perf_mmap_event mmap_event;
+
+ if (!atomic_read(&nr_mmap_events))
+ return;
+
+ mmap_event = (struct perf_mmap_event){
+ .vma = vma,
+ /* .file_name */
+ /* .file_size */
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_MMAP,
+ .misc = PERF_RECORD_MISC_USER,
+ /* .size */
+ },
+ /* .pid */
+ /* .tid */
+ .start = vma->vm_start,
+ .len = vma->vm_end - vma->vm_start,
+ .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
+ },
+ /* .maj (attr_mmap2 only) */
+ /* .min (attr_mmap2 only) */
+ /* .ino (attr_mmap2 only) */
+ /* .ino_generation (attr_mmap2 only) */
+ /* .prot (attr_mmap2 only) */
+ /* .flags (attr_mmap2 only) */
+ };
+
+ perf_event_mmap_event(&mmap_event);
+}
+
+/*
+ * IRQ throttle logging
+ */
+
+static void perf_log_throttle(struct perf_event *event, int enable)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ struct {
+ struct perf_event_header header;
+ u64 time;
+ u64 id;
+ u64 stream_id;
+ } throttle_event = {
+ .header = {
+ .type = PERF_RECORD_THROTTLE,
+ .misc = 0,
+ .size = sizeof(throttle_event),
+ },
+ .time = perf_clock(),
+ .id = primary_event_id(event),
+ .stream_id = event->id,
+ };
+
+ if (enable)
+ throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
+
+ perf_event_header__init_id(&throttle_event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event,
+ throttle_event.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, throttle_event);
+ perf_event__output_id_sample(event, &handle, &sample);
+ perf_output_end(&handle);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int events = atomic_read(&event->event_limit);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 seq;
+ int ret = 0;
+
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
+ seq = __this_cpu_read(perf_throttled_seq);
+ if (seq != hwc->interrupts_seq) {
+ hwc->interrupts_seq = seq;
+ hwc->interrupts = 1;
+ } else {
+ hwc->interrupts++;
+ if (unlikely(throttle
+ && hwc->interrupts >= max_samples_per_tick)) {
+ __this_cpu_inc(perf_throttled_count);
+ hwc->interrupts = MAX_INTERRUPTS;
+ perf_log_throttle(event, 0);
+ tick_nohz_full_kick();
+ ret = 1;
+ }
+ }
+
+ if (event->attr.freq) {
+ u64 now = perf_clock();
+ s64 delta = now - hwc->freq_time_stamp;
+
+ hwc->freq_time_stamp = now;
+
+ if (delta > 0 && delta < 2*TICK_NSEC)
+ perf_adjust_period(event, delta, hwc->last_period, true);
+ }
+
+ /*
+ * XXX event_limit might not quite work as expected on inherited
+ * events
+ */
+
+ event->pending_kill = POLL_IN;
+ if (events && atomic_dec_and_test(&event->event_limit)) {
+ ret = 1;
+ event->pending_kill = POLL_HUP;
+ event->pending_disable = 1;
+ irq_work_queue(&event->pending);
+ }
+
+ if (event->overflow_handler)
+ event->overflow_handler(event, data, regs);
+ else
+ perf_event_output(event, data, regs);
+
+ if (event->fasync && event->pending_kill) {
+ event->pending_wakeup = 1;
+ irq_work_queue(&event->pending);
+ }
+
+ return ret;
+}
+
+int perf_event_overflow(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ return __perf_event_overflow(event, 1, data, regs);
+}
+
+/*
+ * Generic software event infrastructure
+ */
+
+struct swevent_htable {
+ struct swevent_hlist *swevent_hlist;
+ struct mutex hlist_mutex;
+ int hlist_refcount;
+
+ /* Recursion avoidance in each contexts */
+ int recursion[PERF_NR_CONTEXTS];
+
+ /* Keeps track of cpu being initialized/exited */
+ bool online;
+};
+
+static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
+
+/*
+ * We directly increment event->count and keep a second value in
+ * event->hw.period_left to count intervals. This period event
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+u64 perf_swevent_set_period(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 period = hwc->last_period;
+ u64 nr, offset;
+ s64 old, val;
+
+ hwc->last_period = hwc->sample_period;
+
+again:
+ old = val = local64_read(&hwc->period_left);
+ if (val < 0)
+ return 0;
+
+ nr = div64_u64(period + val, period);
+ offset = nr * period;
+ val -= offset;
+ if (local64_cmpxchg(&hwc->period_left, old, val) != old)
+ goto again;
+
+ return nr;
+}
+
+static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int throttle = 0;
+
+ if (!overflow)
+ overflow = perf_swevent_set_period(event);
+
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ return;
+
+ for (; overflow; overflow--) {
+ if (__perf_event_overflow(event, throttle,
+ data, regs)) {
+ /*
+ * We inhibit the overflow from happening when
+ * hwc->interrupts == MAX_INTERRUPTS.
+ */
+ break;
+ }
+ throttle = 1;
+ }
+}
+
+static void perf_swevent_event(struct perf_event *event, u64 nr,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ local64_add(nr, &event->count);
+
+ if (!regs)
+ return;
+
+ if (!is_sampling_event(event))
+ return;
+
+ if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
+ data->period = nr;
+ return perf_swevent_overflow(event, 1, data, regs);
+ } else
+ data->period = event->hw.last_period;
+
+ if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
+ return perf_swevent_overflow(event, 1, data, regs);
+
+ if (local64_add_negative(nr, &hwc->period_left))
+ return;
+
+ perf_swevent_overflow(event, 0, data, regs);
+}
+
+static int perf_exclude_event(struct perf_event *event,
+ struct pt_regs *regs)
+{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 1;
+
+ if (regs) {
+ if (event->attr.exclude_user && user_mode(regs))
+ return 1;
+
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int perf_swevent_match(struct perf_event *event,
+ enum perf_type_id type,
+ u32 event_id,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ if (event->attr.type != type)
+ return 0;
+
+ if (event->attr.config != event_id)
+ return 0;
+
+ if (perf_exclude_event(event, regs))
+ return 0;
+
+ return 1;
+}
+
+static inline u64 swevent_hash(u64 type, u32 event_id)
+{
+ u64 val = event_id | (type << 32);
+
+ return hash_64(val, SWEVENT_HLIST_BITS);
+}
+
+static inline struct hlist_head *
+__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
+{
+ u64 hash = swevent_hash(type, event_id);
+
+ return &hlist->heads[hash];
+}
+
+/* For the read side: events when they trigger */
+static inline struct hlist_head *
+find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
+{
+ struct swevent_hlist *hlist;
+
+ hlist = rcu_dereference(swhash->swevent_hlist);
+ if (!hlist)
+ return NULL;
+
+ return __find_swevent_head(hlist, type, event_id);
+}
+
+/* For the event head insertion and removal in the hlist */
+static inline struct hlist_head *
+find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
+{
+ struct swevent_hlist *hlist;
+ u32 event_id = event->attr.config;
+ u64 type = event->attr.type;
+
+ /*
+ * Event scheduling is always serialized against hlist allocation
+ * and release. Which makes the protected version suitable here.
+ * The context lock guarantees that.
+ */
+ hlist = rcu_dereference_protected(swhash->swevent_hlist,
+ lockdep_is_held(&event->ctx->lock));
+ if (!hlist)
+ return NULL;
+
+ return __find_swevent_head(hlist, type, event_id);
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+ u64 nr,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+ struct perf_event *event;
+ struct hlist_head *head;
+
+ rcu_read_lock();
+ head = find_swevent_head_rcu(swhash, type, event_id);
+ if (!head)
+ goto end;
+
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_swevent_match(event, type, event_id, data, regs))
+ perf_swevent_event(event, nr, data, regs);
+ }
+end:
+ rcu_read_unlock();
+}
+
+int perf_swevent_get_recursion_context(void)
+{
+ struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+
+ return get_recursion_context(swhash->recursion);
+}
+EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
+
+inline void perf_swevent_put_recursion_context(int rctx)
+{
+ struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+
+ put_recursion_context(swhash->recursion, rctx);
+}
+
+void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+ struct perf_sample_data data;
+ int rctx;
+
+ preempt_disable_notrace();
+ rctx = perf_swevent_get_recursion_context();
+ if (rctx < 0)
+ return;
+
+ perf_sample_data_init(&data, addr, 0);
+
+ do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
+
+ perf_swevent_put_recursion_context(rctx);
+ preempt_enable_notrace();
+}
+
+static void perf_swevent_read(struct perf_event *event)
+{
+}
+
+static int perf_swevent_add(struct perf_event *event, int flags)
+{
+ struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+ struct hw_perf_event *hwc = &event->hw;
+ struct hlist_head *head;
+
+ if (is_sampling_event(event)) {
+ hwc->last_period = hwc->sample_period;
+ perf_swevent_set_period(event);
+ }
+
+ hwc->state = !(flags & PERF_EF_START);
+
+ head = find_swevent_head(swhash, event);
+ if (!head) {
+ /*
+ * We can race with cpu hotplug code. Do not
+ * WARN if the cpu just got unplugged.
+ */
+ WARN_ON_ONCE(swhash->online);
+ return -EINVAL;
+ }
+
+ hlist_add_head_rcu(&event->hlist_entry, head);
+
+ return 0;
+}
+
+static void perf_swevent_del(struct perf_event *event, int flags)
+{
+ hlist_del_rcu(&event->hlist_entry);
+}
+
+static void perf_swevent_start(struct perf_event *event, int flags)
+{
+ event->hw.state = 0;
+}
+
+static void perf_swevent_stop(struct perf_event *event, int flags)
+{
+ event->hw.state = PERF_HES_STOPPED;
+}
+
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct swevent_htable *swhash)
+{
+ return rcu_dereference_protected(swhash->swevent_hlist,
+ lockdep_is_held(&swhash->hlist_mutex));
+}
+
+static void swevent_hlist_release(struct swevent_htable *swhash)
+{
+ struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
+
+ if (!hlist)
+ return;
+
+ rcu_assign_pointer(swhash->swevent_hlist, NULL);
+ kfree_rcu(hlist, rcu_head);
+}
+
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+ mutex_lock(&swhash->hlist_mutex);
+
+ if (!--swhash->hlist_refcount)
+ swevent_hlist_release(swhash);
+
+ mutex_unlock(&swhash->hlist_mutex);
+}
+
+static void swevent_hlist_put(struct perf_event *event)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ swevent_hlist_put_cpu(event, cpu);
+}
+
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+ int err = 0;
+
+ mutex_lock(&swhash->hlist_mutex);
+
+ if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
+ struct swevent_hlist *hlist;
+
+ hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+ if (!hlist) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ rcu_assign_pointer(swhash->swevent_hlist, hlist);
+ }
+ swhash->hlist_refcount++;
+exit:
+ mutex_unlock(&swhash->hlist_mutex);
+
+ return err;
+}
+
+static int swevent_hlist_get(struct perf_event *event)
+{
+ int err;
+ int cpu, failed_cpu;
+
+ get_online_cpus();
+ for_each_possible_cpu(cpu) {
+ err = swevent_hlist_get_cpu(event, cpu);
+ if (err) {
+ failed_cpu = cpu;
+ goto fail;
+ }
+ }
+ put_online_cpus();
+
+ return 0;
+fail:
+ for_each_possible_cpu(cpu) {
+ if (cpu == failed_cpu)
+ break;
+ swevent_hlist_put_cpu(event, cpu);
+ }
+
+ put_online_cpus();
+ return err;
+}
+
+struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+ u64 event_id = event->attr.config;
+
+ WARN_ON(event->parent);
+
+ static_key_slow_dec(&perf_swevent_enabled[event_id]);
+ swevent_hlist_put(event);
+}
+
+static int perf_swevent_init(struct perf_event *event)
+{
+ u64 event_id = event->attr.config;
+
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ switch (event_id) {
+ case PERF_COUNT_SW_CPU_CLOCK:
+ case PERF_COUNT_SW_TASK_CLOCK:
+ return -ENOENT;
+
+ default:
+ break;
+ }
+
+ if (event_id >= PERF_COUNT_SW_MAX)
+ return -ENOENT;
+
+ if (!event->parent) {
+ int err;
+
+ err = swevent_hlist_get(event);
+ if (err)
+ return err;
+
+ static_key_slow_inc(&perf_swevent_enabled[event_id]);
+ event->destroy = sw_perf_event_destroy;
+ }
+
+ return 0;
+}
+
+static int perf_swevent_event_idx(struct perf_event *event)
+{
+ return 0;
+}
+
+static struct pmu perf_swevent = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_swevent_init,
+ .add = perf_swevent_add,
+ .del = perf_swevent_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+
+ .event_idx = perf_swevent_event_idx,
+};
+
+#ifdef CONFIG_EVENT_TRACING
+
+static int perf_tp_filter_match(struct perf_event *event,
+ struct perf_sample_data *data)
+{
+ void *record = data->raw->data;
+
+ if (likely(!event->filter) || filter_match_preds(event->filter, record))
+ return 1;
+ return 0;
+}
+
+static int perf_tp_event_match(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 0;
+ /*
+ * All tracepoints are from kernel-space.
+ */
+ if (event->attr.exclude_kernel)
+ return 0;
+
+ if (!perf_tp_filter_match(event, data))
+ return 0;
+
+ return 1;
+}
+
+void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+ struct pt_regs *regs, struct hlist_head *head, int rctx,
+ struct task_struct *task)
+{
+ struct perf_sample_data data;
+ struct perf_event *event;
+
+ struct perf_raw_record raw = {
+ .size = entry_size,
+ .data = record,
+ };
+
+ perf_sample_data_init(&data, addr, 0);
+ data.raw = &raw;
+
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
+
+ /*
+ * If we got specified a target task, also iterate its context and
+ * deliver this event there too.
+ */
+ if (task && task != current) {
+ struct perf_event_context *ctx;
+ struct trace_entry *entry = record;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ if (!ctx)
+ goto unlock;
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ continue;
+ if (event->attr.config != entry->type)
+ continue;
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
+unlock:
+ rcu_read_unlock();
+ }
+
+ perf_swevent_put_recursion_context(rctx);
+}
+EXPORT_SYMBOL_GPL(perf_tp_event);
+
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+ perf_trace_destroy(event);
+}
+
+static int perf_tp_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ err = perf_trace_init(event);
+ if (err)
+ return err;
+
+ event->destroy = tp_perf_event_destroy;
+
+ return 0;
+}
+
+static struct pmu perf_tracepoint = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_tp_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+
+ .event_idx = perf_swevent_event_idx,
+};
+
+static inline void perf_tp_register(void)
+{
+ perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
+}
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+ char *filter_str;
+ int ret;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+
+ filter_str = strndup_user(arg, PAGE_SIZE);
+ if (IS_ERR(filter_str))
+ return PTR_ERR(filter_str);
+
+ ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+
+ kfree(filter_str);
+ return ret;
+}
+
+static void perf_event_free_filter(struct perf_event *event)
+{
+ ftrace_profile_free_filter(event);
+}
+
+#else
+
+static inline void perf_tp_register(void)
+{
+}
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+ return -ENOENT;
+}
+
+static void perf_event_free_filter(struct perf_event *event)
+{
+}
+
+#endif /* CONFIG_EVENT_TRACING */
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+void perf_bp_event(struct perf_event *bp, void *data)
+{
+ struct perf_sample_data sample;
+ struct pt_regs *regs = data;
+
+ perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
+
+ if (!bp->hw.state && !perf_exclude_event(bp, regs))
+ perf_swevent_event(bp, 1, &sample, regs);
+}
+#endif
+
+/*
+ * hrtimer based swevent callback
+ */
+
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
+{
+ enum hrtimer_restart ret = HRTIMER_RESTART;
+ struct perf_sample_data data;
+ struct pt_regs *regs;
+ struct perf_event *event;
+ u64 period;
+
+ event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return HRTIMER_NORESTART;
+
+ event->pmu->read(event);
+
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+ regs = get_irq_regs();
+
+ if (regs && !perf_exclude_event(event, regs)) {
+ if (!(event->attr.exclude_idle && is_idle_task(current)))
+ if (__perf_event_overflow(event, 1, &data, regs))
+ ret = HRTIMER_NORESTART;
+ }
+
+ period = max_t(u64, 10000, event->hw.sample_period);
+ hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+ return ret;
+}
+
+static void perf_swevent_start_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 period;
+
+ if (!is_sampling_event(event))
+ return;
+
+ period = local64_read(&hwc->period_left);
+ if (period) {
+ if (period < 0)
+ period = 10000;
+
+ local64_set(&hwc->period_left, 0);
+ } else {
+ period = max_t(u64, 10000, hwc->sample_period);
+ }
+ __hrtimer_start_range_ns(&hwc->hrtimer,
+ ns_to_ktime(period), 0,
+ HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (is_sampling_event(event)) {
+ ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+ local64_set(&hwc->period_left, ktime_to_ns(remaining));
+
+ hrtimer_cancel(&hwc->hrtimer);
+ }
+}
+
+static void perf_swevent_init_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!is_sampling_event(event))
+ return;
+
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hwc->hrtimer.function = perf_swevent_hrtimer;
+
+ /*
+ * Since hrtimers have a fixed rate, we can do a static freq->period
+ * mapping and avoid the whole period adjust feedback stuff.
+ */
+ if (event->attr.freq) {
+ long freq = event->attr.sample_freq;
+
+ event->attr.sample_period = NSEC_PER_SEC / freq;
+ hwc->sample_period = event->attr.sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+ hwc->last_period = hwc->sample_period;
+ event->attr.freq = 0;
+ }
+}
+
+/*
+ * Software event: cpu wall time clock
+ */
+
+static void cpu_clock_event_update(struct perf_event *event)
+{
+ s64 prev;
+ u64 now;
+
+ now = local_clock();
+ prev = local64_xchg(&event->hw.prev_count, now);
+ local64_add(now - prev, &event->count);
+}
+
+static void cpu_clock_event_start(struct perf_event *event, int flags)
+{
+ local64_set(&event->hw.prev_count, local_clock());
+ perf_swevent_start_hrtimer(event);
+}
+
+static void cpu_clock_event_stop(struct perf_event *event, int flags)
+{
+ perf_swevent_cancel_hrtimer(event);
+ cpu_clock_event_update(event);
+}
+
+static int cpu_clock_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ cpu_clock_event_start(event, flags);
+
+ return 0;
+}
+
+static void cpu_clock_event_del(struct perf_event *event, int flags)
+{
+ cpu_clock_event_stop(event, flags);
+}
+
+static void cpu_clock_event_read(struct perf_event *event)
+{
+ cpu_clock_event_update(event);
+}
+
+static int cpu_clock_event_init(struct perf_event *event)
+{
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ perf_swevent_init_hrtimer(event);
+
+ return 0;
+}
+
+static struct pmu perf_cpu_clock = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = cpu_clock_event_init,
+ .add = cpu_clock_event_add,
+ .del = cpu_clock_event_del,
+ .start = cpu_clock_event_start,
+ .stop = cpu_clock_event_stop,
+ .read = cpu_clock_event_read,
+
+ .event_idx = perf_swevent_event_idx,
+};
+
+/*
+ * Software event: task time clock
+ */
+
+static void task_clock_event_update(struct perf_event *event, u64 now)
+{
+ u64 prev;
+ s64 delta;
+
+ prev = local64_xchg(&event->hw.prev_count, now);
+ delta = now - prev;
+ local64_add(delta, &event->count);
+}
+
+static void task_clock_event_start(struct perf_event *event, int flags)
+{
+ local64_set(&event->hw.prev_count, event->ctx->time);
+ perf_swevent_start_hrtimer(event);
+}
+
+static void task_clock_event_stop(struct perf_event *event, int flags)
+{
+ perf_swevent_cancel_hrtimer(event);
+ task_clock_event_update(event, event->ctx->time);
+}
+
+static int task_clock_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ task_clock_event_start(event, flags);
+
+ return 0;
+}
+
+static void task_clock_event_del(struct perf_event *event, int flags)
+{
+ task_clock_event_stop(event, PERF_EF_UPDATE);
+}
+
+static void task_clock_event_read(struct perf_event *event)
+{
+ u64 now = perf_clock();
+ u64 delta = now - event->ctx->timestamp;
+ u64 time = event->ctx->time + delta;
+
+ task_clock_event_update(event, time);
+}
+
+static int task_clock_event_init(struct perf_event *event)
+{
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ perf_swevent_init_hrtimer(event);
+
+ return 0;
+}
+
+static struct pmu perf_task_clock = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = task_clock_event_init,
+ .add = task_clock_event_add,
+ .del = task_clock_event_del,
+ .start = task_clock_event_start,
+ .stop = task_clock_event_stop,
+ .read = task_clock_event_read,
+
+ .event_idx = perf_swevent_event_idx,
+};
+
+static void perf_pmu_nop_void(struct pmu *pmu)
+{
+}
+
+static int perf_pmu_nop_int(struct pmu *pmu)
+{
+ return 0;
+}
+
+static void perf_pmu_start_txn(struct pmu *pmu)
+{
+ perf_pmu_disable(pmu);
+}
+
+static int perf_pmu_commit_txn(struct pmu *pmu)
+{
+ perf_pmu_enable(pmu);
+ return 0;
+}
+
+static void perf_pmu_cancel_txn(struct pmu *pmu)
+{
+ perf_pmu_enable(pmu);
+}
+
+static int perf_event_idx_default(struct perf_event *event)
+{
+ return event->hw.idx + 1;
+}
+
+/*
+ * Ensures all contexts with the same task_ctx_nr have the same
+ * pmu_cpu_context too.
+ */
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
+{
+ struct pmu *pmu;
+
+ if (ctxn < 0)
+ return NULL;
+
+ list_for_each_entry(pmu, &pmus, entry) {
+ if (pmu->task_ctx_nr == ctxn)
+ return pmu->pmu_cpu_context;
+ }
+
+ return NULL;
+}
+
+static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+
+ if (cpuctx->unique_pmu == old_pmu)
+ cpuctx->unique_pmu = pmu;
+ }
+}
+
+static void free_pmu_context(struct pmu *pmu)
+{
+ struct pmu *i;
+
+ mutex_lock(&pmus_lock);
+ /*
+ * Like a real lame refcount.
+ */
+ list_for_each_entry(i, &pmus, entry) {
+ if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
+ update_pmu_context(i, pmu);
+ goto out;
+ }
+ }
+
+ free_percpu(pmu->pmu_cpu_context);
+out:
+ mutex_unlock(&pmus_lock);
+}
+static struct idr pmu_idr;
+
+static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *page)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+}
+static DEVICE_ATTR_RO(type);
+
+static ssize_t
+perf_event_mux_interval_ms_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+}
+
+static ssize_t
+perf_event_mux_interval_ms_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ int timer, cpu, ret;
+
+ ret = kstrtoint(buf, 0, &timer);
+ if (ret)
+ return ret;
+
+ if (timer < 1)
+ return -EINVAL;
+
+ /* same value, noting to do */
+ if (timer == pmu->hrtimer_interval_ms)
+ return count;
+
+ pmu->hrtimer_interval_ms = timer;
+
+ /* update all cpuctx for this PMU */
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+ if (hrtimer_active(&cpuctx->hrtimer))
+ hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+ }
+
+ return count;
+}
+static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+
+static struct attribute *pmu_dev_attrs[] = {
+ &dev_attr_type.attr,
+ &dev_attr_perf_event_mux_interval_ms.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(pmu_dev);
+
+static int pmu_bus_running;
+static struct bus_type pmu_bus = {
+ .name = "event_source",
+ .dev_groups = pmu_dev_groups,
+};
+
+static void pmu_dev_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+static int pmu_dev_alloc(struct pmu *pmu)
+{
+ int ret = -ENOMEM;
+
+ pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
+ if (!pmu->dev)
+ goto out;
+
+ pmu->dev->groups = pmu->attr_groups;
+ device_initialize(pmu->dev);
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
+
+ dev_set_drvdata(pmu->dev, pmu);
+ pmu->dev->bus = &pmu_bus;
+ pmu->dev->release = pmu_dev_release;
+ ret = device_add(pmu->dev);
+ if (ret)
+ goto free_dev;
+
+out:
+ return ret;
+
+free_dev:
+ put_device(pmu->dev);
+ goto out;
+}
+
+static struct lock_class_key cpuctx_mutex;
+static struct lock_class_key cpuctx_lock;
+
+int perf_pmu_register(struct pmu *pmu, const char *name, int type)
+{
+ int cpu, ret;
+
+ mutex_lock(&pmus_lock);
+ ret = -ENOMEM;
+ pmu->pmu_disable_count = alloc_percpu(int);
+ if (!pmu->pmu_disable_count)
+ goto unlock;
+
+ pmu->type = -1;
+ if (!name)
+ goto skip_type;
+ pmu->name = name;
+
+ if (type < 0) {
+ type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
+ if (type < 0) {
+ ret = type;
+ goto free_pdc;
+ }
+ }
+ pmu->type = type;
+
+ if (pmu_bus_running) {
+ ret = pmu_dev_alloc(pmu);
+ if (ret)
+ goto free_idr;
+ }
+
+skip_type:
+ pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
+ if (pmu->pmu_cpu_context)
+ goto got_cpu_context;
+
+ ret = -ENOMEM;
+ pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
+ if (!pmu->pmu_cpu_context)
+ goto free_dev;
+
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ __perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+ lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
+ cpuctx->ctx.type = cpu_context;
+ cpuctx->ctx.pmu = pmu;
+
+ __perf_cpu_hrtimer_init(cpuctx, cpu);
+
+ INIT_LIST_HEAD(&cpuctx->rotation_list);
+ cpuctx->unique_pmu = pmu;
+ }
+
+got_cpu_context:
+ if (!pmu->start_txn) {
+ if (pmu->pmu_enable) {
+ /*
+ * If we have pmu_enable/pmu_disable calls, install
+ * transaction stubs that use that to try and batch
+ * hardware accesses.
+ */
+ pmu->start_txn = perf_pmu_start_txn;
+ pmu->commit_txn = perf_pmu_commit_txn;
+ pmu->cancel_txn = perf_pmu_cancel_txn;
+ } else {
+ pmu->start_txn = perf_pmu_nop_void;
+ pmu->commit_txn = perf_pmu_nop_int;
+ pmu->cancel_txn = perf_pmu_nop_void;
+ }
+ }
+
+ if (!pmu->pmu_enable) {
+ pmu->pmu_enable = perf_pmu_nop_void;
+ pmu->pmu_disable = perf_pmu_nop_void;
+ }
+
+ if (!pmu->event_idx)
+ pmu->event_idx = perf_event_idx_default;
+
+ list_add_rcu(&pmu->entry, &pmus);
+ ret = 0;
+unlock:
+ mutex_unlock(&pmus_lock);
+
+ return ret;
+
+free_dev:
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+
+free_idr:
+ if (pmu->type >= PERF_TYPE_MAX)
+ idr_remove(&pmu_idr, pmu->type);
+
+free_pdc:
+ free_percpu(pmu->pmu_disable_count);
+ goto unlock;
+}
+EXPORT_SYMBOL_GPL(perf_pmu_register);
+
+void perf_pmu_unregister(struct pmu *pmu)
+{
+ mutex_lock(&pmus_lock);
+ list_del_rcu(&pmu->entry);
+ mutex_unlock(&pmus_lock);
+
+ /*
+ * We dereference the pmu list under both SRCU and regular RCU, so
+ * synchronize against both of those.
+ */
+ synchronize_srcu(&pmus_srcu);
+ synchronize_rcu();
+
+ free_percpu(pmu->pmu_disable_count);
+ if (pmu->type >= PERF_TYPE_MAX)
+ idr_remove(&pmu_idr, pmu->type);
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ free_pmu_context(pmu);
+}
+EXPORT_SYMBOL_GPL(perf_pmu_unregister);
+
+struct pmu *perf_init_event(struct perf_event *event)
+{
+ struct pmu *pmu = NULL;
+ int idx;
+ int ret;
+
+ idx = srcu_read_lock(&pmus_srcu);
+
+ rcu_read_lock();
+ pmu = idr_find(&pmu_idr, event->attr.type);
+ rcu_read_unlock();
+ if (pmu) {
+ if (!try_module_get(pmu->module)) {
+ pmu = ERR_PTR(-ENODEV);
+ goto unlock;
+ }
+ event->pmu = pmu;
+ ret = pmu->event_init(event);
+ if (ret)
+ pmu = ERR_PTR(ret);
+ goto unlock;
+ }
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (!try_module_get(pmu->module)) {
+ pmu = ERR_PTR(-ENODEV);
+ goto unlock;
+ }
+ event->pmu = pmu;
+ ret = pmu->event_init(event);
+ if (!ret)
+ goto unlock;
+
+ if (ret != -ENOENT) {
+ pmu = ERR_PTR(ret);
+ goto unlock;
+ }
+ }
+ pmu = ERR_PTR(-ENOENT);
+unlock:
+ srcu_read_unlock(&pmus_srcu, idx);
+
+ return pmu;
+}
+
+static void account_event_cpu(struct perf_event *event, int cpu)
+{
+ if (event->parent)
+ return;
+
+ if (has_branch_stack(event)) {
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
+ }
+ if (is_cgroup_event(event))
+ atomic_inc(&per_cpu(perf_cgroup_events, cpu));
+}
+
+static void account_event(struct perf_event *event)
+{
+ if (event->parent)
+ return;
+
+ if (event->attach_state & PERF_ATTACH_TASK)
+ static_key_slow_inc(&perf_sched_events.key);
+ if (event->attr.mmap || event->attr.mmap_data)
+ atomic_inc(&nr_mmap_events);
+ if (event->attr.comm)
+ atomic_inc(&nr_comm_events);
+ if (event->attr.task)
+ atomic_inc(&nr_task_events);
+ if (event->attr.freq) {
+ if (atomic_inc_return(&nr_freq_events) == 1)
+ tick_nohz_full_kick_all();
+ }
+ if (has_branch_stack(event))
+ static_key_slow_inc(&perf_sched_events.key);
+ if (is_cgroup_event(event))
+ static_key_slow_inc(&perf_sched_events.key);
+
+ account_event_cpu(event, event->cpu);
+}
+
+/*
+ * Allocate and initialize a event structure
+ */
+static struct perf_event *
+perf_event_alloc(struct perf_event_attr *attr, int cpu,
+ struct task_struct *task,
+ struct perf_event *group_leader,
+ struct perf_event *parent_event,
+ perf_overflow_handler_t overflow_handler,
+ void *context)
+{
+ struct pmu *pmu;
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ long err = -EINVAL;
+
+ if ((unsigned)cpu >= nr_cpu_ids) {
+ if (!task || cpu != -1)
+ return ERR_PTR(-EINVAL);
+ }
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Single events are their own group leaders, with an
+ * empty sibling list:
+ */
+ if (!group_leader)
+ group_leader = event;
+
+ mutex_init(&event->child_mutex);
+ INIT_LIST_HEAD(&event->child_list);
+
+ INIT_LIST_HEAD(&event->group_entry);
+ INIT_LIST_HEAD(&event->event_entry);
+ INIT_LIST_HEAD(&event->sibling_list);
+ INIT_LIST_HEAD(&event->rb_entry);
+ INIT_LIST_HEAD(&event->active_entry);
+ INIT_HLIST_NODE(&event->hlist_entry);
+
+
+ init_waitqueue_head(&event->waitq);
+ init_irq_work(&event->pending, perf_pending_event);
+
+ mutex_init(&event->mmap_mutex);
+
+ atomic_long_set(&event->refcount, 1);
+ event->cpu = cpu;
+ event->attr = *attr;
+ event->group_leader = group_leader;
+ event->pmu = NULL;
+ event->oncpu = -1;
+
+ event->parent = parent_event;
+
+ event->ns = get_pid_ns(task_active_pid_ns(current));
+ event->id = atomic64_inc_return(&perf_event_id);
+
+ event->state = PERF_EVENT_STATE_INACTIVE;
+
+ if (task) {
+ event->attach_state = PERF_ATTACH_TASK;
+
+ if (attr->type == PERF_TYPE_TRACEPOINT)
+ event->hw.tp_target = task;
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+ /*
+ * hw_breakpoint is a bit difficult here..
+ */
+ else if (attr->type == PERF_TYPE_BREAKPOINT)
+ event->hw.bp_target = task;
+#endif
+ }
+
+ if (!overflow_handler && parent_event) {
+ overflow_handler = parent_event->overflow_handler;
+ context = parent_event->overflow_handler_context;
+ }
+
+ event->overflow_handler = overflow_handler;
+ event->overflow_handler_context = context;
+
+ perf_event__state_init(event);
+
+ pmu = NULL;
+
+ hwc = &event->hw;
+ hwc->sample_period = attr->sample_period;
+ if (attr->freq && attr->sample_freq)
+ hwc->sample_period = 1;
+ hwc->last_period = hwc->sample_period;
+
+ local64_set(&hwc->period_left, hwc->sample_period);
+
+ /*
+ * we currently do not support PERF_FORMAT_GROUP on inherited events
+ */
+ if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
+ goto err_ns;
+
+ pmu = perf_init_event(event);
+ if (!pmu)
+ goto err_ns;
+ else if (IS_ERR(pmu)) {
+ err = PTR_ERR(pmu);
+ goto err_ns;
+ }
+
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
+ err = get_callchain_buffers();
+ if (err)
+ goto err_pmu;
+ }
+ }
+
+ return event;
+
+err_pmu:
+ if (event->destroy)
+ event->destroy(event);
+ module_put(pmu->module);
+err_ns:
+ if (event->ns)
+ put_pid_ns(event->ns);
+ kfree(event);
+
+ return ERR_PTR(err);
+}
+
+static int perf_copy_attr(struct perf_event_attr __user *uattr,
+ struct perf_event_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /*
+ * zero the full structure, so that a short copy will be nice.
+ */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE) /* silly large */
+ goto err_size;
+
+ if (!size) /* abi compat */
+ size = PERF_ATTR_SIZE_VER0;
+
+ if (size < PERF_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we dont know about yet.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(*attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ size = sizeof(*attr);
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+ if (attr->__reserved_1)
+ return -EINVAL;
+
+ if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
+ return -EINVAL;
+
+ if (attr->read_format & ~(PERF_FORMAT_MAX-1))
+ return -EINVAL;
+
+ if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ u64 mask = attr->branch_sample_type;
+
+ /* only using defined bits */
+ if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
+ return -EINVAL;
+
+ /* at least one branch bit must be set */
+ if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
+ return -EINVAL;
+
+ /* propagate priv level, when not set for branch */
+ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
+
+ /* exclude_kernel checked on syscall entry */
+ if (!attr->exclude_kernel)
+ mask |= PERF_SAMPLE_BRANCH_KERNEL;
+
+ if (!attr->exclude_user)
+ mask |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!attr->exclude_hv)
+ mask |= PERF_SAMPLE_BRANCH_HV;
+ /*
+ * adjust user setting (for HW filter setup)
+ */
+ attr->branch_sample_type = mask;
+ }
+ /* privileged levels capture (kernel, hv): check permissions */
+ if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+ && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
+ if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
+ ret = perf_reg_validate(attr->sample_regs_user);
+ if (ret)
+ return ret;
+ }
+
+ if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
+ if (!arch_perf_have_user_stack_dump())
+ return -ENOSYS;
+
+ /*
+ * We have __u32 type for the size, but so far
+ * we can only use __u16 as maximum due to the
+ * __u16 sample size limit.
+ */
+ if (attr->sample_stack_user >= USHRT_MAX)
+ ret = -EINVAL;
+ else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
+ ret = -EINVAL;
+ }
+
+out:
+ return ret;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ ret = -E2BIG;
+ goto out;
+}
+
+static int
+perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
+{
+ struct ring_buffer *rb = NULL;
+ int ret = -EINVAL;
+
+ if (!output_event)
+ goto set;
+
+ /* don't allow circular references */
+ if (event == output_event)
+ goto out;
+
+ /*
+ * Don't allow cross-cpu buffers
+ */
+ if (output_event->cpu != event->cpu)
+ goto out;
+
+ /*
+ * If its not a per-cpu rb, it must be the same task.
+ */
+ if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+ goto out;
+
+set:
+ mutex_lock(&event->mmap_mutex);
+ /* Can't redirect output if we've got an active mmap() */
+ if (atomic_read(&event->mmap_count))
+ goto unlock;
+
+ if (output_event) {
+ /* get the rb we want to redirect to */
+ rb = ring_buffer_get(output_event);
+ if (!rb)
+ goto unlock;
+ }
+
+ ring_buffer_attach(event, rb);
+
+ ret = 0;
+unlock:
+ mutex_unlock(&event->mmap_mutex);
+
+out:
+ return ret;
+}
+
+/**
+ * sys_perf_event_open - open a performance event, associate it to a task/cpu
+ *
+ * @attr_uptr: event_id type attributes for monitoring/sampling
+ * @pid: target pid
+ * @cpu: target cpu
+ * @group_fd: group leader event fd
+ */
+SYSCALL_DEFINE5(perf_event_open,
+ struct perf_event_attr __user *, attr_uptr,
+ pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
+{
+ struct perf_event *group_leader = NULL, *output_event = NULL;
+ struct perf_event *event, *sibling;
+ struct perf_event_attr attr;
+ struct perf_event_context *ctx;
+ struct file *event_file = NULL;
+ struct fd group = {NULL, 0};
+ struct task_struct *task = NULL;
+ struct pmu *pmu;
+ int event_fd;
+ int move_group = 0;
+ int err;
+ int f_flags = O_RDWR;
+
+ /* for future expandability... */
+ if (flags & ~PERF_FLAG_ALL)
+ return -EINVAL;
+
+ err = perf_copy_attr(attr_uptr, &attr);
+ if (err)
+ return err;
+
+ if (!attr.exclude_kernel) {
+ if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
+ if (attr.freq) {
+ if (attr.sample_freq > sysctl_perf_event_sample_rate)
+ return -EINVAL;
+ } else {
+ if (attr.sample_period & (1ULL << 63))
+ return -EINVAL;
+ }
+
+ /*
+ * In cgroup mode, the pid argument is used to pass the fd
+ * opened to the cgroup directory in cgroupfs. The cpu argument
+ * designates the cpu on which to monitor threads from that
+ * cgroup.
+ */
+ if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
+ return -EINVAL;
+
+ if (flags & PERF_FLAG_FD_CLOEXEC)
+ f_flags |= O_CLOEXEC;
+
+ event_fd = get_unused_fd_flags(f_flags);
+ if (event_fd < 0)
+ return event_fd;
+
+ if (group_fd != -1) {
+ err = perf_fget_light(group_fd, &group);
+ if (err)
+ goto err_fd;
+ group_leader = group.file->private_data;
+ if (flags & PERF_FLAG_FD_OUTPUT)
+ output_event = group_leader;
+ if (flags & PERF_FLAG_FD_NO_GROUP)
+ group_leader = NULL;
+ }
+
+ if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
+ task = find_lively_task_by_vpid(pid);
+ if (IS_ERR(task)) {
+ err = PTR_ERR(task);
+ goto err_group_fd;
+ }
+ }
+
+ if (task && group_leader &&
+ group_leader->attr.inherit != attr.inherit) {
+ err = -EINVAL;
+ goto err_task;
+ }
+
+ get_online_cpus();
+
+ event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
+ NULL, NULL);
+ if (IS_ERR(event)) {
+ err = PTR_ERR(event);
+ goto err_cpus;
+ }
+
+ if (flags & PERF_FLAG_PID_CGROUP) {
+ err = perf_cgroup_connect(pid, event, &attr, group_leader);
+ if (err) {
+ __free_event(event);
+ goto err_cpus;
+ }
+ }
+
+ if (is_sampling_event(event)) {
+ if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
+ err = -ENOTSUPP;
+ goto err_alloc;
+ }
+ }
+
+ account_event(event);
+
+ /*
+ * Special case software events and allow them to be part of
+ * any hardware group.
+ */
+ pmu = event->pmu;
+
+ if (group_leader &&
+ (is_software_event(event) != is_software_event(group_leader))) {
+ if (is_software_event(event)) {
+ /*
+ * If event and group_leader are not both a software
+ * event, and event is, then group leader is not.
+ *
+ * Allow the addition of software events to !software
+ * groups, this is safe because software events never
+ * fail to schedule.
+ */
+ pmu = group_leader->pmu;
+ } else if (is_software_event(group_leader) &&
+ (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+ /*
+ * In case the group is a pure software group, and we
+ * try to add a hardware event, move the whole group to
+ * the hardware context.
+ */
+ move_group = 1;
+ }
+ }
+
+ /*
+ * Get the target context (task or percpu):
+ */
+ ctx = find_get_context(pmu, task, event->cpu);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto err_alloc;
+ }
+
+ if (task) {
+ put_task_struct(task);
+ task = NULL;
+ }
+
+ /*
+ * Look up the group leader (we will attach this event to it):
+ */
+ if (group_leader) {
+ err = -EINVAL;
+
+ /*
+ * Do not allow a recursive hierarchy (this new sibling
+ * becoming part of another group-sibling):
+ */
+ if (group_leader->group_leader != group_leader)
+ goto err_context;
+ /*
+ * Do not allow to attach to a group in a different
+ * task or CPU context:
+ */
+ if (move_group) {
+ if (group_leader->ctx->type != ctx->type)
+ goto err_context;
+ } else {
+ if (group_leader->ctx != ctx)
+ goto err_context;
+ }
+
+ /*
+ * Only a group leader can be exclusive or pinned
+ */
+ if (attr.exclusive || attr.pinned)
+ goto err_context;
+ }
+
+ if (output_event) {
+ err = perf_event_set_output(event, output_event);
+ if (err)
+ goto err_context;
+ }
+
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
+ f_flags);
+ if (IS_ERR(event_file)) {
+ err = PTR_ERR(event_file);
+ goto err_context;
+ }
+
+ if (move_group) {
+ struct perf_event_context *gctx = group_leader->ctx;
+
+ mutex_lock(&gctx->mutex);
+ perf_remove_from_context(group_leader, false);
+
+ /*
+ * Removing from the context ends up with disabled
+ * event. What we want here is event in the initial
+ * startup state, ready to be add into new context.
+ */
+ perf_event__state_init(group_leader);
+ list_for_each_entry(sibling, &group_leader->sibling_list,
+ group_entry) {
+ perf_remove_from_context(sibling, false);
+ perf_event__state_init(sibling);
+ put_ctx(gctx);
+ }
+ mutex_unlock(&gctx->mutex);
+ put_ctx(gctx);
+ }
+
+ WARN_ON_ONCE(ctx->parent_ctx);
+ mutex_lock(&ctx->mutex);
+
+ if (move_group) {
+ synchronize_rcu();
+ perf_install_in_context(ctx, group_leader, event->cpu);
+ get_ctx(ctx);
+ list_for_each_entry(sibling, &group_leader->sibling_list,
+ group_entry) {
+ perf_install_in_context(ctx, sibling, event->cpu);
+ get_ctx(ctx);
+ }
+ }
+
+ perf_install_in_context(ctx, event, event->cpu);
+ perf_unpin_context(ctx);
+ mutex_unlock(&ctx->mutex);
+
+ put_online_cpus();
+
+ event->owner = current;
+
+ mutex_lock(&current->perf_event_mutex);
+ list_add_tail(&event->owner_entry, &current->perf_event_list);
+ mutex_unlock(&current->perf_event_mutex);
+
+ /*
+ * Precalculate sample_data sizes
+ */
+ perf_event__header_size(event);
+ perf_event__id_header_size(event);
+
+ /*
+ * Drop the reference on the group_event after placing the
+ * new event on the sibling_list. This ensures destruction
+ * of the group leader will find the pointer to itself in
+ * perf_group_detach().
+ */
+ fdput(group);
+ fd_install(event_fd, event_file);
+ return event_fd;
+
+err_context:
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
+err_alloc:
+ free_event(event);
+err_cpus:
+ put_online_cpus();
+err_task:
+ if (task)
+ put_task_struct(task);
+err_group_fd:
+ fdput(group);
+err_fd:
+ put_unused_fd(event_fd);
+ return err;
+}
+
+/**
+ * perf_event_create_kernel_counter
+ *
+ * @attr: attributes of the counter to create
+ * @cpu: cpu in which the counter is bound
+ * @task: task to profile (NULL for percpu)
+ */
+struct perf_event *
+perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
+ struct task_struct *task,
+ perf_overflow_handler_t overflow_handler,
+ void *context)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *event;
+ int err;
+
+ /*
+ * Get the target context (task or percpu):
+ */
+
+ event = perf_event_alloc(attr, cpu, task, NULL, NULL,
+ overflow_handler, context);
+ if (IS_ERR(event)) {
+ err = PTR_ERR(event);
+ goto err;
+ }
+
+ account_event(event);
+
+ ctx = find_get_context(event->pmu, task, cpu);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto err_free;
+ }
+
+ WARN_ON_ONCE(ctx->parent_ctx);
+ mutex_lock(&ctx->mutex);
+ perf_install_in_context(ctx, event, cpu);
+ perf_unpin_context(ctx);
+ mutex_unlock(&ctx->mutex);
+
+ return event;
+
+err_free:
+ free_event(event);
+err:
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
+
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+ struct perf_event_context *src_ctx;
+ struct perf_event_context *dst_ctx;
+ struct perf_event *event, *tmp;
+ LIST_HEAD(events);
+
+ src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
+ dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+
+ mutex_lock(&src_ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
+ event_entry) {
+ perf_remove_from_context(event, false);
+ unaccount_event_cpu(event, src_cpu);
+ put_ctx(src_ctx);
+ list_add(&event->migrate_entry, &events);
+ }
+ mutex_unlock(&src_ctx->mutex);
+
+ synchronize_rcu();
+
+ mutex_lock(&dst_ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_del(&event->migrate_entry);
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ account_event_cpu(event, dst_cpu);
+ perf_install_in_context(dst_ctx, event, dst_cpu);
+ get_ctx(dst_ctx);
+ }
+ mutex_unlock(&dst_ctx->mutex);
+}
+EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
+
+static void sync_child_event(struct perf_event *child_event,
+ struct task_struct *child)
+{
+ struct perf_event *parent_event = child_event->parent;
+ u64 child_val;
+
+ if (child_event->attr.inherit_stat)
+ perf_event_read_event(child_event, child);
+
+ child_val = perf_event_count(child_event);
+
+ /*
+ * Add back the child's count to the parent's count:
+ */
+ atomic64_add(child_val, &parent_event->child_count);
+ atomic64_add(child_event->total_time_enabled,
+ &parent_event->child_total_time_enabled);
+ atomic64_add(child_event->total_time_running,
+ &parent_event->child_total_time_running);
+
+ /*
+ * Remove this event from the parent's list
+ */
+ WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+ mutex_lock(&parent_event->child_mutex);
+ list_del_init(&child_event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ /*
+ * Release the parent event, if this was the last
+ * reference to it.
+ */
+ put_event(parent_event);
+}
+
+static void
+__perf_event_exit_task(struct perf_event *child_event,
+ struct perf_event_context *child_ctx,
+ struct task_struct *child)
+{
+ /*
+ * Do not destroy the 'original' grouping; because of the context
+ * switch optimization the original events could've ended up in a
+ * random child task.
+ *
+ * If we were to destroy the original group, all group related
+ * operations would cease to function properly after this random
+ * child dies.
+ *
+ * Do destroy all inherited groups, we don't care about those
+ * and being thorough is better.
+ */
+ perf_remove_from_context(child_event, !!child_event->parent);
+
+ /*
+ * It can happen that the parent exits first, and has events
+ * that are still around due to the child reference. These
+ * events need to be zapped.
+ */
+ if (child_event->parent) {
+ sync_child_event(child_event, child);
+ free_event(child_event);
+ }
+}
+
+static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
+{
+ struct perf_event *child_event, *next;
+ struct perf_event_context *child_ctx, *parent_ctx;
+ unsigned long flags;
+
+ if (likely(!child->perf_event_ctxp[ctxn])) {
+ perf_event_task(child, NULL, 0);
+ return;
+ }
+
+ local_irq_save(flags);
+ /*
+ * We can't reschedule here because interrupts are disabled,
+ * and either child is current or it is a task that can't be
+ * scheduled, so we are now safe from rescheduling changing
+ * our context.
+ */
+ child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
+
+ /*
+ * Take the context lock here so that if find_get_context is
+ * reading child->perf_event_ctxp, we wait until it has
+ * incremented the context's refcount before we do put_ctx below.
+ */
+ raw_spin_lock(&child_ctx->lock);
+ task_ctx_sched_out(child_ctx);
+ child->perf_event_ctxp[ctxn] = NULL;
+
+ /*
+ * In order to avoid freeing: child_ctx->parent_ctx->task
+ * under perf_event_context::lock, grab another reference.
+ */
+ parent_ctx = child_ctx->parent_ctx;
+ if (parent_ctx)
+ get_ctx(parent_ctx);
+
+ /*
+ * If this context is a clone; unclone it so it can't get
+ * swapped to another process while we're removing all
+ * the events from it.
+ */
+ unclone_ctx(child_ctx);
+ update_context_time(child_ctx);
+ raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+ /*
+ * Now that we no longer hold perf_event_context::lock, drop
+ * our extra child_ctx->parent_ctx reference.
+ */
+ if (parent_ctx)
+ put_ctx(parent_ctx);
+
+ /*
+ * Report the task dead after unscheduling the events so that we
+ * won't get any samples after PERF_RECORD_EXIT. We can however still
+ * get a few PERF_RECORD_READ events.
+ */
+ perf_event_task(child, child_ctx, 0);
+
+ /*
+ * We can recurse on the same lock type through:
+ *
+ * __perf_event_exit_task()
+ * sync_child_event()
+ * put_event()
+ * mutex_lock(&ctx->mutex)
+ *
+ * But since its the parent context it won't be the same instance.
+ */
+ mutex_lock(&child_ctx->mutex);
+
+ list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
+ __perf_event_exit_task(child_event, child_ctx, child);
+
+ mutex_unlock(&child_ctx->mutex);
+
+ put_ctx(child_ctx);
+}
+
+/*
+ * When a child task exits, feed back event values to parent events.
+ */
+void perf_event_exit_task(struct task_struct *child)
+{
+ struct perf_event *event, *tmp;
+ int ctxn;
+
+ mutex_lock(&child->perf_event_mutex);
+ list_for_each_entry_safe(event, tmp, &child->perf_event_list,
+ owner_entry) {
+ list_del_init(&event->owner_entry);
+
+ /*
+ * Ensure the list deletion is visible before we clear
+ * the owner, closes a race against perf_release() where
+ * we need to serialize on the owner->perf_event_mutex.
+ */
+ smp_wmb();
+ event->owner = NULL;
+ }
+ mutex_unlock(&child->perf_event_mutex);
+
+ for_each_task_context_nr(ctxn)
+ perf_event_exit_task_context(child, ctxn);
+}
+
+static void perf_free_event(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *parent = event->parent;
+
+ if (WARN_ON_ONCE(!parent))
+ return;
+
+ mutex_lock(&parent->child_mutex);
+ list_del_init(&event->child_list);
+ mutex_unlock(&parent->child_mutex);
+
+ put_event(parent);
+
+ perf_group_detach(event);
+ list_del_event(event, ctx);
+ free_event(event);
+}
+
+/*
+ * free an unexposed, unused context as created by inheritance by
+ * perf_event_init_task below, used by fork() in case of fail.
+ */
+void perf_event_free_task(struct task_struct *task)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *event, *tmp;
+ int ctxn;
+
+ for_each_task_context_nr(ctxn) {
+ ctx = task->perf_event_ctxp[ctxn];
+ if (!ctx)
+ continue;
+
+ mutex_lock(&ctx->mutex);
+again:
+ list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
+ group_entry)
+ perf_free_event(event, ctx);
+
+ list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
+ group_entry)
+ perf_free_event(event, ctx);
+
+ if (!list_empty(&ctx->pinned_groups) ||
+ !list_empty(&ctx->flexible_groups))
+ goto again;
+
+ mutex_unlock(&ctx->mutex);
+
+ put_ctx(ctx);
+ }
+}
+
+void perf_event_delayed_put(struct task_struct *task)
+{
+ int ctxn;
+
+ for_each_task_context_nr(ctxn)
+ WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+}
+
+/*
+ * inherit a event from parent task to child task:
+ */
+static struct perf_event *
+inherit_event(struct perf_event *parent_event,
+ struct task_struct *parent,
+ struct perf_event_context *parent_ctx,
+ struct task_struct *child,
+ struct perf_event *group_leader,
+ struct perf_event_context *child_ctx)
+{
+ struct perf_event *child_event;
+ unsigned long flags;
+
+ /*
+ * Instead of creating recursive hierarchies of events,
+ * we link inherited events back to the original parent,
+ * which has a filp for sure, which we use as the reference
+ * count:
+ */
+ if (parent_event->parent)
+ parent_event = parent_event->parent;
+
+ child_event = perf_event_alloc(&parent_event->attr,
+ parent_event->cpu,
+ child,
+ group_leader, parent_event,
+ NULL, NULL);
+ if (IS_ERR(child_event))
+ return child_event;
+
+ if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ free_event(child_event);
+ return NULL;
+ }
+
+ get_ctx(child_ctx);
+
+ /*
+ * Make the child state follow the state of the parent event,
+ * not its attr.disabled bit. We hold the parent's mutex,
+ * so we won't race with perf_event_{en, dis}able_family.
+ */
+ if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
+ child_event->state = PERF_EVENT_STATE_INACTIVE;
+ else
+ child_event->state = PERF_EVENT_STATE_OFF;
+
+ if (parent_event->attr.freq) {
+ u64 sample_period = parent_event->hw.sample_period;
+ struct hw_perf_event *hwc = &child_event->hw;
+
+ hwc->sample_period = sample_period;
+ hwc->last_period = sample_period;
+
+ local64_set(&hwc->period_left, sample_period);
+ }
+
+ child_event->ctx = child_ctx;
+ child_event->overflow_handler = parent_event->overflow_handler;
+ child_event->overflow_handler_context
+ = parent_event->overflow_handler_context;
+
+ /*
+ * Precalculate sample_data sizes
+ */
+ perf_event__header_size(child_event);
+ perf_event__id_header_size(child_event);
+
+ /*
+ * Link it up in the child's context:
+ */
+ raw_spin_lock_irqsave(&child_ctx->lock, flags);
+ add_event_to_ctx(child_event, child_ctx);
+ raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+ /*
+ * Link this into the parent event's child list
+ */
+ WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+ mutex_lock(&parent_event->child_mutex);
+ list_add_tail(&child_event->child_list, &parent_event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ return child_event;
+}
+
+static int inherit_group(struct perf_event *parent_event,
+ struct task_struct *parent,
+ struct perf_event_context *parent_ctx,
+ struct task_struct *child,
+ struct perf_event_context *child_ctx)
+{
+ struct perf_event *leader;
+ struct perf_event *sub;
+ struct perf_event *child_ctr;
+
+ leader = inherit_event(parent_event, parent, parent_ctx,
+ child, NULL, child_ctx);
+ if (IS_ERR(leader))
+ return PTR_ERR(leader);
+ list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
+ child_ctr = inherit_event(sub, parent, parent_ctx,
+ child, leader, child_ctx);
+ if (IS_ERR(child_ctr))
+ return PTR_ERR(child_ctr);
+ }
+ return 0;
+}
+
+static int
+inherit_task_group(struct perf_event *event, struct task_struct *parent,
+ struct perf_event_context *parent_ctx,
+ struct task_struct *child, int ctxn,
+ int *inherited_all)
+{
+ int ret;
+ struct perf_event_context *child_ctx;
+
+ if (!event->attr.inherit) {
+ *inherited_all = 0;
+ return 0;
+ }
+
+ child_ctx = child->perf_event_ctxp[ctxn];
+ if (!child_ctx) {
+ /*
+ * This is executed from the parent task context, so
+ * inherit events that have been marked for cloning.
+ * First allocate and initialize a context for the
+ * child.
+ */
+
+ child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+ if (!child_ctx)
+ return -ENOMEM;
+
+ child->perf_event_ctxp[ctxn] = child_ctx;
+ }
+
+ ret = inherit_group(event, parent, parent_ctx,
+ child, child_ctx);
+
+ if (ret)
+ *inherited_all = 0;
+
+ return ret;
+}
+
+/*
+ * Initialize the perf_event context in task_struct
+ */
+int perf_event_init_context(struct task_struct *child, int ctxn)
+{
+ struct perf_event_context *child_ctx, *parent_ctx;
+ struct perf_event_context *cloned_ctx;
+ struct perf_event *event;
+ struct task_struct *parent = current;
+ int inherited_all = 1;
+ unsigned long flags;
+ int ret = 0;
+
+ if (likely(!parent->perf_event_ctxp[ctxn]))
+ return 0;
+
+ /*
+ * If the parent's context is a clone, pin it so it won't get
+ * swapped under us.
+ */
+ parent_ctx = perf_pin_task_context(parent, ctxn);
+ if (!parent_ctx)
+ return 0;
+
+ /*
+ * No need to check if parent_ctx != NULL here; since we saw
+ * it non-NULL earlier, the only reason for it to become NULL
+ * is if we exit, and since we're currently in the middle of
+ * a fork we can't be exiting at the same time.
+ */
+
+ /*
+ * Lock the parent list. No need to lock the child - not PID
+ * hashed yet and not running, so nobody can access it.
+ */
+ mutex_lock(&parent_ctx->mutex);
+
+ /*
+ * We dont have to disable NMIs - we are only looking at
+ * the list, not manipulating it:
+ */
+ list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+ ret = inherit_task_group(event, parent, parent_ctx,
+ child, ctxn, &inherited_all);
+ if (ret)
+ break;
+ }
+
+ /*
+ * We can't hold ctx->lock when iterating the ->flexible_group list due
+ * to allocations, but we need to prevent rotation because
+ * rotate_ctx() will change the list from interrupt context.
+ */
+ raw_spin_lock_irqsave(&parent_ctx->lock, flags);
+ parent_ctx->rotate_disable = 1;
+ raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+
+ list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
+ ret = inherit_task_group(event, parent, parent_ctx,
+ child, ctxn, &inherited_all);
+ if (ret)
+ break;
+ }
+
+ raw_spin_lock_irqsave(&parent_ctx->lock, flags);
+ parent_ctx->rotate_disable = 0;
+
+ child_ctx = child->perf_event_ctxp[ctxn];
+
+ if (child_ctx && inherited_all) {
+ /*
+ * Mark the child context as a clone of the parent
+ * context, or of whatever the parent is a clone of.
+ *
+ * Note that if the parent is a clone, the holding of
+ * parent_ctx->lock avoids it from being uncloned.
+ */
+ cloned_ctx = parent_ctx->parent_ctx;
+ if (cloned_ctx) {
+ child_ctx->parent_ctx = cloned_ctx;
+ child_ctx->parent_gen = parent_ctx->parent_gen;
+ } else {
+ child_ctx->parent_ctx = parent_ctx;
+ child_ctx->parent_gen = parent_ctx->generation;
+ }
+ get_ctx(child_ctx->parent_ctx);
+ }
+
+ raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+ mutex_unlock(&parent_ctx->mutex);
+
+ perf_unpin_context(parent_ctx);
+ put_ctx(parent_ctx);
+
+ return ret;
+}
+
+/*
+ * Initialize the perf_event context in task_struct
+ */
+int perf_event_init_task(struct task_struct *child)
+{
+ int ctxn, ret;
+
+ memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ mutex_init(&child->perf_event_mutex);
+ INIT_LIST_HEAD(&child->perf_event_list);
+
+ for_each_task_context_nr(ctxn) {
+ ret = perf_event_init_context(child, ctxn);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __init perf_event_init_all_cpus(void)
+{
+ struct swevent_htable *swhash;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ swhash = &per_cpu(swevent_htable, cpu);
+ mutex_init(&swhash->hlist_mutex);
+ INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
+ }
+}
+
+static void perf_event_init_cpu(int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+ mutex_lock(&swhash->hlist_mutex);
+ swhash->online = true;
+ if (swhash->hlist_refcount > 0) {
+ struct swevent_hlist *hlist;
+
+ hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
+ WARN_ON(!hlist);
+ rcu_assign_pointer(swhash->swevent_hlist, hlist);
+ }
+ mutex_unlock(&swhash->hlist_mutex);
+}
+
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
+static void perf_pmu_rotate_stop(struct pmu *pmu)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ WARN_ON(!irqs_disabled());
+
+ list_del_init(&cpuctx->rotation_list);
+}
+
+static void __perf_event_exit_context(void *__info)
+{
+ struct remove_event re = { .detach_group = false };
+ struct perf_event_context *ctx = __info;
+
+ perf_pmu_rotate_stop(ctx->pmu);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
+ __perf_remove_from_context(&re);
+ rcu_read_unlock();
+}
+
+static void perf_event_exit_cpu_context(int cpu)
+{
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+ int idx;
+
+ idx = srcu_read_lock(&pmus_srcu);
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+
+ mutex_lock(&ctx->mutex);
+ smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ mutex_unlock(&ctx->mutex);
+ }
+ srcu_read_unlock(&pmus_srcu, idx);
+}
+
+static void perf_event_exit_cpu(int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+ perf_event_exit_cpu_context(cpu);
+
+ mutex_lock(&swhash->hlist_mutex);
+ swhash->online = false;
+ swevent_hlist_release(swhash);
+ mutex_unlock(&swhash->hlist_mutex);
+}
+#else
+static inline void perf_event_exit_cpu(int cpu) { }
+#endif
+
+static int
+perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ perf_event_exit_cpu(cpu);
+
+ return NOTIFY_OK;
+}
+
+/*
+ * Run the perf reboot notifier at the very last possible moment so that
+ * the generic watchdog code runs as long as possible.
+ */
+static struct notifier_block perf_reboot_notifier = {
+ .notifier_call = perf_reboot,
+ .priority = INT_MIN,
+};
+
+static int
+perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+
+ case CPU_UP_PREPARE:
+ case CPU_DOWN_FAILED:
+ perf_event_init_cpu(cpu);
+ break;
+
+ case CPU_UP_CANCELED:
+ case CPU_DOWN_PREPARE:
+ perf_event_exit_cpu(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+void __init perf_event_init(void)
+{
+ int ret;
+
+ idr_init(&pmu_idr);
+
+ perf_event_init_all_cpus();
+ init_srcu_struct(&pmus_srcu);
+ perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
+ perf_pmu_register(&perf_cpu_clock, NULL, -1);
+ perf_pmu_register(&perf_task_clock, NULL, -1);
+ perf_tp_register();
+ perf_cpu_notifier(perf_cpu_notify);
+ register_reboot_notifier(&perf_reboot_notifier);
+
+ ret = init_hw_breakpoint();
+ WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
+
+ /* do not patch jump label more than once per second */
+ jump_label_rate_limit(&perf_sched_events, HZ);
+
+ /*
+ * Build time assertion that we keep the data_head at the intended
+ * location. IOW, validation we got the __reserved[] size right.
+ */
+ BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
+ != 1024);
+}
+
+static int __init perf_event_sysfs_init(void)
+{
+ struct pmu *pmu;
+ int ret;
+
+ mutex_lock(&pmus_lock);
+
+ ret = bus_register(&pmu_bus);
+ if (ret)
+ goto unlock;
+
+ list_for_each_entry(pmu, &pmus, entry) {
+ if (!pmu->name || pmu->type < 0)
+ continue;
+
+ ret = pmu_dev_alloc(pmu);
+ WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
+ }
+ pmu_bus_running = 1;
+ ret = 0;
+
+unlock:
+ mutex_unlock(&pmus_lock);
+
+ return ret;
+}
+device_initcall(perf_event_sysfs_init);
+
+#ifdef CONFIG_CGROUP_PERF
+static struct cgroup_subsys_state *
+perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct perf_cgroup *jc;
+
+ jc = kzalloc(sizeof(*jc), GFP_KERNEL);
+ if (!jc)
+ return ERR_PTR(-ENOMEM);
+
+ jc->info = alloc_percpu(struct perf_cgroup_info);
+ if (!jc->info) {
+ kfree(jc);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &jc->css;
+}
+
+static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+ struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
+
+ free_percpu(jc->info);
+ kfree(jc);
+}
+
+static int __perf_cgroup_move(void *info)
+{
+ struct task_struct *task = info;
+ perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+ return 0;
+}
+
+static void perf_cgroup_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset)
+ task_function_call(task, __perf_cgroup_move, task);
+}
+
+static void perf_cgroup_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
+{
+ /*
+ * cgroup_exit() is called in the copy_process() failure path.
+ * Ignore this case since the task hasn't ran yet, this avoids
+ * trying to poke a half freed task state from generic code.
+ */
+ if (!(task->flags & PF_EXITING))
+ return;
+
+ task_function_call(task, __perf_cgroup_move, task);
+}
+
+struct cgroup_subsys perf_event_cgrp_subsys = {
+ .css_alloc = perf_cgroup_css_alloc,
+ .css_free = perf_cgroup_css_free,
+ .exit = perf_cgroup_exit,
+ .attach = perf_cgroup_attach,
+};
+#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
new file mode 100644
index 00000000000..1559fb0b929
--- /dev/null
+++ b/kernel/events/hw_breakpoint.c
@@ -0,0 +1,662 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Thanks to Ingo Molnar for his many suggestions.
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ * K.Prasad <prasad@linux.vnet.ibm.com>
+ * Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ * This file contains the arch-independent routines.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+
+#include <linux/hw_breakpoint.h>
+/*
+ * Constraints data
+ */
+struct bp_cpuinfo {
+ /* Number of pinned cpu breakpoints in a cpu */
+ unsigned int cpu_pinned;
+ /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
+ unsigned int *tsk_pinned;
+ /* Number of non-pinned cpu/task breakpoints in a cpu */
+ unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
+};
+
+static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
+static int nr_slots[TYPE_MAX];
+
+static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
+{
+ return per_cpu_ptr(bp_cpuinfo + type, cpu);
+}
+
+/* Keep track of the breakpoints attached to tasks */
+static LIST_HEAD(bp_task_head);
+
+static int constraints_initialized;
+
+/* Gather the number of total pinned and un-pinned bp in a cpuset */
+struct bp_busy_slots {
+ unsigned int pinned;
+ unsigned int flexible;
+};
+
+/* Serialize accesses to the above constraints */
+static DEFINE_MUTEX(nr_bp_mutex);
+
+__weak int hw_breakpoint_weight(struct perf_event *bp)
+{
+ return 1;
+}
+
+static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
+{
+ if (bp->attr.bp_type & HW_BREAKPOINT_RW)
+ return TYPE_DATA;
+
+ return TYPE_INST;
+}
+
+/*
+ * Report the maximum number of pinned breakpoints a task
+ * have in this cpu
+ */
+static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
+{
+ unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
+ int i;
+
+ for (i = nr_slots[type] - 1; i >= 0; i--) {
+ if (tsk_pinned[i] > 0)
+ return i + 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Count the number of breakpoints of the same type and same task.
+ * The given event must be not on the list.
+ */
+static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
+{
+ struct task_struct *tsk = bp->hw.bp_target;
+ struct perf_event *iter;
+ int count = 0;
+
+ list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
+ if (iter->hw.bp_target == tsk &&
+ find_slot_idx(iter) == type &&
+ (iter->cpu < 0 || cpu == iter->cpu))
+ count += hw_breakpoint_weight(iter);
+ }
+
+ return count;
+}
+
+static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
+{
+ if (bp->cpu >= 0)
+ return cpumask_of(bp->cpu);
+ return cpu_possible_mask;
+}
+
+/*
+ * Report the number of pinned/un-pinned breakpoints we have in
+ * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ */
+static void
+fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
+ enum bp_type_idx type)
+{
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int cpu;
+
+ for_each_cpu(cpu, cpumask) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, type);
+ int nr;
+
+ nr = info->cpu_pinned;
+ if (!bp->hw.bp_target)
+ nr += max_task_bp_pinned(cpu, type);
+ else
+ nr += task_bp_pinned(cpu, bp, type);
+
+ if (nr > slots->pinned)
+ slots->pinned = nr;
+
+ nr = info->flexible;
+ if (nr > slots->flexible)
+ slots->flexible = nr;
+ }
+}
+
+/*
+ * For now, continue to consider flexible as pinned, until we can
+ * ensure no flexible event can ever be scheduled before a pinned event
+ * in a same cpu.
+ */
+static void
+fetch_this_slot(struct bp_busy_slots *slots, int weight)
+{
+ slots->pinned += weight;
+}
+
+/*
+ * Add a pinned breakpoint for the given task in our constraint table
+ */
+static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
+ enum bp_type_idx type, int weight)
+{
+ unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
+ int old_idx, new_idx;
+
+ old_idx = task_bp_pinned(cpu, bp, type) - 1;
+ new_idx = old_idx + weight;
+
+ if (old_idx >= 0)
+ tsk_pinned[old_idx]--;
+ if (new_idx >= 0)
+ tsk_pinned[new_idx]++;
+}
+
+/*
+ * Add/remove the given breakpoint in our constraint table
+ */
+static void
+toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
+ int weight)
+{
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int cpu;
+
+ if (!enable)
+ weight = -weight;
+
+ /* Pinned counter cpu profiling */
+ if (!bp->hw.bp_target) {
+ get_bp_info(bp->cpu, type)->cpu_pinned += weight;
+ return;
+ }
+
+ /* Pinned counter task profiling */
+ for_each_cpu(cpu, cpumask)
+ toggle_bp_task_slot(bp, cpu, type, weight);
+
+ if (enable)
+ list_add_tail(&bp->hw.bp_list, &bp_task_head);
+ else
+ list_del(&bp->hw.bp_list);
+}
+
+/*
+ * Function to perform processor-specific cleanup during unregistration
+ */
+__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
+{
+ /*
+ * A weak stub function here for those archs that don't define
+ * it inside arch/.../kernel/hw_breakpoint.c
+ */
+}
+
+/*
+ * Contraints to check before allowing this new breakpoint counter:
+ *
+ * == Non-pinned counter == (Considered as pinned for now)
+ *
+ * - If attached to a single cpu, check:
+ *
+ * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu)
+ * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM
+ *
+ * -> If there are already non-pinned counters in this cpu, it means
+ * there is already a free slot for them.
+ * Otherwise, we check that the maximum number of per task
+ * breakpoints (for this cpu) plus the number of per cpu breakpoint
+ * (for this cpu) doesn't cover every registers.
+ *
+ * - If attached to every cpus, check:
+ *
+ * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *))
+ * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM
+ *
+ * -> This is roughly the same, except we check the number of per cpu
+ * bp for every cpu and we keep the max one. Same for the per tasks
+ * breakpoints.
+ *
+ *
+ * == Pinned counter ==
+ *
+ * - If attached to a single cpu, check:
+ *
+ * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu)
+ * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM
+ *
+ * -> Same checks as before. But now the info->flexible, if any, must keep
+ * one register at least (or they will never be fed).
+ *
+ * - If attached to every cpus, check:
+ *
+ * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
+ * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
+ */
+static int __reserve_bp_slot(struct perf_event *bp)
+{
+ struct bp_busy_slots slots = {0};
+ enum bp_type_idx type;
+ int weight;
+
+ /* We couldn't initialize breakpoint constraints on boot */
+ if (!constraints_initialized)
+ return -ENOMEM;
+
+ /* Basic checks */
+ if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
+ bp->attr.bp_type == HW_BREAKPOINT_INVALID)
+ return -EINVAL;
+
+ type = find_slot_idx(bp);
+ weight = hw_breakpoint_weight(bp);
+
+ fetch_bp_busy_slots(&slots, bp, type);
+ /*
+ * Simulate the addition of this breakpoint to the constraints
+ * and see the result.
+ */
+ fetch_this_slot(&slots, weight);
+
+ /* Flexible counters need to keep at least one slot */
+ if (slots.pinned + (!!slots.flexible) > nr_slots[type])
+ return -ENOSPC;
+
+ toggle_bp_slot(bp, true, type, weight);
+
+ return 0;
+}
+
+int reserve_bp_slot(struct perf_event *bp)
+{
+ int ret;
+
+ mutex_lock(&nr_bp_mutex);
+
+ ret = __reserve_bp_slot(bp);
+
+ mutex_unlock(&nr_bp_mutex);
+
+ return ret;
+}
+
+static void __release_bp_slot(struct perf_event *bp)
+{
+ enum bp_type_idx type;
+ int weight;
+
+ type = find_slot_idx(bp);
+ weight = hw_breakpoint_weight(bp);
+ toggle_bp_slot(bp, false, type, weight);
+}
+
+void release_bp_slot(struct perf_event *bp)
+{
+ mutex_lock(&nr_bp_mutex);
+
+ arch_unregister_hw_breakpoint(bp);
+ __release_bp_slot(bp);
+
+ mutex_unlock(&nr_bp_mutex);
+}
+
+/*
+ * Allow the kernel debugger to reserve breakpoint slots without
+ * taking a lock using the dbg_* variant of for the reserve and
+ * release breakpoint slots.
+ */
+int dbg_reserve_bp_slot(struct perf_event *bp)
+{
+ if (mutex_is_locked(&nr_bp_mutex))
+ return -1;
+
+ return __reserve_bp_slot(bp);
+}
+
+int dbg_release_bp_slot(struct perf_event *bp)
+{
+ if (mutex_is_locked(&nr_bp_mutex))
+ return -1;
+
+ __release_bp_slot(bp);
+
+ return 0;
+}
+
+static int validate_hw_breakpoint(struct perf_event *bp)
+{
+ int ret;
+
+ ret = arch_validate_hwbkpt_settings(bp);
+ if (ret)
+ return ret;
+
+ if (arch_check_bp_in_kernelspace(bp)) {
+ if (bp->attr.exclude_kernel)
+ return -EINVAL;
+ /*
+ * Don't let unprivileged users set a breakpoint in the trap
+ * path to avoid trap recursion attacks.
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+int register_perf_hw_breakpoint(struct perf_event *bp)
+{
+ int ret;
+
+ ret = reserve_bp_slot(bp);
+ if (ret)
+ return ret;
+
+ ret = validate_hw_breakpoint(bp);
+
+ /* if arch_validate_hwbkpt_settings() fails then release bp slot */
+ if (ret)
+ release_bp_slot(bp);
+
+ return ret;
+}
+
+/**
+ * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ */
+struct perf_event *
+register_user_hw_breakpoint(struct perf_event_attr *attr,
+ perf_overflow_handler_t triggered,
+ void *context,
+ struct task_struct *tsk)
+{
+ return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
+ context);
+}
+EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
+
+/**
+ * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
+ * @bp: the breakpoint structure to modify
+ * @attr: new breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ */
+int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
+{
+ u64 old_addr = bp->attr.bp_addr;
+ u64 old_len = bp->attr.bp_len;
+ int old_type = bp->attr.bp_type;
+ int err = 0;
+
+ /*
+ * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
+ * will not be possible to raise IPIs that invoke __perf_event_disable.
+ * So call the function directly after making sure we are targeting the
+ * current task.
+ */
+ if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
+ __perf_event_disable(bp);
+ else
+ perf_event_disable(bp);
+
+ bp->attr.bp_addr = attr->bp_addr;
+ bp->attr.bp_type = attr->bp_type;
+ bp->attr.bp_len = attr->bp_len;
+
+ if (attr->disabled)
+ goto end;
+
+ err = validate_hw_breakpoint(bp);
+ if (!err)
+ perf_event_enable(bp);
+
+ if (err) {
+ bp->attr.bp_addr = old_addr;
+ bp->attr.bp_type = old_type;
+ bp->attr.bp_len = old_len;
+ if (!bp->attr.disabled)
+ perf_event_enable(bp);
+
+ return err;
+ }
+
+end:
+ bp->attr.disabled = attr->disabled;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
+
+/**
+ * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
+ * @bp: the breakpoint structure to unregister
+ */
+void unregister_hw_breakpoint(struct perf_event *bp)
+{
+ if (!bp)
+ return;
+ perf_event_release_kernel(bp);
+}
+EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
+
+/**
+ * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ *
+ * @return a set of per_cpu pointers to perf events
+ */
+struct perf_event * __percpu *
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+ perf_overflow_handler_t triggered,
+ void *context)
+{
+ struct perf_event * __percpu *cpu_events, *bp;
+ long err = 0;
+ int cpu;
+
+ cpu_events = alloc_percpu(typeof(*cpu_events));
+ if (!cpu_events)
+ return (void __percpu __force *)ERR_PTR(-ENOMEM);
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ bp = perf_event_create_kernel_counter(attr, cpu, NULL,
+ triggered, context);
+ if (IS_ERR(bp)) {
+ err = PTR_ERR(bp);
+ break;
+ }
+
+ per_cpu(*cpu_events, cpu) = bp;
+ }
+ put_online_cpus();
+
+ if (likely(!err))
+ return cpu_events;
+
+ unregister_wide_hw_breakpoint(cpu_events);
+ return (void __percpu __force *)ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
+
+/**
+ * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
+ * @cpu_events: the per cpu set of events to unregister
+ */
+void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ unregister_hw_breakpoint(per_cpu(*cpu_events, cpu));
+
+ free_percpu(cpu_events);
+}
+EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+ .notifier_call = hw_breakpoint_exceptions_notify,
+ /* we need to be notified first */
+ .priority = 0x7fffffff
+};
+
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+ release_bp_slot(event);
+}
+
+static int hw_breakpoint_event_init(struct perf_event *bp)
+{
+ int err;
+
+ if (bp->attr.type != PERF_TYPE_BREAKPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for breakpoint events
+ */
+ if (has_branch_stack(bp))
+ return -EOPNOTSUPP;
+
+ err = register_perf_hw_breakpoint(bp);
+ if (err)
+ return err;
+
+ bp->destroy = bp_perf_event_destroy;
+
+ return 0;
+}
+
+static int hw_breakpoint_add(struct perf_event *bp, int flags)
+{
+ if (!(flags & PERF_EF_START))
+ bp->hw.state = PERF_HES_STOPPED;
+
+ if (is_sampling_event(bp)) {
+ bp->hw.last_period = bp->hw.sample_period;
+ perf_swevent_set_period(bp);
+ }
+
+ return arch_install_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_del(struct perf_event *bp, int flags)
+{
+ arch_uninstall_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_start(struct perf_event *bp, int flags)
+{
+ bp->hw.state = 0;
+}
+
+static void hw_breakpoint_stop(struct perf_event *bp, int flags)
+{
+ bp->hw.state = PERF_HES_STOPPED;
+}
+
+static int hw_breakpoint_event_idx(struct perf_event *bp)
+{
+ return 0;
+}
+
+static struct pmu perf_breakpoint = {
+ .task_ctx_nr = perf_sw_context, /* could eventually get its own */
+
+ .event_init = hw_breakpoint_event_init,
+ .add = hw_breakpoint_add,
+ .del = hw_breakpoint_del,
+ .start = hw_breakpoint_start,
+ .stop = hw_breakpoint_stop,
+ .read = hw_breakpoint_pmu_read,
+
+ .event_idx = hw_breakpoint_event_idx,
+};
+
+int __init init_hw_breakpoint(void)
+{
+ int cpu, err_cpu;
+ int i;
+
+ for (i = 0; i < TYPE_MAX; i++)
+ nr_slots[i] = hw_breakpoint_slots(i);
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < TYPE_MAX; i++) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, i);
+
+ info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
+ GFP_KERNEL);
+ if (!info->tsk_pinned)
+ goto err_alloc;
+ }
+ }
+
+ constraints_initialized = 1;
+
+ perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
+
+ return register_die_notifier(&hw_breakpoint_exceptions_nb);
+
+ err_alloc:
+ for_each_possible_cpu(err_cpu) {
+ for (i = 0; i < TYPE_MAX; i++)
+ kfree(get_bp_info(err_cpu, i)->tsk_pinned);
+ if (err_cpu == cpu)
+ break;
+ }
+
+ return -ENOMEM;
+}
+
+
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
new file mode 100644
index 00000000000..569b218782a
--- /dev/null
+++ b/kernel/events/internal.h
@@ -0,0 +1,198 @@
+#ifndef _KERNEL_EVENTS_INTERNAL_H
+#define _KERNEL_EVENTS_INTERNAL_H
+
+#include <linux/hardirq.h>
+#include <linux/uaccess.h>
+
+/* Buffer handling */
+
+#define RING_BUFFER_WRITABLE 0x01
+
+struct ring_buffer {
+ atomic_t refcount;
+ struct rcu_head rcu_head;
+#ifdef CONFIG_PERF_USE_VMALLOC
+ struct work_struct work;
+ int page_order; /* allocation order */
+#endif
+ int nr_pages; /* nr of data pages */
+ int overwrite; /* can overwrite itself */
+
+ atomic_t poll; /* POLL_ for wakeups */
+
+ local_t head; /* write position */
+ local_t nest; /* nested writers */
+ local_t events; /* event limit */
+ local_t wakeup; /* wakeup stamp */
+ local_t lost; /* nr records lost */
+
+ long watermark; /* wakeup watermark */
+ /* poll crap */
+ spinlock_t event_lock;
+ struct list_head event_list;
+
+ atomic_t mmap_count;
+ unsigned long mmap_locked;
+ struct user_struct *mmap_user;
+
+ struct perf_event_mmap_page *user_page;
+ void *data_pages[0];
+};
+
+extern void rb_free(struct ring_buffer *rb);
+extern struct ring_buffer *
+rb_alloc(int nr_pages, long watermark, int cpu, int flags);
+extern void perf_event_wakeup(struct perf_event *event);
+
+extern void
+perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event);
+extern void
+perf_event__output_id_sample(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *sample);
+
+extern struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
+
+#ifdef CONFIG_PERF_USE_VMALLOC
+/*
+ * Back perf_mmap() with vmalloc memory.
+ *
+ * Required for architectures that have d-cache aliasing issues.
+ */
+
+static inline int page_order(struct ring_buffer *rb)
+{
+ return rb->page_order;
+}
+
+#else
+
+static inline int page_order(struct ring_buffer *rb)
+{
+ return 0;
+}
+#endif
+
+static inline unsigned long perf_data_size(struct ring_buffer *rb)
+{
+ return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
+}
+
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
+static inline unsigned long \
+func_name(struct perf_output_handle *handle, \
+ const void *buf, unsigned long len) \
+{ \
+ unsigned long size, written; \
+ \
+ do { \
+ size = min(handle->size, len); \
+ written = memcpy_func(handle->addr, buf, size); \
+ written = size - written; \
+ \
+ len -= written; \
+ handle->addr += written; \
+ buf += written; \
+ handle->size -= written; \
+ if (!handle->size) { \
+ struct ring_buffer *rb = handle->rb; \
+ \
+ handle->page++; \
+ handle->page &= rb->nr_pages - 1; \
+ handle->addr = rb->data_pages[handle->page]; \
+ handle->size = PAGE_SIZE << page_order(rb); \
+ } \
+ } while (len && written == size); \
+ \
+ return len; \
+}
+
+static inline unsigned long
+memcpy_common(void *dst, const void *src, unsigned long n)
+{
+ memcpy(dst, src, n);
+ return 0;
+}
+
+DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
+
+static inline unsigned long
+memcpy_skip(void *dst, const void *src, unsigned long n)
+{
+ return 0;
+}
+
+DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
+
+#ifndef arch_perf_out_copy_user
+#define arch_perf_out_copy_user arch_perf_out_copy_user
+
+static inline unsigned long
+arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
+{
+ unsigned long ret;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, n);
+ pagefault_enable();
+
+ return ret;
+}
+#endif
+
+DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
+
+/* Callchain handling */
+extern struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs);
+extern int get_callchain_buffers(void);
+extern void put_callchain_buffers(void);
+
+static inline int get_recursion_context(int *recursion)
+{
+ int rctx;
+
+ if (in_nmi())
+ rctx = 3;
+ else if (in_irq())
+ rctx = 2;
+ else if (in_softirq())
+ rctx = 1;
+ else
+ rctx = 0;
+
+ if (recursion[rctx])
+ return -1;
+
+ recursion[rctx]++;
+ barrier();
+
+ return rctx;
+}
+
+static inline void put_recursion_context(int *recursion, int rctx)
+{
+ barrier();
+ recursion[rctx]--;
+}
+
+#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+ return true;
+}
+
+#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
+#else
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+ return false;
+}
+
+#define perf_user_stack_pointer(regs) 0
+#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
+
+#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
new file mode 100644
index 00000000000..146a5792b1d
--- /dev/null
+++ b/kernel/events/ring_buffer.c
@@ -0,0 +1,417 @@
+/*
+ * Performance events ring-buffer code:
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/circ_buf.h>
+
+#include "internal.h"
+
+static void perf_output_wakeup(struct perf_output_handle *handle)
+{
+ atomic_set(&handle->rb->poll, POLL_IN);
+
+ handle->event->pending_wakeup = 1;
+ irq_work_queue(&handle->event->pending);
+}
+
+/*
+ * We need to ensure a later event_id doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_get_handle(struct perf_output_handle *handle)
+{
+ struct ring_buffer *rb = handle->rb;
+
+ preempt_disable();
+ local_inc(&rb->nest);
+ handle->wakeup = local_read(&rb->wakeup);
+}
+
+static void perf_output_put_handle(struct perf_output_handle *handle)
+{
+ struct ring_buffer *rb = handle->rb;
+ unsigned long head;
+
+again:
+ head = local_read(&rb->head);
+
+ /*
+ * IRQ/NMI can happen here, which means we can miss a head update.
+ */
+
+ if (!local_dec_and_test(&rb->nest))
+ goto out;
+
+ /*
+ * Since the mmap() consumer (userspace) can run on a different CPU:
+ *
+ * kernel user
+ *
+ * if (LOAD ->data_tail) { LOAD ->data_head
+ * (A) smp_rmb() (C)
+ * STORE $data LOAD $data
+ * smp_wmb() (B) smp_mb() (D)
+ * STORE ->data_head STORE ->data_tail
+ * }
+ *
+ * Where A pairs with D, and B pairs with C.
+ *
+ * In our case (A) is a control dependency that separates the load of
+ * the ->data_tail and the stores of $data. In case ->data_tail
+ * indicates there is no room in the buffer to store $data we do not.
+ *
+ * D needs to be a full barrier since it separates the data READ
+ * from the tail WRITE.
+ *
+ * For B a WMB is sufficient since it separates two WRITEs, and for C
+ * an RMB is sufficient since it separates two READs.
+ *
+ * See perf_output_begin().
+ */
+ smp_wmb(); /* B, matches C */
+ rb->user_page->data_head = head;
+
+ /*
+ * Now check if we missed an update -- rely on previous implied
+ * compiler barriers to force a re-read.
+ */
+ if (unlikely(head != local_read(&rb->head))) {
+ local_inc(&rb->nest);
+ goto again;
+ }
+
+ if (handle->wakeup != local_read(&rb->wakeup))
+ perf_output_wakeup(handle);
+
+out:
+ preempt_enable();
+}
+
+int perf_output_begin(struct perf_output_handle *handle,
+ struct perf_event *event, unsigned int size)
+{
+ struct ring_buffer *rb;
+ unsigned long tail, offset, head;
+ int have_lost, page_shift;
+ struct {
+ struct perf_event_header header;
+ u64 id;
+ u64 lost;
+ } lost_event;
+
+ rcu_read_lock();
+ /*
+ * For inherited events we send all the output towards the parent.
+ */
+ if (event->parent)
+ event = event->parent;
+
+ rb = rcu_dereference(event->rb);
+ if (unlikely(!rb))
+ goto out;
+
+ if (unlikely(!rb->nr_pages))
+ goto out;
+
+ handle->rb = rb;
+ handle->event = event;
+
+ have_lost = local_read(&rb->lost);
+ if (unlikely(have_lost)) {
+ size += sizeof(lost_event);
+ if (event->attr.sample_id_all)
+ size += event->id_header_size;
+ }
+
+ perf_output_get_handle(handle);
+
+ do {
+ tail = ACCESS_ONCE(rb->user_page->data_tail);
+ offset = head = local_read(&rb->head);
+ if (!rb->overwrite &&
+ unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
+ goto fail;
+
+ /*
+ * The above forms a control dependency barrier separating the
+ * @tail load above from the data stores below. Since the @tail
+ * load is required to compute the branch to fail below.
+ *
+ * A, matches D; the full memory barrier userspace SHOULD issue
+ * after reading the data and before storing the new tail
+ * position.
+ *
+ * See perf_output_put_handle().
+ */
+
+ head += size;
+ } while (local_cmpxchg(&rb->head, offset, head) != offset);
+
+ /*
+ * We rely on the implied barrier() by local_cmpxchg() to ensure
+ * none of the data stores below can be lifted up by the compiler.
+ */
+
+ if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
+ local_add(rb->watermark, &rb->wakeup);
+
+ page_shift = PAGE_SHIFT + page_order(rb);
+
+ handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
+ offset &= (1UL << page_shift) - 1;
+ handle->addr = rb->data_pages[handle->page] + offset;
+ handle->size = (1UL << page_shift) - offset;
+
+ if (unlikely(have_lost)) {
+ struct perf_sample_data sample_data;
+
+ lost_event.header.size = sizeof(lost_event);
+ lost_event.header.type = PERF_RECORD_LOST;
+ lost_event.header.misc = 0;
+ lost_event.id = event->id;
+ lost_event.lost = local_xchg(&rb->lost, 0);
+
+ perf_event_header__init_id(&lost_event.header,
+ &sample_data, event);
+ perf_output_put(handle, lost_event);
+ perf_event__output_id_sample(event, handle, &sample_data);
+ }
+
+ return 0;
+
+fail:
+ local_inc(&rb->lost);
+ perf_output_put_handle(handle);
+out:
+ rcu_read_unlock();
+
+ return -ENOSPC;
+}
+
+unsigned int perf_output_copy(struct perf_output_handle *handle,
+ const void *buf, unsigned int len)
+{
+ return __output_copy(handle, buf, len);
+}
+
+unsigned int perf_output_skip(struct perf_output_handle *handle,
+ unsigned int len)
+{
+ return __output_skip(handle, NULL, len);
+}
+
+void perf_output_end(struct perf_output_handle *handle)
+{
+ perf_output_put_handle(handle);
+ rcu_read_unlock();
+}
+
+static void
+ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
+{
+ long max_size = perf_data_size(rb);
+
+ if (watermark)
+ rb->watermark = min(max_size, watermark);
+
+ if (!rb->watermark)
+ rb->watermark = max_size / 2;
+
+ if (flags & RING_BUFFER_WRITABLE)
+ rb->overwrite = 0;
+ else
+ rb->overwrite = 1;
+
+ atomic_set(&rb->refcount, 1);
+
+ INIT_LIST_HEAD(&rb->event_list);
+ spin_lock_init(&rb->event_lock);
+}
+
+#ifndef CONFIG_PERF_USE_VMALLOC
+
+/*
+ * Back perf_mmap() with regular GFP_KERNEL-0 pages.
+ */
+
+struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+{
+ if (pgoff > rb->nr_pages)
+ return NULL;
+
+ if (pgoff == 0)
+ return virt_to_page(rb->user_page);
+
+ return virt_to_page(rb->data_pages[pgoff - 1]);
+}
+
+static void *perf_mmap_alloc_page(int cpu)
+{
+ struct page *page;
+ int node;
+
+ node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
+struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+{
+ struct ring_buffer *rb;
+ unsigned long size;
+ int i;
+
+ size = sizeof(struct ring_buffer);
+ size += nr_pages * sizeof(void *);
+
+ rb = kzalloc(size, GFP_KERNEL);
+ if (!rb)
+ goto fail;
+
+ rb->user_page = perf_mmap_alloc_page(cpu);
+ if (!rb->user_page)
+ goto fail_user_page;
+
+ for (i = 0; i < nr_pages; i++) {
+ rb->data_pages[i] = perf_mmap_alloc_page(cpu);
+ if (!rb->data_pages[i])
+ goto fail_data_pages;
+ }
+
+ rb->nr_pages = nr_pages;
+
+ ring_buffer_init(rb, watermark, flags);
+
+ return rb;
+
+fail_data_pages:
+ for (i--; i >= 0; i--)
+ free_page((unsigned long)rb->data_pages[i]);
+
+ free_page((unsigned long)rb->user_page);
+
+fail_user_page:
+ kfree(rb);
+
+fail:
+ return NULL;
+}
+
+static void perf_mmap_free_page(unsigned long addr)
+{
+ struct page *page = virt_to_page((void *)addr);
+
+ page->mapping = NULL;
+ __free_page(page);
+}
+
+void rb_free(struct ring_buffer *rb)
+{
+ int i;
+
+ perf_mmap_free_page((unsigned long)rb->user_page);
+ for (i = 0; i < rb->nr_pages; i++)
+ perf_mmap_free_page((unsigned long)rb->data_pages[i]);
+ kfree(rb);
+}
+
+#else
+static int data_page_nr(struct ring_buffer *rb)
+{
+ return rb->nr_pages << page_order(rb);
+}
+
+struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+{
+ /* The '>' counts in the user page. */
+ if (pgoff > data_page_nr(rb))
+ return NULL;
+
+ return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
+}
+
+static void perf_mmap_unmark_page(void *addr)
+{
+ struct page *page = vmalloc_to_page(addr);
+
+ page->mapping = NULL;
+}
+
+static void rb_free_work(struct work_struct *work)
+{
+ struct ring_buffer *rb;
+ void *base;
+ int i, nr;
+
+ rb = container_of(work, struct ring_buffer, work);
+ nr = data_page_nr(rb);
+
+ base = rb->user_page;
+ /* The '<=' counts in the user page. */
+ for (i = 0; i <= nr; i++)
+ perf_mmap_unmark_page(base + (i * PAGE_SIZE));
+
+ vfree(base);
+ kfree(rb);
+}
+
+void rb_free(struct ring_buffer *rb)
+{
+ schedule_work(&rb->work);
+}
+
+struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+{
+ struct ring_buffer *rb;
+ unsigned long size;
+ void *all_buf;
+
+ size = sizeof(struct ring_buffer);
+ size += sizeof(void *);
+
+ rb = kzalloc(size, GFP_KERNEL);
+ if (!rb)
+ goto fail;
+
+ INIT_WORK(&rb->work, rb_free_work);
+
+ all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
+ if (!all_buf)
+ goto fail_all_buf;
+
+ rb->user_page = all_buf;
+ rb->data_pages[0] = all_buf + PAGE_SIZE;
+ rb->page_order = ilog2(nr_pages);
+ rb->nr_pages = !!nr_pages;
+
+ ring_buffer_init(rb, watermark, flags);
+
+ return rb;
+
+fail_all_buf:
+ kfree(rb);
+
+fail:
+ return NULL;
+}
+
+#endif
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
new file mode 100644
index 00000000000..6f3254e8c13
--- /dev/null
+++ b/kernel/events/uprobes.c
@@ -0,0 +1,1993 @@
+/*
+ * User-space Probes (UProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2012
+ * Authors:
+ * Srikar Dronamraju
+ * Jim Keniston
+ * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h> /* read_mapping_page */
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/rmap.h> /* anon_vma_prepare */
+#include <linux/mmu_notifier.h> /* set_pte_at_notify */
+#include <linux/swap.h> /* try_to_free_swap */
+#include <linux/ptrace.h> /* user_enable_single_step */
+#include <linux/kdebug.h> /* notifier mechanism */
+#include "../../mm/internal.h" /* munlock_vma_page */
+#include <linux/percpu-rwsem.h>
+#include <linux/task_work.h>
+#include <linux/shmem_fs.h>
+
+#include <linux/uprobes.h>
+
+#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
+#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
+
+static struct rb_root uprobes_tree = RB_ROOT;
+/*
+ * allows us to skip the uprobe_mmap if there are no uprobe events active
+ * at this time. Probably a fine grained per inode count is better?
+ */
+#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
+
+static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
+
+#define UPROBES_HASH_SZ 13
+/* serialize uprobe->pending_list */
+static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
+#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
+
+static struct percpu_rw_semaphore dup_mmap_sem;
+
+/* Have a copy of original instruction */
+#define UPROBE_COPY_INSN 0
+
+struct uprobe {
+ struct rb_node rb_node; /* node in the rb tree */
+ atomic_t ref;
+ struct rw_semaphore register_rwsem;
+ struct rw_semaphore consumer_rwsem;
+ struct list_head pending_list;
+ struct uprobe_consumer *consumers;
+ struct inode *inode; /* Also hold a ref to inode */
+ loff_t offset;
+ unsigned long flags;
+
+ /*
+ * The generic code assumes that it has two members of unknown type
+ * owned by the arch-specific code:
+ *
+ * insn - copy_insn() saves the original instruction here for
+ * arch_uprobe_analyze_insn().
+ *
+ * ixol - potentially modified instruction to execute out of
+ * line, copied to xol_area by xol_get_insn_slot().
+ */
+ struct arch_uprobe arch;
+};
+
+struct return_instance {
+ struct uprobe *uprobe;
+ unsigned long func;
+ unsigned long orig_ret_vaddr; /* original return address */
+ bool chained; /* true, if instance is nested */
+
+ struct return_instance *next; /* keep as stack */
+};
+
+/*
+ * Execute out of line area: anonymous executable mapping installed
+ * by the probed task to execute the copy of the original instruction
+ * mangled by set_swbp().
+ *
+ * On a breakpoint hit, thread contests for a slot. It frees the
+ * slot after singlestep. Currently a fixed number of slots are
+ * allocated.
+ */
+struct xol_area {
+ wait_queue_head_t wq; /* if all slots are busy */
+ atomic_t slot_count; /* number of in-use slots */
+ unsigned long *bitmap; /* 0 = free slot */
+ struct page *page;
+
+ /*
+ * We keep the vma's vm_start rather than a pointer to the vma
+ * itself. The probed process or a naughty kernel module could make
+ * the vma go away, and we must handle that reasonably gracefully.
+ */
+ unsigned long vaddr; /* Page(s) of instruction slots */
+};
+
+/*
+ * valid_vma: Verify if the specified vma is an executable vma
+ * Relax restrictions while unregistering: vm_flags might have
+ * changed after breakpoint was inserted.
+ * - is_register: indicates if we are in register context.
+ * - Return 1 if the specified virtual address is in an
+ * executable vma.
+ */
+static bool valid_vma(struct vm_area_struct *vma, bool is_register)
+{
+ vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+
+ if (is_register)
+ flags |= VM_WRITE;
+
+ return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
+}
+
+static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
+{
+ return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+}
+
+static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
+{
+ return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
+}
+
+/**
+ * __replace_page - replace page in vma by new page.
+ * based on replace_page in mm/ksm.c
+ *
+ * @vma: vma that holds the pte pointing to page
+ * @addr: address the old @page is mapped at
+ * @page: the cowed page we are replacing by kpage
+ * @kpage: the modified page we replace page by
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page, struct page *kpage)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ pte_t *ptep;
+ int err;
+ /* For mmu_notifiers */
+ const unsigned long mmun_start = addr;
+ const unsigned long mmun_end = addr + PAGE_SIZE;
+
+ /* For try_to_free_swap() and munlock_vma_page() below */
+ lock_page(page);
+
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ err = -EAGAIN;
+ ptep = page_check_address(page, mm, addr, &ptl, 0);
+ if (!ptep)
+ goto unlock;
+
+ get_page(kpage);
+ page_add_new_anon_rmap(kpage, vma, addr);
+
+ if (!PageAnon(page)) {
+ dec_mm_counter(mm, MM_FILEPAGES);
+ inc_mm_counter(mm, MM_ANONPAGES);
+ }
+
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+
+ page_remove_rmap(page);
+ if (!page_mapped(page))
+ try_to_free_swap(page);
+ pte_unmap_unlock(ptep, ptl);
+
+ if (vma->vm_flags & VM_LOCKED)
+ munlock_vma_page(page);
+ put_page(page);
+
+ err = 0;
+ unlock:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ unlock_page(page);
+ return err;
+}
+
+/**
+ * is_swbp_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_swbp_insn
+ * Returns true if @insn is a breakpoint instruction.
+ */
+bool __weak is_swbp_insn(uprobe_opcode_t *insn)
+{
+ return *insn == UPROBE_SWBP_INSN;
+}
+
+/**
+ * is_trap_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_trap_insn
+ * Returns true if @insn is a breakpoint instruction.
+ *
+ * This function is needed for the case where an architecture has multiple
+ * trap instructions (like powerpc).
+ */
+bool __weak is_trap_insn(uprobe_opcode_t *insn)
+{
+ return is_swbp_insn(insn);
+}
+
+static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
+{
+ void *kaddr = kmap_atomic(page);
+ memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
+ kunmap_atomic(kaddr);
+}
+
+static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
+{
+ void *kaddr = kmap_atomic(page);
+ memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
+ kunmap_atomic(kaddr);
+}
+
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+{
+ uprobe_opcode_t old_opcode;
+ bool is_swbp;
+
+ /*
+ * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
+ * We do not check if it is any other 'trap variant' which could
+ * be conditional trap instruction such as the one powerpc supports.
+ *
+ * The logic is that we do not care if the underlying instruction
+ * is a trap variant; uprobes always wins over any other (gdb)
+ * breakpoint.
+ */
+ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
+ is_swbp = is_swbp_insn(&old_opcode);
+
+ if (is_swbp_insn(new_opcode)) {
+ if (is_swbp) /* register: already installed? */
+ return 0;
+ } else {
+ if (!is_swbp) /* unregister: was it changed by us? */
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * NOTE:
+ * Expect the breakpoint instruction to be the smallest size instruction for
+ * the architecture. If an arch has variable length instruction and the
+ * breakpoint instruction is not of the smallest length instruction
+ * supported by that architecture then we need to modify is_trap_at_addr and
+ * uprobe_write_opcode accordingly. This would never be a problem for archs
+ * that have fixed length instructions.
+ *
+ * uprobe_write_opcode - write the opcode at a given virtual address.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @vaddr.
+ *
+ * Called with mm->mmap_sem held for write.
+ * Return 0 (success) or a negative errno.
+ */
+int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
+ uprobe_opcode_t opcode)
+{
+ struct page *old_page, *new_page;
+ struct vm_area_struct *vma;
+ int ret;
+
+retry:
+ /* Read the page with vaddr into memory */
+ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
+ if (ret <= 0)
+ return ret;
+
+ ret = verify_opcode(old_page, vaddr, &opcode);
+ if (ret <= 0)
+ goto put_old;
+
+ ret = anon_vma_prepare(vma);
+ if (ret)
+ goto put_old;
+
+ ret = -ENOMEM;
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
+ if (!new_page)
+ goto put_old;
+
+ if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
+ goto put_new;
+
+ __SetPageUptodate(new_page);
+ copy_highpage(new_page, old_page);
+ copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+
+ ret = __replace_page(vma, vaddr, old_page, new_page);
+ if (ret)
+ mem_cgroup_uncharge_page(new_page);
+
+put_new:
+ page_cache_release(new_page);
+put_old:
+ put_page(old_page);
+
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
+ return ret;
+}
+
+/**
+ * set_swbp - store breakpoint at a given address.
+ * @auprobe: arch specific probepoint information.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to insert the opcode.
+ *
+ * For mm @mm, store the breakpoint instruction at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+ return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
+}
+
+/**
+ * set_orig_insn - Restore the original instruction.
+ * @mm: the probed process address space.
+ * @auprobe: arch specific probepoint information.
+ * @vaddr: the virtual address to insert the opcode.
+ *
+ * For mm @mm, restore the original opcode (opcode) at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak
+set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+ return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
+}
+
+static int match_uprobe(struct uprobe *l, struct uprobe *r)
+{
+ if (l->inode < r->inode)
+ return -1;
+
+ if (l->inode > r->inode)
+ return 1;
+
+ if (l->offset < r->offset)
+ return -1;
+
+ if (l->offset > r->offset)
+ return 1;
+
+ return 0;
+}
+
+static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe u = { .inode = inode, .offset = offset };
+ struct rb_node *n = uprobes_tree.rb_node;
+ struct uprobe *uprobe;
+ int match;
+
+ while (n) {
+ uprobe = rb_entry(n, struct uprobe, rb_node);
+ match = match_uprobe(&u, uprobe);
+ if (!match) {
+ atomic_inc(&uprobe->ref);
+ return uprobe;
+ }
+
+ if (match < 0)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+ }
+ return NULL;
+}
+
+/*
+ * Find a uprobe corresponding to a given inode:offset
+ * Acquires uprobes_treelock
+ */
+static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe *uprobe;
+
+ spin_lock(&uprobes_treelock);
+ uprobe = __find_uprobe(inode, offset);
+ spin_unlock(&uprobes_treelock);
+
+ return uprobe;
+}
+
+static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
+{
+ struct rb_node **p = &uprobes_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct uprobe *u;
+ int match;
+
+ while (*p) {
+ parent = *p;
+ u = rb_entry(parent, struct uprobe, rb_node);
+ match = match_uprobe(uprobe, u);
+ if (!match) {
+ atomic_inc(&u->ref);
+ return u;
+ }
+
+ if (match < 0)
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+
+ }
+
+ u = NULL;
+ rb_link_node(&uprobe->rb_node, parent, p);
+ rb_insert_color(&uprobe->rb_node, &uprobes_tree);
+ /* get access + creation ref */
+ atomic_set(&uprobe->ref, 2);
+
+ return u;
+}
+
+/*
+ * Acquire uprobes_treelock.
+ * Matching uprobe already exists in rbtree;
+ * increment (access refcount) and return the matching uprobe.
+ *
+ * No matching uprobe; insert the uprobe in rb_tree;
+ * get a double refcount (access + creation) and return NULL.
+ */
+static struct uprobe *insert_uprobe(struct uprobe *uprobe)
+{
+ struct uprobe *u;
+
+ spin_lock(&uprobes_treelock);
+ u = __insert_uprobe(uprobe);
+ spin_unlock(&uprobes_treelock);
+
+ return u;
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+ if (atomic_dec_and_test(&uprobe->ref))
+ kfree(uprobe);
+}
+
+static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe *uprobe, *cur_uprobe;
+
+ uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
+ if (!uprobe)
+ return NULL;
+
+ uprobe->inode = igrab(inode);
+ uprobe->offset = offset;
+ init_rwsem(&uprobe->register_rwsem);
+ init_rwsem(&uprobe->consumer_rwsem);
+
+ /* add to uprobes_tree, sorted on inode:offset */
+ cur_uprobe = insert_uprobe(uprobe);
+ /* a uprobe exists for this inode:offset combination */
+ if (cur_uprobe) {
+ kfree(uprobe);
+ uprobe = cur_uprobe;
+ iput(inode);
+ }
+
+ return uprobe;
+}
+
+static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ down_write(&uprobe->consumer_rwsem);
+ uc->next = uprobe->consumers;
+ uprobe->consumers = uc;
+ up_write(&uprobe->consumer_rwsem);
+}
+
+/*
+ * For uprobe @uprobe, delete the consumer @uc.
+ * Return true if the @uc is deleted successfully
+ * or return false.
+ */
+static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ struct uprobe_consumer **con;
+ bool ret = false;
+
+ down_write(&uprobe->consumer_rwsem);
+ for (con = &uprobe->consumers; *con; con = &(*con)->next) {
+ if (*con == uc) {
+ *con = uc->next;
+ ret = true;
+ break;
+ }
+ }
+ up_write(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static int __copy_insn(struct address_space *mapping, struct file *filp,
+ void *insn, int nbytes, loff_t offset)
+{
+ struct page *page;
+ /*
+ * Ensure that the page that has the original instruction is populated
+ * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * see uprobe_register().
+ */
+ if (mapping->a_ops->readpage)
+ page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+ else
+ page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ copy_from_page(page, offset, insn, nbytes);
+ page_cache_release(page);
+
+ return 0;
+}
+
+static int copy_insn(struct uprobe *uprobe, struct file *filp)
+{
+ struct address_space *mapping = uprobe->inode->i_mapping;
+ loff_t offs = uprobe->offset;
+ void *insn = &uprobe->arch.insn;
+ int size = sizeof(uprobe->arch.insn);
+ int len, err = -EIO;
+
+ /* Copy only available bytes, -EIO if nothing was read */
+ do {
+ if (offs >= i_size_read(uprobe->inode))
+ break;
+
+ len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
+ err = __copy_insn(mapping, filp, insn, len, offs);
+ if (err)
+ break;
+
+ insn += len;
+ offs += len;
+ size -= len;
+ } while (size);
+
+ return err;
+}
+
+static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
+ struct mm_struct *mm, unsigned long vaddr)
+{
+ int ret = 0;
+
+ if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
+ return ret;
+
+ /* TODO: move this into _register, until then we abuse this sem. */
+ down_write(&uprobe->consumer_rwsem);
+ if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
+ goto out;
+
+ ret = copy_insn(uprobe, file);
+ if (ret)
+ goto out;
+
+ ret = -ENOTSUPP;
+ if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
+ goto out;
+
+ ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
+ if (ret)
+ goto out;
+
+ /* uprobe_write_opcode() assumes we don't cross page boundary */
+ BUG_ON((uprobe->offset & ~PAGE_MASK) +
+ UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
+
+ smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
+ set_bit(UPROBE_COPY_INSN, &uprobe->flags);
+
+ out:
+ up_write(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static inline bool consumer_filter(struct uprobe_consumer *uc,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ return !uc->filter || uc->filter(uc, ctx, mm);
+}
+
+static bool filter_chain(struct uprobe *uprobe,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ struct uprobe_consumer *uc;
+ bool ret = false;
+
+ down_read(&uprobe->consumer_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ ret = consumer_filter(uc, ctx, mm);
+ if (ret)
+ break;
+ }
+ up_read(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static int
+install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long vaddr)
+{
+ bool first_uprobe;
+ int ret;
+
+ ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
+ if (ret)
+ return ret;
+
+ /*
+ * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
+ * the task can hit this breakpoint right after __replace_page().
+ */
+ first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
+ if (first_uprobe)
+ set_bit(MMF_HAS_UPROBES, &mm->flags);
+
+ ret = set_swbp(&uprobe->arch, mm, vaddr);
+ if (!ret)
+ clear_bit(MMF_RECALC_UPROBES, &mm->flags);
+ else if (first_uprobe)
+ clear_bit(MMF_HAS_UPROBES, &mm->flags);
+
+ return ret;
+}
+
+static int
+remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+ set_bit(MMF_RECALC_UPROBES, &mm->flags);
+ return set_orig_insn(&uprobe->arch, mm, vaddr);
+}
+
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+ return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
+/*
+ * There could be threads that have already hit the breakpoint. They
+ * will recheck the current insn and restart if find_uprobe() fails.
+ * See find_active_uprobe().
+ */
+static void delete_uprobe(struct uprobe *uprobe)
+{
+ if (WARN_ON(!uprobe_is_active(uprobe)))
+ return;
+
+ spin_lock(&uprobes_treelock);
+ rb_erase(&uprobe->rb_node, &uprobes_tree);
+ spin_unlock(&uprobes_treelock);
+ RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
+ iput(uprobe->inode);
+ put_uprobe(uprobe);
+}
+
+struct map_info {
+ struct map_info *next;
+ struct mm_struct *mm;
+ unsigned long vaddr;
+};
+
+static inline struct map_info *free_map_info(struct map_info *info)
+{
+ struct map_info *next = info->next;
+ kfree(info);
+ return next;
+}
+
+static struct map_info *
+build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
+{
+ unsigned long pgoff = offset >> PAGE_SHIFT;
+ struct vm_area_struct *vma;
+ struct map_info *curr = NULL;
+ struct map_info *prev = NULL;
+ struct map_info *info;
+ int more = 0;
+
+ again:
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ if (!valid_vma(vma, is_register))
+ continue;
+
+ if (!prev && !more) {
+ /*
+ * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+ * reclaim. This is optimistic, no harm done if it fails.
+ */
+ prev = kmalloc(sizeof(struct map_info),
+ GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (prev)
+ prev->next = NULL;
+ }
+ if (!prev) {
+ more++;
+ continue;
+ }
+
+ if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
+ continue;
+
+ info = prev;
+ prev = prev->next;
+ info->next = curr;
+ curr = info;
+
+ info->mm = vma->vm_mm;
+ info->vaddr = offset_to_vaddr(vma, offset);
+ }
+ mutex_unlock(&mapping->i_mmap_mutex);
+
+ if (!more)
+ goto out;
+
+ prev = curr;
+ while (curr) {
+ mmput(curr->mm);
+ curr = curr->next;
+ }
+
+ do {
+ info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
+ if (!info) {
+ curr = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ info->next = prev;
+ prev = info;
+ } while (--more);
+
+ goto again;
+ out:
+ while (prev)
+ prev = free_map_info(prev);
+ return curr;
+}
+
+static int
+register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
+{
+ bool is_register = !!new;
+ struct map_info *info;
+ int err = 0;
+
+ percpu_down_write(&dup_mmap_sem);
+ info = build_map_info(uprobe->inode->i_mapping,
+ uprobe->offset, is_register);
+ if (IS_ERR(info)) {
+ err = PTR_ERR(info);
+ goto out;
+ }
+
+ while (info) {
+ struct mm_struct *mm = info->mm;
+ struct vm_area_struct *vma;
+
+ if (err && is_register)
+ goto free;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma(mm, info->vaddr);
+ if (!vma || !valid_vma(vma, is_register) ||
+ file_inode(vma->vm_file) != uprobe->inode)
+ goto unlock;
+
+ if (vma->vm_start > info->vaddr ||
+ vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
+ goto unlock;
+
+ if (is_register) {
+ /* consult only the "caller", new consumer. */
+ if (consumer_filter(new,
+ UPROBE_FILTER_REGISTER, mm))
+ err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+ } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
+ if (!filter_chain(uprobe,
+ UPROBE_FILTER_UNREGISTER, mm))
+ err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ }
+
+ unlock:
+ up_write(&mm->mmap_sem);
+ free:
+ mmput(mm);
+ info = free_map_info(info);
+ }
+ out:
+ percpu_up_write(&dup_mmap_sem);
+ return err;
+}
+
+static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ consumer_add(uprobe, uc);
+ return register_for_each_vma(uprobe, uc);
+}
+
+static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ int err;
+
+ if (WARN_ON(!consumer_del(uprobe, uc)))
+ return;
+
+ err = register_for_each_vma(uprobe, NULL);
+ /* TODO : cant unregister? schedule a worker thread */
+ if (!uprobe->consumers && !err)
+ delete_uprobe(uprobe);
+}
+
+/*
+ * uprobe_register - register a probe
+ * @inode: the file in which the probe has to be placed.
+ * @offset: offset from the start of the file.
+ * @uc: information on howto handle the probe..
+ *
+ * Apart from the access refcount, uprobe_register() takes a creation
+ * refcount (thro alloc_uprobe) if and only if this @uprobe is getting
+ * inserted into the rbtree (i.e first consumer for a @inode:@offset
+ * tuple). Creation refcount stops uprobe_unregister from freeing the
+ * @uprobe even before the register operation is complete. Creation
+ * refcount is released when the last @uc for the @uprobe
+ * unregisters.
+ *
+ * Return errno if it cannot successully install probes
+ * else return 0 (success)
+ */
+int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+ struct uprobe *uprobe;
+ int ret;
+
+ /* Uprobe must have at least one set consumer */
+ if (!uc->handler && !uc->ret_handler)
+ return -EINVAL;
+
+ /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+ if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+ return -EIO;
+ /* Racy, just to catch the obvious mistakes */
+ if (offset > i_size_read(inode))
+ return -EINVAL;
+
+ retry:
+ uprobe = alloc_uprobe(inode, offset);
+ if (!uprobe)
+ return -ENOMEM;
+ /*
+ * We can race with uprobe_unregister()->delete_uprobe().
+ * Check uprobe_is_active() and retry if it is false.
+ */
+ down_write(&uprobe->register_rwsem);
+ ret = -EAGAIN;
+ if (likely(uprobe_is_active(uprobe))) {
+ ret = __uprobe_register(uprobe, uc);
+ if (ret)
+ __uprobe_unregister(uprobe, uc);
+ }
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uprobe_register);
+
+/*
+ * uprobe_apply - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: consumer which wants to add more or remove some breakpoints
+ * @add: add or remove the breakpoints
+ */
+int uprobe_apply(struct inode *inode, loff_t offset,
+ struct uprobe_consumer *uc, bool add)
+{
+ struct uprobe *uprobe;
+ struct uprobe_consumer *con;
+ int ret = -ENOENT;
+
+ uprobe = find_uprobe(inode, offset);
+ if (WARN_ON(!uprobe))
+ return ret;
+
+ down_write(&uprobe->register_rwsem);
+ for (con = uprobe->consumers; con && con != uc ; con = con->next)
+ ;
+ if (con)
+ ret = register_for_each_vma(uprobe, add ? uc : NULL);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+
+ return ret;
+}
+
+/*
+ * uprobe_unregister - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: identify which probe if multiple probes are colocated.
+ */
+void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+ struct uprobe *uprobe;
+
+ uprobe = find_uprobe(inode, offset);
+ if (WARN_ON(!uprobe))
+ return;
+
+ down_write(&uprobe->register_rwsem);
+ __uprobe_unregister(uprobe, uc);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister);
+
+static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int err = 0;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ unsigned long vaddr;
+ loff_t offset;
+
+ if (!valid_vma(vma, false) ||
+ file_inode(vma->vm_file) != uprobe->inode)
+ continue;
+
+ offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+ if (uprobe->offset < offset ||
+ uprobe->offset >= offset + vma->vm_end - vma->vm_start)
+ continue;
+
+ vaddr = offset_to_vaddr(vma, uprobe->offset);
+ err |= remove_breakpoint(uprobe, mm, vaddr);
+ }
+ up_read(&mm->mmap_sem);
+
+ return err;
+}
+
+static struct rb_node *
+find_node_in_range(struct inode *inode, loff_t min, loff_t max)
+{
+ struct rb_node *n = uprobes_tree.rb_node;
+
+ while (n) {
+ struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
+
+ if (inode < u->inode) {
+ n = n->rb_left;
+ } else if (inode > u->inode) {
+ n = n->rb_right;
+ } else {
+ if (max < u->offset)
+ n = n->rb_left;
+ else if (min > u->offset)
+ n = n->rb_right;
+ else
+ break;
+ }
+ }
+
+ return n;
+}
+
+/*
+ * For a given range in vma, build a list of probes that need to be inserted.
+ */
+static void build_probe_list(struct inode *inode,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct list_head *head)
+{
+ loff_t min, max;
+ struct rb_node *n, *t;
+ struct uprobe *u;
+
+ INIT_LIST_HEAD(head);
+ min = vaddr_to_offset(vma, start);
+ max = min + (end - start) - 1;
+
+ spin_lock(&uprobes_treelock);
+ n = find_node_in_range(inode, min, max);
+ if (n) {
+ for (t = n; t; t = rb_prev(t)) {
+ u = rb_entry(t, struct uprobe, rb_node);
+ if (u->inode != inode || u->offset < min)
+ break;
+ list_add(&u->pending_list, head);
+ atomic_inc(&u->ref);
+ }
+ for (t = n; (t = rb_next(t)); ) {
+ u = rb_entry(t, struct uprobe, rb_node);
+ if (u->inode != inode || u->offset > max)
+ break;
+ list_add(&u->pending_list, head);
+ atomic_inc(&u->ref);
+ }
+ }
+ spin_unlock(&uprobes_treelock);
+}
+
+/*
+ * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
+ *
+ * Currently we ignore all errors and always return 0, the callers
+ * can't handle the failure anyway.
+ */
+int uprobe_mmap(struct vm_area_struct *vma)
+{
+ struct list_head tmp_list;
+ struct uprobe *uprobe, *u;
+ struct inode *inode;
+
+ if (no_uprobe_events() || !valid_vma(vma, true))
+ return 0;
+
+ inode = file_inode(vma->vm_file);
+ if (!inode)
+ return 0;
+
+ mutex_lock(uprobes_mmap_hash(inode));
+ build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
+ /*
+ * We can race with uprobe_unregister(), this uprobe can be already
+ * removed. But in this case filter_chain() must return false, all
+ * consumers have gone away.
+ */
+ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
+ if (!fatal_signal_pending(current) &&
+ filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
+ unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
+ install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
+ }
+ put_uprobe(uprobe);
+ }
+ mutex_unlock(uprobes_mmap_hash(inode));
+
+ return 0;
+}
+
+static bool
+vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ loff_t min, max;
+ struct inode *inode;
+ struct rb_node *n;
+
+ inode = file_inode(vma->vm_file);
+
+ min = vaddr_to_offset(vma, start);
+ max = min + (end - start) - 1;
+
+ spin_lock(&uprobes_treelock);
+ n = find_node_in_range(inode, min, max);
+ spin_unlock(&uprobes_treelock);
+
+ return !!n;
+}
+
+/*
+ * Called in context of a munmap of a vma.
+ */
+void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ if (no_uprobe_events() || !valid_vma(vma, false))
+ return;
+
+ if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
+ return;
+
+ if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
+ test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
+ return;
+
+ if (vma_has_uprobes(vma, start, end))
+ set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
+}
+
+/* Slot allocation for XOL */
+static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
+{
+ int ret = -EALREADY;
+
+ down_write(&mm->mmap_sem);
+ if (mm->uprobes_state.xol_area)
+ goto fail;
+
+ if (!area->vaddr) {
+ /* Try to map as high as possible, this is only a hint. */
+ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
+ PAGE_SIZE, 0, 0);
+ if (area->vaddr & ~PAGE_MASK) {
+ ret = area->vaddr;
+ goto fail;
+ }
+ }
+
+ ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
+ VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
+ if (ret)
+ goto fail;
+
+ smp_wmb(); /* pairs with get_xol_area() */
+ mm->uprobes_state.xol_area = area;
+ fail:
+ up_write(&mm->mmap_sem);
+
+ return ret;
+}
+
+static struct xol_area *__create_xol_area(unsigned long vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ struct xol_area *area;
+
+ area = kmalloc(sizeof(*area), GFP_KERNEL);
+ if (unlikely(!area))
+ goto out;
+
+ area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
+ if (!area->bitmap)
+ goto free_area;
+
+ area->page = alloc_page(GFP_HIGHUSER);
+ if (!area->page)
+ goto free_bitmap;
+
+ area->vaddr = vaddr;
+ init_waitqueue_head(&area->wq);
+ /* Reserve the 1st slot for get_trampoline_vaddr() */
+ set_bit(0, area->bitmap);
+ atomic_set(&area->slot_count, 1);
+ copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+
+ if (!xol_add_vma(mm, area))
+ return area;
+
+ __free_page(area->page);
+ free_bitmap:
+ kfree(area->bitmap);
+ free_area:
+ kfree(area);
+ out:
+ return NULL;
+}
+
+/*
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
+ *
+ * Returns the allocated area or NULL.
+ */
+static struct xol_area *get_xol_area(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct xol_area *area;
+
+ if (!mm->uprobes_state.xol_area)
+ __create_xol_area(0);
+
+ area = mm->uprobes_state.xol_area;
+ smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ return area;
+}
+
+/*
+ * uprobe_clear_state - Free the area allocated for slots.
+ */
+void uprobe_clear_state(struct mm_struct *mm)
+{
+ struct xol_area *area = mm->uprobes_state.xol_area;
+
+ if (!area)
+ return;
+
+ put_page(area->page);
+ kfree(area->bitmap);
+ kfree(area);
+}
+
+void uprobe_start_dup_mmap(void)
+{
+ percpu_down_read(&dup_mmap_sem);
+}
+
+void uprobe_end_dup_mmap(void)
+{
+ percpu_up_read(&dup_mmap_sem);
+}
+
+void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+ newmm->uprobes_state.xol_area = NULL;
+
+ if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
+ set_bit(MMF_HAS_UPROBES, &newmm->flags);
+ /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
+ set_bit(MMF_RECALC_UPROBES, &newmm->flags);
+ }
+}
+
+/*
+ * - search for a free slot.
+ */
+static unsigned long xol_take_insn_slot(struct xol_area *area)
+{
+ unsigned long slot_addr;
+ int slot_nr;
+
+ do {
+ slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
+ if (slot_nr < UINSNS_PER_PAGE) {
+ if (!test_and_set_bit(slot_nr, area->bitmap))
+ break;
+
+ slot_nr = UINSNS_PER_PAGE;
+ continue;
+ }
+ wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
+ } while (slot_nr >= UINSNS_PER_PAGE);
+
+ slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
+ atomic_inc(&area->slot_count);
+
+ return slot_addr;
+}
+
+/*
+ * xol_get_insn_slot - allocate a slot for xol.
+ * Returns the allocated slot address or 0.
+ */
+static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
+{
+ struct xol_area *area;
+ unsigned long xol_vaddr;
+
+ area = get_xol_area();
+ if (!area)
+ return 0;
+
+ xol_vaddr = xol_take_insn_slot(area);
+ if (unlikely(!xol_vaddr))
+ return 0;
+
+ arch_uprobe_copy_ixol(area->page, xol_vaddr,
+ &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
+
+ return xol_vaddr;
+}
+
+/*
+ * xol_free_insn_slot - If slot was earlier allocated by
+ * @xol_get_insn_slot(), make the slot available for
+ * subsequent requests.
+ */
+static void xol_free_insn_slot(struct task_struct *tsk)
+{
+ struct xol_area *area;
+ unsigned long vma_end;
+ unsigned long slot_addr;
+
+ if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
+ return;
+
+ slot_addr = tsk->utask->xol_vaddr;
+ if (unlikely(!slot_addr))
+ return;
+
+ area = tsk->mm->uprobes_state.xol_area;
+ vma_end = area->vaddr + PAGE_SIZE;
+ if (area->vaddr <= slot_addr && slot_addr < vma_end) {
+ unsigned long offset;
+ int slot_nr;
+
+ offset = slot_addr - area->vaddr;
+ slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
+ if (slot_nr >= UINSNS_PER_PAGE)
+ return;
+
+ clear_bit(slot_nr, area->bitmap);
+ atomic_dec(&area->slot_count);
+ if (waitqueue_active(&area->wq))
+ wake_up(&area->wq);
+
+ tsk->utask->xol_vaddr = 0;
+ }
+}
+
+void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
+ void *src, unsigned long len)
+{
+ /* Initialize the slot */
+ copy_to_page(page, vaddr, src, len);
+
+ /*
+ * We probably need flush_icache_user_range() but it needs vma.
+ * This should work on most of architectures by default. If
+ * architecture needs to do something different it can define
+ * its own version of the function.
+ */
+ flush_dcache_page(page);
+}
+
+/**
+ * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
+ * @regs: Reflects the saved state of the task after it has hit a breakpoint
+ * instruction.
+ * Return the address of the breakpoint instruction.
+ */
+unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
+{
+ return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
+}
+
+unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (unlikely(utask && utask->active_uprobe))
+ return utask->vaddr;
+
+ return instruction_pointer(regs);
+}
+
+/*
+ * Called with no locks held.
+ * Called in context of a exiting or a exec-ing thread.
+ */
+void uprobe_free_utask(struct task_struct *t)
+{
+ struct uprobe_task *utask = t->utask;
+ struct return_instance *ri, *tmp;
+
+ if (!utask)
+ return;
+
+ if (utask->active_uprobe)
+ put_uprobe(utask->active_uprobe);
+
+ ri = utask->return_instances;
+ while (ri) {
+ tmp = ri;
+ ri = ri->next;
+
+ put_uprobe(tmp->uprobe);
+ kfree(tmp);
+ }
+
+ xol_free_insn_slot(t);
+ kfree(utask);
+ t->utask = NULL;
+}
+
+/*
+ * Allocate a uprobe_task object for the task if if necessary.
+ * Called when the thread hits a breakpoint.
+ *
+ * Returns:
+ * - pointer to new uprobe_task on success
+ * - NULL otherwise
+ */
+static struct uprobe_task *get_utask(void)
+{
+ if (!current->utask)
+ current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ return current->utask;
+}
+
+static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
+{
+ struct uprobe_task *n_utask;
+ struct return_instance **p, *o, *n;
+
+ n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ if (!n_utask)
+ return -ENOMEM;
+ t->utask = n_utask;
+
+ p = &n_utask->return_instances;
+ for (o = o_utask->return_instances; o; o = o->next) {
+ n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!n)
+ return -ENOMEM;
+
+ *n = *o;
+ atomic_inc(&n->uprobe->ref);
+ n->next = NULL;
+
+ *p = n;
+ p = &n->next;
+ n_utask->depth++;
+ }
+
+ return 0;
+}
+
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+ pr_warn("uprobe: %s:%d failed to %s\n",
+ current->comm, current->pid, msg);
+}
+
+static void dup_xol_work(struct callback_head *work)
+{
+ if (current->flags & PF_EXITING)
+ return;
+
+ if (!__create_xol_area(current->utask->dup_xol_addr))
+ uprobe_warn(current, "dup xol area");
+}
+
+/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+{
+ struct uprobe_task *utask = current->utask;
+ struct mm_struct *mm = current->mm;
+ struct xol_area *area;
+
+ t->utask = NULL;
+
+ if (!utask || !utask->return_instances)
+ return;
+
+ if (mm == t->mm && !(flags & CLONE_VFORK))
+ return;
+
+ if (dup_utask(t, utask))
+ return uprobe_warn(t, "dup ret instances");
+
+ /* The task can fork() after dup_xol_work() fails */
+ area = mm->uprobes_state.xol_area;
+ if (!area)
+ return uprobe_warn(t, "dup xol area");
+
+ if (mm == t->mm)
+ return;
+
+ t->utask->dup_xol_addr = area->vaddr;
+ init_task_work(&t->utask->dup_xol_work, dup_xol_work);
+ task_work_add(t, &t->utask->dup_xol_work, true);
+}
+
+/*
+ * Current area->vaddr notion assume the trampoline address is always
+ * equal area->vaddr.
+ *
+ * Returns -1 in case the xol_area is not allocated.
+ */
+static unsigned long get_trampoline_vaddr(void)
+{
+ struct xol_area *area;
+ unsigned long trampoline_vaddr = -1;
+
+ area = current->mm->uprobes_state.xol_area;
+ smp_read_barrier_depends();
+ if (area)
+ trampoline_vaddr = area->vaddr;
+
+ return trampoline_vaddr;
+}
+
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct return_instance *ri;
+ struct uprobe_task *utask;
+ unsigned long orig_ret_vaddr, trampoline_vaddr;
+ bool chained = false;
+
+ if (!get_xol_area())
+ return;
+
+ utask = get_utask();
+ if (!utask)
+ return;
+
+ if (utask->depth >= MAX_URETPROBE_DEPTH) {
+ printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
+ " nestedness limit pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ return;
+ }
+
+ ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!ri)
+ goto fail;
+
+ trampoline_vaddr = get_trampoline_vaddr();
+ orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
+ if (orig_ret_vaddr == -1)
+ goto fail;
+
+ /*
+ * We don't want to keep trampoline address in stack, rather keep the
+ * original return address of first caller thru all the consequent
+ * instances. This also makes breakpoint unwrapping easier.
+ */
+ if (orig_ret_vaddr == trampoline_vaddr) {
+ if (!utask->return_instances) {
+ /*
+ * This situation is not possible. Likely we have an
+ * attack from user-space.
+ */
+ pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ goto fail;
+ }
+
+ chained = true;
+ orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
+ }
+
+ atomic_inc(&uprobe->ref);
+ ri->uprobe = uprobe;
+ ri->func = instruction_pointer(regs);
+ ri->orig_ret_vaddr = orig_ret_vaddr;
+ ri->chained = chained;
+
+ utask->depth++;
+
+ /* add instance to the stack */
+ ri->next = utask->return_instances;
+ utask->return_instances = ri;
+
+ return;
+
+ fail:
+ kfree(ri);
+}
+
+/* Prepare to single-step probed instruction out of line. */
+static int
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
+{
+ struct uprobe_task *utask;
+ unsigned long xol_vaddr;
+ int err;
+
+ utask = get_utask();
+ if (!utask)
+ return -ENOMEM;
+
+ xol_vaddr = xol_get_insn_slot(uprobe);
+ if (!xol_vaddr)
+ return -ENOMEM;
+
+ utask->xol_vaddr = xol_vaddr;
+ utask->vaddr = bp_vaddr;
+
+ err = arch_uprobe_pre_xol(&uprobe->arch, regs);
+ if (unlikely(err)) {
+ xol_free_insn_slot(current);
+ return err;
+ }
+
+ utask->active_uprobe = uprobe;
+ utask->state = UTASK_SSTEP;
+ return 0;
+}
+
+/*
+ * If we are singlestepping, then ensure this thread is not connected to
+ * non-fatal signals until completion of singlestep. When xol insn itself
+ * triggers the signal, restart the original insn even if the task is
+ * already SIGKILL'ed (since coredump should report the correct ip). This
+ * is even more important if the task has a handler for SIGSEGV/etc, The
+ * _same_ instruction should be repeated again after return from the signal
+ * handler, and SSTEP can never finish in this case.
+ */
+bool uprobe_deny_signal(void)
+{
+ struct task_struct *t = current;
+ struct uprobe_task *utask = t->utask;
+
+ if (likely(!utask || !utask->active_uprobe))
+ return false;
+
+ WARN_ON_ONCE(utask->state != UTASK_SSTEP);
+
+ if (signal_pending(t)) {
+ spin_lock_irq(&t->sighand->siglock);
+ clear_tsk_thread_flag(t, TIF_SIGPENDING);
+ spin_unlock_irq(&t->sighand->siglock);
+
+ if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
+ utask->state = UTASK_SSTEP_TRAPPED;
+ set_tsk_thread_flag(t, TIF_UPROBE);
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ }
+ }
+
+ return true;
+}
+
+static void mmf_recalc_uprobes(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!valid_vma(vma, false))
+ continue;
+ /*
+ * This is not strictly accurate, we can race with
+ * uprobe_unregister() and see the already removed
+ * uprobe if delete_uprobe() was not yet called.
+ * Or this uprobe can be filtered out.
+ */
+ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
+ return;
+ }
+
+ clear_bit(MMF_HAS_UPROBES, &mm->flags);
+}
+
+static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
+{
+ struct page *page;
+ uprobe_opcode_t opcode;
+ int result;
+
+ pagefault_disable();
+ result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
+ sizeof(opcode));
+ pagefault_enable();
+
+ if (likely(result == 0))
+ goto out;
+
+ result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+ if (result < 0)
+ return result;
+
+ copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ put_page(page);
+ out:
+ /* This needs to return true for any variant of the trap insn */
+ return is_trap_insn(&opcode);
+}
+
+static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+{
+ struct mm_struct *mm = current->mm;
+ struct uprobe *uprobe = NULL;
+ struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, bp_vaddr);
+ if (vma && vma->vm_start <= bp_vaddr) {
+ if (valid_vma(vma, false)) {
+ struct inode *inode = file_inode(vma->vm_file);
+ loff_t offset = vaddr_to_offset(vma, bp_vaddr);
+
+ uprobe = find_uprobe(inode, offset);
+ }
+
+ if (!uprobe)
+ *is_swbp = is_trap_at_addr(mm, bp_vaddr);
+ } else {
+ *is_swbp = -EFAULT;
+ }
+
+ if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
+ mmf_recalc_uprobes(mm);
+ up_read(&mm->mmap_sem);
+
+ return uprobe;
+}
+
+static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct uprobe_consumer *uc;
+ int remove = UPROBE_HANDLER_REMOVE;
+ bool need_prep = false; /* prepare return uprobe, when needed */
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ int rc = 0;
+
+ if (uc->handler) {
+ rc = uc->handler(uc, regs);
+ WARN(rc & ~UPROBE_HANDLER_MASK,
+ "bad rc=0x%x from %pf()\n", rc, uc->handler);
+ }
+
+ if (uc->ret_handler)
+ need_prep = true;
+
+ remove &= rc;
+ }
+
+ if (need_prep && !remove)
+ prepare_uretprobe(uprobe, regs); /* put bp at return */
+
+ if (remove && uprobe->consumers) {
+ WARN_ON(!uprobe_is_active(uprobe));
+ unapply_uprobe(uprobe, current->mm);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
+static void
+handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+{
+ struct uprobe *uprobe = ri->uprobe;
+ struct uprobe_consumer *uc;
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ if (uc->ret_handler)
+ uc->ret_handler(uc, ri->func, regs);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
+static bool handle_trampoline(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+ struct return_instance *ri, *tmp;
+ bool chained;
+
+ utask = current->utask;
+ if (!utask)
+ return false;
+
+ ri = utask->return_instances;
+ if (!ri)
+ return false;
+
+ /*
+ * TODO: we should throw out return_instance's invalidated by
+ * longjmp(), currently we assume that the probed function always
+ * returns.
+ */
+ instruction_pointer_set(regs, ri->orig_ret_vaddr);
+
+ for (;;) {
+ handle_uretprobe_chain(ri, regs);
+
+ chained = ri->chained;
+ put_uprobe(ri->uprobe);
+
+ tmp = ri;
+ ri = ri->next;
+ kfree(tmp);
+ utask->depth--;
+
+ if (!chained)
+ break;
+ BUG_ON(!ri);
+ }
+
+ utask->return_instances = ri;
+
+ return true;
+}
+
+bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
+{
+ return false;
+}
+
+/*
+ * Run handler and ask thread to singlestep.
+ * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
+ */
+static void handle_swbp(struct pt_regs *regs)
+{
+ struct uprobe *uprobe;
+ unsigned long bp_vaddr;
+ int uninitialized_var(is_swbp);
+
+ bp_vaddr = uprobe_get_swbp_addr(regs);
+ if (bp_vaddr == get_trampoline_vaddr()) {
+ if (handle_trampoline(regs))
+ return;
+
+ pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ }
+
+ uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+ if (!uprobe) {
+ if (is_swbp > 0) {
+ /* No matching uprobe; signal SIGTRAP. */
+ send_sig(SIGTRAP, current, 0);
+ } else {
+ /*
+ * Either we raced with uprobe_unregister() or we can't
+ * access this memory. The latter is only possible if
+ * another thread plays with our ->mm. In both cases
+ * we can simply restart. If this vma was unmapped we
+ * can pretend this insn was not executed yet and get
+ * the (correct) SIGSEGV after restart.
+ */
+ instruction_pointer_set(regs, bp_vaddr);
+ }
+ return;
+ }
+
+ /* change it in advance for ->handler() and restart */
+ instruction_pointer_set(regs, bp_vaddr);
+
+ /*
+ * TODO: move copy_insn/etc into _register and remove this hack.
+ * After we hit the bp, _unregister + _register can install the
+ * new and not-yet-analyzed uprobe at the same address, restart.
+ */
+ smp_rmb(); /* pairs with wmb() in install_breakpoint() */
+ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
+ goto out;
+
+ /* Tracing handlers use ->utask to communicate with fetch methods */
+ if (!get_utask())
+ goto out;
+
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ goto out;
+
+ handler_chain(uprobe, regs);
+
+ if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
+ goto out;
+
+ if (!pre_ssout(uprobe, regs, bp_vaddr))
+ return;
+
+ /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
+out:
+ put_uprobe(uprobe);
+}
+
+/*
+ * Perform required fix-ups and disable singlestep.
+ * Allow pending signals to take effect.
+ */
+static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
+{
+ struct uprobe *uprobe;
+ int err = 0;
+
+ uprobe = utask->active_uprobe;
+ if (utask->state == UTASK_SSTEP_ACK)
+ err = arch_uprobe_post_xol(&uprobe->arch, regs);
+ else if (utask->state == UTASK_SSTEP_TRAPPED)
+ arch_uprobe_abort_xol(&uprobe->arch, regs);
+ else
+ WARN_ON_ONCE(1);
+
+ put_uprobe(uprobe);
+ utask->active_uprobe = NULL;
+ utask->state = UTASK_RUNNING;
+ xol_free_insn_slot(current);
+
+ spin_lock_irq(&current->sighand->siglock);
+ recalc_sigpending(); /* see uprobe_deny_signal() */
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (unlikely(err)) {
+ uprobe_warn(current, "execute the probed insn, sending SIGILL.");
+ force_sig_info(SIGILL, SEND_SIG_FORCED, current);
+ }
+}
+
+/*
+ * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
+ * allows the thread to return from interrupt. After that handle_swbp()
+ * sets utask->active_uprobe.
+ *
+ * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
+ * and allows the thread to return from interrupt.
+ *
+ * While returning to userspace, thread notices the TIF_UPROBE flag and calls
+ * uprobe_notify_resume().
+ */
+void uprobe_notify_resume(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+
+ clear_thread_flag(TIF_UPROBE);
+
+ utask = current->utask;
+ if (utask && utask->active_uprobe)
+ handle_singlestep(utask, regs);
+ else
+ handle_swbp(regs);
+}
+
+/*
+ * uprobe_pre_sstep_notifier gets called from interrupt context as part of
+ * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
+ */
+int uprobe_pre_sstep_notifier(struct pt_regs *regs)
+{
+ if (!current->mm)
+ return 0;
+
+ if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+ (!current->utask || !current->utask->return_instances))
+ return 0;
+
+ set_thread_flag(TIF_UPROBE);
+ return 1;
+}
+
+/*
+ * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
+ * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
+ */
+int uprobe_post_sstep_notifier(struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (!current->mm || !utask || !utask->active_uprobe)
+ /* task is currently not uprobed */
+ return 0;
+
+ utask->state = UTASK_SSTEP_ACK;
+ set_thread_flag(TIF_UPROBE);
+ return 1;
+}
+
+static struct notifier_block uprobe_exception_nb = {
+ .notifier_call = arch_uprobe_exception_notify,
+ .priority = INT_MAX-1, /* notified after kprobes, kgdb */
+};
+
+static int __init init_uprobes(void)
+{
+ int i;
+
+ for (i = 0; i < UPROBES_HASH_SZ; i++)
+ mutex_init(&uprobes_mmap_mutex[i]);
+
+ if (percpu_init_rwsem(&dup_mmap_sem))
+ return -ENOMEM;
+
+ return register_die_notifier(&uprobe_exception_nb);
+}
+__initcall(init_uprobes);
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c35452cadde..83d4382f569 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain;
static DEFINE_RWLOCK(exec_domains_lock);
-static u_long ident_map[32] = {
+static unsigned long ident_map[32] = {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
@@ -37,7 +37,7 @@ static u_long ident_map[32] = {
struct exec_domain default_exec_domain = {
.name = "Linux", /* name */
.handler = default_handler, /* lcall7 causes a seg fault. */
- .pers_low = 0, /* PER_LINUX personality. */
+ .pers_low = 0, /* PER_LINUX personality. */
.pers_high = 0, /* PER_LINUX personality. */
.signal_map = ident_map, /* Identity map signals. */
.signal_invmap = ident_map, /* - both ways. */
@@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp)
}
static struct exec_domain *
-lookup_exec_domain(u_long personality)
+lookup_exec_domain(unsigned int personality)
{
- struct exec_domain * ep;
- u_long pers = personality(personality);
+ unsigned int pers = personality(personality);
+ struct exec_domain *ep;
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next) {
@@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality)
#ifdef CONFIG_MODULES
read_unlock(&exec_domains_lock);
- request_module("personality-%ld", pers);
+ request_module("personality-%d", pers);
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next) {
@@ -83,7 +83,7 @@ lookup_exec_domain(u_long personality)
ep = &default_exec_domain;
out:
read_unlock(&exec_domains_lock);
- return (ep);
+ return ep;
}
int
@@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep)
out:
write_unlock(&exec_domains_lock);
- return (err);
+ return err;
}
+EXPORT_SYMBOL(register_exec_domain);
int
unregister_exec_domain(struct exec_domain *ep)
@@ -133,26 +134,19 @@ unregister:
write_unlock(&exec_domains_lock);
return 0;
}
+EXPORT_SYMBOL(unregister_exec_domain);
-int
-__set_personality(u_long personality)
+int __set_personality(unsigned int personality)
{
- struct exec_domain *ep, *oep;
-
- ep = lookup_exec_domain(personality);
- if (ep == current_thread_info()->exec_domain) {
- current->personality = personality;
- module_put(ep->module);
- return 0;
- }
+ struct exec_domain *oep = current_thread_info()->exec_domain;
+ current_thread_info()->exec_domain = lookup_exec_domain(personality);
current->personality = personality;
- oep = current_thread_info()->exec_domain;
- current_thread_info()->exec_domain = ep;
-
module_put(oep->module);
+
return 0;
}
+EXPORT_SYMBOL(__set_personality);
#ifdef CONFIG_PROC_FS
static int execdomains_proc_show(struct seq_file *m, void *v)
@@ -188,20 +182,12 @@ static int __init proc_execdomains_init(void)
module_init(proc_execdomains_init);
#endif
-SYSCALL_DEFINE1(personality, u_long, personality)
+SYSCALL_DEFINE1(personality, unsigned int, personality)
{
- u_long old = current->personality;
+ unsigned int old = current->personality;
- if (personality != 0xffffffff) {
+ if (personality != 0xffffffff)
set_personality(personality);
- if (current->personality != personality)
- return -EINVAL;
- }
- return (long)old;
+ return old;
}
-
-
-EXPORT_SYMBOL(register_exec_domain);
-EXPORT_SYMBOL(unregister_exec_domain);
-EXPORT_SYMBOL(__set_personality);
diff --git a/kernel/exit.c b/kernel/exit.c
index 546774a31a6..e5c4668f179 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -20,6 +20,7 @@
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
@@ -31,7 +32,6 @@
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
-#include <linux/freezer.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
@@ -50,28 +50,31 @@
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
+#include <linux/oom.h>
+#include <linux/writeback.h>
+#include <linux/shm.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
-#include "cred-internals.h"
static void exit_mm(struct task_struct * tsk);
-static void __unhash_process(struct task_struct *p)
+static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
detach_pid(p, PIDTYPE_PID);
- if (thread_group_leader(p)) {
+ if (group_dead) {
detach_pid(p, PIDTYPE_PGID);
detach_pid(p, PIDTYPE_SID);
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
- __get_cpu_var(process_counts)--;
+ __this_cpu_dec(process_counts);
}
list_del_rcu(&p->thread_group);
+ list_del_rcu(&p->thread_node);
}
/*
@@ -80,23 +83,34 @@ static void __unhash_process(struct task_struct *p)
static void __exit_signal(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
+ bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
+ struct tty_struct *uninitialized_var(tty);
+ cputime_t utime, stime;
- BUG_ON(!sig);
- BUG_ON(!atomic_read(&sig->count));
-
- sighand = rcu_dereference(tsk->sighand);
+ sighand = rcu_dereference_check(tsk->sighand,
+ lockdep_tasklist_lock_is_held());
spin_lock(&sighand->siglock);
posix_cpu_timers_exit(tsk);
- if (atomic_dec_and_test(&sig->count))
+ if (group_dead) {
posix_cpu_timers_exit_group(tsk);
- else {
+ tty = sig->tty;
+ sig->tty = NULL;
+ } else {
+ /*
+ * This can only happen if the caller is de_thread().
+ * FIXME: this is the temporary hack, we should teach
+ * posix-cpu-timers to handle this case correctly.
+ */
+ if (unlikely(has_group_leader_pid(tsk)))
+ posix_cpu_timers_exit_group(tsk);
+
/*
* If there is any task waiting for the group exit
* then notify it:
*/
- if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count)
+ if (sig->notify_count > 0 && !--sig->notify_count)
wake_up_process(sig->group_exit_task);
if (tsk == sig->curr_target)
@@ -111,9 +125,10 @@ static void __exit_signal(struct task_struct *tsk)
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
- sig->utime = cputime_add(sig->utime, tsk->utime);
- sig->stime = cputime_add(sig->stime, tsk->stime);
- sig->gtime = cputime_add(sig->gtime, tsk->gtime);
+ task_cputime(tsk, &utime, &stime);
+ sig->utime += utime;
+ sig->stime += stime;
+ sig->gtime += task_gtime(tsk);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
@@ -122,32 +137,24 @@ static void __exit_signal(struct task_struct *tsk)
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
- sig = NULL; /* Marker for below. */
}
- __unhash_process(tsk);
+ sig->nr_threads--;
+ __unhash_process(tsk, group_dead);
/*
* Do this under ->siglock, we can race with another thread
* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
*/
flush_sigqueue(&tsk->pending);
-
- tsk->signal = NULL;
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
- if (sig) {
+ if (group_dead) {
flush_sigqueue(&sig->shared_pending);
- taskstats_tgid_free(sig);
- /*
- * Make sure ->signal can't go away under rq->lock,
- * see account_group_exec_runtime().
- */
- task_rq_unlock_wait(tsk);
- __cleanup_signal(sig);
+ tty_kref_put(tty);
}
}
@@ -155,9 +162,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
-#ifdef CONFIG_PERF_EVENTS
- WARN_ON_ONCE(tsk->perf_event_ctxp);
-#endif
+ perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
put_task_struct(tsk);
}
@@ -168,15 +173,16 @@ void release_task(struct task_struct * p)
struct task_struct *leader;
int zap_leader;
repeat:
- tracehook_prepare_release_task(p);
/* don't need to get the RCU readlock here - the process is dead and
- * can't be modifying its own credentials */
+ * can't be modifying its own credentials. But shut RCU-lockdep up */
+ rcu_read_lock();
atomic_dec(&__task_cred(p)->user->processes);
+ rcu_read_unlock();
proc_flush_task(p);
write_lock_irq(&tasklist_lock);
- tracehook_finish_release_task(p);
+ ptrace_release_task(p);
__exit_signal(p);
/*
@@ -187,22 +193,12 @@ repeat:
zap_leader = 0;
leader = p->group_leader;
if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
- BUG_ON(task_detached(leader));
- do_notify_parent(leader, leader->exit_signal);
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
* then we are the one who should release the leader.
- *
- * do_notify_parent() will have marked it self-reaping in
- * that case.
- */
- zap_leader = task_detached(leader);
-
- /*
- * This maintains the invariant that release_task()
- * only runs on a task in EXIT_DEAD, just for sanity.
*/
+ zap_leader = do_notify_parent(leader, leader->exit_signal);
if (zap_leader)
leader->exit_state = EXIT_DEAD;
}
@@ -274,18 +270,16 @@ int is_current_pgrp_orphaned(void)
return retval;
}
-static int has_stopped_jobs(struct pid *pgrp)
+static bool has_stopped_jobs(struct pid *pgrp)
{
- int retval = 0;
struct task_struct *p;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
- if (!task_is_stopped(p))
- continue;
- retval = 1;
- break;
+ if (p->signal->flags & SIGNAL_STOP_STOPPED)
+ return true;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
- return retval;
+
+ return false;
}
/*
@@ -319,264 +313,30 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
-/**
- * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
- *
- * If a kernel thread is launched as a result of a system call, or if
- * it ever exits, it should generally reparent itself to kthreadd so it
- * isn't in the way of other processes and is correctly cleaned up on exit.
- *
- * The various task state such as scheduling policy and priority may have
- * been inherited from a user process, so we reset them to sane values here.
- *
- * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
- */
-static void reparent_to_kthreadd(void)
-{
- write_lock_irq(&tasklist_lock);
-
- ptrace_unlink(current);
- /* Reparent to init */
- current->real_parent = current->parent = kthreadd_task;
- list_move_tail(&current->sibling, &current->real_parent->children);
-
- /* Set the exit signal to SIGCHLD so we signal init on exit */
- current->exit_signal = SIGCHLD;
-
- if (task_nice(current) < 0)
- set_user_nice(current, 0);
- /* cpus_allowed? */
- /* rt_priority? */
- /* signals? */
- memcpy(current->signal->rlim, init_task.signal->rlim,
- sizeof(current->signal->rlim));
-
- atomic_inc(&init_cred.usage);
- commit_creds(&init_cred);
- write_unlock_irq(&tasklist_lock);
-}
-
-void __set_special_pids(struct pid *pid)
-{
- struct task_struct *curr = current->group_leader;
-
- if (task_session(curr) != pid)
- change_pid(curr, PIDTYPE_SID, pid);
-
- if (task_pgrp(curr) != pid)
- change_pid(curr, PIDTYPE_PGID, pid);
-}
-
-static void set_special_pids(struct pid *pid)
-{
- write_lock_irq(&tasklist_lock);
- __set_special_pids(pid);
- write_unlock_irq(&tasklist_lock);
-}
-
-/*
- * Let kernel threads use this to say that they allow a certain signal.
- * Must not be used if kthread was cloned with CLONE_SIGHAND.
- */
-int allow_signal(int sig)
-{
- if (!valid_signal(sig) || sig < 1)
- return -EINVAL;
-
- spin_lock_irq(&current->sighand->siglock);
- /* This is only needed for daemonize()'ed kthreads */
- sigdelset(&current->blocked, sig);
- /*
- * Kernel threads handle their own signals. Let the signal code
- * know it'll be handled, so that they don't get converted to
- * SIGKILL or just silently dropped.
- */
- current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 0;
-}
-
-EXPORT_SYMBOL(allow_signal);
-
-int disallow_signal(int sig)
-{
- if (!valid_signal(sig) || sig < 1)
- return -EINVAL;
-
- spin_lock_irq(&current->sighand->siglock);
- current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 0;
-}
-
-EXPORT_SYMBOL(disallow_signal);
-
+#ifdef CONFIG_MEMCG
/*
- * Put all the gunge required to become a kernel thread without
- * attached user resources in one place where it belongs.
+ * A task is exiting. If it owned this mm, find a new owner for the mm.
*/
-
-void daemonize(const char *name, ...)
-{
- va_list args;
- sigset_t blocked;
-
- va_start(args, name);
- vsnprintf(current->comm, sizeof(current->comm), name, args);
- va_end(args);
-
- /*
- * If we were started as result of loading a module, close all of the
- * user space pages. We don't need them, and if we didn't close them
- * they would be locked into memory.
- */
- exit_mm(current);
- /*
- * We don't want to have TIF_FREEZE set if the system-wide hibernation
- * or suspend transition begins right now.
- */
- current->flags |= (PF_NOFREEZE | PF_KTHREAD);
-
- if (current->nsproxy != &init_nsproxy) {
- get_nsproxy(&init_nsproxy);
- switch_task_namespaces(current, &init_nsproxy);
- }
- set_special_pids(&init_struct_pid);
- proc_clear_tty(current);
-
- /* Block and flush all signals */
- sigfillset(&blocked);
- sigprocmask(SIG_BLOCK, &blocked, NULL);
- flush_signals(current);
-
- /* Become as one with the init task */
-
- daemonize_fs_struct();
- exit_files(current);
- current->files = init_task.files;
- atomic_inc(&current->files->count);
-
- reparent_to_kthreadd();
-}
-
-EXPORT_SYMBOL(daemonize);
-
-static void close_files(struct files_struct * files)
+void mm_update_next_owner(struct mm_struct *mm)
{
- int i, j;
- struct fdtable *fdt;
-
- j = 0;
+ struct task_struct *c, *g, *p = current;
+retry:
/*
- * It is safe to dereference the fd table without RCU or
- * ->file_lock because this is the last reference to the
- * files structure.
+ * If the exiting or execing task is not the owner, it's
+ * someone else's problem.
*/
- fdt = files_fdtable(files);
- for (;;) {
- unsigned long set;
- i = j * __NFDBITS;
- if (i >= fdt->max_fds)
- break;
- set = fdt->open_fds->fds_bits[j++];
- while (set) {
- if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
- if (file) {
- filp_close(file, files);
- cond_resched();
- }
- }
- i++;
- set >>= 1;
- }
- }
-}
-
-struct files_struct *get_files_struct(struct task_struct *task)
-{
- struct files_struct *files;
-
- task_lock(task);
- files = task->files;
- if (files)
- atomic_inc(&files->count);
- task_unlock(task);
-
- return files;
-}
-
-void put_files_struct(struct files_struct *files)
-{
- struct fdtable *fdt;
-
- if (atomic_dec_and_test(&files->count)) {
- close_files(files);
- /*
- * Free the fd and fdset arrays if we expanded them.
- * If the fdtable was embedded, pass files for freeing
- * at the end of the RCU grace period. Otherwise,
- * you can free files immediately.
- */
- fdt = files_fdtable(files);
- if (fdt != &files->fdtab)
- kmem_cache_free(files_cachep, files);
- free_fdtable(fdt);
- }
-}
-
-void reset_files_struct(struct files_struct *files)
-{
- struct task_struct *tsk = current;
- struct files_struct *old;
-
- old = tsk->files;
- task_lock(tsk);
- tsk->files = files;
- task_unlock(tsk);
- put_files_struct(old);
-}
-
-void exit_files(struct task_struct *tsk)
-{
- struct files_struct * files = tsk->files;
-
- if (files) {
- task_lock(tsk);
- tsk->files = NULL;
- task_unlock(tsk);
- put_files_struct(files);
- }
-}
-
-#ifdef CONFIG_MM_OWNER
-/*
- * Task p is exiting and it owned mm, lets find a new owner for it
- */
-static inline int
-mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
-{
+ if (mm->owner != p)
+ return;
/*
- * If there are other users of the mm and the owner (us) is exiting
- * we need to find a new owner to take on the responsibility.
+ * The current owner is exiting/execing and there are no other
+ * candidates. Do not leave the mm pointing to a possibly
+ * freed task structure.
*/
- if (atomic_read(&mm->mm_users) <= 1)
- return 0;
- if (mm->owner != p)
- return 0;
- return 1;
-}
-
-void mm_update_next_owner(struct mm_struct *mm)
-{
- struct task_struct *c, *g, *p = current;
-
-retry:
- if (!mm_need_new_owner(mm, p))
+ if (atomic_read(&mm->mm_users) <= 1) {
+ mm->owner = NULL;
return;
+ }
read_lock(&tasklist_lock);
/*
@@ -596,14 +356,18 @@ retry:
}
/*
- * Search through everything else. We should not get
- * here often
+ * Search through everything else, we should not get here often.
*/
- do_each_thread(g, c) {
- if (c->mm == mm)
- goto assign_new_owner;
- } while_each_thread(g, c);
-
+ for_each_process(g) {
+ if (g->flags & PF_KTHREAD)
+ continue;
+ for_each_thread(g, c) {
+ if (c->mm == mm)
+ goto assign_new_owner;
+ if (c->mm)
+ break;
+ }
+ }
read_unlock(&tasklist_lock);
/*
* We found no owner yet mm_users > 1: this implies that we are
@@ -635,7 +399,7 @@ assign_new_owner:
task_unlock(c);
put_task_struct(c);
}
-#endif /* CONFIG_MM_OWNER */
+#endif /* CONFIG_MEMCG */
/*
* Turn us into a lazy TLB process if we
@@ -649,6 +413,7 @@ static void exit_mm(struct task_struct * tsk)
mm_release(tsk, mm);
if (!mm)
return;
+ sync_mm_rss(mm);
/*
* Serialize with any possible pending coredump.
* We must hold mmap_sem around checking core_state
@@ -675,7 +440,7 @@ static void exit_mm(struct task_struct * tsk)
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
- schedule();
+ freezable_schedule();
}
__set_task_state(tsk, TASK_RUNNING);
down_read(&mm->mmap_sem);
@@ -687,21 +452,21 @@ static void exit_mm(struct task_struct * tsk)
tsk->mm = NULL;
up_read(&mm->mmap_sem);
enter_lazy_tlb(mm, current);
- /* We don't want this task to be frozen prematurely */
- clear_freeze_flag(tsk);
task_unlock(tsk);
mm_update_next_owner(mm);
mmput(mm);
}
/*
- * When we die, we re-parent all our children.
- * Try to give them to another thread in our thread
- * group, and if no such member exists, give it to
- * the child reaper process (ie "init") in our pid
- * space.
+ * When we die, we re-parent all our children, and try to:
+ * 1. give them to another thread in our thread group, if such a member exists
+ * 2. give it to the first ancestor process which prctl'd itself as a
+ * child_subreaper for its children (like a service manager)
+ * 3. give it to the init process (PID 1) in our pid namespace
*/
static struct task_struct *find_new_reaper(struct task_struct *father)
+ __releases(&tasklist_lock)
+ __acquires(&tasklist_lock)
{
struct pid_namespace *pid_ns = task_active_pid_ns(father);
struct task_struct *thread;
@@ -717,17 +482,37 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
if (unlikely(pid_ns->child_reaper == father)) {
write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns))
- panic("Attempted to kill init!");
+ if (unlikely(pid_ns == &init_pid_ns)) {
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ father->signal->group_exit_code ?:
+ father->exit_code);
+ }
zap_pid_ns_processes(pid_ns);
write_lock_irq(&tasklist_lock);
+ } else if (father->signal->has_child_subreaper) {
+ struct task_struct *reaper;
+
/*
- * We can not clear ->child_reaper or leave it alone.
- * There may by stealth EXIT_DEAD tasks on ->children,
- * forget_original_parent() must move them somewhere.
+ * Find the first ancestor marked as child_subreaper.
+ * Note that the code below checks same_thread_group(reaper,
+ * pid_ns->child_reaper). This is what we need to DTRT in a
+ * PID namespace. However we still need the check above, see
+ * http://marc.info/?l=linux-kernel&m=131385460420380
*/
- pid_ns->child_reaper = init_pid_ns.child_reaper;
+ for (reaper = father->real_parent;
+ reaper != &init_task;
+ reaper = reaper->real_parent) {
+ if (same_thread_group(reaper, pid_ns->child_reaper))
+ break;
+ if (!reaper->signal->is_child_subreaper)
+ continue;
+ thread = reaper;
+ do {
+ if (!(thread->flags & PF_EXITING))
+ return reaper;
+ } while_each_thread(reaper, thread);
+ }
}
return pid_ns->child_reaper;
@@ -741,7 +526,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
{
list_move_tail(&p->sibling, &p->real_parent->children);
- if (task_detached(p))
+ if (p->exit_state == EXIT_DEAD)
return;
/*
* If this is a threaded reparent there is no need to
@@ -750,14 +535,13 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
if (same_thread_group(p->real_parent, father))
return;
- /* We don't want people slaying init. */
+ /* We don't want people slaying init. */
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
- if (!task_ptrace(p) &&
+ if (!p->ptrace &&
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
- do_notify_parent(p, p->exit_signal);
- if (task_detached(p)) {
+ if (do_notify_parent(p, p->exit_signal)) {
p->exit_state = EXIT_DEAD;
list_move_tail(&p->sibling, dead);
}
@@ -771,9 +555,12 @@ static void forget_original_parent(struct task_struct *father)
struct task_struct *p, *n, *reaper;
LIST_HEAD(dead_children);
- exit_ptrace(father);
-
write_lock_irq(&tasklist_lock);
+ /*
+ * Note that exit_ptrace() and find_new_reaper() might
+ * drop tasklist_lock and reacquire it.
+ */
+ exit_ptrace(father);
reaper = find_new_reaper(father);
list_for_each_entry_safe(p, n, &father->children, sibling) {
@@ -781,7 +568,7 @@ static void forget_original_parent(struct task_struct *father)
do {
t->real_parent = reaper;
if (t->parent == father) {
- BUG_ON(task_ptrace(t));
+ BUG_ON(t->ptrace);
t->parent = t->real_parent;
}
if (t->pdeath_signal)
@@ -806,8 +593,7 @@ static void forget_original_parent(struct task_struct *father)
*/
static void exit_notify(struct task_struct *tsk, int group_dead)
{
- int signal;
- void *cookie;
+ bool autoreap;
/*
* This does two things:
@@ -818,49 +604,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
*/
forget_original_parent(tsk);
- exit_task_namespaces(tsk);
write_lock_irq(&tasklist_lock);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
- /* Let father know we died
- *
- * Thread signals are configurable, but you aren't going to use
- * that to send signals to arbitary processes.
- * That stops right now.
- *
- * If the parent exec id doesn't match the exec id we saved
- * when we started then we know the parent has changed security
- * domain.
- *
- * If our self_exec id doesn't match our parent_exec_id then
- * we have changed execution domain as these two values started
- * the same after a fork.
- */
- if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
- (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
- tsk->self_exec_id != tsk->parent_exec_id))
- tsk->exit_signal = SIGCHLD;
-
- signal = tracehook_notify_death(tsk, &cookie, group_dead);
- if (signal >= 0)
- signal = do_notify_parent(tsk, signal);
+ if (unlikely(tsk->ptrace)) {
+ int sig = thread_group_leader(tsk) &&
+ thread_group_empty(tsk) &&
+ !ptrace_reparented(tsk) ?
+ tsk->exit_signal : SIGCHLD;
+ autoreap = do_notify_parent(tsk, sig);
+ } else if (thread_group_leader(tsk)) {
+ autoreap = thread_group_empty(tsk) &&
+ do_notify_parent(tsk, tsk->exit_signal);
+ } else {
+ autoreap = true;
+ }
- tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
+ tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
- /* mt-exec, de_thread() is waiting for us */
- if (thread_group_leader(tsk) &&
- tsk->signal->group_exit_task &&
- tsk->signal->notify_count < 0)
+ /* mt-exec, de_thread() is waiting for group leader */
+ if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
-
write_unlock_irq(&tasklist_lock);
- tracehook_report_death(tsk, signal, cookie, group_dead);
-
/* If the process is dead, release it - nobody will wait for it */
- if (signal == DEATH_REAP)
+ if (autoreap)
release_task(tsk);
}
@@ -878,9 +648,9 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
- "left\n",
- current->comm, free);
+ printk(KERN_WARNING "%s (%d) used greatest stack depth: "
+ "%lu bytes left\n",
+ current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
@@ -889,21 +659,30 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
-NORET_TYPE void do_exit(long code)
+void do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
profile_task_exit(tsk);
- WARN_ON(atomic_read(&tsk->fs_excl));
+ WARN_ON(blk_needs_flush_plug(tsk));
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
- tracehook_report_exit(&code);
+ /*
+ * If do_exit is called because this processes oopsed, it's possible
+ * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
+ * continuing. Amongst other possible reasons, this is to prevent
+ * mm_release()->clear_child_tid() from writing to a user-controlled
+ * kernel address.
+ */
+ set_fs(USER_DS);
+
+ ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
@@ -928,8 +707,6 @@ NORET_TYPE void do_exit(long code)
schedule();
}
- exit_irq_thread();
-
exit_signals(tsk); /* sets PF_EXITING */
/*
* tsk->flags are checked in the futex code to protect against
@@ -944,7 +721,9 @@ NORET_TYPE void do_exit(long code)
preempt_count());
acct_update_integrals(tsk);
-
+ /* sync mm's RSS info before statistics gathering */
+ if (tsk->mm)
+ sync_mm_rss(tsk->mm);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer);
@@ -955,8 +734,7 @@ NORET_TYPE void do_exit(long code)
acct_collect(code, group_dead);
if (group_dead)
tty_audit_exit();
- if (unlikely(tsk->audit_context))
- audit_free(tsk);
+ audit_free(tsk);
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
@@ -968,33 +746,39 @@ NORET_TYPE void do_exit(long code)
trace_sched_process_exit(tsk);
exit_sem(tsk);
+ exit_shm(tsk);
exit_files(tsk);
exit_fs(tsk);
- check_stack_usage();
- exit_thread();
- cgroup_exit(tsk, 1);
-
if (group_dead)
disassociate_ctty(1);
+ exit_task_namespaces(tsk);
+ exit_task_work(tsk);
+ exit_thread();
- module_put(task_thread_info(tsk)->exec_domain->module);
+ /*
+ * Flush inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ *
+ * because of cgroup mode, must be called before cgroup_exit()
+ */
+ perf_event_exit_task(tsk);
- proc_exit_connector(tsk);
+ cgroup_exit(tsk);
+
+ module_put(task_thread_info(tsk)->exec_domain->module);
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
flush_ptrace_hw_breakpoint(tsk);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- */
- perf_event_exit_task(tsk);
exit_notify(tsk, group_dead);
+ proc_exit_connector(tsk);
#ifdef CONFIG_NUMA
+ task_lock(tsk);
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
+ task_unlock(tsk);
#endif
#ifdef CONFIG_FUTEX
if (unlikely(current->pi_state_cache))
@@ -1003,7 +787,7 @@ NORET_TYPE void do_exit(long code)
/*
* Make sure we are holding no locks:
*/
- debug_check_no_locks_held(tsk);
+ debug_check_no_locks_held();
/*
* We can do this unlocked here. The futex code uses this flag
* just to verify whether the pi state cleanup has been done
@@ -1015,14 +799,37 @@ NORET_TYPE void do_exit(long code)
exit_io_context(tsk);
if (tsk->splice_pipe)
- __free_pipe_info(tsk->splice_pipe);
+ free_pipe_info(tsk->splice_pipe);
+
+ if (tsk->task_frag.page)
+ put_page(tsk->task_frag.page);
validate_creds_for_do_exit(tsk);
+ check_stack_usage();
preempt_disable();
+ if (tsk->nr_dirtied)
+ __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
+
+ /*
+ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+ * when the following two conditions become true.
+ * - There is race condition of mmap_sem (It is acquired by
+ * exit_mm()), and
+ * - SMI occurs before setting TASK_RUNINNG.
+ * (or hypervisor of virtual machine switches to other guest)
+ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
+ *
+ * To avoid it, we have to wait for releasing tsk->pi_lock which
+ * is held by try_to_wake_up()
+ */
+ smp_mb();
+ raw_spin_unlock_wait(&tsk->pi_lock);
+
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
+ tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
schedule();
BUG();
/* Avoid "noreturn function does return". */
@@ -1032,7 +839,7 @@ NORET_TYPE void do_exit(long code)
EXPORT_SYMBOL_GPL(do_exit);
-NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+void complete_and_exit(struct completion *comp, long code)
{
if (comp)
complete(comp);
@@ -1051,7 +858,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
*/
-NORET_TYPE void
+void
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
@@ -1172,7 +979,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
unsigned long state;
int retval, status, traced;
pid_t pid = task_pid_vnr(p);
- uid_t uid = __task_cred(p)->uid;
+ uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
struct siginfo __user *infop;
if (!likely(wo->wo_flags & WEXITED))
@@ -1180,7 +987,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
if (unlikely(wo->wo_flags & WNOWAIT)) {
int exit_code = p->exit_code;
- int why, status;
+ int why;
get_task_struct(p);
read_unlock(&tasklist_lock);
@@ -1194,22 +1001,18 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
return wait_noreap_copyout(wo, p, pid, uid, why, status);
}
+ traced = ptrace_reparented(p);
/*
- * Try to move the task's state to DEAD
- * only one thread is allowed to do this:
+ * Move the task's state to DEAD/TRACE, only one thread can do this.
*/
- state = xchg(&p->exit_state, EXIT_DEAD);
- if (state != EXIT_ZOMBIE) {
- BUG_ON(state != EXIT_DEAD);
+ state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;
+ if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
return 0;
- }
-
- traced = ptrace_reparented(p);
/*
* It can be ptraced but not reparented, check
- * !task_detached() to filter out sub-threads.
+ * thread_group_leader() to filter out sub-threads.
*/
- if (likely(!traced) && likely(!task_detached(p))) {
+ if (likely(!traced) && thread_group_leader(p)) {
struct signal_struct *psig;
struct signal_struct *sig;
unsigned long maxrss;
@@ -1230,27 +1033,17 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* as other threads in the parent group can be right
* here reaping other children at the same time.
*
- * We use thread_group_times() to get times for the thread
+ * We use thread_group_cputime_adjusted() to get times for the thread
* group, which consolidates times for all threads in the
* group including the group leader.
*/
- thread_group_times(p, &tgutime, &tgstime);
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
spin_lock_irq(&p->real_parent->sighand->siglock);
psig = p->real_parent->signal;
sig = p->signal;
- psig->cutime =
- cputime_add(psig->cutime,
- cputime_add(tgutime,
- sig->cutime));
- psig->cstime =
- cputime_add(psig->cstime,
- cputime_add(tgstime,
- sig->cstime));
- psig->cgtime =
- cputime_add(psig->cgtime,
- cputime_add(p->gtime,
- cputime_add(sig->gtime,
- sig->cgtime)));
+ psig->cutime += tgutime + sig->cutime;
+ psig->cstime += tgstime + sig->cstime;
+ psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
@@ -1275,7 +1068,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
/*
* Now we are sure this task is interesting, and no other
- * thread can reap it because we set its state to EXIT_DEAD.
+ * thread can reap it because we its state == DEAD/TRACE.
*/
read_unlock(&tasklist_lock);
@@ -1312,25 +1105,19 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
if (!retval)
retval = pid;
- if (traced) {
+ if (state == EXIT_TRACE) {
write_lock_irq(&tasklist_lock);
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);
- /*
- * If this is not a detached task, notify the parent.
- * If it's still not detached after that, don't release
- * it now.
- */
- if (!task_detached(p)) {
- do_notify_parent(p, p->exit_signal);
- if (!task_detached(p)) {
- p->exit_state = EXIT_ZOMBIE;
- p = NULL;
- }
- }
+
+ /* If parent wants a zombie, don't release it now */
+ state = EXIT_ZOMBIE;
+ if (do_notify_parent(p, p->exit_signal))
+ state = EXIT_DEAD;
+ p->exit_state = state;
write_unlock_irq(&tasklist_lock);
}
- if (p != NULL)
+ if (state == EXIT_DEAD)
release_task(p);
return retval;
@@ -1339,7 +1126,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
- if (task_is_stopped_or_traced(p))
+ if (task_is_stopped_or_traced(p) &&
+ !(p->jobctl & JOBCTL_LISTENING))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
@@ -1348,11 +1136,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
return NULL;
}
-/*
- * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
- * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
- * the lock and this task is uninteresting. If we return nonzero, we have
- * released the lock and the system call should return.
+/**
+ * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
+ * @wo: wait options
+ * @ptrace: is the wait for ptrace
+ * @p: task to wait for
+ *
+ * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
+ *
+ * CONTEXT:
+ * read_lock(&tasklist_lock), which is released if return value is
+ * non-zero. Also, grabs and releases @p->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 if wait condition didn't exist and search for other wait conditions
+ * should continue. Non-zero return, -errno on failure and @p's pid on
+ * success, implies that tasklist_lock is released and wait condition
+ * search should terminate.
*/
static int wait_task_stopped(struct wait_opts *wo,
int ptrace, struct task_struct *p)
@@ -1368,6 +1168,9 @@ static int wait_task_stopped(struct wait_opts *wo,
if (!ptrace && !(wo->wo_flags & WUNTRACED))
return 0;
+ if (!task_stopped_code(p, ptrace))
+ return 0;
+
exit_code = 0;
spin_lock_irq(&p->sighand->siglock);
@@ -1382,8 +1185,7 @@ static int wait_task_stopped(struct wait_opts *wo,
if (!unlikely(wo->wo_flags & WNOWAIT))
*p_code = 0;
- /* don't need the RCU readlock here as we're holding a spinlock */
- uid = __task_cred(p)->uid;
+ uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
spin_unlock_irq(&p->sighand->siglock);
if (!exit_code)
@@ -1456,7 +1258,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
}
if (!unlikely(wo->wo_flags & WNOWAIT))
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
- uid = __task_cred(p)->uid;
+ uid = from_kuid_munged(current_user_ns(), task_uid(p));
spin_unlock_irq(&p->sighand->siglock);
pid = task_pid_vnr(p);
@@ -1492,7 +1294,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
{
- int ret = eligible_child(wo, p);
+ int ret;
+
+ if (unlikely(p->exit_state == EXIT_DEAD))
+ return 0;
+
+ ret = eligible_child(wo, p);
if (!ret)
return ret;
@@ -1510,33 +1317,88 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
return 0;
}
- if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+ if (unlikely(p->exit_state == EXIT_TRACE)) {
/*
- * This child is hidden by ptrace.
- * We aren't allowed to see it now, but eventually we will.
+ * ptrace == 0 means we are the natural parent. In this case
+ * we should clear notask_error, debugger will notify us.
*/
- wo->notask_error = 0;
+ if (likely(!ptrace))
+ wo->notask_error = 0;
return 0;
}
- if (p->exit_state == EXIT_DEAD)
- return 0;
+ if (likely(!ptrace) && unlikely(p->ptrace)) {
+ /*
+ * If it is traced by its real parent's group, just pretend
+ * the caller is ptrace_do_wait() and reap this child if it
+ * is zombie.
+ *
+ * This also hides group stop state from real parent; otherwise
+ * a single stop can be reported twice as group and ptrace stop.
+ * If a ptracer wants to distinguish these two events for its
+ * own children it should create a separate process which takes
+ * the role of real parent.
+ */
+ if (!ptrace_reparented(p))
+ ptrace = 1;
+ }
+
+ /* slay zombie? */
+ if (p->exit_state == EXIT_ZOMBIE) {
+ /* we don't reap group leaders with subthreads */
+ if (!delay_group_leader(p)) {
+ /*
+ * A zombie ptracee is only visible to its ptracer.
+ * Notification and reaping will be cascaded to the
+ * real parent when the ptracer detaches.
+ */
+ if (unlikely(ptrace) || likely(!p->ptrace))
+ return wait_task_zombie(wo, p);
+ }
+
+ /*
+ * Allow access to stopped/continued state via zombie by
+ * falling through. Clearing of notask_error is complex.
+ *
+ * When !@ptrace:
+ *
+ * If WEXITED is set, notask_error should naturally be
+ * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
+ * so, if there are live subthreads, there are events to
+ * wait for. If all subthreads are dead, it's still safe
+ * to clear - this function will be called again in finite
+ * amount time once all the subthreads are released and
+ * will then return without clearing.
+ *
+ * When @ptrace:
+ *
+ * Stopped state is per-task and thus can't change once the
+ * target task dies. Only continued and exited can happen.
+ * Clear notask_error if WCONTINUED | WEXITED.
+ */
+ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
+ wo->notask_error = 0;
+ } else {
+ /*
+ * @p is alive and it's gonna stop, continue or exit, so
+ * there always is something to wait for.
+ */
+ wo->notask_error = 0;
+ }
/*
- * We don't reap group leaders with subthreads.
+ * Wait for stopped. Depending on @ptrace, different stopped state
+ * is used and the two don't interact with each other.
*/
- if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
- return wait_task_zombie(wo, p);
+ ret = wait_task_stopped(wo, ptrace, p);
+ if (ret)
+ return ret;
/*
- * It's stopped or running now, so it might
- * later continue, exit, or stop again.
+ * Wait for continued. There's only one continued state and the
+ * ptracer can consume it which can confuse the real parent. Don't
+ * use WCONTINUED from ptracer. You don't need or want it.
*/
- wo->notask_error = 0;
-
- if (task_stopped_code(p, ptrace))
- return wait_task_stopped(wo, ptrace, p);
-
return wait_task_continued(wo, p);
}
@@ -1716,9 +1578,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
}
put_pid(pid);
-
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(5, ret, which, upid, infop, options, ru);
return ret;
}
@@ -1756,8 +1615,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
ret = do_wait(&wo);
put_pid(pid);
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
return ret;
}
diff --git a/kernel/extable.c b/kernel/extable.c
index 7f8f263f852..d8a6446adbc 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex);
extern struct exception_table_entry __start___ex_table[];
extern struct exception_table_entry __stop___ex_table[];
+/* Cleared by build time tools if the table is already sorted. */
+u32 __initdata __visible main_extable_sort_needed = 1;
+
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
{
- sort_extable(__start___ex_table, __stop___ex_table);
+ if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
+ pr_notice("Sorting __ex_table...\n");
+ sort_extable(__start___ex_table, __stop___ex_table);
+ }
}
/* Given an address, look for it in the exception tables. */
@@ -55,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
static inline int init_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext &&
- addr <= (unsigned long)_einittext)
+ addr < (unsigned long)_einittext)
return 1;
return 0;
}
@@ -63,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
int core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
- addr <= (unsigned long)_etext)
+ addr < (unsigned long)_etext)
return 1;
if (system_state == SYSTEM_BOOTING &&
@@ -72,6 +78,24 @@ int core_kernel_text(unsigned long addr)
return 0;
}
+/**
+ * core_kernel_data - tell if addr points to kernel data
+ * @addr: address to test
+ *
+ * Returns true if @addr passed in is from the core kernel data
+ * section.
+ *
+ * Note: On some archs it may return true for core RODATA, and false
+ * for others. But will always be true for core RW data.
+ */
+int core_kernel_data(unsigned long addr)
+{
+ if (addr >= (unsigned long)_sdata &&
+ addr < (unsigned long)_edata)
+ return 1;
+ return 0;
+}
+
int __kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
diff --git a/kernel/fork.c b/kernel/fork.c
index f88bd984df3..6a13c46cd87 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,18 +28,21 @@
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
+#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
-#include <linux/tracehook.h>
#include <linux/futex.h>
#include <linux/compat.h>
+#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
@@ -47,6 +50,7 @@
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
+#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
@@ -58,13 +62,18 @@
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
-#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
+#include <linux/oom.h>
+#include <linux/khugepaged.h>
+#include <linux/signalfd.h>
+#include <linux/uprobes.h>
+#include <linux/aio.h>
+#include <linux/compiler.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -75,11 +84,14 @@
#include <trace/events/sched.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/task.h>
+
/*
* Protected counters by write_lock_irq(&tasklist_lock)
*/
unsigned long total_forks; /* Handle normal Linux uptimes. */
-int nr_threads; /* The idle threads do not count.. */
+int nr_threads; /* The idle threads do not count.. */
int max_threads; /* tunable limit on nr_threads */
@@ -87,6 +99,14 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
+#ifdef CONFIG_PROVE_RCU
+int lockdep_tasklist_lock_is_held(void)
+{
+ return lockdep_is_held(&tasklist_lock);
+}
+EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
+#endif /* #ifdef CONFIG_PROVE_RCU */
+
int nr_processes(void)
{
int cpu;
@@ -98,27 +118,69 @@ int nr_processes(void)
return total;
}
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
-# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
+void __weak arch_release_task_struct(struct task_struct *tsk)
+{
+}
+
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
-#endif
-#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+static inline struct task_struct *alloc_task_struct_node(int node)
{
-#ifdef CONFIG_DEBUG_STACK_USAGE
- gfp_t mask = GFP_KERNEL | __GFP_ZERO;
-#else
- gfp_t mask = GFP_KERNEL;
+ return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
+}
+
+static inline void free_task_struct(struct task_struct *tsk)
+{
+ kmem_cache_free(task_struct_cachep, tsk);
+}
#endif
- return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+
+void __weak arch_release_thread_info(struct thread_info *ti)
+{
+}
+
+#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmemcache based allocator.
+ */
+# if THREAD_SIZE >= PAGE_SIZE
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ int node)
+{
+ struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
+
+ return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
{
- free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+}
+# else
+static struct kmem_cache *thread_info_cache;
+
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ int node)
+{
+ return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
}
+
+static void free_thread_info(struct thread_info *ti)
+{
+ kmem_cache_free(thread_info_cache, ti);
+}
+
+void thread_info_cache_init(void)
+{
+ thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+ THREAD_SIZE, 0, NULL);
+ BUG_ON(thread_info_cache == NULL);
+}
+# endif
#endif
/* SLAB cache for signal_struct structures (tsk->signal) */
@@ -148,39 +210,52 @@ static void account_kernel_stack(struct thread_info *ti, int account)
void free_task(struct task_struct *tsk)
{
- prop_local_destroy_single(&tsk->dirties);
account_kernel_stack(tsk->stack, -1);
+ arch_release_thread_info(tsk->stack);
free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
+ put_seccomp_filter(tsk);
+ arch_release_task_struct(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
+static inline void free_signal_struct(struct signal_struct *sig)
+{
+ taskstats_tgid_free(sig);
+ sched_autogroup_exit(sig);
+ kmem_cache_free(signal_cachep, sig);
+}
+
+static inline void put_signal_struct(struct signal_struct *sig)
+{
+ if (atomic_dec_and_test(&sig->sigcnt))
+ free_signal_struct(sig);
+}
+
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ task_numa_free(tsk);
+ security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
+ put_signal_struct(tsk->signal);
if (!profile_handoff_task(tsk))
free_task(tsk);
}
+EXPORT_SYMBOL_GPL(__put_task_struct);
-/*
- * macro override instead of weak attribute alias, to workaround
- * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
- */
-#ifndef arch_task_cache_init
-#define arch_task_cache_init()
-#endif
+void __init __weak arch_task_cache_init(void) { }
void __init fork_init(unsigned long mempages)
{
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
#endif
@@ -203,7 +278,7 @@ void __init fork_init(unsigned long mempages)
/*
* we need to allow at least 20 threads to boot a system
*/
- if(max_threads < 20)
+ if (max_threads < 20)
max_threads = 20;
init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
@@ -212,7 +287,7 @@ void __init fork_init(unsigned long mempages)
init_task.signal->rlim[RLIMIT_NPROC];
}
-int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
+int __weak arch_dup_task_struct(struct task_struct *dst,
struct task_struct *src)
{
*dst = *src;
@@ -224,33 +299,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
struct task_struct *tsk;
struct thread_info *ti;
unsigned long *stackend;
-
+ int node = tsk_fork_get_node(orig);
int err;
- prepare_to_copy(orig);
-
- tsk = alloc_task_struct();
+ tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;
- ti = alloc_thread_info(tsk);
- if (!ti) {
- free_task_struct(tsk);
- return NULL;
- }
+ ti = alloc_thread_info_node(tsk, node);
+ if (!ti)
+ goto free_tsk;
- err = arch_dup_task_struct(tsk, orig);
+ err = arch_dup_task_struct(tsk, orig);
if (err)
- goto out;
+ goto free_ti;
tsk->stack = ti;
- err = prop_local_init_single(&tsk->dirties);
- if (err)
- goto out;
-
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
+ clear_tsk_need_resched(tsk);
stackend = end_of_stack(tsk);
*stackend = STACK_END_MAGIC; /* for overflow detection */
@@ -258,20 +326,24 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
tsk->stack_canary = get_random_int();
#endif
- /* One for us, one for whoever does the "release_task()" (usually parent) */
- atomic_set(&tsk->usage,2);
- atomic_set(&tsk->fs_excl, 0);
+ /*
+ * One for us, one for whoever does the "release_task()" (usually
+ * parent)
+ */
+ atomic_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
tsk->splice_pipe = NULL;
+ tsk->task_frag.page = NULL;
account_kernel_stack(ti, 1);
return tsk;
-out:
+free_ti:
free_thread_info(ti);
+free_tsk:
free_task_struct(tsk);
return NULL;
}
@@ -279,14 +351,15 @@ out:
#ifdef CONFIG_MMU
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
- struct vm_area_struct *mpnt, *tmp, **pprev;
+ struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
- struct mempolicy *pol;
+ uprobe_start_dup_mmap();
down_write(&oldmm->mmap_sem);
flush_cache_dup_mm(oldmm);
+ uprobe_dup_mmap(oldmm, mm);
/*
* Not linked in yet - no deadlock potential:
*/
@@ -294,9 +367,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mm->locked_vm = 0;
mm->mmap = NULL;
- mm->mmap_cache = NULL;
- mm->free_area_cache = oldmm->mmap_base;
- mm->cached_hole_size = ~0UL;
+ mm->vmacache_seqnum = 0;
mm->map_count = 0;
cpumask_clear(mm_cpumask(mm));
mm->mm_rb = RB_ROOT;
@@ -306,21 +377,24 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
+ retval = khugepaged_fork(mm, oldmm);
+ if (retval)
+ goto out;
+ prev = NULL;
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
- long pages = vma_pages(mpnt);
- mm->total_vm -= pages;
vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
- -pages);
+ -vma_pages(mpnt));
continue;
}
charge = 0;
if (mpnt->vm_flags & VM_ACCOUNT) {
- unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
- if (security_vm_enough_memory(len))
+ unsigned long len = vma_pages(mpnt);
+
+ if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
goto fail_nomem;
charge = len;
}
@@ -328,32 +402,36 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
if (!tmp)
goto fail_nomem;
*tmp = *mpnt;
- pol = mpol_dup(vma_policy(mpnt));
- retval = PTR_ERR(pol);
- if (IS_ERR(pol))
+ INIT_LIST_HEAD(&tmp->anon_vma_chain);
+ retval = vma_dup_policy(mpnt, tmp);
+ if (retval)
goto fail_nomem_policy;
- vma_set_policy(tmp, pol);
- tmp->vm_flags &= ~VM_LOCKED;
tmp->vm_mm = mm;
- tmp->vm_next = NULL;
- anon_vma_link(tmp);
+ if (anon_vma_fork(tmp, mpnt))
+ goto fail_nomem_anon_vma_fork;
+ tmp->vm_flags &= ~VM_LOCKED;
+ tmp->vm_next = tmp->vm_prev = NULL;
file = tmp->vm_file;
if (file) {
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
if (tmp->vm_flags & VM_SHARED)
mapping->i_mmap_writable++;
- tmp->vm_truncate_count = mpnt->vm_truncate_count;
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
- vma_prio_tree_add(tmp, mpnt);
+ if (unlikely(tmp->vm_flags & VM_NONLINEAR))
+ vma_nonlinear_insert(tmp,
+ &mapping->i_mmap_nonlinear);
+ else
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
/*
@@ -369,6 +447,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
*/
*pprev = tmp;
pprev = &tmp->vm_next;
+ tmp->vm_prev = prev;
+ prev = tmp;
__vma_link_rb(mm, tmp, rb_link, rb_parent);
rb_link = &tmp->vm_rb.rb_right;
@@ -390,7 +470,10 @@ out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
up_write(&oldmm->mmap_sem);
+ uprobe_end_dup_mmap();
return retval;
+fail_nomem_anon_vma_fork:
+ mpol_put(vma_policy(tmp));
fail_nomem_policy:
kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
@@ -399,7 +482,7 @@ fail_nomem:
goto out;
}
-static inline int mm_alloc_pgd(struct mm_struct * mm)
+static inline int mm_alloc_pgd(struct mm_struct *mm)
{
mm->pgd = pgd_alloc(mm);
if (unlikely(!mm->pgd))
@@ -407,7 +490,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm)
return 0;
}
-static inline void mm_free_pgd(struct mm_struct * mm)
+static inline void mm_free_pgd(struct mm_struct *mm)
{
pgd_free(mm, mm->pgd);
}
@@ -440,30 +523,33 @@ static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
spin_lock_init(&mm->ioctx_lock);
- INIT_HLIST_HEAD(&mm->ioctx_list);
+ mm->ioctx_table = NULL;
#endif
}
-static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
- mm->flags = (current->mm) ?
- (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
mm->core_state = NULL;
- mm->nr_ptes = 0;
- set_mm_counter(mm, file_rss, 0);
- set_mm_counter(mm, anon_rss, 0);
+ atomic_long_set(&mm->nr_ptes, 0);
+ memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = ~0UL;
mm_init_aio(mm);
mm_init_owner(mm, p);
+ clear_tlb_flush_pending(mm);
- if (likely(!mm_alloc_pgd(mm))) {
+ if (current->mm) {
+ mm->flags = current->mm->flags & MMF_INIT_MASK;
+ mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
+ } else {
+ mm->flags = default_dump_filter;
mm->def_flags = 0;
+ }
+
+ if (likely(!mm_alloc_pgd(mm))) {
mmu_notifier_mm_init(mm);
return mm;
}
@@ -472,19 +558,37 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
return NULL;
}
+static void check_mm(struct mm_struct *mm)
+{
+ int i;
+
+ for (i = 0; i < NR_MM_COUNTERS; i++) {
+ long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+ if (unlikely(x))
+ printk(KERN_ALERT "BUG: Bad rss-counter state "
+ "mm:%p idx:%d val:%ld\n", mm, i, x);
+ }
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ VM_BUG_ON(mm->pmd_huge_pte);
+#endif
+}
+
/*
* Allocate and initialize an mm_struct.
*/
-struct mm_struct * mm_alloc(void)
+struct mm_struct *mm_alloc(void)
{
- struct mm_struct * mm;
+ struct mm_struct *mm;
mm = allocate_mm();
- if (mm) {
- memset(mm, 0, sizeof(*mm));
- mm = mm_init(mm, current);
- }
- return mm;
+ if (!mm)
+ return NULL;
+
+ memset(mm, 0, sizeof(*mm));
+ mm_init_cpumask(mm);
+ return mm_init(mm, current);
}
/*
@@ -498,6 +602,7 @@ void __mmdrop(struct mm_struct *mm)
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
+ check_mm(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -510,8 +615,10 @@ void mmput(struct mm_struct *mm)
might_sleep();
if (atomic_dec_and_test(&mm->mm_users)) {
+ uprobe_clear_state(mm);
exit_aio(mm);
ksm_exit(mm);
+ khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
@@ -519,7 +626,6 @@ void mmput(struct mm_struct *mm)
list_del(&mm->mmlist);
spin_unlock(&mmlist_lock);
}
- put_swap_token(mm);
if (mm->binfmt)
module_put(mm->binfmt->module);
mmdrop(mm);
@@ -527,6 +633,35 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
+void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+ if (new_exe_file)
+ get_file(new_exe_file);
+ if (mm->exe_file)
+ fput(mm->exe_file);
+ mm->exe_file = new_exe_file;
+}
+
+struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+ struct file *exe_file;
+
+ /* We need mmap_sem to protect against races with removal of exe_file */
+ down_read(&mm->mmap_sem);
+ exe_file = mm->exe_file;
+ if (exe_file)
+ get_file(exe_file);
+ up_read(&mm->mmap_sem);
+ return exe_file;
+}
+
+static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+ /* It's safe to write the exe_file pointer without exe_file_lock because
+ * this is called during fork when the task is not yet in /proc */
+ newmm->exe_file = get_mm_exe_file(oldmm);
+}
+
/**
* get_task_mm - acquire a reference to the task's mm
*
@@ -553,6 +688,58 @@ struct mm_struct *get_task_mm(struct task_struct *task)
}
EXPORT_SYMBOL_GPL(get_task_mm);
+struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
+{
+ struct mm_struct *mm;
+ int err;
+
+ err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ if (err)
+ return ERR_PTR(err);
+
+ mm = get_task_mm(task);
+ if (mm && mm != current->mm &&
+ !ptrace_may_access(task, mode)) {
+ mmput(mm);
+ mm = ERR_PTR(-EACCES);
+ }
+ mutex_unlock(&task->signal->cred_guard_mutex);
+
+ return mm;
+}
+
+static void complete_vfork_done(struct task_struct *tsk)
+{
+ struct completion *vfork;
+
+ task_lock(tsk);
+ vfork = tsk->vfork_done;
+ if (likely(vfork)) {
+ tsk->vfork_done = NULL;
+ complete(vfork);
+ }
+ task_unlock(tsk);
+}
+
+static int wait_for_vfork_done(struct task_struct *child,
+ struct completion *vfork)
+{
+ int killed;
+
+ freezer_do_not_count();
+ killed = wait_for_completion_killable(vfork);
+ freezer_count();
+
+ if (killed) {
+ task_lock(child);
+ child->vfork_done = NULL;
+ task_unlock(child);
+ }
+
+ put_task_struct(child);
+ return killed;
+}
+
/* Please note the differences between mmput and mm_release.
* mmput is called whenever we stop holding onto a mm_struct,
* error success whatever.
@@ -568,8 +755,6 @@ EXPORT_SYMBOL_GPL(get_task_mm);
*/
void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
- struct completion *vfork_done = tsk->vfork_done;
-
/* Get rid of any futexes when releasing the mm */
#ifdef CONFIG_FUTEX
if (unlikely(tsk->robust_list)) {
@@ -586,20 +771,17 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
exit_pi_state_list(tsk);
#endif
+ uprobe_free_utask(tsk);
+
/* Get rid of any cached register state */
deactivate_mm(tsk, mm);
- /* notify parent sleeping on vfork() */
- if (vfork_done) {
- tsk->vfork_done = NULL;
- complete(vfork_done);
- }
-
/*
* If we're exiting normally, clear a user-space tid field if
* requested. We leave this alone when dying by signal, to leave
* the value intact in a core dump, and to save the unnecessary
- * trouble otherwise. Userland only wants this done for a sys_exit.
+ * trouble, say, a killed vfork parent shouldn't touch this mm.
+ * Userland only wants this done for a sys_exit.
*/
if (tsk->clear_child_tid) {
if (!(tsk->flags & PF_SIGNALED) &&
@@ -614,30 +796,34 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
}
tsk->clear_child_tid = NULL;
}
+
+ /*
+ * All done, finally we can wake up parent and return this mm to him.
+ * Also kthread_stop() uses this completion for synchronization.
+ */
+ if (tsk->vfork_done)
+ complete_vfork_done(tsk);
}
/*
* Allocate a new mm structure and copy contents from the
* mm structure of the passed in task structure.
*/
-struct mm_struct *dup_mm(struct task_struct *tsk)
+static struct mm_struct *dup_mm(struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm = current->mm;
int err;
- if (!oldmm)
- return NULL;
-
mm = allocate_mm();
if (!mm)
goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm));
+ mm_init_cpumask(mm);
- /* Initializing for Swap token stuff */
- mm->token_priority = 0;
- mm->last_interval = 0;
-
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ mm->pmd_huge_pte = NULL;
+#endif
if (!mm_init(mm, tsk))
goto fail_nomem;
@@ -676,9 +862,9 @@ fail_nocontext:
return NULL;
}
-static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
+static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
- struct mm_struct * mm, *oldmm;
+ struct mm_struct *mm, *oldmm;
int retval;
tsk->min_flt = tsk->maj_flt = 0;
@@ -699,6 +885,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
if (!oldmm)
return 0;
+ /* initialize the new vmacache entries */
+ vmacache_flush(tsk);
+
if (clone_flags & CLONE_VM) {
atomic_inc(&oldmm->mm_users);
mm = oldmm;
@@ -711,10 +900,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
goto fail_nomem;
good_mm:
- /* Initializing for Swap token stuff */
- mm->token_priority = 0;
- mm->last_interval = 0;
-
tsk->mm = mm;
tsk->active_mm = mm;
return 0;
@@ -728,13 +913,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
/* tsk->fs is already what we want */
- write_lock(&fs->lock);
+ spin_lock(&fs->lock);
if (fs->in_exec) {
- write_unlock(&fs->lock);
+ spin_unlock(&fs->lock);
return -EAGAIN;
}
fs->users++;
- write_unlock(&fs->lock);
+ spin_unlock(&fs->lock);
return 0;
}
tsk->fs = copy_fs_struct(fs);
@@ -743,7 +928,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
{
struct files_struct *oldf, *newf;
int error = 0;
@@ -774,6 +959,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
#ifdef CONFIG_BLOCK
struct io_context *ioc = current->io_context;
+ struct io_context *new_ioc;
if (!ioc)
return 0;
@@ -781,15 +967,15 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
* Share io context with parent, if CLONE_IO is set
*/
if (clone_flags & CLONE_IO) {
- tsk->io_context = ioc_task_link(ioc);
- if (unlikely(!tsk->io_context))
- return -ENOMEM;
+ ioc_task_link(ioc);
+ tsk->io_context = ioc;
} else if (ioprio_valid(ioc->ioprio)) {
- tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
- if (unlikely(!tsk->io_context))
+ new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
+ if (unlikely(!new_ioc))
return -ENOMEM;
- tsk->io_context->ioprio = ioc->ioprio;
+ new_ioc->ioprio = ioc->ioprio;
+ put_io_context(new_ioc);
}
#endif
return 0;
@@ -814,8 +1000,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
void __cleanup_sighand(struct sighand_struct *sighand)
{
- if (atomic_dec_and_test(&sighand->count))
+ if (atomic_dec_and_test(&sighand->count)) {
+ signalfd_cleanup(sighand);
kmem_cache_free(sighand_cachep, sighand);
+ }
}
@@ -824,23 +1012,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
*/
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
+ unsigned long cpu_limit;
+
/* Thread group counters. */
thread_group_cputime_init(sig);
- /* Expiration times and increments. */
- sig->it[CPUCLOCK_PROF].expires = cputime_zero;
- sig->it[CPUCLOCK_PROF].incr = cputime_zero;
- sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
- sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
-
- /* Cached expiration times. */
- sig->cputime_expires.prof_exp = cputime_zero;
- sig->cputime_expires.virt_exp = cputime_zero;
- sig->cputime_expires.sched_exp = 0;
-
- if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
- sig->cputime_expires.prof_exp =
- secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+ cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ if (cpu_limit != RLIM_INFINITY) {
+ sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
sig->cputimer.running = 1;
}
@@ -857,77 +1036,49 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
if (clone_flags & CLONE_THREAD)
return 0;
- sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
+ sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
tsk->signal = sig;
if (!sig)
return -ENOMEM;
- atomic_set(&sig->count, 1);
+ sig->nr_threads = 1;
atomic_set(&sig->live, 1);
+ atomic_set(&sig->sigcnt, 1);
+
+ /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
+ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
+ tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
+
init_waitqueue_head(&sig->wait_chldexit);
- sig->flags = 0;
- if (clone_flags & CLONE_NEWPID)
- sig->flags |= SIGNAL_UNKILLABLE;
- sig->group_exit_code = 0;
- sig->group_exit_task = NULL;
- sig->group_stop_count = 0;
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- sig->it_real_incr.tv64 = 0;
sig->real_timer.function = it_real_fn;
- sig->leader = 0; /* session leadership doesn't inherit */
- sig->tty_old_pgrp = NULL;
- sig->tty = NULL;
-
- sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
- sig->gtime = cputime_zero;
- sig->cgtime = cputime_zero;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- sig->prev_utime = sig->prev_stime = cputime_zero;
-#endif
- sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
- sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
- sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
- sig->maxrss = sig->cmaxrss = 0;
- task_io_accounting_init(&sig->ioac);
- sig->sum_sched_runtime = 0;
- taskstats_tgid_init(sig);
-
task_lock(current->group_leader);
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
task_unlock(current->group_leader);
posix_cpu_timers_init_group(sig);
- acct_init_pacct(&sig->pacct);
-
tty_audit_fork(sig);
+ sched_autogroup_fork(sig);
- sig->oom_adj = current->signal->oom_adj;
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->group_rwsem);
+#endif
- return 0;
-}
+ sig->oom_score_adj = current->signal->oom_score_adj;
+ sig->oom_score_adj_min = current->signal->oom_score_adj_min;
-void __cleanup_signal(struct signal_struct *sig)
-{
- thread_group_cputime_free(sig);
- tty_kref_put(sig->tty);
- kmem_cache_free(signal_cachep, sig);
-}
+ sig->has_child_subreaper = current->signal->has_child_subreaper ||
+ current->signal->is_child_subreaper;
-static void copy_flags(unsigned long clone_flags, struct task_struct *p)
-{
- unsigned long new_flags = p->flags;
+ mutex_init(&sig->cred_guard_mutex);
- new_flags &= ~PF_SUPERPRIV;
- new_flags |= PF_FORKNOEXEC;
- new_flags |= PF_STARTING;
- p->flags = new_flags;
- clear_freeze_flag(p);
+ return 0;
}
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -941,31 +1092,39 @@ static void rt_mutex_init_task(struct task_struct *p)
{
raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
- plist_head_init_raw(&p->pi_waiters, &p->pi_lock);
+ p->pi_waiters = RB_ROOT;
+ p->pi_waiters_leftmost = NULL;
p->pi_blocked_on = NULL;
+ p->pi_top_task = NULL;
#endif
}
-#ifdef CONFIG_MM_OWNER
+#ifdef CONFIG_MEMCG
void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
mm->owner = p;
}
-#endif /* CONFIG_MM_OWNER */
+#endif /* CONFIG_MEMCG */
/*
* Initialize POSIX timer handling for a single task.
*/
static void posix_cpu_timers_init(struct task_struct *tsk)
{
- tsk->cputime_expires.prof_exp = cputime_zero;
- tsk->cputime_expires.virt_exp = cputime_zero;
+ tsk->cputime_expires.prof_exp = 0;
+ tsk->cputime_expires.virt_exp = 0;
tsk->cputime_expires.sched_exp = 0;
INIT_LIST_HEAD(&tsk->cpu_timers[0]);
INIT_LIST_HEAD(&tsk->cpu_timers[1]);
INIT_LIST_HEAD(&tsk->cpu_timers[2]);
}
+static inline void
+init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
+{
+ task->pids[type].pid = pid;
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -976,7 +1135,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
*/
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
- struct pt_regs *regs,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
@@ -984,11 +1142,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
- int cgroup_callbacks_done = 0;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
+ if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
@@ -1014,6 +1174,18 @@ static struct task_struct *copy_process(unsigned long clone_flags,
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
+ /*
+ * If the new process will be in a different pid or user namespace
+ * do not allow it to share a thread group or signal handlers or
+ * parent with the forking task.
+ */
+ if (clone_flags & CLONE_SIGHAND) {
+ if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
+ (task_active_pid_ns(current) !=
+ current->nsproxy->pid_ns_for_children))
+ return ERR_PTR(-EINVAL);
+ }
+
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;
@@ -1024,6 +1196,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto fork_out;
ftrace_graph_init_task(p);
+ get_seccomp_filter(p);
rt_mutex_init_task(p);
@@ -1033,11 +1206,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#endif
retval = -EAGAIN;
if (atomic_read(&p->real_cred->user->processes) >=
- p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
- if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
- p->real_cred->user != INIT_USER)
+ task_rlimit(p, RLIMIT_NPROC)) {
+ if (p->real_cred->user != INIT_USER &&
+ !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free;
}
+ current->flags &= ~PF_NPROC_EXCEEDED;
retval = copy_creds(p, clone_flags);
if (retval < 0)
@@ -1055,9 +1229,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
- p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- copy_flags(clone_flags, p);
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+ p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
rcu_copy_process(p);
@@ -1066,14 +1240,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
init_sigpending(&p->pending);
- p->utime = cputime_zero;
- p->stime = cputime_zero;
- p->gtime = cputime_zero;
- p->utimescaled = cputime_zero;
- p->stimescaled = cputime_zero;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- p->prev_utime = cputime_zero;
- p->prev_stime = cputime_zero;
+ p->utime = p->stime = p->gtime = 0;
+ p->utimescaled = p->stimescaled = 0;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ p->prev_cputime.utime = p->prev_cputime.stime = 0;
+#endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ seqlock_init(&p->vtime_seqlock);
+ p->vtime_snap = 0;
+ p->vtime_snap_whence = VTIME_SLEEPING;
+#endif
+
+#if defined(SPLIT_RSS_COUNTING)
+ memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
p->default_timer_slack_ns = current->timer_slack_ns;
@@ -1083,29 +1262,30 @@ static struct task_struct *copy_process(unsigned long clone_flags,
posix_cpu_timers_init(p);
- p->lock_depth = -1; /* -1 = no lock */
do_posix_clock_monotonic_gettime(&p->start_time);
p->real_start_time = p->start_time;
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_change_begin(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
- if (IS_ERR(p->mempolicy)) {
- retval = PTR_ERR(p->mempolicy);
- p->mempolicy = NULL;
- goto bad_fork_cleanup_cgroup;
- }
- mpol_fix_fork_child_flag(p);
+ if (IS_ERR(p->mempolicy)) {
+ retval = PTR_ERR(p->mempolicy);
+ p->mempolicy = NULL;
+ goto bad_fork_cleanup_threadgroup_lock;
+ }
+#endif
+#ifdef CONFIG_CPUSETS
+ p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
+ p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
+ seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- p->hardirqs_enabled = 1;
-#else
p->hardirqs_enabled = 0;
-#endif
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
@@ -1127,74 +1307,70 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
p->memcg_batch.do_batch = 0;
p->memcg_batch.memcg = NULL;
#endif
-
- p->bts = NULL;
-
- p->stack_start = stack_start;
+#ifdef CONFIG_BCACHE
+ p->sequential_io = 0;
+ p->sequential_io_avg = 0;
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
- sched_fork(p, clone_flags);
+ retval = sched_fork(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_policy;
retval = perf_event_init_task(p);
if (retval)
goto bad_fork_cleanup_policy;
-
- if ((retval = audit_alloc(p)))
+ retval = audit_alloc(p);
+ if (retval)
goto bad_fork_cleanup_policy;
/* copy all the process information */
- if ((retval = copy_semundo(clone_flags, p)))
+ retval = copy_semundo(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_audit;
- if ((retval = copy_files(clone_flags, p)))
+ retval = copy_files(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_semundo;
- if ((retval = copy_fs(clone_flags, p)))
+ retval = copy_fs(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_files;
- if ((retval = copy_sighand(clone_flags, p)))
+ retval = copy_sighand(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_fs;
- if ((retval = copy_signal(clone_flags, p)))
+ retval = copy_signal(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_sighand;
- if ((retval = copy_mm(clone_flags, p)))
+ retval = copy_mm(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_signal;
- if ((retval = copy_namespaces(clone_flags, p)))
+ retval = copy_namespaces(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_mm;
- if ((retval = copy_io(clone_flags, p)))
+ retval = copy_io(clone_flags, p);
+ if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
+ retval = copy_thread(clone_flags, stack_start, stack_size, p);
if (retval)
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
retval = -ENOMEM;
- pid = alloc_pid(p->nsproxy->pid_ns);
+ pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (!pid)
goto bad_fork_cleanup_io;
-
- if (clone_flags & CLONE_NEWPID) {
- retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
- if (retval < 0)
- goto bad_fork_free_pid;
- }
- }
-
- p->pid = pid_nr(pid);
- p->tgid = p->pid;
- if (clone_flags & CLONE_THREAD)
- p->tgid = current->tgid;
-
- if (current->nsproxy != p->nsproxy) {
- retval = ns_cgroup_clone(p, pid);
- if (retval)
- goto bad_fork_free_pid;
}
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
*/
- p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+#ifdef CONFIG_BLOCK
+ p->plug = NULL;
+#endif
#ifdef CONFIG_FUTEX
p->robust_list = NULL;
#ifdef CONFIG_COMPAT
@@ -1221,24 +1397,32 @@ static struct task_struct *copy_process(unsigned long clone_flags,
clear_all_latency_tracing(p);
/* ok, now we should be set up.. */
- p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
+ p->pid = pid_nr(pid);
+ if (clone_flags & CLONE_THREAD) {
+ p->exit_signal = -1;
+ p->group_leader = current->group_leader;
+ p->tgid = current->tgid;
+ } else {
+ if (clone_flags & CLONE_PARENT)
+ p->exit_signal = current->group_leader->exit_signal;
+ else
+ p->exit_signal = (clone_flags & CSIGNAL);
+ p->group_leader = p;
+ p->tgid = p->pid;
+ }
+
+ p->nr_dirtied = 0;
+ p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+ p->dirty_paused_when = 0;
+
p->pdeath_signal = 0;
- p->exit_state = 0;
+ INIT_LIST_HEAD(&p->thread_group);
+ p->task_works = NULL;
/*
- * Ok, make it visible to the rest of the system.
- * We dont wake it up yet.
+ * Make it visible to the rest of the system, but dont wake it up yet.
+ * Need tasklist lock for parent etc handling!
*/
- p->group_leader = p;
- INIT_LIST_HEAD(&p->thread_group);
-
- /* Now that the task is set up, run cgroup callbacks if
- * necessary. We need to run them before the task is visible
- * on the tasklist. */
- cgroup_fork_callbacks(p);
- cgroup_callbacks_done = 1;
-
- /* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
/* CLONE_PARENT re-uses the old parent */
@@ -1259,7 +1443,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* it's process group.
* A fatal signal pending means that current will exit, so the new
* thread can't slip out of an OOM kill (or normal SIGKILL).
- */
+ */
recalc_sigpending();
if (signal_pending(current)) {
spin_unlock(&current->sighand->siglock);
@@ -1268,39 +1452,53 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_free_pid;
}
- if (clone_flags & CLONE_THREAD) {
- atomic_inc(&current->signal->count);
- atomic_inc(&current->signal->live);
- p->group_leader = current->group_leader;
- list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
- }
-
if (likely(p->pid)) {
- tracehook_finish_clone(p, clone_flags, trace);
+ ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
+ init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
- if (clone_flags & CLONE_NEWPID)
- p->nsproxy->pid_ns->child_reaper = p;
+ init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
+ init_task_pid(p, PIDTYPE_SID, task_session(current));
+
+ if (is_child_reaper(pid)) {
+ ns_of_pid(pid)->child_reaper = p;
+ p->signal->flags |= SIGNAL_UNKILLABLE;
+ }
p->signal->leader_pid = pid;
- tty_kref_put(p->signal->tty);
p->signal->tty = tty_kref_get(current->signal->tty);
- attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
- attach_pid(p, PIDTYPE_SID, task_session(current));
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
- __get_cpu_var(process_counts)++;
+ attach_pid(p, PIDTYPE_PGID);
+ attach_pid(p, PIDTYPE_SID);
+ __this_cpu_inc(process_counts);
+ } else {
+ current->signal->nr_threads++;
+ atomic_inc(&current->signal->live);
+ atomic_inc(&current->signal->sigcnt);
+ list_add_tail_rcu(&p->thread_group,
+ &p->group_leader->thread_group);
+ list_add_tail_rcu(&p->thread_node,
+ &p->signal->thread_head);
}
- attach_pid(p, PIDTYPE_PID, pid);
+ attach_pid(p, PIDTYPE_PID);
nr_threads++;
}
total_forks++;
spin_unlock(&current->sighand->siglock);
+ syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+
proc_fork_connector(p);
cgroup_post_fork(p);
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_change_end(current);
perf_event_fork(p);
+
+ trace_task_newtask(p, clone_flags);
+ uprobe_copy_process(p, clone_flags);
+
return p;
bad_fork_free_pid:
@@ -1316,7 +1514,7 @@ bad_fork_cleanup_mm:
mmput(p->mm);
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
- __cleanup_signal(p->signal);
+ free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
@@ -1331,9 +1529,10 @@ bad_fork_cleanup_policy:
perf_event_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
-bad_fork_cleanup_cgroup:
+bad_fork_cleanup_threadgroup_lock:
#endif
- cgroup_exit(p, cgroup_callbacks_done);
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_change_end(current);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
@@ -1345,21 +1544,24 @@ fork_out:
return ERR_PTR(retval);
}
-noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
+static inline void init_idle_pids(struct pid_link *links)
{
- memset(regs, 0, sizeof(struct pt_regs));
- return regs;
+ enum pid_type type;
+
+ for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
+ INIT_HLIST_NODE(&links[type].node); /* not really needed */
+ links[type].pid = &init_struct_pid;
+ }
}
-struct task_struct * __cpuinit fork_idle(int cpu)
+struct task_struct *fork_idle(int cpu)
{
struct task_struct *task;
- struct pt_regs regs;
-
- task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
- &init_struct_pid, 0);
- if (!IS_ERR(task))
+ task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
+ if (!IS_ERR(task)) {
+ init_idle_pids(task->pids);
init_idle(task, cpu);
+ }
return task;
}
@@ -1372,7 +1574,6 @@ struct task_struct * __cpuinit fork_idle(int cpu)
*/
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
- struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
@@ -1382,44 +1583,24 @@ long do_fork(unsigned long clone_flags,
long nr;
/*
- * Do some preliminary argument and permissions checking before we
- * actually start allocating stuff
+ * Determine whether and which event to report to ptracer. When
+ * called from kernel_thread or CLONE_UNTRACED is explicitly
+ * requested, no event is reported; otherwise, report if the event
+ * for the type of forking is enabled.
*/
- if (clone_flags & CLONE_NEWUSER) {
- if (clone_flags & CLONE_THREAD)
- return -EINVAL;
- /* hopefully this check will go away when userns support is
- * complete
- */
- if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
- !capable(CAP_SETGID))
- return -EPERM;
- }
-
- /*
- * We hope to recycle these flags after 2.6.26
- */
- if (unlikely(clone_flags & CLONE_STOPPED)) {
- static int __read_mostly count = 100;
-
- if (count > 0 && printk_ratelimit()) {
- char comm[TASK_COMM_LEN];
+ if (!(clone_flags & CLONE_UNTRACED)) {
+ if (clone_flags & CLONE_VFORK)
+ trace = PTRACE_EVENT_VFORK;
+ else if ((clone_flags & CSIGNAL) != SIGCHLD)
+ trace = PTRACE_EVENT_CLONE;
+ else
+ trace = PTRACE_EVENT_FORK;
- count--;
- printk(KERN_INFO "fork(): process `%s' used deprecated "
- "clone flags 0x%lx\n",
- get_task_comm(comm, current),
- clone_flags & CLONE_STOPPED);
- }
+ if (likely(!ptrace_event_enabled(current, trace)))
+ trace = 0;
}
- /*
- * When called from kernel_thread, don't do user tracing stuff.
- */
- if (likely(user_mode(regs)))
- trace = tracehook_prepare_clone(clone_flags);
-
- p = copy_process(clone_flags, stack_start, regs, stack_size,
+ p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace);
/*
* Do this prior waking up the new thread - the thread pointer
@@ -1427,10 +1608,12 @@ long do_fork(unsigned long clone_flags,
*/
if (!IS_ERR(p)) {
struct completion vfork;
+ struct pid *pid;
trace_sched_process_fork(current, p);
- nr = task_pid_vnr(p);
+ pid = get_task_pid(p, PIDTYPE_PID);
+ nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr);
@@ -1438,45 +1621,84 @@ long do_fork(unsigned long clone_flags,
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
+ get_task_struct(p);
}
- audit_finish_fork(p);
- tracehook_report_clone(regs, clone_flags, nr, p);
+ wake_up_new_task(p);
- /*
- * We set PF_STARTING at creation in case tracing wants to
- * use this to distinguish a fully live task from one that
- * hasn't gotten to tracehook_report_clone() yet. Now we
- * clear it and set the child going.
- */
- p->flags &= ~PF_STARTING;
-
- if (unlikely(clone_flags & CLONE_STOPPED)) {
- /*
- * We'll start up with an immediate SIGSTOP.
- */
- sigaddset(&p->pending.signal, SIGSTOP);
- set_tsk_thread_flag(p, TIF_SIGPENDING);
- __set_task_state(p, TASK_STOPPED);
- } else {
- wake_up_new_task(p, clone_flags);
- }
-
- tracehook_report_clone_complete(trace, regs,
- clone_flags, nr, p);
+ /* forking complete and child started to run, tell ptracer */
+ if (unlikely(trace))
+ ptrace_event_pid(trace, pid);
if (clone_flags & CLONE_VFORK) {
- freezer_do_not_count();
- wait_for_completion(&vfork);
- freezer_count();
- tracehook_report_vfork_done(p, nr);
+ if (!wait_for_vfork_done(p, &vfork))
+ ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
+
+ put_pid(pid);
} else {
nr = PTR_ERR(p);
}
return nr;
}
+/*
+ * Create a kernel thread.
+ */
+pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+ return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
+ (unsigned long)arg, NULL, NULL);
+}
+
+#ifdef __ARCH_WANT_SYS_FORK
+SYSCALL_DEFINE0(fork)
+{
+#ifdef CONFIG_MMU
+ return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+#else
+ /* can not support in nommu mode */
+ return -EINVAL;
+#endif
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_VFORK
+SYSCALL_DEFINE0(vfork)
+{
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+ 0, NULL, NULL);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_CLONE
+#ifdef CONFIG_CLONE_BACKWARDS
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int __user *, parent_tidptr,
+ int, tls_val,
+ int __user *, child_tidptr)
+#elif defined(CONFIG_CLONE_BACKWARDS2)
+SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#elif defined(CONFIG_CLONE_BACKWARDS3)
+SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int, stack_size,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#else
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#endif
+{
+ return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+}
+#endif
+
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
@@ -1504,54 +1726,41 @@ void __init proc_caches_init(void)
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ /*
+ * FIXME! The "sizeof(struct mm_struct)" currently includes the
+ * whole struct cpumask for the OFFSTACK case. We could change
+ * this to *only* allocate as much of it as required by the
+ * maximum number of CPU's we can ever have. The cpumask_allocation
+ * is at the end of the structure, exactly for that reason.
+ */
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
mmap_init();
+ nsproxy_cache_init();
}
/*
- * Check constraints on flags passed to the unshare system call and
- * force unsharing of additional process context as appropriate.
+ * Check constraints on flags passed to the unshare system call.
*/
-static void check_unshare_flags(unsigned long *flags_ptr)
+static int check_unshare_flags(unsigned long unshare_flags)
{
+ if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+ CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
+ CLONE_NEWUSER|CLONE_NEWPID))
+ return -EINVAL;
/*
- * If unsharing a thread from a thread group, must also
- * unshare vm.
- */
- if (*flags_ptr & CLONE_THREAD)
- *flags_ptr |= CLONE_VM;
-
- /*
- * If unsharing vm, must also unshare signal handlers.
- */
- if (*flags_ptr & CLONE_VM)
- *flags_ptr |= CLONE_SIGHAND;
-
- /*
- * If unsharing signal handlers and the task was created
- * using CLONE_THREAD, then must unshare the thread
+ * Not implemented, but pretend it works if there is nothing to
+ * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
+ * needs to unshare vm.
*/
- if ((*flags_ptr & CLONE_SIGHAND) &&
- (atomic_read(&current->signal->count) > 1))
- *flags_ptr |= CLONE_THREAD;
-
- /*
- * If unsharing namespace, must also unshare filesystem information.
- */
- if (*flags_ptr & CLONE_NEWNS)
- *flags_ptr |= CLONE_FS;
-}
-
-/*
- * Unsharing of tasks created with CLONE_THREAD is not supported yet
- */
-static int unshare_thread(unsigned long unshare_flags)
-{
- if (unshare_flags & CLONE_THREAD)
- return -EINVAL;
+ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
+ /* FIXME: get_task_mm() increments ->mm_users */
+ if (atomic_read(&current->mm->mm_users) > 1)
+ return -EINVAL;
+ }
return 0;
}
@@ -1578,34 +1787,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
}
/*
- * Unsharing of sighand is not supported yet
- */
-static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
-{
- struct sighand_struct *sigh = current->sighand;
-
- if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
- return -EINVAL;
- else
- return 0;
-}
-
-/*
- * Unshare vm if it is being shared
- */
-static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
-{
- struct mm_struct *mm = current->mm;
-
- if ((unshare_flags & CLONE_VM) &&
- (mm && atomic_read(&mm->mm_users) > 1)) {
- return -EINVAL;
- }
-
- return 0;
-}
-
-/*
* Unshare file descriptor table if it is being shared
*/
static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1633,23 +1814,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
*/
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
- int err = 0;
struct fs_struct *fs, *new_fs = NULL;
- struct sighand_struct *new_sigh = NULL;
- struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
+ struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
+ int err;
- check_unshare_flags(&unshare_flags);
+ /*
+ * If unsharing a user namespace must also unshare the thread.
+ */
+ if (unshare_flags & CLONE_NEWUSER)
+ unshare_flags |= CLONE_THREAD | CLONE_FS;
+ /*
+ * If unsharing a thread from a thread group, must also unshare vm.
+ */
+ if (unshare_flags & CLONE_THREAD)
+ unshare_flags |= CLONE_VM;
+ /*
+ * If unsharing vm, must also unshare signal handlers.
+ */
+ if (unshare_flags & CLONE_VM)
+ unshare_flags |= CLONE_SIGHAND;
+ /*
+ * If unsharing namespace, must also unshare filesystem information.
+ */
+ if (unshare_flags & CLONE_NEWNS)
+ unshare_flags |= CLONE_FS;
- /* Return -EINVAL for all unsupported flags */
- err = -EINVAL;
- if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
- CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ err = check_unshare_flags(unshare_flags);
+ if (err)
goto bad_unshare_out;
-
/*
* CLONE_NEWIPC must also detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
@@ -1657,21 +1852,21 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
*/
if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
do_sysvsem = 1;
- if ((err = unshare_thread(unshare_flags)))
+ err = unshare_fs(unshare_flags, &new_fs);
+ if (err)
goto bad_unshare_out;
- if ((err = unshare_fs(unshare_flags, &new_fs)))
- goto bad_unshare_cleanup_thread;
- if ((err = unshare_sighand(unshare_flags, &new_sigh)))
+ err = unshare_fd(unshare_flags, &new_fd);
+ if (err)
goto bad_unshare_cleanup_fs;
- if ((err = unshare_vm(unshare_flags, &new_mm)))
- goto bad_unshare_cleanup_sigh;
- if ((err = unshare_fd(unshare_flags, &new_fd)))
- goto bad_unshare_cleanup_vm;
- if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
- new_fs)))
+ err = unshare_userns(unshare_flags, &new_cred);
+ if (err)
goto bad_unshare_cleanup_fd;
+ err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+ new_cred, new_fs);
+ if (err)
+ goto bad_unshare_cleanup_cred;
- if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
+ if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
if (do_sysvsem) {
/*
* CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1679,31 +1874,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
exit_sem(current);
}
- if (new_nsproxy) {
+ if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);
- new_nsproxy = NULL;
- }
task_lock(current);
if (new_fs) {
fs = current->fs;
- write_lock(&fs->lock);
+ spin_lock(&fs->lock);
current->fs = new_fs;
if (--fs->users)
new_fs = NULL;
else
new_fs = fs;
- write_unlock(&fs->lock);
- }
-
- if (new_mm) {
- mm = current->mm;
- active_mm = current->active_mm;
- current->mm = new_mm;
- current->active_mm = new_mm;
- activate_mm(active_mm, new_mm);
- new_mm = mm;
+ spin_unlock(&fs->lock);
}
if (new_fd) {
@@ -1713,29 +1897,25 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
}
task_unlock(current);
- }
- if (new_nsproxy)
- put_nsproxy(new_nsproxy);
+ if (new_cred) {
+ /* Install the new user namespace */
+ commit_creds(new_cred);
+ new_cred = NULL;
+ }
+ }
+bad_unshare_cleanup_cred:
+ if (new_cred)
+ put_cred(new_cred);
bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
-bad_unshare_cleanup_vm:
- if (new_mm)
- mmput(new_mm);
-
-bad_unshare_cleanup_sigh:
- if (new_sigh)
- if (atomic_dec_and_test(&new_sigh->count))
- kmem_cache_free(sighand_cachep, new_sigh);
-
bad_unshare_cleanup_fs:
if (new_fs)
free_fs_struct(new_fs);
-bad_unshare_cleanup_thread:
bad_unshare_out:
return err;
}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb..aa6a8aadb91 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -6,156 +6,177 @@
#include <linux/interrupt.h>
#include <linux/suspend.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/freezer.h>
+#include <linux/kthread.h>
+
+/* total number of freezing conditions in effect */
+atomic_t system_freezing_cnt = ATOMIC_INIT(0);
+EXPORT_SYMBOL(system_freezing_cnt);
+
+/* indicate whether PM freezing is in effect, protected by pm_mutex */
+bool pm_freezing;
+bool pm_nosig_freezing;
/*
- * freezing is complete, mark current process as frozen
+ * Temporary export for the deadlock workaround in ata_scsi_hotplug().
+ * Remove once the hack becomes unnecessary.
+ */
+EXPORT_SYMBOL_GPL(pm_freezing);
+
+/* protects freezing and frozen transitions */
+static DEFINE_SPINLOCK(freezer_lock);
+
+/**
+ * freezing_slow_path - slow path for testing whether a task needs to be frozen
+ * @p: task to be tested
+ *
+ * This function is called by freezing() if system_freezing_cnt isn't zero
+ * and tests whether @p needs to enter and stay in frozen state. Can be
+ * called under any context. The freezers are responsible for ensuring the
+ * target tasks see the updated state.
*/
-static inline void frozen_process(void)
+bool freezing_slow_path(struct task_struct *p)
{
- if (!unlikely(current->flags & PF_NOFREEZE)) {
- current->flags |= PF_FROZEN;
- wmb();
- }
- clear_freeze_flag(current);
+ if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
+ return false;
+
+ if (pm_nosig_freezing || cgroup_freezing(p))
+ return true;
+
+ if (pm_freezing && !(p->flags & PF_KTHREAD))
+ return true;
+
+ return false;
}
+EXPORT_SYMBOL(freezing_slow_path);
/* Refrigerator is place where frozen processes are stored :-). */
-void refrigerator(void)
+bool __refrigerator(bool check_kthr_stop)
{
/* Hmm, should we be allowed to suspend when there are realtime
processes around? */
- long save;
-
- task_lock(current);
- if (freezing(current)) {
- frozen_process();
- task_unlock(current);
- } else {
- task_unlock(current);
- return;
- }
- save = current->state;
- pr_debug("%s entered refrigerator\n", current->comm);
-
- spin_lock_irq(&current->sighand->siglock);
- recalc_sigpending(); /* We sent fake signal, clean it up */
- spin_unlock_irq(&current->sighand->siglock);
+ bool was_frozen = false;
+ long save = current->state;
- /* prevent accounting of that task to load */
- current->flags |= PF_FREEZING;
+ pr_debug("%s entered refrigerator\n", current->comm);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (!frozen(current))
+
+ spin_lock_irq(&freezer_lock);
+ current->flags |= PF_FROZEN;
+ if (!freezing(current) ||
+ (check_kthr_stop && kthread_should_stop()))
+ current->flags &= ~PF_FROZEN;
+ spin_unlock_irq(&freezer_lock);
+
+ if (!(current->flags & PF_FROZEN))
break;
+ was_frozen = true;
schedule();
}
- /* Remove the accounting blocker */
- current->flags &= ~PF_FREEZING;
-
pr_debug("%s left refrigerator\n", current->comm);
- __set_current_state(save);
+
+ /*
+ * Restore saved task state before returning. The mb'd version
+ * needs to be used; otherwise, it might silently break
+ * synchronization which depends on ordered task state change.
+ */
+ set_current_state(save);
+
+ return was_frozen;
}
-EXPORT_SYMBOL(refrigerator);
+EXPORT_SYMBOL(__refrigerator);
static void fake_signal_wake_up(struct task_struct *p)
{
unsigned long flags;
- spin_lock_irqsave(&p->sighand->siglock, flags);
- signal_wake_up(p, 0);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ if (lock_task_sighand(p, &flags)) {
+ signal_wake_up(p, 0);
+ unlock_task_sighand(p, &flags);
+ }
}
/**
- * freeze_task - send a freeze request to given task
- * @p: task to send the request to
- * @sig_only: if set, the request will only be sent if the task has the
- * PF_FREEZER_NOSIG flag unset
- * Return value: 'false', if @sig_only is set and the task has
- * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
+ * freeze_task - send a freeze request to given task
+ * @p: task to send the request to
+ *
+ * If @p is freezing, the freeze request is sent either by sending a fake
+ * signal (if it's not a kernel thread) or waking it up (if it's a kernel
+ * thread).
*
- * The freeze request is sent by setting the tasks's TIF_FREEZE flag and
- * either sending a fake signal to it or waking it up, depending on whether
- * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
- * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
- * TIF_FREEZE flag will not be set.
+ * RETURNS:
+ * %false, if @p is not freezing or already frozen; %true, otherwise
*/
-bool freeze_task(struct task_struct *p, bool sig_only)
+bool freeze_task(struct task_struct *p)
{
+ unsigned long flags;
+
/*
- * We first check if the task is freezing and next if it has already
- * been frozen to avoid the race with frozen_process() which first marks
- * the task as frozen and next clears its TIF_FREEZE.
+ * This check can race with freezer_do_not_count, but worst case that
+ * will result in an extra wakeup being sent to the task. It does not
+ * race with freezer_count(), the barriers in freezer_count() and
+ * freezer_should_skip() ensure that either freezer_count() sees
+ * freezing == true in try_to_freeze() and freezes, or
+ * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
+ * normally.
*/
- if (!freezing(p)) {
- rmb();
- if (frozen(p))
- return false;
-
- if (!sig_only || should_send_signal(p))
- set_freeze_flag(p);
- else
- return false;
- }
+ if (freezer_should_skip(p))
+ return false;
- if (should_send_signal(p)) {
- if (!signal_pending(p))
- fake_signal_wake_up(p);
- } else if (sig_only) {
+ spin_lock_irqsave(&freezer_lock, flags);
+ if (!freezing(p) || frozen(p)) {
+ spin_unlock_irqrestore(&freezer_lock, flags);
return false;
- } else {
- wake_up_state(p, TASK_INTERRUPTIBLE);
}
+ if (!(p->flags & PF_KTHREAD))
+ fake_signal_wake_up(p);
+ else
+ wake_up_state(p, TASK_INTERRUPTIBLE);
+
+ spin_unlock_irqrestore(&freezer_lock, flags);
return true;
}
-void cancel_freezing(struct task_struct *p)
+void __thaw_task(struct task_struct *p)
{
unsigned long flags;
- if (freezing(p)) {
- pr_debug(" clean up: %s\n", p->comm);
- clear_freeze_flag(p);
- spin_lock_irqsave(&p->sighand->siglock, flags);
- recalc_sigpending_and_wake(p);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
- }
-}
-
-static int __thaw_process(struct task_struct *p)
-{
- if (frozen(p)) {
- p->flags &= ~PF_FROZEN;
- return 1;
- }
- clear_freeze_flag(p);
- return 0;
+ /*
+ * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
+ * be visible to @p as waking up implies wmb. Waking up inside
+ * freezer_lock also prevents wakeups from leaking outside
+ * refrigerator.
+ */
+ spin_lock_irqsave(&freezer_lock, flags);
+ if (frozen(p))
+ wake_up_process(p);
+ spin_unlock_irqrestore(&freezer_lock, flags);
}
-/*
- * Wake up a frozen process
+/**
+ * set_freezable - make %current freezable
*
- * task_lock() is needed to prevent the race with refrigerator() which may
- * occur if the freezing of tasks fails. Namely, without the lock, if the
- * freezing of tasks failed, thaw_tasks() might have run before a task in
- * refrigerator() could call frozen_process(), in which case the task would be
- * frozen and no one would thaw it.
+ * Mark %current freezable and enter refrigerator if necessary.
*/
-int thaw_process(struct task_struct *p)
+bool set_freezable(void)
{
- task_lock(p);
- if (__thaw_process(p) == 1) {
- task_unlock(p);
- wake_up_process(p);
- return 1;
- }
- task_unlock(p);
- return 0;
+ might_sleep();
+
+ /*
+ * Modify flags while holding freezer_lock. This ensures the
+ * freezer notices that we aren't frozen yet or the freezing
+ * condition is visible to try_to_freeze() below.
+ */
+ spin_lock_irq(&freezer_lock);
+ current->flags &= ~PF_NOFREEZE;
+ spin_unlock_irq(&freezer_lock);
+
+ return try_to_freeze();
}
-EXPORT_SYMBOL(thaw_process);
+EXPORT_SYMBOL(set_freezable);
diff --git a/kernel/futex.c b/kernel/futex.c
index e7a35f1039e..b632b5f3f09 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -55,18 +55,133 @@
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/magic.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
+#include <linux/ptrace.h>
+#include <linux/sched/rt.h>
+#include <linux/hugetlb.h>
+#include <linux/freezer.h>
+#include <linux/bootmem.h>
#include <asm/futex.h>
-#include "rtmutex_common.h"
+#include "locking/rtmutex_common.h"
+/*
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ * uval = *futex;
+ * *futex = newval;
+ * sys_futex(WAKE, futex);
+ * futex_wake(futex);
+ * if (queue_empty())
+ * return;
+ * if (uval == val)
+ * lock(hash_bucket(futex));
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ *
+ * waiters++; (a)
+ * mb(); (A) <-- paired with -.
+ * |
+ * lock(hash_bucket(futex)); |
+ * |
+ * uval = *futex; |
+ * | *futex = newval;
+ * | sys_futex(WAKE, futex);
+ * | futex_wake(futex);
+ * |
+ * `-------> mb(); (B)
+ * if (uval == val)
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule(); if (waiters)
+ * lock(hash_bucket(futex));
+ * else wake_waiters(futex);
+ * waiters--; (b) unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read -- this is done by the barriers in
+ * get_futex_key_refs(), through either ihold or atomic_inc, depending on the
+ * futex type.
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in queue_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. It then decrements them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
+ */
+
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
int __read_mostly futex_cmpxchg_enabled;
+#endif
-#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+/*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SHARED 0x01
+#define FLAGS_CLOCKRT 0x02
+#define FLAGS_HAS_TIMEOUT 0x04
/*
* Priority Inheritance state:
@@ -91,6 +206,7 @@ struct futex_pi_state {
/**
* struct futex_q - The hashed futex queue entry, one per waiting task
+ * @list: priority-sorted list of tasks waiting on this futex
* @task: the task waiting on the futex
* @lock_ptr: the hash bucket lock
* @key: the key the futex is hashed on
@@ -104,7 +220,7 @@ struct futex_pi_state {
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
* It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
- * The order of wakup is always to make the first condition true, then
+ * The order of wakeup is always to make the first condition true, then
* the second.
*
* PI futexes are typically woken before they are removed from the hash list via
@@ -122,17 +238,71 @@ struct futex_q {
u32 bitset;
};
+static const struct futex_q futex_q_init = {
+ /* list gets initialized in queue_me()*/
+ .key = FUTEX_KEY_INIT,
+ .bitset = FUTEX_BITSET_MATCH_ANY
+};
+
/*
* Hash buckets are shared by all the futex_keys that hash to the same
* location. Each key may have multiple futex_q structures, one for each task
* waiting on a futex.
*/
struct futex_hash_bucket {
+ atomic_t waiters;
spinlock_t lock;
struct plist_head chain;
-};
+} ____cacheline_aligned_in_smp;
+
+static unsigned long __read_mostly futex_hashsize;
-static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+static struct futex_hash_bucket *futex_queues;
+
+static inline void futex_get_mm(union futex_key *key)
+{
+ atomic_inc(&key->private.mm->mm_count);
+ /*
+ * Ensure futex_get_mm() implies a full barrier such that
+ * get_futex_key() implies a full barrier. This is relied upon
+ * as full barrier (B), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+}
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_inc(&hb->waiters);
+ /*
+ * Full barrier (A), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ return atomic_read(&hb->waiters);
+#else
+ return 1;
+#endif
+}
/*
* We hash on the keys returned from get_futex_key (see below).
@@ -142,7 +312,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
u32 hash = jhash2((u32*)&key->both.word,
(sizeof(key->both.word)+sizeof(key->both.ptr))/4,
key->both.offset);
- return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+ return &futex_queues[hash & (futex_hashsize - 1)];
}
/*
@@ -168,10 +338,10 @@ static void get_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- atomic_inc(&key->shared.inode->i_count);
+ ihold(key->shared.inode); /* implies MB (B) */
break;
case FUT_OFF_MMSHARED:
- atomic_inc(&key->private.mm->mm_count);
+ futex_get_mm(key); /* implies MB (B) */
break;
}
}
@@ -203,23 +373,26 @@ static void drop_futex_key_refs(union futex_key *key)
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
+ * @rw: mapping needs to be read/write (values: VERIFY_READ,
+ * VERIFY_WRITE)
+ *
+ * Return: a negative error code or 0
*
- * Returns a negative error code or 0
* The key words are stored in *key on success.
*
- * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
+ * For shared mappings, it's (page->index, file_inode(vma->vm_file),
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
* lock_page() might sleep, the caller should not hold a spinlock.
*/
static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page;
- int err;
+ struct page *page, *page_head;
+ int err, ro = 0;
/*
* The futex address must be "naturally" aligned.
@@ -229,6 +402,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
return -EINVAL;
address -= key->both.offset;
+ if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+ return -EFAULT;
+
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
@@ -237,25 +413,87 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
- return -EFAULT;
key->private.mm = mm;
key->private.address = address;
- get_futex_key_refs(key);
+ get_futex_key_refs(key); /* implies MB (B) */
return 0;
}
again:
err = get_user_pages_fast(address, 1, 1, &page);
+ /*
+ * If write access is not required (eg. FUTEX_WAIT), try
+ * and get read-only access.
+ */
+ if (err == -EFAULT && rw == VERIFY_READ) {
+ err = get_user_pages_fast(address, 1, 0, &page);
+ ro = 1;
+ }
if (err < 0)
return err;
+ else
+ err = 0;
- page = compound_head(page);
- lock_page(page);
- if (!page->mapping) {
- unlock_page(page);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ page_head = page;
+ if (unlikely(PageTail(page))) {
put_page(page);
- goto again;
+ /* serialize against __split_huge_page_splitting() */
+ local_irq_disable();
+ if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
+ page_head = compound_head(page);
+ /*
+ * page_head is valid pointer but we must pin
+ * it before taking the PG_lock and/or
+ * PG_compound_lock. The moment we re-enable
+ * irqs __split_huge_page_splitting() can
+ * return and the head page can be freed from
+ * under us. We can't take the PG_lock and/or
+ * PG_compound_lock on a page that could be
+ * freed from under us.
+ */
+ if (page != page_head) {
+ get_page(page_head);
+ put_page(page);
+ }
+ local_irq_enable();
+ } else {
+ local_irq_enable();
+ goto again;
+ }
+ }
+#else
+ page_head = compound_head(page);
+ if (page != page_head) {
+ get_page(page_head);
+ put_page(page);
+ }
+#endif
+
+ lock_page(page_head);
+
+ /*
+ * If page_head->mapping is NULL, then it cannot be a PageAnon
+ * page; but it might be the ZERO_PAGE or in the gate area or
+ * in a special mapping (all cases which we are happy to fail);
+ * or it may have been a good file page when get_user_pages_fast
+ * found it, but truncated or holepunched or subjected to
+ * invalidate_complete_page2 before we got the page lock (also
+ * cases which we are happy to fail). And we hold a reference,
+ * so refcount care in invalidate_complete_page's remove_mapping
+ * prevents drop_caches from setting mapping to NULL beneath us.
+ *
+ * The case we do have to guard against is when memory pressure made
+ * shmem_writepage move it from filecache to swapcache beneath us:
+ * an unlikely race, but we do need to retry for page_head->mapping.
+ */
+ if (!page_head->mapping) {
+ int shmem_swizzled = PageSwapCache(page_head);
+ unlock_page(page_head);
+ put_page(page_head);
+ if (shmem_swizzled)
+ goto again;
+ return -EFAULT;
}
/*
@@ -265,25 +503,34 @@ again:
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page)) {
+ if (PageAnon(page_head)) {
+ /*
+ * A RO anonymous page will never change and thus doesn't make
+ * sense for futex operations.
+ */
+ if (ro) {
+ err = -EFAULT;
+ goto out;
+ }
+
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page->mapping->host;
- key->shared.pgoff = page->index;
+ key->shared.inode = page_head->mapping->host;
+ key->shared.pgoff = basepage_index(page);
}
- get_futex_key_refs(key);
+ get_futex_key_refs(key); /* implies MB (B) */
- unlock_page(page);
- put_page(page);
- return 0;
+out:
+ unlock_page(page_head);
+ put_page(page_head);
+ return err;
}
-static inline
-void put_futex_key(int fshared, union futex_key *key)
+static inline void put_futex_key(union futex_key *key)
{
drop_futex_key_refs(key);
}
@@ -295,7 +542,7 @@ void put_futex_key(int fshared, union futex_key *key)
* Slow path to fixup the fault we just took in the atomic write
* access to @uaddr.
*
- * We have no generic implementation of a non destructive write to the
+ * We have no generic implementation of a non-destructive write to the
* user address. We know that we faulted in the atomic pagefault
* disabled section so we can as well avoid the #PF overhead by
* calling get_user_pages() right away.
@@ -306,8 +553,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
int ret;
down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, (unsigned long)uaddr,
- 1, 1, 0, NULL, NULL);
+ ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+ FAULT_FLAG_WRITE);
up_read(&mm->mmap_sem);
return ret < 0 ? ret : 0;
@@ -332,15 +579,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
return NULL;
}
-static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
+ u32 uval, u32 newval)
{
- u32 curval;
+ int ret;
pagefault_disable();
- curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
pagefault_enable();
- return curval;
+ return ret;
}
static int get_futex_value_locked(u32 *dest, u32 __user *from)
@@ -429,20 +677,11 @@ static void free_pi_state(struct futex_pi_state *pi_state)
static struct task_struct * futex_find_get_task(pid_t pid)
{
struct task_struct *p;
- const struct cred *cred = current_cred(), *pcred;
rcu_read_lock();
p = find_task_by_vpid(pid);
- if (!p) {
- p = ERR_PTR(-ESRCH);
- } else {
- pcred = __task_cred(p);
- if (cred->euid != pcred->euid &&
- cred->euid != pcred->uid)
- p = ERR_PTR(-ESRCH);
- else
- get_task_struct(p);
- }
+ if (p)
+ get_task_struct(p);
rcu_read_unlock();
@@ -504,27 +743,74 @@ void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock_irq(&curr->pi_lock);
}
+/*
+ * We need to check the following states:
+ *
+ * Waiter | pi_state | pi->owner | uTID | uODIED | ?
+ *
+ * [1] NULL | --- | --- | 0 | 0/1 | Valid
+ * [2] NULL | --- | --- | >0 | 0/1 | Valid
+ *
+ * [3] Found | NULL | -- | Any | 0/1 | Invalid
+ *
+ * [4] Found | Found | NULL | 0 | 1 | Valid
+ * [5] Found | Found | NULL | >0 | 1 | Invalid
+ *
+ * [6] Found | Found | task | 0 | 1 | Valid
+ *
+ * [7] Found | Found | NULL | Any | 0 | Invalid
+ *
+ * [8] Found | Found | task | ==taskTID | 0/1 | Valid
+ * [9] Found | Found | task | 0 | 0 | Invalid
+ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
+ *
+ * [1] Indicates that the kernel can acquire the futex atomically. We
+ * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
+ * thread is found then it indicates that the owner TID has died.
+ *
+ * [3] Invalid. The waiter is queued on a non PI futex
+ *
+ * [4] Valid state after exit_robust_list(), which sets the user space
+ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5] The user space value got manipulated between exit_robust_list()
+ * and exit_pi_state_list()
+ *
+ * [6] Valid state after exit_pi_state_list() which sets the new owner in
+ * the pi_state but cannot access the user space value.
+ *
+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8] Owner and user space value match
+ *
+ * [9] There is no transient state which sets the user space TID to 0
+ * except exit_robust_list(), but this is indicated by the
+ * FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ * TID out of sync.
+ */
static int
lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps)
{
struct futex_pi_state *pi_state = NULL;
struct futex_q *this, *next;
- struct plist_head *head;
struct task_struct *p;
pid_t pid = uval & FUTEX_TID_MASK;
- head = &hb->chain;
-
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
if (match_futex(&this->key, key)) {
/*
- * Another waiter already exists - bump up
- * the refcount and return its pi_state:
+ * Sanity check the waiter before increasing
+ * the refcount and attaching to it.
*/
pi_state = this->pi_state;
/*
- * Userspace might have messed up non PI and PI futexes
+ * Userspace might have messed up non-PI and
+ * PI futexes [3]
*/
if (unlikely(!pi_state))
return -EINVAL;
@@ -532,40 +818,81 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
WARN_ON(!atomic_read(&pi_state->refcount));
/*
- * When pi_state->owner is NULL then the owner died
- * and another waiter is on the fly. pi_state->owner
- * is fixed up by the task which acquires
- * pi_state->rt_mutex.
- *
- * We do not check for pid == 0 which can happen when
- * the owner died and robust_list_exit() cleared the
- * TID.
+ * Handle the owner died case:
*/
- if (pid && pi_state->owner) {
+ if (uval & FUTEX_OWNER_DIED) {
+ /*
+ * exit_pi_state_list sets owner to NULL and
+ * wakes the topmost waiter. The task which
+ * acquires the pi_state->rt_mutex will fixup
+ * owner.
+ */
+ if (!pi_state->owner) {
+ /*
+ * No pi state owner, but the user
+ * space TID is not 0. Inconsistent
+ * state. [5]
+ */
+ if (pid)
+ return -EINVAL;
+ /*
+ * Take a ref on the state and
+ * return. [4]
+ */
+ goto out_state;
+ }
+
/*
- * Bail out if user space manipulated the
- * futex value.
+ * If TID is 0, then either the dying owner
+ * has not yet executed exit_pi_state_list()
+ * or some waiter acquired the rtmutex in the
+ * pi state, but did not yet fixup the TID in
+ * user space.
+ *
+ * Take a ref on the state and return. [6]
*/
- if (pid != task_pid_vnr(pi_state->owner))
+ if (!pid)
+ goto out_state;
+ } else {
+ /*
+ * If the owner died bit is not set,
+ * then the pi_state must have an
+ * owner. [7]
+ */
+ if (!pi_state->owner)
return -EINVAL;
}
+ /*
+ * Bail out if user space manipulated the
+ * futex value. If pi state exists then the
+ * owner TID must be the same as the user
+ * space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ return -EINVAL;
+
+ out_state:
atomic_inc(&pi_state->refcount);
*ps = pi_state;
-
return 0;
}
}
/*
* We are the first waiter - try to look up the real owner and attach
- * the new pi_state to it, but bail out when TID = 0
+ * the new pi_state to it, but bail out when TID = 0 [1]
*/
if (!pid)
return -ESRCH;
p = futex_find_get_task(pid);
- if (IS_ERR(p))
- return PTR_ERR(p);
+ if (!p)
+ return -ESRCH;
+
+ if (!p->mm) {
+ put_task_struct(p);
+ return -EPERM;
+ }
/*
* We need to look at the task state flags to figure out,
@@ -587,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return ret;
}
+ /*
+ * No existing pi state. First waiter. [2]
+ */
pi_state = alloc_pi_state();
/*
@@ -621,9 +951,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
* be "current" except in the case of requeue pi.
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
- * Returns:
- * 0 - ready to wait
- * 1 - acquired the lock
+ * Return:
+ * 0 - ready to wait;
+ * 1 - acquired the lock;
* <0 - error
*
* The hb->lock and futex_key refs shall be held by the caller.
@@ -633,8 +963,8 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct futex_pi_state **ps,
struct task_struct *task, int set_waiters)
{
- int lock_taken, ret, ownerdied = 0;
- u32 uval, newval, curval;
+ int lock_taken, ret, force_take = 0;
+ u32 uval, newval, curval, vpid = task_pid_vnr(task);
retry:
ret = lock_taken = 0;
@@ -644,26 +974,32 @@ retry:
* (by doing a 0 -> TID atomic cmpxchg), while holding all
* the locks. It will most likely not succeed.
*/
- newval = task_pid_vnr(task);
+ newval = vpid;
if (set_waiters)
newval |= FUTEX_WAITERS;
- curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
- if (unlikely(curval == -EFAULT))
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
return -EFAULT;
/*
* Detect deadlocks.
*/
- if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+ if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
return -EDEADLK;
/*
- * Surprise - we got the lock. Just return to userspace:
+ * Surprise - we got the lock, but we do not trust user space at all.
*/
- if (unlikely(!curval))
- return 1;
+ if (unlikely(!curval)) {
+ /*
+ * We verify whether there is kernel state for this
+ * futex. If not, we can safely assume, that the 0 ->
+ * TID transition is correct. If state exists, we do
+ * not bother to fixup the user space state as it was
+ * corrupted already.
+ */
+ return futex_top_waiter(hb, key) ? -EINVAL : 1;
+ }
uval = curval;
@@ -674,29 +1010,25 @@ retry:
newval = curval | FUTEX_WAITERS;
/*
- * There are two cases, where a futex might have no owner (the
- * owner TID is 0): OWNER_DIED. We take over the futex in this
- * case. We also do an unconditional take over, when the owner
- * of the futex died.
- *
- * This is safe as we are protected by the hash bucket lock !
+ * Should we force take the futex? See below.
*/
- if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
- /* Keep the OWNER_DIED bit */
- newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
- ownerdied = 0;
+ if (unlikely(force_take)) {
+ /*
+ * Keep the OWNER_DIED and the WAITERS bit and set the
+ * new TID value.
+ */
+ newval = (curval & ~FUTEX_TID_MASK) | vpid;
+ force_take = 0;
lock_taken = 1;
}
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (unlikely(curval == -EFAULT))
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
if (unlikely(curval != uval))
goto retry;
/*
- * We took the lock due to owner died take over.
+ * We took the lock due to forced take over.
*/
if (unlikely(lock_taken))
return 1;
@@ -711,20 +1043,25 @@ retry:
switch (ret) {
case -ESRCH:
/*
- * No owner found for this futex. Check if the
- * OWNER_DIED bit is set to figure out whether
- * this is a robust futex or not.
+ * We failed to find an owner for this
+ * futex. So we have no pi_state to block
+ * on. This can happen in two cases:
+ *
+ * 1) The owner died
+ * 2) A stale FUTEX_WAITERS bit
+ *
+ * Re-read the futex value.
*/
if (get_futex_value_locked(&curval, uaddr))
return -EFAULT;
/*
- * We simply start over in case of a robust
- * futex. The code above will take the futex
- * and return happy.
+ * If the owner died or we have a stale
+ * WAITERS bit the owner TID in the user space
+ * futex is 0.
*/
- if (curval & FUTEX_OWNER_DIED) {
- ownerdied = 1;
+ if (!(curval & FUTEX_TID_MASK)) {
+ force_take = 1;
goto retry;
}
default:
@@ -735,6 +1072,25 @@ retry:
return ret;
}
+/**
+ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+static void __unqueue_futex(struct futex_q *q)
+{
+ struct futex_hash_bucket *hb;
+
+ if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
+ || WARN_ON(plist_node_empty(&q->list)))
+ return;
+
+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+ plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
+}
+
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
@@ -743,16 +1099,19 @@ static void wake_futex(struct futex_q *q)
{
struct task_struct *p = q->task;
+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+ return;
+
/*
* We set q->lock_ptr = NULL _before_ we wake up the task. If
- * a non futex wake up happens on another CPU then the task
- * might exit and p would dereference a non existing task
+ * a non-futex wake up happens on another CPU then the task
+ * might exit and p would dereference a non-existing task
* struct. Prevent this by holding a reference on p across the
* wake up.
*/
get_task_struct(p);
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as
* q->lock_ptr = NULL is written, without taking any locks. A
@@ -770,7 +1129,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
{
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
- u32 curval, newval;
+ u32 uninitialized_var(curval), newval;
+ int ret = 0;
if (!pi_state)
return -EINVAL;
@@ -786,34 +1146,27 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
- * This happens when we have stolen the lock and the original
- * pending owner did not enqueue itself back on the rt_mutex.
- * Thats not a tragedy. We know that way, that a lock waiter
- * is on the fly. We make the futex_q waiter the pending owner.
+ * It is possible that the next waiter (the one that brought
+ * this owner to the kernel) timed out and is no longer
+ * waiting on the lock.
*/
if (!new_owner)
new_owner = this->task;
/*
- * We pass it to the next owner. (The WAITERS bit is always
- * kept enabled while there is PI state around. We must also
- * preserve the owner died bit.)
+ * We pass it to the next owner. The WAITERS bit is always
+ * kept enabled while there is PI state around. We cleanup the
+ * owner died bit, because we are the owner.
*/
- if (!(uval & FUTEX_OWNER_DIED)) {
- int ret = 0;
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
- newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
-
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (curval == -EFAULT)
- ret = -EFAULT;
- else if (curval != uval)
- ret = -EINVAL;
- if (ret) {
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+ ret = -EFAULT;
+ else if (curval != uval)
+ ret = -EINVAL;
+ if (ret) {
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
raw_spin_lock_irq(&pi_state->owner->pi_lock);
@@ -835,16 +1188,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
{
- u32 oldval;
+ u32 uninitialized_var(oldval);
/*
* There is no waiter, so we unlock the futex. The owner died
* bit has not to be preserved here. We are the owner:
*/
- oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
-
- if (oldval == -EFAULT)
- return oldval;
+ if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
+ return -EFAULT;
if (oldval != uval)
return -EAGAIN;
@@ -878,26 +1229,30 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
/*
* Wake up waiters matching bitset queued on this futex (uaddr).
*/
-static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
int ret;
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
hb = hash_futex(&key);
+
+ /* Make sure we really have tasks to wakeup */
+ if (!hb_waiters_pending(hb))
+ goto out_put_key;
+
spin_lock(&hb->lock);
- head = &hb->chain;
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
if (match_futex (&this->key, &key)) {
if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
@@ -915,7 +1270,8 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
}
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+out_put_key:
+ put_futex_key(&key);
out:
return ret;
}
@@ -925,20 +1281,19 @@ out:
* to this virtual address:
*/
static int
-futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb1, *hb2;
- struct plist_head *head;
struct futex_q *this, *next;
int ret, op_ret;
retry:
- ret = get_futex_key(uaddr1, fshared, &key1);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -970,18 +1325,20 @@ retry_private:
if (ret)
goto out_put_keys;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
goto retry;
}
- head = &hb1->chain;
-
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
if (match_futex (&this->key, &key1)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
wake_futex(this);
if (++ret >= nr_wake)
break;
@@ -989,11 +1346,13 @@ retry_private:
}
if (op_ret > 0) {
- head = &hb2->chain;
-
op_ret = 0;
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
if (match_futex (&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
wake_futex(this);
if (++op_ret >= nr_wake2)
break;
@@ -1002,11 +1361,12 @@ retry_private:
ret += op_ret;
}
+out_unlock:
double_unlock_hb(hb1, hb2);
out_put_keys:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out_put_key1:
- put_futex_key(fshared, &key1);
+ put_futex_key(&key1);
out:
return ret;
}
@@ -1029,11 +1389,10 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
*/
if (likely(&hb1->chain != &hb2->chain)) {
plist_del(&q->list, &hb1->chain);
+ hb_waiters_dec(hb1);
plist_add(&q->list, &hb2->chain);
+ hb_waiters_inc(hb2);
q->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb2->lock;
-#endif
}
get_futex_key_refs(key2);
q->key = *key2;
@@ -1060,16 +1419,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
get_futex_key_refs(key);
q->key = *key;
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL;
q->lock_ptr = &hb->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb->lock;
-#endif
wake_up_state(q->task, TASK_NORMAL);
}
@@ -1089,9 +1444,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
* hb1 and hb2 must be held by the caller.
*
- * Returns:
- * 0 - failed to acquire the lock atomicly
- * 1 - acquired the lock
+ * Return:
+ * 0 - failed to acquire the lock atomically;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
* <0 - error
*/
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1102,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
{
struct futex_q *top_waiter = NULL;
u32 curval;
- int ret;
+ int ret, vpid;
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
@@ -1130,44 +1485,53 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* the contended case or if set_waiters is 1. The pi_state is returned
* in ps in contended cases.
*/
+ vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
set_waiters);
- if (ret == 1)
+ if (ret == 1) {
requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+ return vpid;
+ }
return ret;
}
/**
* futex_requeue() - Requeue waiters from uaddr1 to uaddr2
- * uaddr1: source futex user address
- * uaddr2: target futex user address
- * nr_wake: number of waiters to wake (must be 1 for requeue_pi)
- * nr_requeue: number of waiters to requeue (0-INT_MAX)
- * requeue_pi: if we are attempting to requeue from a non-pi futex to a
- * pi futex (pi to pi requeue is not supported)
+ * @uaddr1: source futex user address
+ * @flags: futex flags (FLAGS_SHARED, etc.)
+ * @uaddr2: target futex user address
+ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
+ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * @cmpval: @uaddr1 expected value (or %NULL)
+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ * pi futex (pi to pi requeue is not supported)
*
* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
* uaddr2 atomically on behalf of the top waiter.
*
- * Returns:
- * >=0 - on success, the number of tasks requeued or woken
+ * Return:
+ * >=0 - on success, the number of tasks requeued or woken;
* <0 - on error
*/
-static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
- int nr_wake, int nr_requeue, u32 *cmpval,
- int requeue_pi)
+static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ u32 __user *uaddr2, int nr_wake, int nr_requeue,
+ u32 *cmpval, int requeue_pi)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
int drop_count = 0, task_count = 0, ret;
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
- struct plist_head *head1;
struct futex_q *this, *next;
- u32 curval2;
if (requeue_pi) {
/*
+ * Requeue PI only works on two distinct uaddrs. This
+ * check is only valid for private futexes. See below.
+ */
+ if (uaddr1 == uaddr2)
+ return -EINVAL;
+
+ /*
* requeue_pi requires a pi_state, try to allocate it now
* without any locks in case it fails.
*/
@@ -1197,17 +1561,28 @@ retry:
pi_state = NULL;
}
- ret = get_futex_key(uaddr1, fshared, &key1);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+ requeue_pi ? VERIFY_WRITE : VERIFY_READ);
if (unlikely(ret != 0))
goto out_put_key1;
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (requeue_pi && match_futex(&key1, &key2)) {
+ ret = -EINVAL;
+ goto out_put_keys;
+ }
+
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
retry_private:
+ hb_waiters_inc(hb2);
double_lock_hb(hb1, hb2);
if (likely(cmpval != NULL)) {
@@ -1217,16 +1592,17 @@ retry_private:
if (unlikely(ret)) {
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
ret = get_user(curval, uaddr1);
if (ret)
goto out_put_keys;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
goto retry;
}
if (curval != *cmpval) {
@@ -1249,16 +1625,25 @@ retry_private:
* At this point the top_waiter has either taken uaddr2 or is
* waiting on it. If the former, then the pi_state will not
* exist yet, look it up one more time to ensure we have a
- * reference to it.
+ * reference to it. If the lock was taken, ret contains the
+ * vpid of the top waiter task.
*/
- if (ret == 1) {
+ if (ret > 0) {
WARN_ON(pi_state);
drop_count++;
task_count++;
- ret = get_futex_value_locked(&curval2, uaddr2);
- if (!ret)
- ret = lookup_pi_state(curval2, hb2, &key2,
- &pi_state);
+ /*
+ * If we acquired the lock, then the user
+ * space value of uaddr2 should be vpid. It
+ * cannot be changed by the top waiter as it
+ * is blocked on hb2 lock if it tries to do
+ * so. If something fiddled with it behind our
+ * back the pi state lookup might unearth
+ * it. So we rather use the known value than
+ * rereading and handing potential crap to
+ * lookup_pi_state.
+ */
+ ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
}
switch (ret) {
@@ -1266,8 +1651,9 @@ retry_private:
break;
case -EFAULT:
double_unlock_hb(hb1, hb2);
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ hb_waiters_dec(hb2);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
ret = fault_in_user_writeable(uaddr2);
if (!ret)
goto retry;
@@ -1275,8 +1661,9 @@ retry_private:
case -EAGAIN:
/* The owner was exiting, try again. */
double_unlock_hb(hb1, hb2);
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ hb_waiters_dec(hb2);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
cond_resched();
goto retry;
default:
@@ -1284,8 +1671,7 @@ retry_private:
}
}
- head1 = &hb1->chain;
- plist_for_each_entry_safe(this, next, head1, list) {
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
if (task_count - nr_wake >= nr_requeue)
break;
@@ -1295,9 +1681,13 @@ retry_private:
/*
* FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
* be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
*/
if ((requeue_pi && !this->rt_waiter) ||
- (!requeue_pi && this->rt_waiter)) {
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
ret = -EINVAL;
break;
}
@@ -1347,6 +1737,7 @@ retry_private:
out_unlock:
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
/*
* drop_futex_key_refs() must be called outside the spinlocks. During
@@ -1358,9 +1749,9 @@ out_unlock:
drop_futex_key_refs(&key1);
out_put_keys:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out_put_key1:
- put_futex_key(fshared, &key1);
+ put_futex_key(&key1);
out:
if (pi_state != NULL)
free_pi_state(pi_state);
@@ -1369,22 +1760,34 @@ out:
/* The key must be already stored in q->key. */
static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
+ __acquires(&hb->lock)
{
struct futex_hash_bucket *hb;
- get_futex_key_refs(&q->key);
hb = hash_futex(&q->key);
+
+ /*
+ * Increment the counter before taking the lock so that
+ * a potential waker won't miss a to-be-slept task that is
+ * waiting for the spinlock. This is safe as all queue_lock()
+ * users end up calling queue_me(). Similarly, for housekeeping,
+ * decrement the counter at queue_unlock() when some error has
+ * occurred and we don't end up adding the task to the list.
+ */
+ hb_waiters_inc(hb);
+
q->lock_ptr = &hb->lock;
- spin_lock(&hb->lock);
+ spin_lock(&hb->lock); /* implies MB (A) */
return hb;
}
static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+queue_unlock(struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
{
spin_unlock(&hb->lock);
- drop_futex_key_refs(&q->key);
+ hb_waiters_dec(hb);
}
/**
@@ -1400,6 +1803,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
* an example).
*/
static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
{
int prio;
@@ -1414,9 +1818,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
prio = min(current->normal_prio, MAX_RT_PRIO);
plist_node_init(&q->list, prio);
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb->lock;
-#endif
plist_add(&q->list, &hb->chain);
q->task = current;
spin_unlock(&hb->lock);
@@ -1429,8 +1830,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
* The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
* be paired with exactly one earlier call to queue_me().
*
- * Returns:
- * 1 - if the futex_q was still queued (and we removed unqueued it)
+ * Return:
+ * 1 - if the futex_q was still queued (and we removed unqueued it);
* 0 - if the futex_q was already removed by the waking thread
*/
static int unqueue_me(struct futex_q *q)
@@ -1461,8 +1862,7 @@ retry:
spin_unlock(lock_ptr);
goto retry;
}
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
BUG_ON(q->pi_state);
@@ -1480,17 +1880,15 @@ retry:
* and dropped here.
*/
static void unqueue_me_pi(struct futex_q *q)
+ __releases(q->lock_ptr)
{
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
BUG_ON(!q->pi_state);
free_pi_state(q->pi_state);
q->pi_state = NULL;
spin_unlock(q->lock_ptr);
-
- drop_futex_key_refs(&q->key);
}
/*
@@ -1500,12 +1898,12 @@ static void unqueue_me_pi(struct futex_q *q)
* private futexes.
*/
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner, int fshared)
+ struct task_struct *newowner)
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
struct task_struct *oldowner = pi_state->owner;
- u32 uval, curval, newval;
+ u32 uval, uninitialized_var(curval), newval;
int ret;
/* Owner died? */
@@ -1514,10 +1912,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
/*
* We are here either because we stole the rtmutex from the
- * pending owner or we are the pending owner which failed to
- * get the rtmutex. We have to replace the pending owner TID
- * in the user space variable. This must be atomic as we have
- * to preserve the owner died bit here.
+ * previous highest priority waiter or we are the highest priority
+ * waiter but failed to get the rtmutex the first time.
+ * We have to replace the newowner TID in the user space variable.
+ * This must be atomic as we have to preserve the owner died bit here.
*
* Note: We write the user space value _before_ changing the pi_state
* because we can fault here. Imagine swapped out pages or a fork
@@ -1536,9 +1934,7 @@ retry:
while (1) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
goto handle_fault;
if (curval == uval)
break;
@@ -1566,8 +1962,8 @@ retry:
/*
* To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the pending
- * owner itself or the task which stole the rtmutex) the
+ * lock here. That gives the other task (either the highest priority
+ * waiter itself or the task which stole the rtmutex) the
* chance to try the fixup of the pi_state. So once we are
* back from handling the fault we need to check the pi_state
* after reacquiring the hash bucket lock and before trying to
@@ -1593,20 +1989,11 @@ handle_fault:
goto retry;
}
-/*
- * In case we must use restart_block to restart a futex_wait,
- * we encode in the 'flags' shared capability
- */
-#define FLAGS_SHARED 0x01
-#define FLAGS_CLOCKRT 0x02
-#define FLAGS_HAS_TIMEOUT 0x04
-
static long futex_wait_restart(struct restart_block *restart);
/**
* fixup_owner() - Post lock pi_state and corner case management
* @uaddr: user address of the futex
- * @fshared: whether the futex is shared (1) or not (0)
* @q: futex_q (contains pi_state and access to the rt_mutex)
* @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
*
@@ -1614,13 +2001,12 @@ static long futex_wait_restart(struct restart_block *restart);
* the pi_state owner as well as handle race conditions that may allow us to
* acquire the lock. Must be called with the hb lock held.
*
- * Returns:
- * 1 - success, lock taken
- * 0 - success, lock not taken
+ * Return:
+ * 1 - success, lock taken;
+ * 0 - success, lock not taken;
* <0 - on error (-EFAULT)
*/
-static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
- int locked)
+static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
struct task_struct *owner;
int ret = 0;
@@ -1631,7 +2017,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
* did a lock-steal - fix up the PI-state in that case:
*/
if (q->pi_state->owner != current)
- ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+ ret = fixup_pi_state_owner(uaddr, q, current);
goto out;
}
@@ -1653,18 +2039,20 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
/*
* pi_state is incorrect, some other task did a lock steal and
* we returned due to timeout or signal without taking the
- * rt_mutex. Too late. We can access the rt_mutex_owner without
- * locking, as the other task is now blocked on the hash bucket
- * lock. Fix the state up.
+ * rt_mutex. Too late.
*/
+ raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
owner = rt_mutex_owner(&q->pi_state->pi_mutex);
- ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+ if (!owner)
+ owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
+ raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
+ ret = fixup_pi_state_owner(uaddr, q, owner);
goto out;
}
/*
* Paranoia check. If we did not take the lock, then we should not be
- * the owner, nor the pending owner, of the rt_mutex.
+ * the owner of the rt_mutex.
*/
if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1712,7 +2100,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* is no timeout, or if it has yet to expire.
*/
if (!timeout || timeout->task)
- schedule();
+ freezable_schedule();
}
__set_current_state(TASK_RUNNING);
}
@@ -1721,7 +2109,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* futex_wait_setup() - Prepare to wait on a futex
* @uaddr: the futex userspace address
* @val: the expected value
- * @fshared: whether the futex is shared (1) or not (0)
+ * @flags: futex flags (FLAGS_SHARED, etc.)
* @q: the associated futex_q
* @hb: storage for hash_bucket pointer to be returned to caller
*
@@ -1730,11 +2118,11 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* Return with the hb lock held and a q.key reference on success, and unlocked
* with no q.key reference on failure.
*
- * Returns:
- * 0 - uaddr contains val and hb has been locked
- * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
+ * Return:
+ * 0 - uaddr contains val and hb has been locked;
+ * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
*/
-static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb)
{
u32 uval;
@@ -1749,17 +2137,17 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
*
* The basic logical guarantee of a futex is that it blocks ONLY
* if cond(var) is known to be true at the time of blocking, for
- * any cond. If we queued after testing *uaddr, that would open
- * a race condition where we could block indefinitely with
+ * any cond. If we locked the hash-bucket after testing *uaddr, that
+ * would open a race condition where we could block indefinitely with
* cond(var) false, which would violate the guarantee.
*
- * A consequence is that futex_wait() can return zero and absorb
- * a wakeup when *uaddr != val on entry to the syscall. This is
- * rare, but normal.
+ * On the other hand, we insert q and release the hash-bucket only
+ * after testing *uaddr. This guarantees that futex_wait() will NOT
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
*/
retry:
- q->key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q->key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
if (unlikely(ret != 0))
return ret;
@@ -1769,60 +2157,60 @@ retry_private:
ret = get_futex_value_locked(&uval, uaddr);
if (ret) {
- queue_unlock(q, *hb);
+ queue_unlock(*hb);
ret = get_user(uval, uaddr);
if (ret)
goto out;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &q->key);
+ put_futex_key(&q->key);
goto retry;
}
if (uval != val) {
- queue_unlock(q, *hb);
+ queue_unlock(*hb);
ret = -EWOULDBLOCK;
}
out:
if (ret)
- put_futex_key(fshared, &q->key);
+ put_futex_key(&q->key);
return ret;
}
-static int futex_wait(u32 __user *uaddr, int fshared,
- u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 bitset)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct restart_block *restart;
struct futex_hash_bucket *hb;
- struct futex_q q;
+ struct futex_q q = futex_q_init;
int ret;
if (!bitset)
return -EINVAL;
-
- q.pi_state = NULL;
q.bitset = bitset;
- q.rt_waiter = NULL;
- q.requeue_pi_key = NULL;
if (abs_time) {
to = &timeout;
- hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
hrtimer_set_expires_range_ns(&to->timer, *abs_time,
current->timer_slack_ns);
}
retry:
- /* Prepare to wait on uaddr. */
- ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ /*
+ * Prepare to wait on uaddr. On success, holds hb lock and increments
+ * q.key refs.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
goto out;
@@ -1831,42 +2219,34 @@ retry:
/* If we were woken (and unqueued), we succeeded, whatever. */
ret = 0;
+ /* unqueue_me() drops q.key ref */
if (!unqueue_me(&q))
- goto out_put_key;
+ goto out;
ret = -ETIMEDOUT;
if (to && !to->task)
- goto out_put_key;
+ goto out;
/*
* We expect signal_pending(current), but we might be the
* victim of a spurious wakeup as well.
*/
- if (!signal_pending(current)) {
- put_futex_key(fshared, &q.key);
+ if (!signal_pending(current))
goto retry;
- }
ret = -ERESTARTSYS;
if (!abs_time)
- goto out_put_key;
+ goto out;
restart = &current_thread_info()->restart_block;
restart->fn = futex_wait_restart;
- restart->futex.uaddr = (u32 *)uaddr;
+ restart->futex.uaddr = uaddr;
restart->futex.val = val;
restart->futex.time = abs_time->tv64;
restart->futex.bitset = bitset;
- restart->futex.flags = FLAGS_HAS_TIMEOUT;
-
- if (fshared)
- restart->futex.flags |= FLAGS_SHARED;
- if (clockrt)
- restart->futex.flags |= FLAGS_CLOCKRT;
+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
ret = -ERESTART_RESTARTBLOCK;
-out_put_key:
- put_futex_key(fshared, &q.key);
out:
if (to) {
hrtimer_cancel(&to->timer);
@@ -1878,8 +2258,7 @@ out:
static long futex_wait_restart(struct restart_block *restart)
{
- u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
- int fshared = 0;
+ u32 __user *uaddr = restart->futex.uaddr;
ktime_t t, *tp = NULL;
if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1887,11 +2266,9 @@ static long futex_wait_restart(struct restart_block *restart)
tp = &t;
}
restart->fn = do_no_restart_syscall;
- if (restart->futex.flags & FLAGS_SHARED)
- fshared = 1;
- return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
- restart->futex.bitset,
- restart->futex.flags & FLAGS_CLOCKRT);
+
+ return (long)futex_wait(uaddr, restart->futex.flags,
+ restart->futex.val, tp, restart->futex.bitset);
}
@@ -1901,12 +2278,12 @@ static long futex_wait_restart(struct restart_block *restart)
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, int fshared,
- int detect, ktime_t *time, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+ ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct futex_hash_bucket *hb;
- struct futex_q q;
+ struct futex_q q = futex_q_init;
int res, ret;
if (refill_pi_state_cache())
@@ -1920,12 +2297,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
hrtimer_set_expires(&to->timer, *time);
}
- q.pi_state = NULL;
- q.rt_waiter = NULL;
- q.requeue_pi_key = NULL;
retry:
- q.key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q.key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
@@ -1946,8 +2319,8 @@ retry_private:
* Task is exiting and we just wait for the
* exit to complete.
*/
- queue_unlock(&q, hb);
- put_futex_key(fshared, &q.key);
+ queue_unlock(hb);
+ put_futex_key(&q.key);
cond_resched();
goto retry;
default:
@@ -1977,7 +2350,7 @@ retry_private:
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
- res = fixup_owner(uaddr, fshared, &q, !ret);
+ res = fixup_owner(uaddr, &q, !ret);
/*
* If fixup_owner() returned an error, proprogate that. If it acquired
* the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1998,26 +2371,26 @@ retry_private:
goto out_put_key;
out_unlock_put_key:
- queue_unlock(&q, hb);
+ queue_unlock(hb);
out_put_key:
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
out:
if (to)
destroy_hrtimer_on_stack(&to->timer);
return ret != -EINTR ? ret : -ERESTARTNOINTR;
uaddr_faulted:
- queue_unlock(&q, hb);
+ queue_unlock(hb);
ret = fault_in_user_writeable(uaddr);
if (ret)
goto out_put_key;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
goto retry;
}
@@ -2026,13 +2399,12 @@ uaddr_faulted:
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
-static int futex_unlock_pi(u32 __user *uaddr, int fshared)
+static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- u32 uval;
- struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
+ u32 uval, vpid = task_pid_vnr(current);
int ret;
retry:
@@ -2041,10 +2413,10 @@ retry:
/*
* We release only a lock we actually own:
*/
- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
+ if ((uval & FUTEX_TID_MASK) != vpid)
return -EPERM;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
@@ -2054,28 +2426,24 @@ retry:
/*
* To avoid races, try to do the TID -> 0 atomic transition
* again. If it succeeds then we can return without waking
- * anyone else up:
+ * anyone else up. We only try this if neither the waiters nor
+ * the owner died bit are set.
*/
- if (!(uval & FUTEX_OWNER_DIED))
- uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
-
-
- if (unlikely(uval == -EFAULT))
+ if (!(uval & ~FUTEX_TID_MASK) &&
+ cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
goto pi_faulted;
/*
* Rare case: we managed to release the lock atomically,
* no need to wake anyone else up:
*/
- if (unlikely(uval == task_pid_vnr(current)))
+ if (unlikely(uval == vpid))
goto out_unlock;
/*
* Ok, other tasks may need to be woken up - check waiters
* and do the wakeup if necessary:
*/
- head = &hb->chain;
-
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
if (!match_futex (&this->key, &key))
continue;
ret = wake_futex_pi(uaddr, uval, this);
@@ -2091,22 +2459,20 @@ retry:
/*
* No waiters - kernel unlocks the futex:
*/
- if (!(uval & FUTEX_OWNER_DIED)) {
- ret = unlock_futex_pi(uaddr, uval);
- if (ret == -EFAULT)
- goto pi_faulted;
- }
+ ret = unlock_futex_pi(uaddr, uval);
+ if (ret == -EFAULT)
+ goto pi_faulted;
out_unlock:
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
out:
return ret;
pi_faulted:
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
ret = fault_in_user_writeable(uaddr);
if (!ret)
@@ -2127,9 +2493,9 @@ pi_faulted:
* the wakeup and return the appropriate error code to the caller. Must be
* called with the hb lock held.
*
- * Returns
- * 0 - no early wakeup detected
- * <0 - -ETIMEDOUT or -ERESTARTNOINTR
+ * Return:
+ * 0 = no early wakeup detected;
+ * <0 = -ETIMEDOUT or -ERESTARTNOINTR
*/
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2151,7 +2517,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* We were woken prior to requeue by a timeout or a signal.
* Unqueue the futex_q and determine which it was.
*/
- plist_del(&q->list, &q->list.plist);
+ plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
/* Handle spurious wakeups gracefully */
ret = -EWOULDBLOCK;
@@ -2166,23 +2533,22 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
/**
* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
* @uaddr: the futex we initially wait on (non-pi)
- * @fshared: whether the futexes are shared (1) or not (0). They must be
+ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
* the same type, no requeueing from private to shared, etc.
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
- * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
* @uaddr2: the pi futex we will take prior to returning to user-space
*
* The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
- * complete the acquisition of the rt_mutex prior to returning to userspace.
- * This ensures the rt_mutex maintains an owner when it has waiters; without
- * one, the pi logic wouldn't know which task to boost/deboost, if there was a
- * need to.
+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
+ * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
+ * without one, the pi logic would not know which task to boost/deboost, if
+ * there was a need to.
*
* We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following:
+ * via the following--
* 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
* 2) wakeup on uaddr2 after a requeue
* 3) signal
@@ -2200,29 +2566,33 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
*
* If 4 or 7, we cleanup and return with -ETIMEDOUT.
*
- * Returns:
- * 0 - On success
+ * Return:
+ * 0 - On success;
* <0 - On error
*/
-static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 val, ktime_t *abs_time, u32 bitset,
- int clockrt, u32 __user *uaddr2)
+ u32 __user *uaddr2)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct rt_mutex_waiter rt_waiter;
struct rt_mutex *pi_mutex = NULL;
struct futex_hash_bucket *hb;
- union futex_key key2;
- struct futex_q q;
+ union futex_key key2 = FUTEX_KEY_INIT;
+ struct futex_q q = futex_q_init;
int res, ret;
+ if (uaddr == uaddr2)
+ return -EINVAL;
+
if (!bitset)
return -EINVAL;
if (abs_time) {
to = &timeout;
- hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
hrtimer_set_expires_range_ns(&to->timer, *abs_time,
current->timer_slack_ns);
@@ -2233,23 +2603,35 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
* code while we sleep on uaddr.
*/
debug_rt_mutex_init_waiter(&rt_waiter);
+ RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+ RB_CLEAR_NODE(&rt_waiter.tree_entry);
rt_waiter.task = NULL;
- key2 = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
- q.pi_state = NULL;
q.bitset = bitset;
q.rt_waiter = &rt_waiter;
q.requeue_pi_key = &key2;
- /* Prepare to wait on uaddr. */
- ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ /*
+ * Prepare to wait on uaddr. On success, increments q.key (key1) ref
+ * count.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
goto out_key2;
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (match_futex(&q.key, &key2)) {
+ ret = -EINVAL;
+ goto out_put_keys;
+ }
+
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);
@@ -2263,7 +2645,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
* In order for us to be here, we know our q.key == key2, and since
* we took the hb->lock above, we also know that futex_requeue() has
* completed and we no longer have to concern ourselves with a wakeup
- * race with the atomic proxy lock acquition by the requeue code.
+ * race with the atomic proxy lock acquisition by the requeue code. The
+ * futex_requeue dropped our key1 reference and incremented our key2
+ * reference count.
*/
/* Check if the requeue code acquired the second futex for us. */
@@ -2274,8 +2658,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
*/
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
- ret = fixup_pi_state_owner(uaddr2, &q, current,
- fshared);
+ ret = fixup_pi_state_owner(uaddr2, &q, current);
spin_unlock(q.lock_ptr);
}
} else {
@@ -2284,7 +2667,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
* signal. futex_unlock_pi() will not destroy the lock_ptr nor
* the pi_state.
*/
- WARN_ON(!&q.pi_state);
+ WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2294,7 +2677,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
- res = fixup_owner(uaddr2, fshared, &q, !ret);
+ res = fixup_owner(uaddr2, &q, !ret);
/*
* If fixup_owner() returned an error, proprogate that. If it
* acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2311,7 +2694,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
* fault, unlock the rt_mutex and return the fault to userspace.
*/
if (ret == -EFAULT) {
- if (rt_mutex_owner(pi_mutex) == current)
+ if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
rt_mutex_unlock(pi_mutex);
} else if (ret == -EINTR) {
/*
@@ -2325,9 +2708,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
}
out_put_keys:
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
out_key2:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out:
if (to) {
@@ -2385,31 +2768,29 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
{
struct robust_list_head __user *head;
unsigned long ret;
- const struct cred *cred = current_cred(), *pcred;
+ struct task_struct *p;
if (!futex_cmpxchg_enabled)
return -ENOSYS;
+ rcu_read_lock();
+
+ ret = -ESRCH;
if (!pid)
- head = current->robust_list;
+ p = current;
else {
- struct task_struct *p;
-
- ret = -ESRCH;
- rcu_read_lock();
p = find_task_by_vpid(pid);
if (!p)
goto err_unlock;
- ret = -EPERM;
- pcred = __task_cred(p);
- if (cred->euid != pcred->euid &&
- cred->euid != pcred->uid &&
- !capable(CAP_SYS_PTRACE))
- goto err_unlock;
- head = p->robust_list;
- rcu_read_unlock();
}
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ goto err_unlock;
+
+ head = p->robust_list;
+ rcu_read_unlock();
+
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(head, head_ptr);
@@ -2426,7 +2807,7 @@ err_unlock:
*/
int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
- u32 uval, nval, mval;
+ u32 uval, uninitialized_var(nval), mval;
retry:
if (get_user(uval, uaddr))
@@ -2444,11 +2825,20 @@ retry:
* userspace.
*/
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
- nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
-
- if (nval == -EFAULT)
- return -1;
-
+ /*
+ * We are not holding a lock here, but we want to have
+ * the pagefault_disable/enable() protection because
+ * we want to handle the fault gracefully. If the
+ * access fails we try to fault in the futex with R/W
+ * verification via get_user_pages. get_user() above
+ * does not guarantee R/W access. If that fails we
+ * give up and leave the futex locked.
+ */
+ if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+ if (fault_in_user_writeable(uaddr))
+ return -1;
+ goto retry;
+ }
if (nval != uval)
goto retry;
@@ -2467,7 +2857,7 @@ retry:
*/
static inline int fetch_robust_entry(struct robust_list __user **entry,
struct robust_list __user * __user *head,
- int *pi)
+ unsigned int *pi)
{
unsigned long uentry;
@@ -2490,7 +2880,8 @@ void exit_robust_list(struct task_struct *curr)
{
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int uninitialized_var(next_pi);
unsigned long futex_offset;
int rc;
@@ -2551,63 +2942,57 @@ void exit_robust_list(struct task_struct *curr)
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
- int clockrt, ret = -ENOSYS;
int cmd = op & FUTEX_CMD_MASK;
- int fshared = 0;
+ unsigned int flags = 0;
if (!(op & FUTEX_PRIVATE_FLAG))
- fshared = 1;
+ flags |= FLAGS_SHARED;
- clockrt = op & FUTEX_CLOCK_REALTIME;
- if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
- return -ENOSYS;
+ if (op & FUTEX_CLOCK_REALTIME) {
+ flags |= FLAGS_CLOCKRT;
+ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+ return -ENOSYS;
+ }
+
+ switch (cmd) {
+ case FUTEX_LOCK_PI:
+ case FUTEX_UNLOCK_PI:
+ case FUTEX_TRYLOCK_PI:
+ case FUTEX_WAIT_REQUEUE_PI:
+ case FUTEX_CMP_REQUEUE_PI:
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+ }
switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAIT_BITSET:
- ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
- break;
+ return futex_wait(uaddr, flags, val, timeout, val3);
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAKE_BITSET:
- ret = futex_wake(uaddr, fshared, val, val3);
- break;
+ return futex_wake(uaddr, flags, val, val3);
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
- break;
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
- 0);
- break;
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
case FUTEX_WAKE_OP:
- ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
- break;
+ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
case FUTEX_LOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
- break;
+ return futex_lock_pi(uaddr, flags, val, timeout, 0);
case FUTEX_UNLOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_unlock_pi(uaddr, fshared);
- break;
+ return futex_unlock_pi(uaddr, flags);
case FUTEX_TRYLOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
- break;
+ return futex_lock_pi(uaddr, flags, 0, timeout, 1);
case FUTEX_WAIT_REQUEUE_PI:
val3 = FUTEX_BITSET_MATCH_ANY;
- ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
- clockrt, uaddr2);
- break;
+ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+ uaddr2);
case FUTEX_CMP_REQUEUE_PI:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
- 1);
- break;
- default:
- ret = -ENOSYS;
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
}
- return ret;
+ return -ENOSYS;
}
@@ -2644,10 +3029,10 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
-static int __init futex_init(void)
+static void __init futex_detect_cmpxchg(void)
{
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
u32 curval;
- int i;
/*
* This will fail and we want it. Some arch implementations do
@@ -2656,15 +3041,37 @@ static int __init futex_init(void)
* of the complex code paths. Also we want to prevent
* registration of robust lists in that case. NULL is
* guaranteed to fault and we get -EFAULT on functional
- * implementation, the non functional ones will return
+ * implementation, the non-functional ones will return
* -ENOSYS.
*/
- curval = cmpxchg_futex_value_locked(NULL, 0, 0);
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
+ unsigned int futex_shift;
+ unsigned long i;
+
+#if CONFIG_BASE_SMALL
+ futex_hashsize = 16;
+#else
+ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+ futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+ futex_hashsize, 0,
+ futex_hashsize < 256 ? HASH_SMALL : 0,
+ &futex_shift, NULL,
+ futex_hashsize, futex_hashsize);
+ futex_hashsize = 1UL << futex_shift;
+
+ futex_detect_cmpxchg();
- for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
- plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
+ for (i = 0; i < futex_hashsize; i++) {
+ atomic_set(&futex_queues[i].waiters, 0);
+ plist_head_init(&futex_queues[i].chain);
spin_lock_init(&futex_queues[i].lock);
}
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 235716556bf..55c8c9349cf 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -10,6 +10,8 @@
#include <linux/compat.h>
#include <linux/nsproxy.h>
#include <linux/futex.h>
+#include <linux/ptrace.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
@@ -19,7 +21,7 @@
*/
static inline int
fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
- compat_uptr_t __user *head, int *pi)
+ compat_uptr_t __user *head, unsigned int *pi)
{
if (get_user(*uentry, head))
return -EFAULT;
@@ -49,7 +51,8 @@ void compat_exit_robust_list(struct task_struct *curr)
{
struct compat_robust_list_head __user *head = curr->compat_robust_list;
struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int uninitialized_var(next_pi);
compat_uptr_t uentry, next_uentry, upending;
compat_long_t futex_offset;
int rc;
@@ -114,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr)
}
}
-asmlinkage long
-compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
- compat_size_t len)
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
{
if (!futex_cmpxchg_enabled)
return -ENOSYS;
@@ -129,50 +132,48 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
return 0;
}
-asmlinkage long
-compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
- compat_size_t __user *len_ptr)
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
{
struct compat_robust_list_head __user *head;
unsigned long ret;
- const struct cred *cred = current_cred(), *pcred;
+ struct task_struct *p;
if (!futex_cmpxchg_enabled)
return -ENOSYS;
+ rcu_read_lock();
+
+ ret = -ESRCH;
if (!pid)
- head = current->compat_robust_list;
+ p = current;
else {
- struct task_struct *p;
-
- ret = -ESRCH;
- read_lock(&tasklist_lock);
p = find_task_by_vpid(pid);
if (!p)
goto err_unlock;
- ret = -EPERM;
- pcred = __task_cred(p);
- if (cred->euid != pcred->euid &&
- cred->euid != pcred->uid &&
- !capable(CAP_SYS_PTRACE))
- goto err_unlock;
- head = p->compat_robust_list;
- read_unlock(&tasklist_lock);
}
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ goto err_unlock;
+
+ head = p->compat_robust_list;
+ rcu_read_unlock();
+
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(ptr_to_compat(head), head_ptr);
err_unlock:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return ret;
}
-asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
- struct compat_timespec __user *utime, u32 __user *uaddr2,
- u32 val3)
+COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ struct compat_timespec __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
{
struct timespec ts;
ktime_t t, *tp = NULL;
@@ -182,7 +183,7 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET ||
cmd == FUTEX_WAIT_REQUEUE_PI)) {
- if (get_compat_timespec(&ts, utime))
+ if (compat_get_timespec(&ts, utime))
return -EFAULT;
if (!timespec_valid(&ts))
return -EINVAL;
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 70a298d6da7..d04ce8ac439 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -2,7 +2,8 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
- depends on DEBUG_FS && CONSTRUCTORS
+ depends on DEBUG_FS
+ select CONSTRUCTORS if !UML
default n
---help---
This option enables gcov-based code profiling (e.g. for code coverage
@@ -34,7 +35,7 @@ config GCOV_KERNEL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
+ depends on SUPERH || S390 || X86 || PPC || MICROBLAZE
default n
---help---
This options activates profiling for the entire kernel.
@@ -45,4 +46,34 @@ config GCOV_PROFILE_ALL
larger and run slower. Also be sure to exclude files from profiling
which are not linked to the kernel image to prevent linker errors.
+choice
+ prompt "Specify GCOV format"
+ depends on GCOV_KERNEL
+ default GCOV_FORMAT_AUTODETECT
+ ---help---
+ The gcov format is usually determined by the GCC version, but there are
+ exceptions where format changes are integrated in lower-version GCCs.
+ In such a case use this option to adjust the format used in the kernel
+ accordingly.
+
+ If unsure, choose "Autodetect".
+
+config GCOV_FORMAT_AUTODETECT
+ bool "Autodetect"
+ ---help---
+ Select this option to use the format that corresponds to your GCC
+ version.
+
+config GCOV_FORMAT_3_4
+ bool "GCC 3.4 format"
+ ---help---
+ Select this option to use the format defined by GCC 3.4.
+
+config GCOV_FORMAT_4_7
+ bool "GCC 4.7 format"
+ ---help---
+ Select this option to use the format defined by GCC 4.7.
+
+endchoice
+
endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 3f761001d51..52aa7e8de92 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,33 @@
-EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
+ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
-obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
+# if-lt
+# Usage VAR := $(call if-lt, $(a), $(b))
+# Returns 1 if (a < b)
+if-lt = $(shell [ $(1) -lt $(2) ] && echo 1)
+
+ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
+ cc-ver := 0304
+else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
+ cc-ver := 0407
+else
+# Use cc-version if available, otherwise set 0
+#
+# scripts/Kbuild.include, which contains cc-version function, is not included
+# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov"
+# Meaning cc-ver is empty causing if-lt test to fail with
+# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage.
+# This has no affect on the clean phase, but the error message could be
+# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version
+# is not available. We can probably move if-lt to Kbuild.include, so it's also
+# not defined during clean or to include Kbuild.include in
+# scripts/Makefile.clean. But the following workaround seems least invasive.
+ cc-ver := $(if $(call cc-version),$(call cc-version),0)
+endif
+
+obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
+
+ifeq ($(call if-lt, $(cc-ver), 0407),1)
+ obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
+else
+ obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
+endif
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 9b22d03cc58..b358a802fd1 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -20,7 +20,6 @@
#include <linux/mutex.h>
#include "gcov.h"
-static struct gcov_info *gcov_info_head;
static int gcov_events_enabled;
static DEFINE_MUTEX(gcov_lock);
@@ -34,7 +33,7 @@ void __gcov_init(struct gcov_info *info)
mutex_lock(&gcov_lock);
if (gcov_version == 0) {
- gcov_version = info->version;
+ gcov_version = gcov_info_version(info);
/*
* Printing gcc's version magic may prove useful for debugging
* incompatibility reports.
@@ -45,8 +44,7 @@ void __gcov_init(struct gcov_info *info)
* Add new profiling data structure to list and inform event
* listener.
*/
- info->next = gcov_info_head;
- gcov_info_head = info;
+ gcov_info_link(info);
if (gcov_events_enabled)
gcov_event(GCOV_ADD, info);
mutex_unlock(&gcov_lock);
@@ -81,6 +79,18 @@ void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
}
EXPORT_SYMBOL(__gcov_merge_delta);
+void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_ior);
+
+void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_time_profile);
+
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
@@ -91,13 +101,15 @@ EXPORT_SYMBOL(__gcov_merge_delta);
*/
void gcov_enable_events(void)
{
- struct gcov_info *info;
+ struct gcov_info *info = NULL;
mutex_lock(&gcov_lock);
gcov_events_enabled = 1;
+
/* Perform event callback for previously registered entries. */
- for (info = gcov_info_head; info; info = info->next)
+ while ((info = gcov_info_next(info)))
gcov_event(GCOV_ADD, info);
+
mutex_unlock(&gcov_lock);
}
@@ -112,25 +124,23 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
void *data)
{
struct module *mod = data;
- struct gcov_info *info;
- struct gcov_info *prev;
+ struct gcov_info *info = NULL;
+ struct gcov_info *prev = NULL;
if (event != MODULE_STATE_GOING)
return NOTIFY_OK;
mutex_lock(&gcov_lock);
- prev = NULL;
+
/* Remove entries located in module from linked list. */
- for (info = gcov_info_head; info; info = info->next) {
+ while ((info = gcov_info_next(info))) {
if (within(info, mod->module_core, mod->core_size)) {
- if (prev)
- prev->next = info->next;
- else
- gcov_info_head = info->next;
+ gcov_info_unlink(prev, info);
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
} else
prev = info;
}
+
mutex_unlock(&gcov_lock);
return NOTIFY_OK;
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index ef3c3f88a7a..15ff01a7637 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -33,10 +33,11 @@
* @children: child nodes
* @all: list head for list of all nodes
* @parent: parent node
- * @info: associated profiling data structure if not a directory
- * @ghost: when an object file containing profiling data is unloaded we keep a
- * copy of the profiling data here to allow collecting coverage data
- * for cleanup code. Such a node is called a "ghost".
+ * @loaded_info: array of pointers to profiling data sets for loaded object
+ * files.
+ * @num_loaded: number of profiling data sets for loaded object files.
+ * @unloaded_info: accumulated copy of profiling data sets for unloaded
+ * object files. Used only when gcov_persist=1.
* @dentry: main debugfs entry, either a directory or data file
* @links: associated symbolic links
* @name: data file basename
@@ -51,10 +52,11 @@ struct gcov_node {
struct list_head children;
struct list_head all;
struct gcov_node *parent;
- struct gcov_info *info;
- struct gcov_info *ghost;
+ struct gcov_info **loaded_info;
+ struct gcov_info *unloaded_info;
struct dentry *dentry;
struct dentry **links;
+ int num_loaded;
char name[0];
};
@@ -72,8 +74,8 @@ static int __init gcov_persist_setup(char *str)
{
unsigned long val;
- if (strict_strtoul(str, 0, &val)) {
- pr_warning("invalid gcov_persist parameter '%s'\n", str);
+ if (kstrtoul(str, 0, &val)) {
+ pr_warn("invalid gcov_persist parameter '%s'\n", str);
return 0;
}
gcov_persist = val;
@@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = {
};
/*
- * Return the profiling data set for a given node. This can either be the
- * original profiling data structure or a duplicate (also called "ghost")
- * in case the associated object file has been unloaded.
+ * Return a profiling data set associated with the given node. This is
+ * either a data set for a loaded object file or a data set copy in case
+ * all associated object files have been unloaded.
*/
static struct gcov_info *get_node_info(struct gcov_node *node)
{
- if (node->info)
- return node->info;
+ if (node->num_loaded > 0)
+ return node->loaded_info[0];
+
+ return node->unloaded_info;
+}
+
+/*
+ * Return a newly allocated profiling data set which contains the sum of
+ * all profiling data associated with the given node.
+ */
+static struct gcov_info *get_accumulated_info(struct gcov_node *node)
+{
+ struct gcov_info *info;
+ int i = 0;
- return node->ghost;
+ if (node->unloaded_info)
+ info = gcov_info_dup(node->unloaded_info);
+ else
+ info = gcov_info_dup(node->loaded_info[i++]);
+ if (!info)
+ return NULL;
+ for (; i < node->num_loaded; i++)
+ gcov_info_add(info, node->loaded_info[i]);
+
+ return info;
}
/*
@@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file)
mutex_lock(&node_lock);
/*
* Read from a profiling data copy to minimize reference tracking
- * complexity and concurrent access.
+ * complexity and concurrent access and to keep accumulating multiple
+ * profiling data sets associated with one node simple.
*/
- info = gcov_info_dup(get_node_info(node));
+ info = get_accumulated_info(node);
if (!info)
goto out_unlock;
iter = gcov_iter_new(info);
@@ -218,19 +242,32 @@ static struct gcov_node *get_node_by_name(const char *name)
list_for_each_entry(node, &all_head, all) {
info = get_node_info(node);
- if (info && (strcmp(info->filename, name) == 0))
+ if (info && (strcmp(gcov_info_filename(info), name) == 0))
return node;
}
return NULL;
}
+/*
+ * Reset all profiling data associated with the specified node.
+ */
+static void reset_node(struct gcov_node *node)
+{
+ int i;
+
+ if (node->unloaded_info)
+ gcov_info_reset(node->unloaded_info);
+ for (i = 0; i < node->num_loaded; i++)
+ gcov_info_reset(node->loaded_info[i]);
+}
+
static void remove_node(struct gcov_node *node);
/*
* write() implementation for gcov data files. Reset profiling data for the
- * associated file. If the object file has been unloaded (i.e. this is
- * a "ghost" node), remove the debug fs node as well.
+ * corresponding file. If all associated object files have been unloaded,
+ * remove the debug fs node as well.
*/
static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
size_t len, loff_t *pos)
@@ -242,13 +279,13 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
seq = file->private_data;
info = gcov_iter_get_info(seq->private);
mutex_lock(&node_lock);
- node = get_node_by_name(info->filename);
+ node = get_node_by_name(gcov_info_filename(info));
if (node) {
/* Reset counts or remove node for unloaded modules. */
- if (node->ghost)
+ if (node->num_loaded == 0)
remove_node(node);
else
- gcov_info_reset(node->info);
+ reset_node(node);
}
/* Reset counts for open file. */
gcov_info_reset(info);
@@ -328,7 +365,7 @@ static const char *deskew(const char *basename)
*/
static void add_links(struct gcov_node *node, struct dentry *parent)
{
- char *basename;
+ const char *basename;
char *target;
int num;
int i;
@@ -339,14 +376,14 @@ static void add_links(struct gcov_node *node, struct dentry *parent)
if (!node->links)
return;
for (i = 0; i < num; i++) {
- target = get_link_target(get_node_info(node)->filename,
- &gcov_link[i]);
+ target = get_link_target(
+ gcov_info_filename(get_node_info(node)),
+ &gcov_link[i]);
if (!target)
goto out_err;
- basename = strrchr(target, '/');
- if (!basename)
+ basename = kbasename(target);
+ if (basename == target)
goto out_err;
- basename++;
node->links[i] = debugfs_create_symlink(deskew(basename),
parent, target);
if (!node->links[i])
@@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info,
INIT_LIST_HEAD(&node->list);
INIT_LIST_HEAD(&node->children);
INIT_LIST_HEAD(&node->all);
- node->info = info;
+ if (node->loaded_info) {
+ node->loaded_info[0] = info;
+ node->num_loaded = 1;
+ }
node->parent = parent;
if (name)
strcpy(node->name, name);
@@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent,
struct gcov_node *node;
node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
- if (!node) {
- pr_warning("out of memory\n");
- return NULL;
+ if (!node)
+ goto err_nomem;
+ if (info) {
+ node->loaded_info = kcalloc(1, sizeof(struct gcov_info *),
+ GFP_KERNEL);
+ if (!node->loaded_info)
+ goto err_nomem;
}
init_node(node, info, name, parent);
/* Differentiate between gcov data file nodes and directory nodes. */
@@ -406,7 +450,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,
} else
node->dentry = debugfs_create_dir(node->name, parent->dentry);
if (!node->dentry) {
- pr_warning("could not create file\n");
+ pr_warn("could not create file\n");
kfree(node);
return NULL;
}
@@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent,
list_add(&node->all, &all_head);
return node;
+
+err_nomem:
+ kfree(node);
+ pr_warn("out of memory\n");
+ return NULL;
}
/* Remove symbolic links associated with node. */
@@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node)
list_del(&node->all);
debugfs_remove(node->dentry);
remove_links(node);
- if (node->ghost)
- gcov_info_free(node->ghost);
+ kfree(node->loaded_info);
+ if (node->unloaded_info)
+ gcov_info_free(node->unloaded_info);
kfree(node);
}
@@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent,
/*
* write() implementation for reset file. Reset all profiling data to zero
- * and remove ghost nodes.
+ * and remove nodes for which all associated object files are unloaded.
*/
static ssize_t reset_write(struct file *file, const char __user *addr,
size_t len, loff_t *pos)
@@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr,
mutex_lock(&node_lock);
restart:
list_for_each_entry(node, &all_head, all) {
- if (node->info)
- gcov_info_reset(node->info);
+ if (node->num_loaded > 0)
+ reset_node(node);
else if (list_empty(&node->children)) {
remove_node(node);
/* Several nodes may have gone - restart loop. */
@@ -511,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
static const struct file_operations gcov_reset_fops = {
.write = reset_write,
.read = reset_read,
+ .llseek = noop_llseek,
};
/*
@@ -525,7 +576,7 @@ static void add_node(struct gcov_info *info)
struct gcov_node *parent;
struct gcov_node *node;
- filename = kstrdup(info->filename, GFP_KERNEL);
+ filename = kstrdup(gcov_info_filename(info), GFP_KERNEL);
if (!filename)
return;
parent = &root_node;
@@ -564,37 +615,117 @@ err_remove:
}
/*
- * The profiling data set associated with this node is being unloaded. Store a
- * copy of the profiling data and turn this node into a "ghost".
+ * Associate a profiling data set with an existing node. Needs to be called
+ * with node_lock held.
*/
-static int ghost_node(struct gcov_node *node)
+static void add_info(struct gcov_node *node, struct gcov_info *info)
{
- node->ghost = gcov_info_dup(node->info);
- if (!node->ghost) {
- pr_warning("could not save data for '%s' (out of memory)\n",
- node->info->filename);
- return -ENOMEM;
+ struct gcov_info **loaded_info;
+ int num = node->num_loaded;
+
+ /*
+ * Prepare new array. This is done first to simplify cleanup in
+ * case the new data set is incompatible, the node only contains
+ * unloaded data sets and there's not enough memory for the array.
+ */
+ loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
+ if (!loaded_info) {
+ pr_warn("could not add '%s' (out of memory)\n",
+ gcov_info_filename(info));
+ return;
+ }
+ memcpy(loaded_info, node->loaded_info,
+ num * sizeof(struct gcov_info *));
+ loaded_info[num] = info;
+ /* Check if the new data set is compatible. */
+ if (num == 0) {
+ /*
+ * A module was unloaded, modified and reloaded. The new
+ * data set replaces the copy of the last one.
+ */
+ if (!gcov_info_is_compatible(node->unloaded_info, info)) {
+ pr_warn("discarding saved data for %s "
+ "(incompatible version)\n",
+ gcov_info_filename(info));
+ gcov_info_free(node->unloaded_info);
+ node->unloaded_info = NULL;
+ }
+ } else {
+ /*
+ * Two different versions of the same object file are loaded.
+ * The initial one takes precedence.
+ */
+ if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
+ pr_warn("could not add '%s' (incompatible "
+ "version)\n", gcov_info_filename(info));
+ kfree(loaded_info);
+ return;
+ }
}
- node->info = NULL;
+ /* Overwrite previous array. */
+ kfree(node->loaded_info);
+ node->loaded_info = loaded_info;
+ node->num_loaded = num + 1;
+}
- return 0;
+/*
+ * Return the index of a profiling data set associated with a node.
+ */
+static int get_info_index(struct gcov_node *node, struct gcov_info *info)
+{
+ int i;
+
+ for (i = 0; i < node->num_loaded; i++) {
+ if (node->loaded_info[i] == info)
+ return i;
+ }
+ return -ENOENT;
}
/*
- * Profiling data for this node has been loaded again. Add profiling data
- * from previous instantiation and turn this node into a regular node.
+ * Save the data of a profiling data set which is being unloaded.
*/
-static void revive_node(struct gcov_node *node, struct gcov_info *info)
+static void save_info(struct gcov_node *node, struct gcov_info *info)
{
- if (gcov_info_is_compatible(node->ghost, info))
- gcov_info_add(info, node->ghost);
+ if (node->unloaded_info)
+ gcov_info_add(node->unloaded_info, info);
else {
- pr_warning("discarding saved data for '%s' (version changed)\n",
- info->filename);
+ node->unloaded_info = gcov_info_dup(info);
+ if (!node->unloaded_info) {
+ pr_warn("could not save data for '%s' "
+ "(out of memory)\n",
+ gcov_info_filename(info));
+ }
+ }
+}
+
+/*
+ * Disassociate a profiling data set from a node. Needs to be called with
+ * node_lock held.
+ */
+static void remove_info(struct gcov_node *node, struct gcov_info *info)
+{
+ int i;
+
+ i = get_info_index(node, info);
+ if (i < 0) {
+ pr_warn("could not remove '%s' (not found)\n",
+ gcov_info_filename(info));
+ return;
}
- gcov_info_free(node->ghost);
- node->ghost = NULL;
- node->info = info;
+ if (gcov_persist)
+ save_info(node, info);
+ /* Shrink array. */
+ node->loaded_info[i] = node->loaded_info[node->num_loaded - 1];
+ node->num_loaded--;
+ if (node->num_loaded > 0)
+ return;
+ /* Last loaded data set was removed. */
+ kfree(node->loaded_info);
+ node->loaded_info = NULL;
+ node->num_loaded = 0;
+ if (!node->unloaded_info)
+ remove_node(node);
}
/*
@@ -606,33 +737,21 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
struct gcov_node *node;
mutex_lock(&node_lock);
- node = get_node_by_name(info->filename);
+ node = get_node_by_name(gcov_info_filename(info));
switch (action) {
case GCOV_ADD:
- /* Add new node or revive ghost. */
- if (!node) {
+ if (node)
+ add_info(node, info);
+ else
add_node(info);
- break;
- }
- if (gcov_persist)
- revive_node(node, info);
- else {
- pr_warning("could not add '%s' (already exists)\n",
- info->filename);
- }
break;
case GCOV_REMOVE:
- /* Remove node or turn into ghost. */
- if (!node) {
- pr_warning("could not remove '%s' (not found)\n",
- info->filename);
- break;
- }
- if (gcov_persist) {
- if (!ghost_node(node))
- break;
+ if (node)
+ remove_info(node, info);
+ else {
+ pr_warn("could not remove '%s' (not found)\n",
+ gcov_info_filename(info));
}
- remove_node(node);
break;
}
mutex_unlock(&node_lock);
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index ae5bb426003..27bc88a3501 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -21,6 +21,121 @@
#include <linux/vmalloc.h>
#include "gcov.h"
+#define GCOV_COUNTERS 5
+
+static struct gcov_info *gcov_info_head;
+
+/**
+ * struct gcov_fn_info - profiling meta data per function
+ * @ident: object file-unique function identifier
+ * @checksum: function checksum
+ * @n_ctrs: number of values per counter type belonging to this function
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time.
+ */
+struct gcov_fn_info {
+ unsigned int ident;
+ unsigned int checksum;
+ unsigned int n_ctrs[0];
+};
+
+/**
+ * struct gcov_ctr_info - profiling data per counter type
+ * @num: number of counter values for this type
+ * @values: array of counter values for this type
+ * @merge: merge function for counter values of this type (unused)
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the values array.
+ */
+struct gcov_ctr_info {
+ unsigned int num;
+ gcov_type *values;
+ void (*merge)(gcov_type *, unsigned int);
+};
+
+/**
+ * struct gcov_info - profiling data per object file
+ * @version: gcov version magic indicating the gcc version used for compilation
+ * @next: list head for a singly-linked list
+ * @stamp: time stamp
+ * @filename: name of the associated gcov data file
+ * @n_functions: number of instrumented functions
+ * @functions: function data
+ * @ctr_mask: mask specifying which counter types are active
+ * @counts: counter data per counter type
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the next pointer.
+ */
+struct gcov_info {
+ unsigned int version;
+ struct gcov_info *next;
+ unsigned int stamp;
+ const char *filename;
+ unsigned int n_functions;
+ const struct gcov_fn_info *functions;
+ unsigned int ctr_mask;
+ struct gcov_ctr_info counts[0];
+};
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+ return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+ return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+ if (!info)
+ return gcov_info_head;
+
+ return info->next;
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+ info->next = gcov_info_head;
+ gcov_info_head = info;
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+ if (prev)
+ prev->next = info->next;
+ else
+ gcov_info_head = info->next;
+}
+
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
new file mode 100644
index 00000000000..826ba9fb5e3
--- /dev/null
+++ b/kernel/gcov/gcc_4_7.c
@@ -0,0 +1,565 @@
+/*
+ * This code provides functions to handle gcc's profiling data format
+ * introduced with gcc 4.7.
+ *
+ * This file is based heavily on gcc_3_4.c file.
+ *
+ * For a better understanding, refer to gcc source:
+ * gcc/gcov-io.h
+ * libgcc/libgcov.c
+ *
+ * Uses gcc-internal data definitions.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include "gcov.h"
+
+#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
+#define GCOV_COUNTERS 9
+#else
+#define GCOV_COUNTERS 8
+#endif
+
+#define GCOV_TAG_FUNCTION_LENGTH 3
+
+static struct gcov_info *gcov_info_head;
+
+/**
+ * struct gcov_ctr_info - information about counters for a single function
+ * @num: number of counter values for this type
+ * @values: array of counter values for this type
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the values array.
+ */
+struct gcov_ctr_info {
+ unsigned int num;
+ gcov_type *values;
+};
+
+/**
+ * struct gcov_fn_info - profiling meta data per function
+ * @key: comdat key
+ * @ident: unique ident of function
+ * @lineno_checksum: function lineo_checksum
+ * @cfg_checksum: function cfg checksum
+ * @ctrs: instrumented counters
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time.
+ *
+ * Information about a single function. This uses the trailing array
+ * idiom. The number of counters is determined from the merge pointer
+ * array in gcov_info. The key is used to detect which of a set of
+ * comdat functions was selected -- it points to the gcov_info object
+ * of the object file containing the selected comdat function.
+ */
+struct gcov_fn_info {
+ const struct gcov_info *key;
+ unsigned int ident;
+ unsigned int lineno_checksum;
+ unsigned int cfg_checksum;
+ struct gcov_ctr_info ctrs[0];
+};
+
+/**
+ * struct gcov_info - profiling data per object file
+ * @version: gcov version magic indicating the gcc version used for compilation
+ * @next: list head for a singly-linked list
+ * @stamp: uniquifying time stamp
+ * @filename: name of the associated gcov data file
+ * @merge: merge functions (null for unused counter type)
+ * @n_functions: number of instrumented functions
+ * @functions: pointer to pointers to function information
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the next pointer.
+ */
+struct gcov_info {
+ unsigned int version;
+ struct gcov_info *next;
+ unsigned int stamp;
+ const char *filename;
+ void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
+ unsigned int n_functions;
+ struct gcov_fn_info **functions;
+};
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+ return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+ return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+ if (!info)
+ return gcov_info_head;
+
+ return info->next;
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+ info->next = gcov_info_head;
+ gcov_info_head = info;
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+ if (prev)
+ prev->next = info->next;
+ else
+ gcov_info_head = info->next;
+}
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+ { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
+ { 0, NULL},
+};
+
+/*
+ * Determine whether a counter is active. Doesn't change at run-time.
+ */
+static int counter_active(struct gcov_info *info, unsigned int type)
+{
+ return info->merge[type] ? 1 : 0;
+}
+
+/* Determine number of active counters. Based on gcc magic. */
+static unsigned int num_counter_active(struct gcov_info *info)
+{
+ unsigned int i;
+ unsigned int result = 0;
+
+ for (i = 0; i < GCOV_COUNTERS; i++) {
+ if (counter_active(info, i))
+ result++;
+ }
+ return result;
+}
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+ struct gcov_ctr_info *ci_ptr;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ ci_ptr = info->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
+ if (!counter_active(info, ct_idx))
+ continue;
+
+ memset(ci_ptr->values, 0,
+ sizeof(gcov_type) * ci_ptr->num);
+ ci_ptr++;
+ }
+ }
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+ return (info1->stamp == info2->stamp);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dest: profiling data set to which data is added
+ * @source: profiling data set which is added
+ *
+ * Adds profiling counts of @source to @dest.
+ */
+void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
+{
+ struct gcov_ctr_info *dci_ptr;
+ struct gcov_ctr_info *sci_ptr;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+ unsigned int val_idx;
+
+ for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) {
+ dci_ptr = dst->functions[fi_idx]->ctrs;
+ sci_ptr = src->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
+ if (!counter_active(src, ct_idx))
+ continue;
+
+ for (val_idx = 0; val_idx < sci_ptr->num; val_idx++)
+ dci_ptr->values[val_idx] +=
+ sci_ptr->values[val_idx];
+
+ dci_ptr++;
+ sci_ptr++;
+ }
+ }
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+ struct gcov_info *dup;
+ struct gcov_ctr_info *dci_ptr; /* dst counter info */
+ struct gcov_ctr_info *sci_ptr; /* src counter info */
+ unsigned int active;
+ unsigned int fi_idx; /* function info idx */
+ unsigned int ct_idx; /* counter type idx */
+ size_t fi_size; /* function info size */
+ size_t cv_size; /* counter values size */
+
+ dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
+ if (!dup)
+ return NULL;
+
+ dup->next = NULL;
+ dup->filename = NULL;
+ dup->functions = NULL;
+
+ dup->filename = kstrdup(info->filename, GFP_KERNEL);
+ if (!dup->filename)
+ goto err_free;
+
+ dup->functions = kcalloc(info->n_functions,
+ sizeof(struct gcov_fn_info *), GFP_KERNEL);
+ if (!dup->functions)
+ goto err_free;
+
+ active = num_counter_active(info);
+ fi_size = sizeof(struct gcov_fn_info);
+ fi_size += sizeof(struct gcov_ctr_info) * active;
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL);
+ if (!dup->functions[fi_idx])
+ goto err_free;
+
+ *(dup->functions[fi_idx]) = *(info->functions[fi_idx]);
+
+ sci_ptr = info->functions[fi_idx]->ctrs;
+ dci_ptr = dup->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < active; ct_idx++) {
+
+ cv_size = sizeof(gcov_type) * sci_ptr->num;
+
+ dci_ptr->values = vmalloc(cv_size);
+
+ if (!dci_ptr->values)
+ goto err_free;
+
+ dci_ptr->num = sci_ptr->num;
+ memcpy(dci_ptr->values, sci_ptr->values, cv_size);
+
+ sci_ptr++;
+ dci_ptr++;
+ }
+ }
+
+ return dup;
+err_free:
+ gcov_info_free(dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+ unsigned int active;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+ struct gcov_ctr_info *ci_ptr;
+
+ if (!info->functions)
+ goto free_info;
+
+ active = num_counter_active(info);
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ if (!info->functions[fi_idx])
+ continue;
+
+ ci_ptr = info->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
+ vfree(ci_ptr->values);
+
+ kfree(info->functions[fi_idx]);
+ }
+
+free_info:
+ kfree(info->functions);
+ kfree(info->filename);
+ kfree(info);
+}
+
+#define ITER_STRIDE PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+ struct gcov_info *info;
+ void *buffer;
+ size_t size;
+ loff_t pos;
+};
+
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+ *data = v;
+ }
+
+ return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+
+ data[0] = (v & 0xffffffffUL);
+ data[1] = (v >> 32);
+ }
+
+ return sizeof(*data) * 2;
+}
+
+/**
+ * convert_to_gcda - convert profiling data set to gcda file format
+ * @buffer: the buffer to store file data or %NULL if no data should be stored
+ * @info: profiling data set to be converted
+ *
+ * Returns the number of bytes that were/would have been stored into the buffer.
+ */
+static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+{
+ struct gcov_fn_info *fi_ptr;
+ struct gcov_ctr_info *ci_ptr;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+ unsigned int cv_idx;
+ size_t pos = 0;
+
+ /* File header. */
+ pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
+ pos += store_gcov_u32(buffer, pos, info->version);
+ pos += store_gcov_u32(buffer, pos, info->stamp);
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ fi_ptr = info->functions[fi_idx];
+
+ /* Function record. */
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+
+ ci_ptr = fi_ptr->ctrs;
+
+ for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
+ if (!counter_active(info, ct_idx))
+ continue;
+
+ /* Counter record. */
+ pos += store_gcov_u32(buffer, pos,
+ GCOV_TAG_FOR_COUNTER(ct_idx));
+ pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
+
+ for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
+ pos += store_gcov_u64(buffer, pos,
+ ci_ptr->values[cv_idx]);
+ }
+
+ ci_ptr++;
+ }
+ }
+
+ return pos;
+}
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+ struct gcov_iterator *iter;
+
+ iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
+ if (!iter)
+ goto err_free;
+
+ iter->info = info;
+ /* Dry-run to get the actual buffer size. */
+ iter->size = convert_to_gcda(NULL, info);
+ iter->buffer = vmalloc(iter->size);
+ if (!iter->buffer)
+ goto err_free;
+
+ convert_to_gcda(iter->buffer, info);
+
+ return iter;
+
+err_free:
+ kfree(iter);
+ return NULL;
+}
+
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+void gcov_iter_free(struct gcov_iterator *iter)
+{
+ vfree(iter->buffer);
+ kfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+ return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+void gcov_iter_start(struct gcov_iterator *iter)
+{
+ iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+int gcov_iter_next(struct gcov_iterator *iter)
+{
+ if (iter->pos < iter->size)
+ iter->pos += ITER_STRIDE;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+ size_t len;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ len = ITER_STRIDE;
+ if (iter->pos + len > iter->size)
+ len = iter->size - iter->pos;
+
+ seq_write(seq, iter->buffer + iter->pos, len);
+
+ return 0;
+}
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h