aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore5
-rw-r--r--kernel/Makefile8
-rw-r--r--kernel/acct.c17
-rw-r--r--kernel/audit.c4
-rw-r--r--kernel/auditsc.c2
-rw-r--r--kernel/capability.c1
-rw-r--r--kernel/compat.c48
-rw-r--r--kernel/configs.c2
-rw-r--r--kernel/cpuset.c582
-rw-r--r--kernel/crash_dump.c61
-rw-r--r--kernel/exit.c63
-rw-r--r--kernel/fork.c50
-rw-r--r--kernel/futex.c7
-rw-r--r--kernel/hrtimer.c826
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/irq/proc.c6
-rw-r--r--kernel/itimer.c106
-rw-r--r--kernel/kexec.c21
-rw-r--r--kernel/kprobes.c137
-rw-r--r--kernel/ksysfs.c37
-rw-r--r--kernel/module.c60
-rw-r--r--kernel/mutex-debug.c462
-rw-r--r--kernel/mutex-debug.h134
-rw-r--r--kernel/mutex.c315
-rw-r--r--kernel/mutex.h35
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/pid.c22
-rw-r--r--kernel/posix-cpu-timers.c76
-rw-r--r--kernel/posix-timers.c895
-rw-r--r--kernel/power/Kconfig2
-rw-r--r--kernel/power/disk.c101
-rw-r--r--kernel/power/main.c4
-rw-r--r--kernel/power/power.h24
-rw-r--r--kernel/power/snapshot.c89
-rw-r--r--kernel/power/swsusp.c1020
-rw-r--r--kernel/printk.c8
-rw-r--r--kernel/ptrace.c78
-rw-r--r--kernel/rcupdate.c135
-rw-r--r--kernel/rcutorture.c99
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/sched.c561
-rw-r--r--kernel/signal.c168
-rw-r--r--kernel/stop_machine.c6
-rw-r--r--kernel/sys.c100
-rw-r--r--kernel/sys_ni.c24
-rw-r--r--kernel/sysctl.c77
-rw-r--r--kernel/time.c118
-rw-r--r--kernel/timer.c58
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/workqueue.c42
50 files changed, 4681 insertions, 2024 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
new file mode 100644
index 00000000000..f2ab70073bd
--- /dev/null
+++ b/kernel/.gitignore
@@ -0,0 +1,5 @@
+#
+# Generated files
+#
+config_data.h
+config_data.gz
diff --git a/kernel/Makefile b/kernel/Makefile
index 4f5a1453093..4ae0fbde815 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,15 +6,18 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
exit.o itimer.o time.o softirq.o resource.o \
sysctl.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o \
- rcupdate.o intermodule.o extable.o params.o posix-timers.o \
- kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o
+ rcupdate.o extable.o params.o posix-timers.o \
+ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+ hrtimer.o
+obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_PM) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
@@ -29,7 +32,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 6312d6bd43e..065d8b4e51e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -47,6 +47,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/acct.h>
+#include <linux/capability.h>
#include <linux/file.h>
#include <linux/tty.h>
#include <linux/security.h>
@@ -427,6 +428,7 @@ static void do_acct_process(long exitcode, struct file *file)
u64 elapsed;
u64 run_time;
struct timespec uptime;
+ unsigned long jiffies;
/*
* First check to see if there is enough free_space to continue
@@ -467,12 +469,12 @@ static void do_acct_process(long exitcode, struct file *file)
#endif
do_div(elapsed, AHZ);
ac.ac_btime = xtime.tv_sec - elapsed;
- ac.ac_utime = encode_comp_t(jiffies_to_AHZ(
- current->signal->utime +
- current->group_leader->utime));
- ac.ac_stime = encode_comp_t(jiffies_to_AHZ(
- current->signal->stime +
- current->group_leader->stime));
+ jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime,
+ current->signal->utime));
+ ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
+ jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime,
+ current->signal->stime));
+ ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
/* we really need to bite the bullet and change layout */
ac.ac_uid = current->uid;
ac.ac_gid = current->gid;
@@ -580,7 +582,8 @@ void acct_process(long exitcode)
void acct_update_integrals(struct task_struct *tsk)
{
if (likely(tsk->mm)) {
- long delta = tsk->stime - tsk->acct_stimexpd;
+ long delta =
+ cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
if (delta == 0)
return;
diff --git a/kernel/audit.c b/kernel/audit.c
index 32fa03ad198..0a813d2883e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -42,8 +42,8 @@
*/
#include <linux/init.h>
-#include <asm/atomic.h>
#include <asm/types.h>
+#include <asm/atomic.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/err.h>
@@ -267,7 +267,7 @@ static int audit_set_failure(int state, uid_t loginuid)
return old;
}
-int kauditd_thread(void *dummy)
+static int kauditd_thread(void *dummy)
{
struct sk_buff *skb;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d8a68509e72..685c25175d9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -30,8 +30,8 @@
*/
#include <linux/init.h>
-#include <asm/atomic.h>
#include <asm/types.h>
+#include <asm/atomic.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mount.h>
diff --git a/kernel/capability.c b/kernel/capability.c
index 8986a37a67e..bfa3c92e16f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,7 @@
* 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
*/
+#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/security.h>
diff --git a/kernel/compat.c b/kernel/compat.c
index 102296e21ea..1867290c37e 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -514,6 +514,24 @@ static int put_compat_itimerspec(struct compat_itimerspec __user *dst,
return 0;
}
+long compat_sys_timer_create(clockid_t which_clock,
+ struct compat_sigevent __user *timer_event_spec,
+ timer_t __user *created_timer_id)
+{
+ struct sigevent __user *event = NULL;
+
+ if (timer_event_spec) {
+ struct sigevent kevent;
+
+ event = compat_alloc_user_space(sizeof(*event));
+ if (get_compat_sigevent(&kevent, timer_event_spec) ||
+ copy_to_user(event, &kevent, sizeof(*event)))
+ return -EFAULT;
+ }
+
+ return sys_timer_create(which_clock, event, created_timer_id);
+}
+
long compat_sys_timer_settime(timer_t timer_id, int flags,
struct compat_itimerspec __user *new,
struct compat_itimerspec __user *old)
@@ -649,8 +667,6 @@ int get_compat_sigevent(struct sigevent *event,
? -EFAULT : 0;
}
-/* timer_create is architecture specific because it needs sigevent conversion */
-
long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask,
unsigned long bitmap_size)
{
@@ -855,3 +871,31 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
}
#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
+
+#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
+asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
+{
+ sigset_t newset;
+ compat_sigset_t newset32;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ sigset_from_compat(&newset, &newset32);
+ sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ spin_lock_irq(&current->sighand->siglock);
+ current->saved_sigmask = current->blocked;
+ current->blocked = newset;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ set_thread_flag(TIF_RESTORE_SIGMASK);
+ return -ERESTARTNOHAND;
+}
+#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
diff --git a/kernel/configs.c b/kernel/configs.c
index 986f7af31e0..009e1ebdcb8 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -3,7 +3,7 @@
* Echo the kernel .config file used to build the kernel
*
* Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
- * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org>
+ * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
* Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
* Copyright (C) 2002 Hewlett-Packard Company
*
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7430640f981..fe2f71f92ae 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -39,6 +39,7 @@
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
@@ -54,7 +55,23 @@
#include <asm/atomic.h>
#include <asm/semaphore.h>
-#define CPUSET_SUPER_MAGIC 0x27e0eb
+#define CPUSET_SUPER_MAGIC 0x27e0eb
+
+/*
+ * Tracks how many cpusets are currently defined in system.
+ * When there is only one cpuset (the root cpuset) we can
+ * short circuit some hooks.
+ */
+int number_of_cpusets __read_mostly;
+
+/* See "Frequency meter" comments, below. */
+
+struct fmeter {
+ int cnt; /* unprocessed events count */
+ int val; /* most recent output value */
+ time_t time; /* clock (secs) when val computed */
+ spinlock_t lock; /* guards read or write of above */
+};
struct cpuset {
unsigned long flags; /* "unsigned long" so bitops work */
@@ -80,13 +97,16 @@ struct cpuset {
* Copy of global cpuset_mems_generation as of the most
* recent time this cpuset changed its mems_allowed.
*/
- int mems_generation;
+ int mems_generation;
+
+ struct fmeter fmeter; /* memory_pressure filter */
};
/* bits in struct cpuset flags field */
typedef enum {
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
+ CS_MEMORY_MIGRATE,
CS_REMOVED,
CS_NOTIFY_ON_RELEASE
} cpuset_flagbits_t;
@@ -112,6 +132,11 @@ static inline int notify_on_release(const struct cpuset *cs)
return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
}
+static inline int is_memory_migrate(const struct cpuset *cs)
+{
+ return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
/*
* Increment this atomic integer everytime any cpuset changes its
* mems_allowed value. Users of cpusets can track this generation
@@ -137,13 +162,10 @@ static struct cpuset top_cpuset = {
.count = ATOMIC_INIT(0),
.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
.children = LIST_HEAD_INIT(top_cpuset.children),
- .parent = NULL,
- .dentry = NULL,
- .mems_generation = 0,
};
static struct vfsmount *cpuset_mount;
-static struct super_block *cpuset_sb = NULL;
+static struct super_block *cpuset_sb;
/*
* We have two global cpuset semaphores below. They can nest.
@@ -227,6 +249,11 @@ static struct super_block *cpuset_sb = NULL;
* a tasks cpuset pointer we use task_lock(), which acts on a spinlock
* (task->alloc_lock) already in the task_struct routinely used for
* such matters.
+ *
+ * P.S. One more locking exception. RCU is used to guard the
+ * update of a tasks cpuset pointer by attach_task() and the
+ * access of task->cpuset->mems_generation via that pointer in
+ * the routine cpuset_update_task_memory_state().
*/
static DECLARE_MUTEX(manage_sem);
@@ -304,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
spin_lock(&dcache_lock);
node = dentry->d_subdirs.next;
while (node != &dentry->d_subdirs) {
- struct dentry *d = list_entry(node, struct dentry, d_child);
+ struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
list_del_init(node);
if (d->d_inode) {
d = dget_locked(d);
@@ -316,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
}
node = dentry->d_subdirs.next;
}
- list_del_init(&dentry->d_child);
+ list_del_init(&dentry->d_u.d_child);
spin_unlock(&dcache_lock);
remove_dir(dentry);
}
@@ -570,20 +597,43 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
BUG_ON(!nodes_intersects(*pmask, node_online_map));
}
-/*
- * Refresh current tasks mems_allowed and mems_generation from current
- * tasks cpuset.
+/**
+ * cpuset_update_task_memory_state - update task memory placement
+ *
+ * If the current tasks cpusets mems_allowed changed behind our
+ * backs, update current->mems_allowed, mems_generation and task NUMA
+ * mempolicy to the new value.
*
- * Call without callback_sem or task_lock() held. May be called with
- * or without manage_sem held. Will acquire task_lock() and might
- * acquire callback_sem during call.
+ * Task mempolicy is updated by rebinding it relative to the
+ * current->cpuset if a task has its memory placement changed.
+ * Do not call this routine if in_interrupt().
*
- * The task_lock() is required to dereference current->cpuset safely.
- * Without it, we could pick up the pointer value of current->cpuset
- * in one instruction, and then attach_task could give us a different
- * cpuset, and then the cpuset we had could be removed and freed,
- * and then on our next instruction, we could dereference a no longer
- * valid cpuset pointer to get its mems_generation field.
+ * Call without callback_sem or task_lock() held. May be called
+ * with or without manage_sem held. Doesn't need task_lock to guard
+ * against another task changing a non-NULL cpuset pointer to NULL,
+ * as that is only done by a task on itself, and if the current task
+ * is here, it is not simultaneously in the exit code NULL'ing its
+ * cpuset pointer. This routine also might acquire callback_sem and
+ * current->mm->mmap_sem during call.
+ *
+ * Reading current->cpuset->mems_generation doesn't need task_lock
+ * to guard the current->cpuset derefence, because it is guarded
+ * from concurrent freeing of current->cpuset by attach_task(),
+ * using RCU.
+ *
+ * The rcu_dereference() is technically probably not needed,
+ * as I don't actually mind if I see a new cpuset pointer but
+ * an old value of mems_generation. However this really only
+ * matters on alpha systems using cpusets heavily. If I dropped
+ * that rcu_dereference(), it would save them a memory barrier.
+ * For all other arch's, rcu_dereference is a no-op anyway, and for
+ * alpha systems not using cpusets, another planned optimization,
+ * avoiding the rcu critical section for tasks in the root cpuset
+ * which is statically allocated, so can't vanish, will make this
+ * irrelevant. Better to use RCU as intended, than to engage in
+ * some cute trick to save a memory barrier that is impossible to
+ * test, for alpha systems using cpusets heavily, which might not
+ * even exist.
*
* This routine is needed to update the per-task mems_allowed data,
* within the tasks context, when it is trying to allocate memory
@@ -591,27 +641,31 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
* task has been modifying its cpuset.
*/
-static void refresh_mems(void)
+void cpuset_update_task_memory_state()
{
int my_cpusets_mem_gen;
+ struct task_struct *tsk = current;
+ struct cpuset *cs;
- task_lock(current);
- my_cpusets_mem_gen = current->cpuset->mems_generation;
- task_unlock(current);
-
- if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
- struct cpuset *cs;
- nodemask_t oldmem = current->mems_allowed;
+ if (tsk->cpuset == &top_cpuset) {
+ /* Don't need rcu for top_cpuset. It's never freed. */
+ my_cpusets_mem_gen = top_cpuset.mems_generation;
+ } else {
+ rcu_read_lock();
+ cs = rcu_dereference(tsk->cpuset);
+ my_cpusets_mem_gen = cs->mems_generation;
+ rcu_read_unlock();
+ }
+ if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
down(&callback_sem);
- task_lock(current);
- cs = current->cpuset;
- guarantee_online_mems(cs, &current->mems_allowed);
- current->cpuset_mems_generation = cs->mems_generation;
- task_unlock(current);
+ task_lock(tsk);
+ cs = tsk->cpuset; /* Maybe changed when task not locked */
+ guarantee_online_mems(cs, &tsk->mems_allowed);
+ tsk->cpuset_mems_generation = cs->mems_generation;
+ task_unlock(tsk);
up(&callback_sem);
- if (!nodes_equal(oldmem, current->mems_allowed))
- numa_policy_rebind(&oldmem, &current->mems_allowed);
+ mpol_rebind_task(tsk, &tsk->mems_allowed);
}
}
@@ -766,36 +820,150 @@ static int update_cpumask(struct cpuset *cs, char *buf)
}
/*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset. Needs to validate the request, update the
+ * cpusets mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies and if
+ * the cpuset is marked 'memory_migrate', migrate the tasks
+ * pages to the new memory.
+ *
* Call with manage_sem held. May take callback_sem during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
*/
static int update_nodemask(struct cpuset *cs, char *buf)
{
struct cpuset trialcs;
+ nodemask_t oldmem;
+ struct task_struct *g, *p;
+ struct mm_struct **mmarray;
+ int i, n, ntasks;
+ int migrate;
+ int fudge;
int retval;
trialcs = *cs;
retval = nodelist_parse(buf, trialcs.mems_allowed);
if (retval < 0)
- return retval;
+ goto done;
nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
- if (nodes_empty(trialcs.mems_allowed))
- return -ENOSPC;
+ oldmem = cs->mems_allowed;
+ if (nodes_equal(oldmem, trialcs.mems_allowed)) {
+ retval = 0; /* Too easy - nothing to do */
+ goto done;
+ }
+ if (nodes_empty(trialcs.mems_allowed)) {
+ retval = -ENOSPC;
+ goto done;
+ }
retval = validate_change(cs, &trialcs);
- if (retval == 0) {
- down(&callback_sem);
- cs->mems_allowed = trialcs.mems_allowed;
- atomic_inc(&cpuset_mems_generation);
- cs->mems_generation = atomic_read(&cpuset_mems_generation);
- up(&callback_sem);
+ if (retval < 0)
+ goto done;
+
+ down(&callback_sem);
+ cs->mems_allowed = trialcs.mems_allowed;
+ atomic_inc(&cpuset_mems_generation);
+ cs->mems_generation = atomic_read(&cpuset_mems_generation);
+ up(&callback_sem);
+
+ set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
+
+ fudge = 10; /* spare mmarray[] slots */
+ fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
+ retval = -ENOMEM;
+
+ /*
+ * Allocate mmarray[] to hold mm reference for each task
+ * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
+ * tasklist_lock. We could use GFP_ATOMIC, but with a
+ * few more lines of code, we can retry until we get a big
+ * enough mmarray[] w/o using GFP_ATOMIC.
+ */
+ while (1) {
+ ntasks = atomic_read(&cs->count); /* guess */
+ ntasks += fudge;
+ mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
+ if (!mmarray)
+ goto done;
+ write_lock_irq(&tasklist_lock); /* block fork */
+ if (atomic_read(&cs->count) <= ntasks)
+ break; /* got enough */
+ write_unlock_irq(&tasklist_lock); /* try again */
+ kfree(mmarray);
}
+
+ n = 0;
+
+ /* Load up mmarray[] with mm reference for each task in cpuset. */
+ do_each_thread(g, p) {
+ struct mm_struct *mm;
+
+ if (n >= ntasks) {
+ printk(KERN_WARNING
+ "Cpuset mempolicy rebind incomplete.\n");
+ continue;
+ }
+ if (p->cpuset != cs)
+ continue;
+ mm = get_task_mm(p);
+ if (!mm)
+ continue;
+ mmarray[n++] = mm;
+ } while_each_thread(g, p);
+ write_unlock_irq(&tasklist_lock);
+
+ /*
+ * Now that we've dropped the tasklist spinlock, we can
+ * rebind the vma mempolicies of each mm in mmarray[] to their
+ * new cpuset, and release that mm. The mpol_rebind_mm()
+ * call takes mmap_sem, which we couldn't take while holding
+ * tasklist_lock. Forks can happen again now - the mpol_copy()
+ * cpuset_being_rebound check will catch such forks, and rebind
+ * their vma mempolicies too. Because we still hold the global
+ * cpuset manage_sem, we know that no other rebind effort will
+ * be contending for the global variable cpuset_being_rebound.
+ * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+ * is idempotent. Also migrate pages in each mm to new nodes.
+ */
+ migrate = is_memory_migrate(cs);
+ for (i = 0; i < n; i++) {
+ struct mm_struct *mm = mmarray[i];
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate) {
+ do_migrate_pages(mm, &oldmem, &cs->mems_allowed,
+ MPOL_MF_MOVE_ALL);
+ }
+ mmput(mm);
+ }
+
+ /* We're done rebinding vma's to this cpusets new mems_allowed. */
+ kfree(mmarray);
+ set_cpuset_being_rebound(NULL);
+ retval = 0;
+done:
return retval;
}
/*
+ * Call with manage_sem held.
+ */
+
+static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
+{
+ if (simple_strtoul(buf, NULL, 10) != 0)
+ cpuset_memory_pressure_enabled = 1;
+ else
+ cpuset_memory_pressure_enabled = 0;
+ return 0;
+}
+
+/*
* update_flag - read a 0 or a 1 in a file and update associated flag
* bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- * CS_NOTIFY_ON_RELEASE)
+ * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
* cs: the cpuset to update
* buf: the buffer where we read the 0 or 1
*
@@ -834,6 +1002,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
}
/*
+ * Frequency meter - How fast is some event occuring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter. There are four routines:
+ * fmeter_init() - initialize a frequency meter.
+ * fmeter_markevent() - called each time the event happens.
+ * fmeter_getrate() - returns the recent rate of such events.
+ * fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR). The time unit
+ * is 1 second. Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ *
+ * With an FM_COEF of 933, and a time base of 1 second, the filter
+ * has a half-life of 10 seconds, meaning that if the events quit
+ * happening, then the rate returned from the fmeter_getrate()
+ * will be cut in half each 10 seconds, until it converges to zero.
+ *
+ * It is not worth doing a real infinitely recursive filter. If more
+ * than FM_MAXTICKS ticks have elapsed since the last filter event,
+ * just compute FM_MAXTICKS ticks worth, by which point the level
+ * will be stable.
+ *
+ * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
+ * arithmetic overflow in the fmeter_update() routine.
+ *
+ * Given the simple 32 bit integer arithmetic used, this meter works
+ * best for reporting rates between one per millisecond (msec) and
+ * one per 32 (approx) seconds. At constant rates faster than one