From 0ab4dc92278a0f3816e486d6350c6652a72e06c8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:02 -0700 Subject: usermodehelper: split setup from execution Rather than having hundreds of variations of call_usermodehelper for various pieces of usermode state which could be set up, split the info allocation and initialization from the actual process execution. This means the general pattern becomes: info = call_usermodehelper_setup(path, argv, envp); /* basic state */ call_usermodehelper_(info, stuff...); /* extra state */ call_usermodehelper_exec(info, wait); /* run process and free info */ This patch introduces wrappers for all the existing calling styles for call_usermodehelper_*, but folds their implementations into one. Signed-off-by: Jeremy Fitzhardinge Cc: Andi Kleen Cc: Rusty Russell Cc: David Howells Cc: Bj?rn Steinbrink Cc: Randy Dunlap --- kernel/kmod.c | 191 +++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 135 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 4d32eb07717..d2dce71115d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -122,6 +122,7 @@ struct subprocess_info { int wait; int retval; struct file *stdin; + void (*cleanup)(char **argv, char **envp); }; /* @@ -180,6 +181,14 @@ static int ____call_usermodehelper(void *data) do_exit(0); } +void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info->argv, info->envp); + kfree(info); +} +EXPORT_SYMBOL(call_usermodehelper_freeinfo); + /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -217,7 +226,7 @@ static int wait_for_helper(void *data) } if (sub_info->wait < 0) - kfree(sub_info); + call_usermodehelper_freeinfo(sub_info); else complete(sub_info->complete); return 0; @@ -252,11 +261,94 @@ static void __call_usermodehelper(struct work_struct *work) } /** - * call_usermodehelper_keys - start a usermode application - * @path: pathname for the application - * @argv: null-terminated argument list - * @envp: null-terminated environment list - * @session_keyring: session keyring for process (NULL for an empty keyring) + * call_usermodehelper_setup - prepare to call a usermode helper + * @path - path to usermode executable + * @argv - arg vector for process + * @envp - environment for process + * + * Returns either NULL on allocation failure, or a subprocess_info + * structure. This should be passed to call_usermodehelper_exec to + * exec the process and free the structure. 
+ */ +struct subprocess_info *call_usermodehelper_setup(char *path, + char **argv, char **envp) +{ + struct subprocess_info *sub_info; + sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); + if (!sub_info) + goto out; + + INIT_WORK(&sub_info->work, __call_usermodehelper); + sub_info->path = path; + sub_info->argv = argv; + sub_info->envp = envp; + + out: + return sub_info; +} +EXPORT_SYMBOL(call_usermodehelper_setup); + +/** + * call_usermodehelper_setkeys - set the session keys for usermode helper + * @info: a subprocess_info returned by call_usermodehelper_setup + * @session_keyring: the session keyring for the process + */ +void call_usermodehelper_setkeys(struct subprocess_info *info, + struct key *session_keyring) +{ + info->ring = session_keyring; +} +EXPORT_SYMBOL(call_usermodehelper_setkeys); + +/** + * call_usermodehelper_setcleanup - set a cleanup function + * @info: a subprocess_info returned by call_usermodehelper_setup + * @cleanup: a cleanup function + * + * The cleanup function is just befor ethe subprocess_info is about to + * be freed. This can be used for freeing the argv and envp. The + * Function must be runnable in either a process context or the + * context in which call_usermodehelper_exec is called. + */ +void call_usermodehelper_setcleanup(struct subprocess_info *info, + void (*cleanup)(char **argv, char **envp)) +{ + info->cleanup = cleanup; +} +EXPORT_SYMBOL(call_usermodehelper_setcleanup); + +/** + * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin + * @sub_info: a subprocess_info returned by call_usermodehelper_setup + * @filp: set to the write-end of a pipe + * + * This constructs a pipe, and sets the read end to be the stdin of the + * subprocess, and returns the write-end in *@filp. + */ +int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, + struct file **filp) +{ + struct file *f; + + f = create_write_pipe(); + if (IS_ERR(f)) + return PTR_ERR(f); + *filp = f; + + f = create_read_pipe(f); + if (IS_ERR(f)) { + free_write_pipe(*filp); + return PTR_ERR(f); + } + sub_info->stdin = f; + + return 0; +} +EXPORT_SYMBOL(call_usermodehelper_stdinpipe); + +/** + * call_usermodehelper_exec - start a usermode application + * @sub_info: information about the subprocessa * @wait: wait for the application to finish and return status. * when -1 don't wait at all, but you get no useful error back when * the program couldn't be exec'ed. This makes it safe to call @@ -265,33 +357,24 @@ static void __call_usermodehelper(struct work_struct *work) * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). - * - * Must be called from process context. Returns a negative error code - * if program was not execed successfully, or 0. 
*/ -int call_usermodehelper_keys(char *path, char **argv, char **envp, - struct key *session_keyring, int wait) +int call_usermodehelper_exec(struct subprocess_info *sub_info, + int wait) { DECLARE_COMPLETION_ONSTACK(done); - struct subprocess_info *sub_info; int retval; - if (!khelper_wq) - return -EBUSY; - - if (path[0] == '\0') - return 0; + if (sub_info->path[0] == '\0') { + retval = 0; + goto out; + } - sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); - if (!sub_info) - return -ENOMEM; + if (!khelper_wq) { + retval = -EBUSY; + goto out; + } - INIT_WORK(&sub_info->work, __call_usermodehelper); sub_info->complete = &done; - sub_info->path = path; - sub_info->argv = argv; - sub_info->envp = envp; - sub_info->ring = session_keyring; sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); @@ -299,47 +382,43 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, return 0; wait_for_completion(&done); retval = sub_info->retval; - kfree(sub_info); + + out: + call_usermodehelper_freeinfo(sub_info); return retval; } -EXPORT_SYMBOL(call_usermodehelper_keys); +EXPORT_SYMBOL(call_usermodehelper_exec); +/** + * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @filp: set to the write-end of a pipe + * + * This is a simple wrapper which executes a usermode-helper function + * with a pipe as stdin. It is implemented entirely in terms of + * lower-level call_usermodehelper_* functions. + */ int call_usermodehelper_pipe(char *path, char **argv, char **envp, struct file **filp) { - DECLARE_COMPLETION(done); - struct subprocess_info sub_info = { - .work = __WORK_INITIALIZER(sub_info.work, - __call_usermodehelper), - .complete = &done, - .path = path, - .argv = argv, - .envp = envp, - .retval = 0, - }; - struct file *f; - - if (!khelper_wq) - return -EBUSY; + struct subprocess_info *sub_info; + int ret; - if (path[0] == '\0') - return 0; + sub_info = call_usermodehelper_setup(path, argv, envp); + if (sub_info == NULL) + return -ENOMEM; - f = create_write_pipe(); - if (IS_ERR(f)) - return PTR_ERR(f); - *filp = f; + ret = call_usermodehelper_stdinpipe(sub_info, filp); + if (ret < 0) + goto out; - f = create_read_pipe(f); - if (IS_ERR(f)) { - free_write_pipe(*filp); - return PTR_ERR(f); - } - sub_info.stdin = f; + return call_usermodehelper_exec(sub_info, 1); - queue_work(khelper_wq, &sub_info.work); - wait_for_completion(&done); - return sub_info.retval; + out: + call_usermodehelper_freeinfo(sub_info); + return ret; } EXPORT_SYMBOL(call_usermodehelper_pipe); -- cgit v1.2.3-70-g09d2 From 10a0a8d4e3f6bf2d077f94344441909abe670f5a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:02 -0700 Subject: Add common orderly_poweroff() Various pieces of code around the kernel want to be able to trigger an orderly poweroff. This pulls them together into a single implementation. By default the poweroff command is /sbin/poweroff, but it can be set via sysctl: kernel/poweroff_cmd. This is split at whitespace, so it can include command-line arguments. This patch replaces four other instances of invoking either "poweroff" or "shutdown -h now": two sbus drivers, and acpi thermal management. sparc64 has its own "powerd"; still need to determine whether it should be replaced by orderly_poweroff(). 
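For illustration only, a minimal sketch of the calling pattern this introduces, in the
style of the sbus drivers converted below (the function name is made up; the force
argument and the error handling mirror the conversions in this patch, and the caller
needs <linux/reboot.h>):

    static void example_critical_shutdown(void)
    {
            /*
             * Ask userspace to run the configured poweroff_cmd
             * (default /sbin/poweroff).  With force == true, failing
             * to even start the helper falls back to emergency_sync()
             * and an immediate kernel_power_off().
             */
            if (orderly_poweroff(true) < 0)
                    printk(KERN_CRIT "example: orderly shutdown failed\n");
    }
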
Signed-off-by: Jeremy Fitzhardinge Acked-by: Len Brown Signed-off-by: Chris Wright Cc: Andrew Morton Cc: Randy Dunlap Cc: Andi Kleen Cc: Al Viro Cc: Arnd Bergmann Cc: David S. Miller --- drivers/acpi/thermal.c | 24 ++--------------- drivers/sbus/char/bbc_envctrl.c | 5 ++-- drivers/sbus/char/envctrl.c | 7 ++--- include/linux/reboot.h | 5 ++++ kernel/sys.c | 58 +++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 10 +++++++ 6 files changed, 79 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 88a6fc7fd27..58f1338981b 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -59,7 +60,6 @@ #define ACPI_THERMAL_NOTIFY_CRITICAL 0xF0 #define ACPI_THERMAL_NOTIFY_HOT 0xF1 #define ACPI_THERMAL_MODE_ACTIVE 0x00 -#define ACPI_THERMAL_PATH_POWEROFF "/sbin/poweroff" #define ACPI_THERMAL_MAX_ACTIVE 10 #define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65 @@ -419,26 +419,6 @@ static int acpi_thermal_get_devices(struct acpi_thermal *tz) return 0; } -static int acpi_thermal_call_usermode(char *path) -{ - char *argv[2] = { NULL, NULL }; - char *envp[3] = { NULL, NULL, NULL }; - - - if (!path) - return -EINVAL; - - argv[0] = path; - - /* minimal command environment */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - - call_usermodehelper(argv[0], argv, envp, 0); - - return 0; -} - static int acpi_thermal_critical(struct acpi_thermal *tz) { if (!tz || !tz->trips.critical.flags.valid) @@ -456,7 +436,7 @@ static int acpi_thermal_critical(struct acpi_thermal *tz) acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL, tz->trips.critical.flags.enabled); - acpi_thermal_call_usermode(ACPI_THERMAL_PATH_POWEROFF); + orderly_poweroff(true); return 0; } diff --git a/drivers/sbus/char/bbc_envctrl.c b/drivers/sbus/char/bbc_envctrl.c index a54e4140683..e821a155b65 100644 --- a/drivers/sbus/char/bbc_envctrl.c +++ b/drivers/sbus/char/bbc_envctrl.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -170,8 +171,6 @@ static void get_current_temps(struct bbc_cpu_temperature *tp) static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp) { static int shutting_down = 0; - static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; - char *argv[] = { "/sbin/shutdown", "-h", "now", NULL }; char *type = "???"; s8 val = -1; @@ -195,7 +194,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp) printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n"); shutting_down = 1; - if (call_usermodehelper("/sbin/shutdown", argv, envp, 0) < 0) + if (orderly_poweroff(true) < 0) printk(KERN_CRIT "envctrl: shutdown execution failed\n"); } diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c index 8328acab47f..dadabef116b 100644 --- a/drivers/sbus/char/envctrl.c +++ b/drivers/sbus/char/envctrl.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -966,10 +967,6 @@ static struct i2c_child_t *envctrl_get_i2c_child(unsigned char mon_type) static void envctrl_do_shutdown(void) { static int inprog = 0; - static char *envp[] = { - "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; - char *argv[] = { - "/sbin/shutdown", "-h", "now", NULL }; int ret; if (inprog != 0) @@ -977,7 +974,7 @@ static void envctrl_do_shutdown(void) inprog = 1; printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n"); - ret = 
call_usermodehelper("/sbin/shutdown", argv, envp, 0); + ret = orderly_poweroff(true); if (ret < 0) { printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n"); inprog = 0; /* unlikely to succeed, but we could try again */ diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 1dd1c707311..85ea63f462a 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -67,6 +67,11 @@ extern void kernel_power_off(void); void ctrl_alt_del(void); +#define POWEROFF_CMD_PATH_LEN 256 +extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN]; + +extern int orderly_poweroff(bool force); + /* * Emergency restart, callable from an interrupt handler. */ diff --git a/kernel/sys.c b/kernel/sys.c index 4d141ae3e80..aeded9ad66c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2286,3 +2286,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, } return err ? -EFAULT : 0; } + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static void argv_cleanup(char **argv, char **envp) +{ + argv_free(argv); +} + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. + */ +int orderly_poweroff(bool force) +{ + int argc; + char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); + static char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + int ret = -ENOMEM; + struct subprocess_info *info; + + if (argv == NULL) { + printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", + __func__, poweroff_cmd); + goto out; + } + + info = call_usermodehelper_setup(argv[0], argv, envp); + if (info == NULL) { + argv_free(argv); + goto out; + } + + call_usermodehelper_setcleanup(info, argv_cleanup); + + ret = call_usermodehelper_exec(info, -1); + + out: + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + + /* I guess this should try to kick off some daemon to + sync and poweroff asap. Or not even bother syncing + if we're doing an emergency shutdown? */ + emergency_sync(); + kernel_power_off(); + } + + return ret; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7063ebc6db0..44a1d699aad 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -705,6 +706,15 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, { .ctl_name = 0 } }; -- cgit v1.2.3-70-g09d2 From 86313c488a6848b7ec2ba04e74f25f79dd32a0b7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: usermodehelper: Tidy up waiting Rather than using a tri-state integer for the wait flag in call_usermodehelper_exec, define a proper enum, and use that. I've preserved the integer values so that any callers I've missed should still work OK. 
Signed-off-by: Jeremy Fitzhardinge Cc: James Bottomley Cc: Randy Dunlap Cc: Christoph Hellwig Cc: Andi Kleen Cc: Paul Mackerras Cc: Johannes Berg Cc: Ralf Baechle Cc: Bjorn Helgaas Cc: Joel Becker Cc: Tony Luck Cc: Kay Sievers Cc: Srivatsa Vaddagiri Cc: Oleg Nesterov Cc: David Howells --- arch/i386/mach-voyager/voyager_thread.c | 2 +- arch/x86_64/kernel/mce.c | 2 +- drivers/macintosh/therm_pm72.c | 3 ++- drivers/macintosh/windfarm_core.c | 3 ++- drivers/net/hamradio/baycom_epp.c | 2 +- drivers/pnp/pnpbios/core.c | 2 +- fs/ocfs2/heartbeat.c | 2 +- include/linux/kmod.h | 12 +++++++++--- kernel/cpuset.c | 2 +- kernel/kmod.c | 27 ++++++++++++++++----------- kernel/sys.c | 2 +- lib/kobject_uevent.c | 2 +- net/bridge/br_stp_if.c | 2 +- security/keys/request_key.c | 3 ++- 14 files changed, 40 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c index b4b24e0e45e..f9d59533815 100644 --- a/arch/i386/mach-voyager/voyager_thread.c +++ b/arch/i386/mach-voyager/voyager_thread.c @@ -52,7 +52,7 @@ execute(const char *string) NULL, }; - if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) { + if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string, ret); } diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index aa1d1599179..f3fb8174559 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -174,7 +174,7 @@ static void do_mce_trigger(void) if (events != atomic_read(&mce_logged) && trigger[0]) { /* Small race window, but should be harmless. */ atomic_set(&mce_logged, events); - call_usermodehelper(trigger, trigger_argv, NULL, -1); + call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); } } diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c index dbb22403979..3d90fc00209 100644 --- a/drivers/macintosh/therm_pm72.c +++ b/drivers/macintosh/therm_pm72.c @@ -1770,7 +1770,8 @@ static int call_critical_overtemp(void) "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; - return call_usermodehelper(critical_overtemp_path, argv, envp, 0); + return call_usermodehelper(critical_overtemp_path, + argv, envp, UMH_WAIT_EXEC); } diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c index e18d265d5d3..516d943227e 100644 --- a/drivers/macintosh/windfarm_core.c +++ b/drivers/macintosh/windfarm_core.c @@ -80,7 +80,8 @@ int wf_critical_overtemp(void) "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; - return call_usermodehelper(critical_overtemp_path, argv, envp, 0); + return call_usermodehelper(critical_overtemp_path, + argv, envp, UMH_WAIT_EXEC); } EXPORT_SYMBOL_GPL(wf_critical_overtemp); diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 84aa2117c0e..355c6cf3d11 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -320,7 +320,7 @@ static int eppconfig(struct baycom_state *bc) sprintf(portarg, "%ld", bc->pdev->port->base); printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg); - return call_usermodehelper(eppconfig_path, argv, envp, 1); + return call_usermodehelper(eppconfig_path, argv, envp, UMH_WAIT_PROC); } /* ---------------------------------------------------------------------- */ diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index 03baf1c64a2..ed112ee1601 100644 --- a/drivers/pnp/pnpbios/core.c +++ 
b/drivers/pnp/pnpbios/core.c @@ -147,7 +147,7 @@ static int pnp_dock_event(int dock, struct pnp_docking_station_info *info) info->location_id, info->serial, info->capabilities); envp[i] = NULL; - value = call_usermodehelper (argv [0], argv, envp, 0); + value = call_usermodehelper (argv [0], argv, envp, UMH_WAIT_EXEC); kfree (buf); kfree (envp); return 0; diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 352eb4a13f9..c4c36171240 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -209,7 +209,7 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb) envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; - ret = call_usermodehelper(argv[0], argv, envp, 1); + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (ret < 0) mlog_errno(ret); } diff --git a/include/linux/kmod.h b/include/linux/kmod.h index c4cbe59d9c6..5dc13848891 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -51,15 +51,21 @@ int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, void call_usermodehelper_setcleanup(struct subprocess_info *info, void (*cleanup)(char **argv, char **envp)); +enum umh_wait { + UMH_NO_WAIT = -1, /* don't wait at all */ + UMH_WAIT_EXEC = 0, /* wait for the exec, but not the process */ + UMH_WAIT_PROC = 1, /* wait for the process to complete */ +}; + /* Actually execute the sub-process */ -int call_usermodehelper_exec(struct subprocess_info *info, int wait); +int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait); /* Free the subprocess_info. This is only needed if you're not going to call call_usermodehelper_exec */ void call_usermodehelper_freeinfo(struct subprocess_info *info); static inline int -call_usermodehelper(char *path, char **argv, char **envp, int wait) +call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait) { struct subprocess_info *info; @@ -71,7 +77,7 @@ call_usermodehelper(char *path, char **argv, char **envp, int wait) static inline int call_usermodehelper_keys(char *path, char **argv, char **envp, - struct key *session_keyring, int wait) + struct key *session_keyring, enum umh_wait wait) { struct subprocess_info *info; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b4796d85014..57e6448b171 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; - call_usermodehelper(argv[0], argv, envp, 0); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); kfree(pathbuf); } diff --git a/kernel/kmod.c b/kernel/kmod.c index d2dce71115d..78d365c524e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -119,7 +119,7 @@ struct subprocess_info { char **argv; char **envp; struct key *ring; - int wait; + enum umh_wait wait; int retval; struct file *stdin; void (*cleanup)(char **argv, char **envp); @@ -225,7 +225,7 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - if (sub_info->wait < 0) + if (sub_info->wait == UMH_NO_WAIT) call_usermodehelper_freeinfo(sub_info); else complete(sub_info->complete); @@ -238,26 +238,31 @@ static void __call_usermodehelper(struct work_struct *work) struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); pid_t pid; - int wait = sub_info->wait; + enum umh_wait wait = sub_info->wait; /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. 
*/ - if (wait) + if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); else pid = kernel_thread(____call_usermodehelper, sub_info, CLONE_VFORK | SIGCHLD); - if (wait < 0) - return; + switch (wait) { + case UMH_NO_WAIT: + break; - if (pid < 0) { + case UMH_WAIT_PROC: + if (pid > 0) + break; sub_info->retval = pid; + /* FALLTHROUGH */ + + case UMH_WAIT_EXEC: complete(sub_info->complete); - } else if (!wait) - complete(sub_info->complete); + } } /** @@ -359,7 +364,7 @@ EXPORT_SYMBOL(call_usermodehelper_stdinpipe); * (ie. it runs with full root capabilities). */ int call_usermodehelper_exec(struct subprocess_info *sub_info, - int wait) + enum umh_wait wait) { DECLARE_COMPLETION_ONSTACK(done); int retval; @@ -378,7 +383,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); - if (wait < 0) /* task has freed sub_info */ + if (wait == UMH_NO_WAIT) /* task has freed sub_info */ return 0; wait_for_completion(&done); retval = sub_info->retval; diff --git a/kernel/sys.c b/kernel/sys.c index aeded9ad66c..18987c7f6ad 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2327,7 +2327,7 @@ int orderly_poweroff(bool force) call_usermodehelper_setcleanup(info, argv_cleanup); - ret = call_usermodehelper_exec(info, -1); + ret = call_usermodehelper_exec(info, UMH_NO_WAIT); out: if (ret && force) { diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 12e311dc664..bd5ecbbafab 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -208,7 +208,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, argv [0] = uevent_helper; argv [1] = (char *)subsystem; argv [2] = NULL; - call_usermodehelper (argv[0], argv, envp, 0); + call_usermodehelper (argv[0], argv, envp, UMH_WAIT_EXEC); } exit: diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index a786e786320..1ea2f86f768 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -125,7 +125,7 @@ static void br_stp_start(struct net_bridge *br) char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL }; char *envp[] = { NULL }; - r = call_usermodehelper(BR_STP_PROG, argv, envp, 1); + r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); if (r == 0) { br->stp_enabled = BR_USER_STP; printk(KERN_INFO "%s: userspace STP started\n", br->dev->name); diff --git a/security/keys/request_key.c b/security/keys/request_key.c index f573ac189a0..557500110a1 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -108,7 +108,8 @@ static int call_sbin_request_key(struct key *key, argv[i] = NULL; /* do it */ - ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, 1); + ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, + UMH_WAIT_PROC); error_link: key_put(keyring); -- cgit v1.2.3-70-g09d2 From 471d0558045fe35f8c5f291c1ee63815eb9c2dcd Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 12 Jul 2007 16:55:07 -0400 Subject: PM: Remove deprecated sysfs files This patch (as932) removes the deprecated sysfs .../power/state attribute files. 
Signed-off-by: Alan Stern Acked-by: Pavel Machek Signed-off-by: Greg Kroah-Hartman --- Documentation/feature-removal-schedule.txt | 3 +- drivers/base/power/sysfs.c | 66 ------------------------------ kernel/power/Kconfig | 12 ------ 3 files changed, 1 insertion(+), 80 deletions(-) (limited to 'kernel') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index d05e6243b4d..9cf9d834628 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -26,8 +26,7 @@ Who: Hans Verkuil and --------------------------- -What: /sys/devices/.../power/state - dev->power.power_state +What: dev->power.power_state dpm_runtime_{suspend,resume)() When: July 2007 Why: Broken design for runtime control over driver power states, confusing diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index 2d47517dbe3..f2ed179cd69 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -7,69 +7,6 @@ #include "power.h" -#ifdef CONFIG_PM_SYSFS_DEPRECATED - -/** - * state - Control current power state of device - * - * show() returns the current power state of the device. '0' indicates - * the device is on. Other values (2) indicate the device is in some low - * power state. - * - * store() sets the current power state, which is an integer valued - * 0, 2, or 3. Devices with bus.suspend_late(), or bus.resume_early() - * methods fail this operation; those methods couldn't be called. - * Otherwise, - * - * - If the recorded dev->power.power_state.event matches the - * target value, nothing is done. - * - If the recorded event code is nonzero, the device is reactivated - * by calling bus.resume() and/or class.resume(). - * - If the target value is nonzero, the device is suspended by - * calling class.suspend() and/or bus.suspend() with event code - * PM_EVENT_SUSPEND. - * - * This mechanism is DEPRECATED and should only be used for testing. - */ - -static ssize_t state_show(struct device * dev, struct device_attribute *attr, char * buf) -{ - if (dev->power.power_state.event) - return sprintf(buf, "2\n"); - else - return sprintf(buf, "0\n"); -} - -static ssize_t state_store(struct device * dev, struct device_attribute *attr, const char * buf, size_t n) -{ - pm_message_t state; - int error = -EINVAL; - - /* disallow incomplete suspend sequences */ - if (dev->bus && (dev->bus->suspend_late || dev->bus->resume_early)) - return error; - - state.event = PM_EVENT_SUSPEND; - /* Older apps expected to write "3" here - confused with PCI D3 */ - if ((n == 1) && !strcmp(buf, "3")) - error = dpm_runtime_suspend(dev, state); - - if ((n == 1) && !strcmp(buf, "2")) - error = dpm_runtime_suspend(dev, state); - - if ((n == 1) && !strcmp(buf, "0")) { - dpm_runtime_resume(dev); - error = 0; - } - - return error ? error : n; -} - -static DEVICE_ATTR(state, 0644, state_show, state_store); - - -#endif /* CONFIG_PM_SYSFS_DEPRECATED */ - /* * wakeup - Report/change current wakeup option for device * @@ -143,9 +80,6 @@ static DEVICE_ATTR(wakeup, 0644, wake_show, wake_store); static struct attribute * power_attrs[] = { -#ifdef CONFIG_PM_SYSFS_DEPRECATED - &dev_attr_state.attr, -#endif &dev_attr_wakeup.attr, NULL, }; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 495b7d4dd33..73328476761 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -65,18 +65,6 @@ config PM_TRACE CAUTION: this option will cause your machine's real-time clock to be set to an invalid time after a resume. 
-config PM_SYSFS_DEPRECATED - bool "Driver model /sys/devices/.../power/state files (DEPRECATED)" - depends on PM && SYSFS - default n - help - The driver model started out with a sysfs file intended to provide - a userspace hook for device power management. This feature has never - worked very well, except for limited testing purposes, and so it will - be removed. It's not clear that a generic mechanism could really - handle the wide variability of device power states; any replacements - are likely to be bus or driver specific. - config SOFTWARE_SUSPEND bool "Software Suspend (Hibernation)" depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) -- cgit v1.2.3-70-g09d2 From 83c54070ee1a2d05c89793884bea1a03f2851ed4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 19 Jul 2007 01:47:05 -0700 Subject: mm: fault feedback #2 This patch completes Linus's wish that the fault return codes be made into bit flags, which I agree makes everything nicer. This requires requires all handle_mm_fault callers to be modified (possibly the modifications should go further and do things like fault accounting in handle_mm_fault -- however that would be for another patch). [akpm@linux-foundation.org: fix alpha build] [akpm@linux-foundation.org: fix s390 build] [akpm@linux-foundation.org: fix sparc build] [akpm@linux-foundation.org: fix sparc64 build] [akpm@linux-foundation.org: fix ia64 build] Signed-off-by: Nick Piggin Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Russell King Cc: Ian Molton Cc: Bryan Wu Cc: Mikael Starvik Cc: David Howells Cc: Yoshinori Sato Cc: "Luck, Tony" Cc: Hirokazu Takata Cc: Geert Uytterhoeven Cc: Roman Zippel Cc: Greg Ungerer Cc: Matthew Wilcox Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Paul Mundt Cc: Kazumoto Kojima Cc: Richard Curnow Cc: William Lee Irwin III Cc: "David S. 
Miller" Cc: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Miles Bader Cc: Chris Zankel Acked-by: Kyle McMartin Acked-by: Haavard Skinnemoen Acked-by: Ralf Baechle Acked-by: Andi Kleen Signed-off-by: Andrew Morton [ Still apparently needs some ARM and PPC loving - Linus ] Signed-off-by: Linus Torvalds --- arch/alpha/mm/fault.c | 22 ++++----- arch/arm/mm/fault.c | 36 +++++++------- arch/arm26/mm/fault.c | 30 ++++++------ arch/avr32/mm/fault.c | 23 +++++---- arch/cris/mm/fault.c | 23 ++++----- arch/frv/mm/fault.c | 23 ++++----- arch/i386/mm/fault.c | 23 +++++---- arch/ia64/mm/fault.c | 26 +++++----- arch/m32r/mm/fault.c | 23 +++++---- arch/m68k/mm/fault.c | 21 ++++---- arch/mips/mm/fault.c | 23 +++++---- arch/parisc/mm/fault.c | 23 ++++----- arch/powerpc/mm/fault.c | 26 +++++----- arch/powerpc/platforms/cell/spufs/fault.c | 28 +++++------ arch/ppc/mm/fault.c | 23 +++++---- arch/s390/lib/uaccess_pt.c | 23 +++++---- arch/s390/mm/fault.c | 30 ++++++------ arch/sh/mm/fault.c | 23 +++++---- arch/sh64/mm/fault.c | 24 +++++----- arch/sparc/mm/fault.c | 22 ++++----- arch/sparc64/mm/fault.c | 24 +++++----- arch/um/kernel/trap.c | 29 +++++------ arch/x86_64/mm/fault.c | 25 +++++----- arch/xtensa/mm/fault.c | 23 +++++---- fs/gfs2/ops_vm.c | 11 +++-- include/linux/mm.h | 58 +++++----------------- kernel/futex.c | 21 ++++---- mm/filemap.c | 6 +-- mm/filemap_xip.c | 2 +- mm/hugetlb.c | 10 ++-- mm/memory.c | 80 +++++++++++++++---------------- mm/shmem.c | 8 ++-- 32 files changed, 373 insertions(+), 419 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index f5862792a16..a0e18da594d 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -148,21 +148,17 @@ do_page_fault(unsigned long address, unsigned long mmcsr, the fault. */ fault = handle_mm_fault(mm, vma, address, cause > 0); up_read(&mm->mmap_sem); - - switch (fault) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: - goto out_of_memory; - default: + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; BUG(); } + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; return; /* Something tried to access memory that isn't in our memory map. 
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 75d491448e4..c04124a095c 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -183,20 +183,20 @@ good_area: */ survive: fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, fsr & (1 << 11)); - - /* - * Handle the "normal" cases first - successful and sigbus - */ - switch (fault) { - case VM_FAULT_MAJOR: + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + return fault; + BUG(); + } + if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; - return fault; - case VM_FAULT_MINOR: + else tsk->min_flt++; - case VM_FAULT_SIGBUS: - return fault; - } + return fault; +out_of_memory: if (!is_init(tsk)) goto out; @@ -249,7 +249,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) /* * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR */ - if (fault >= VM_FAULT_MINOR) + if (likely(!(fault & VM_FAULT_ERROR))) return 0; /* @@ -259,8 +259,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) if (!user_mode(regs)) goto no_context; - switch (fault) { - case VM_FAULT_OOM: + if (fault & VM_FAULT_OOM) { /* * We ran out of memory, or some other thing * happened to us that made us unable to handle @@ -269,17 +268,15 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) printk("VM: killing process %s\n", tsk->comm); do_exit(SIGKILL); return 0; - - case VM_FAULT_SIGBUS: + } + if (fault & VM_FAULT_SIGBUS) { /* * We had some memory, but were unable to * successfully fix up this page fault. */ sig = SIGBUS; code = BUS_ADRERR; - break; - - default: + } else { /* * Something tried to access memory that * isn't in our memory map.. @@ -287,7 +284,6 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) sig = SIGSEGV; code = fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR; - break; } __do_user_fault(tsk, addr, fsr, sig, code, regs); diff --git a/arch/arm26/mm/fault.c b/arch/arm26/mm/fault.c index 93c0cee0fb5..dec638a0c8d 100644 --- a/arch/arm26/mm/fault.c +++ b/arch/arm26/mm/fault.c @@ -170,20 +170,20 @@ good_area: */ survive: fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, DO_COW(fsr)); - - /* - * Handle the "normal" cases first - successful and sigbus - */ - switch (fault) { - case VM_FAULT_MAJOR: + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + return fault; + BUG(); + } + if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; - return fault; - case VM_FAULT_MINOR: + else tsk->min_flt++; - case VM_FAULT_SIGBUS: - return fault; - } + return fault; +out_of_memory: fault = -3; /* out of memory */ if (!is_init(tsk)) goto out; @@ -225,13 +225,11 @@ int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) /* * Handle the "normal" case first */ - switch (fault) { - case VM_FAULT_MINOR: - case VM_FAULT_MAJOR: + if (likely(!(fault & VM_FAULT_ERROR))) return 0; - case VM_FAULT_SIGBUS: + if (fault & VM_FAULT_SIGBUS) goto do_sigbus; - } + /* else VM_FAULT_OOM */ /* * If we are in kernel mode at this point, we diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index 4b2495285d9..ae2d2c593b2 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -64,6 +64,7 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs) int writeaccess; long signr; int code; + int fault; if (notify_page_fault(regs, ecr)) return; @@ -132,20 +133,18 @@ good_area: * fault. 
*/ survive: - switch (handle_mm_fault(mm, vma, address, writeaccess)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: - goto out_of_memory; - default: + fault = handle_mm_fault(mm, vma, address, writeaccess); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; BUG(); } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; up_read(&mm->mmap_sem); return; diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index c73e91f1299..8672ab7d797 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -179,6 +179,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, struct mm_struct *mm; struct vm_area_struct * vma; siginfo_t info; + int fault; D(printk("Page fault for %lX on %X at %lX, prot %d write %d\n", address, smp_processor_id(), instruction_pointer(regs), @@ -283,18 +284,18 @@ do_page_fault(unsigned long address, struct pt_regs *regs, * the fault. */ - switch (handle_mm_fault(mm, vma, address, writeaccess & 1)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - default: - goto out_of_memory; + fault = handle_mm_fault(mm, vma, address, writeaccess & 1); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; up_read(&mm->mmap_sem); return; diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 3f12296c368..6798fa0257b 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -40,6 +40,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear pud_t *pue; pte_t *pte; int write; + int fault; #if 0 const char *atxc[16] = { @@ -162,18 +163,18 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * make sure we exit gracefully rather than endlessly redo * the fault. */ - switch (handle_mm_fault(mm, vma, ear0, write)) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - default: - goto out_of_memory; + fault = handle_mm_fault(mm, vma, ear0, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); } + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; up_read(&mm->mmap_sem); return; diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index 1ecb3e43b52..e92a1012493 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -303,6 +303,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, struct vm_area_struct * vma; unsigned long address; int write, si_code; + int fault; /* get the address */ address = read_cr2(); @@ -422,20 +423,18 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - switch (handle_mm_fault(mm, vma, address, write)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) goto out_of_memory; - default: - BUG(); + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; /* * Did it hit the DOS screen memory VA from vm86 mode? diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index b87f785c241..73ccb6010c0 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -80,6 +80,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re struct mm_struct *mm = current->mm; struct siginfo si; unsigned long mask; + int fault; /* mmap_sem is performance critical.... */ prefetchw(&mm->mmap_sem); @@ -147,26 +148,25 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re * sure we exit gracefully rather than endlessly redo the * fault. */ - switch (handle_mm_fault(mm, vma, address, (mask & VM_WRITE) != 0)) { - case VM_FAULT_MINOR: - ++current->min_flt; - break; - case VM_FAULT_MAJOR: - ++current->maj_flt; - break; - case VM_FAULT_SIGBUS: + fault = handle_mm_fault(mm, vma, address, (mask & VM_WRITE) != 0); + if (unlikely(fault & VM_FAULT_ERROR)) { /* * We ran out of memory, or some other thing happened * to us that made us unable to handle the page fault * gracefully. */ - signal = SIGBUS; - goto bad_area; - case VM_FAULT_OOM: - goto out_of_memory; - default: + if (fault & VM_FAULT_OOM) { + goto out_of_memory; + } else if (fault & VM_FAULT_SIGBUS) { + signal = SIGBUS; + goto bad_area; + } BUG(); } + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; up_read(&mm->mmap_sem); return; diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index f3935ba2494..676a1c443d2 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -80,6 +80,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, struct vm_area_struct * vma; unsigned long page, addr; int write; + int fault; siginfo_t info; /* @@ -195,20 +196,18 @@ survive: */ addr = (address & PAGE_MASK); set_thread_fault_code(error_code); - switch (handle_mm_fault(mm, vma, addr, write)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: + fault = handle_mm_fault(mm, vma, addr, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) goto out_of_memory; - default: - BUG(); + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; set_thread_fault_code(0); up_read(&mm->mmap_sem); return; diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 2adbeb16e1b..578b48f47b9 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -159,18 +159,17 @@ good_area: #ifdef DEBUG printk("handle_mm_fault returns %d\n",fault); #endif - switch (fault) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto bus_err; - default: - goto out_of_memory; + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto bus_err; + BUG(); } + 
if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; up_read(&mm->mmap_sem); return 0; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 7ebea331edb..521771b373d 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -39,6 +39,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, struct mm_struct *mm = tsk->mm; const int field = sizeof(unsigned long) * 2; siginfo_t info; + int fault; #if 0 printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(), @@ -102,20 +103,18 @@ survive: * make sure we exit gracefully rather than endlessly redo * the fault. */ - switch (handle_mm_fault(mm, vma, address, write)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: - goto out_of_memory; - default: + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; BUG(); } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; up_read(&mm->mmap_sem); return; diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index f6f67554c62..7899ab87785 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -147,6 +147,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, struct mm_struct *mm = tsk->mm; const struct exception_table_entry *fix; unsigned long acc_type; + int fault; if (in_atomic() || !mm) goto no_context; @@ -173,23 +174,23 @@ good_area: * fault. */ - switch (handle_mm_fault(mm, vma, address, (acc_type & VM_WRITE) != 0)) { - case VM_FAULT_MINOR: - ++current->min_flt; - break; - case VM_FAULT_MAJOR: - ++current->maj_flt; - break; - case VM_FAULT_SIGBUS: + fault = handle_mm_fault(mm, vma, address, (acc_type & VM_WRITE) != 0); + if (unlikely(fault & VM_FAULT_ERROR)) { /* * We hit a shared mapping outside of the file, or some * other thing happened to us that made us unable to * handle the page fault gracefully. */ - goto bad_area; - default: - goto out_of_memory; + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto bad_area; + BUG(); } + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; up_read(&mm->mmap_sem); return; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 0ece51310bf..3767211b3d0 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -145,7 +145,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, struct mm_struct *mm = current->mm; siginfo_t info; int code = SEGV_MAPERR; - int is_write = 0; + int is_write = 0, ret; int trap = TRAP(regs); int is_exec = trap == 0x400; @@ -330,22 +330,18 @@ good_area: * the fault. 
*/ survive: - switch (handle_mm_fault(mm, vma, address, is_write)) { - - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: - goto out_of_memory; - default: + ret = handle_mm_fault(mm, vma, address, is_write); + if (unlikely(ret & VM_FAULT_ERROR)) { + if (ret & VM_FAULT_OOM) + goto out_of_memory; + else if (ret & VM_FAULT_SIGBUS) + goto do_sigbus; BUG(); } - + if (ret & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; up_read(&mm->mmap_sem); return 0; diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c index e064d0c0d80..07f88de0544 100644 --- a/arch/powerpc/platforms/cell/spufs/fault.c +++ b/arch/powerpc/platforms/cell/spufs/fault.c @@ -74,23 +74,21 @@ good_area: goto bad_area; } ret = 0; - *flt = handle_mm_fault(mm, vma, ea, is_write); - switch (*flt) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - case VM_FAULT_SIGBUS: - ret = -EFAULT; - goto bad_area; - case VM_FAULT_OOM: - ret = -ENOMEM; - goto bad_area; - default: + fault = handle_mm_fault(mm, vma, ea, is_write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) { + ret = -ENOMEM; + goto bad_area; + } else if (fault & VM_FAULT_SIGBUS) { + ret = -EFAULT; + goto bad_area; + } BUG(); } + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; up_read(&mm->mmap_sem); return ret; diff --git a/arch/ppc/mm/fault.c b/arch/ppc/mm/fault.c index 465f451f3bc..b98244e277f 100644 --- a/arch/ppc/mm/fault.c +++ b/arch/ppc/mm/fault.c @@ -96,6 +96,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, struct mm_struct *mm = current->mm; siginfo_t info; int code = SEGV_MAPERR; + int fault; #if defined(CONFIG_4xx) || defined (CONFIG_BOOKE) int is_write = error_code & ESR_DST; #else @@ -249,20 +250,18 @@ good_area: * the fault. 
*/ survive: - switch (handle_mm_fault(mm, vma, address, is_write)) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: - goto out_of_memory; - default: + fault = handle_mm_fault(mm, vma, address, is_write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; BUG(); } + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; up_read(&mm->mmap_sem); /* diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c index 63181671e3e..60604b2819b 100644 --- a/arch/s390/lib/uaccess_pt.c +++ b/arch/s390/lib/uaccess_pt.c @@ -20,6 +20,7 @@ static int __handle_fault(struct mm_struct *mm, unsigned long address, { struct vm_area_struct *vma; int ret = -EFAULT; + int fault; if (in_atomic()) return ret; @@ -44,20 +45,18 @@ static int __handle_fault(struct mm_struct *mm, unsigned long address, } survive: - switch (handle_mm_fault(mm, vma, address, write_access)) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto out_sigbus; - case VM_FAULT_OOM: - goto out_of_memory; - default: + fault = handle_mm_fault(mm, vma, address, write_access); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto out_sigbus; BUG(); } + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; ret = 0; out: up_read(&mm->mmap_sem); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index d855cdbf8fb..54055194e9a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -307,6 +307,7 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write) unsigned long address; int space; int si_code; + int fault; if (notify_page_fault(regs, error_code)) return; @@ -377,23 +378,22 @@ survive: * make sure we exit gracefully rather than endlessly redo * the fault. */ - switch (handle_mm_fault(mm, vma, address, write)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - do_sigbus(regs, error_code, address); - return; - case VM_FAULT_OOM: - if (do_out_of_memory(regs, error_code, address)) - goto survive; - return; - default: + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) { + if (do_out_of_memory(regs, error_code, address)) + goto survive; + return; + } else if (fault & VM_FAULT_SIGBUS) { + do_sigbus(regs, error_code, address); + return; + } BUG(); } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; up_read(&mm->mmap_sem); /* diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 0b3eaf6fbb2..964c6767dc7 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -33,6 +33,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, struct mm_struct *mm; struct vm_area_struct * vma; int si_code; + int fault; siginfo_t info; trace_hardirqs_on(); @@ -124,20 +125,18 @@ good_area: * the fault. 
*/ survive: - switch (handle_mm_fault(mm, vma, address, writeaccess)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: + fault = handle_mm_fault(mm, vma, address, writeaccess); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) goto out_of_memory; - default: - BUG(); + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; up_read(&mm->mmap_sem); return; diff --git a/arch/sh64/mm/fault.c b/arch/sh64/mm/fault.c index 3cd93ba5d82..0d069d82141 100644 --- a/arch/sh64/mm/fault.c +++ b/arch/sh64/mm/fault.c @@ -127,6 +127,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess, struct vm_area_struct * vma; const struct exception_table_entry *fixup; pte_t *pte; + int fault; #if defined(CONFIG_SH64_PROC_TLB) ++calls_to_do_slow_page_fault; @@ -221,18 +222,19 @@ good_area: * the fault. */ survive: - switch (handle_mm_fault(mm, vma, address, writeaccess)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - default: - goto out_of_memory; + fault = handle_mm_fault(mm, vma, address, writeaccess); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + /* If we get here, the page fault has been handled. Do the TLB refill now from the newly-setup PTE, to avoid having to fault again right away on the same instruction. */ diff --git a/arch/sparc/mm/fault.c b/arch/sparc/mm/fault.c index c3483365db4..50747fe4435 100644 --- a/arch/sparc/mm/fault.c +++ b/arch/sparc/mm/fault.c @@ -226,6 +226,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, unsigned long g2; siginfo_t info; int from_user = !(regs->psr & PSR_PS); + int fault; if(text_fault) address = regs->pc; @@ -289,19 +290,18 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - switch (handle_mm_fault(mm, vma, address, write)) { - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: - goto out_of_memory; - case VM_FAULT_MAJOR: + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); + } + if (fault & VM_FAULT_MAJOR) current->maj_flt++; - break; - case VM_FAULT_MINOR: - default: + else current->min_flt++; - break; - } up_read(&mm->mmap_sem); return; diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c index b582024d219..17123e9ecf7 100644 --- a/arch/sparc64/mm/fault.c +++ b/arch/sparc64/mm/fault.c @@ -278,7 +278,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned int insn = 0; - int si_code, fault_code; + int si_code, fault_code, fault; unsigned long address, mm_rss; fault_code = get_thread_fault_code(); @@ -415,20 +415,18 @@ good_area: goto bad_area; } - switch (handle_mm_fault(mm, vma, address, (fault_code & FAULT_CODE_WRITE))) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: - goto out_of_memory; - default: + fault = handle_mm_fault(mm, vma, address, (fault_code & FAULT_CODE_WRITE)); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; BUG(); } + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; up_read(&mm->mmap_sem); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index abab90c3803..3850d53f79f 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -76,23 +76,24 @@ good_area: goto out; do { + int fault; survive: - switch (handle_mm_fault(mm, vma, address, is_write)){ - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - case VM_FAULT_SIGBUS: - err = -EACCES; - goto out; - case VM_FAULT_OOM: - err = -ENOMEM; - goto out_of_memory; - default: + fault = handle_mm_fault(mm, vma, address, is_write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) { + err = -ENOMEM; + goto out_of_memory; + } else if (fault & VM_FAULT_SIGBUS) { + err = -EACCES; + goto out; + } BUG(); } + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; + pgd = pgd_offset(mm, address); pud = pud_offset(pgd, address); pmd = pmd_offset(pud, address); diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 635e58d443d..84f11728fc7 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -317,7 +317,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, struct vm_area_struct * vma; unsigned long address; const struct exception_table_entry *fixup; - int write; + int write, fault; unsigned long flags; siginfo_t info; @@ -450,19 +450,18 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - switch (handle_mm_fault(mm, vma, address, write)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - default: - goto out_of_memory; + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); } - + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; up_read(&mm->mmap_sem); return; diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 3dc6f2f07bb..16004067add 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -41,6 +41,7 @@ void do_page_fault(struct pt_regs *regs) siginfo_t info; int is_write, is_exec; + int fault; info.si_code = SEGV_MAPERR; @@ -102,20 +103,18 @@ good_area: * the fault. */ survive: - switch (handle_mm_fault(mm, vma, address, is_write)) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: - goto out_of_memory; - default: + fault = handle_mm_fault(mm, vma, address, is_write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; BUG(); } + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; up_read(&mm->mmap_sem); return; diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c index dc287d2e3a6..927d739d468 100644 --- a/fs/gfs2/ops_vm.c +++ b/fs/gfs2/ops_vm.c @@ -112,7 +112,7 @@ static int gfs2_sharewrite_fault(struct vm_area_struct *vma, struct gfs2_holder i_gh; int alloc_required; int error; - int ret = VM_FAULT_MINOR; + int ret = 0; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); if (error) @@ -132,14 +132,19 @@ static int gfs2_sharewrite_fault(struct vm_area_struct *vma, set_bit(GFF_EXLOCK, &gf->f_flags); ret = filemap_fault(vma, vmf); clear_bit(GFF_EXLOCK, &gf->f_flags); - if (ret & (VM_FAULT_ERROR | FAULT_RET_NOPAGE)) + if (ret & VM_FAULT_ERROR) goto out_unlock; if (alloc_required) { /* XXX: do we need to drop page lock around alloc_page_backing?*/ error = alloc_page_backing(ip, vmf->page); if (error) { - if (ret & FAULT_RET_LOCKED) + /* + * VM_FAULT_LOCKED should always be the case for + * filemap_fault, but it may not be in a future + * implementation. + */ + if (ret & VM_FAULT_LOCKED) unlock_page(vmf->page); page_cache_release(vmf->page); ret = VM_FAULT_OOM; diff --git a/include/linux/mm.h b/include/linux/mm.h index ff0b8844bd5..f8e12b3b611 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -196,25 +196,10 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ -#define FAULT_RET_NOPAGE 0x0100 /* ->fault did not return a page. This - * can be used if the handler installs - * their own pte. - */ -#define FAULT_RET_LOCKED 0x0200 /* ->fault locked the page, caller must - * unlock after installing the mapping. - * This is used by pagecache in - * particular, where the page lock is - * used to synchronise against truncate - * and invalidate. Mutually exclusive - * with FAULT_RET_NOPAGE. - */ - /* * vm_fault is filled by the the pagefault handler and passed to the vma's - * ->fault function. 
The vma's ->fault is responsible for returning the - * VM_FAULT_xxx type which occupies the lowest byte of the return code, ORed - * with FAULT_RET_ flags that occupy the next byte and give details about - * how the fault was handled. + * ->fault function. The vma's ->fault is responsible for returning a bitmask + * of VM_FAULT_xxx flags that give details about how the fault was handled. * * pgoff should be used in favour of virtual_address, if possible. If pgoff * is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear @@ -226,9 +211,9 @@ struct vm_fault { void __user *virtual_address; /* Faulting virtual address */ struct page *page; /* ->fault handlers should return a - * page here, unless FAULT_RET_NOPAGE + * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by - * VM_FAULT_OOM or SIGBUS). + * VM_FAULT_ERROR). */ }; @@ -712,26 +697,17 @@ static inline int page_mapped(struct page *page) * just gets major/minor fault counters bumped up. */ -/* - * VM_FAULT_ERROR is set for the error cases, to make some tests simpler. - */ -#define VM_FAULT_ERROR 0x20 +#define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */ -#define VM_FAULT_OOM (0x00 | VM_FAULT_ERROR) -#define VM_FAULT_SIGBUS (0x01 | VM_FAULT_ERROR) -#define VM_FAULT_MINOR 0x02 -#define VM_FAULT_MAJOR 0x03 +#define VM_FAULT_OOM 0x0001 +#define VM_FAULT_SIGBUS 0x0002 +#define VM_FAULT_MAJOR 0x0004 +#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ -/* - * Special case for get_user_pages. - * Must be in a distinct bit from the above VM_FAULT_ flags. - */ -#define VM_FAULT_WRITE 0x10 +#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ +#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ -/* - * Mask of VM_FAULT_ flags - */ -#define VM_FAULT_MASK 0xff +#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS) #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) @@ -817,16 +793,8 @@ extern int vmtruncate(struct inode * inode, loff_t offset); extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end); #ifdef CONFIG_MMU -extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, +extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access); - -static inline int handle_mm_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - int write_access) -{ - return __handle_mm_fault(mm, vma, address, write_access) & - (~VM_FAULT_WRITE); -} #else static inline int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, diff --git a/kernel/futex.c b/kernel/futex.c index 5c3f45d07c5..a12425051ee 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -346,15 +346,20 @@ static int futex_handle_fault(unsigned long address, vma = find_vma(mm, address); if (vma && address >= vma->vm_start && (vma->vm_flags & VM_WRITE)) { - switch (handle_mm_fault(mm, vma, address, 1)) { - case VM_FAULT_MINOR: - ret = 0; - current->min_flt++; - break; - case VM_FAULT_MAJOR: + int fault; + fault = handle_mm_fault(mm, vma, address, 1); + if (unlikely((fault & VM_FAULT_ERROR))) { +#if 0 + /* XXX: let's do this when we verify it is OK */ + if (ret & VM_FAULT_OOM) + ret = -ENOMEM; +#endif + } else { ret = 0; - current->maj_flt++; - break; + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; } } if (!fshared) diff --git a/mm/filemap.c b/mm/filemap.c index 0876cc57255..4fd9e3f0f48 100644 --- 
a/mm/filemap.c +++ b/mm/filemap.c @@ -1322,9 +1322,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *page; unsigned long size; int did_readaround = 0; - int ret; - - ret = VM_FAULT_MINOR; + int ret = 0; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (vmf->pgoff >= size) @@ -1408,7 +1406,7 @@ retry_find: */ mark_page_accessed(page); vmf->page = page; - return ret | FAULT_RET_LOCKED; + return ret | VM_FAULT_LOCKED; outside_data_content: /* diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 847d5d78163..53ee6a29963 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -252,7 +252,7 @@ static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf) out: page_cache_get(page); vmf->page = page; - return VM_FAULT_MINOR; + return 0; } static struct vm_operations_struct xip_file_vm_ops = { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index aaa7c1a682d..c4a573b857b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -469,7 +469,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, avoidcopy = (page_count(old_page) == 1); if (avoidcopy) { set_huge_ptep_writable(vma, address, ptep); - return VM_FAULT_MINOR; + return 0; } page_cache_get(old_page); @@ -494,7 +494,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_release(new_page); page_cache_release(old_page); - return VM_FAULT_MINOR; + return 0; } static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, @@ -551,7 +551,7 @@ retry: if (idx >= size) goto backout; - ret = VM_FAULT_MINOR; + ret = 0; if (!pte_none(*ptep)) goto backout; @@ -602,7 +602,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } - ret = VM_FAULT_MINOR; + ret = 0; spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ @@ -641,7 +641,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); ret = hugetlb_fault(mm, vma, vaddr, 0); spin_lock(&mm->page_table_lock); - if (ret == VM_FAULT_MINOR) + if (!(ret & VM_FAULT_MAJOR)) continue; remainder = 0; diff --git a/mm/memory.c b/mm/memory.c index 23c870479b3..61d51da7e17 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1068,31 +1068,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; - ret = __handle_mm_fault(mm, vma, start, + ret = handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return i ? i : -ENOMEM; + else if (ret & VM_FAULT_SIGBUS) + return i ? i : -EFAULT; + BUG(); + } + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + /* - * The VM_FAULT_WRITE bit tells us that do_wp_page has - * broken COW when necessary, even if maybe_mkwrite - * decided not to set pte_write. We can thus safely do - * subsequent page lookups as if they were reads. + * The VM_FAULT_WRITE bit tells us that + * do_wp_page has broken COW when necessary, + * even if maybe_mkwrite decided not to set + * pte_write. We can thus safely do subsequent + * page lookups as if they were reads. */ if (ret & VM_FAULT_WRITE) foll_flags &= ~FOLL_WRITE; - - switch (ret & ~VM_FAULT_WRITE) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - return i ? i : -EFAULT; - case VM_FAULT_OOM: - return i ? 
i : -ENOMEM; - default: - BUG(); - } + cond_resched(); } if (pages) { @@ -1639,7 +1638,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int reuse = 0, ret = VM_FAULT_MINOR; + int reuse = 0, ret = 0; struct page *dirty_page = NULL; old_page = vm_normal_page(vma, address, orig_pte); @@ -1835,8 +1834,8 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma, /* * files that support invalidating or truncating portions of the * file from under mmaped areas must have their ->fault function - * return a locked page (and FAULT_RET_LOCKED code). This provides - * synchronisation against concurrent unmapping here. + * return a locked page (and set VM_FAULT_LOCKED in the return). + * This provides synchronisation against concurrent unmapping here. */ again: @@ -2140,7 +2139,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; swp_entry_t entry; pte_t pte; - int ret = VM_FAULT_MINOR; + int ret = 0; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) goto out; @@ -2208,8 +2207,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(page); if (write_access) { + /* XXX: We could OR the do_wp_page code with this one? */ if (do_wp_page(mm, vma, address, - page_table, pmd, ptl, pte) == VM_FAULT_OOM) + page_table, pmd, ptl, pte) & VM_FAULT_OOM) ret = VM_FAULT_OOM; goto out; } @@ -2280,7 +2280,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, lazy_mmu_prot_update(entry); unlock: pte_unmap_unlock(page_table, ptl); - return VM_FAULT_MINOR; + return 0; release: page_cache_release(page); goto unlock; @@ -2323,11 +2323,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (likely(vma->vm_ops->fault)) { ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | FAULT_RET_NOPAGE))) - return (ret & VM_FAULT_MASK); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; } else { /* Legacy ->nopage path */ - ret = VM_FAULT_MINOR; + ret = 0; vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); /* no page was available -- either SIGBUS or OOM */ if (unlikely(vmf.page == NOPAGE_SIGBUS)) @@ -2340,7 +2340,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * For consistency in subsequent calls, make the faulted page always * locked. */ - if (unlikely(!(ret & FAULT_RET_LOCKED))) + if (unlikely(!(ret & VM_FAULT_LOCKED))) lock_page(vmf.page); else VM_BUG_ON(!PageLocked(vmf.page)); @@ -2356,7 +2356,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out; } - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, address); if (!page) { ret = VM_FAULT_OOM; goto out; @@ -2384,7 +2385,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * is better done later. 
*/ if (!page->mapping) { - ret = VM_FAULT_MINOR; + ret = 0; anon = 1; /* no anon but release vmf.page */ goto out; } @@ -2447,7 +2448,7 @@ out_unlocked: put_page(dirty_page); } - return (ret & VM_FAULT_MASK); + return ret; } static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, @@ -2486,7 +2487,6 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; unsigned long pfn; - int ret = VM_FAULT_MINOR; pte_unmap(page_table); BUG_ON(!(vma->vm_flags & VM_PFNMAP)); @@ -2498,7 +2498,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, else if (unlikely(pfn == NOPFN_SIGBUS)) return VM_FAULT_SIGBUS; else if (unlikely(pfn == NOPFN_REFAULT)) - return VM_FAULT_MINOR; + return 0; page_table = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -2510,7 +2510,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, address, page_table, entry); } pte_unmap_unlock(page_table, ptl); - return ret; + return 0; } /* @@ -2531,7 +2531,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t pgoff; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - return VM_FAULT_MINOR; + return 0; if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || !(vma->vm_flags & VM_CAN_NONLINEAR))) { @@ -2615,13 +2615,13 @@ static inline int handle_pte_fault(struct mm_struct *mm, } unlock: pte_unmap_unlock(pte, ptl); - return VM_FAULT_MINOR; + return 0; } /* * By the time we get here, we already hold the mm semaphore */ -int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access) { pgd_t *pgd; @@ -2650,7 +2650,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_pte_fault(mm, vma, address, pte, pmd, write_access); } -EXPORT_SYMBOL_GPL(__handle_mm_fault); +EXPORT_SYMBOL_GPL(handle_mm_fault); #ifndef __PAGETABLE_PUD_FOLDED /* diff --git a/mm/shmem.c b/mm/shmem.c index 0a555af8733..ad155c7745d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1103,7 +1103,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, return -EFBIG; if (type) - *type = VM_FAULT_MINOR; + *type = 0; /* * Normally, filepage is NULL on entry, and either found @@ -1138,9 +1138,9 @@ repeat: if (!swappage) { shmem_swp_unmap(entry); /* here we actually do the io */ - if (type && *type == VM_FAULT_MINOR) { + if (type && !(*type & VM_FAULT_MAJOR)) { __count_vm_event(PGMAJFAULT); - *type = VM_FAULT_MAJOR; + *type |= VM_FAULT_MAJOR; } spin_unlock(&info->lock); swappage = shmem_swapin(info, swap, idx); @@ -1323,7 +1323,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); mark_page_accessed(vmf->page); - return ret | FAULT_RET_LOCKED; + return ret | VM_FAULT_LOCKED; } #ifdef CONFIG_NUMA -- cgit v1.2.3-70-g09d2 From 328616e3b76859f1abdd08a8df1ddbb7bb81f807 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Jul 2007 01:47:26 -0700 Subject: freezer: run show_state() when freezing times out To see which tasks are stuck where. Cc: "Rafael J. 
Wysocki" Cc: Oleg Nesterov Cc: Alan Stern Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index e0233d8422b..b850173e756 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -157,6 +157,7 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) freeze_user_space ? "user space processes" : "kernel threads", TIMEOUT / HZ, todo); + show_state(); read_lock(&tasklist_lock); do_each_thread(g, p) { if (freeze_user_space && !is_user_space(p)) -- cgit v1.2.3-70-g09d2 From a0349828d6d6f95c445674c2953ee9db75c11f8f Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Thu, 19 Jul 2007 01:47:27 -0700 Subject: PM: Do not require dev spew to get PM_DEBUG In order to enable things like PM_TRACE, you're required to enable PM_DEBUG, which sends a large spew of messages on boot, and often times can overflow dmesg buffer. Create new PM_VERBOSE and shift that to be the option that enables drivers/base/power's messages. Signed-off-by: Ben Collins Cc: "Rafael J. Wysocki" Cc: Pavel Machek Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/power/Makefile | 2 +- kernel/power/Kconfig | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index fff17800720..966a5e28741 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -5,6 +5,6 @@ obj-$(CONFIG_PM_TRACE) += trace.o ifeq ($(CONFIG_DEBUG_DRIVER),y) EXTRA_CFLAGS += -DDEBUG endif -ifeq ($(CONFIG_PM_DEBUG),y) +ifeq ($(CONFIG_PM_VERBOSE),y) EXTRA_CFLAGS += -DDEBUG endif diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 73328476761..7358609e473 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -33,13 +33,20 @@ config PM_DEBUG bool "Power Management Debug Support" depends on PM ---help--- - This option enables verbose debugging support in the Power Management - code. This is helpful when debugging and reporting various PM bugs, - like suspend support. + This option enables various debugging support in the Power Management + code. This is helpful when debugging and reporting PM bugs, like + suspend support. + +config PM_VERBOSE + bool "Verbose Power Management debugging" + depends on PM_DEBUG + default n + ---help--- + This option enables verbose messages from the Power Management code. config DISABLE_CONSOLE_SUSPEND bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" - depends on PM && PM_DEBUG + depends on PM_DEBUG default n ---help--- This option turns off the console suspend mechanism that prevents @@ -50,7 +57,7 @@ config DISABLE_CONSOLE_SUSPEND config PM_TRACE bool "Suspend/resume event tracing" - depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL + depends on PM_DEBUG && X86_32 && EXPERIMENTAL default n ---help--- This enables some cheesy code to save the last PM event point in the -- cgit v1.2.3-70-g09d2 From 127067a9c994dff16b280f409cc7b18a54a63719 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:28 -0700 Subject: swsusp: remove incorrect code from user.c In the face of the recent change of suspend code ordering (cf. http://marc.info/?l=linux-acpi&m=117938245931603&w=2) we should also modify the code ordering in swsusp so that hibernation_ops->prepare() is executed after device_suspend(). 
However, for this purpose it seems reasonable to eliminate the code duplication between kernel/power/disk.c and kernel/power/user.c first. By eliminating it we can reduce the size of user.c quite substantially and remove the maintenance difficulty with making essentially the same changes in two different places. Moreover, we should also remove the calls to "platform" functions from the restore code path, since it doesn't carry out any power transition of the system, but we generally need to disable the GPEs before the restore if the 'platform' hibernation mode has been used. To do this, we can introduce two new hibernation_ops to be used in the restore code. This patch: Make the code hibernation code in kernel/power/user.c be functionally equivalent to the corresponding code in kernel/power/disk.c , as it should be. The calls to the platform functions removed by this patch are incorrect. They should be replaced with some other "platform" invocations that will be introduced in one of the subsequent patches. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/user.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index d65305b515b..09468ec6112 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -181,34 +181,25 @@ static inline int snapshot_suspend(int platform_suspend) return error; } -static inline int snapshot_restore(int platform_suspend) +static inline int snapshot_restore(void) { int error; mutex_lock(&pm_mutex); pm_prepare_console(); - if (platform_suspend) { - error = platform_prepare(); - if (error) - goto Finish; - } suspend_console(); error = device_suspend(PMSG_PRETHAW); if (error) - goto Resume_devices; + goto Finish; error = disable_nonboot_cpus(); if (!error) error = swsusp_resume(); enable_nonboot_cpus(); - Resume_devices: - if (platform_suspend) - platform_finish(); - + Finish: device_resume(); resume_console(); - Finish: pm_restore_console(); mutex_unlock(&pm_mutex); return error; @@ -274,7 +265,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - error = snapshot_restore(data->platform_suspend); + error = snapshot_restore(); break; case SNAPSHOT_FREE: -- cgit v1.2.3-70-g09d2 From 7777fab989b5d006903188c966058ebcd2d6342a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:29 -0700 Subject: swsusp: remove code duplication between disk.c and user.c Currently, much of the code in kernel/power/disk.c is duplicated in kernel/power/user.c , mainly for historical reasons. By eliminating this code duplication we can reduce the size of user.c quite substantially and remove the maintenance difficulty resulting from it. [bunk@stusta.de: kernel/power/disk.c: make code static] Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 184 +++++++++++++++++++++++++++++---------------------- kernel/power/power.h | 5 +- kernel/power/user.c | 96 ++------------------------- 3 files changed, 115 insertions(+), 170 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f445b9cd60f..47882bfa610 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -45,7 +45,7 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; -struct hibernation_ops *hibernation_ops; +static struct hibernation_ops *hibernation_ops; /** * hibernation_set_ops - set the global hibernate operations @@ -74,9 +74,9 @@ void hibernation_set_ops(struct hibernation_ops *ops) * platform driver if so configured and return an error code if it fails */ -static int platform_prepare(void) +static int platform_prepare(int platform_mode) { - return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ? + return (platform_mode && hibernation_ops) ? hibernation_ops->prepare() : 0; } @@ -85,12 +85,103 @@ static int platform_prepare(void) * using the platform driver (must be called after platform_prepare()) */ -static void platform_finish(void) +static void platform_finish(int platform_mode) { - if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) + if (platform_mode && hibernation_ops) hibernation_ops->finish(); } +/** + * hibernation_snapshot - quiesce devices and create the hibernation + * snapshot image. + * @platform_mode - if set, use the platform driver, if available, to + * prepare the platform frimware for the power transition. + * + * Must be called with pm_mutex held + */ + +int hibernation_snapshot(int platform_mode) +{ + int error; + + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); + if (error) + goto Finish; + + error = platform_prepare(platform_mode); + if (error) + goto Finish; + + suspend_console(); + error = device_suspend(PMSG_FREEZE); + if (error) + goto Resume_devices; + + error = disable_nonboot_cpus(); + if (!error) { + if (hibernation_mode != HIBERNATION_TEST) { + in_suspend = 1; + error = swsusp_suspend(); + /* Control returns here after successful restore */ + } else { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + } + } + enable_nonboot_cpus(); + Resume_devices: + platform_finish(platform_mode); + device_resume(); + resume_console(); + Finish: + return error; +} + +/** + * hibernation_restore - quiesce devices and restore the hibernation + * snapshot image. If successful, control returns in hibernation_snaphot() + * + * Must be called with pm_mutex held + */ + +int hibernation_restore(void) +{ + int error; + + pm_prepare_console(); + suspend_console(); + error = device_suspend(PMSG_PRETHAW); + if (error) + goto Finish; + + error = disable_nonboot_cpus(); + if (!error) + error = swsusp_resume(); + + enable_nonboot_cpus(); + Finish: + device_resume(); + resume_console(); + pm_restore_console(); + return error; +} + +/** + * hibernation_platform_enter - enter the hibernation state using the + * platform driver (if available) + */ + +int hibernation_platform_enter(void) +{ + if (hibernation_ops) { + kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); + return hibernation_ops->enter(); + } else { + return -ENOSYS; + } +} + /** * power_down - Shut the machine down for hibernation. 
* @@ -111,11 +202,7 @@ static void power_down(void) kernel_restart(NULL); break; case HIBERNATION_PLATFORM: - if (hibernation_ops) { - kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - hibernation_ops->enter(); - break; - } + hibernation_platform_enter(); } kernel_halt(); /* @@ -171,62 +258,17 @@ int hibernate(void) mdelay(5000); goto Thaw; } - - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); - if (error) - goto Thaw; - - error = platform_prepare(); - if (error) - goto Thaw; - - suspend_console(); - error = device_suspend(PMSG_FREEZE); - if (error) { - printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Resume_devices; - } - error = disable_nonboot_cpus(); - if (error) - goto Enable_cpus; - - if (hibernation_mode == HIBERNATION_TEST) { - printk("swsusp debug: Waiting for 5 seconds.\n"); - mdelay(5000); - goto Enable_cpus; - } - - pr_debug("PM: snapshotting memory.\n"); - in_suspend = 1; - error = swsusp_suspend(); - if (error) - goto Enable_cpus; - - if (in_suspend) { - enable_nonboot_cpus(); - platform_finish(); - device_resume(); - resume_console(); + error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); + if (in_suspend && !error) { pr_debug("PM: writing image.\n"); error = swsusp_write(); + swsusp_free(); if (!error) power_down(); - else { - swsusp_free(); - goto Thaw; - } } else { pr_debug("PM: Image restored successfully.\n"); + swsusp_free(); } - - swsusp_free(); - Enable_cpus: - enable_nonboot_cpus(); - Resume_devices: - platform_finish(); - device_resume(); - resume_console(); Thaw: mutex_unlock(&pm_mutex); unprepare_processes(); @@ -301,29 +343,11 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); error = swsusp_read(); - if (error) { - swsusp_free(); - goto Thaw; - } - - pr_debug("PM: Preparing devices for restore.\n"); - - suspend_console(); - error = device_suspend(PMSG_PRETHAW); - if (error) - goto Free; - - error = disable_nonboot_cpus(); if (!error) - swsusp_resume(); + hibernation_restore(); - enable_nonboot_cpus(); - Free: - swsusp_free(); - device_resume(); - resume_console(); - Thaw: printk(KERN_ERR "PM: Restore failed, recovering.\n"); + swsusp_free(); unprepare_processes(); Done: free_basic_memory_bitmaps(); @@ -333,7 +357,7 @@ static int software_resume(void) Unlock: mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); - return 0; + return error; } late_initcall(software_resume); diff --git a/kernel/power/power.h b/kernel/power/power.h index 51381487103..70c378b3f85 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -25,7 +25,10 @@ struct swsusp_info { */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) -extern struct hibernation_ops *hibernation_ops; +/* kernel/power/disk.c */ +extern int hibernation_snapshot(int platform_mode); +extern int hibernation_restore(void); +extern int hibernation_platform_enter(void); #endif extern int pfn_is_nosave(unsigned long); diff --git a/kernel/power/user.c b/kernel/power/user.c index 09468ec6112..bfed3b92409 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -128,83 +128,6 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, return res; } -static inline int platform_prepare(void) -{ - int error = 0; - - if (hibernation_ops) - error = hibernation_ops->prepare(); - - return error; -} - -static inline void platform_finish(void) -{ - if (hibernation_ops) - hibernation_ops->finish(); -} - -static inline int snapshot_suspend(int platform_suspend) -{ - int error; - - 
mutex_lock(&pm_mutex); - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); - if (error) - goto Finish; - - if (platform_suspend) { - error = platform_prepare(); - if (error) - goto Finish; - } - suspend_console(); - error = device_suspend(PMSG_FREEZE); - if (error) - goto Resume_devices; - - error = disable_nonboot_cpus(); - if (!error) { - in_suspend = 1; - error = swsusp_suspend(); - } - enable_nonboot_cpus(); - Resume_devices: - if (platform_suspend) - platform_finish(); - - device_resume(); - resume_console(); - Finish: - mutex_unlock(&pm_mutex); - return error; -} - -static inline int snapshot_restore(void) -{ - int error; - - mutex_lock(&pm_mutex); - pm_prepare_console(); - suspend_console(); - error = device_suspend(PMSG_PRETHAW); - if (error) - goto Finish; - - error = disable_nonboot_cpus(); - if (!error) - error = swsusp_resume(); - - enable_nonboot_cpus(); - Finish: - device_resume(); - resume_console(); - pm_restore_console(); - mutex_unlock(&pm_mutex); - return error; -} - static int snapshot_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { @@ -251,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - error = snapshot_suspend(data->platform_suspend); + error = hibernation_snapshot(data->platform_suspend); if (!error) error = put_user(in_suspend, (unsigned int __user *)arg); if (!error) @@ -265,7 +188,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - error = snapshot_restore(); + error = hibernation_restore(); break; case SNAPSHOT_FREE: @@ -377,19 +300,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, switch (arg) { case PMOPS_PREPARE: - if (hibernation_ops) { - data->platform_suspend = 1; - error = 0; - } else { - error = -ENOSYS; - } + data->platform_suspend = 1; + error = 0; break; case PMOPS_ENTER: - if (data->platform_suspend) { - kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - error = hibernation_ops->enter(); - } + if (data->platform_suspend) + error = hibernation_platform_enter(); + break; case PMOPS_FINISH: -- cgit v1.2.3-70-g09d2 From a634cc10164d1c229fbeca33923e6a0ed939e894 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:30 -0700 Subject: swsusp: introduce restore platform operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At least on some machines it is necessary to prepare the ACPI firmware for the restoration of the system memory state from the hibernation image if the "platform" mode of hibernation has been used. Namely, in that cases we need to disable the GPEs before replacing the "boot" kernel with the "frozen" kernel (cf. http://bugzilla.kernel.org/show_bug.cgi?id=7887). After the restore they will be re-enabled by hibernation_ops->finish(), but if the restore fails, they have to be re-enabled by the restore code explicitly. For this purpose we can introduce two additional hibernation operations, called pre_restore() and restore_cleanup() and call them from the restore code path. Still, they should be called if the "platform" mode of hibernation has been used, so we need to pass the information about the hibernation mode from the "frozen" kernel to the "boot" kernel in the image header. Apparently, we can't drop the disabling of GPEs before the restore because of Bug #7887 .  
We also can't do it unconditionally, because the GPEs wouldn't have been enabled after a successful restore if the suspend had been done in the 'shutdown' or 'reboot' mode. In principle we could (and probably should) unconditionally disable the GPEs before each snapshot creation *and* before the restore, but then we'd have to unconditionally enable them after the snapshot creation as well as after the restore (or restore failure)   Still, for this purpose we'd need to modify acpi_enter_sleep_state_prep() and acpi_leave_sleep_state() and we'd have to introduce some mechanism synchronizing the disablind/enabling of the GPEs with the device drivers' .suspend()/.resume() routines and with disable_/enable_nonboot_cpus().  However, this would have affected the suspend (ie. s2ram) code as well as the hibernation, which I'd like to avoid in this patch series. Signed-off-by: Rafael J. Wysocki Cc: Nigel Cunningham Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/acpi/sleep/main.c | 16 ++++++++++++++ include/linux/suspend.h | 4 ++++ kernel/power/disk.c | 56 ++++++++++++++++++++++++++++++++++++++--------- kernel/power/power.h | 13 ++++++++--- kernel/power/swap.c | 20 ++++++++++++----- kernel/power/user.c | 2 +- 6 files changed, 92 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c index bc7e16ec839..42127c0d612 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -217,10 +217,26 @@ static void acpi_hibernation_finish(void) } } +static int acpi_hibernation_pre_restore(void) +{ + acpi_status status; + + status = acpi_hw_disable_all_gpes(); + + return ACPI_SUCCESS(status) ? 0 : -EFAULT; +} + +static void acpi_hibernation_restore_cleanup(void) +{ + acpi_hw_enable_all_runtime_gpes(); +} + static struct hibernation_ops acpi_hibernation_ops = { .prepare = acpi_hibernation_prepare, .enter = acpi_hibernation_enter, .finish = acpi_hibernation_finish, + .pre_restore = acpi_hibernation_pre_restore, + .restore_cleanup = acpi_hibernation_restore_cleanup, }; #endif /* CONFIG_SOFTWARE_SUSPEND */ diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 9c7cb643066..d235c146da2 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -43,11 +43,15 @@ static inline void pm_restore_console(void) {} * @prepare: prepare system for hibernation * @enter: shut down system after state has been saved to disk * @finish: finish/clean up after state has been reloaded + * @pre_restore: prepare system for the restoration from a hibernation image + * @restore_cleanup: clean up after a failing image restoration */ struct hibernation_ops { int (*prepare)(void); int (*enter)(void); void (*finish)(void); + int (*pre_restore)(void); + void (*restore_cleanup)(void); }; #if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 47882bfa610..fa3b43b7206 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -54,7 +54,8 @@ static struct hibernation_ops *hibernation_ops; void hibernation_set_ops(struct hibernation_ops *ops) { - if (ops && !(ops->prepare && ops->enter && ops->finish)) { + if (ops && !(ops->prepare && ops->enter && ops->finish + && ops->pre_restore && ops->restore_cleanup)) { WARN_ON(1); return; } @@ -91,6 +92,31 @@ static void platform_finish(int platform_mode) hibernation_ops->finish(); } +/** + * platform_pre_restore - prepare the platform for the restoration from a + * hibernation image. 
If the restore fails after this function has been + * called, platform_restore_cleanup() must be called. + */ + +static int platform_pre_restore(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_restore() : 0; +} + +/** + * platform_restore_cleanup - switch the platform to the normal mode of + * operation after a failing restore. If platform_pre_restore() has been + * called before the failing restore, this function must be called too, + * regardless of the result of platform_pre_restore(). + */ + +static void platform_restore_cleanup(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->restore_cleanup(); +} + /** * hibernation_snapshot - quiesce devices and create the hibernation * snapshot image. @@ -141,11 +167,13 @@ int hibernation_snapshot(int platform_mode) /** * hibernation_restore - quiesce devices and restore the hibernation * snapshot image. If successful, control returns in hibernation_snaphot() + * @platform_mode - if set, use the platform driver, if available, to + * prepare the platform frimware for the transition. * * Must be called with pm_mutex held */ -int hibernation_restore(void) +int hibernation_restore(int platform_mode) { int error; @@ -155,11 +183,14 @@ int hibernation_restore(void) if (error) goto Finish; - error = disable_nonboot_cpus(); - if (!error) - error = swsusp_resume(); - - enable_nonboot_cpus(); + error = platform_pre_restore(platform_mode); + if (!error) { + error = disable_nonboot_cpus(); + if (!error) + error = swsusp_resume(); + enable_nonboot_cpus(); + } + platform_restore_cleanup(platform_mode); Finish: device_resume(); resume_console(); @@ -260,8 +291,12 @@ int hibernate(void) } error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); if (in_suspend && !error) { + unsigned int flags = 0; + + if (hibernation_mode == HIBERNATION_PLATFORM) + flags |= SF_PLATFORM_MODE; pr_debug("PM: writing image.\n"); - error = swsusp_write(); + error = swsusp_write(flags); swsusp_free(); if (!error) power_down(); @@ -295,6 +330,7 @@ int hibernate(void) static int software_resume(void) { int error; + unsigned int flags; mutex_lock(&pm_mutex); if (!swsusp_resume_device) { @@ -342,9 +378,9 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - error = swsusp_read(); + error = swsusp_read(&flags); if (!error) - hibernation_restore(); + hibernation_restore(flags & SF_PLATFORM_MODE); printk(KERN_ERR "PM: Restore failed, recovering.\n"); swsusp_free(); diff --git a/kernel/power/power.h b/kernel/power/power.h index 70c378b3f85..eab3603b7ca 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -27,7 +27,7 @@ struct swsusp_info { /* kernel/power/disk.c */ extern int hibernation_snapshot(int platform_mode); -extern int hibernation_restore(void); +extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); #endif @@ -155,13 +155,20 @@ extern sector_t alloc_swapdev_block(int swap); extern void free_all_swap_pages(int swap); extern int swsusp_swap_in_use(void); +/* + * Flags that can be passed from the hibernatig hernel to the "boot" kernel in + * the image header. 
+ */ +#define SF_PLATFORM_MODE 1 + +/* kernel/power/disk.c */ extern int swsusp_check(void); extern int swsusp_shrink_memory(void); extern void swsusp_free(void); extern int swsusp_suspend(void); extern int swsusp_resume(void); -extern int swsusp_read(void); -extern int swsusp_write(void); +extern int swsusp_read(unsigned int *flags_p); +extern int swsusp_write(unsigned int flags); extern void swsusp_close(void); extern int suspend_enter(suspend_state_t state); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8b1a1b83714..917aba10057 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -33,8 +33,9 @@ extern char resume_file[]; #define SWSUSP_SIG "S1SUSPEND" struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; + char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; sector_t image; + unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; } __attribute__((packed)); @@ -138,7 +139,7 @@ static int wait_on_bio_chain(struct bio **bio_chain) * Saving part */ -static int mark_swapfiles(sector_t start) +static int mark_swapfiles(sector_t start, unsigned int flags) { int error; @@ -148,6 +149,7 @@ static int mark_swapfiles(sector_t start) memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); memcpy(swsusp_header->sig,SWSUSP_SIG, 10); swsusp_header->image = start; + swsusp_header->flags = flags; error = bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { @@ -369,6 +371,7 @@ static int enough_swap(unsigned int nr_pages) /** * swsusp_write - Write entire image and metadata. + * @flags: flags to pass to the "boot" kernel in the image header * * It is important _NOT_ to umount filesystems at this point. We want * them synced (in case something goes wrong) but we DO not want to mark @@ -376,7 +379,7 @@ static int enough_swap(unsigned int nr_pages) * correctly, we'll mark system clean, anyway.) */ -int swsusp_write(void) +int swsusp_write(unsigned int flags) { struct swap_map_handle handle; struct snapshot_handle snapshot; @@ -415,7 +418,7 @@ int swsusp_write(void) if (!error) { flush_swap_writer(&handle); printk("S"); - error = mark_swapfiles(start); + error = mark_swapfiles(start, flags); printk("|\n"); } } @@ -540,13 +543,20 @@ static int load_image(struct swap_map_handle *handle, return error; } -int swsusp_read(void) +/** + * swsusp_read - read the hibernation image. + * @flags_p: flags passed by the "frozen" kernel in the image header should + * be written into this memeory location + */ + +int swsusp_read(unsigned int *flags_p) { int error; struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; + *flags_p = swsusp_header->flags; if (IS_ERR(resume_bdev)) { pr_debug("swsusp: block device not initialised\n"); return PTR_ERR(resume_bdev); diff --git a/kernel/power/user.c b/kernel/power/user.c index bfed3b92409..1f24f30b951 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -188,7 +188,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - error = hibernation_restore(); + error = hibernation_restore(data->platform_suspend); break; case SNAPSHOT_FREE: -- cgit v1.2.3-70-g09d2 From 10a1803d667e209914eaada9b95525252f23ec78 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:31 -0700 Subject: swsusp: fix hibernation code ordering Change the code ordering so that hibernation_ops->prepare() is called after device_suspend(). 
This is needed so that we don't violate the ACPI specification, which states that the _PTS and _GTS system-control methods, executed from acpi_sleep_prepare(), ought to be called after devices have been put in low power states. The "Finish" label in hibernation_restore() is moved, because device_suspend() resumes devices if the suspending of them fails and the restore code ordering should reflect the hibernation code ordering. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index fa3b43b7206..77ac605bf20 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -133,14 +133,14 @@ int hibernation_snapshot(int platform_mode) /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); if (error) - goto Finish; - - error = platform_prepare(platform_mode); - if (error) - goto Finish; + return error; suspend_console(); error = device_suspend(PMSG_FREEZE); + if (error) + goto Resume_console; + + error = platform_prepare(platform_mode); if (error) goto Resume_devices; @@ -159,8 +159,8 @@ int hibernation_snapshot(int platform_mode) Resume_devices: platform_finish(platform_mode); device_resume(); + Resume_console: resume_console(); - Finish: return error; } @@ -191,8 +191,8 @@ int hibernation_restore(int platform_mode) enable_nonboot_cpus(); } platform_restore_cleanup(platform_mode); - Finish: device_resume(); + Finish: resume_console(); pm_restore_console(); return error; -- cgit v1.2.3-70-g09d2 From b1457bcc3a00a0446c7f6e2f22fd24b6d8d0a309 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:31 -0700 Subject: Hibernation: prepare to enter the low power state During hibernation we call hibernation_ops->prepare() before creating the image, but then, before saving it, we cancel the power transition by calling hibernation_ops->finish(). Thus prior to calling hibernation_ops->enter() we should let the platform firmware know that we're going to enter the low power state after all. Signed-off-by: Rafael J. Wysocki Cc: Gautham R Shenoy Cc: Pavel Machek Cc: Nigel Cunningham Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 77ac605bf20..885c653509c 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -205,12 +205,23 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { + int error; + if (hibernation_ops) { kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - return hibernation_ops->enter(); + /* + * We have cancelled the power transition by running + * hibernation_ops->finish() before saving the image, so we + * should let the firmware know that we're going to enter the + * sleep state after all + */ + error = hibernation_ops->prepare(); + if (!error) + error = hibernation_ops->enter(); } else { - return -ENOSYS; + error = -ENOSYS; } + return error; } /** -- cgit v1.2.3-70-g09d2 From 0c1eecfb345401629aa57c9d3b077273e56c45a7 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 19 Jul 2007 01:47:33 -0700 Subject: Freezer: avoid freezing kernel threads prematurely Kernel threads should not have TIF_FREEZE set when user space processes are being frozen, since otherwise some of them might be frozen prematurely. To prevent this from happening we can (1) make exit_mm() unset TIF_FREEZE unconditionally just after clearing tsk->mm and (2) make try_to_freeze_tasks() check if p->mm is different from zero and PF_BORROWED_MM is unset in p->flags when user space processes are to be frozen. Namely, when user space processes are being frozen, we only should set TIF_FREEZE for tasks that have p->mm different from NULL and don't have PF_BORROWED_MM set in p->flags. For this reason task_lock() must be used to prevent try_to_freeze_tasks() from racing with use_mm()/unuse_mm(), in which p->mm and p->flags.PF_BORROWED_MM are changed under task_lock(p). Also, we need to prevent the following scenario from happening: * daemonize() is called by a task spawned from a user space code path * freezer checks if the task has p->mm set and the result is positive * task enters exit_mm() and clears its TIF_FREEZE * freezer sets TIF_FREEZE for the task * task calls try_to_freeze() and goes to the refrigerator, which is wrong at that point This requires us to acquire task_lock(p) before p->flags.PF_BORROWED_MM and p->mm are examined and release it after TIF_FREEZE is set for p (or it turns out that TIF_FREEZE should not be set). Signed-off-by: Rafael J. Wysocki Cc: Gautham R Shenoy Cc: Pavel Machek Cc: Nigel Cunningham Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/freezer.h | 9 +++---- kernel/exit.c | 3 +++ kernel/power/process.c | 64 +++++++++++++++++++++++++------------------------ 3 files changed, 41 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 2d38b1a7466..c8e02de737f 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -25,7 +25,7 @@ static inline int freezing(struct task_struct *p) /* * Request that a process be frozen */ -static inline void freeze(struct task_struct *p) +static inline void set_freeze_flag(struct task_struct *p) { set_tsk_thread_flag(p, TIF_FREEZE); } @@ -33,7 +33,7 @@ static inline void freeze(struct task_struct *p) /* * Sometimes we may need to cancel the previous 'freeze' request */ -static inline void do_not_freeze(struct task_struct *p) +static inline void clear_freeze_flag(struct task_struct *p) { clear_tsk_thread_flag(p, TIF_FREEZE); } @@ -56,7 +56,7 @@ static inline int thaw_process(struct task_struct *p) wake_up_process(p); return 1; } - clear_tsk_thread_flag(p, TIF_FREEZE); + clear_freeze_flag(p); task_unlock(p); return 0; } @@ -129,7 +129,8 @@ static inline void set_freezable(void) #else static inline int frozen(struct task_struct *p) { return 0; } static inline int freezing(struct task_struct *p) { return 0; } -static inline void freeze(struct task_struct *p) { BUG(); } +static inline void set_freeze_flag(struct task_struct *p) {} +static inline void clear_freeze_flag(struct task_struct *p) {} static inline int thaw_process(struct task_struct *p) { return 1; } static inline void refrigerator(void) {} diff --git a/kernel/exit.c b/kernel/exit.c index e8af8d0c248..464c2b172f0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -594,6 +595,8 @@ static void exit_mm(struct task_struct * tsk) tsk->mm = NULL; up_read(&mm->mmap_sem); 
enter_lazy_tlb(mm, current); + /* We don't want this task to be frozen prematurely */ + clear_freeze_flag(tsk); task_unlock(tsk); mmput(mm); } diff --git a/kernel/power/process.c b/kernel/power/process.c index b850173e756..e1bcdedd146 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -40,7 +40,7 @@ static inline void frozen_process(void) current->flags |= PF_FROZEN; wmb(); } - clear_tsk_thread_flag(current, TIF_FREEZE); + clear_freeze_flag(current); } /* Refrigerator is place where frozen processes are stored :-). */ @@ -75,17 +75,16 @@ void refrigerator(void) current->state = save; } -static inline void freeze_process(struct task_struct *p) +static void freeze_task(struct task_struct *p) { unsigned long flags; if (!freezing(p)) { rmb(); if (!frozen(p)) { + set_freeze_flag(p); if (p->state == TASK_STOPPED) force_sig_specific(SIGSTOP, p); - - freeze(p); spin_lock_irqsave(&p->sighand->siglock, flags); signal_wake_up(p, p->state == TASK_STOPPED); spin_unlock_irqrestore(&p->sighand->siglock, flags); @@ -99,18 +98,13 @@ static void cancel_freezing(struct task_struct *p) if (freezing(p)) { pr_debug(" clean up: %s\n", p->comm); - do_not_freeze(p); + clear_freeze_flag(p); spin_lock_irqsave(&p->sighand->siglock, flags); recalc_sigpending_and_wake(p); spin_unlock_irqrestore(&p->sighand->siglock, flags); } } -static inline int is_user_space(struct task_struct *p) -{ - return p->mm && !(p->flags & PF_BORROWED_MM); -} - static unsigned int try_to_freeze_tasks(int freeze_user_space) { struct task_struct *g, *p; @@ -122,20 +116,34 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (!freezeable(p)) - continue; - - if (frozen(p)) + if (frozen(p) || !freezeable(p)) continue; - if (p->state == TASK_TRACED && frozen(p->parent)) { - cancel_freezing(p); - continue; + if (freeze_user_space) { + if (p->state == TASK_TRACED && + frozen(p->parent)) { + cancel_freezing(p); + continue; + } + /* + * Kernel threads should not have TIF_FREEZE set + * at this point, so we must ensure that either + * p->mm is not NULL *and* PF_BORROWED_MM is + * unset, or TIF_FRREZE is left unset. + * The task_lock() is necessary to prevent races + * with exit_mm() or use_mm()/unuse_mm() from + * occuring. + */ + task_lock(p); + if (!p->mm || (p->flags & PF_BORROWED_MM)) { + task_unlock(p); + continue; + } + freeze_task(p); + task_unlock(p); + } else { + freeze_task(p); } - if (freeze_user_space && !is_user_space(p)) - continue; - - freeze_process(p); if (!freezer_should_skip(p)) todo++; } while_each_thread(g, p); @@ -152,22 +160,16 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) * but it cleans up leftover PF_FREEZE requests. */ printk("\n"); - printk(KERN_ERR "Stopping %s timed out after %d seconds " + printk(KERN_ERR "Freezing of %s timed out after %d seconds " "(%d tasks refusing to freeze):\n", - freeze_user_space ? "user space processes" : - "kernel threads", + freeze_user_space ? 
"user space " : "tasks ", TIMEOUT / HZ, todo); show_state(); read_lock(&tasklist_lock); do_each_thread(g, p) { - if (freeze_user_space && !is_user_space(p)) - continue; - task_lock(p); - if (freezeable(p) && !frozen(p) && - !freezer_should_skip(p)) + if (freezing(p) && !freezer_should_skip(p)) printk(KERN_ERR " %s\n", p->comm); - cancel_freezing(p); task_unlock(p); } while_each_thread(g, p); @@ -211,7 +213,7 @@ static void thaw_tasks(int thaw_user_space) if (!freezeable(p)) continue; - if (is_user_space(p) == !thaw_user_space) + if (!p->mm == thaw_user_space) continue; thaw_process(p); -- cgit v1.2.3-70-g09d2 From f4a3a7d60c9c9a961e4c970f6eb41dd1c9d3ec21 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:33 -0700 Subject: Freezer: use __set_current_state in refrigerator Use __set_current_state() as appropriate in refrigerator() instead of accessing current->state directly. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Gautham R Shenoy Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index e1bcdedd146..9b5301c73b8 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -72,7 +72,7 @@ void refrigerator(void) schedule(); } pr_debug("%s left refrigerator\n", current->comm); - current->state = save; + __set_current_state(save); } static void freeze_task(struct task_struct *p) -- cgit v1.2.3-70-g09d2 From e7cd8a722745a01bcfac4d4a52d53391d177da20 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:34 -0700 Subject: Freezer: return int from freeze_processes Make try_to_freeze_tasks() and freeze_processes() return -EBUSY on failure instead of the number of unfrozen tasks (none of the callers actually uses this number). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Gautham R Shenoy Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 9b5301c73b8..00cdbe5f518 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -105,7 +105,7 @@ static void cancel_freezing(struct task_struct *p) } } -static unsigned int try_to_freeze_tasks(int freeze_user_space) +static int try_to_freeze_tasks(int freeze_user_space) { struct task_struct *g, *p; unsigned long end_time; @@ -176,28 +176,25 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) read_unlock(&tasklist_lock); } - return todo; + return todo ? -EBUSY : 0; } /** * freeze_processes - tell processes to enter the refrigerator - * - * Returns 0 on success, or the number of processes that didn't freeze, - * although they were told to. */ int freeze_processes(void) { - unsigned int nr_unfrozen; + int error; printk("Stopping tasks ... 
"); - nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE); - if (nr_unfrozen) - return nr_unfrozen; + error = try_to_freeze_tasks(FREEZER_USER_SPACE); + if (error) + return error; sys_sync(); - nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); - if (nr_unfrozen) - return nr_unfrozen; + error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); + if (error) + return error; printk("done.\n"); BUG_ON(in_atomic()); -- cgit v1.2.3-70-g09d2 From c2cf7d87d804c66e063829d5ca739053e901dc15 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:35 -0700 Subject: Freezer: remove redundant check in try_to_freeze_tasks We don't need to check if todo is positive before calling time_after() in try_to_freeze_tasks(), because if todo is zero at this point, the loop will be broken anyway due to the while () condition being false. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Gautham R Shenoy Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 00cdbe5f518..3434940a3df 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -149,7 +149,7 @@ static int try_to_freeze_tasks(int freeze_user_space) } while_each_thread(g, p); read_unlock(&tasklist_lock); yield(); /* Yield is okay here */ - if (todo && time_after(jiffies, end_time)) + if (time_after(jiffies, end_time)) break; } while (todo); -- cgit v1.2.3-70-g09d2 From b10d911749d37dccfa5873d2088aea3f074b9e45 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:36 -0700 Subject: PM: introduce hibernation and suspend notifiers Make it possible to register hibernation and suspend notifiers, so that subsystems can perform hibernation-related or suspend-related operations that should not be carried out by device drivers' .suspend() and .resume() routines. [akpm@linux-foundation.org: build fixes] [akpm@linux-foundation.org: cleanups] Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/power/notifiers.txt | 50 +++++++++++++++++++++++++++++++++++++++ include/linux/notifier.h | 6 +++++ include/linux/suspend.h | 48 +++++++++++++++++++++++++++++++++---- kernel/power/disk.c | 16 +++++++++---- kernel/power/main.c | 9 +++++++ kernel/power/power.h | 10 ++++++++ kernel/power/user.c | 11 ++++++--- 7 files changed, 138 insertions(+), 12 deletions(-) create mode 100644 Documentation/power/notifiers.txt (limited to 'kernel') diff --git a/Documentation/power/notifiers.txt b/Documentation/power/notifiers.txt new file mode 100644 index 00000000000..9293e4bc857 --- /dev/null +++ b/Documentation/power/notifiers.txt @@ -0,0 +1,50 @@ +Suspend notifiers + (C) 2007 Rafael J. Wysocki , GPL + +There are some operations that device drivers may want to carry out in their +.suspend() routines, but shouldn't, because they can cause the hibernation or +suspend to fail. For example, a driver may want to allocate a substantial amount +of memory (like 50 MB) in .suspend(), but that shouldn't be done after the +swsusp's memory shrinker has run. + +Also, there may be some operations, that subsystems want to carry out before a +hibernation/suspend or after a restore/resume, requiring the system to be fully +functional, so the drivers' .suspend() and .resume() routines are not suitable +for this purpose. 
For example, device drivers may want to upload firmware to +their devices after a restore from a hibernation image, but they cannot do it by +calling request_firmware() from their .resume() routines (user land processes +are frozen at this point). The solution may be to load the firmware into +memory before processes are frozen and upload it from there in the .resume() +routine. Of course, a hibernation notifier may be used for this purpose. + +The subsystems that have such needs can register suspend notifiers that will be +called upon the following events by the suspend core: + +PM_HIBERNATION_PREPARE The system is going to hibernate or suspend, tasks will + be frozen immediately. + +PM_POST_HIBERNATION The system memory state has been restored from a + hibernation image or an error occurred during the + hibernation. Device drivers' .resume() callbacks have + been executed and tasks have been thawed. + +PM_SUSPEND_PREPARE The system is preparing for a suspend. + +PM_POST_SUSPEND The system has just resumed or an error occurred during + the suspend. Device drivers' .resume() callbacks have + been executed and tasks have been thawed. + +It is generally assumed that whatever the notifiers do for +PM_HIBERNATION_PREPARE should be undone for PM_POST_HIBERNATION. Analogously, +operations performed for PM_SUSPEND_PREPARE should be reversed for +PM_POST_SUSPEND. Additionally, all of the notifiers are called for +PM_POST_HIBERNATION if one of them fails for PM_HIBERNATION_PREPARE, and +all of the notifiers are called for PM_POST_SUSPEND if one of them fails for +PM_SUSPEND_PREPARE. + +The hibernation and suspend notifiers are called with pm_mutex held. They are +defined in the usual way, but their last argument is meaningless (it is always +NULL). To register and/or unregister a suspend notifier, use the functions +register_pm_notifier() and unregister_pm_notifier(), respectively, defined in +include/linux/suspend.h. If you don't need to unregister the notifier, you can +also use the pm_notifier() macro defined in include/linux/suspend.h.
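As a rough illustration (this sketch is not part of the patch, and the my_* names are invented for the example), a subsystem could register such a notifier like this, using only the register_pm_notifier() helper and the PM_* events introduced above:

	#include <linux/init.h>
	#include <linux/kernel.h>
	#include <linux/notifier.h>
	#include <linux/suspend.h>

	static int my_pm_callback(struct notifier_block *nb, unsigned long action,
				  void *unused)
	{
		switch (action) {
		case PM_HIBERNATION_PREPARE:
		case PM_SUSPEND_PREPARE:
			/* e.g. preload firmware while user space is still running */
			pr_debug("my_subsys: preparing for sleep\n");
			return NOTIFY_OK;	/* NOTIFY_BAD would abort the transition */
		case PM_POST_HIBERNATION:
		case PM_POST_SUSPEND:
			/* undo whatever was done for the corresponding PREPARE event */
			pr_debug("my_subsys: back from sleep\n");
			return NOTIFY_OK;
		}
		return NOTIFY_DONE;
	}

	static struct notifier_block my_pm_nb = {
		.notifier_call = my_pm_callback,
	};

	static int __init my_subsys_init(void)
	{
		/* hook into the suspend/hibernation notifier chain */
		return register_pm_notifier(&my_pm_nb);
	}

If the notifier never needs to be unregistered, the pm_notifier() macro can be used instead; the kmod patch later in this series registers its callback that way.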
diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 576f2bb34cc..be3f2bb6fcf 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -212,5 +212,11 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, #define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) #define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN) +/* Hibernation and suspend events */ +#define PM_HIBERNATION_PREPARE 0x0001 /* Going to hibernate */ +#define PM_POST_HIBERNATION 0x0002 /* Hibernation finished */ +#define PM_SUSPEND_PREPARE 0x0003 /* Going to suspend the system */ +#define PM_POST_SUSPEND 0x0004 /* Suspend finished */ + #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d235c146da2..e8e6da394c9 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -54,7 +54,8 @@ struct hibernation_ops { void (*restore_cleanup)(void); }; -#if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) +#ifdef CONFIG_PM +#ifdef CONFIG_SOFTWARE_SUSPEND /* kernel/power/snapshot.c */ extern void __register_nosave_region(unsigned long b, unsigned long e, int km); static inline void register_nosave_region(unsigned long b, unsigned long e) @@ -72,16 +73,14 @@ extern unsigned long get_safe_page(gfp_t gfp_mask); extern void hibernation_set_ops(struct hibernation_ops *ops); extern int hibernate(void); -#else -static inline void register_nosave_region(unsigned long b, unsigned long e) {} -static inline void register_nosave_region_late(unsigned long b, unsigned long e) {} +#else /* CONFIG_SOFTWARE_SUSPEND */ static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} static inline void swsusp_unset_page_free(struct page *p) {} static inline void hibernation_set_ops(struct hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } -#endif /* defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) */ +#endif /* CONFIG_SOFTWARE_SUSPEND */ void save_processor_state(void); void restore_processor_state(void); @@ -89,4 +88,43 @@ struct saved_context; void __save_processor_state(struct saved_context *ctxt); void __restore_processor_state(struct saved_context *ctxt); +/* kernel/power/main.c */ +extern struct blocking_notifier_head pm_chain_head; + +static inline int register_pm_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&pm_chain_head, nb); +} + +static inline int unregister_pm_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&pm_chain_head, nb); +} + +#define pm_notifier(fn, pri) { \ + static struct notifier_block fn##_nb = \ + { .notifier_call = fn, .priority = pri }; \ + register_pm_notifier(&fn##_nb); \ +} +#else /* CONFIG_PM */ + +static inline int register_pm_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline int unregister_pm_notifier(struct notifier_block *nb) +{ + return 0; +} + +#define pm_notifier(fn, pri) do { (void)(fn); } while (0) +#endif /* CONFIG_PM */ + +#if !defined CONFIG_SOFTWARE_SUSPEND || !defined(CONFIG_PM) +static inline void register_nosave_region(unsigned long b, unsigned long e) +{ +} +#endif + #endif /* _LINUX_SWSUSP_H */ diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 885c653509c..324ac0188ce 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -281,9 +281,16 @@ int hibernate(void) { int error; + mutex_lock(&pm_mutex); /* The snapshot device should not be opened while we're running */ - 
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) - return -EBUSY; + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } + + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + if (error) + goto Exit; /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); @@ -294,7 +301,6 @@ int hibernate(void) if (error) goto Finish; - mutex_lock(&pm_mutex); if (hibernation_mode == HIBERNATION_TESTPROC) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); @@ -316,12 +322,14 @@ int hibernate(void) swsusp_free(); } Thaw: - mutex_unlock(&pm_mutex); unprepare_processes(); Finish: free_basic_memory_bitmaps(); Exit: + pm_notifier_call_chain(PM_POST_HIBERNATION); atomic_inc(&snapshot_device_available); + Unlock: + mutex_unlock(&pm_mutex); return error; } diff --git a/kernel/power/main.c b/kernel/power/main.c index fc45ed22620..4d26ad394fb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -23,6 +23,8 @@ #include "power.h" +BLOCKING_NOTIFIER_HEAD(pm_chain_head); + /*This is just an arbitrary number */ #define FREE_PAGE_NUMBER (100) @@ -78,6 +80,10 @@ static int suspend_prepare(suspend_state_t state) if (!pm_ops || !pm_ops->enter) return -EPERM; + error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); + if (error) + goto Finish; + pm_prepare_console(); if (freeze_processes()) { @@ -125,6 +131,8 @@ static int suspend_prepare(suspend_state_t state) Thaw: thaw_processes(); pm_restore_console(); + Finish: + pm_notifier_call_chain(PM_POST_SUSPEND); return error; } @@ -176,6 +184,7 @@ static void suspend_finish(suspend_state_t state) resume_console(); thaw_processes(); pm_restore_console(); + pm_notifier_call_chain(PM_POST_SUSPEND); } diff --git a/kernel/power/power.h b/kernel/power/power.h index eab3603b7ca..01c2275b15b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -173,5 +173,15 @@ extern void swsusp_close(void); extern int suspend_enter(suspend_state_t state); struct timeval; +/* kernel/power/swsusp.c */ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); + +/* kernel/power/main.c */ +extern struct blocking_notifier_head pm_chain_head; + +static inline int pm_notifier_call_chain(unsigned long val) +{ + return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) + == NOTIFY_BAD) ? -EINVAL : 0; +} diff --git a/kernel/power/user.c b/kernel/power/user.c index 1f24f30b951..7f19afe01b4 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -151,10 +151,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, if (data->frozen) break; mutex_lock(&pm_mutex); - if (freeze_processes()) { - thaw_processes(); - error = -EBUSY; + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + if (!error) { + error = freeze_processes(); + if (error) + thaw_processes(); } + if (error) + pm_notifier_call_chain(PM_POST_HIBERNATION); mutex_unlock(&pm_mutex); if (!error) data->frozen = 1; @@ -165,6 +169,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; mutex_lock(&pm_mutex); thaw_processes(); + pm_notifier_call_chain(PM_POST_HIBERNATION); mutex_unlock(&pm_mutex); data->frozen = 0; break; -- cgit v1.2.3-70-g09d2 From 8cdd4936c17bd8085cb0dfacc4a37ccf8d0ada7b Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 19 Jul 2007 01:47:36 -0700 Subject: PM: disable usermode helper before hibernation and suspend Use a hibernation and suspend notifier to disable the user mode helper before a hibernation/suspend and enable it after the operation. [akpm@linux-foundation.org: build fix] Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Acked-by: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 78d365c524e..928f3678142 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -33,12 +33,22 @@ #include #include #include +#include +#include #include extern int max_threads; static struct workqueue_struct *khelper_wq; +/* + * If set, both call_usermodehelper_keys() and call_usermodehelper_pipe() exit + * immediately returning -EBUSY. Used for preventing user land processes from + * being created after the user land has been frozen during a system-wide + * hibernation or suspend operation. + */ +static int usermodehelper_disabled; + #ifdef CONFIG_KMOD /* @@ -265,6 +275,24 @@ static void __call_usermodehelper(struct work_struct *work) } } +static int usermodehelper_pm_callback(struct notifier_block *nfb, + unsigned long action, + void *ignored) +{ + switch (action) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + usermodehelper_disabled = 1; + return NOTIFY_OK; + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + usermodehelper_disabled = 0; + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + /** * call_usermodehelper_setup - prepare to call a usermode helper * @path - path to usermode executable @@ -374,7 +402,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, goto out; } - if (!khelper_wq) { + if (!khelper_wq || usermodehelper_disabled) { retval = -EBUSY; goto out; } @@ -431,4 +459,5 @@ void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); BUG_ON(!khelper_wq); + pm_notifier(usermodehelper_pm_callback, 0); } -- cgit v1.2.3-70-g09d2 From ccd4b65aef4be2278543fde5b999e55a4d694fd8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:37 -0700 Subject: PM: prevent frozen user mode helpers from failing the freezing of tasks At present, if a user mode helper is running while usermodehelper_pm_callback() is executed, the helper may be frozen and the completion in call_usermodehelper_exec() won't be completed until user space processes are thawed. As a result, the freezing of kernel threads may fail, which is not desirable. Prevent this from happening by introducing a counter of running user mode helpers and allowing usermodehelper_pm_callback() to succeed for action = PM_HIBERNATION_PREPARE or action = PM_SUSPEND_PREPARE only if there are no helpers running. [Namely, usermodehelper_pm_callback() waits for at most RUNNING_HELPERS_TIMEOUT for the number of running helpers to become zero and fails if that doesn't happen.] Special thanks to Uli Luckas , Pavel Machek and Oleg Nesterov for reviewing the previous versions of this patch and for very useful comments. Signed-off-by: Rafael J. 
Wysocki Acked-by: Uli Luckas Acked-by: Nigel Cunningham Acked-by: Pavel Machek Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 928f3678142..beedbdc6460 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -41,14 +41,6 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; -/* - * If set, both call_usermodehelper_keys() and call_usermodehelper_pipe() exit - * immediately returning -EBUSY. Used for preventing user land processes from - * being created after the user land has been frozen during a system-wide - * hibernation or suspend operation. - */ -static int usermodehelper_disabled; - #ifdef CONFIG_KMOD /* @@ -275,15 +267,55 @@ static void __call_usermodehelper(struct work_struct *work) } } +#ifdef CONFIG_PM +/* + * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY + * (used for preventing user land processes from being created after the user + * land has been frozen during a system-wide hibernation or suspend operation). + */ +static int usermodehelper_disabled; + +/* Number of helpers running */ +static atomic_t running_helpers = ATOMIC_INIT(0); + +/* + * Wait queue head used by usermodehelper_pm_callback() to wait for all running + * helpers to finish. + */ +static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); + +/* + * Time to wait for running_helpers to become zero before the setting of + * usermodehelper_disabled in usermodehelper_pm_callback() fails + */ +#define RUNNING_HELPERS_TIMEOUT (5 * HZ) + static int usermodehelper_pm_callback(struct notifier_block *nfb, unsigned long action, void *ignored) { + long retval; + switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: usermodehelper_disabled = 1; - return NOTIFY_OK; + smp_mb(); + /* + * From now on call_usermodehelper_exec() won't start any new + * helpers, so it is sufficient if running_helpers turns out to + * be zero at one point (it may be increased later, but that + * doesn't matter). 
+ */ + retval = wait_event_timeout(running_helpers_waitq, + atomic_read(&running_helpers) == 0, + RUNNING_HELPERS_TIMEOUT); + if (retval) { + return NOTIFY_OK; + } else { + usermodehelper_disabled = 0; + return NOTIFY_BAD; + } case PM_POST_HIBERNATION: case PM_POST_SUSPEND: usermodehelper_disabled = 0; @@ -293,6 +325,30 @@ static int usermodehelper_pm_callback(struct notifier_block *nfb, return NOTIFY_DONE; } +static void helper_lock(void) +{ + atomic_inc(&running_helpers); + smp_mb__after_atomic_inc(); +} + +static void helper_unlock(void) +{ + if (atomic_dec_and_test(&running_helpers)) + wake_up(&running_helpers_waitq); +} + +static void register_pm_notifier_callback(void) +{ + pm_notifier(usermodehelper_pm_callback, 0); +} +#else /* CONFIG_PM */ +#define usermodehelper_disabled 0 + +static inline void helper_lock(void) {} +static inline void helper_unlock(void) {} +static inline void register_pm_notifier_callback(void) {} +#endif /* CONFIG_PM */ + /** * call_usermodehelper_setup - prepare to call a usermode helper * @path - path to usermode executable @@ -397,6 +453,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, DECLARE_COMPLETION_ONSTACK(done); int retval; + helper_lock(); if (sub_info->path[0] == '\0') { retval = 0; goto out; @@ -418,6 +475,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, out: call_usermodehelper_freeinfo(sub_info); + helper_unlock(); return retval; } EXPORT_SYMBOL(call_usermodehelper_exec); @@ -459,5 +517,5 @@ void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); BUG_ON(!khelper_wq); - pm_notifier(usermodehelper_pm_callback, 0); + register_pm_notifier_callback(); } -- cgit v1.2.3-70-g09d2 From 6c961dfb7c903cfd1cd71b506863894038fd704f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:38 -0700 Subject: PM: Reduce code duplication between main.c and user.c The SNAPSHOT_S2RAM ioctl code is outdated and it should not duplicate the suspend code in kernel/power/main.c. Fix that. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 99 +++++++++++++++++++++++++++++----------------------- kernel/power/power.h | 3 +- kernel/power/user.c | 38 +++----------------- 3 files changed, 62 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 4d26ad394fb..32147b57c3b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -65,14 +65,11 @@ static inline void pm_finish(suspend_state_t state) /** * suspend_prepare - Do prep work before entering low-power state. - * @state: State we're entering. * - * This is common code that is called for each state that we're - * entering. Allocate a console, stop all processes, then make sure - * the platform can enter the requested state. + * This is common code that is called for each state that we're entering. + * Run suspend notifiers, allocate a console and stop all processes. 
*/ - -static int suspend_prepare(suspend_state_t state) +static int suspend_prepare(void) { int error; unsigned int free_pages; @@ -91,43 +88,18 @@ static int suspend_prepare(suspend_state_t state) goto Thaw; } - if ((free_pages = global_page_state(NR_FREE_PAGES)) - < FREE_PAGE_NUMBER) { + free_pages = global_page_state(NR_FREE_PAGES); + if (free_pages < FREE_PAGE_NUMBER) { pr_debug("PM: free some memory\n"); shrink_all_memory(FREE_PAGE_NUMBER - free_pages); if (nr_free_pages() < FREE_PAGE_NUMBER) { error = -ENOMEM; printk(KERN_ERR "PM: No enough memory\n"); - goto Thaw; } } - - if (pm_ops->set_target) { - error = pm_ops->set_target(state); - if (error) - goto Thaw; - } - suspend_console(); - error = device_suspend(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "Some devices failed to suspend\n"); - goto Resume_console; - } - if (pm_ops->prepare) { - if ((error = pm_ops->prepare(state))) - goto Resume_devices; - } - - error = disable_nonboot_cpus(); if (!error) return 0; - enable_nonboot_cpus(); - pm_finish(state); - Resume_devices: - device_resume(); - Resume_console: - resume_console(); Thaw: thaw_processes(); pm_restore_console(); @@ -148,6 +120,12 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) local_irq_enable(); } +/** + * suspend_enter - enter the desired system sleep state. + * @state: state to enter + * + * This function should be called after devices have been suspended. + */ int suspend_enter(suspend_state_t state) { int error = 0; @@ -167,21 +145,55 @@ int suspend_enter(suspend_state_t state) return error; } +/** + * suspend_devices_and_enter - suspend devices and enter the desired system sleep + * state. + * @state: state to enter + */ +int suspend_devices_and_enter(suspend_state_t state) +{ + int error; + + if (!pm_ops) + return -ENOSYS; + + if (pm_ops->set_target) { + error = pm_ops->set_target(state); + if (error) + return error; + } + suspend_console(); + error = device_suspend(PMSG_SUSPEND); + if (error) { + printk(KERN_ERR "Some devices failed to suspend\n"); + goto Resume_console; + } + if (pm_ops->prepare) { + error = pm_ops->prepare(state); + if (error) + goto Resume_devices; + } + error = disable_nonboot_cpus(); + if (!error) + suspend_enter(state); + + enable_nonboot_cpus(); + pm_finish(state); + Resume_devices: + device_resume(); + Resume_console: + resume_console(); + return error; +} /** * suspend_finish - Do final work before exiting suspend sequence. - * @state: State we're coming out of. * * Call platform code to clean up, restart processes, and free the * console that we've allocated. This is not called for suspend-to-disk. */ - -static void suspend_finish(suspend_state_t state) +static void suspend_finish(void) { - enable_nonboot_cpus(); - pm_finish(state); - device_resume(); - resume_console(); thaw_processes(); pm_restore_console(); pm_notifier_call_chain(PM_POST_SUSPEND); @@ -216,7 +228,6 @@ static inline int valid_state(suspend_state_t state) * Then, do the setup for suspend, enter the state, and cleaup (after * we've woken up). 
*/ - static int enter_state(suspend_state_t state) { int error; @@ -227,14 +238,14 @@ static int enter_state(suspend_state_t state) return -EBUSY; pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); - if ((error = suspend_prepare(state))) + if ((error = suspend_prepare())) goto Unlock; pr_debug("PM: Entering %s sleep\n", pm_states[state]); - error = suspend_enter(state); + error = suspend_devices_and_enter(state); pr_debug("PM: Finishing wakeup.\n"); - suspend_finish(state); + suspend_finish(); Unlock: mutex_unlock(&pm_mutex); return error; diff --git a/kernel/power/power.h b/kernel/power/power.h index 01c2275b15b..5f24c786f8e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -170,7 +170,6 @@ extern int swsusp_resume(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); extern void swsusp_close(void); -extern int suspend_enter(suspend_state_t state); struct timeval; /* kernel/power/swsusp.c */ @@ -178,6 +177,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); /* kernel/power/main.c */ +extern int suspend_enter(suspend_state_t state); +extern int suspend_devices_and_enter(suspend_state_t state); extern struct blocking_notifier_head pm_chain_head; static inline int pm_notifier_call_chain(unsigned long val) diff --git a/kernel/power/user.c b/kernel/power/user.c index 7f19afe01b4..bd0723a7df3 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -255,47 +255,19 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; case SNAPSHOT_S2RAM: - if (!pm_ops) { - error = -ENOSYS; - break; - } - if (!data->frozen) { error = -EPERM; break; } - if (!mutex_trylock(&pm_mutex)) { error = -EBUSY; break; } - - if (pm_ops->prepare) { - error = pm_ops->prepare(PM_SUSPEND_MEM); - if (error) - goto OutS3; - } - - /* Put devices to sleep */ - suspend_console(); - error = device_suspend(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "Failed to suspend some devices.\n"); - } else { - error = disable_nonboot_cpus(); - if (!error) { - /* Enter S3, system is already frozen */ - suspend_enter(PM_SUSPEND_MEM); - enable_nonboot_cpus(); - } - /* Wake up devices */ - device_resume(); - } - resume_console(); - if (pm_ops->finish) - pm_ops->finish(PM_SUSPEND_MEM); - - OutS3: + /* + * Tasks are frozen and the notifiers have been called with + * PM_HIBERNATION_PREPARE + */ + error = suspend_devices_and_enter(PM_SUSPEND_MEM); mutex_unlock(&pm_mutex); break; -- cgit v1.2.3-70-g09d2 From bd804eba1c8597cbb7cd5a5f9fe886aae16a079a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:40 -0700 Subject: PM: Introduce pm_power_off_prepare Introduce the pm_power_off_prepare() callback that can be registered by the interested platforms in analogy with pm_idle() and pm_power_off(), used for preparing the system to power off (needed by ACPI). This allows us to drop acpi_sysclass and device_acpi that are only defined in order to register the ACPI power off preparation callback, which is needed by pm_power_off() registered in a much different way. Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/acpi/sleep/poweroff.c | 38 +++++++++----------------------------- include/linux/pm.h | 1 + kernel/sys.c | 9 +++++++++ 3 files changed, 19 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/sleep/poweroff.c b/drivers/acpi/sleep/poweroff.c index 240dde9cf13..39e40d56b03 100644 --- a/drivers/acpi/sleep/poweroff.c +++ b/drivers/acpi/sleep/poweroff.c @@ -39,7 +39,13 @@ int acpi_sleep_prepare(u32 acpi_state) #ifdef CONFIG_PM -void acpi_power_off(void) +static void acpi_power_off_prepare(void) +{ + /* Prepare to power off the system */ + acpi_sleep_prepare(ACPI_STATE_S5); +} + +static void acpi_power_off(void) { /* acpi_sleep_prepare(ACPI_STATE_S5) should have already been called */ printk("%s called\n", __FUNCTION__); @@ -48,27 +54,6 @@ void acpi_power_off(void) acpi_enter_sleep_state(ACPI_STATE_S5); } -static int acpi_shutdown(struct sys_device *x) -{ - switch (system_state) { - case SYSTEM_POWER_OFF: - /* Prepare to power off the system */ - return acpi_sleep_prepare(ACPI_STATE_S5); - default: - return 0; - } -} - -static struct sysdev_class acpi_sysclass = { - set_kset_name("acpi"), - .shutdown = acpi_shutdown -}; - -static struct sys_device device_acpi = { - .id = 0, - .cls = &acpi_sysclass, -}; - static int acpi_poweroff_init(void) { if (!acpi_disabled) { @@ -78,13 +63,8 @@ static int acpi_poweroff_init(void) status = acpi_get_sleep_type_data(ACPI_STATE_S5, &type_a, &type_b); if (ACPI_SUCCESS(status)) { - int error; - error = sysdev_class_register(&acpi_sysclass); - if (!error) - error = sysdev_register(&device_acpi); - if (!error) - pm_power_off = acpi_power_off; - return error; + pm_power_off_prepare = acpi_power_off_prepare; + pm_power_off = acpi_power_off; } } return 0; diff --git a/include/linux/pm.h b/include/linux/pm.h index 2735b7cadd2..ad3cc2eb0d3 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -101,6 +101,7 @@ struct pm_dev */ extern void (*pm_idle)(void); extern void (*pm_power_off)(void); +extern void (*pm_power_off_prepare)(void); typedef int __bitwise suspend_state_t; diff --git a/kernel/sys.c b/kernel/sys.c index 18987c7f6ad..d40e40a9446 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -99,6 +99,13 @@ int C_A_D = 1; struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); +/* + * If set, this is used for preparing the system to power off. + */ + +void (*pm_power_off_prepare)(void); +EXPORT_SYMBOL(pm_power_off_prepare); + /* * Notifier list for kernel code which wants to be called * at shutdown. This is used to stop any idling DMA operations @@ -867,6 +874,8 @@ EXPORT_SYMBOL_GPL(kernel_halt); void kernel_power_off(void) { kernel_shutdown_prepare(SYSTEM_POWER_OFF); + if (pm_power_off_prepare) + pm_power_off_prepare(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); } -- cgit v1.2.3-70-g09d2 From 5a60d6235c8352ade8f2699e72fcdfe853730456 Mon Sep 17 00:00:00 2001 From: Nigel Cunningham Date: Thu, 19 Jul 2007 01:47:41 -0700 Subject: PM: Optional beeping during resume from suspend to RAM Add a feature allowing the user to make the system beep during a resume from suspend to RAM, on x86_64 and i386. This is useful for the users with broken resume from RAM, so that they can verify if the control reaches the kernel after a wake-up event. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/acpi/wakeup.S | 27 +++++++++++++++++++++++++++ arch/x86_64/kernel/acpi/wakeup.S | 25 +++++++++++++++++++++++++ include/linux/acpi.h | 1 + kernel/power/main.c | 23 +++++++++++++++++++++++ 4 files changed, 76 insertions(+) (limited to 'kernel') diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S index a2295a34b2c..b4e2ec3c392 100644 --- a/arch/i386/kernel/acpi/wakeup.S +++ b/arch/i386/kernel/acpi/wakeup.S @@ -13,6 +13,21 @@ # cs = 0x1234, eip = 0x05 # +#define BEEP \ + inb $97, %al; \ + outb %al, $0x80; \ + movb $3, %al; \ + outb %al, $97; \ + outb %al, $0x80; \ + movb $-74, %al; \ + outb %al, $67; \ + outb %al, $0x80; \ + movb $-119, %al; \ + outb %al, $66; \ + outb %al, $0x80; \ + movb $15, %al; \ + outb %al, $66; + ALIGN .align 4096 ENTRY(wakeup_start) @@ -31,6 +46,11 @@ wakeup_code: movw %cs, %ax movw %ax, %ds # Make ds:0 point to wakeup_start movw %ax, %ss + + testl $1, beep_flags - wakeup_code + jz 1f + BEEP +1: mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board movw $0x0e00 + 'S', %fs:(0x12) @@ -88,6 +108,10 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic + testl $2, beep_flags - wakeup_code + jz 1f + BEEP +1: ljmpl $__KERNEL_CS,$wakeup_pmode_return real_save_gdt: .word 0 @@ -98,6 +122,7 @@ real_save_cr4: .long 0 real_magic: .long 0 video_mode: .long 0 video_flags: .long 0 +beep_flags: .long 0 real_efer_save_restore: .long 0 real_save_efer_edx: .long 0 real_save_efer_eax: .long 0 @@ -262,6 +287,8 @@ ENTRY(acpi_copy_wakeup_routine) movl %edx, video_mode - wakeup_start (%eax) movl acpi_video_flags, %edx movl %edx, video_flags - wakeup_start (%eax) + movl s2ram_beep, %edx + movl %edx, beep_flags - wakeup_start (%eax) movl $0x12345678, real_magic - wakeup_start (%eax) movl $0x12345678, saved_magic popl %ebx diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index 8550a6ffa27..ed63d184579 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -16,6 +16,21 @@ # cs = 0x1234, eip = 0x05 # +#define BEEP \ + inb $97, %al; \ + outb %al, $0x80; \ + movb $3, %al; \ + outb %al, $97; \ + outb %al, $0x80; \ + movb $-74, %al; \ + outb %al, $67; \ + outb %al, $0x80; \ + movb $-119, %al; \ + outb %al, $66; \ + outb %al, $0x80; \ + movb $15, %al; \ + outb %al, $66; + ALIGN .align 16 @@ -33,6 +48,13 @@ wakeup_code: movw %cs, %ax movw %ax, %ds # Make ds:0 point to wakeup_start movw %ax, %ss + + # Data segment must be set up before we can see whether to beep. 
+ testl $1, beep_flags - wakeup_code + jz 1f + BEEP +1: + # Private stack is needed for ASUS board mov $(wakeup_stack - wakeup_code), %sp @@ -229,6 +251,7 @@ gdt_48a: .long gdta - wakeup_code # gdt base (relocated in later) real_magic: .quad 0 +beep_flags: .quad 0 video_mode: .quad 0 video_flags: .quad 0 @@ -344,6 +367,8 @@ ENTRY(acpi_copy_wakeup_routine) pushq %rax pushq %rdx + movl s2ram_beep, %edx + movl %edx, beep_flags - wakeup_start (,%rdi) movl saved_video_mode, %edx movl %edx, video_mode - wakeup_start (,%rdi) movl acpi_video_flags, %edx diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fccd8b548d9..c0ccdd72036 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -123,6 +123,7 @@ extern int pci_mmcfg_config_num; extern int sbf_port; extern unsigned long acpi_video_flags; +extern unsigned long s2ram_beep; #else /* !CONFIG_ACPI */ diff --git a/kernel/power/main.c b/kernel/power/main.c index 32147b57c3b..c74a56436d8 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -332,6 +332,27 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n) power_attr(state); +unsigned long s2ram_beep = 0; + +static ssize_t s2ram_beep_show(struct kset *kset, char *buf) +{ + return sprintf(buf, "%d\n", s2ram_beep); +} + +static ssize_t +s2ram_beep_store(struct kset *kset, const char *buf, size_t n) +{ + int val; + + if (sscanf(buf, "%d", &val) > 0) { + s2ram_beep = val; + return n; + } + return -EINVAL; +} + +power_attr(s2ram_beep); + #ifdef CONFIG_PM_TRACE int pm_trace_enabled; @@ -357,11 +378,13 @@ power_attr(pm_trace); static struct attribute * g[] = { &state_attr.attr, &pm_trace_attr.attr, + &s2ram_beep_attr.attr, NULL, }; #else static struct attribute * g[] = { &state_attr.attr, + &s2ram_beep_attr.attr, NULL, }; #endif /* CONFIG_PM_TRACE */ -- cgit v1.2.3-70-g09d2 From 77afcf78a2ded9a91838734234949c0ead5feb12 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 19 Jul 2007 01:47:41 -0700 Subject: PM: Integrate beeping flag with existing acpi_sleep flags Move "debug during resume from s2ram" into the variable we already use for real-mode flags to simplify code. It also closes nasty trap for the user in acpi_sleep_setup; order of parameters actually mattered there, acpi_sleep=s3_bios,s3_mode doing something different from acpi_sleep=s3_mode,s3_bios. Signed-off-by: Pavel Machek Signed-off-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/acpi/sleep.c | 12 ++++++++---- arch/i386/kernel/acpi/wakeup.S | 18 ++++++++---------- arch/x86_64/kernel/acpi/sleep.c | 8 +++++--- arch/x86_64/kernel/acpi/wakeup.S | 15 ++++++--------- include/linux/acpi.h | 3 +-- kernel/power/main.c | 23 ----------------------- kernel/sysctl.c | 2 +- 7 files changed, 29 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/acpi/sleep.c b/arch/i386/kernel/acpi/sleep.c index 4ee83577bf6..c42b5ab49de 100644 --- a/arch/i386/kernel/acpi/sleep.c +++ b/arch/i386/kernel/acpi/sleep.c @@ -14,7 +14,7 @@ /* address in low memory of the wakeup routine. 
*/ unsigned long acpi_wakeup_address = 0; -unsigned long acpi_video_flags; +unsigned long acpi_realmode_flags; extern char wakeup_start, wakeup_end; extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); @@ -68,9 +68,11 @@ static int __init acpi_sleep_setup(char *str) { while ((str != NULL) && (*str != '\0')) { if (strncmp(str, "s3_bios", 7) == 0) - acpi_video_flags = 1; + acpi_realmode_flags |= 1; if (strncmp(str, "s3_mode", 7) == 0) - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); @@ -80,9 +82,11 @@ static int __init acpi_sleep_setup(char *str) __setup("acpi_sleep=", acpi_sleep_setup); +/* Ouch, we want to delete this. We already have better version in userspace, in + s2ram from suspend.sf.net project */ static __init int reset_videomode_after_s3(struct dmi_system_id *d) { - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; return 0; } diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S index b4e2ec3c392..ed0a0f2c159 100644 --- a/arch/i386/kernel/acpi/wakeup.S +++ b/arch/i386/kernel/acpi/wakeup.S @@ -47,7 +47,7 @@ wakeup_code: movw %ax, %ds # Make ds:0 point to wakeup_start movw %ax, %ss - testl $1, beep_flags - wakeup_code + testl $4, realmode_flags - wakeup_code jz 1f BEEP 1: @@ -61,7 +61,7 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic - testl $1, video_flags - wakeup_code + testl $1, realmode_flags - wakeup_code jz 1f lcall $0xc000,$3 movw %cs, %ax @@ -69,7 +69,7 @@ wakeup_code: movw %ax, %ss 1: - testl $2, video_flags - wakeup_code + testl $2, realmode_flags - wakeup_code jz 1f mov video_mode - wakeup_code, %ax call mode_set @@ -108,11 +108,11 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic - testl $2, beep_flags - wakeup_code + testl $8, realmode_flags - wakeup_code jz 1f BEEP 1: - ljmpl $__KERNEL_CS,$wakeup_pmode_return + ljmpl $__KERNEL_CS, $wakeup_pmode_return real_save_gdt: .word 0 .long 0 @@ -121,7 +121,7 @@ real_save_cr3: .long 0 real_save_cr4: .long 0 real_magic: .long 0 video_mode: .long 0 -video_flags: .long 0 +realmode_flags: .long 0 beep_flags: .long 0 real_efer_save_restore: .long 0 real_save_efer_edx: .long 0 @@ -285,10 +285,8 @@ ENTRY(acpi_copy_wakeup_routine) movl saved_videomode, %edx movl %edx, video_mode - wakeup_start (%eax) - movl acpi_video_flags, %edx - movl %edx, video_flags - wakeup_start (%eax) - movl s2ram_beep, %edx - movl %edx, beep_flags - wakeup_start (%eax) + movl acpi_realmode_flags, %edx + movl %edx, realmode_flags - wakeup_start (%eax) movl $0x12345678, real_magic - wakeup_start (%eax) movl $0x12345678, saved_magic popl %ebx diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index 195b7034a14..4277f2b27e6 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -55,7 +55,7 @@ /* address in low memory of the wakeup routine. 
*/ unsigned long acpi_wakeup_address = 0; -unsigned long acpi_video_flags; +unsigned long acpi_realmode_flags; extern char wakeup_start, wakeup_end; extern unsigned long acpi_copy_wakeup_routine(unsigned long); @@ -103,9 +103,11 @@ static int __init acpi_sleep_setup(char *str) { while ((str != NULL) && (*str != '\0')) { if (strncmp(str, "s3_bios", 7) == 0) - acpi_video_flags = 1; + acpi_realmode_flags |= 1; if (strncmp(str, "s3_mode", 7) == 0) - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index ed63d184579..13f1480cbec 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -50,7 +50,7 @@ wakeup_code: movw %ax, %ss # Data segment must be set up before we can see whether to beep. - testl $1, beep_flags - wakeup_code + testl $4, realmode_flags - wakeup_code jz 1f BEEP 1: @@ -70,7 +70,7 @@ wakeup_code: testl %eax, %eax jnz no_longmode - testl $1, video_flags - wakeup_code + testl $1, realmode_flags - wakeup_code jz 1f lcall $0xc000,$3 movw %cs, %ax @@ -78,7 +78,7 @@ wakeup_code: movw %ax, %ss 1: - testl $2, video_flags - wakeup_code + testl $2, realmode_flags - wakeup_code jz 1f mov video_mode - wakeup_code, %ax call mode_seta @@ -251,9 +251,8 @@ gdt_48a: .long gdta - wakeup_code # gdt base (relocated in later) real_magic: .quad 0 -beep_flags: .quad 0 video_mode: .quad 0 -video_flags: .quad 0 +realmode_flags: .quad 0 .code16 bogus_real_magic: @@ -367,12 +366,10 @@ ENTRY(acpi_copy_wakeup_routine) pushq %rax pushq %rdx - movl s2ram_beep, %edx - movl %edx, beep_flags - wakeup_start (,%rdi) movl saved_video_mode, %edx movl %edx, video_mode - wakeup_start (,%rdi) - movl acpi_video_flags, %edx - movl %edx, video_flags - wakeup_start (,%rdi) + movl acpi_realmode_flags, %edx + movl %edx, realmode_flags - wakeup_start (,%rdi) movq $0x12345678, real_magic - wakeup_start (,%rdi) movq $0x123456789abcdef0, %rdx movq %rdx, saved_magic diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c0ccdd72036..dc234c508a6 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -122,8 +122,7 @@ extern struct acpi_mcfg_allocation *pci_mmcfg_config; extern int pci_mmcfg_config_num; extern int sbf_port; -extern unsigned long acpi_video_flags; -extern unsigned long s2ram_beep; +extern unsigned long acpi_realmode_flags; #else /* !CONFIG_ACPI */ diff --git a/kernel/power/main.c b/kernel/power/main.c index c74a56436d8..32147b57c3b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -332,27 +332,6 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n) power_attr(state); -unsigned long s2ram_beep = 0; - -static ssize_t s2ram_beep_show(struct kset *kset, char *buf) -{ - return sprintf(buf, "%d\n", s2ram_beep); -} - -static ssize_t -s2ram_beep_store(struct kset *kset, const char *buf, size_t n) -{ - int val; - - if (sscanf(buf, "%d", &val) > 0) { - s2ram_beep = val; - return n; - } - return -EINVAL; -} - -power_attr(s2ram_beep); - #ifdef CONFIG_PM_TRACE int pm_trace_enabled; @@ -378,13 +357,11 @@ power_attr(pm_trace); static struct attribute * g[] = { &state_attr.attr, &pm_trace_attr.attr, - &s2ram_beep_attr.attr, NULL, }; #else static struct attribute * g[] = { &state_attr.attr, - &s2ram_beep_attr.attr, NULL, }; #endif /* CONFIG_PM_TRACE */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 44a1d699aad..3ed4912bf18 100644 --- 
a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -660,7 +660,7 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_ACPI_VIDEO_FLAGS, .procname = "acpi_video_flags", - .data = &acpi_video_flags, + .data = &acpi_realmode_flags, .maxlen = sizeof (unsigned long), .mode = 0644, .proc_handler = &proc_doulongvec_minmax, -- cgit v1.2.3-70-g09d2 From 3d7e33825d8799115dd2495c9944badd3272a623 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 19 Jul 2007 01:48:11 -0700 Subject: jprobes: make jprobes a little safer for users I realise jprobes are a razor-blades-included type of interface, but that doesn't mean we can't try and make them safer to use. This guy I know once wrote code like this: struct jprobe jp = { .kp.symbol_name = "foo", .entry = "jprobe_foo" }; And then his kernel exploded. Oops. This patch adds an arch hook, arch_deref_entry_point() (I don't like it either) which takes the void * in a struct jprobe, and gives back the text address that it represents. We can then use that in register_jprobe() to check that the entry point we're passed is actually in the kernel text, rather than just some random value. Signed-off-by: Michael Ellerman Cc: Prasanna S Panchamukhi Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 7 ++++++- arch/powerpc/kernel/kprobes.c | 11 ++++++++--- include/linux/kprobes.h | 1 + kernel/kprobes.c | 9 +++++++++ 4 files changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 5bc46f15134..5dc98b5abcf 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -936,10 +936,15 @@ static void ia64_get_bsp_cfm(struct unw_frame_info *info, void *arg) return; } +unsigned long arch_deref_entry_point(void *entry) +{ + return ((struct fnptr *)entry)->ip; +} + int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr = ((struct fnptr *)(jp->entry))->ip; + unsigned long addr = arch_deref_entry_point(jp->entry); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); struct param_bsp_cfm pa; int bytes; diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 0c96611f02f..440f5a87271 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -492,6 +492,13 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, return ret; } +#ifdef CONFIG_PPC64 +unsigned long arch_deref_entry_point(void *entry) +{ + return (unsigned long)(((func_descr_t *)entry)->entry); +} +#endif + int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct jprobe *jp = container_of(p, struct jprobe, kp); @@ -500,11 +507,9 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) memcpy(&kcb->jprobe_saved_regs, regs, sizeof(struct pt_regs)); /* setup return addr to the jprobe handler routine */ + regs->nip = arch_deref_entry_point(jp->entry); #ifdef CONFIG_PPC64 - regs->nip = (unsigned long)(((func_descr_t *)jp->entry)->entry); regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc); -#else - regs->nip = (unsigned long)jp->entry; #endif return 1; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index bd892850c94..51464d12a4e 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -214,6 +214,7 @@ int longjmp_break_handler(struct kprobe *, struct pt_regs *); int 
register_jprobe(struct jprobe *p); void unregister_jprobe(struct jprobe *p); void jprobe_return(void); +unsigned long arch_deref_entry_point(void *); int register_kretprobe(struct kretprobe *rp); void unregister_kretprobe(struct kretprobe *rp); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9e47d8c493f..3e9f513a728 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -675,9 +675,18 @@ static struct notifier_block kprobe_exceptions_nb = { .priority = 0x7fffffff /* we need to be notified first */ }; +unsigned long __weak arch_deref_entry_point(void *entry) +{ + return (unsigned long)entry; +} int __kprobes register_jprobe(struct jprobe *jp) { + unsigned long addr = arch_deref_entry_point(jp->entry); + + if (!kernel_text_address(addr)) + return -EINVAL; + /* Todo: Verify probepoint is a function entry point */ jp->kp.pre_handler = setjmp_pre_handler; jp->kp.break_handler = longjmp_break_handler; -- cgit v1.2.3-70-g09d2 From f34e3b61f2be9628bd41244f3ecc42009c5eced5 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 19 Jul 2007 01:48:13 -0700 Subject: use the new percpu interface for shared data Currently most of the per cpu data, which is accessed by different cpus, has a ____cacheline_aligned_in_smp attribute. Move all this data to the new per cpu shared data section: .data.percpu.shared_aligned. This will separate the percpu data which is referenced frequently by other cpus from the local only percpu data. Signed-off-by: Fenghua Yu Acked-by: Suresh Siddha Cc: Rusty Russell Cc: Christoph Lameter Cc: "Luck, Tony" Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/init_task.c | 2 +- arch/i386/kernel/irq.c | 2 +- arch/ia64/kernel/smp.c | 2 +- arch/x86_64/kernel/init_task.c | 2 +- kernel/sched.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c index cff95d10a4d..d26fc063a76 100644 --- a/arch/i386/kernel/init_task.c +++ b/arch/i386/kernel/init_task.c @@ -42,5 +42,5 @@ EXPORT_SYMBOL(init_task); * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. */ -DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index d2daf672f4a..ba44d40b066 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -21,7 +21,7 @@ #include #include -DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); DEFINE_PER_CPU(struct pt_regs *, irq_regs); diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index b3a47f986e1..9f72838db26 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -82,7 +82,7 @@ static volatile struct call_data_struct *call_data; #define IPI_KDUMP_CPU_STOP 3 /* This needs to be cacheline aligned because it is written to by *other* CPUs. */ -static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned; +static DEFINE_PER_CPU_SHARED_ALIGNED(u64, ipi_operation); extern void cpu_halt (void); diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c index 3dc5854ba21..4ff33d4f855 100644 --- a/arch/x86_64/kernel/init_task.c +++ b/arch/x86_64/kernel/init_task.c @@ -44,7 +44,7 @@ EXPORT_SYMBOL(init_task); * section. 
Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; /* Copies of the original ist values from the tss are only accessed during * debugging, no special alignment required. diff --git a/kernel/sched.c b/kernel/sched.c index cb31fb4a137..645256b228c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -301,7 +301,7 @@ struct rq { struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static DEFINE_MUTEX(sched_hotcpu_mutex); static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) -- cgit v1.2.3-70-g09d2 From bdf4c48af20a3b0f01671799ace345e3d49576da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:15 -0700 Subject: audit: rework execve audit The purpose of audit_bprm() is to log the argv array to a userspace daemon at the end of the execve system call. Since user-space hasn't had time to run, this array is still in pristine state on the process' stack; so no need to copy it, we can just grab it from there. In order to minimize the damage to audit_log_*() copy each string into a temporary kernel buffer first. Currently the audit code requires that the full argument vector fits in a single packet. So currently it does clip the argv size to a (sysctl) limit, but only when execve auditing is enabled. If the audit protocol gets extended to allow for multiple packets this check can be removed. Signed-off-by: Peter Zijlstra Signed-off-by: Ollie Wild Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 7 ++++ fs/exec.c | 3 ++ include/linux/binfmts.h | 1 + kernel/auditsc.c | 84 ++++++++++++++++++++++++++++---------- kernel/sysctl.c | 11 +++++ 5 files changed, 85 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index ebffdffb3d9..72e247ef6fa 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1065,6 +1065,13 @@ check the amount of free space (value is in seconds). Default settings are: 4, resume it if we have a value of 3 or more percent; consider information about the amount of free space valid for 30 seconds +audit_argv_kb +------------- + +The file contains a single value denoting the limit on the argv array size +for execve (in KiB). This limit is only applied when system call auditing for +execve is enabled, otherwise the value is ignored. 
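A minimal sketch of how the limit is applied (it mirrors the check that audit_bprm() gains later in this patch; bprm->argv_len counts the argv bytes copied by do_execve(), while audit_argv_kb, 32 by default, is expressed in KiB):

	if (bprm->argv_len > (audit_argv_kb << 10))
		return -E2BIG;	/* argv too large to log in a single netlink skb */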
+ ctrl-alt-del ------------ diff --git a/fs/exec.c b/fs/exec.c index f20561ff452..2e3f7950c18 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1154,6 +1154,7 @@ int do_execve(char * filename, { struct linux_binprm *bprm; struct file *file; + unsigned long env_p; int retval; int i; @@ -1208,9 +1209,11 @@ int do_execve(char * filename, if (retval < 0) goto out; + env_p = bprm->p; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out; + bprm->argv_len = env_p - bprm->p; retval = search_binary_handler(bprm,regs); if (retval >= 0) { diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e1a708337be..a0b209cd576 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -40,6 +40,7 @@ struct linux_binprm{ unsigned interp_flags; unsigned interp_data; unsigned long loader, exec; + unsigned long argv_len; }; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b7640a5f382..535586fc498 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -153,7 +153,7 @@ struct audit_aux_data_execve { struct audit_aux_data d; int argc; int envc; - char mem[0]; + struct mm_struct *mm; }; struct audit_aux_data_socketcall { @@ -831,6 +831,55 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, return rc; } +static void audit_log_execve_info(struct audit_buffer *ab, + struct audit_aux_data_execve *axi) +{ + int i; + long len, ret; + const char __user *p = (const char __user *)axi->mm->arg_start; + char *buf; + + if (axi->mm != current->mm) + return; /* execve failed, no additional info */ + + for (i = 0; i < axi->argc; i++, p += len) { + len = strnlen_user(p, MAX_ARG_PAGES*PAGE_SIZE); + /* + * We just created this mm, if we can't find the strings + * we just copied into it something is _very_ wrong. Similar + * for strings that are too long, we should not have created + * any. + */ + if (!len || len > MAX_ARG_STRLEN) { + WARN_ON(1); + send_sig(SIGKILL, current, 0); + } + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) { + audit_panic("out of memory for argv string\n"); + break; + } + + ret = copy_from_user(buf, p, len); + /* + * There is no reason for this copy to be short. We just + * copied them here, and the mm hasn't been exposed to user- + * space yet. 
+ */ + if (!ret) { + WARN_ON(1); + send_sig(SIGKILL, current, 0); + } + + audit_log_format(ab, "a%d=", i); + audit_log_untrustedstring(ab, buf); + audit_log_format(ab, "\n"); + + kfree(buf); + } +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -971,13 +1020,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; - int i; - const char *p; - for (i = 0, p = axi->mem; i < axi->argc; i++) { - audit_log_format(ab, "a%d=", i); - p = audit_log_untrustedstring(ab, p); - audit_log_format(ab, "\n"); - } + audit_log_execve_info(ab, axi); break; } case AUDIT_SOCKETCALL: { @@ -1821,32 +1864,31 @@ int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode return 0; } +int audit_argv_kb = 32; + int audit_bprm(struct linux_binprm *bprm) { struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - unsigned long p, next; - void *to; if (likely(!audit_enabled || !context || context->dummy)) return 0; - ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, - GFP_KERNEL); + /* + * Even though the stack code doesn't limit the arg+env size any more, + * the audit code requires that _all_ arguments be logged in a single + * netlink skb. Hence cap it :-( + */ + if (bprm->argv_len > (audit_argv_kb << 10)) + return -E2BIG; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; ax->argc = bprm->argc; ax->envc = bprm->envc; - for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { - struct page *page = bprm->page[p / PAGE_SIZE]; - void *kaddr = kmap(page); - next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1); - memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p); - to += next - p; - kunmap(page); - } - + ax->mm = bprm->mm; ax->d.type = AUDIT_EXECVE; ax->d.next = context->aux; context->aux = (void *)ax; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3ed4912bf18..8db41764e2a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -78,6 +78,7 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; +extern int audit_argv_kb; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -306,6 +307,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_AUDITSYSCALL + { + .ctl_name = CTL_UNNUMBERED, + .procname = "audit_argv_kb", + .data = &audit_argv_kb, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", -- cgit v1.2.3-70-g09d2 From b6a2fea39318e43fee84fa7b0b90d68bed92d2ba Mon Sep 17 00:00:00 2001 From: Ollie Wild Date: Thu, 19 Jul 2007 01:48:16 -0700 Subject: mm: variable length argument support Remove the arg+env limit of MAX_ARG_PAGES by copying the strings directly from the old mm into the new mm. We create the new mm before the binfmt code runs, and place the new stack at the very top of the address space. Once the binfmt code runs and figures out where the stack should be, we move it downwards. It is a bit peculiar in that we have one task with two mm's, one of which is inactive. 
[a.p.zijlstra@chello.nl: limit stack size] Signed-off-by: Ollie Wild Signed-off-by: Peter Zijlstra Cc: Cc: Hugh Dickins [bunk@stusta.de: unexport bprm_mm_init] Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/ia32/binfmt_elf32.c | 67 ++--- arch/x86_64/ia32/ia32_aout.c | 2 +- arch/x86_64/ia32/ia32_binfmt.c | 58 ---- fs/binfmt_elf.c | 28 +- fs/binfmt_elf_fdpic.c | 8 +- fs/binfmt_misc.c | 4 +- fs/binfmt_script.c | 4 +- fs/compat.c | 128 ++++----- fs/exec.c | 614 ++++++++++++++++++++++++++--------------- include/linux/binfmts.h | 18 +- include/linux/mm.h | 9 +- kernel/auditsc.c | 2 +- mm/mmap.c | 61 ++-- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- 15 files changed, 554 insertions(+), 453 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c index 6f4d3d06f0e..e1189ba1ca5 100644 --- a/arch/ia64/ia32/binfmt_elf32.c +++ b/arch/ia64/ia32/binfmt_elf32.c @@ -195,62 +195,27 @@ ia64_elf32_init (struct pt_regs *regs) ia32_load_state(current); } +/* + * Undo the override of setup_arg_pages() without this ia32_setup_arg_pages() + * will suffer infinite self recursion. + */ +#undef setup_arg_pages + int ia32_setup_arg_pages (struct linux_binprm *bprm, int executable_stack) { - unsigned long stack_base; - struct vm_area_struct *mpnt; - struct mm_struct *mm = current->mm; - int i, ret; - - stack_base = IA32_STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE; - mm->arg_start = bprm->p + stack_base; - - bprm->p += stack_base; - if (bprm->loader) - bprm->loader += stack_base; - bprm->exec += stack_base; - - mpnt = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!mpnt) - return -ENOMEM; - - down_write(¤t->mm->mmap_sem); - { - mpnt->vm_mm = current->mm; - mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; - mpnt->vm_end = IA32_STACK_TOP; - if (executable_stack == EXSTACK_ENABLE_X) - mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC; - else if (executable_stack == EXSTACK_DISABLE_X) - mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC; - else - mpnt->vm_flags = VM_STACK_FLAGS; - mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC)? - PAGE_COPY_EXEC: PAGE_COPY; - if ((ret = insert_vm_struct(current->mm, mpnt))) { - up_write(¤t->mm->mmap_sem); - kmem_cache_free(vm_area_cachep, mpnt); - return ret; - } - current->mm->stack_vm = current->mm->total_vm = vma_pages(mpnt); + int ret; + + ret = setup_arg_pages(bprm, IA32_STACK_TOP, executable_stack); + if (!ret) { + /* + * Can't do it in ia64_elf32_init(). Needs to be done before + * calls to elf32_map() + */ + current->thread.ppl = ia32_init_pp_list(); } - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page *page = bprm->page[i]; - if (page) { - bprm->page[i] = NULL; - install_arg_page(mpnt, page, stack_base); - } - stack_base += PAGE_SIZE; - } - up_write(¤t->mm->mmap_sem); - - /* Can't do it in ia64_elf32_init(). Needs to be done before calls to - elf32_map() */ - current->thread.ppl = ia32_init_pp_list(); - - return 0; + return ret; } static void diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c index fe83edb93c1..08781370256 100644 --- a/arch/x86_64/ia32/ia32_aout.c +++ b/arch/x86_64/ia32/ia32_aout.c @@ -404,7 +404,7 @@ beyond_if: set_brk(current->mm->start_brk, current->mm->brk); - retval = ia32_setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); + retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { /* Someone check-me: is this error path enough? 
*/ send_sig(SIGKILL, current, 0); diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c index 185399baaf6..ed56a8806ea 100644 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -232,9 +232,6 @@ do { \ #define load_elf_binary load_elf32_binary #define ELF_PLAT_INIT(r, load_addr) elf32_init(r) -#define setup_arg_pages(bprm, stack_top, exec_stack) \ - ia32_setup_arg_pages(bprm, stack_top, exec_stack) -int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack); #undef start_thread #define start_thread(regs,new_rip,new_rsp) do { \ @@ -286,61 +283,6 @@ static void elf32_init(struct pt_regs *regs) me->thread.es = __USER_DS; } -int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, - int executable_stack) -{ - unsigned long stack_base; - struct vm_area_struct *mpnt; - struct mm_struct *mm = current->mm; - int i, ret; - - stack_base = stack_top - MAX_ARG_PAGES * PAGE_SIZE; - mm->arg_start = bprm->p + stack_base; - - bprm->p += stack_base; - if (bprm->loader) - bprm->loader += stack_base; - bprm->exec += stack_base; - - mpnt = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!mpnt) - return -ENOMEM; - - down_write(&mm->mmap_sem); - { - mpnt->vm_mm = mm; - mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; - mpnt->vm_end = stack_top; - if (executable_stack == EXSTACK_ENABLE_X) - mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC; - else if (executable_stack == EXSTACK_DISABLE_X) - mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC; - else - mpnt->vm_flags = VM_STACK_FLAGS; - mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? - PAGE_COPY_EXEC : PAGE_COPY; - if ((ret = insert_vm_struct(mm, mpnt))) { - up_write(&mm->mmap_sem); - kmem_cache_free(vm_area_cachep, mpnt); - return ret; - } - mm->stack_vm = mm->total_vm = vma_pages(mpnt); - } - - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page *page = bprm->page[i]; - if (page) { - bprm->page[i] = NULL; - install_arg_page(mpnt, page, stack_base); - } - stack_base += PAGE_SIZE; - } - up_write(&mm->mmap_sem); - - return 0; -} -EXPORT_SYMBOL(ia32_setup_arg_pages); - #ifdef CONFIG_SYSCTL /* Register vsyscall32 into the ABI table */ #include diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index a27e42bf340..295cbaa0e58 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -148,6 +148,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, elf_addr_t *elf_info; int ei_index = 0; struct task_struct *tsk = current; + struct vm_area_struct *vma; /* * If this architecture has a platform capability string, copy it @@ -234,6 +235,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, sp = (elf_addr_t __user *)bprm->p; #endif + + /* + * Grow the stack manually; some architectures have a limit on how + * far ahead a user-space access may be in order to grow the stack. 
+ */ + vma = find_extend_vma(current->mm, bprm->p); + if (!vma) + return -EFAULT; + /* Now, let's put argc (and argv, envp if appropriate) on the stack */ if (__put_user(argc, sp++)) return -EFAULT; @@ -254,8 +264,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, size_t len; if (__put_user((elf_addr_t)p, argv++)) return -EFAULT; - len = strnlen_user((void __user *)p, PAGE_SIZE*MAX_ARG_PAGES); - if (!len || len > PAGE_SIZE*MAX_ARG_PAGES) + len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) return 0; p += len; } @@ -266,8 +276,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, size_t len; if (__put_user((elf_addr_t)p, envp++)) return -EFAULT; - len = strnlen_user((void __user *)p, PAGE_SIZE*MAX_ARG_PAGES); - if (!len || len > PAGE_SIZE*MAX_ARG_PAGES) + len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) return 0; p += len; } @@ -826,10 +836,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) } /* OK, This is the point of no return */ - current->mm->start_data = 0; - current->mm->end_data = 0; - current->mm->end_code = 0; - current->mm->mmap = NULL; current->flags &= ~PF_FORKNOEXEC; current->mm->def_flags = def_flags; @@ -1051,9 +1057,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; - create_elf_tables(bprm, &loc->elf_ex, + retval = create_elf_tables(bprm, &loc->elf_ex, (interpreter_type == INTERPRETER_AOUT), load_addr, interp_load_addr); + if (retval < 0) { + send_sig(SIGKILL, current, 0); + goto out; + } /* N.B. passed_fileno might not be initialized? */ if (interpreter_type == INTERPRETER_AOUT) current->mm->arg_start += strlen(passed_fileno) + 1; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 9d62fbad3d4..4739506fb08 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -621,8 +621,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, p = (char __user *) current->mm->arg_start; for (loop = bprm->argc; loop > 0; loop--) { __put_user((elf_caddr_t) p, argv++); - len = strnlen_user(p, PAGE_SIZE * MAX_ARG_PAGES); - if (!len || len > PAGE_SIZE * MAX_ARG_PAGES) + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) return -EINVAL; p += len; } @@ -633,8 +633,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, current->mm->env_start = (unsigned long) p; for (loop = bprm->envc; loop > 0; loop--) { __put_user((elf_caddr_t)(unsigned long) p, envp++); - len = strnlen_user(p, PAGE_SIZE * MAX_ARG_PAGES); - if (!len || len > PAGE_SIZE * MAX_ARG_PAGES) + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) return -EINVAL; p += len; } diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 330fd3fe854..42e94b3ab7b 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -126,7 +126,9 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto _ret; if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { - remove_arg_zero(bprm); + retval = remove_arg_zero(bprm); + if (retval) + goto _ret; } if (fmt->flags & MISC_FMT_OPEN_BINARY) { diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 304c88544d8..4d0e0f6d327 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -67,7 +67,9 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) * This is done in reverse order, because of how the * user environment and arguments are stored. 
*/ - remove_arg_zero(bprm); + retval = remove_arg_zero(bprm); + if (retval) + return retval; retval = copy_strings_kernel(1, &bprm->interp, bprm); if (retval < 0) return retval; bprm->argc++; diff --git a/fs/compat.c b/fs/compat.c index 4db6216e526..15078ce4c04 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1257,6 +1257,7 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, { struct page *kmapped_page = NULL; char *kaddr = NULL; + unsigned long kpos = 0; int ret; while (argc-- > 0) { @@ -1265,92 +1266,84 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, unsigned long pos; if (get_user(str, argv+argc) || - !(len = strnlen_user(compat_ptr(str), bprm->p))) { + !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) { ret = -EFAULT; goto out; } - if (bprm->p < len) { + if (len > MAX_ARG_STRLEN) { ret = -E2BIG; goto out; } - bprm->p -= len; - /* XXX: add architecture specific overflow check here. */ + /* We're going to work our way backwords. */ pos = bprm->p; + str += len; + bprm->p -= len; while (len > 0) { - int i, new, err; int offset, bytes_to_copy; - struct page *page; offset = pos % PAGE_SIZE; - i = pos/PAGE_SIZE; - page = bprm->page[i]; - new = 0; - if (!page) { - page = alloc_page(GFP_HIGHUSER); - bprm->page[i] = page; - if (!page) { - ret = -ENOMEM; + if (offset == 0) + offset = PAGE_SIZE; + + bytes_to_copy = offset; + if (bytes_to_copy > len) + bytes_to_copy = len; + + offset -= bytes_to_copy; + pos -= bytes_to_copy; + str -= bytes_to_copy; + len -= bytes_to_copy; + + if (!kmapped_page || kpos != (pos & PAGE_MASK)) { + struct page *page; + +#ifdef CONFIG_STACK_GROWSUP + ret = expand_stack_downwards(bprm->vma, pos); + if (ret < 0) { + /* We've exceed the stack rlimit. */ + ret = -E2BIG; + goto out; + } +#endif + ret = get_user_pages(current, bprm->mm, pos, + 1, 1, 1, &page, NULL); + if (ret <= 0) { + /* We've exceed the stack rlimit. */ + ret = -E2BIG; goto out; } - new = 1; - } - if (page != kmapped_page) { - if (kmapped_page) + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); kunmap(kmapped_page); + put_page(kmapped_page); + } kmapped_page = page; kaddr = kmap(kmapped_page); + kpos = pos & PAGE_MASK; + flush_cache_page(bprm->vma, kpos, + page_to_pfn(kmapped_page)); } - if (new && offset) - memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; - if (bytes_to_copy > len) { - bytes_to_copy = len; - if (new) - memset(kaddr+offset+len, 0, - PAGE_SIZE-offset-len); - } - err = copy_from_user(kaddr+offset, compat_ptr(str), - bytes_to_copy); - if (err) { + if (copy_from_user(kaddr+offset, compat_ptr(str), + bytes_to_copy)) { ret = -EFAULT; goto out; } - - pos += bytes_to_copy; - str += bytes_to_copy; - len -= bytes_to_copy; } } ret = 0; out: - if (kmapped_page) + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); kunmap(kmapped_page); - return ret; -} - -#ifdef CONFIG_MMU - -#define free_arg_pages(bprm) do { } while (0) - -#else - -static inline void free_arg_pages(struct linux_binprm *bprm) -{ - int i; - - for (i = 0; i < MAX_ARG_PAGES; i++) { - if (bprm->page[i]) - __free_page(bprm->page[i]); - bprm->page[i] = NULL; + put_page(kmapped_page); } + return ret; } -#endif /* CONFIG_MMU */ - /* * compat_do_execve() is mostly a copy of do_execve(), with the exception * that it processes 32 bit argv and envp pointers. 
@@ -1363,7 +1356,6 @@ int compat_do_execve(char * filename, struct linux_binprm *bprm; struct file *file; int retval; - int i; retval = -ENOMEM; bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); @@ -1377,24 +1369,19 @@ int compat_do_execve(char * filename, sched_exec(); - bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); bprm->file = file; bprm->filename = filename; bprm->interp = filename; - bprm->mm = mm_alloc(); - retval = -ENOMEM; - if (!bprm->mm) - goto out_file; - retval = init_new_context(current, bprm->mm); - if (retval < 0) - goto out_mm; + retval = bprm_mm_init(bprm); + if (retval) + goto out_file; - bprm->argc = compat_count(argv, bprm->p / sizeof(compat_uptr_t)); + bprm->argc = compat_count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) goto out_mm; - bprm->envc = compat_count(envp, bprm->p / sizeof(compat_uptr_t)); + bprm->envc = compat_count(envp, MAX_ARG_STRINGS); if ((retval = bprm->envc) < 0) goto out_mm; @@ -1421,8 +1408,6 @@ int compat_do_execve(char * filename, retval = search_binary_handler(bprm, regs); if (retval >= 0) { - free_arg_pages(bprm); - /* execve success */ security_bprm_free(bprm); acct_update_integrals(current); @@ -1431,19 +1416,12 @@ int compat_do_execve(char * filename, } out: - /* Something went wrong, return the inode and free the argument pages*/ - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page * page = bprm->page[i]; - if (page) - __free_page(page); - } - if (bprm->security) security_bprm_free(bprm); out_mm: if (bprm->mm) - mmdrop(bprm->mm); + mmput(bprm->mm); out_file: if (bprm->file) { diff --git a/fs/exec.c b/fs/exec.c index 2e3f7950c18..498f2b3dca2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -54,6 +54,7 @@ #include #include +#include #ifdef CONFIG_KMOD #include @@ -178,6 +179,207 @@ exit: goto out; } +#ifdef CONFIG_MMU + +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write) +{ + struct page *page; + int ret; + +#ifdef CONFIG_STACK_GROWSUP + if (write) { + ret = expand_stack_downwards(bprm->vma, pos); + if (ret < 0) + return NULL; + } +#endif + ret = get_user_pages(current, bprm->mm, pos, + 1, write, 1, &page, NULL); + if (ret <= 0) + return NULL; + + if (write) { + struct rlimit *rlim = current->signal->rlim; + unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; + + /* + * Limit to 1/4-th the stack size for the argv+env strings. + * This ensures that: + * - the remaining binfmt code will not run out of stack space, + * - the program will have a reasonable amount of stack left + * to work from. + */ + if (size > rlim[RLIMIT_STACK].rlim_cur / 4) { + put_page(page); + return NULL; + } + } + + return page; +} + +static void put_arg_page(struct page *page) +{ + put_page(page); +} + +static void free_arg_page(struct linux_binprm *bprm, int i) +{ +} + +static void free_arg_pages(struct linux_binprm *bprm) +{ +} + +static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, + struct page *page) +{ + flush_cache_page(bprm->vma, pos, page_to_pfn(page)); +} + +static int __bprm_mm_init(struct linux_binprm *bprm) +{ + int err = -ENOMEM; + struct vm_area_struct *vma = NULL; + struct mm_struct *mm = bprm->mm; + + bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + goto err; + + down_write(&mm->mmap_sem); + vma->vm_mm = mm; + + /* + * Place the stack at the largest stack address the architecture + * supports. Later, we'll move this to an appropriate place. We don't + * use STACK_TOP because that can depend on attributes which aren't + * configured yet. 
+ */ + vma->vm_end = STACK_TOP_MAX; + vma->vm_start = vma->vm_end - PAGE_SIZE; + + vma->vm_flags = VM_STACK_FLAGS; + vma->vm_page_prot = protection_map[vma->vm_flags & 0x7]; + err = insert_vm_struct(mm, vma); + if (err) { + up_write(&mm->mmap_sem); + goto err; + } + + mm->stack_vm = mm->total_vm = 1; + up_write(&mm->mmap_sem); + + bprm->p = vma->vm_end - sizeof(void *); + + return 0; + +err: + if (vma) { + bprm->vma = NULL; + kmem_cache_free(vm_area_cachep, vma); + } + + return err; +} + +static bool valid_arg_len(struct linux_binprm *bprm, long len) +{ + return len <= MAX_ARG_STRLEN; +} + +#else + +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write) +{ + struct page *page; + + page = bprm->page[pos / PAGE_SIZE]; + if (!page && write) { + page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); + if (!page) + return NULL; + bprm->page[pos / PAGE_SIZE] = page; + } + + return page; +} + +static void put_arg_page(struct page *page) +{ +} + +static void free_arg_page(struct linux_binprm *bprm, int i) +{ + if (bprm->page[i]) { + __free_page(bprm->page[i]); + bprm->page[i] = NULL; + } +} + +static void free_arg_pages(struct linux_binprm *bprm) +{ + int i; + + for (i = 0; i < MAX_ARG_PAGES; i++) + free_arg_page(bprm, i); +} + +static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, + struct page *page) +{ +} + +static int __bprm_mm_init(struct linux_binprm *bprm) +{ + bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); + return 0; +} + +static bool valid_arg_len(struct linux_binprm *bprm, long len) +{ + return len <= bprm->p; +} + +#endif /* CONFIG_MMU */ + +/* + * Create a new mm_struct and populate it with a temporary stack + * vm_area_struct. We don't have enough context at this point to set the stack + * flags, permissions, and offset, so we use temporary values. We'll update + * them later in setup_arg_pages(). + */ +int bprm_mm_init(struct linux_binprm *bprm) +{ + int err; + struct mm_struct *mm = NULL; + + bprm->mm = mm = mm_alloc(); + err = -ENOMEM; + if (!mm) + goto err; + + err = init_new_context(current, mm); + if (err) + goto err; + + err = __bprm_mm_init(bprm); + if (err) + goto err; + + return 0; + +err: + if (mm) { + bprm->mm = NULL; + mmdrop(mm); + } + + return err; +} + /* * count() counts the number of strings in array ARGV. */ @@ -203,15 +405,16 @@ static int count(char __user * __user * argv, int max) } /* - * 'copy_strings()' copies argument/environment strings from user - * memory to free pages in kernel mem. These are in a format ready - * to be put directly into the top of new user memory. + * 'copy_strings()' copies argument/environment strings from the old + * processes's memory to the new process's stack. The call to get_user_pages() + * ensures the destination page is created and not swapped out. */ static int copy_strings(int argc, char __user * __user * argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; char *kaddr = NULL; + unsigned long kpos = 0; int ret; while (argc-- > 0) { @@ -220,69 +423,69 @@ static int copy_strings(int argc, char __user * __user * argv, unsigned long pos; if (get_user(str, argv+argc) || - !(len = strnlen_user(str, bprm->p))) { + !(len = strnlen_user(str, MAX_ARG_STRLEN))) { ret = -EFAULT; goto out; } - if (bprm->p < len) { + if (!valid_arg_len(bprm, len)) { ret = -E2BIG; goto out; } - bprm->p -= len; - /* XXX: add architecture specific overflow check here. */ + /* We're going to work our way backwords. 
*/ pos = bprm->p; + str += len; + bprm->p -= len; while (len > 0) { - int i, new, err; int offset, bytes_to_copy; - struct page *page; offset = pos % PAGE_SIZE; - i = pos/PAGE_SIZE; - page = bprm->page[i]; - new = 0; - if (!page) { - page = alloc_page(GFP_HIGHUSER); - bprm->page[i] = page; + if (offset == 0) + offset = PAGE_SIZE; + + bytes_to_copy = offset; + if (bytes_to_copy > len) + bytes_to_copy = len; + + offset -= bytes_to_copy; + pos -= bytes_to_copy; + str -= bytes_to_copy; + len -= bytes_to_copy; + + if (!kmapped_page || kpos != (pos & PAGE_MASK)) { + struct page *page; + + page = get_arg_page(bprm, pos, 1); if (!page) { - ret = -ENOMEM; + ret = -E2BIG; goto out; } - new = 1; - } - if (page != kmapped_page) { - if (kmapped_page) + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); kunmap(kmapped_page); + put_arg_page(kmapped_page); + } kmapped_page = page; kaddr = kmap(kmapped_page); + kpos = pos & PAGE_MASK; + flush_arg_page(bprm, kpos, kmapped_page); } - if (new && offset) - memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; - if (bytes_to_copy > len) { - bytes_to_copy = len; - if (new) - memset(kaddr+offset+len, 0, - PAGE_SIZE-offset-len); - } - err = copy_from_user(kaddr+offset, str, bytes_to_copy); - if (err) { + if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; } - - pos += bytes_to_copy; - str += bytes_to_copy; - len -= bytes_to_copy; } } ret = 0; out: - if (kmapped_page) + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); kunmap(kmapped_page); + put_arg_page(kmapped_page); + } return ret; } @@ -298,181 +501,172 @@ int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm) set_fs(oldfs); return r; } - EXPORT_SYMBOL(copy_strings_kernel); #ifdef CONFIG_MMU + /* - * This routine is used to map in a page into an address space: needed by - * execve() for the initial stack and environment pages. + * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once + * the binfmt code determines where the new stack should reside, we shift it to + * its final location. The process proceeds as follows: * - * vma->vm_mm->mmap_sem is held for writing. + * 1) Use shift to calculate the new vma endpoints. + * 2) Extend vma to cover both the old and new ranges. This ensures the + * arguments passed to subsequent functions are consistent. + * 3) Move vma's page tables to the new range. + * 4) Free up any cleared pgd range. + * 5) Shrink the vma to cover only the new range. 
*/ -void install_arg_page(struct vm_area_struct *vma, - struct page *page, unsigned long address) +static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) { struct mm_struct *mm = vma->vm_mm; - pte_t * pte; - spinlock_t *ptl; + unsigned long old_start = vma->vm_start; + unsigned long old_end = vma->vm_end; + unsigned long length = old_end - old_start; + unsigned long new_start = old_start - shift; + unsigned long new_end = old_end - shift; + struct mmu_gather *tlb; - if (unlikely(anon_vma_prepare(vma))) - goto out; + BUG_ON(new_start > new_end); - flush_dcache_page(page); - pte = get_locked_pte(mm, address, &ptl); - if (!pte) - goto out; - if (!pte_none(*pte)) { - pte_unmap_unlock(pte, ptl); - goto out; + /* + * ensure there are no vmas between where we want to go + * and where we are + */ + if (vma != find_vma(mm, new_start)) + return -EFAULT; + + /* + * cover the whole range: [new_start, old_end) + */ + vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL); + + /* + * move the page tables downwards, on failure we rely on + * process cleanup to remove whatever mess we made. + */ + if (length != move_page_tables(vma, old_start, + vma, new_start, length)) + return -ENOMEM; + + lru_add_drain(); + tlb = tlb_gather_mmu(mm, 0); + if (new_end > old_start) { + /* + * when the old and new regions overlap clear from new_end. + */ + free_pgd_range(&tlb, new_end, old_end, new_end, + vma->vm_next ? vma->vm_next->vm_start : 0); + } else { + /* + * otherwise, clean from old_start; this is done to not touch + * the address space in [new_end, old_start) some architectures + * have constraints on va-space that make this illegal (IA64) - + * for the others its just a little faster. + */ + free_pgd_range(&tlb, old_start, old_end, new_end, + vma->vm_next ? vma->vm_next->vm_start : 0); } - inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); - set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( - page, vma->vm_page_prot)))); - page_add_new_anon_rmap(page, vma, address); - pte_unmap_unlock(pte, ptl); - - /* no need for flush_tlb */ - return; -out: - __free_page(page); - force_sig(SIGKILL, current); + tlb_finish_mmu(tlb, new_end, old_end); + + /* + * shrink the vma to just the new range. + */ + vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); + + return 0; } #define EXTRA_STACK_VM_PAGES 20 /* random */ +/* + * Finalizes the stack vm_area_struct. The flags and permissions are updated, + * the stack is optionally relocated, and some extra space is added. + */ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { - unsigned long stack_base; - struct vm_area_struct *mpnt; + unsigned long ret; + unsigned long stack_shift; struct mm_struct *mm = current->mm; - int i, ret; - long arg_size; + struct vm_area_struct *vma = bprm->vma; + struct vm_area_struct *prev = NULL; + unsigned long vm_flags; + unsigned long stack_base; #ifdef CONFIG_STACK_GROWSUP - /* Move the argument and environment strings to the bottom of the - * stack space. 
- */ - int offset, j; - char *to, *from; - - /* Start by shifting all the pages down */ - i = 0; - for (j = 0; j < MAX_ARG_PAGES; j++) { - struct page *page = bprm->page[j]; - if (!page) - continue; - bprm->page[i++] = page; - } - - /* Now move them within their pages */ - offset = bprm->p % PAGE_SIZE; - to = kmap(bprm->page[0]); - for (j = 1; j < i; j++) { - memmove(to, to + offset, PAGE_SIZE - offset); - from = kmap(bprm->page[j]); - memcpy(to + PAGE_SIZE - offset, from, offset); - kunmap(bprm->page[j - 1]); - to = from; - } - memmove(to, to + offset, PAGE_SIZE - offset); - kunmap(bprm->page[j - 1]); - /* Limit stack size to 1GB */ stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max; if (stack_base > (1 << 30)) stack_base = 1 << 30; - stack_base = PAGE_ALIGN(stack_top - stack_base); - /* Adjust bprm->p to point to the end of the strings. */ - bprm->p = stack_base + PAGE_SIZE * i - offset; + /* Make sure we didn't let the argument array grow too large. */ + if (vma->vm_end - vma->vm_start > stack_base) + return -ENOMEM; - mm->arg_start = stack_base; - arg_size = i << PAGE_SHIFT; + stack_base = PAGE_ALIGN(stack_top - stack_base); - /* zero pages that were copied above */ - while (i < MAX_ARG_PAGES) - bprm->page[i++] = NULL; + stack_shift = vma->vm_start - stack_base; + mm->arg_start = bprm->p - stack_shift; + bprm->p = vma->vm_end - stack_shift; #else - stack_base = arch_align_stack(stack_top - MAX_ARG_PAGES*PAGE_SIZE); - stack_base = PAGE_ALIGN(stack_base); - bprm->p += stack_base; + stack_top = arch_align_stack(stack_top); + stack_top = PAGE_ALIGN(stack_top); + stack_shift = vma->vm_end - stack_top; + + bprm->p -= stack_shift; mm->arg_start = bprm->p; - arg_size = stack_top - (PAGE_MASK & (unsigned long) mm->arg_start); #endif - arg_size += EXTRA_STACK_VM_PAGES * PAGE_SIZE; - if (bprm->loader) - bprm->loader += stack_base; - bprm->exec += stack_base; - - mpnt = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!mpnt) - return -ENOMEM; + bprm->loader -= stack_shift; + bprm->exec -= stack_shift; down_write(&mm->mmap_sem); - { - mpnt->vm_mm = mm; -#ifdef CONFIG_STACK_GROWSUP - mpnt->vm_start = stack_base; - mpnt->vm_end = stack_base + arg_size; -#else - mpnt->vm_end = stack_top; - mpnt->vm_start = mpnt->vm_end - arg_size; -#endif - /* Adjust stack execute permissions; explicitly enable - * for EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X - * and leave alone (arch default) otherwise. */ - if (unlikely(executable_stack == EXSTACK_ENABLE_X)) - mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC; - else if (executable_stack == EXSTACK_DISABLE_X) - mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC; - else - mpnt->vm_flags = VM_STACK_FLAGS; - mpnt->vm_flags |= mm->def_flags; - mpnt->vm_page_prot = protection_map[mpnt->vm_flags & 0x7]; - if ((ret = insert_vm_struct(mm, mpnt))) { + vm_flags = vma->vm_flags; + + /* + * Adjust stack execute permissions; explicitly enable for + * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone + * (arch default) otherwise. + */ + if (unlikely(executable_stack == EXSTACK_ENABLE_X)) + vm_flags |= VM_EXEC; + else if (executable_stack == EXSTACK_DISABLE_X) + vm_flags &= ~VM_EXEC; + vm_flags |= mm->def_flags; + + ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, + vm_flags); + if (ret) + goto out_unlock; + BUG_ON(prev != vma); + + /* Move stack pages down in memory. 
*/ + if (stack_shift) { + ret = shift_arg_pages(vma, stack_shift); + if (ret) { up_write(&mm->mmap_sem); - kmem_cache_free(vm_area_cachep, mpnt); return ret; } - mm->stack_vm = mm->total_vm = vma_pages(mpnt); } - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page *page = bprm->page[i]; - if (page) { - bprm->page[i] = NULL; - install_arg_page(mpnt, page, stack_base); - } - stack_base += PAGE_SIZE; - } +#ifdef CONFIG_STACK_GROWSUP + stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE; +#else + stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE; +#endif + ret = expand_stack(vma, stack_base); + if (ret) + ret = -EFAULT; + +out_unlock: up_write(&mm->mmap_sem); - return 0; } - EXPORT_SYMBOL(setup_arg_pages); -#define free_arg_pages(bprm) do { } while (0) - -#else - -static inline void free_arg_pages(struct linux_binprm *bprm) -{ - int i; - - for (i = 0; i < MAX_ARG_PAGES; i++) { - if (bprm->page[i]) - __free_page(bprm->page[i]); - bprm->page[i] = NULL; - } -} - #endif /* CONFIG_MMU */ struct file *open_exec(const char *name) @@ -1000,43 +1194,42 @@ EXPORT_SYMBOL(compute_creds); * points to; chop off the first by relocating brpm->p to right after * the first '\0' encountered. */ -void remove_arg_zero(struct linux_binprm *bprm) +int remove_arg_zero(struct linux_binprm *bprm) { - if (bprm->argc) { - char ch; + int ret = 0; + unsigned long offset; + char *kaddr; + struct page *page; - do { - unsigned long offset; - unsigned long index; - char *kaddr; - struct page *page; - - offset = bprm->p & ~PAGE_MASK; - index = bprm->p >> PAGE_SHIFT; + if (!bprm->argc) + return 0; - page = bprm->page[index]; - kaddr = kmap_atomic(page, KM_USER0); + do { + offset = bprm->p & ~PAGE_MASK; + page = get_arg_page(bprm, bprm->p, 0); + if (!page) { + ret = -EFAULT; + goto out; + } + kaddr = kmap_atomic(page, KM_USER0); - /* run through page until we reach end or find NUL */ - do { - ch = *(kaddr + offset); + for (; offset < PAGE_SIZE && kaddr[offset]; + offset++, bprm->p++) + ; - /* discard that character... 
*/ - bprm->p++; - offset++; - } while (offset < PAGE_SIZE && ch != '\0'); + kunmap_atomic(kaddr, KM_USER0); + put_arg_page(page); - kunmap_atomic(kaddr, KM_USER0); + if (offset == PAGE_SIZE) + free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1); + } while (offset == PAGE_SIZE); - /* free the old page */ - if (offset == PAGE_SIZE) { - __free_page(page); - bprm->page[index] = NULL; - } - } while (ch != '\0'); + bprm->p++; + bprm->argc--; + ret = 0; - bprm->argc--; - } +out: + return ret; } EXPORT_SYMBOL(remove_arg_zero); @@ -1062,7 +1255,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) fput(bprm->file); bprm->file = NULL; - loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); + loader = bprm->vma->vm_end - sizeof(void *); file = open_exec("/sbin/loader"); retval = PTR_ERR(file); @@ -1156,7 +1349,6 @@ int do_execve(char * filename, struct file *file; unsigned long env_p; int retval; - int i; retval = -ENOMEM; bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); @@ -1170,25 +1362,19 @@ int do_execve(char * filename, sched_exec(); - bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - bprm->file = file; bprm->filename = filename; bprm->interp = filename; - bprm->mm = mm_alloc(); - retval = -ENOMEM; - if (!bprm->mm) - goto out_file; - retval = init_new_context(current, bprm->mm); - if (retval < 0) - goto out_mm; + retval = bprm_mm_init(bprm); + if (retval) + goto out_file; - bprm->argc = count(argv, bprm->p / sizeof(void *)); + bprm->argc = count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) goto out_mm; - bprm->envc = count(envp, bprm->p / sizeof(void *)); + bprm->envc = count(envp, MAX_ARG_STRINGS); if ((retval = bprm->envc) < 0) goto out_mm; @@ -1217,9 +1403,8 @@ int do_execve(char * filename, retval = search_binary_handler(bprm,regs); if (retval >= 0) { - free_arg_pages(bprm); - /* execve success */ + free_arg_pages(bprm); security_bprm_free(bprm); acct_update_integrals(current); kfree(bprm); @@ -1227,26 +1412,19 @@ int do_execve(char * filename, } out: - /* Something went wrong, return the inode and free the argument pages*/ - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page * page = bprm->page[i]; - if (page) - __free_page(page); - } - + free_arg_pages(bprm); if (bprm->security) security_bprm_free(bprm); out_mm: if (bprm->mm) - mmdrop(bprm->mm); + mmput (bprm->mm); out_file: if (bprm->file) { allow_write_access(bprm->file); fput(bprm->file); } - out_kfree: kfree(bprm); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index a0b209cd576..91c8c07fe8b 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -6,11 +6,13 @@ struct pt_regs; /* - * MAX_ARG_PAGES defines the number of pages allocated for arguments - * and envelope for the new program. 32 should suffice, this gives - * a maximum env+arg of 128kB w/4KB pages! + * These are the maximum length and maximum number of strings passed to the + * execve() system call. MAX_ARG_STRLEN is essentially random but serves to + * prevent the kernel from being unduly impacted by misaddressed pointers. + * MAX_ARG_STRINGS is chosen to fit in a signed 32-bit integer. 
*/ -#define MAX_ARG_PAGES 32 +#define MAX_ARG_STRLEN (PAGE_SIZE * 32) +#define MAX_ARG_STRINGS 0x7FFFFFFF /* sizeof(linux_binprm->buf) */ #define BINPRM_BUF_SIZE 128 @@ -24,7 +26,12 @@ struct pt_regs; */ struct linux_binprm{ char buf[BINPRM_BUF_SIZE]; +#ifdef CONFIG_MMU + struct vm_area_struct *vma; +#else +# define MAX_ARG_PAGES 32 struct page *page[MAX_ARG_PAGES]; +#endif struct mm_struct *mm; unsigned long p; /* current top of mem */ int sh_bang; @@ -69,7 +76,7 @@ extern int register_binfmt(struct linux_binfmt *); extern int unregister_binfmt(struct linux_binfmt *); extern int prepare_binprm(struct linux_binprm *); -extern void remove_arg_zero(struct linux_binprm *); +extern int __must_check remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *,struct pt_regs *); extern int flush_old_exec(struct linux_binprm * bprm); @@ -86,6 +93,7 @@ extern int suid_dumpable; extern int setup_arg_pages(struct linux_binprm * bprm, unsigned long stack_top, int executable_stack); +extern int bprm_mm_init(struct linux_binprm *bprm); extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); extern void compute_creds(struct linux_binprm *binprm); extern int do_coredump(long signr, int exit_code, struct pt_regs * regs); diff --git a/include/linux/mm.h b/include/linux/mm.h index 50a0ed1d180..c456c3a1c28 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -808,7 +808,6 @@ static inline int handle_mm_fault(struct mm_struct *mm, extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); -void install_arg_page(struct vm_area_struct *, struct page *, unsigned long); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); @@ -825,9 +824,15 @@ int FASTCALL(set_page_dirty(struct page *page)); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); +extern unsigned long move_page_tables(struct vm_area_struct *vma, + unsigned long old_addr, struct vm_area_struct *new_vma, + unsigned long new_addr, unsigned long len); extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr); +extern int mprotect_fixup(struct vm_area_struct *vma, + struct vm_area_struct **pprev, unsigned long start, + unsigned long end, unsigned long newflags); /* * A callback you can register to apply pressure to ageable caches. @@ -1159,6 +1164,8 @@ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); #ifdef CONFIG_IA64 extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); #endif +extern int expand_stack_downwards(struct vm_area_struct *vma, + unsigned long address); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. 
*/ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 535586fc498..145cbb79c4b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -843,7 +843,7 @@ static void audit_log_execve_info(struct audit_buffer *ab, return; /* execve failed, no additional info */ for (i = 0; i < axi->argc; i++, p += len) { - len = strnlen_user(p, MAX_ARG_PAGES*PAGE_SIZE); + len = strnlen_user(p, MAX_ARG_STRLEN); /* * We just created this mm, if we can't find the strings * we just copied into it something is _very_ wrong. Similar diff --git a/mm/mmap.c b/mm/mmap.c index 724f342bcf8..7afc7a7cec6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1571,33 +1571,11 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ -#ifdef CONFIG_STACK_GROWSUP -int expand_stack(struct vm_area_struct *vma, unsigned long address) -{ - return expand_upwards(vma, address); -} - -struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma, *prev; - - addr &= PAGE_MASK; - vma = find_vma_prev(mm, addr, &prev); - if (vma && (vma->vm_start <= addr)) - return vma; - if (!prev || expand_stack(prev, addr)) - return NULL; - if (prev->vm_flags & VM_LOCKED) { - make_pages_present(addr, prev->vm_end); - } - return prev; -} -#else /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -int expand_stack(struct vm_area_struct *vma, unsigned long address) +static inline int expand_downwards(struct vm_area_struct *vma, + unsigned long address) { int error; @@ -1634,6 +1612,38 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) return error; } +int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address) +{ + return expand_downwards(vma, address); +} + +#ifdef CONFIG_STACK_GROWSUP +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return expand_upwards(vma, address); +} + +struct vm_area_struct * +find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma, *prev; + + addr &= PAGE_MASK; + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; + if (!prev || expand_stack(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) + make_pages_present(addr, prev->vm_end); + return prev; +} +#else +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return expand_downwards(vma, address); +} + struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr) { @@ -1651,9 +1661,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; - if (vma->vm_flags & VM_LOCKED) { + if (vma->vm_flags & VM_LOCKED) make_pages_present(addr, start); - } return vma; } #endif diff --git a/mm/mprotect.c b/mm/mprotect.c index 3b8f3c0c63f..e8346c30abe 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -128,7 +128,7 @@ static void change_protection(struct vm_area_struct *vma, flush_tlb_range(vma, start, end); } -static int +int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags) { diff --git a/mm/mremap.c b/mm/mremap.c index bc7c52efc71..8ea5c2412c6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -120,7 +120,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, #define LATENCY_LIMIT (64 * PAGE_SIZE) -static unsigned long 
move_page_tables(struct vm_area_struct *vma, +unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len) { -- cgit v1.2.3-70-g09d2 From 76fdbb25f963de5dc1e308325f0578a2f92b1c2d Mon Sep 17 00:00:00 2001 From: "Kawai, Hidehiro" Date: Thu, 19 Jul 2007 01:48:26 -0700 Subject: coredump masking: bound suid_dumpable sysctl This patch series is version 5 of the core dump masking feature, which controls which VMAs should be dumped based on their memory types and per-process flags. I adopted most of Andrew's suggestion at the previous version. He also suggested using system call instead of /proc// interface, I decided to use the latter continuously because adding new system call with pid argument will give a big impact on the kernel. You can access the per-process flags via /proc//coredump_filter interface. coredump_filter represents a bitmask of memory types, and if a bit is set, VMAs of corresponding memory type are written into a core file when the process is dumped. The bitmask is inherited from the parent process when a process is created. The original purpose is to avoid longtime system slowdown when a number of processes which share a huge shared memory are dumped at the same time. To achieve this purpose, this patch series adds an ability to suppress dumping anonymous shared memory for specified processes. In this version, three other memory types are also supported. Here are the coredump_filter bits: bit 0: anonymous private memory bit 1: anonymous shared memory bit 2: file-backed private memory bit 3: file-backed shared memory The default value of coredump_filter is 0x3. This means the new core dump routine has the same behavior as conventional behavior by default. In this version, coredump_filter bits and mm.dumpable are merged into mm.flags, and it is accessed by atomic bitops. The supported core file formats are ELF and ELF-FDPIC. ELF has been tested, but ELF-FDPIC has not been built and tested because I don't have the test environment. This patch limits a value of suid_dumpable sysctl to the range of 0 to 2. Signed-off-by: Hidehiro Kawai Cc: Alan Cox Cc: David Howells Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8db41764e2a..2aaa3f98185 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -733,6 +733,7 @@ static ctl_table kern_table[] = { /* Constants for minimum and maximum testing in vm_table. We use these as one-element integer vectors. */ static int zero; +static int two = 2; static int one_hundred = 100; @@ -1123,7 +1124,10 @@ static ctl_table fs_table[] = { .data = &lease_break_time, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &two, }, { .ctl_name = FS_AIO_NR, -- cgit v1.2.3-70-g09d2 From 6c5d523826dc639df709ed0f88c5d2ce25379652 Mon Sep 17 00:00:00 2001 From: "Kawai, Hidehiro" Date: Thu, 19 Jul 2007 01:48:27 -0700 Subject: coredump masking: reimplementation of dumpable using two flags This patch changes mm_struct.dumpable to a pair of bit flags. set_dumpable() converts three-value dumpable to two flags and stores it into lower two bits of mm_struct.flags instead of mm_struct.dumpable. get_dumpable() behaves in the opposite way. 
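From user space the visible interface is unchanged: prctl(PR_GET_DUMPABLE) and prctl(PR_SET_DUMPABLE) still speak the traditional 0/1/2 values, and only the in-kernel representation moves into mm->flags. A minimal sketch (illustrative only, not part of this patch):

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	/*
	 * Traditional three-value interface; the two-bit encoding in
	 * mm->flags is purely internal to the kernel.
	 */
	int old = prctl(PR_GET_DUMPABLE, 0, 0, 0, 0);

	prctl(PR_SET_DUMPABLE, 0, 0, 0, 0);	/* suppress core dumps */
	printf("dumpable: was %d, now %d\n", old,
	       prctl(PR_GET_DUMPABLE, 0, 0, 0, 0));
	return 0;
}
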
[akpm@linux-foundation.org: export set_dumpable] Signed-off-by: Hidehiro Kawai Cc: Alan Cox Cc: David Howells Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 62 +++++++++++++++++++++++++++++++++++++++----- fs/proc/base.c | 2 +- include/asm-ia64/processor.h | 4 +-- include/linux/sched.h | 9 ++++++- kernel/ptrace.c | 2 +- kernel/sys.c | 24 ++++++++--------- security/commoncap.c | 2 +- security/dummy.c | 2 +- 8 files changed, 82 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 498f2b3dca2..7bdea7937ee 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1058,9 +1058,9 @@ int flush_old_exec(struct linux_binprm * bprm) current->sas_ss_sp = current->sas_ss_size = 0; if (current->euid == current->uid && current->egid == current->gid) - current->mm->dumpable = 1; + set_dumpable(current->mm, 1); else - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); name = bprm->filename; @@ -1088,7 +1088,7 @@ int flush_old_exec(struct linux_binprm * bprm) file_permission(bprm->file, MAY_READ) || (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { suid_keys(current); - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); } /* An exec changes our domain. We are no longer part of the thread @@ -1665,6 +1665,56 @@ fail: return core_waiters; } +/* + * set_dumpable converts traditional three-value dumpable to two flags and + * stores them into mm->flags. It modifies lower two bits of mm->flags, but + * these bits are not changed atomically. So get_dumpable can observe the + * intermediate state. To avoid doing unexpected behavior, get get_dumpable + * return either old dumpable or new one by paying attention to the order of + * modifying the bits. + * + * dumpable | mm->flags (binary) + * old new | initial interim final + * ---------+----------------------- + * 0 1 | 00 01 01 + * 0 2 | 00 10(*) 11 + * 1 0 | 01 00 00 + * 1 2 | 01 11 11 + * 2 0 | 11 10(*) 00 + * 2 1 | 11 11 01 + * + * (*) get_dumpable regards interim value of 10 as 11. + */ +void set_dumpable(struct mm_struct *mm, int value) +{ + switch (value) { + case 0: + clear_bit(MMF_DUMPABLE, &mm->flags); + smp_wmb(); + clear_bit(MMF_DUMP_SECURELY, &mm->flags); + break; + case 1: + set_bit(MMF_DUMPABLE, &mm->flags); + smp_wmb(); + clear_bit(MMF_DUMP_SECURELY, &mm->flags); + break; + case 2: + set_bit(MMF_DUMP_SECURELY, &mm->flags); + smp_wmb(); + set_bit(MMF_DUMPABLE, &mm->flags); + break; + } +} +EXPORT_SYMBOL_GPL(set_dumpable); + +int get_dumpable(struct mm_struct *mm) +{ + int ret; + + ret = mm->flags & 0x3; + return (ret >= 2) ? 2 : ret; +} + int do_coredump(long signr, int exit_code, struct pt_regs * regs) { char corename[CORENAME_MAX_SIZE + 1]; @@ -1683,7 +1733,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) if (!binfmt || !binfmt->core_dump) goto fail; down_write(&mm->mmap_sem); - if (!mm->dumpable) { + if (!get_dumpable(mm)) { up_write(&mm->mmap_sem); goto fail; } @@ -1693,11 +1743,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) * process nor do we know its entire history. We only know it * was tainted so we dump it as root in mode 2. 
*/ - if (mm->dumpable == 2) { /* Setuid core dump mode */ + if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ current->fsuid = 0; /* Dump root private */ } - mm->dumpable = 0; + set_dumpable(mm, 0); retval = coredump_wait(exit_code); if (retval < 0) diff --git a/fs/proc/base.c b/fs/proc/base.c index 42cb4f5613b..49b3ab0175e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1014,7 +1014,7 @@ static int task_dumpable(struct task_struct *task) task_lock(task); mm = task->mm; if (mm) - dumpable = mm->dumpable; + dumpable = get_dumpable(mm); task_unlock(task); if(dumpable == 1) return 1; diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h index db81ba406ce..6251c76437d 100644 --- a/include/asm-ia64/processor.h +++ b/include/asm-ia64/processor.h @@ -295,9 +295,9 @@ struct thread_struct { regs->ar_bspstore = current->thread.rbs_bot; \ regs->ar_fpsr = FPSR_DEFAULT; \ regs->loadrs = 0; \ - regs->r8 = current->mm->dumpable; /* set "don't zap registers" flag */ \ + regs->r8 = get_dumpable(current->mm); /* set "don't zap registers" flag */ \ regs->r12 = new_sp - 16; /* allocate 16 byte scratch area */ \ - if (unlikely(!current->mm->dumpable)) { \ + if (unlikely(!get_dumpable(current->mm))) { \ /* \ * Zap scratch regs to avoid leaking bits between processes with different \ * uid/privileges. \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 731edaca8ff..8dbd0836640 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -345,6 +345,13 @@ typedef unsigned long mm_counter_t; (mm)->hiwater_vm = (mm)->total_vm; \ } while (0) +extern void set_dumpable(struct mm_struct *mm, int value); +extern int get_dumpable(struct mm_struct *mm); + +/* mm flags */ +#define MMF_DUMPABLE 0 /* core dump is permitted */ +#define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ + struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; @@ -402,7 +409,7 @@ struct mm_struct { unsigned int token_priority; unsigned int last_interval; - unsigned char dumpable:2; + unsigned long flags; /* Must use atomic bitops to access the bits */ /* coredumping support */ int core_waiters; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4a1745f1dad..82a558b655d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -142,7 +142,7 @@ static int may_attach(struct task_struct *task) return -EPERM; smp_rmb(); if (task->mm) - dumpable = task->mm->dumpable; + dumpable = get_dumpable(task->mm); if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; diff --git a/kernel/sys.c b/kernel/sys.c index d40e40a9446..08562f41976 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1036,7 +1036,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) return -EPERM; } if (new_egid != old_egid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } if (rgid != (gid_t) -1 || @@ -1066,13 +1066,13 @@ asmlinkage long sys_setgid(gid_t gid) if (capable(CAP_SETGID)) { if (old_egid != gid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->gid = current->egid = current->sgid = current->fsgid = gid; } else if ((gid == current->gid) || (gid == current->sgid)) { if (old_egid != gid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->egid = current->fsgid = gid; @@ -1103,7 +1103,7 @@ static int set_user(uid_t new_ruid, int dumpclear) switch_uid(new_user); if (dumpclear) { - 
current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->uid = new_ruid; @@ -1159,7 +1159,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) return -EAGAIN; if (new_euid != old_euid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->fsuid = current->euid = new_euid; @@ -1209,7 +1209,7 @@ asmlinkage long sys_setuid(uid_t uid) return -EPERM; if (old_euid != uid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->fsuid = current->euid = uid; @@ -1254,7 +1254,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) } if (euid != (uid_t) -1) { if (euid != current->euid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->euid = euid; @@ -1304,7 +1304,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) } if (egid != (gid_t) -1) { if (egid != current->egid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->egid = egid; @@ -1350,7 +1350,7 @@ asmlinkage long sys_setfsuid(uid_t uid) uid == current->suid || uid == current->fsuid || capable(CAP_SETUID)) { if (uid != old_fsuid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->fsuid = uid; @@ -1379,7 +1379,7 @@ asmlinkage long sys_setfsgid(gid_t gid) gid == current->sgid || gid == current->fsgid || capable(CAP_SETGID)) { if (gid != old_fsgid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->fsgid = gid; @@ -2176,14 +2176,14 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = put_user(current->pdeath_signal, (int __user *)arg2); break; case PR_GET_DUMPABLE: - error = current->mm->dumpable; + error = get_dumpable(current->mm); break; case PR_SET_DUMPABLE: if (arg2 < 0 || arg2 > 1) { error = -EINVAL; break; } - current->mm->dumpable = arg2; + set_dumpable(current->mm, arg2); break; case PR_SET_UNALIGN: diff --git a/security/commoncap.c b/security/commoncap.c index 384379ede4f..338606eb723 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -148,7 +148,7 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) if (bprm->e_uid != current->uid || bprm->e_gid != current->gid || !cap_issubset (new_permitted, current->cap_permitted)) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) { if (!capable(CAP_SETUID)) { diff --git a/security/dummy.c b/security/dummy.c index d6a112ce297..19d813d5e08 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -130,7 +130,7 @@ static void dummy_bprm_free_security (struct linux_binprm *bprm) static void dummy_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) { if (bprm->e_uid != current->uid || bprm->e_gid != current->gid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); if ((unsafe & ~LSM_UNSAFE_PTRACE_CAP) && !capable(CAP_SETUID)) { bprm->e_uid = current->uid; -- cgit v1.2.3-70-g09d2 From 3cb4a0bb1e773e3c41800b33a3f7dab32bd06c64 Mon Sep 17 00:00:00 2001 From: "Kawai, Hidehiro" Date: Thu, 19 Jul 2007 01:48:28 -0700 Subject: coredump masking: add an interface for core dump filter This patch adds an interface to set/reset flags which determines each memory segment should be dumped or not when a core file is generated. 
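A hypothetical user-space consumer of the new interface (illustrative only, not part of this patch) is sketched below; it clears bit 1 so that anonymous shared mappings -- typically the large shared-memory segments that motivated this work -- are omitted from any core dump of the calling process. The file format itself is described next.

#include <stdio.h>

int main(void)
{
	unsigned long filter;
	FILE *f = fopen("/proc/self/coredump_filter", "r+");

	if (!f) {
		perror("coredump_filter");
		return 1;
	}
	if (fscanf(f, "%lx", &filter) != 1) {
		fclose(f);
		return 1;
	}
	rewind(f);
	filter &= ~(1UL << 1);	/* bit 1: anonymous shared memory */
	fprintf(f, "%lx", filter);
	fclose(f);

	printf("coredump_filter is now %#lx\n", filter);
	return 0;
}
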
/proc//coredump_filter file is provided to access the flags. You can change the flag status for a particular process by writing to or reading from the file. The flag status is inherited to the child process when it is created. Signed-off-by: Hidehiro Kawai Cc: Alan Cox Cc: David Howells Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 14 ++++++++ kernel/fork.c | 2 ++ 3 files changed, 105 insertions(+) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 49b3ab0175e..3c77d5a64e7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -72,6 +72,7 @@ #include #include #include +#include #include "internal.h" /* NOTE: @@ -1785,6 +1786,91 @@ static const struct inode_operations proc_attr_dir_inode_operations = { #endif +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) +static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + struct mm_struct *mm; + char buffer[PROC_NUMBUF]; + size_t len; + int ret; + + if (!task) + return -ESRCH; + + ret = 0; + mm = get_task_mm(task); + if (mm) { + len = snprintf(buffer, sizeof(buffer), "%08lx\n", + ((mm->flags & MMF_DUMP_FILTER_MASK) >> + MMF_DUMP_FILTER_SHIFT)); + mmput(mm); + ret = simple_read_from_buffer(buf, count, ppos, buffer, len); + } + + put_task_struct(task); + + return ret; +} + +static ssize_t proc_coredump_filter_write(struct file *file, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + struct task_struct *task; + struct mm_struct *mm; + char buffer[PROC_NUMBUF], *end; + unsigned int val; + int ret; + int i; + unsigned long mask; + + ret = -EFAULT; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + goto out_no_task; + + ret = -EINVAL; + val = (unsigned int)simple_strtoul(buffer, &end, 0); + if (*end == '\n') + end++; + if (end - buffer == 0) + goto out_no_task; + + ret = -ESRCH; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + goto out_no_task; + + ret = end - buffer; + mm = get_task_mm(task); + if (!mm) + goto out_no_mm; + + for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { + if (val & mask) + set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + else + clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + } + + mmput(mm); + out_no_mm: + put_task_struct(task); + out_no_task: + return ret; +} + +static const struct file_operations proc_coredump_filter_operations = { + .read = proc_coredump_filter_read, + .write = proc_coredump_filter_write, +}; +#endif + /* * /proc/self: */ @@ -2005,6 +2091,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), #endif +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) + REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), +#endif #ifdef CONFIG_TASK_IO_ACCOUNTING INF("io", S_IRUGO, pid_io_accounting), #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 8dbd0836640..94f624aef01 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -349,8 +349,22 @@ extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); /* mm flags */ +/* dumpable bits */ #define MMF_DUMPABLE 0 /* core dump is permitted */ #define MMF_DUMP_SECURELY 1 /* 
core file is readable only by root */ +#define MMF_DUMPABLE_BITS 2 + +/* coredump filter bits */ +#define MMF_DUMP_ANON_PRIVATE 2 +#define MMF_DUMP_ANON_SHARED 3 +#define MMF_DUMP_MAPPED_PRIVATE 4 +#define MMF_DUMP_MAPPED_SHARED 5 +#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS +#define MMF_DUMP_FILTER_BITS 4 +#define MMF_DUMP_FILTER_MASK \ + (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) +#define MMF_DUMP_FILTER_DEFAULT \ + ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED)) struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ diff --git a/kernel/fork.c b/kernel/fork.c index ba39bdb2a7b..46983899822 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -334,6 +334,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); + mm->flags = (current->mm) ? current->mm->flags + : MMF_DUMP_FILTER_DEFAULT; mm->core_waiters = 0; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); -- cgit v1.2.3-70-g09d2 From 01c55ed3260e130f152b7fbab2e18f23980b59a4 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 19 Jul 2007 01:48:32 -0700 Subject: kernel/relay.c: make functions static Signed-off-by: Adrian Bunk Cc: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index a615a8f513f..510fbbd7b50 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -80,7 +80,7 @@ static struct vm_operations_struct relay_file_mmap_ops = { * * Caller should already have grabbed mmap_sem. */ -int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) +static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) { unsigned long length = vma->vm_end - vma->vm_start; struct file *filp = vma->vm_file; @@ -145,7 +145,7 @@ depopulate: * * Returns channel buffer if successful, %NULL otherwise. */ -struct rchan_buf *relay_create_buf(struct rchan *chan) +static struct rchan_buf *relay_create_buf(struct rchan *chan) { struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); if (!buf) @@ -175,7 +175,7 @@ free_buf: * * Should only be called from kref_put(). */ -void relay_destroy_channel(struct kref *kref) +static void relay_destroy_channel(struct kref *kref) { struct rchan *chan = container_of(kref, struct rchan, kref); kfree(chan); @@ -185,7 +185,7 @@ void relay_destroy_channel(struct kref *kref) * relay_destroy_buf - destroy an rchan_buf struct and associated buffer * @buf: the buffer struct */ -void relay_destroy_buf(struct rchan_buf *buf) +static void relay_destroy_buf(struct rchan_buf *buf) { struct rchan *chan = buf->chan; unsigned int i; @@ -210,7 +210,7 @@ void relay_destroy_buf(struct rchan_buf *buf) * rchan_buf_struct and the channel buffer. Should only be called from * kref_put(). */ -void relay_remove_buf(struct kref *kref) +static void relay_remove_buf(struct kref *kref) { struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); buf->chan->cb->remove_buf_file(buf->dentry); @@ -223,11 +223,10 @@ void relay_remove_buf(struct kref *kref) * * Returns 1 if the buffer is empty, 0 otherwise. */ -int relay_buf_empty(struct rchan_buf *buf) +static int relay_buf_empty(struct rchan_buf *buf) { return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; } -EXPORT_SYMBOL_GPL(relay_buf_empty); /** * relay_buf_full - boolean, is the channel buffer full? 
-- cgit v1.2.3-70-g09d2 From da1a679cde9b12d6e331f43d2d92a234f2d1f9b0 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 19 Jul 2007 01:48:39 -0700 Subject: Add /sys/kernel/notes This patch adds the /sys/kernel/notes magic file. Reading this delivers the contents of the kernel's .notes section. This lets userland easily glean any detailed information about the running kernel's build that was stored there at compile time. Signed-off-by: Roland McGrath Cc: Andi Kleen Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 559deca5ed1..2565e1b6dd7 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -62,6 +62,28 @@ static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) KERNEL_ATTR_RO(kexec_crash_loaded); #endif /* CONFIG_KEXEC */ +/* + * Make /sys/kernel/notes give the raw contents of our kernel .notes section. + */ +extern const char __start_notes __attribute__((weak)); +extern const char __stop_notes __attribute__((weak)); +#define notes_size (&__stop_notes - &__start_notes) + +static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + memcpy(buf, &__start_notes + off, count); + return count; +} + +static struct bin_attribute notes_attr = { + .attr = { + .name = "notes", + .mode = S_IRUGO, + }, + .read = ¬es_read, +}; + decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); @@ -88,6 +110,12 @@ static int __init ksysfs_init(void) error = sysfs_create_group(&kernel_subsys.kobj, &kernel_attr_group); + if (!error && notes_size > 0) { + notes_attr.size = notes_size; + error = sysfs_create_bin_file(&kernel_subsys.kobj, + ¬es_attr); + } + return error; } -- cgit v1.2.3-70-g09d2 From ca58abcb4a6d52ee2db1b1130cea3ca2a76677b9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:53 -0700 Subject: lockdep: sanitise CONFIG_PROVE_LOCKING Ensure that all of the lock dependency tracking code is under CONFIG_PROVE_LOCKING. This allows us to use the held lock tracking code for other purposes. Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 13 ++++++++++++- kernel/spinlock.c | 4 ++-- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index edba2ffb43d..05c1261791f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -95,6 +95,7 @@ static int lockdep_initialized; unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +#ifdef CONFIG_PROVE_LOCKING /* * Allocate a lockdep entry. (assumes the graph_lock held, returns * with NULL on failure) @@ -111,6 +112,7 @@ static struct lock_list *alloc_list_entry(void) } return list_entries + nr_list_entries++; } +#endif /* * All data structures here are protected by the global debug_lock. 
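A rough userspace reader for the /sys/kernel/notes file added above (illustrative only, not part of any patch in this series; it assumes the usual 4-byte-aligned ELF note record layout from <elf.h> and an arbitrary 64KB upper bound on the section size):

	#include <elf.h>
	#include <stdio.h>
	#include <string.h>

	#define ALIGN4(x) (((x) + 3) & ~3u)

	int main(void)
	{
		static unsigned char buf[65536];
		size_t len, off = 0;
		FILE *f = fopen("/sys/kernel/notes", "rb");

		if (!f) {
			perror("/sys/kernel/notes");
			return 1;
		}
		len = fread(buf, 1, sizeof(buf), f);
		fclose(f);

		/* walk the raw ELF note records: header, padded name, padded desc */
		while (off + sizeof(Elf32_Nhdr) <= len) {
			Elf32_Nhdr nh;

			memcpy(&nh, buf + off, sizeof(nh));
			if (off + sizeof(nh) + nh.n_namesz > len)
				break;
			printf("note: name=%.*s type=%u descsz=%u\n",
			       (int)nh.n_namesz, buf + off + sizeof(nh),
			       nh.n_type, nh.n_descsz);
			off += sizeof(nh) + ALIGN4(nh.n_namesz) + ALIGN4(nh.n_descsz);
		}
		return 0;
	}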
@@ -140,7 +142,9 @@ LIST_HEAD(all_lock_classes); static struct list_head classhash_table[CLASSHASH_SIZE]; unsigned long nr_lock_chains; +#ifdef CONFIG_PROVE_LOCKING static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +#endif /* * We put the lock dependency chains into a hash-table as well, to cache @@ -482,6 +486,7 @@ static void print_lock_dependencies(struct lock_class *class, int depth) } } +#ifdef CONFIG_PROVE_LOCKING /* * Add a new dependency to the head of the list: */ @@ -541,6 +546,7 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) return 0; } +#endif static void print_kernel_version(void) { @@ -549,6 +555,7 @@ static void print_kernel_version(void) init_utsname()->version); } +#ifdef CONFIG_PROVE_LOCKING /* * When a circular dependency is detected, print the * header first: @@ -639,6 +646,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) } return 1; } +#endif static int very_verbose(struct lock_class *class) { @@ -823,6 +831,7 @@ check_usage(struct task_struct *curr, struct held_lock *prev, #endif +#ifdef CONFIG_PROVE_LOCKING static int print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) @@ -1087,7 +1096,7 @@ out_bug: return 0; } - +#endif /* * Is this the address of a static object: @@ -1307,6 +1316,7 @@ out_unlock_set: return class; } +#ifdef CONFIG_PROVE_LOCKING /* * Look up a dependency chain. If the key is not present yet then * add it and return 1 - in this case the new dependency chain is @@ -1381,6 +1391,7 @@ cache_hit: return 1; } +#endif /* * We are building curr_chain_key incrementally, so double-check diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 2c6c2bf8551..cd93bfe3f10 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -88,7 +88,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) * _raw_spin_lock_flags() code, because lockdep assumes * that interrupts are not re-enabled during lock-acquire: */ -#ifdef CONFIG_PROVE_LOCKING +#ifdef CONFIG_LOCKDEP _raw_spin_lock(lock); #else _raw_spin_lock_flags(lock, &flags); @@ -305,7 +305,7 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas * _raw_spin_lock_flags() code, because lockdep assumes * that interrupts are not re-enabled during lock-acquire: */ -#ifdef CONFIG_PROVE_SPIN_LOCKING +#ifdef CONFIG_LOCKDEP _raw_spin_lock(lock); #else _raw_spin_lock_flags(lock, &flags); -- cgit v1.2.3-70-g09d2 From 8e18257d29238311e82085152741f0c3aa18b74d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:54 -0700 Subject: lockdep: reduce the ifdeffery Move code around to get fewer but larger #ifdef sections. Break some in-function #ifdefs out into their own functions. Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 1657 ++++++++++++++++++++++++++----------------------- kernel/lockdep_proc.c | 2 + 2 files changed, 873 insertions(+), 786 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 05c1261791f..87ac3642507 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -95,25 +95,6 @@ static int lockdep_initialized; unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; -#ifdef CONFIG_PROVE_LOCKING -/* - * Allocate a lockdep entry. 
(assumes the graph_lock held, returns - * with NULL on failure) - */ -static struct lock_list *alloc_list_entry(void) -{ - if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { - if (!debug_locks_off_graph_unlock()) - return NULL; - - printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); - return NULL; - } - return list_entries + nr_list_entries++; -} -#endif - /* * All data structures here are protected by the global debug_lock. * @@ -141,11 +122,6 @@ LIST_HEAD(all_lock_classes); static struct list_head classhash_table[CLASSHASH_SIZE]; -unsigned long nr_lock_chains; -#ifdef CONFIG_PROVE_LOCKING -static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; -#endif - /* * We put the lock dependency chains into a hash-table as well, to cache * their existence: @@ -227,26 +203,6 @@ static int verbose(struct lock_class *class) return 0; } -#ifdef CONFIG_TRACE_IRQFLAGS - -static int hardirq_verbose(struct lock_class *class) -{ -#if HARDIRQ_VERBOSE - return class_filter(class); -#endif - return 0; -} - -static int softirq_verbose(struct lock_class *class) -{ -#if SOFTIRQ_VERBOSE - return class_filter(class); -#endif - return 0; -} - -#endif - /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the graph_lock. @@ -486,151 +442,392 @@ static void print_lock_dependencies(struct lock_class *class, int depth) } } -#ifdef CONFIG_PROVE_LOCKING +static void print_kernel_version(void) +{ + printk("%s %.*s\n", init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); +} + +static int very_verbose(struct lock_class *class) +{ +#if VERY_VERBOSE + return class_filter(class); +#endif + return 0; +} + /* - * Add a new dependency to the head of the list: + * Is this the address of a static object: */ -static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip, int distance) +static int static_obj(void *obj) { - struct lock_list *entry; + unsigned long start = (unsigned long) &_stext, + end = (unsigned long) &_end, + addr = (unsigned long) obj; +#ifdef CONFIG_SMP + int i; +#endif + /* - * Lock not present yet - get a new dependency struct and - * add it to the list: + * static variable? */ - entry = alloc_list_entry(); - if (!entry) - return 0; - - entry->class = this; - entry->distance = distance; - if (!save_trace(&entry->trace)) - return 0; + if ((addr >= start) && (addr < end)) + return 1; +#ifdef CONFIG_SMP /* - * Since we never remove from the dependency list, the list can - * be walked lockless by other CPUs, it's only allocation - * that must be protected by the spinlock. But this also means - * we must make new entries visible only once writes to the - * entry become visible - hence the RCU op: + * percpu var? */ - list_add_tail_rcu(&entry->entry, head); - - return 1; -} - -/* - * Recursive, forwards-direction lock-dependency checking, used for - * both noncyclic checking and for hardirq-unsafe/softirq-unsafe - * checking. - * - * (to keep the stackframe of the recursive functions small we - * use these global variables, and we also mark various helper - * functions as noinline.) 
- */ -static struct held_lock *check_source, *check_target; - -/* - * Print a dependency chain entry (this is only done when a deadlock - * has been detected): - */ -static noinline int -print_circular_bug_entry(struct lock_list *target, unsigned int depth) -{ - if (debug_locks_silent) - return 0; - printk("\n-> #%u", depth); - print_lock_name(target->class); - printk(":\n"); - print_stack_trace(&target->trace, 6); + for_each_possible_cpu(i) { + start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); + end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM + + per_cpu_offset(i); - return 0; -} + if ((addr >= start) && (addr < end)) + return 1; + } #endif -static void print_kernel_version(void) -{ - printk("%s %.*s\n", init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + /* + * module var? + */ + return is_module_address(addr); } -#ifdef CONFIG_PROVE_LOCKING /* - * When a circular dependency is detected, print the - * header first: + * To make lock name printouts unique, we calculate a unique + * class->name_version generation counter: */ -static noinline int -print_circular_bug_header(struct lock_list *entry, unsigned int depth) +static int count_matching_names(struct lock_class *new_class) { - struct task_struct *curr = current; + struct lock_class *class; + int count = 0; - if (!debug_locks_off_graph_unlock() || debug_locks_silent) + if (!new_class->name) return 0; - printk("\n=======================================================\n"); - printk( "[ INFO: possible circular locking dependency detected ]\n"); - print_kernel_version(); - printk( "-------------------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", - curr->comm, curr->pid); - print_lock(check_source); - printk("\nbut task is already holding lock:\n"); - print_lock(check_target); - printk("\nwhich lock already depends on the new lock.\n\n"); - printk("\nthe existing dependency chain (in reverse order) is:\n"); - - print_circular_bug_entry(entry, depth); + list_for_each_entry(class, &all_lock_classes, lock_entry) { + if (new_class->key - new_class->subclass == class->key) + return class->name_version; + if (class->name && !strcmp(class->name, new_class->name)) + count = max(count, class->name_version); + } - return 0; + return count + 1; } -static noinline int print_circular_bug_tail(void) +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. + */ +static inline struct lock_class * +look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { - struct task_struct *curr = current; - struct lock_list this; - - if (debug_locks_silent) - return 0; - - this.class = check_source->class; - if (!save_trace(&this.trace)) - return 0; - - print_circular_bug_entry(&this, 0); + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; - printk("\nother info that might help us debug this:\n\n"); - lockdep_print_held_locks(curr); +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * If the architecture calls into lockdep before initializing + * the hashes then we'll warn about it later. 
(we cannot printk + * right now) + */ + if (unlikely(!lockdep_initialized)) { + lockdep_init(); + lockdep_init_error = 1; + } +#endif - printk("\nstack backtrace:\n"); - dump_stack(); + /* + * Static locks do not have their class-keys yet - for them the key + * is the lock object itself: + */ + if (unlikely(!lock->key)) + lock->key = (void *)lock; - return 0; -} + /* + * NOTE: the class-key must be unique. For dynamic locks, a static + * lock_class_key variable is passed in through the mutex_init() + * (or spin_lock_init()) call - which acts as the key. For static + * locks we use the lock object itself as the key. + */ + BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class)); -#define RECURSION_LIMIT 40 + key = lock->key->subkeys + subclass; -static int noinline print_infinite_recursion_bug(void) -{ - if (!debug_locks_off_graph_unlock()) - return 0; + hash_head = classhashentry(key); - WARN_ON(1); + /* + * We can walk the hash lockfree, because the hash only + * grows, and we are careful when adding entries to the end: + */ + list_for_each_entry(class, hash_head, hash_entry) + if (class->key == key) + return class; - return 0; + return NULL; } /* - * Prove that the dependency graph starting at can not - * lead to . Print an error and return 0 if it does. + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. */ -static noinline int -check_noncircular(struct lock_class *source, unsigned int depth) +static inline struct lock_class * +register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { - struct lock_list *entry; - - debug_atomic_inc(&nr_cyclic_check_recursions); - if (depth > max_recursion_depth) + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; + unsigned long flags; + + class = look_up_lock_class(lock, subclass); + if (likely(class)) + return class; + + /* + * Debug-check: all keys must be persistent! 
+ */ + if (!static_obj(lock->key)) { + debug_locks_off(); + printk("INFO: trying to register non-static key.\n"); + printk("the code is fine but needs lockdep annotation.\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); + + return NULL; + } + + key = lock->key->subkeys + subclass; + hash_head = classhashentry(key); + + raw_local_irq_save(flags); + if (!graph_lock()) { + raw_local_irq_restore(flags); + return NULL; + } + /* + * We have to do the hash-walk again, to avoid races + * with another CPU: + */ + list_for_each_entry(class, hash_head, hash_entry) + if (class->key == key) + goto out_unlock_set; + /* + * Allocate a new key from the static array, and add it to + * the hash: + */ + if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { + if (!debug_locks_off_graph_unlock()) { + raw_local_irq_restore(flags); + return NULL; + } + raw_local_irq_restore(flags); + + printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); + printk("turning off the locking correctness validator.\n"); + return NULL; + } + class = lock_classes + nr_lock_classes++; + debug_atomic_inc(&nr_unused_locks); + class->key = key; + class->name = lock->name; + class->subclass = subclass; + INIT_LIST_HEAD(&class->lock_entry); + INIT_LIST_HEAD(&class->locks_before); + INIT_LIST_HEAD(&class->locks_after); + class->name_version = count_matching_names(class); + /* + * We use RCU's safe list-add method to make + * parallel walking of the hash-list safe: + */ + list_add_tail_rcu(&class->hash_entry, hash_head); + + if (verbose(class)) { + graph_unlock(); + raw_local_irq_restore(flags); + + printk("\nnew class %p: %s", class->key, class->name); + if (class->name_version > 1) + printk("#%d", class->name_version); + printk("\n"); + dump_stack(); + + raw_local_irq_save(flags); + if (!graph_lock()) { + raw_local_irq_restore(flags); + return NULL; + } + } +out_unlock_set: + graph_unlock(); + raw_local_irq_restore(flags); + + if (!subclass || force) + lock->class_cache = class; + + if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) + return NULL; + + return class; +} + +#ifdef CONFIG_PROVE_LOCKING +/* + * Allocate a lockdep entry. (assumes the graph_lock held, returns + * with NULL on failure) + */ +static struct lock_list *alloc_list_entry(void) +{ + if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { + if (!debug_locks_off_graph_unlock()) + return NULL; + + printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); + printk("turning off the locking correctness validator.\n"); + return NULL; + } + return list_entries + nr_list_entries++; +} + +/* + * Add a new dependency to the head of the list: + */ +static int add_lock_to_list(struct lock_class *class, struct lock_class *this, + struct list_head *head, unsigned long ip, int distance) +{ + struct lock_list *entry; + /* + * Lock not present yet - get a new dependency struct and + * add it to the list: + */ + entry = alloc_list_entry(); + if (!entry) + return 0; + + entry->class = this; + entry->distance = distance; + if (!save_trace(&entry->trace)) + return 0; + + /* + * Since we never remove from the dependency list, the list can + * be walked lockless by other CPUs, it's only allocation + * that must be protected by the spinlock. But this also means + * we must make new entries visible only once writes to the + * entry become visible - hence the RCU op: + */ + list_add_tail_rcu(&entry->entry, head); + + return 1; +} + +/* + * Recursive, forwards-direction lock-dependency checking, used for + * both noncyclic checking and for hardirq-unsafe/softirq-unsafe + * checking. 
+ * + * (to keep the stackframe of the recursive functions small we + * use these global variables, and we also mark various helper + * functions as noinline.) + */ +static struct held_lock *check_source, *check_target; + +/* + * Print a dependency chain entry (this is only done when a deadlock + * has been detected): + */ +static noinline int +print_circular_bug_entry(struct lock_list *target, unsigned int depth) +{ + if (debug_locks_silent) + return 0; + printk("\n-> #%u", depth); + print_lock_name(target->class); + printk(":\n"); + print_stack_trace(&target->trace, 6); + + return 0; +} + +/* + * When a circular dependency is detected, print the + * header first: + */ +static noinline int +print_circular_bug_header(struct lock_list *entry, unsigned int depth) +{ + struct task_struct *curr = current; + + if (!debug_locks_off_graph_unlock() || debug_locks_silent) + return 0; + + printk("\n=======================================================\n"); + printk( "[ INFO: possible circular locking dependency detected ]\n"); + print_kernel_version(); + printk( "-------------------------------------------------------\n"); + printk("%s/%d is trying to acquire lock:\n", + curr->comm, curr->pid); + print_lock(check_source); + printk("\nbut task is already holding lock:\n"); + print_lock(check_target); + printk("\nwhich lock already depends on the new lock.\n\n"); + printk("\nthe existing dependency chain (in reverse order) is:\n"); + + print_circular_bug_entry(entry, depth); + + return 0; +} + +static noinline int print_circular_bug_tail(void) +{ + struct task_struct *curr = current; + struct lock_list this; + + if (debug_locks_silent) + return 0; + + this.class = check_source->class; + if (!save_trace(&this.trace)) + return 0; + + print_circular_bug_entry(&this, 0); + + printk("\nother info that might help us debug this:\n\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +#define RECURSION_LIMIT 40 + +static int noinline print_infinite_recursion_bug(void) +{ + if (!debug_locks_off_graph_unlock()) + return 0; + + WARN_ON(1); + + return 0; +} + +/* + * Prove that the dependency graph starting at can not + * lead to . Print an error and return 0 if it does. 
+ */ +static noinline int +check_noncircular(struct lock_class *source, unsigned int depth) +{ + struct lock_list *entry; + + debug_atomic_inc(&nr_cyclic_check_recursions); + if (depth > max_recursion_depth) max_recursion_depth = depth; if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); @@ -646,17 +843,8 @@ check_noncircular(struct lock_class *source, unsigned int depth) } return 1; } -#endif -static int very_verbose(struct lock_class *class) -{ -#if VERY_VERBOSE - return class_filter(class); -#endif - return 0; -} #ifdef CONFIG_TRACE_IRQFLAGS - /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency @@ -829,9 +1017,80 @@ check_usage(struct task_struct *curr, struct held_lock *prev, bit_backwards, bit_forwards, irqclass); } +static int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + /* + * Prove that the new dependency does not connect a hardirq-safe + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, + LOCK_ENABLED_HARDIRQS, "hard")) + return 0; + + /* + * Prove that the new dependency does not connect a hardirq-safe-read + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, + LOCK_ENABLED_HARDIRQS, "hard-read")) + return 0; + + /* + * Prove that the new dependency does not connect a softirq-safe + * lock with a softirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; + /* + * Prove that the new dependency does not connect a softirq-safe-read + * lock with a softirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; + + return 1; +} + +static void inc_chains(void) +{ + if (current->hardirq_context) + nr_hardirq_chains++; + else { + if (current->softirq_context) + nr_softirq_chains++; + else + nr_process_chains++; + } +} + +#else + +static inline int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + return 1; +} + +static inline void inc_chains(void) +{ + nr_process_chains++; +} + #endif -#ifdef CONFIG_PROVE_LOCKING static int print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) @@ -931,46 +1190,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (!(check_noncircular(next->class, 0))) return print_circular_bug_tail(); -#ifdef CONFIG_TRACE_IRQFLAGS - /* - * Prove that the new dependency does not connect a hardirq-safe - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, - LOCK_ENABLED_HARDIRQS, "hard")) - return 0; - - /* - * Prove that the new dependency does not connect a hardirq-safe-read - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting 
at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, - LOCK_ENABLED_HARDIRQS, "hard-read")) + if (!check_prev_add_irq(curr, prev, next)) return 0; - /* - * Prove that the new dependency does not connect a softirq-safe - * lock with a softirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, - LOCK_ENABLED_SOFTIRQS, "soft")) - return 0; - /* - * Prove that the new dependency does not connect a softirq-safe-read - * lock with a softirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, - LOCK_ENABLED_SOFTIRQS, "soft")) - return 0; -#endif /* * For recursive read-locks we do all the dependency checks, * but we dont store read-triggered dependencies (only @@ -1013,310 +1235,93 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return 0; /* - * Debugging printouts: - */ - if (verbose(prev->class) || verbose(next->class)) { - graph_unlock(); - printk("\n new dependency: "); - print_lock_name(prev->class); - printk(" => "); - print_lock_name(next->class); - printk("\n"); - dump_stack(); - return graph_lock(); - } - return 1; -} - -/* - * Add the dependency to all directly-previous locks that are 'relevant'. - * The ones that are relevant are (in increasing distance from curr): - * all consecutive trylock entries and the final non-trylock entry - or - * the end of this context's lock-chain - whichever comes first. - */ -static int -check_prevs_add(struct task_struct *curr, struct held_lock *next) -{ - int depth = curr->lockdep_depth; - struct held_lock *hlock; - - /* - * Debugging checks. - * - * Depth must not be zero for a non-head lock: - */ - if (!depth) - goto out_bug; - /* - * At least two relevant locks must exist for this - * to be a head: - */ - if (curr->held_locks[depth].irq_context != - curr->held_locks[depth-1].irq_context) - goto out_bug; - - for (;;) { - int distance = curr->lockdep_depth - depth + 1; - hlock = curr->held_locks + depth-1; - /* - * Only non-recursive-read entries get new dependencies - * added: - */ - if (hlock->read != 2) { - if (!check_prev_add(curr, hlock, next, distance)) - return 0; - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; - } - depth--; - /* - * End of lock-stack? - */ - if (!depth) - break; - /* - * Stop the search if we cross into another context: - */ - if (curr->held_locks[depth].irq_context != - curr->held_locks[depth-1].irq_context) - break; - } - return 1; -out_bug: - if (!debug_locks_off_graph_unlock()) - return 0; - - WARN_ON(1); - - return 0; -} -#endif - -/* - * Is this the address of a static object: - */ -static int static_obj(void *obj) -{ - unsigned long start = (unsigned long) &_stext, - end = (unsigned long) &_end, - addr = (unsigned long) obj; -#ifdef CONFIG_SMP - int i; -#endif - - /* - * static variable? - */ - if ((addr >= start) && (addr < end)) - return 1; - -#ifdef CONFIG_SMP - /* - * percpu var? 
- */ - for_each_possible_cpu(i) { - start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); - end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM - + per_cpu_offset(i); - - if ((addr >= start) && (addr < end)) - return 1; - } -#endif - - /* - * module var? - */ - return is_module_address(addr); -} - -/* - * To make lock name printouts unique, we calculate a unique - * class->name_version generation counter: - */ -static int count_matching_names(struct lock_class *new_class) -{ - struct lock_class *class; - int count = 0; - - if (!new_class->name) - return 0; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - if (new_class->key - new_class->subclass == class->key) - return class->name_version; - if (class->name && !strcmp(class->name, new_class->name)) - count = max(count, class->name_version); - } - - return count + 1; -} - -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ -static inline struct lock_class * -look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) -{ - struct lockdep_subclass_key *key; - struct list_head *hash_head; - struct lock_class *class; - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * If the architecture calls into lockdep before initializing - * the hashes then we'll warn about it later. (we cannot printk - * right now) - */ - if (unlikely(!lockdep_initialized)) { - lockdep_init(); - lockdep_init_error = 1; - } -#endif - - /* - * Static locks do not have their class-keys yet - for them the key - * is the lock object itself: - */ - if (unlikely(!lock->key)) - lock->key = (void *)lock; - - /* - * NOTE: the class-key must be unique. For dynamic locks, a static - * lock_class_key variable is passed in through the mutex_init() - * (or spin_lock_init()) call - which acts as the key. For static - * locks we use the lock object itself as the key. - */ - BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class)); - - key = lock->key->subkeys + subclass; - - hash_head = classhashentry(key); - - /* - * We can walk the hash lockfree, because the hash only - * grows, and we are careful when adding entries to the end: - */ - list_for_each_entry(class, hash_head, hash_entry) - if (class->key == key) - return class; - - return NULL; -} - -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ -static inline struct lock_class * -register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) -{ - struct lockdep_subclass_key *key; - struct list_head *hash_head; - struct lock_class *class; - unsigned long flags; - - class = look_up_lock_class(lock, subclass); - if (likely(class)) - return class; - - /* - * Debug-check: all keys must be persistent! 
- */ - if (!static_obj(lock->key)) { - debug_locks_off(); - printk("INFO: trying to register non-static key.\n"); - printk("the code is fine but needs lockdep annotation.\n"); - printk("turning off the locking correctness validator.\n"); + * Debugging printouts: + */ + if (verbose(prev->class) || verbose(next->class)) { + graph_unlock(); + printk("\n new dependency: "); + print_lock_name(prev->class); + printk(" => "); + print_lock_name(next->class); + printk("\n"); dump_stack(); - - return NULL; + return graph_lock(); } + return 1; +} - key = lock->key->subkeys + subclass; - hash_head = classhashentry(key); +/* + * Add the dependency to all directly-previous locks that are 'relevant'. + * The ones that are relevant are (in increasing distance from curr): + * all consecutive trylock entries and the final non-trylock entry - or + * the end of this context's lock-chain - whichever comes first. + */ +static int +check_prevs_add(struct task_struct *curr, struct held_lock *next) +{ + int depth = curr->lockdep_depth; + struct held_lock *hlock; - raw_local_irq_save(flags); - if (!graph_lock()) { - raw_local_irq_restore(flags); - return NULL; - } - /* - * We have to do the hash-walk again, to avoid races - * with another CPU: - */ - list_for_each_entry(class, hash_head, hash_entry) - if (class->key == key) - goto out_unlock_set; /* - * Allocate a new key from the static array, and add it to - * the hash: + * Debugging checks. + * + * Depth must not be zero for a non-head lock: */ - if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { - if (!debug_locks_off_graph_unlock()) { - raw_local_irq_restore(flags); - return NULL; - } - raw_local_irq_restore(flags); - - printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); - printk("turning off the locking correctness validator.\n"); - return NULL; - } - class = lock_classes + nr_lock_classes++; - debug_atomic_inc(&nr_unused_locks); - class->key = key; - class->name = lock->name; - class->subclass = subclass; - INIT_LIST_HEAD(&class->lock_entry); - INIT_LIST_HEAD(&class->locks_before); - INIT_LIST_HEAD(&class->locks_after); - class->name_version = count_matching_names(class); + if (!depth) + goto out_bug; /* - * We use RCU's safe list-add method to make - * parallel walking of the hash-list safe: + * At least two relevant locks must exist for this + * to be a head: */ - list_add_tail_rcu(&class->hash_entry, hash_head); - - if (verbose(class)) { - graph_unlock(); - raw_local_irq_restore(flags); - - printk("\nnew class %p: %s", class->key, class->name); - if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); - dump_stack(); + if (curr->held_locks[depth].irq_context != + curr->held_locks[depth-1].irq_context) + goto out_bug; - raw_local_irq_save(flags); - if (!graph_lock()) { - raw_local_irq_restore(flags); - return NULL; + for (;;) { + int distance = curr->lockdep_depth - depth + 1; + hlock = curr->held_locks + depth-1; + /* + * Only non-recursive-read entries get new dependencies + * added: + */ + if (hlock->read != 2) { + if (!check_prev_add(curr, hlock, next, distance)) + return 0; + /* + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: + */ + if (!hlock->trylock) + break; } + depth--; + /* + * End of lock-stack? 
+ */ + if (!depth) + break; + /* + * Stop the search if we cross into another context: + */ + if (curr->held_locks[depth].irq_context != + curr->held_locks[depth-1].irq_context) + break; } -out_unlock_set: - graph_unlock(); - raw_local_irq_restore(flags); - - if (!subclass || force) - lock->class_cache = class; + return 1; +out_bug: + if (!debug_locks_off_graph_unlock()) + return 0; - if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) - return NULL; + WARN_ON(1); - return class; + return 0; } -#ifdef CONFIG_PROVE_LOCKING +unsigned long nr_lock_chains; +static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; + /* * Look up a dependency chain. If the key is not present yet then * add it and return 1 - in this case the new dependency chain is @@ -1376,21 +1381,71 @@ cache_hit: chain->chain_key = chain_key; list_add_tail_rcu(&chain->entry, hash_head); debug_atomic_inc(&chain_lookup_misses); -#ifdef CONFIG_TRACE_IRQFLAGS - if (current->hardirq_context) - nr_hardirq_chains++; - else { - if (current->softirq_context) - nr_softirq_chains++; - else - nr_process_chains++; - } -#else - nr_process_chains++; -#endif + inc_chains(); + + return 1; +} + +static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, + struct held_lock *hlock, int chain_head) +{ + /* + * Trylock needs to maintain the stack of held locks, but it + * does not add new dependencies, because trylock can be done + * in any order. + * + * We look up the chain_key and do the O(N^2) check and update of + * the dependencies only if this is a new dependency chain. + * (If lookup_chain_cache() returns with 1 it acquires + * graph_lock for us) + */ + if (!hlock->trylock && (hlock->check == 2) && + lookup_chain_cache(curr->curr_chain_key, hlock->class)) { + /* + * Check whether last held lock: + * + * - is irq-safe, if this lock is irq-unsafe + * - is softirq-safe, if this lock is hardirq-unsafe + * + * And check whether the new lock's dependency graph + * could lead back to the previous lock. + * + * any of these scenarios could lead to a deadlock. 
If + * All validations + */ + int ret = check_deadlock(curr, hlock, lock, hlock->read); + + if (!ret) + return 0; + /* + * Mark recursive read, as we jump over it when + * building dependencies (just like we jump over + * trylock entries): + */ + if (ret == 2) + hlock->read = 2; + /* + * Add dependency only if this lock is not the head + * of the chain, and if it's not a secondary read-lock: + */ + if (!chain_head && ret != 2) + if (!check_prevs_add(curr, hlock)) + return 0; + graph_unlock(); + } else + /* after lookup_chain_cache(): */ + if (unlikely(!debug_locks)) + return 0; return 1; } +#else +static inline int validate_chain(struct task_struct *curr, + struct lockdep_map *lock, struct held_lock *hlock, + int chain_head) +{ + return 1; +} #endif /* @@ -1436,6 +1491,57 @@ static void check_chain_key(struct task_struct *curr) #endif } +static int +print_usage_bug(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) +{ + if (!debug_locks_off_graph_unlock() || debug_locks_silent) + return 0; + + printk("\n=================================\n"); + printk( "[ INFO: inconsistent lock state ]\n"); + print_kernel_version(); + printk( "---------------------------------\n"); + + printk("inconsistent {%s} -> {%s} usage.\n", + usage_str[prev_bit], usage_str[new_bit]); + + printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", + curr->comm, curr->pid, + trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, + trace_hardirqs_enabled(curr), + trace_softirqs_enabled(curr)); + print_lock(this); + + printk("{%s} state was registered at:\n", usage_str[prev_bit]); + print_stack_trace(this->class->usage_traces + prev_bit, 1); + + print_irqtrace_events(curr); + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Print out an error if an invalid bit is set: + */ +static inline int +valid_state(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) +{ + if (unlikely(this->class->usage_mask & (1 << bad_bit))) + return print_usage_bug(curr, this, bad_bit, new_bit); + return 1; +} + +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit); + #ifdef CONFIG_TRACE_IRQFLAGS /* @@ -1529,90 +1635,30 @@ void print_irqtrace_events(struct task_struct *curr) print_ip_sym(curr->softirq_disable_ip); } -#endif - -static int -print_usage_bug(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) +static int hardirq_verbose(struct lock_class *class) { - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n=================================\n"); - printk( "[ INFO: inconsistent lock state ]\n"); - print_kernel_version(); - printk( "---------------------------------\n"); - - printk("inconsistent {%s} -> {%s} usage.\n", - usage_str[prev_bit], usage_str[new_bit]); - - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", - curr->comm, curr->pid, - trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, - trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - trace_hardirqs_enabled(curr), - trace_softirqs_enabled(curr)); - print_lock(this); - - printk("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(this->class->usage_traces + prev_bit, 1); - - 
print_irqtrace_events(curr); - printk("\nother info that might help us debug this:\n"); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - +#if HARDIRQ_VERBOSE + return class_filter(class); +#endif return 0; } -/* - * Print out an error if an invalid bit is set: - */ -static inline int -valid_state(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) +static int softirq_verbose(struct lock_class *class) { - if (unlikely(this->class->usage_mask & (1 << bad_bit))) - return print_usage_bug(curr, this, bad_bit, new_bit); - return 1; +#if SOFTIRQ_VERBOSE + return class_filter(class); +#endif + return 0; } #define STRICT_READ_CHECKS 1 -/* - * Mark a lock with a usage bit, and validate the state transition: - */ -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) +static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { - unsigned int new_mask = 1 << new_bit, ret = 1; - - /* - * If already set then do not dirty the cacheline, - * nor do any checks: - */ - if (likely(this->class->usage_mask & new_mask)) - return 1; - - if (!graph_lock()) - return 0; - /* - * Make sure we didnt race: - */ - if (unlikely(this->class->usage_mask & new_mask)) { - graph_unlock(); - return 1; - } - - this->class->usage_mask |= new_mask; + int ret = 1; - if (!save_trace(this->class->usage_traces + new_bit)) - return 0; - - switch (new_bit) { -#ifdef CONFIG_TRACE_IRQFLAGS + switch(new_bit) { case LOCK_USED_IN_HARDIRQ: if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) return 0; @@ -1771,37 +1817,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (softirq_verbose(this->class)) ret = 2; break; -#endif - case LOCK_USED: - /* - * Add it to the global list of classes: - */ - list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); - debug_atomic_dec(&nr_unused_locks); - break; default: - if (!debug_locks_off_graph_unlock()) - return 0; WARN_ON(1); - return 0; - } - - graph_unlock(); - - /* - * We must printk outside of the graph_lock: - */ - if (ret == 2) { - printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); - print_lock(this); - print_irqtrace_events(curr); - dump_stack(); + break; } return ret; } -#ifdef CONFIG_TRACE_IRQFLAGS /* * Mark all held locks with a usage bit: */ @@ -1890,101 +1913,268 @@ void trace_hardirqs_on(void) if (!mark_held_locks(curr, 0)) return; - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(&hardirqs_on_events); + curr->hardirq_enable_ip = ip; + curr->hardirq_enable_event = ++curr->irq_events; + debug_atomic_inc(&hardirqs_on_events); +} + +EXPORT_SYMBOL(trace_hardirqs_on); + +/* + * Hardirqs were disabled: + */ +void trace_hardirqs_off(void) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->hardirqs_enabled) { + /* + * We have done an ON -> OFF transition: + */ + curr->hardirqs_enabled = 0; + curr->hardirq_disable_ip = _RET_IP_; + curr->hardirq_disable_event = ++curr->irq_events; + debug_atomic_inc(&hardirqs_off_events); + } else + debug_atomic_inc(&redundant_hardirqs_off); +} + +EXPORT_SYMBOL(trace_hardirqs_off); + +/* + * Softirqs will be enabled: + */ +void trace_softirqs_on(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; 
+ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->softirqs_enabled) { + debug_atomic_inc(&redundant_softirqs_on); + return; + } + + /* + * We'll do an OFF -> ON transition: + */ + curr->softirqs_enabled = 1; + curr->softirq_enable_ip = ip; + curr->softirq_enable_event = ++curr->irq_events; + debug_atomic_inc(&softirqs_on_events); + /* + * We are going to turn softirqs on, so set the + * usage bit for all held locks, if hardirqs are + * enabled too: + */ + if (curr->hardirqs_enabled) + mark_held_locks(curr, 0); +} + +/* + * Softirqs were disabled: + */ +void trace_softirqs_off(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->softirqs_enabled) { + /* + * We have done an ON -> OFF transition: + */ + curr->softirqs_enabled = 0; + curr->softirq_disable_ip = ip; + curr->softirq_disable_event = ++curr->irq_events; + debug_atomic_inc(&softirqs_off_events); + DEBUG_LOCKS_WARN_ON(!softirq_count()); + } else + debug_atomic_inc(&redundant_softirqs_off); +} + +static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) +{ + /* + * If non-trylock use in a hardirq or softirq context, then + * mark the lock as used in these contexts: + */ + if (!hlock->trylock) { + if (hlock->read) { + if (curr->hardirq_context) + if (!mark_lock(curr, hlock, + LOCK_USED_IN_HARDIRQ_READ)) + return 0; + if (curr->softirq_context) + if (!mark_lock(curr, hlock, + LOCK_USED_IN_SOFTIRQ_READ)) + return 0; + } else { + if (curr->hardirq_context) + if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) + return 0; + if (curr->softirq_context) + if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) + return 0; + } + } + if (!hlock->hardirqs_off) { + if (hlock->read) { + if (!mark_lock(curr, hlock, + LOCK_ENABLED_HARDIRQS_READ)) + return 0; + if (curr->softirqs_enabled) + if (!mark_lock(curr, hlock, + LOCK_ENABLED_SOFTIRQS_READ)) + return 0; + } else { + if (!mark_lock(curr, hlock, + LOCK_ENABLED_HARDIRQS)) + return 0; + if (curr->softirqs_enabled) + if (!mark_lock(curr, hlock, + LOCK_ENABLED_SOFTIRQS)) + return 0; + } + } + + return 1; +} + +static int separate_irq_context(struct task_struct *curr, + struct held_lock *hlock) +{ + unsigned int depth = curr->lockdep_depth; + + /* + * Keep track of points where we cross into an interrupt context: + */ + hlock->irq_context = 2*(curr->hardirq_context ? 
1 : 0) + + curr->softirq_context; + if (depth) { + struct held_lock *prev_hlock; + + prev_hlock = curr->held_locks + depth-1; + /* + * If we cross into another context, reset the + * hash key (this also prevents the checking and the + * adding of the dependency to 'prev'): + */ + if (prev_hlock->irq_context != hlock->irq_context) + return 1; + } + return 0; } -EXPORT_SYMBOL(trace_hardirqs_on); +#else -/* - * Hardirqs were disabled: - */ -void trace_hardirqs_off(void) +static inline +int mark_lock_irq(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { - struct task_struct *curr = current; - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; + WARN_ON(1); + return 1; +} - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; +static inline int mark_irqflags(struct task_struct *curr, + struct held_lock *hlock) +{ + return 1; +} - if (curr->hardirqs_enabled) { - /* - * We have done an ON -> OFF transition: - */ - curr->hardirqs_enabled = 0; - curr->hardirq_disable_ip = _RET_IP_; - curr->hardirq_disable_event = ++curr->irq_events; - debug_atomic_inc(&hardirqs_off_events); - } else - debug_atomic_inc(&redundant_hardirqs_off); +static inline int separate_irq_context(struct task_struct *curr, + struct held_lock *hlock) +{ + return 0; } -EXPORT_SYMBOL(trace_hardirqs_off); +#endif /* - * Softirqs will be enabled: + * Mark a lock with a usage bit, and validate the state transition: */ -void trace_softirqs_on(unsigned long ip) +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { - struct task_struct *curr = current; - - if (unlikely(!debug_locks)) - return; - - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - - if (curr->softirqs_enabled) { - debug_atomic_inc(&redundant_softirqs_on); - return; - } + unsigned int new_mask = 1 << new_bit, ret = 1; /* - * We'll do an OFF -> ON transition: + * If already set then do not dirty the cacheline, + * nor do any checks: */ - curr->softirqs_enabled = 1; - curr->softirq_enable_ip = ip; - curr->softirq_enable_event = ++curr->irq_events; - debug_atomic_inc(&softirqs_on_events); + if (likely(this->class->usage_mask & new_mask)) + return 1; + + if (!graph_lock()) + return 0; /* - * We are going to turn softirqs on, so set the - * usage bit for all held locks, if hardirqs are - * enabled too: + * Make sure we didnt race: */ - if (curr->hardirqs_enabled) - mark_held_locks(curr, 0); -} - -/* - * Softirqs were disabled: - */ -void trace_softirqs_off(unsigned long ip) -{ - struct task_struct *curr = current; + if (unlikely(this->class->usage_mask & new_mask)) { + graph_unlock(); + return 1; + } - if (unlikely(!debug_locks)) - return; + this->class->usage_mask |= new_mask; - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; + if (!save_trace(this->class->usage_traces + new_bit)) + return 0; - if (curr->softirqs_enabled) { + switch (new_bit) { + case LOCK_USED_IN_HARDIRQ: + case LOCK_USED_IN_SOFTIRQ: + case LOCK_USED_IN_HARDIRQ_READ: + case LOCK_USED_IN_SOFTIRQ_READ: + case LOCK_ENABLED_HARDIRQS: + case LOCK_ENABLED_SOFTIRQS: + case LOCK_ENABLED_HARDIRQS_READ: + case LOCK_ENABLED_SOFTIRQS_READ: + ret = mark_lock_irq(curr, this, new_bit); + if (!ret) + return 0; + break; + case LOCK_USED: /* - * We have done an ON -> OFF transition: + * Add it to the global list of classes: */ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; - debug_atomic_inc(&softirqs_off_events); - 
DEBUG_LOCKS_WARN_ON(!softirq_count()); - } else - debug_atomic_inc(&redundant_softirqs_off); -} + list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); + debug_atomic_dec(&nr_unused_locks); + break; + default: + if (!debug_locks_off_graph_unlock()) + return 0; + WARN_ON(1); + return 0; + } -#endif + graph_unlock(); + + /* + * We must printk outside of the graph_lock: + */ + if (ret == 2) { + printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); + print_lock(this); + print_irqtrace_events(curr); + dump_stack(); + } + + return ret; +} /* * Initialize a lock instance's lock-class mapping info: @@ -2082,56 +2272,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->check = check; hlock->hardirqs_off = hardirqs_off; - if (check != 2) - goto out_calc_hash; -#ifdef CONFIG_TRACE_IRQFLAGS - /* - * If non-trylock use in a hardirq or softirq context, then - * mark the lock as used in these contexts: - */ - if (!trylock) { - if (read) { - if (curr->hardirq_context) - if (!mark_lock(curr, hlock, - LOCK_USED_IN_HARDIRQ_READ)) - return 0; - if (curr->softirq_context) - if (!mark_lock(curr, hlock, - LOCK_USED_IN_SOFTIRQ_READ)) - return 0; - } else { - if (curr->hardirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) - return 0; - if (curr->softirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) - return 0; - } - } - if (!hardirqs_off) { - if (read) { - if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS_READ)) - return 0; - if (curr->softirqs_enabled) - if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS)) - return 0; - if (curr->softirqs_enabled) - if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS)) - return 0; - } - } -#endif + if (check == 2 && !mark_irqflags(curr, hlock)) + return 0; + /* mark it as used: */ if (!mark_lock(curr, hlock, LOCK_USED)) return 0; -out_calc_hash: + /* * Calculate the chain hash: it's the combined has of all the * lock keys along the dependency chain. We save the hash value @@ -2154,77 +2301,15 @@ out_calc_hash: } hlock->prev_chain_key = chain_key; - -#ifdef CONFIG_TRACE_IRQFLAGS - /* - * Keep track of points where we cross into an interrupt context: - */ - hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + - curr->softirq_context; - if (depth) { - struct held_lock *prev_hlock; - - prev_hlock = curr->held_locks + depth-1; - /* - * If we cross into another context, reset the - * hash key (this also prevents the checking and the - * adding of the dependency to 'prev'): - */ - if (prev_hlock->irq_context != hlock->irq_context) { - chain_key = 0; - chain_head = 1; - } + if (separate_irq_context(curr, hlock)) { + chain_key = 0; + chain_head = 1; } -#endif chain_key = iterate_chain_key(chain_key, id); curr->curr_chain_key = chain_key; - /* - * Trylock needs to maintain the stack of held locks, but it - * does not add new dependencies, because trylock can be done - * in any order. - * - * We look up the chain_key and do the O(N^2) check and update of - * the dependencies only if this is a new dependency chain. - * (If lookup_chain_cache() returns with 1 it acquires - * graph_lock for us) - */ - if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) { - /* - * Check whether last held lock: - * - * - is irq-safe, if this lock is irq-unsafe - * - is softirq-safe, if this lock is hardirq-unsafe - * - * And check whether the new lock's dependency graph - * could lead back to the previous lock. 
- * - * any of these scenarios could lead to a deadlock. If - * All validations - */ - int ret = check_deadlock(curr, hlock, lock, read); - - if (!ret) - return 0; - /* - * Mark recursive read, as we jump over it when - * building dependencies (just like we jump over - * trylock entries): - */ - if (ret == 2) - hlock->read = 2; - /* - * Add dependency only if this lock is not the head - * of the chain, and if it's not a secondary read-lock: - */ - if (!chain_head && ret != 2) - if (!check_prevs_add(curr, hlock)) - return 0; - graph_unlock(); - } else - /* after lookup_chain_cache(): */ - if (unlikely(!debug_locks)) - return 0; + if (!validate_chain(curr, lock, hlock, chain_head)) + return 0; curr->lockdep_depth++; check_chain_key(curr); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 58f35e586ee..2fde34127e2 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -271,8 +271,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v) if (nr_list_entries) factor = sum_forward_deps / nr_list_entries; +#ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", nr_lock_chains, MAX_LOCKDEP_CHAINS); +#endif #ifdef CONFIG_TRACE_IRQFLAGS seq_printf(m, " in-hardirq chains: %11u\n", -- cgit v1.2.3-70-g09d2 From f20786ff4da51e56b1956acf30be2552be266746 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:56 -0700 Subject: lockstat: core infrastructure Introduce the core lock statistics code. Lock statistics provides lock wait-time and hold-time (as well as the count of corresponding contention and acquisitions events). Also, the first few call-sites that encounter contention are tracked. Lock wait-time is the time spent waiting on the lock. This provides insight into the locking scheme, that is, a heavily contended lock is indicative of a too coarse locking scheme. Lock hold-time is the duration the lock was held, this provides a reference for the wait-time numbers, so they can be put into perspective. 1) lock 2) ... do stuff .. unlock 3) The time between 1 and 2 is the wait-time. The time between 2 and 3 is the hold-time. The lockdep held-lock tracking code is reused, because it already collects locks into meaningful groups (classes), and because it is an existing infrastructure for lock instrumentation. Currently lockdep tracks lock acquisition with two hooks: lock() lock_acquire() _lock() ... code protected by lock ... unlock() lock_release() _unlock() We need to extend this with two more hooks, in order to measure contention. lock_contended() - used to measure contention events lock_acquired() - completion of the contention These are then placed the following way: lock() lock_acquire() if (!_try_lock()) lock_contended() _lock() lock_acquired() ... do locked stuff ... unlock() lock_release() _unlock() (Note: the try_lock() 'trick' is used to avoid instrumenting all platform dependent lock primitive implementations.) It is also possible to toggle the two lockdep features at runtime using: /proc/sys/kernel/prove_locking /proc/sys/kernel/lock_stat (esp. 
turning off the O(n^2) prove_locking functionaliy can help) [akpm@linux-foundation.org: build fixes] [akpm@linux-foundation.org: nuke unneeded ifdefs] Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 53 +++++++++++ kernel/lockdep.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 22 +++++ lib/Kconfig.debug | 11 +++ 4 files changed, 333 insertions(+) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 14c937d345c..8f946f614f8 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -9,6 +9,7 @@ #define __LINUX_LOCKDEP_H struct task_struct; +struct lockdep_map; #ifdef CONFIG_LOCKDEP @@ -114,8 +115,32 @@ struct lock_class { const char *name; int name_version; + +#ifdef CONFIG_LOCK_STAT + unsigned long contention_point[4]; +#endif +}; + +#ifdef CONFIG_LOCK_STAT +struct lock_time { + s64 min; + s64 max; + s64 total; + unsigned long nr; }; +struct lock_class_stats { + unsigned long contention_point[4]; + struct lock_time read_waittime; + struct lock_time write_waittime; + struct lock_time read_holdtime; + struct lock_time write_holdtime; +}; + +struct lock_class_stats lock_stats(struct lock_class *class); +void clear_lock_stats(struct lock_class *class); +#endif + /* * Map the lock object (the lock instance) to the lock-class object. * This is embedded into specific lock instances: @@ -165,6 +190,10 @@ struct held_lock { unsigned long acquire_ip; struct lockdep_map *instance; +#ifdef CONFIG_LOCK_STAT + u64 waittime_stamp; + u64 holdtime_stamp; +#endif /* * The lock-stack is unified in that the lock chains of interrupt * contexts nest ontop of process context chains, but we 'separate' @@ -281,6 +310,30 @@ struct lock_class_key { }; #endif /* !LOCKDEP */ +#ifdef CONFIG_LOCK_STAT + +extern void lock_contended(struct lockdep_map *lock, unsigned long ip); +extern void lock_acquired(struct lockdep_map *lock); + +#define LOCK_CONTENDED(_lock, try, lock) \ +do { \ + if (!try(_lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + lock(_lock); \ + lock_acquired(&(_lock)->dep_map); \ + } \ +} while (0) + +#else /* CONFIG_LOCK_STAT */ + +#define lock_contended(lockdep_map, ip) do {} while (0) +#define lock_acquired(lockdep_map) do {} while (0) + +#define LOCK_CONTENDED(_lock, try, lock) \ + lock(_lock) + +#endif /* CONFIG_LOCK_STAT */ + #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS) extern void early_init_irq_lock_class(void); #else diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 87ac3642507..70ca4db28af 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,6 +42,20 @@ #include "lockdep_internals.h" +#ifdef CONFIG_PROVE_LOCKING +int prove_locking = 1; +module_param(prove_locking, int, 0644); +#else +#define prove_locking 0 +#endif + +#ifdef CONFIG_LOCK_STAT +int lock_stat = 1; +module_param(lock_stat, int, 0644); +#else +#define lock_stat 0 +#endif + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. 
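As a sketch of how a lock primitive is expected to place the new hooks from the changelog above (placeholder names only: my_lock, arch_lock_t, do_raw_trylock(), do_raw_lock() and do_raw_unlock() are not part of this patch; lock_acquire() and lock_release() are used with their signatures as they stand at this point in the tree):

	struct my_lock {
		arch_lock_t		raw;		/* underlying primitive (placeholder type) */
		struct lockdep_map	dep_map;	/* required by LOCK_CONTENDED() */
	};

	static inline void my_lock_acquire(struct my_lock *l)
	{
		/* lockdep: note the acquisition attempt */
		lock_acquire(&l->dep_map, 0, 0, 0, 2, _RET_IP_);

		/*
		 * lockstat: try the fast path first; on failure record a
		 * contention event, block in the slow path, then record the
		 * acquisition so the wait time can be attributed to the class.
		 */
		LOCK_CONTENDED(l, do_raw_trylock, do_raw_lock);
	}

	static inline void my_lock_release(struct my_lock *l)
	{
		lock_release(&l->dep_map, 0, _RET_IP_);	/* hold time accounted here */
		do_raw_unlock(l);
	}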
@@ -104,6 +118,70 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; unsigned long nr_lock_classes; static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +#ifdef CONFIG_LOCK_STAT +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); + +static int lock_contention_point(struct lock_class *class, unsigned long ip) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { + if (class->contention_point[i] == 0) { + class->contention_point[i] = ip; + break; + } + if (class->contention_point[i] == ip) + break; + } + + return i; +} + +static void lock_time_inc(struct lock_time *lt, s64 time) +{ + if (time > lt->max) + lt->max = time; + + if (time < lt->min || !lt->min) + lt->min = time; + + lt->total += time; + lt->nr++; +} + +static struct lock_class_stats *get_lock_stats(struct lock_class *class) +{ + return &get_cpu_var(lock_stats)[class - lock_classes]; +} + +static void put_lock_stats(struct lock_class_stats *stats) +{ + put_cpu_var(lock_stats); +} + +static void lock_release_holdtime(struct held_lock *hlock) +{ + struct lock_class_stats *stats; + s64 holdtime; + + if (!lock_stat) + return; + + holdtime = sched_clock() - hlock->holdtime_stamp; + + stats = get_lock_stats(hlock->class); + if (hlock->read) + lock_time_inc(&stats->read_holdtime, holdtime); + else + lock_time_inc(&stats->write_holdtime, holdtime); + put_lock_stats(stats); +} +#else +static inline void lock_release_holdtime(struct held_lock *hlock) +{ +} +#endif + /* * We keep a global list of all lock classes. The list only grows, * never shrinks. The list is only accessed with the lockdep @@ -2221,6 +2299,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; u64 chain_key; + if (!prove_locking) + check = 1; + if (unlikely(!debug_locks)) return 0; @@ -2271,6 +2352,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->read = read; hlock->check = check; hlock->hardirqs_off = hardirqs_off; +#ifdef CONFIG_LOCK_STAT + hlock->waittime_stamp = 0; + hlock->holdtime_stamp = sched_clock(); +#endif if (check == 2 && !mark_irqflags(curr, hlock)) return 0; @@ -2411,6 +2496,8 @@ lock_release_non_nested(struct task_struct *curr, return print_unlock_inbalance_bug(curr, lock, ip); found_it: + lock_release_holdtime(hlock); + /* * We have the right lock to unlock, 'hlock' points to it. 
* Now we remove it from the stack, and add back the other @@ -2463,6 +2550,8 @@ static int lock_release_nested(struct task_struct *curr, curr->curr_chain_key = hlock->prev_chain_key; + lock_release_holdtime(hlock); + #ifdef CONFIG_DEBUG_LOCKDEP hlock->prev_chain_key = 0; hlock->class = NULL; @@ -2537,6 +2626,9 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; + if (unlikely(!lock_stat && !prove_locking)) + return; + if (unlikely(current->lockdep_recursion)) return; @@ -2556,6 +2648,9 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { unsigned long flags; + if (unlikely(!lock_stat && !prove_locking)) + return; + if (unlikely(current->lockdep_recursion)) return; @@ -2569,6 +2664,158 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) EXPORT_SYMBOL_GPL(lock_release); +#ifdef CONFIG_LOCK_STAT +static int +print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, + unsigned long ip) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + printk("\n=================================\n"); + printk( "[ BUG: bad contention detected! ]\n"); + printk( "---------------------------------\n"); + printk("%s/%d is trying to contend lock (", + curr->comm, curr->pid); + print_lockdep_cache(lock); + printk(") at:\n"); + print_ip_sym(ip); + printk("but there are no locks held!\n"); + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static void +__lock_contended(struct lockdep_map *lock, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class_stats *stats; + unsigned int depth; + int i, point; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + print_lock_contention_bug(curr, lock, ip); + return; + +found_it: + hlock->waittime_stamp = sched_clock(); + + point = lock_contention_point(hlock->class, ip); + + stats = get_lock_stats(hlock->class); + if (point < ARRAY_SIZE(stats->contention_point)) + stats->contention_point[i]++; + put_lock_stats(stats); +} + +static void +__lock_acquired(struct lockdep_map *lock) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class_stats *stats; + unsigned int depth; + u64 now; + s64 waittime; + int i; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + print_lock_contention_bug(curr, lock, _RET_IP_); + return; + +found_it: + if (!hlock->waittime_stamp) + return; + + now = sched_clock(); + waittime = now - hlock->waittime_stamp; + hlock->holdtime_stamp = now; + + stats = get_lock_stats(hlock->class); + if (hlock->read) + lock_time_inc(&stats->read_waittime, waittime); + else + lock_time_inc(&stats->write_waittime, waittime); + put_lock_stats(stats); +} + +void lock_contended(struct 
lockdep_map *lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lock_stat)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lock_contended(lock, ip); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_contended); + +void lock_acquired(struct lockdep_map *lock) +{ + unsigned long flags; + + if (unlikely(!lock_stat)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lock_acquired(lock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_acquired); +#endif + /* * Used by the testsuite, sanitize the validator state * after a simulated failure: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2aaa3f98185..e69179b1809 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -161,6 +161,8 @@ extern ctl_table inotify_table[]; int sysctl_legacy_va_layout; #endif +extern int prove_locking; +extern int lock_stat; /* The default sysctl tables: */ @@ -282,6 +284,26 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_PROVE_LOCKING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_LOCK_STAT + { + .ctl_name = CTL_UNNUMBERED, + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = CTL_UNNUMBERED, .procname = "sched_features", diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 640844024ff..f3e0c2abcbd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -283,6 +283,17 @@ config LOCKDEP select KALLSYMS select KALLSYMS_ALL +config LOCK_STAT + bool "Lock usage statisitics" + depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT + select LOCKDEP + select DEBUG_SPINLOCK + select DEBUG_MUTEXES + select DEBUG_LOCK_ALLOC + default n + help + This feature enables tracking lock contention points + config DEBUG_LOCKDEP bool "Lock dependency engine debugging" depends on DEBUG_KERNEL && LOCKDEP -- cgit v1.2.3-70-g09d2 From c46261de0d98372112d8edf16f74ce418a268d46 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:57 -0700 Subject: lockstat: human readability tweaks Present all this fancy new lock statistics information: *warning, _wide_ output ahead* (output edited for purpose of brevity) # cat /proc/lock_stat lock_stat version 0.1 ----------------------------------------------------------------------------------------------------------------------------------------------------------------- class name contentions waittime-min waittime-max waittime-total acquisitions holdtime-min holdtime-max holdtime-total ----------------------------------------------------------------------------------------------------------------------------------------------------------------- &inode->i_mutex: 14458 6.57 398832.75 2469412.23 6768876 0.34 11398383.65 339410830.89 --------------- &inode->i_mutex 4486 [] pipe_wait+0x86/0x8d &inode->i_mutex 0 [] pipe_write_fasync+0x29/0x5d &inode->i_mutex 0 [] pipe_read+0x74/0x3a5 &inode->i_mutex 0 [] do_lookup+0x81/0x1ae 
................................................................................................................................................................. &inode->i_data.tree_lock-W: 491 0.27 62.47 493.89 2477833 0.39 468.89 1146584.25 &inode->i_data.tree_lock-R: 65 0.44 4.27 48.78 26288792 0.36 184.62 10197458.24 -------------------------- &inode->i_data.tree_lock 46 [] __do_page_cache_readahead+0x69/0x24f &inode->i_data.tree_lock 31 [] add_to_page_cache+0x31/0xba &inode->i_data.tree_lock 0 [] __do_page_cache_readahead+0xc2/0x24f &inode->i_data.tree_lock 0 [] find_get_page+0x1a/0x58 ................................................................................................................................................................. proc_inum_idr.lock: 0 0.00 0.00 0.00 36 0.00 65.60 148.26 proc_subdir_lock: 0 0.00 0.00 0.00 3049859 0.00 106.81 1563212.42 shrinker_rwsem-W: 0 0.00 0.00 0.00 5 0.00 1.73 3.68 shrinker_rwsem-R: 0 0.00 0.00 0.00 633 2.57 246.57 10909.76 'contentions' and 'acquisitions' are the number of such events measured (since the last reset). The waittime- and holdtime- (min, max, total) numbers are presented in microseconds. If there are any contention points, the lock class is presented in the block format (as i_mutex and tree_lock above), otherwise a single line of output is presented. The output is sorted on absolute number of contentions (read + write), this should get the worst offenders presented first, so that: # grep : /proc/lock_stat | head will quickly show who's bad. The stats can be reset using: # echo 0 > /proc/lock_stat [bunk@stusta.de: make 2 functions static] [akpm@linux-foundation.org: fix printk warning] Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Jason Baron Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 44 +++++++++ kernel/lockdep_proc.c | 266 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 310 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 70ca4db28af..a8dc99d9fef 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -149,6 +149,50 @@ static void lock_time_inc(struct lock_time *lt, s64 time) lt->nr++; } +static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) +{ + dst->min += src->min; + dst->max += src->max; + dst->total += src->total; + dst->nr += src->nr; +} + +struct lock_class_stats lock_stats(struct lock_class *class) +{ + struct lock_class_stats stats; + int cpu, i; + + memset(&stats, 0, sizeof(struct lock_class_stats)); + for_each_possible_cpu(cpu) { + struct lock_class_stats *pcs = + &per_cpu(lock_stats, cpu)[class - lock_classes]; + + for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) + stats.contention_point[i] += pcs->contention_point[i]; + + lock_time_add(&pcs->read_waittime, &stats.read_waittime); + lock_time_add(&pcs->write_waittime, &stats.write_waittime); + + lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); + lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + } + + return stats; +} + +void clear_lock_stats(struct lock_class *class) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct lock_class_stats *cpu_stats = + &per_cpu(lock_stats, cpu)[class - lock_classes]; + + memset(cpu_stats, 0, sizeof(struct lock_class_stats)); + } + memset(class->contention_point, 0, sizeof(class->contention_point)); +} + static struct lock_class_stats *get_lock_stats(struct lock_class *class) { return &get_cpu_var(lock_stats)[class - 
lock_classes]; diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 2fde34127e2..e682926c9ad 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -15,6 +15,10 @@ #include #include #include +#include +#include +#include +#include #include "lockdep_internals.h" @@ -344,6 +348,262 @@ static const struct file_operations proc_lockdep_stats_operations = { .release = seq_release, }; +#ifdef CONFIG_LOCK_STAT + +struct lock_stat_data { + struct lock_class *class; + struct lock_class_stats stats; +}; + +struct lock_stat_seq { + struct lock_stat_data *iter; + struct lock_stat_data *iter_end; + struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; +}; + +/* + * sort on absolute number of contentions + */ +static int lock_stat_cmp(const void *l, const void *r) +{ + const struct lock_stat_data *dl = l, *dr = r; + unsigned long nl, nr; + + nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr; + nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr; + + return nr - nl; +} + +static void seq_line(struct seq_file *m, char c, int offset, int length) +{ + int i; + + for (i = 0; i < offset; i++) + seq_puts(m, " "); + for (i = 0; i < length; i++) + seq_printf(m, "%c", c); + seq_puts(m, "\n"); +} + +static void snprint_time(char *buf, size_t bufsiz, s64 nr) +{ + unsigned long rem; + + rem = do_div(nr, 1000); /* XXX: do_div_signed */ + snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); +} + +static void seq_time(struct seq_file *m, s64 time) +{ + char num[15]; + + snprint_time(num, sizeof(num), time); + seq_printf(m, " %14s", num); +} + +static void seq_lock_time(struct seq_file *m, struct lock_time *lt) +{ + seq_printf(m, "%14lu", lt->nr); + seq_time(m, lt->min); + seq_time(m, lt->max); + seq_time(m, lt->total); +} + +static void seq_stats(struct seq_file *m, struct lock_stat_data *data) +{ + char name[39]; + struct lock_class *class; + struct lock_class_stats *stats; + int i, namelen; + + class = data->class; + stats = &data->stats; + + snprintf(name, 38, "%s", class->name); + namelen = strlen(name); + + if (stats->write_holdtime.nr) { + if (stats->read_holdtime.nr) + seq_printf(m, "%38s-W:", name); + else + seq_printf(m, "%40s:", name); + + seq_lock_time(m, &stats->write_waittime); + seq_puts(m, " "); + seq_lock_time(m, &stats->write_holdtime); + seq_puts(m, "\n"); + } + + if (stats->read_holdtime.nr) { + seq_printf(m, "%38s-R:", name); + seq_lock_time(m, &stats->read_waittime); + seq_puts(m, " "); + seq_lock_time(m, &stats->read_holdtime); + seq_puts(m, "\n"); + } + + if (stats->read_waittime.nr + stats->write_waittime.nr == 0) + return; + + if (stats->read_holdtime.nr) + namelen += 2; + + for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { + char sym[KSYM_SYMBOL_LEN]; + char ip[32]; + + if (class->contention_point[i] == 0) + break; + + if (!i) + seq_line(m, '-', 40-namelen, namelen); + + sprint_symbol(sym, class->contention_point[i]); + snprintf(ip, sizeof(ip), "[<%p>]", + (void *)class->contention_point[i]); + seq_printf(m, "%40s %14lu %29s %s\n", name, + stats->contention_point[i], + ip, sym); + } + if (i) { + seq_puts(m, "\n"); + seq_line(m, '.', 0, 40 + 1 + 8 * (14 + 1)); + seq_puts(m, "\n"); + } +} + +static void seq_header(struct seq_file *m) +{ + seq_printf(m, "lock_stat version 0.1\n"); + seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1)); + seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s\n", + "class name", + "contentions", + "waittime-min", + "waittime-max", + "waittime-total", + "acquisitions", + "holdtime-min", + "holdtime-max", + 
"holdtime-total"); + seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1)); + seq_printf(m, "\n"); +} + +static void *ls_start(struct seq_file *m, loff_t *pos) +{ + struct lock_stat_seq *data = m->private; + + if (data->iter == data->stats) + seq_header(m); + + return data->iter; +} + +static void *ls_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct lock_stat_seq *data = m->private; + + (*pos)++; + + data->iter = v; + data->iter++; + if (data->iter == data->iter_end) + data->iter = NULL; + + return data->iter; +} + +static void ls_stop(struct seq_file *m, void *v) +{ +} + +static int ls_show(struct seq_file *m, void *v) +{ + struct lock_stat_seq *data = m->private; + + seq_stats(m, data->iter); + return 0; +} + +static struct seq_operations lockstat_ops = { + .start = ls_start, + .next = ls_next, + .stop = ls_stop, + .show = ls_show, +}; + +static int lock_stat_open(struct inode *inode, struct file *file) +{ + int res; + struct lock_class *class; + struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq)); + + if (!data) + return -ENOMEM; + + res = seq_open(file, &lockstat_ops); + if (!res) { + struct lock_stat_data *iter = data->stats; + struct seq_file *m = file->private_data; + + data->iter = iter; + list_for_each_entry(class, &all_lock_classes, lock_entry) { + iter->class = class; + iter->stats = lock_stats(class); + iter++; + } + data->iter_end = iter; + + sort(data->stats, data->iter_end - data->iter, + sizeof(struct lock_stat_data), + lock_stat_cmp, NULL); + + m->private = data; + } else + vfree(data); + + return res; +} + +static ssize_t lock_stat_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct lock_class *class; + char c; + + if (count) { + if (get_user(c, buf)) + return -EFAULT; + + if (c != '0') + return count; + + list_for_each_entry(class, &all_lock_classes, lock_entry) + clear_lock_stats(class); + } + return count; +} + +static int lock_stat_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + vfree(seq->private); + seq->private = NULL; + return seq_release(inode, file); +} + +static const struct file_operations proc_lock_stat_operations = { + .open = lock_stat_open, + .write = lock_stat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = lock_stat_release, +}; +#endif /* CONFIG_LOCK_STAT */ + static int __init lockdep_proc_init(void) { struct proc_dir_entry *entry; @@ -356,6 +616,12 @@ static int __init lockdep_proc_init(void) if (entry) entry->proc_fops = &proc_lockdep_stats_operations; +#ifdef CONFIG_LOCK_STAT + entry = create_proc_entry("lock_stat", S_IRUSR, NULL); + if (entry) + entry->proc_fops = &proc_lock_stat_operations; +#endif + return 0; } -- cgit v1.2.3-70-g09d2 From 4fe87745a6722d42ff27a60768c77958fa1fc498 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:58 -0700 Subject: lockstat: hook into spinlock_t, rwlock_t, rwsem and mutex Call the new lockstat tracking functions from the various lock primitives. 
Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/mutex.c | 8 ++++++++ kernel/rwsem.c | 8 ++++---- kernel/spinlock.c | 28 ++++++++++++++-------------- 3 files changed, 26 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 303eab18484..7a3f32761f2 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -139,6 +139,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; + old_val = atomic_xchg(&lock->count, -1); + if (old_val == 1) + goto done; + + lock_contended(&lock->dep_map, _RET_IP_); + for (;;) { /* * Lets try to take the lock again - this is needed even if @@ -174,6 +180,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) spin_lock_mutex(&lock->wait_lock, flags); } + lock_acquired(&lock->dep_map); +done: /* got the lock - rejoice! */ mutex_remove_waiter(lock, &waiter, task_thread_info(task)); debug_mutex_set_owner(lock, task_thread_info(task)); diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 9a87886b022..1ec620c0306 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -20,7 +20,7 @@ void down_read(struct rw_semaphore *sem) might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); - __down_read(sem); + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } EXPORT_SYMBOL(down_read); @@ -47,7 +47,7 @@ void down_write(struct rw_semaphore *sem) might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - __down_write(sem); + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } EXPORT_SYMBOL(down_write); @@ -111,7 +111,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); - __down_read(sem); + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } EXPORT_SYMBOL(down_read_nested); @@ -130,7 +130,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - __down_write_nested(sem, subclass); + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } EXPORT_SYMBOL(down_write_nested); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index cd93bfe3f10..cd72424c266 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -72,7 +72,7 @@ void __lockfunc _read_lock(rwlock_t *lock) { preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } EXPORT_SYMBOL(_read_lock); @@ -89,7 +89,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) * that interrupts are not re-enabled during lock-acquire: */ #ifdef CONFIG_LOCKDEP - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); #else _raw_spin_lock_flags(lock, &flags); #endif @@ -102,7 +102,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock) local_irq_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(_spin_lock_irq); @@ -111,7 +111,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock) local_bh_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(_spin_lock_bh); @@ -122,7 +122,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t 
*lock) local_irq_save(flags); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); return flags; } EXPORT_SYMBOL(_read_lock_irqsave); @@ -132,7 +132,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock) local_irq_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } EXPORT_SYMBOL(_read_lock_irq); @@ -141,7 +141,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock) local_bh_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } EXPORT_SYMBOL(_read_lock_bh); @@ -152,7 +152,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); return flags; } EXPORT_SYMBOL(_write_lock_irqsave); @@ -162,7 +162,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock) local_irq_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } EXPORT_SYMBOL(_write_lock_irq); @@ -171,7 +171,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock) local_bh_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } EXPORT_SYMBOL(_write_lock_bh); @@ -179,7 +179,7 @@ void __lockfunc _spin_lock(spinlock_t *lock) { preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(_spin_lock); @@ -188,7 +188,7 @@ void __lockfunc _write_lock(rwlock_t *lock) { preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } EXPORT_SYMBOL(_write_lock); @@ -289,7 +289,7 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) { preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(_spin_lock_nested); @@ -306,7 +306,7 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas * that interrupts are not re-enabled during lock-acquire: */ #ifdef CONFIG_LOCKDEP - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); #else _raw_spin_lock_flags(lock, &flags); #endif -- cgit v1.2.3-70-g09d2 From 4b32d0a4e9ec07808a5c406a416c6576c986b047 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:59 -0700 Subject: lockdep: various fixes - update the copyright notices - use the default hash function - fix a thinko in a BUILD_BUG_ON - add a WARN_ON to spot inconsitent naming - fix a termination issue in /proc/lock_stat [akpm@linux-foundation.org: cleanups] Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 3 ++- kernel/lockdep.c | 21 ++++++++++++--------- kernel/lockdep_proc.c | 6 +++++- 3 files changed, 19 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 8f946f614f8..3d3386b88b6 100644 --- 
a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -1,7 +1,8 @@ /* * Runtime locking correctness validator * - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * see Documentation/lockdep-design.txt for more details. */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index a8dc99d9fef..cb64022851c 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -5,7 +5,8 @@ * * Started by Ingo Molnar: * - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * this code maps all the lock dependencies as they occur in a live kernel * and will warn about the following classes of locking bugs: @@ -37,6 +38,7 @@ #include #include #include +#include #include @@ -238,8 +240,7 @@ LIST_HEAD(all_lock_classes); */ #define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) #define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) -#define CLASSHASH_MASK (CLASSHASH_SIZE - 1) -#define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK) +#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) #define classhashentry(key) (classhash_table + __classhashfn((key))) static struct list_head classhash_table[CLASSHASH_SIZE]; @@ -250,9 +251,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE]; */ #define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) #define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) -#define CHAINHASH_MASK (CHAINHASH_SIZE - 1) -#define __chainhashfn(chain) \ - (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK) +#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) static struct list_head chainhash_table[CHAINHASH_SIZE]; @@ -676,7 +675,8 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * (or spin_lock_init()) call - which acts as the key. For static * locks we use the lock object itself as the key. 
*/ - BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class)); + BUILD_BUG_ON(sizeof(struct lock_class_key) > + sizeof(struct lockdep_map)); key = lock->key->subkeys + subclass; @@ -686,9 +686,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * We can walk the hash lockfree, because the hash only * grows, and we are careful when adding entries to the end: */ - list_for_each_entry(class, hash_head, hash_entry) - if (class->key == key) + list_for_each_entry(class, hash_head, hash_entry) { + if (class->key == key) { + WARN_ON_ONCE(class->name != lock->name); return class; + } + } return NULL; } diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index e682926c9ad..39163ed1bf0 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -5,7 +5,8 @@ * * Started by Ingo Molnar: * - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Code for /proc/lockdep and /proc/lockdep_stats: * @@ -498,6 +499,9 @@ static void *ls_start(struct seq_file *m, loff_t *pos) if (data->iter == data->stats) seq_header(m); + if (data->iter == data->iter_end) + data->iter = NULL; + return data->iter; } -- cgit v1.2.3-70-g09d2 From 96645678cd726e87ce42a0664de71e047e32bca4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:49:00 -0700 Subject: lockstat: measure lock bouncing __acquire | lock _____ | \ | __contended | | | wait | _______/ |/ | __acquired | __release | unlock We measure acquisition and contention bouncing. This is done by recording a cpu stamp in each lock instance. Contention bouncing requires the cpu stamp to be set on acquisition. Hence we move __acquired into the generic path. __acquired is then used to measure acquisition bouncing by comparing the current cpu with the old stamp before replacing it. 
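
In code terms, the acquisition side described above amounts to roughly the following
simplified sketch; lock->cpu, stats->bounces[] and bounce_acquired are the fields this
patch introduces (see the diff below), the helper name itself is only illustrative:

	/*
	 * Sketch of the acquisition-bounce accounting: a bounce is counted
	 * whenever the lock is acquired on a different cpu than the one
	 * recorded at the previous acquisition.
	 */
	static void account_acquired_bounce(struct lockdep_map *lock,
					    struct lock_class_stats *stats,
					    int read)
	{
		int cpu = smp_processor_id();

		if (lock->cpu != cpu)
			stats->bounces[bounce_acquired + !!read]++;

		lock->cpu = cpu;	/* stamp the new holder's cpu */
	}
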
__contended is used to measure contention bouncing (only useful for preemptable locks) [akpm@linux-foundation.org: cleanups] Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 17 ++++++++++++++++- kernel/lockdep.c | 38 ++++++++++++++++++++++++++------------ kernel/lockdep_proc.c | 19 ++++++++++++------- kernel/mutex.c | 2 +- 4 files changed, 55 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 3d3386b88b6..0e843bf6587 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -130,12 +130,24 @@ struct lock_time { unsigned long nr; }; +enum bounce_type { + bounce_acquired_write, + bounce_acquired_read, + bounce_contended_write, + bounce_contended_read, + nr_bounce_types, + + bounce_acquired = bounce_acquired_write, + bounce_contended = bounce_contended_write, +}; + struct lock_class_stats { unsigned long contention_point[4]; struct lock_time read_waittime; struct lock_time write_waittime; struct lock_time read_holdtime; struct lock_time write_holdtime; + unsigned long bounces[nr_bounce_types]; }; struct lock_class_stats lock_stats(struct lock_class *class); @@ -150,6 +162,9 @@ struct lockdep_map { struct lock_class_key *key; struct lock_class *class_cache; const char *name; +#ifdef CONFIG_LOCK_STAT + int cpu; +#endif }; /* @@ -321,8 +336,8 @@ do { \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ lock(_lock); \ - lock_acquired(&(_lock)->dep_map); \ } \ + lock_acquired(&(_lock)->dep_map); \ } while (0) #else /* CONFIG_LOCK_STAT */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index cb64022851c..156fce4960c 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -177,6 +177,9 @@ struct lock_class_stats lock_stats(struct lock_class *class) lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + + for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) + stats.bounces[i] += pcs->bounces[i]; } return stats; @@ -2325,6 +2328,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; lock->key = key; lock->class_cache = NULL; +#ifdef CONFIG_LOCK_STAT + lock->cpu = raw_smp_processor_id(); +#endif if (subclass) register_lock_class(lock, subclass, 1); } @@ -2775,6 +2781,8 @@ found_it: stats = get_lock_stats(hlock->class); if (point < ARRAY_SIZE(stats->contention_point)) stats->contention_point[i]++; + if (lock->cpu != smp_processor_id()) + stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); } @@ -2786,8 +2794,8 @@ __lock_acquired(struct lockdep_map *lock) struct lock_class_stats *stats; unsigned int depth; u64 now; - s64 waittime; - int i; + s64 waittime = 0; + int i, cpu; depth = curr->lockdep_depth; if (DEBUG_LOCKS_WARN_ON(!depth)) @@ -2809,19 +2817,25 @@ __lock_acquired(struct lockdep_map *lock) return; found_it: - if (!hlock->waittime_stamp) - return; - - now = sched_clock(); - waittime = now - hlock->waittime_stamp; - hlock->holdtime_stamp = now; + cpu = smp_processor_id(); + if (hlock->waittime_stamp) { + now = sched_clock(); + waittime = now - hlock->waittime_stamp; + hlock->holdtime_stamp = now; + } stats = get_lock_stats(hlock->class); - if (hlock->read) - lock_time_inc(&stats->read_waittime, waittime); - else - lock_time_inc(&stats->write_waittime, waittime); + if (waittime) { + if (hlock->read) + lock_time_inc(&stats->read_waittime, waittime); + else + lock_time_inc(&stats->write_waittime, 
waittime); + } + if (lock->cpu != cpu) + stats->bounces[bounce_acquired + !!hlock->read]++; put_lock_stats(stats); + + lock->cpu = cpu; } void lock_contended(struct lockdep_map *lock, unsigned long ip) diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 39163ed1bf0..7ff80135cbe 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -430,16 +430,18 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) else seq_printf(m, "%40s:", name); + seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]); seq_lock_time(m, &stats->write_waittime); - seq_puts(m, " "); + seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]); seq_lock_time(m, &stats->write_holdtime); seq_puts(m, "\n"); } if (stats->read_holdtime.nr) { seq_printf(m, "%38s-R:", name); + seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]); seq_lock_time(m, &stats->read_waittime); - seq_puts(m, " "); + seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]); seq_lock_time(m, &stats->read_holdtime); seq_puts(m, "\n"); } @@ -469,26 +471,29 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) } if (i) { seq_puts(m, "\n"); - seq_line(m, '.', 0, 40 + 1 + 8 * (14 + 1)); + seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); seq_puts(m, "\n"); } } static void seq_header(struct seq_file *m) { - seq_printf(m, "lock_stat version 0.1\n"); - seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1)); - seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s\n", + seq_printf(m, "lock_stat version 0.2\n"); + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); + seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " + "%14s %14s\n", "class name", + "con-bounces", "contentions", "waittime-min", "waittime-max", "waittime-total", + "acq-bounces", "acquisitions", "holdtime-min", "holdtime-max", "holdtime-total"); - seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1)); + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); seq_printf(m, "\n"); } diff --git a/kernel/mutex.c b/kernel/mutex.c index 7a3f32761f2..691b86564dd 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -180,8 +180,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) spin_lock_mutex(&lock->wait_lock, flags); } - lock_acquired(&lock->dep_map); done: + lock_acquired(&lock->dep_map); /* got the lock - rejoice! 
*/ mutex_remove_waiter(lock, &waiter, task_thread_info(task)); debug_mutex_set_owner(lock, task_thread_info(task)); -- cgit v1.2.3-70-g09d2 From d38e1d5aaee384698fcef9455d6e2df1d062a1d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:49:01 -0700 Subject: lockstat: better class name representation optionally add class->name_version and class->subclass to the class name Signed-off-by: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep_proc.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 7ff80135cbe..9f17af4a249 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -421,8 +421,30 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) class = data->class; stats = &data->stats; - snprintf(name, 38, "%s", class->name); + namelen = 38; + if (class->name_version > 1) + namelen -= 2; /* XXX truncates versions > 9 */ + if (class->subclass) + namelen -= 2; + + if (!class->name) { + char str[KSYM_NAME_LEN]; + const char *key_name; + + key_name = __get_key_name(class->key, str); + snprintf(name, namelen, "%s", key_name); + } else { + snprintf(name, namelen, "%s", class->name); + } namelen = strlen(name); + if (class->name_version > 1) { + snprintf(name+namelen, 3, "#%d", class->name_version); + namelen += 2; + } + if (class->subclass) { + snprintf(name+namelen, 3, "/%d", class->subclass); + namelen += 2; + } if (stats->write_holdtime.nr) { if (stats->read_holdtime.nr) -- cgit v1.2.3-70-g09d2 From c71063c9c9dc232d0d51f936f237f7dc5681e8e3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 19 Jul 2007 01:49:02 -0700 Subject: lockdep debugging: give stacktrace for init_error When I started adding support for lockdep to 64-bit powerpc, I got a lockdep_init_error and with this patch was able to pinpoint why and where to put lockdep_init(). Let's support this generally for others adding lockdep support to their architecture. Signed-off-by: Johannes Berg Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 156fce4960c..734da579ad1 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -375,6 +375,11 @@ unsigned int max_recursion_depth; * about it later on, in lockdep_info(). */ static int lockdep_init_error; +static unsigned long lockdep_init_trace_data[20]; +static struct stack_trace lockdep_init_trace = { + .max_entries = ARRAY_SIZE(lockdep_init_trace_data), + .entries = lockdep_init_trace_data, +}; /* * Various lockdep statistics: @@ -662,6 +667,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (unlikely(!lockdep_initialized)) { lockdep_init(); lockdep_init_error = 1; + save_stack_trace(&lockdep_init_trace); } #endif @@ -3040,8 +3046,11 @@ void __init lockdep_info(void) sizeof(struct held_lock) * MAX_LOCK_DEPTH); #ifdef CONFIG_DEBUG_LOCKDEP - if (lockdep_init_error) - printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n"); + if (lockdep_init_error) { + printk("WARNING: lockdep init error! 
Arch code didn't call lockdep_init() early enough?\n"); + printk("Call stack leading to lockdep invocation was:\n"); + print_stack_trace(&lockdep_init_trace, 0); + } #endif } -- cgit v1.2.3-70-g09d2 From 71120f183bff04ba4f7ba3cc554202061912d548 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2007 01:49:16 -0700 Subject: timekeeping: fixup shadow variable argument clocksource_adjust() has a clock argument, which shadows the file global clock variable. Fix this up. Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 728cedfd3cb..89698776613 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -401,7 +401,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. */ -static void clocksource_adjust(struct clocksource *clock, s64 offset) +static void clocksource_adjust(s64 offset) { s64 error, interval = clock->cycle_interval; int adj; @@ -476,7 +476,7 @@ void update_wall_time(void) } /* correct the clock when NTP error is too big */ - clocksource_adjust(clock, offset); + clocksource_adjust(offset); /* store full nanoseconds into xtime */ xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; -- cgit v1.2.3-70-g09d2 From 6819457d2cb7fe4fdb0fc3655b6b6dc71a86bee9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2007 01:49:16 -0700 Subject: timer.c: cleanup recently introduced whitespace damage Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index b7792fb0338..d1e8b975c7a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -103,14 +103,14 @@ static inline tvec_base_t *tbase_get_base(tvec_base_t *base) static inline void timer_set_deferrable(struct timer_list *timer) { timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | - TBASE_DEFERRABLE_FLAG)); + TBASE_DEFERRABLE_FLAG)); } static inline void timer_set_base(struct timer_list *timer, tvec_base_t *new_base) { timer->base = (tvec_base_t *)((unsigned long)(new_base) | - tbase_get_deferrable(timer->base)); + tbase_get_deferrable(timer->base)); } /** @@ -445,10 +445,10 @@ EXPORT_SYMBOL(__mod_timer); void add_timer_on(struct timer_list *timer, int cpu) { tvec_base_t *base = per_cpu(tvec_bases, cpu); - unsigned long flags; + unsigned long flags; timer_stats_timer_set_start_info(timer); - BUG_ON(timer_pending(timer) || !timer->function); + BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); internal_add_timer(base, timer); @@ -627,7 +627,7 @@ static inline void __run_timers(tvec_base_t *base) while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list; struct list_head *head = &work_list; - int index = base->timer_jiffies & TVR_MASK; + int index = base->timer_jiffies & TVR_MASK; /* * Cascade timers: @@ -644,8 +644,8 @@ static inline void __run_timers(tvec_base_t *base) unsigned long data; timer = list_first_entry(head, struct timer_list,entry); - fn = timer->function; - data = timer->data; + fn = timer->function; + data = timer->data; 
timer_stats_account_timer(timer); @@ -689,8 +689,8 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base) index = slot = timer_jiffies & TVR_MASK; do { list_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) - continue; + if (tbase_get_deferrable(nte->base)) + continue; found = 1; expires = nte->expires; @@ -834,7 +834,7 @@ void update_process_times(int user_tick) if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); scheduler_tick(); - run_posix_cpu_timers(p); + run_posix_cpu_timers(p); } /* @@ -909,7 +909,7 @@ static inline void update_times(unsigned long ticks) update_wall_time(); calc_load(ticks); } - + /* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. @@ -1105,7 +1105,7 @@ asmlinkage long sys_gettid(void) /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill - */ + */ int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; -- cgit v1.2.3-70-g09d2 From 5992b6dac0d23a2b51a1ccbaf8f1a2e62097b12b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Jul 2007 01:49:21 -0700 Subject: lguest: export symbols for lguest as a module lguest does some fairly lowlevel things to support a host, which normal modules don't need: math_state_restore: When the guest triggers a Device Not Available fault, we need to be able to restore the FPU __put_task_struct: We need to hold a reference to another task for inter-guest I/O, and put_task_struct() is an inline function which calls __put_task_struct. access_process_vm: We need to access another task for inter-guest I/O. map_vm_area & __get_vm_area: We need to map the switcher shim (ie. monitor) at 0xFFC01000. Signed-off-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/traps.c | 1 + kernel/fork.c | 1 + mm/memory.c | 1 + mm/vmalloc.c | 2 ++ 4 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index d32fd4b6f78..109ebbcde58 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -1056,6 +1056,7 @@ asmlinkage void math_state_restore(void) thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } +EXPORT_SYMBOL_GPL(math_state_restore); #ifndef CONFIG_MATH_EMULATION diff --git a/kernel/fork.c b/kernel/fork.c index 46983899822..e7a2d995b08 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -127,6 +127,7 @@ void __put_task_struct(struct task_struct *tsk) if (!profile_handoff_task(tsk)) free_task(tsk); } +EXPORT_SYMBOL_GPL(__put_task_struct); void __init fork_init(unsigned long mempages) { diff --git a/mm/memory.c b/mm/memory.c index 50dd3d1f4d1..8aace3db3a5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2865,3 +2865,4 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in return buf - old_buf; } +EXPORT_SYMBOL_GPL(access_process_vm); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 213d5e5079f..3cee76a8c9f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -164,6 +164,7 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) flush_cache_vmap((unsigned long) area->addr, end); return err; } +EXPORT_SYMBOL_GPL(map_vm_area); static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, @@ -242,6 +243,7 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, { return __get_vm_area_node(size, flags, start, end, -1, 
GFP_KERNEL); } +EXPORT_SYMBOL_GPL(__get_vm_area); /** * get_vm_area - reserve a contingous kernel virtual area -- cgit v1.2.3-70-g09d2 From d7e28ffe6c74416b54345d6004fd0964c115b12c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Jul 2007 01:49:23 -0700 Subject: lguest: the host code This is the code for the "lg.ko" module, which allows lguest guests to be launched. [akpm@linux-foundation.org: update for futex-new-private-futexes] [akpm@linux-foundation.org: build fix] [jmorris@namei.org: lguest: use hrtimers] [akpm@linux-foundation.org: x86_64 build fix] Signed-off-by: Rusty Russell Cc: Andi Kleen Cc: Eric Dumazet Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/tsc.c | 4 +- arch/x86_64/kernel/tsc.c | 2 +- drivers/lguest/core.c | 462 ++++++++++++++++++++++++++++++++++ drivers/lguest/hypercalls.c | 192 ++++++++++++++ drivers/lguest/interrupts_and_traps.c | 268 ++++++++++++++++++++ drivers/lguest/io.c | 399 +++++++++++++++++++++++++++++ drivers/lguest/lg.h | 261 +++++++++++++++++++ drivers/lguest/lguest.c | 125 +++++++-- drivers/lguest/lguest_asm.S | 5 +- drivers/lguest/lguest_user.c | 236 +++++++++++++++++ drivers/lguest/page_tables.c | 411 ++++++++++++++++++++++++++++++ drivers/lguest/segments.c | 125 +++++++++ drivers/lguest/switcher.S | 159 ++++++++++++ include/asm-i386/tsc.h | 1 + include/linux/lguest.h | 12 +- include/linux/lguest_launcher.h | 73 ++++++ kernel/fork.c | 1 - 17 files changed, 2702 insertions(+), 34 deletions(-) create mode 100644 drivers/lguest/core.c create mode 100644 drivers/lguest/hypercalls.c create mode 100644 drivers/lguest/interrupts_and_traps.c create mode 100644 drivers/lguest/io.c create mode 100644 drivers/lguest/lg.h create mode 100644 drivers/lguest/lguest_user.c create mode 100644 drivers/lguest/page_tables.c create mode 100644 drivers/lguest/segments.c create mode 100644 drivers/lguest/switcher.S create mode 100644 include/linux/lguest_launcher.h (limited to 'kernel') diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 252f9010f28..debd7dbb415 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -27,6 +27,7 @@ static int tsc_enabled; * an extra value to store the TSC freq */ unsigned int tsc_khz; +EXPORT_SYMBOL_GPL(tsc_khz); int tsc_disable; @@ -58,10 +59,11 @@ __setup("notsc", tsc_setup); */ static int tsc_unstable; -static inline int check_tsc_unstable(void) +int check_tsc_unstable(void) { return tsc_unstable; } +EXPORT_SYMBOL_GPL(check_tsc_unstable); /* Accellerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c index 48f9a8e6aa9..e850aa01e1b 100644 --- a/arch/x86_64/kernel/tsc.c +++ b/arch/x86_64/kernel/tsc.c @@ -44,7 +44,7 @@ unsigned long long sched_clock(void) static int tsc_unstable; -static inline int check_tsc_unstable(void) +inline int check_tsc_unstable(void) { return tsc_unstable; } diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c new file mode 100644 index 00000000000..ce909ec5749 --- /dev/null +++ b/drivers/lguest/core.c @@ -0,0 +1,462 @@ +/* World's simplest hypervisor, to test paravirt_ops and show + * unbelievers that virtualization is the future. Plus, it's fun! 
*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lg.h" + +/* Found in switcher.S */ +extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; +extern unsigned long default_idt_entries[]; + +/* Every guest maps the core switcher code. */ +#define SHARED_SWITCHER_PAGES \ + DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) +/* Pages for switcher itself, then two pages per cpu */ +#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS) + +/* We map at -4M for ease of mapping into the guest (one PTE page). */ +#define SWITCHER_ADDR 0xFFC00000 + +static struct vm_struct *switcher_vma; +static struct page **switcher_page; + +static int cpu_had_pge; +static struct { + unsigned long offset; + unsigned short segment; +} lguest_entry; + +/* This One Big lock protects all inter-guest data structures. */ +DEFINE_MUTEX(lguest_lock); +static DEFINE_PER_CPU(struct lguest *, last_guest); + +/* FIXME: Make dynamic. */ +#define MAX_LGUEST_GUESTS 16 +struct lguest lguests[MAX_LGUEST_GUESTS]; + +/* Offset from where switcher.S was compiled to where we've copied it */ +static unsigned long switcher_offset(void) +{ + return SWITCHER_ADDR - (unsigned long)start_switcher_text; +} + +/* This cpu's struct lguest_pages. */ +static struct lguest_pages *lguest_pages(unsigned int cpu) +{ + return &(((struct lguest_pages *) + (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); +} + +static __init int map_switcher(void) +{ + int i, err; + struct page **pagep; + + switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, + GFP_KERNEL); + if (!switcher_page) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { + unsigned long addr = get_zeroed_page(GFP_KERNEL); + if (!addr) { + err = -ENOMEM; + goto free_some_pages; + } + switcher_page[i] = virt_to_page(addr); + } + + switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, + VM_ALLOC, SWITCHER_ADDR, VMALLOC_END); + if (!switcher_vma) { + err = -ENOMEM; + printk("lguest: could not map switcher pages high\n"); + goto free_pages; + } + + pagep = switcher_page; + err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep); + if (err) { + printk("lguest: map_vm_area failed: %i\n", err); + goto free_vma; + } + memcpy(switcher_vma->addr, start_switcher_text, + end_switcher_text - start_switcher_text); + + /* Fix up IDT entries to point into copied text. */ + for (i = 0; i < IDT_ENTRIES; i++) + default_idt_entries[i] += switcher_offset(); + + for_each_possible_cpu(i) { + struct lguest_pages *pages = lguest_pages(i); + struct lguest_ro_state *state = &pages->state; + + /* These fields are static: rest done in copy_in_guest_info */ + state->host_gdt_desc.size = GDT_SIZE-1; + state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); + store_idt(&state->host_idt_desc); + state->guest_idt_desc.size = sizeof(state->guest_idt)-1; + state->guest_idt_desc.address = (long)&state->guest_idt; + state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; + state->guest_gdt_desc.address = (long)&state->guest_gdt; + state->guest_tss.esp0 = (long)(&pages->regs + 1); + state->guest_tss.ss0 = LGUEST_DS; + /* No I/O for you! 
*/ + state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); + setup_default_gdt_entries(state); + setup_default_idt_entries(state, default_idt_entries); + + /* Setup LGUEST segments on all cpus */ + get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; + get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; + } + + /* Initialize entry point into switcher. */ + lguest_entry.offset = (long)switch_to_guest + switcher_offset(); + lguest_entry.segment = LGUEST_CS; + + printk(KERN_INFO "lguest: mapped switcher at %p\n", + switcher_vma->addr); + return 0; + +free_vma: + vunmap(switcher_vma->addr); +free_pages: + i = TOTAL_SWITCHER_PAGES; +free_some_pages: + for (--i; i >= 0; i--) + __free_pages(switcher_page[i], 0); + kfree(switcher_page); +out: + return err; +} + +static void unmap_switcher(void) +{ + unsigned int i; + + vunmap(switcher_vma->addr); + for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) + __free_pages(switcher_page[i], 0); +} + +/* IN/OUT insns: enough to get us past boot-time probing. */ +static int emulate_insn(struct lguest *lg) +{ + u8 insn; + unsigned int insnlen = 0, in = 0, shift = 0; + unsigned long physaddr = guest_pa(lg, lg->regs->eip); + + /* This only works for addresses in linear mapping... */ + if (lg->regs->eip < lg->page_offset) + return 0; + lgread(lg, &insn, physaddr, 1); + + /* Operand size prefix means it's actually for ax. */ + if (insn == 0x66) { + shift = 16; + insnlen = 1; + lgread(lg, &insn, physaddr + insnlen, 1); + } + + switch (insn & 0xFE) { + case 0xE4: /* in ,%al */ + insnlen += 2; + in = 1; + break; + case 0xEC: /* in (%dx),%al */ + insnlen += 1; + in = 1; + break; + case 0xE6: /* out %al, */ + insnlen += 2; + break; + case 0xEE: /* out %al,(%dx) */ + insnlen += 1; + break; + default: + return 0; + } + + if (in) { + /* Lower bit tells is whether it's a 16 or 32 bit access */ + if (insn & 0x1) + lg->regs->eax = 0xFFFFFFFF; + else + lg->regs->eax |= (0xFFFF << shift); + } + lg->regs->eip += insnlen; + return 1; +} + +int lguest_address_ok(const struct lguest *lg, + unsigned long addr, unsigned long len) +{ + return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); +} + +/* Just like get_user, but don't let guest access lguest binary. */ +u32 lgread_u32(struct lguest *lg, unsigned long addr) +{ + u32 val = 0; + + /* Don't let them access lguest binary */ + if (!lguest_address_ok(lg, addr, sizeof(val)) + || get_user(val, (u32 __user *)addr) != 0) + kill_guest(lg, "bad read address %#lx", addr); + return val; +} + +void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val) +{ + if (!lguest_address_ok(lg, addr, sizeof(val)) + || put_user(val, (u32 __user *)addr) != 0) + kill_guest(lg, "bad write address %#lx", addr); +} + +void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) +{ + if (!lguest_address_ok(lg, addr, bytes) + || copy_from_user(b, (void __user *)addr, bytes) != 0) { + /* copy_from_user should do this, but as we rely on it... 
*/ + memset(b, 0, bytes); + kill_guest(lg, "bad read address %#lx len %u", addr, bytes); + } +} + +void lgwrite(struct lguest *lg, unsigned long addr, const void *b, + unsigned bytes) +{ + if (!lguest_address_ok(lg, addr, bytes) + || copy_to_user((void __user *)addr, b, bytes) != 0) + kill_guest(lg, "bad write address %#lx len %u", addr, bytes); +} + +static void set_ts(void) +{ + u32 cr0; + + cr0 = read_cr0(); + if (!(cr0 & 8)) + write_cr0(cr0|8); +} + +static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) +{ + if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { + __get_cpu_var(last_guest) = lg; + lg->last_pages = pages; + lg->changed = CHANGED_ALL; + } + + /* These are pretty cheap, so we do them unconditionally. */ + pages->state.host_cr3 = __pa(current->mm->pgd); + map_switcher_in_guest(lg, pages); + pages->state.guest_tss.esp1 = lg->esp1; + pages->state.guest_tss.ss1 = lg->ss1; + + /* Copy direct trap entries. */ + if (lg->changed & CHANGED_IDT) + copy_traps(lg, pages->state.guest_idt, default_idt_entries); + + /* Copy all GDT entries but the TSS. */ + if (lg->changed & CHANGED_GDT) + copy_gdt(lg, pages->state.guest_gdt); + /* If only the TLS entries have changed, copy them. */ + else if (lg->changed & CHANGED_GDT_TLS) + copy_gdt_tls(lg, pages->state.guest_gdt); + + lg->changed = 0; +} + +static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) +{ + unsigned int clobber; + + copy_in_guest_info(lg, pages); + + /* Put eflags on stack, lcall does rest: suitable for iret return. */ + asm volatile("pushf; lcall *lguest_entry" + : "=a"(clobber), "=b"(clobber) + : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) + : "memory", "%edx", "%ecx", "%edi", "%esi"); +} + +int run_guest(struct lguest *lg, unsigned long __user *user) +{ + while (!lg->dead) { + unsigned int cr2 = 0; /* Damn gcc */ + + /* Hypercalls first: we might have been out to userspace */ + do_hypercalls(lg); + if (lg->dma_is_pending) { + if (put_user(lg->pending_dma, user) || + put_user(lg->pending_key, user+1)) + return -EFAULT; + return sizeof(unsigned long)*2; + } + + if (signal_pending(current)) + return -ERESTARTSYS; + + /* If Waker set break_out, return to Launcher. */ + if (lg->break_out) + return -EAGAIN; + + maybe_do_interrupt(lg); + + try_to_freeze(); + + if (lg->dead) + break; + + if (lg->halted) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + continue; + } + + local_irq_disable(); + + /* Even if *we* don't want FPU trap, guest might... */ + if (lg->ts) + set_ts(); + + /* Don't let Guest do SYSENTER: we can't handle it. */ + if (boot_cpu_has(X86_FEATURE_SEP)) + wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); + + run_guest_once(lg, lguest_pages(raw_smp_processor_id())); + + /* Save cr2 now if we page-faulted. */ + if (lg->regs->trapnum == 14) + cr2 = read_cr2(); + else if (lg->regs->trapnum == 7) + math_state_restore(); + + if (boot_cpu_has(X86_FEATURE_SEP)) + wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); + local_irq_enable(); + + switch (lg->regs->trapnum) { + case 13: /* We've intercepted a GPF. */ + if (lg->regs->errcode == 0) { + if (emulate_insn(lg)) + continue; + } + break; + case 14: /* We've intercepted a page fault. */ + if (demand_page(lg, cr2, lg->regs->errcode)) + continue; + + /* If lguest_data is NULL, this won't hurt. */ + if (put_user(cr2, &lg->lguest_data->cr2)) + kill_guest(lg, "Writing cr2"); + break; + case 7: /* We've intercepted a Device Not Available fault. */ + /* If they don't want to know, just absorb it. 
*/ + if (!lg->ts) + continue; + break; + case 32 ... 255: /* Real interrupt, fall thru */ + cond_resched(); + case LGUEST_TRAP_ENTRY: /* Handled at top of loop */ + continue; + } + + if (deliver_trap(lg, lg->regs->trapnum)) + continue; + + kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", + lg->regs->trapnum, lg->regs->eip, + lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode); + } + return -ENOENT; +} + +int find_free_guest(void) +{ + unsigned int i; + for (i = 0; i < MAX_LGUEST_GUESTS; i++) + if (!lguests[i].tsk) + return i; + return -1; +} + +static void adjust_pge(void *on) +{ + if (on) + write_cr4(read_cr4() | X86_CR4_PGE); + else + write_cr4(read_cr4() & ~X86_CR4_PGE); +} + +static int __init init(void) +{ + int err; + + if (paravirt_enabled()) { + printk("lguest is afraid of %s\n", paravirt_ops.name); + return -EPERM; + } + + err = map_switcher(); + if (err) + return err; + + err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); + if (err) { + unmap_switcher(); + return err; + } + lguest_io_init(); + + err = lguest_device_init(); + if (err) { + free_pagetables(); + unmap_switcher(); + return err; + } + lock_cpu_hotplug(); + if (cpu_has_pge) { /* We have a broader idea of "global". */ + cpu_had_pge = 1; + on_each_cpu(adjust_pge, (void *)0, 0, 1); + clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + } + unlock_cpu_hotplug(); + return 0; +} + +static void __exit fini(void) +{ + lguest_device_remove(); + free_pagetables(); + unmap_switcher(); + lock_cpu_hotplug(); + if (cpu_had_pge) { + set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + on_each_cpu(adjust_pge, (void *)1, 0, 1); + } + unlock_cpu_hotplug(); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell "); diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c new file mode 100644 index 00000000000..ea52ca451f7 --- /dev/null +++ b/drivers/lguest/hypercalls.c @@ -0,0 +1,192 @@ +/* Actual hypercalls, which allow guests to actually do something. + Copyright (C) 2006 Rusty Russell IBM Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include +#include +#include +#include +#include +#include +#include "lg.h" + +static void do_hcall(struct lguest *lg, struct lguest_regs *regs) +{ + switch (regs->eax) { + case LHCALL_FLUSH_ASYNC: + break; + case LHCALL_LGUEST_INIT: + kill_guest(lg, "already have lguest_data"); + break; + case LHCALL_CRASH: { + char msg[128]; + lgread(lg, msg, regs->edx, sizeof(msg)); + msg[sizeof(msg)-1] = '\0'; + kill_guest(lg, "CRASH: %s", msg); + break; + } + case LHCALL_FLUSH_TLB: + if (regs->edx) + guest_pagetable_clear_all(lg); + else + guest_pagetable_flush_user(lg); + break; + case LHCALL_GET_WALLCLOCK: { + struct timespec ts; + ktime_get_real_ts(&ts); + regs->eax = ts.tv_sec; + break; + } + case LHCALL_BIND_DMA: + regs->eax = bind_dma(lg, regs->edx, regs->ebx, + regs->ecx >> 8, regs->ecx & 0xFF); + break; + case LHCALL_SEND_DMA: + send_dma(lg, regs->edx, regs->ebx); + break; + case LHCALL_LOAD_GDT: + load_guest_gdt(lg, regs->edx, regs->ebx); + break; + case LHCALL_LOAD_IDT_ENTRY: + load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx); + break; + case LHCALL_NEW_PGTABLE: + guest_new_pagetable(lg, regs->edx); + break; + case LHCALL_SET_STACK: + guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx); + break; + case LHCALL_SET_PTE: + guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx)); + break; + case LHCALL_SET_PMD: + guest_set_pmd(lg, regs->edx, regs->ebx); + break; + case LHCALL_LOAD_TLS: + guest_load_tls(lg, regs->edx); + break; + case LHCALL_SET_CLOCKEVENT: + guest_set_clockevent(lg, regs->edx); + break; + case LHCALL_TS: + lg->ts = regs->edx; + break; + case LHCALL_HALT: + lg->halted = 1; + break; + default: + kill_guest(lg, "Bad hypercall %li\n", regs->eax); + } +} + +/* We always do queued calls before actual hypercall. */ +static void do_async_hcalls(struct lguest *lg) +{ + unsigned int i; + u8 st[LHCALL_RING_SIZE]; + + if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) + return; + + for (i = 0; i < ARRAY_SIZE(st); i++) { + struct lguest_regs regs; + unsigned int n = lg->next_hcall; + + if (st[n] == 0xFF) + break; + + if (++lg->next_hcall == LHCALL_RING_SIZE) + lg->next_hcall = 0; + + if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax) + || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx) + || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx) + || get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx)) { + kill_guest(lg, "Fetching async hypercalls"); + break; + } + + do_hcall(lg, ®s); + if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { + kill_guest(lg, "Writing result for async hypercall"); + break; + } + + if (lg->dma_is_pending) + break; + } +} + +static void initialize(struct lguest *lg) +{ + u32 tsc_speed; + + if (lg->regs->eax != LHCALL_LGUEST_INIT) { + kill_guest(lg, "hypercall %li before LGUEST_INIT", + lg->regs->eax); + return; + } + + /* We only tell the guest to use the TSC if it's reliable. 
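+	 * A tsc_khz of 0 makes the Guest fall back to its jiffies-based
+	 * clocksource (see lguest_clock_read() in lguest.c).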
*/ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) + tsc_speed = tsc_khz; + else + tsc_speed = 0; + + lg->lguest_data = (struct lguest_data __user *)lg->regs->edx; + /* We check here so we can simply copy_to_user/from_user */ + if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) { + kill_guest(lg, "bad guest page %p", lg->lguest_data); + return; + } + if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) + || get_user(lg->noirq_end, &lg->lguest_data->noirq_end) + /* We reserve the top pgd entry. */ + || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) + || put_user(tsc_speed, &lg->lguest_data->tsc_khz) + || put_user(lg->guestid, &lg->lguest_data->guestid)) + kill_guest(lg, "bad guest page %p", lg->lguest_data); + + /* This is the one case where the above accesses might have + * been the first write to a Guest page. This may have caused + * a copy-on-write fault, but the Guest might be referring to + * the old (read-only) page. */ + guest_pagetable_clear_all(lg); +} + +/* Even if we go out to userspace and come back, we don't want to do + * the hypercall again. */ +static void clear_hcall(struct lguest *lg) +{ + lg->regs->trapnum = 255; +} + +void do_hypercalls(struct lguest *lg) +{ + if (unlikely(!lg->lguest_data)) { + if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) { + initialize(lg); + clear_hcall(lg); + } + return; + } + + do_async_hcalls(lg); + if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) { + do_hcall(lg, lg->regs); + clear_hcall(lg); + } +} diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c new file mode 100644 index 00000000000..d9de5bbc613 --- /dev/null +++ b/drivers/lguest/interrupts_and_traps.c @@ -0,0 +1,268 @@ +#include +#include "lg.h" + +static unsigned long idt_address(u32 lo, u32 hi) +{ + return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); +} + +static int idt_type(u32 lo, u32 hi) +{ + return (hi >> 8) & 0xF; +} + +static int idt_present(u32 lo, u32 hi) +{ + return (hi & 0x8000); +} + +static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) +{ + *gstack -= 4; + lgwrite_u32(lg, *gstack, val); +} + +static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) +{ + unsigned long gstack; + u32 eflags, ss, irq_enable; + + /* If they want a ring change, we use new stack and push old ss/esp */ + if ((lg->regs->ss&0x3) != GUEST_PL) { + gstack = guest_pa(lg, lg->esp1); + ss = lg->ss1; + push_guest_stack(lg, &gstack, lg->regs->ss); + push_guest_stack(lg, &gstack, lg->regs->esp); + } else { + gstack = guest_pa(lg, lg->regs->esp); + ss = lg->regs->ss; + } + + /* We use IF bit in eflags to indicate whether irqs were disabled + (it's always 0, since irqs are enabled when guest is running). */ + eflags = lg->regs->eflags; + if (get_user(irq_enable, &lg->lguest_data->irq_enabled)) + irq_enable = 0; + eflags |= (irq_enable & X86_EFLAGS_IF); + + push_guest_stack(lg, &gstack, eflags); + push_guest_stack(lg, &gstack, lg->regs->cs); + push_guest_stack(lg, &gstack, lg->regs->eip); + + if (has_err) + push_guest_stack(lg, &gstack, lg->regs->errcode); + + /* Change the real stack so switcher returns to trap handler */ + lg->regs->ss = ss; + lg->regs->esp = gstack + lg->page_offset; + lg->regs->cs = (__KERNEL_CS|GUEST_PL); + lg->regs->eip = idt_address(lo, hi); + + /* Disable interrupts for an interrupt gate. 
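+	 * (type 0xE is an interrupt gate, 0xF a trap gate: real hardware
+	 * clears IF for the former, so we clear the Guest's virtual
+	 * lguest_data->irq_enabled flag to match)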
*/ + if (idt_type(lo, hi) == 0xE) + if (put_user(0, &lg->lguest_data->irq_enabled)) + kill_guest(lg, "Disabling interrupts"); +} + +void maybe_do_interrupt(struct lguest *lg) +{ + unsigned int irq; + DECLARE_BITMAP(blk, LGUEST_IRQS); + struct desc_struct *idt; + + if (!lg->lguest_data) + return; + + /* Mask out any interrupts they have blocked. */ + if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts, + sizeof(blk))) + return; + + bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); + + irq = find_first_bit(blk, LGUEST_IRQS); + if (irq >= LGUEST_IRQS) + return; + + if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) + return; + + /* If they're halted, we re-enable interrupts. */ + if (lg->halted) { + /* Re-enable interrupts. */ + if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled)) + kill_guest(lg, "Re-enabling interrupts"); + lg->halted = 0; + } else { + /* Maybe they have interrupts disabled? */ + u32 irq_enabled; + if (get_user(irq_enabled, &lg->lguest_data->irq_enabled)) + irq_enabled = 0; + if (!irq_enabled) + return; + } + + idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq]; + if (idt_present(idt->a, idt->b)) { + clear_bit(irq, lg->irqs_pending); + set_guest_interrupt(lg, idt->a, idt->b, 0); + } +} + +static int has_err(unsigned int trap) +{ + return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); +} + +int deliver_trap(struct lguest *lg, unsigned int num) +{ + u32 lo = lg->idt[num].a, hi = lg->idt[num].b; + + if (!idt_present(lo, hi)) + return 0; + set_guest_interrupt(lg, lo, hi, has_err(num)); + return 1; +} + +static int direct_trap(const struct lguest *lg, + const struct desc_struct *trap, + unsigned int num) +{ + /* Hardware interrupts don't go to guest (except syscall). */ + if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR) + return 0; + + /* We intercept page fault (demand shadow paging & cr2 saving) + protection fault (in/out emulation) and device not + available (TS handling), and hypercall */ + if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY) + return 0; + + /* Interrupt gates (0xE) or not present (0x0) can't go direct. */ + return idt_type(trap->a, trap->b) == 0xF; +} + +void pin_stack_pages(struct lguest *lg) +{ + unsigned int i; + + for (i = 0; i < lg->stack_pages; i++) + pin_page(lg, lg->esp1 - i * PAGE_SIZE); +} + +void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) +{ + /* You cannot have a stack segment with priv level 0. */ + if ((seg & 0x3) != GUEST_PL) + kill_guest(lg, "bad stack segment %i", seg); + if (pages > 2) + kill_guest(lg, "bad stack pages %u", pages); + lg->ss1 = seg; + lg->esp1 = esp; + lg->stack_pages = pages; + pin_stack_pages(lg); +} + +/* Set up trap in IDT. */ +static void set_trap(struct lguest *lg, struct desc_struct *trap, + unsigned int num, u32 lo, u32 hi) +{ + u8 type = idt_type(lo, hi); + + if (!idt_present(lo, hi)) { + trap->a = trap->b = 0; + return; + } + + if (type != 0xE && type != 0xF) + kill_guest(lg, "bad IDT type %i", type); + + trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); + trap->b = (hi&0xFFFFEF00); +} + +void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) +{ + /* Guest never handles: NMI, doublefault, hypercall, spurious irq. 
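+	 * (vectors 2, 8 and 15 respectively; attempts to install handlers
+	 * for them are silently ignored)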
*/ + if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) + return; + + lg->changed |= CHANGED_IDT; + if (num < ARRAY_SIZE(lg->idt)) + set_trap(lg, &lg->idt[num], num, lo, hi); + else if (num == SYSCALL_VECTOR) + set_trap(lg, &lg->syscall_idt, num, lo, hi); +} + +static void default_idt_entry(struct desc_struct *idt, + int trap, + const unsigned long handler) +{ + u32 flags = 0x8e00; + + /* They can't "int" into any of them except hypercall. */ + if (trap == LGUEST_TRAP_ENTRY) + flags |= (GUEST_PL << 13); + + idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); + idt->b = (handler&0xFFFF0000) | flags; +} + +void setup_default_idt_entries(struct lguest_ro_state *state, + const unsigned long *def) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++) + default_idt_entry(&state->guest_idt[i], i, def[i]); +} + +void copy_traps(const struct lguest *lg, struct desc_struct *idt, + const unsigned long *def) +{ + unsigned int i; + + /* All hardware interrupts are same whatever the guest: only the + * traps might be different. */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) { + if (direct_trap(lg, &lg->idt[i], i)) + idt[i] = lg->idt[i]; + else + default_idt_entry(&idt[i], i, def[i]); + } + i = SYSCALL_VECTOR; + if (direct_trap(lg, &lg->syscall_idt, i)) + idt[i] = lg->syscall_idt; + else + default_idt_entry(&idt[i], i, def[i]); +} + +void guest_set_clockevent(struct lguest *lg, unsigned long delta) +{ + ktime_t expires; + + if (unlikely(delta == 0)) { + /* Clock event device is shutting down. */ + hrtimer_cancel(&lg->hrt); + return; + } + + expires = ktime_add_ns(ktime_get_real(), delta); + hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS); +} + +static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) +{ + struct lguest *lg = container_of(timer, struct lguest, hrt); + + set_bit(0, lg->irqs_pending); + if (lg->halted) + wake_up_process(lg->tsk); + return HRTIMER_NORESTART; +} + +void init_clockdev(struct lguest *lg) +{ + hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); + lg->hrt.function = clockdev_fn; +} diff --git a/drivers/lguest/io.c b/drivers/lguest/io.c new file mode 100644 index 00000000000..06bdba2337e --- /dev/null +++ b/drivers/lguest/io.c @@ -0,0 +1,399 @@ +/* Simple I/O model for guests, based on shared memory. + * Copyright (C) 2006 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include "lg.h" + +static struct list_head dma_hash[61]; + +void lguest_io_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(dma_hash); i++) + INIT_LIST_HEAD(&dma_hash[i]); +} + +/* FIXME: allow multi-page lengths. 
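+ * For now every section of a struct lguest_dma must lie within a single
+ * page: check_dma_list() below kills the Guest otherwise.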
*/ +static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma) +{ + unsigned int i; + + for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { + if (!dma->len[i]) + return 1; + if (!lguest_address_ok(lg, dma->addr[i], dma->len[i])) + goto kill; + if (dma->len[i] > PAGE_SIZE) + goto kill; + /* We could do over a page, but is it worth it? */ + if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE) + goto kill; + } + return 1; + +kill: + kill_guest(lg, "bad DMA entry: %u@%#lx", dma->len[i], dma->addr[i]); + return 0; +} + +static unsigned int hash(const union futex_key *key) +{ + return jhash2((u32*)&key->both.word, + (sizeof(key->both.word)+sizeof(key->both.ptr))/4, + key->both.offset) + % ARRAY_SIZE(dma_hash); +} + +static inline int key_eq(const union futex_key *a, const union futex_key *b) +{ + return (a->both.word == b->both.word + && a->both.ptr == b->both.ptr + && a->both.offset == b->both.offset); +} + +/* Must hold read lock on dmainfo owner's current->mm->mmap_sem */ +static void unlink_dma(struct lguest_dma_info *dmainfo) +{ + BUG_ON(!mutex_is_locked(&lguest_lock)); + dmainfo->interrupt = 0; + list_del(&dmainfo->list); + drop_futex_key_refs(&dmainfo->key); +} + +static int unbind_dma(struct lguest *lg, + const union futex_key *key, + unsigned long dmas) +{ + int i, ret = 0; + + for (i = 0; i < LGUEST_MAX_DMA; i++) { + if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) { + unlink_dma(&lg->dma[i]); + ret = 1; + break; + } + } + return ret; +} + +int bind_dma(struct lguest *lg, + unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt) +{ + unsigned int i; + int ret = 0; + union futex_key key; + struct rw_semaphore *fshared = ¤t->mm->mmap_sem; + + if (interrupt >= LGUEST_IRQS) + return 0; + + mutex_lock(&lguest_lock); + down_read(fshared); + if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { + kill_guest(lg, "bad dma key %#lx", ukey); + goto unlock; + } + get_futex_key_refs(&key); + + if (interrupt == 0) + ret = unbind_dma(lg, &key, dmas); + else { + for (i = 0; i < LGUEST_MAX_DMA; i++) { + if (lg->dma[i].interrupt) + continue; + + lg->dma[i].dmas = dmas; + lg->dma[i].num_dmas = numdmas; + lg->dma[i].next_dma = 0; + lg->dma[i].key = key; + lg->dma[i].guestid = lg->guestid; + lg->dma[i].interrupt = interrupt; + list_add(&lg->dma[i].list, &dma_hash[hash(&key)]); + ret = 1; + goto unlock; + } + } + drop_futex_key_refs(&key); +unlock: + up_read(fshared); + mutex_unlock(&lguest_lock); + return ret; +} + +/* lgread from another guest */ +static int lgread_other(struct lguest *lg, + void *buf, u32 addr, unsigned bytes) +{ + if (!lguest_address_ok(lg, addr, bytes) + || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) { + memset(buf, 0, bytes); + kill_guest(lg, "bad address in registered DMA struct"); + return 0; + } + return 1; +} + +/* lgwrite to another guest */ +static int lgwrite_other(struct lguest *lg, u32 addr, + const void *buf, unsigned bytes) +{ + if (!lguest_address_ok(lg, addr, bytes) + || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1) + != bytes)) { + kill_guest(lg, "bad address writing to registered DMA"); + return 0; + } + return 1; +} + +static u32 copy_data(struct lguest *srclg, + const struct lguest_dma *src, + const struct lguest_dma *dst, + struct page *pages[]) +{ + unsigned int totlen, si, di, srcoff, dstoff; + void *maddr = NULL; + + totlen = 0; + si = di = 0; + srcoff = dstoff = 0; + while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si] + && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) { + u32 len = 
min(src->len[si] - srcoff, dst->len[di] - dstoff); + + if (!maddr) + maddr = kmap(pages[di]); + + /* FIXME: This is not completely portable, since + archs do different things for copy_to_user_page. */ + if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE, + (void *__user)src->addr[si], len) != 0) { + kill_guest(srclg, "bad address in sending DMA"); + totlen = 0; + break; + } + + totlen += len; + srcoff += len; + dstoff += len; + if (srcoff == src->len[si]) { + si++; + srcoff = 0; + } + if (dstoff == dst->len[di]) { + kunmap(pages[di]); + maddr = NULL; + di++; + dstoff = 0; + } + } + + if (maddr) + kunmap(pages[di]); + + return totlen; +} + +/* Src is us, ie. current. */ +static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, + struct lguest *dstlg, const struct lguest_dma *dst) +{ + int i; + u32 ret; + struct page *pages[LGUEST_MAX_DMA_SECTIONS]; + + if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src)) + return 0; + + /* First get the destination pages */ + for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { + if (dst->len[i] == 0) + break; + if (get_user_pages(dstlg->tsk, dstlg->mm, + dst->addr[i], 1, 1, 1, pages+i, NULL) + != 1) { + kill_guest(dstlg, "Error mapping DMA pages"); + ret = 0; + goto drop_pages; + } + } + + /* Now copy until we run out of src or dst. */ + ret = copy_data(srclg, src, dst, pages); + +drop_pages: + while (--i >= 0) + put_page(pages[i]); + return ret; +} + +static int dma_transfer(struct lguest *srclg, + unsigned long udma, + struct lguest_dma_info *dst) +{ + struct lguest_dma dst_dma, src_dma; + struct lguest *dstlg; + u32 i, dma = 0; + + dstlg = &lguests[dst->guestid]; + /* Get our dma list. */ + lgread(srclg, &src_dma, udma, sizeof(src_dma)); + + /* We can't deadlock against them dmaing to us, because this + * is all under the lguest_lock. */ + down_read(&dstlg->mm->mmap_sem); + + for (i = 0; i < dst->num_dmas; i++) { + dma = (dst->next_dma + i) % dst->num_dmas; + if (!lgread_other(dstlg, &dst_dma, + dst->dmas + dma * sizeof(struct lguest_dma), + sizeof(dst_dma))) { + goto fail; + } + if (!dst_dma.used_len) + break; + } + if (i != dst->num_dmas) { + unsigned long used_lenp; + unsigned int ret; + + ret = do_dma(srclg, &src_dma, dstlg, &dst_dma); + /* Put used length in src. */ + lgwrite_u32(srclg, + udma+offsetof(struct lguest_dma, used_len), ret); + if (ret == 0 && src_dma.len[0] != 0) + goto fail; + + /* Make sure destination sees contents before length. */ + wmb(); + used_lenp = dst->dmas + + dma * sizeof(struct lguest_dma) + + offsetof(struct lguest_dma, used_len); + lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret)); + dst->next_dma++; + } + up_read(&dstlg->mm->mmap_sem); + + /* Do this last so dst doesn't simply sleep on lock. */ + set_bit(dst->interrupt, dstlg->irqs_pending); + wake_up_process(dstlg->tsk); + return i == dst->num_dmas; + +fail: + up_read(&dstlg->mm->mmap_sem); + return 0; +} + +void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma) +{ + union futex_key key; + int empty = 0; + struct rw_semaphore *fshared = ¤t->mm->mmap_sem; + +again: + mutex_lock(&lguest_lock); + down_read(fshared); + if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { + kill_guest(lg, "bad sending DMA key"); + goto unlock; + } + /* Shared mapping? Look for other guests... 
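+	 * (get_futex_key() sets the low bit of the offset for inode-backed
+	 * keys, so a set bit means another Guest may have bound a buffer to
+	 * this key; a private mapping is handed to the Launcher instead)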
*/ + if (key.shared.offset & 1) { + struct lguest_dma_info *i; + list_for_each_entry(i, &dma_hash[hash(&key)], list) { + if (i->guestid == lg->guestid) + continue; + if (!key_eq(&key, &i->key)) + continue; + + empty += dma_transfer(lg, udma, i); + break; + } + if (empty == 1) { + /* Give any recipients one chance to restock. */ + up_read(¤t->mm->mmap_sem); + mutex_unlock(&lguest_lock); + empty++; + goto again; + } + } else { + /* Private mapping: tell our userspace. */ + lg->dma_is_pending = 1; + lg->pending_dma = udma; + lg->pending_key = ukey; + } +unlock: + up_read(fshared); + mutex_unlock(&lguest_lock); +} + +void release_all_dma(struct lguest *lg) +{ + unsigned int i; + + BUG_ON(!mutex_is_locked(&lguest_lock)); + + down_read(&lg->mm->mmap_sem); + for (i = 0; i < LGUEST_MAX_DMA; i++) { + if (lg->dma[i].interrupt) + unlink_dma(&lg->dma[i]); + } + up_read(&lg->mm->mmap_sem); +} + +/* Userspace wants a dma buffer from this guest. */ +unsigned long get_dma_buffer(struct lguest *lg, + unsigned long ukey, unsigned long *interrupt) +{ + unsigned long ret = 0; + union futex_key key; + struct lguest_dma_info *i; + struct rw_semaphore *fshared = ¤t->mm->mmap_sem; + + mutex_lock(&lguest_lock); + down_read(fshared); + if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { + kill_guest(lg, "bad registered DMA buffer"); + goto unlock; + } + list_for_each_entry(i, &dma_hash[hash(&key)], list) { + if (key_eq(&key, &i->key) && i->guestid == lg->guestid) { + unsigned int j; + for (j = 0; j < i->num_dmas; j++) { + struct lguest_dma dma; + + ret = i->dmas + j * sizeof(struct lguest_dma); + lgread(lg, &dma, ret, sizeof(dma)); + if (dma.used_len == 0) + break; + } + *interrupt = i->interrupt; + break; + } + } +unlock: + up_read(fshared); + mutex_unlock(&lguest_lock); + return ret; +} + diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h new file mode 100644 index 00000000000..3e2ddfbc816 --- /dev/null +++ b/drivers/lguest/lg.h @@ -0,0 +1,261 @@ +#ifndef _LGUEST_H +#define _LGUEST_H + +#include + +#define GDT_ENTRY_LGUEST_CS 10 +#define GDT_ENTRY_LGUEST_DS 11 +#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) +#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) + +#ifndef __ASSEMBLY__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "irq_vectors.h" + +#define GUEST_PL 1 + +struct lguest_regs +{ + /* Manually saved part. */ + unsigned long ebx, ecx, edx; + unsigned long esi, edi, ebp; + unsigned long gs; + unsigned long eax; + unsigned long fs, ds, es; + unsigned long trapnum, errcode; + /* Trap pushed part */ + unsigned long eip; + unsigned long cs; + unsigned long eflags; + unsigned long esp; + unsigned long ss; +}; + +void free_pagetables(void); +int init_pagetables(struct page **switcher_page, unsigned int pages); + +/* Full 4G segment descriptors, suitable for CS and DS. */ +#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) +#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) + +struct lguest_dma_info +{ + struct list_head list; + union futex_key key; + unsigned long dmas; + u16 next_dma; + u16 num_dmas; + u16 guestid; + u8 interrupt; /* 0 when not registered */ +}; + +/* We have separate types for the guest's ptes & pgds and the shadow ptes & + * pgds. Since this host might use three-level pagetables and the guest and + * shadow pagetables don't, we can't use the normal pte_t/pgd_t. 
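+ * Each is a plain 32-bit (non-PAE) x86 entry: the low 12 bits are flags and
+ * the top 20 bits the page frame number.  For example, a raw value of
+ * 0x001d4067 decodes as pfn 0x1d4 with PRESENT, RW, USER, ACCESSED and
+ * DIRTY set.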
*/ +typedef union { + struct { unsigned flags:12, pfn:20; }; + struct { unsigned long val; } raw; +} spgd_t; +typedef union { + struct { unsigned flags:12, pfn:20; }; + struct { unsigned long val; } raw; +} spte_t; +typedef union { + struct { unsigned flags:12, pfn:20; }; + struct { unsigned long val; } raw; +} gpgd_t; +typedef union { + struct { unsigned flags:12, pfn:20; }; + struct { unsigned long val; } raw; +} gpte_t; +#define mkgpte(_val) ((gpte_t){.raw.val = _val}) +#define mkgpgd(_val) ((gpgd_t){.raw.val = _val}) + +struct pgdir +{ + unsigned long cr3; + spgd_t *pgdir; +}; + +/* This is a guest-specific page (mapped ro) into the guest. */ +struct lguest_ro_state +{ + /* Host information we need to restore when we switch back. */ + u32 host_cr3; + struct Xgt_desc_struct host_idt_desc; + struct Xgt_desc_struct host_gdt_desc; + u32 host_sp; + + /* Fields which are used when guest is running. */ + struct Xgt_desc_struct guest_idt_desc; + struct Xgt_desc_struct guest_gdt_desc; + struct i386_hw_tss guest_tss; + struct desc_struct guest_idt[IDT_ENTRIES]; + struct desc_struct guest_gdt[GDT_ENTRIES]; +}; + +/* We have two pages shared with guests, per cpu. */ +struct lguest_pages +{ + /* This is the stack page mapped rw in guest */ + char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; + struct lguest_regs regs; + + /* This is the host state & guest descriptor page, ro in guest */ + struct lguest_ro_state state; +} __attribute__((aligned(PAGE_SIZE))); + +#define CHANGED_IDT 1 +#define CHANGED_GDT 2 +#define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */ +#define CHANGED_ALL 3 + +/* The private info the thread maintains about the guest. */ +struct lguest +{ + /* At end of a page shared mapped over lguest_pages in guest. */ + unsigned long regs_page; + struct lguest_regs *regs; + struct lguest_data __user *lguest_data; + struct task_struct *tsk; + struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ + u16 guestid; + u32 pfn_limit; + u32 page_offset; + u32 cr2; + int halted; + int ts; + u32 next_hcall; + u32 esp1; + u8 ss1; + + /* Do we need to stop what we're doing and return to userspace? */ + int break_out; + wait_queue_head_t break_wq; + + /* Bitmap of what has changed: see CHANGED_* above. */ + int changed; + struct lguest_pages *last_pages; + + /* We keep a small number of these. */ + u32 pgdidx; + struct pgdir pgdirs[4]; + + /* Cached wakeup: we hold a reference to this task. */ + struct task_struct *wake; + + unsigned long noirq_start, noirq_end; + int dma_is_pending; + unsigned long pending_dma; /* struct lguest_dma */ + unsigned long pending_key; /* address they're sending to */ + + unsigned int stack_pages; + u32 tsc_khz; + + struct lguest_dma_info dma[LGUEST_MAX_DMA]; + + /* Dead? */ + const char *dead; + + /* The GDT entries copied into lguest_ro_state when running. */ + struct desc_struct gdt[GDT_ENTRIES]; + + /* The IDT entries: some copied into lguest_ro_state when running. 
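+	 * Only entries approved by direct_trap() are copied verbatim by
+	 * copy_traps(); the rest get the default_idt_entries stubs so the
+	 * Switcher regains control.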
*/ + struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS]; + struct desc_struct syscall_idt; + + /* Virtual clock device */ + struct hrtimer hrt; + + /* Pending virtual interrupts */ + DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); +}; + +extern struct lguest lguests[]; +extern struct mutex lguest_lock; + +/* core.c: */ +u32 lgread_u32(struct lguest *lg, unsigned long addr); +void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val); +void lgread(struct lguest *lg, void *buf, unsigned long addr, unsigned len); +void lgwrite(struct lguest *lg, unsigned long, const void *buf, unsigned len); +int find_free_guest(void); +int lguest_address_ok(const struct lguest *lg, + unsigned long addr, unsigned long len); +int run_guest(struct lguest *lg, unsigned long __user *user); + + +/* interrupts_and_traps.c: */ +void maybe_do_interrupt(struct lguest *lg); +int deliver_trap(struct lguest *lg, unsigned int num); +void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi); +void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages); +void pin_stack_pages(struct lguest *lg); +void setup_default_idt_entries(struct lguest_ro_state *state, + const unsigned long *def); +void copy_traps(const struct lguest *lg, struct desc_struct *idt, + const unsigned long *def); +void guest_set_clockevent(struct lguest *lg, unsigned long delta); +void init_clockdev(struct lguest *lg); + +/* segments.c: */ +void setup_default_gdt_entries(struct lguest_ro_state *state); +void setup_guest_gdt(struct lguest *lg); +void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num); +void guest_load_tls(struct lguest *lg, unsigned long tls_array); +void copy_gdt(const struct lguest *lg, struct desc_struct *gdt); +void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt); + +/* page_tables.c: */ +int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); +void free_guest_pagetable(struct lguest *lg); +void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); +void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i); +void guest_pagetable_clear_all(struct lguest *lg); +void guest_pagetable_flush_user(struct lguest *lg); +void guest_set_pte(struct lguest *lg, unsigned long cr3, + unsigned long vaddr, gpte_t val); +void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); +int demand_page(struct lguest *info, unsigned long cr2, int errcode); +void pin_page(struct lguest *lg, unsigned long vaddr); + +/* lguest_user.c: */ +int lguest_device_init(void); +void lguest_device_remove(void); + +/* io.c: */ +void lguest_io_init(void); +int bind_dma(struct lguest *lg, + unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt); +void send_dma(struct lguest *info, unsigned long key, unsigned long udma); +void release_all_dma(struct lguest *lg); +unsigned long get_dma_buffer(struct lguest *lg, unsigned long key, + unsigned long *interrupt); + +/* hypercalls.c: */ +void do_hypercalls(struct lguest *lg); + +#define kill_guest(lg, fmt...) 
\ +do { \ + if (!(lg)->dead) { \ + (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \ + if (!(lg)->dead) \ + (lg)->dead = ERR_PTR(-ENOMEM); \ + } \ +} while(0) + +static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) +{ + return vaddr - lg->page_offset; +} +#endif /* __ASSEMBLY__ */ +#endif /* _LGUEST_H */ diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c index b3a72bd8d6f..b9a58b78c99 100644 --- a/drivers/lguest/lguest.c +++ b/drivers/lguest/lguest.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -37,6 +39,7 @@ #include #include #include +//#include /* Declarations for definitions in lguest_guest.S */ extern char lguest_noirq_start[], lguest_noirq_end[]; @@ -54,7 +57,6 @@ struct lguest_data lguest_data = { .blocked_interrupts = { 1 }, /* Block timer interrupts */ }; struct lguest_device_desc *lguest_devices; -static __initdata const struct lguest_boot_info *boot = __va(0); static enum paravirt_lazy_mode lazy_mode; static void lguest_lazy_mode(enum paravirt_lazy_mode mode) @@ -210,7 +212,7 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, case 1: /* Basic feature request. */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ *ecx &= 0x00002201; - /* Similarly: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ + /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ *edx &= 0x07808101; /* Host wants to know when we flush kernel pages: set PGE. */ *edx |= 0x00002000; @@ -346,24 +348,104 @@ static unsigned long lguest_get_wallclock(void) return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0); } +static cycle_t lguest_clock_read(void) +{ + if (lguest_data.tsc_khz) + return native_read_tsc(); + else + return jiffies; +} + +/* This is what we tell the kernel is our clocksource. */ +static struct clocksource lguest_clock = { + .name = "lguest", + .rating = 400, + .read = lguest_clock_read, +}; + +/* We also need a "struct clock_event_device": Linux asks us to set it to go + * off some time in the future. Actually, James Morris figured all this out, I + * just applied the patch. */ +static int lguest_clockevent_set_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + if (delta < LG_CLOCK_MIN_DELTA) { + if (printk_ratelimit()) + printk(KERN_DEBUG "%s: small delta %lu ns\n", + __FUNCTION__, delta); + return -ETIME; + } + hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0); + return 0; +} + +static void lguest_clockevent_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + switch (mode) { + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + /* A 0 argument shuts the clock down. */ + hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0); + break; + case CLOCK_EVT_MODE_ONESHOT: + /* This is what we expect. */ + break; + case CLOCK_EVT_MODE_PERIODIC: + BUG(); + } +} + +/* This describes our primitive timer chip. */ +static struct clock_event_device lguest_clockevent = { + .name = "lguest", + .features = CLOCK_EVT_FEAT_ONESHOT, + .set_next_event = lguest_clockevent_set_next_event, + .set_mode = lguest_clockevent_set_mode, + .rating = INT_MAX, + .mult = 1, + .shift = 0, + .min_delta_ns = LG_CLOCK_MIN_DELTA, + .max_delta_ns = LG_CLOCK_MAX_DELTA, +}; + +/* This is the Guest timer interrupt handler (hardware interrupt 0). We just + * call the clockevent infrastructure and it does whatever needs doing. 
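+ * The Host raises this interrupt when the hrtimer armed via
+ * LHCALL_SET_CLOCKEVENT fires: clockdev_fn() in the Host sets bit 0 of
+ * irqs_pending.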
*/ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) { - do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0)); - update_process_times(user_mode_vm(get_irq_regs())); + unsigned long flags; + + /* Don't interrupt us while this is running. */ + local_irq_save(flags); + lguest_clockevent.event_handler(&lguest_clockevent); + local_irq_restore(flags); } -static u64 sched_clock_base; static void lguest_time_init(void) { set_irq_handler(0, lguest_time_irq); - hcall(LHCALL_TIMER_READ, 0, 0, 0); - sched_clock_base = jiffies_64; - enable_lguest_irq(0); -} -static unsigned long long lguest_sched_clock(void) -{ - return (jiffies_64 - sched_clock_base) * (1000000000 / HZ); + /* We use the TSC if the Host tells us we can, otherwise a dumb + * jiffies-based clock. */ + if (lguest_data.tsc_khz) { + lguest_clock.shift = 22; + lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, + lguest_clock.shift); + lguest_clock.mask = CLOCKSOURCE_MASK(64); + lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS; + } else { + /* To understand this, start at kernel/time/jiffies.c... */ + lguest_clock.shift = 8; + lguest_clock.mult = (((u64)NSEC_PER_SEC<<8)/ACTHZ) << 8; + lguest_clock.mask = CLOCKSOURCE_MASK(32); + } + clocksource_register(&lguest_clock); + + /* We can't set cpumask in the initializer: damn C limitations! */ + lguest_clockevent.cpumask = cpumask_of_cpu(0); + clockevents_register_device(&lguest_clockevent); + + enable_lguest_irq(0); } static void lguest_load_esp0(struct tss_struct *tss, @@ -418,8 +500,7 @@ static __init char *lguest_memory_setup(void) /* We do this here because lockcheck barfs if before start_kernel */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); - e820.nr_map = 0; - add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM); + add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type); return "LGUEST"; } @@ -450,8 +531,13 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len) return insn_len; } -__init void lguest_init(void) +__init void lguest_init(void *boot) { + /* Copy boot parameters first. */ + memcpy(&boot_params, boot, PARAM_SIZE); + memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr), + COMMAND_LINE_SIZE); + paravirt_ops.name = "lguest"; paravirt_ops.paravirt_enabled = 1; paravirt_ops.kernel_rpl = 1; @@ -498,10 +584,8 @@ __init void lguest_init(void) paravirt_ops.time_init = lguest_time_init; paravirt_ops.set_lazy_mode = lguest_lazy_mode; paravirt_ops.wbinvd = lguest_wbinvd; - paravirt_ops.sched_clock = lguest_sched_clock; hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0); - strncpy(boot_command_line, boot->cmdline, COMMAND_LINE_SIZE); /* We use top of mem for initial pagetables. */ init_pg_tables_end = __pa(pg0); @@ -532,13 +616,6 @@ __init void lguest_init(void) add_preferred_console("hvc", 0, NULL); - if (boot->initrd_size) { - /* We stash this at top of memory. */ - INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size; - INITRD_SIZE = boot->initrd_size; - LOADER_TYPE = 0xFF; - } - pm_power_off = lguest_power_off; start_kernel(); } diff --git a/drivers/lguest/lguest_asm.S b/drivers/lguest/lguest_asm.S index 5ac3d20bb18..00046c57b5b 100644 --- a/drivers/lguest/lguest_asm.S +++ b/drivers/lguest/lguest_asm.S @@ -10,7 +10,8 @@ * This is where we begin: we have a magic signature which the launcher looks * for. The plan is that the Linux boot protocol will be extended with a * "platform type" field which will guide us here from the normal entry point, - * but for the moment this suffices. 
+ * but for the moment this suffices. We pass the virtual address of the boot + * info to lguest_init(). * * We put it in .init.text will be discarded after boot. */ @@ -18,6 +19,8 @@ .ascii "GenuineLguest" /* Set up initial stack. */ movl $(init_thread_union+THREAD_SIZE),%esp + movl %esi, %eax + addl $__PAGE_OFFSET, %eax jmp lguest_init /* The templates for inline patching. */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c new file mode 100644 index 00000000000..e90d7a783da --- /dev/null +++ b/drivers/lguest/lguest_user.c @@ -0,0 +1,236 @@ +/* Userspace control of the guest, via /dev/lguest. */ +#include +#include +#include +#include "lg.h" + +static void setup_regs(struct lguest_regs *regs, unsigned long start) +{ + /* Write out stack in format lguest expects, so we can switch to it. */ + regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; + regs->cs = __KERNEL_CS|GUEST_PL; + regs->eflags = 0x202; /* Interrupts enabled. */ + regs->eip = start; + /* esi points to our boot information (physical address 0) */ +} + +/* + addr */ +static long user_get_dma(struct lguest *lg, const u32 __user *input) +{ + unsigned long key, udma, irq; + + if (get_user(key, input) != 0) + return -EFAULT; + udma = get_dma_buffer(lg, key, &irq); + if (!udma) + return -ENOENT; + + /* We put irq number in udma->used_len. */ + lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq); + return udma; +} + +/* To force the Guest to stop running and return to the Launcher, the + * Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest. The + * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ +static int break_guest_out(struct lguest *lg, const u32 __user *input) +{ + unsigned long on; + + /* Fetch whether they're turning break on or off.. */ + if (get_user(on, input) != 0) + return -EFAULT; + + if (on) { + lg->break_out = 1; + /* Pop it out (may be running on different CPU) */ + wake_up_process(lg->tsk); + /* Wait for them to reset it */ + return wait_event_interruptible(lg->break_wq, !lg->break_out); + } else { + lg->break_out = 0; + wake_up(&lg->break_wq); + return 0; + } +} + +/* + irq */ +static int user_send_irq(struct lguest *lg, const u32 __user *input) +{ + u32 irq; + + if (get_user(irq, input) != 0) + return -EFAULT; + if (irq >= LGUEST_IRQS) + return -EINVAL; + set_bit(irq, lg->irqs_pending); + return 0; +} + +static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) +{ + struct lguest *lg = file->private_data; + + if (!lg) + return -EINVAL; + + /* If you're not the task which owns the guest, go away. */ + if (current != lg->tsk) + return -EPERM; + + if (lg->dead) { + size_t len; + + if (IS_ERR(lg->dead)) + return PTR_ERR(lg->dead); + + len = min(size, strlen(lg->dead)+1); + if (copy_to_user(user, lg->dead, len) != 0) + return -EFAULT; + return len; + } + + if (lg->dma_is_pending) + lg->dma_is_pending = 0; + + return run_guest(lg, (unsigned long __user *)user); +} + +/* Take: pfnlimit, pgdir, start, pageoffset. */ +static int initialize(struct file *file, const u32 __user *input) +{ + struct lguest *lg; + int err, i; + u32 args[4]; + + /* We grab the Big Lguest lock, which protects the global array + * "lguests" and multiple simultaneous initializations. 
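+	 * It also serialises DMA binding and Guest teardown: bind_dma(),
+	 * send_dma(), get_dma_buffer() and close() all take it.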
*/ + mutex_lock(&lguest_lock); + + if (file->private_data) { + err = -EBUSY; + goto unlock; + } + + if (copy_from_user(args, input, sizeof(args)) != 0) { + err = -EFAULT; + goto unlock; + } + + i = find_free_guest(); + if (i < 0) { + err = -ENOSPC; + goto unlock; + } + lg = &lguests[i]; + lg->guestid = i; + lg->pfn_limit = args[0]; + lg->page_offset = args[3]; + lg->regs_page = get_zeroed_page(GFP_KERNEL); + if (!lg->regs_page) { + err = -ENOMEM; + goto release_guest; + } + lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs); + + err = init_guest_pagetable(lg, args[1]); + if (err) + goto free_regs; + + setup_regs(lg->regs, args[2]); + setup_guest_gdt(lg); + init_clockdev(lg); + lg->tsk = current; + lg->mm = get_task_mm(lg->tsk); + init_waitqueue_head(&lg->break_wq); + lg->last_pages = NULL; + file->private_data = lg; + + mutex_unlock(&lguest_lock); + + return sizeof(args); + +free_regs: + free_page(lg->regs_page); +release_guest: + memset(lg, 0, sizeof(*lg)); +unlock: + mutex_unlock(&lguest_lock); + return err; +} + +static ssize_t write(struct file *file, const char __user *input, + size_t size, loff_t *off) +{ + struct lguest *lg = file->private_data; + u32 req; + + if (get_user(req, input) != 0) + return -EFAULT; + input += sizeof(req); + + if (req != LHREQ_INITIALIZE && !lg) + return -EINVAL; + if (lg && lg->dead) + return -ENOENT; + + /* If you're not the task which owns the Guest, you can only break */ + if (lg && current != lg->tsk && req != LHREQ_BREAK) + return -EPERM; + + switch (req) { + case LHREQ_INITIALIZE: + return initialize(file, (const u32 __user *)input); + case LHREQ_GETDMA: + return user_get_dma(lg, (const u32 __user *)input); + case LHREQ_IRQ: + return user_send_irq(lg, (const u32 __user *)input); + case LHREQ_BREAK: + return break_guest_out(lg, (const u32 __user *)input); + default: + return -EINVAL; + } +} + +static int close(struct inode *inode, struct file *file) +{ + struct lguest *lg = file->private_data; + + if (!lg) + return 0; + + mutex_lock(&lguest_lock); + /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ + hrtimer_cancel(&lg->hrt); + release_all_dma(lg); + free_guest_pagetable(lg); + mmput(lg->mm); + if (!IS_ERR(lg->dead)) + kfree(lg->dead); + free_page(lg->regs_page); + memset(lg, 0, sizeof(*lg)); + mutex_unlock(&lguest_lock); + return 0; +} + +static struct file_operations lguest_fops = { + .owner = THIS_MODULE, + .release = close, + .write = write, + .read = read, +}; +static struct miscdevice lguest_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "lguest", + .fops = &lguest_fops, +}; + +int __init lguest_device_init(void) +{ + return misc_register(&lguest_dev); +} + +void __exit lguest_device_remove(void) +{ + misc_deregister(&lguest_dev); +} diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c new file mode 100644 index 00000000000..1b0ba09b126 --- /dev/null +++ b/drivers/lguest/page_tables.c @@ -0,0 +1,411 @@ +/* Shadow page table operations. + * Copyright (C) Rusty Russell IBM Corporation 2006. 
+ * GPL v2 and any later version */ +#include +#include +#include +#include +#include +#include +#include "lg.h" + +#define PTES_PER_PAGE_SHIFT 10 +#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT) +#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1) + +static DEFINE_PER_CPU(spte_t *, switcher_pte_pages); +#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) + +static unsigned vaddr_to_pgd_index(unsigned long vaddr) +{ + return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); +} + +/* These access the shadow versions (ie. the ones used by the CPU). */ +static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) +{ + unsigned int index = vaddr_to_pgd_index(vaddr); + + if (index >= SWITCHER_PGD_INDEX) { + kill_guest(lg, "attempt to access switcher pages"); + index = 0; + } + return &lg->pgdirs[i].pgdir[index]; +} + +static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr) +{ + spte_t *page = __va(spgd.pfn << PAGE_SHIFT); + BUG_ON(!(spgd.flags & _PAGE_PRESENT)); + return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; +} + +/* These access the guest versions. */ +static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) +{ + unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); + return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t); +} + +static unsigned long gpte_addr(struct lguest *lg, + gpgd_t gpgd, unsigned long vaddr) +{ + unsigned long gpage = gpgd.pfn << PAGE_SHIFT; + BUG_ON(!(gpgd.flags & _PAGE_PRESENT)); + return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); +} + +/* Do a virtual -> physical mapping on a user page. */ +static unsigned long get_pfn(unsigned long virtpfn, int write) +{ + struct page *page; + unsigned long ret = -1UL; + + down_read(¤t->mm->mmap_sem); + if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT, + 1, write, 1, &page, NULL) == 1) + ret = page_to_pfn(page); + up_read(¤t->mm->mmap_sem); + return ret; +} + +static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) +{ + spte_t spte; + unsigned long pfn; + + /* We ignore the global flag. */ + spte.flags = (gpte.flags & ~_PAGE_GLOBAL); + pfn = get_pfn(gpte.pfn, write); + if (pfn == -1UL) { + kill_guest(lg, "failed to get page %u", gpte.pfn); + /* Must not put_page() bogus page on cleanup. */ + spte.flags = 0; + } + spte.pfn = pfn; + return spte; +} + +static void release_pte(spte_t pte) +{ + if (pte.flags & _PAGE_PRESENT) + put_page(pfn_to_page(pte.pfn)); +} + +static void check_gpte(struct lguest *lg, gpte_t gpte) +{ + if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit) + kill_guest(lg, "bad page table entry"); +} + +static void check_gpgd(struct lguest *lg, gpgd_t gpgd) +{ + if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit) + kill_guest(lg, "bad page directory entry"); +} + +/* FIXME: We hold reference to pages, which prevents them from being + swapped. It'd be nice to have a callback when Linux wants to swap out. */ + +/* We fault pages in, which allows us to update accessed/dirty bits. + * Return true if we got page. */ +int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) +{ + gpgd_t gpgd; + spgd_t *spgd; + unsigned long gpte_ptr; + gpte_t gpte; + spte_t *spte; + + gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); + if (!(gpgd.flags & _PAGE_PRESENT)) + return 0; + + spgd = spgd_addr(lg, lg->pgdidx, vaddr); + if (!(spgd->flags & _PAGE_PRESENT)) { + /* Get a page of PTEs for them. 
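+		 * (the shadow top-level entry was empty: allocate a zeroed
+		 * page of shadow PTEs and point the shadow PGD at it, copying
+		 * the Guest's flags)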
*/ + unsigned long ptepage = get_zeroed_page(GFP_KERNEL); + /* FIXME: Steal from self in this case? */ + if (!ptepage) { + kill_guest(lg, "out of memory allocating pte page"); + return 0; + } + check_gpgd(lg, gpgd); + spgd->raw.val = (__pa(ptepage) | gpgd.flags); + } + + gpte_ptr = gpte_addr(lg, gpgd, vaddr); + gpte = mkgpte(lgread_u32(lg, gpte_ptr)); + + /* No page? */ + if (!(gpte.flags & _PAGE_PRESENT)) + return 0; + + /* Write to read-only page? */ + if ((errcode & 2) && !(gpte.flags & _PAGE_RW)) + return 0; + + /* User access to a non-user page? */ + if ((errcode & 4) && !(gpte.flags & _PAGE_USER)) + return 0; + + check_gpte(lg, gpte); + gpte.flags |= _PAGE_ACCESSED; + if (errcode & 2) + gpte.flags |= _PAGE_DIRTY; + + /* We're done with the old pte. */ + spte = spte_addr(lg, *spgd, vaddr); + release_pte(*spte); + + /* We don't make it writable if this isn't a write: later + * write will fault so we can set dirty bit in guest. */ + if (gpte.flags & _PAGE_DIRTY) + *spte = gpte_to_spte(lg, gpte, 1); + else { + gpte_t ro_gpte = gpte; + ro_gpte.flags &= ~_PAGE_RW; + *spte = gpte_to_spte(lg, ro_gpte, 0); + } + + /* Now we update dirty/accessed on guest. */ + lgwrite_u32(lg, gpte_ptr, gpte.raw.val); + return 1; +} + +/* This is much faster than the full demand_page logic. */ +static int page_writable(struct lguest *lg, unsigned long vaddr) +{ + spgd_t *spgd; + unsigned long flags; + + spgd = spgd_addr(lg, lg->pgdidx, vaddr); + if (!(spgd->flags & _PAGE_PRESENT)) + return 0; + + flags = spte_addr(lg, *spgd, vaddr)->flags; + return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); +} + +void pin_page(struct lguest *lg, unsigned long vaddr) +{ + if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2)) + kill_guest(lg, "bad stack page %#lx", vaddr); +} + +static void release_pgd(struct lguest *lg, spgd_t *spgd) +{ + if (spgd->flags & _PAGE_PRESENT) { + unsigned int i; + spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT); + for (i = 0; i < PTES_PER_PAGE; i++) + release_pte(ptepage[i]); + free_page((long)ptepage); + spgd->raw.val = 0; + } +} + +static void flush_user_mappings(struct lguest *lg, int idx) +{ + unsigned int i; + for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++) + release_pgd(lg, lg->pgdirs[idx].pgdir + i); +} + +void guest_pagetable_flush_user(struct lguest *lg) +{ + flush_user_mappings(lg, lg->pgdidx); +} + +static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) +{ + unsigned int i; + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + if (lg->pgdirs[i].cr3 == pgtable) + break; + return i; +} + +static unsigned int new_pgdir(struct lguest *lg, + unsigned long cr3, + int *blank_pgdir) +{ + unsigned int next; + + next = random32() % ARRAY_SIZE(lg->pgdirs); + if (!lg->pgdirs[next].pgdir) { + lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL); + if (!lg->pgdirs[next].pgdir) + next = lg->pgdidx; + else + /* There are no mappings: you'll need to re-pin */ + *blank_pgdir = 1; + } + lg->pgdirs[next].cr3 = cr3; + /* Release all the non-kernel mappings. 
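+	 * (this slot may be recycled from a previous Guest cr3, so any user
+	 * mappings it still holds are stale; kernel mappings above
+	 * page_offset are updated in every pgdir by guest_set_pte() and can
+	 * stay)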
*/ + flush_user_mappings(lg, next); + + return next; +} + +void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) +{ + int newpgdir, repin = 0; + + newpgdir = find_pgdir(lg, pgtable); + if (newpgdir == ARRAY_SIZE(lg->pgdirs)) + newpgdir = new_pgdir(lg, pgtable, &repin); + lg->pgdidx = newpgdir; + if (repin) + pin_stack_pages(lg); +} + +static void release_all_pagetables(struct lguest *lg) +{ + unsigned int i, j; + + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + if (lg->pgdirs[i].pgdir) + for (j = 0; j < SWITCHER_PGD_INDEX; j++) + release_pgd(lg, lg->pgdirs[i].pgdir + j); +} + +void guest_pagetable_clear_all(struct lguest *lg) +{ + release_all_pagetables(lg); + pin_stack_pages(lg); +} + +static void do_set_pte(struct lguest *lg, int idx, + unsigned long vaddr, gpte_t gpte) +{ + spgd_t *spgd = spgd_addr(lg, idx, vaddr); + if (spgd->flags & _PAGE_PRESENT) { + spte_t *spte = spte_addr(lg, *spgd, vaddr); + release_pte(*spte); + if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) { + check_gpte(lg, gpte); + *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY); + } else + spte->raw.val = 0; + } +} + +void guest_set_pte(struct lguest *lg, + unsigned long cr3, unsigned long vaddr, gpte_t gpte) +{ + /* Kernel mappings must be changed on all top levels. */ + if (vaddr >= lg->page_offset) { + unsigned int i; + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + if (lg->pgdirs[i].pgdir) + do_set_pte(lg, i, vaddr, gpte); + } else { + int pgdir = find_pgdir(lg, cr3); + if (pgdir != ARRAY_SIZE(lg->pgdirs)) + do_set_pte(lg, pgdir, vaddr, gpte); + } +} + +void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) +{ + int pgdir; + + if (idx >= SWITCHER_PGD_INDEX) + return; + + pgdir = find_pgdir(lg, cr3); + if (pgdir < ARRAY_SIZE(lg->pgdirs)) + release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); +} + +int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) +{ + /* We assume this in flush_user_mappings, so check now */ + if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) + return -EINVAL; + lg->pgdidx = 0; + lg->pgdirs[lg->pgdidx].cr3 = pgtable; + lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL); + if (!lg->pgdirs[lg->pgdidx].pgdir) + return -ENOMEM; + return 0; +} + +void free_guest_pagetable(struct lguest *lg) +{ + unsigned int i; + + release_all_pagetables(lg); + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + free_page((long)lg->pgdirs[i].pgdir); +} + +/* Caller must be preempt-safe */ +void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) +{ + spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); + spgd_t switcher_pgd; + spte_t regs_pte; + + /* Since switcher less that 4MB, we simply mug top pte page. */ + switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; + switcher_pgd.flags = _PAGE_KERNEL; + lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; + + /* Map our regs page over stack page. 
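+	 * (struct lguest_pages puts the register save area at the top of the
+	 * Switcher's per-cpu stack page, so the Guest's regs_page is mapped
+	 * rw at exactly that slot)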
*/ + regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; + regs_pte.flags = _PAGE_KERNEL; + switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE] + = regs_pte; +} + +static void free_switcher_pte_pages(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + free_page((long)switcher_pte_page(i)); +} + +static __init void populate_switcher_pte_page(unsigned int cpu, + struct page *switcher_page[], + unsigned int pages) +{ + unsigned int i; + spte_t *pte = switcher_pte_page(cpu); + + for (i = 0; i < pages; i++) { + pte[i].pfn = page_to_pfn(switcher_page[i]); + pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; + } + + /* We only map this CPU's pages, so guest can't see others. */ + i = pages + cpu*2; + + /* First page (regs) is rw, second (state) is ro. */ + pte[i].pfn = page_to_pfn(switcher_page[i]); + pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; + pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); + pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; +} + +__init int init_pagetables(struct page **switcher_page, unsigned int pages) +{ + unsigned int i; + + for_each_possible_cpu(i) { + switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL); + if (!switcher_pte_page(i)) { + free_switcher_pte_pages(); + return -ENOMEM; + } + populate_switcher_pte_page(i, switcher_page, pages); + } + return 0; +} + +void free_pagetables(void) +{ + free_switcher_pte_pages(); +} diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c new file mode 100644 index 00000000000..1b2cfe89dcd --- /dev/null +++ b/drivers/lguest/segments.c @@ -0,0 +1,125 @@ +#include "lg.h" + +static int desc_ok(const struct desc_struct *gdt) +{ + /* MBZ=0, P=1, DT=1 */ + return ((gdt->b & 0x00209000) == 0x00009000); +} + +static int segment_present(const struct desc_struct *gdt) +{ + return gdt->b & 0x8000; +} + +static int ignored_gdt(unsigned int num) +{ + return (num == GDT_ENTRY_TSS + || num == GDT_ENTRY_LGUEST_CS + || num == GDT_ENTRY_LGUEST_DS + || num == GDT_ENTRY_DOUBLEFAULT_TSS); +} + +/* We don't allow removal of CS, DS or SS; it doesn't make sense. */ +static void check_segment_use(struct lguest *lg, unsigned int desc) +{ + if (lg->regs->gs / 8 == desc) + lg->regs->gs = 0; + if (lg->regs->fs / 8 == desc) + lg->regs->fs = 0; + if (lg->regs->es / 8 == desc) + lg->regs->es = 0; + if (lg->regs->ds / 8 == desc + || lg->regs->cs / 8 == desc + || lg->regs->ss / 8 == desc) + kill_guest(lg, "Removed live GDT entry %u", desc); +} + +static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) +{ + unsigned int i; + + for (i = start; i < end; i++) { + /* We never copy these ones to real gdt */ + if (ignored_gdt(i)) + continue; + + /* We could fault in switch_to_guest if they are using + * a removed segment. */ + if (!segment_present(&lg->gdt[i])) { + check_segment_use(lg, i); + continue; + } + + if (!desc_ok(&lg->gdt[i])) + kill_guest(lg, "Bad GDT descriptor %i", i); + + /* DPL 0 presumably means "for use by guest". */ + if ((lg->gdt[i].b & 0x00006000) == 0) + lg->gdt[i].b |= (GUEST_PL << 13); + + /* Set accessed bit, since gdt isn't writable. */ + lg->gdt[i].b |= 0x00000100; + } +} + +void setup_default_gdt_entries(struct lguest_ro_state *state) +{ + struct desc_struct *gdt = state->guest_gdt; + unsigned long tss = (unsigned long)&state->guest_tss; + + /* Hypervisor segments. */ + gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; + gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; + + /* This is the one which we *cannot* copy from guest, since tss + is depended on this lguest_ro_state, ie. this cpu. 
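+	 * The two words below hand-build a 32-bit available TSS descriptor
+	 * with base &state->guest_tss and limit 0x67, the minimum a hardware
+	 * TSS needs.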
*/ + gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); + gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) + | ((tss >> 16) & 0x000000FF); +} + +void setup_guest_gdt(struct lguest *lg) +{ + lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; + lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; + lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); + lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); +} + +/* This is a fast version for the common case where only the three TLS entries + * have changed. */ +void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) +{ + unsigned int i; + + for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) + gdt[i] = lg->gdt[i]; +} + +void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) +{ + unsigned int i; + + for (i = 0; i < GDT_ENTRIES; i++) + if (!ignored_gdt(i)) + gdt[i] = lg->gdt[i]; +} + +void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) +{ + if (num > ARRAY_SIZE(lg->gdt)) + kill_guest(lg, "too many gdt entries %i", num); + + lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0])); + fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt)); + lg->changed |= CHANGED_GDT; +} + +void guest_load_tls(struct lguest *lg, unsigned long gtls) +{ + struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN]; + + lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); + fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); + lg->changed |= CHANGED_GDT_TLS; +} diff --git a/drivers/lguest/switcher.S b/drivers/lguest/switcher.S new file mode 100644 index 00000000000..eadd4cc299d --- /dev/null +++ b/drivers/lguest/switcher.S @@ -0,0 +1,159 @@ +/* This code sits at 0xFFC00000 to do the low-level guest<->host switch. + + There is are two pages above us for this CPU (struct lguest_pages). + The second page (struct lguest_ro_state) becomes read-only after the + context switch. The first page (the stack for traps) remains writable, + but while we're in here, the guest cannot be running. +*/ +#include +#include +#include "lg.h" + +.text +ENTRY(start_switcher_text) + +/* %eax points to lguest pages for this CPU. %ebx contains cr3 value. + All normal registers can be clobbered! */ +ENTRY(switch_to_guest) + /* Save host segments on host stack. */ + pushl %es + pushl %ds + pushl %gs + pushl %fs + /* With CONFIG_FRAME_POINTER, gcc doesn't let us clobber this! */ + pushl %ebp + /* Save host stack. */ + movl %esp, LGUEST_PAGES_host_sp(%eax) + /* Switch to guest stack: if we get NMI we expect to be there. */ + movl %eax, %edx + addl $LGUEST_PAGES_regs, %edx + movl %edx, %esp + /* Switch to guest's GDT, IDT. */ + lgdt LGUEST_PAGES_guest_gdt_desc(%eax) + lidt LGUEST_PAGES_guest_idt_desc(%eax) + /* Switch to guest's TSS while GDT still writable. */ + movl $(GDT_ENTRY_TSS*8), %edx + ltr %dx + /* Set host's TSS GDT entry to available (clear byte 5 bit 2). */ + movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx + andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) + /* Switch to guest page tables: lguest_pages->state now read-only. */ + movl %ebx, %cr3 + /* Restore guest regs */ + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %gs + popl %eax + popl %fs + popl %ds + popl %es + /* Skip error code and trap number */ + addl $8, %esp + iret + +#define SWITCH_TO_HOST \ + /* Save guest state */ \ + pushl %es; \ + pushl %ds; \ + pushl %fs; \ + pushl %eax; \ + pushl %gs; \ + pushl %ebp; \ + pushl %edi; \ + pushl %esi; \ + pushl %edx; \ + pushl %ecx; \ + pushl %ebx; \ + /* Load lguest ds segment for convenience. 
*/ \ + movl $(LGUEST_DS), %eax; \ + movl %eax, %ds; \ + /* Figure out where we are, based on stack (at top of regs). */ \ + movl %esp, %eax; \ + subl $LGUEST_PAGES_regs, %eax; \ + /* Put trap number in %ebx before we switch cr3 and lose it. */ \ + movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ + /* Switch to host page tables (host GDT, IDT and stack are in host \ + mem, so need this first) */ \ + movl LGUEST_PAGES_host_cr3(%eax), %edx; \ + movl %edx, %cr3; \ + /* Set guest's TSS to available (clear byte 5 bit 2). */ \ + andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \ + /* Switch to host's GDT & IDT. */ \ + lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ + lidt LGUEST_PAGES_host_idt_desc(%eax); \ + /* Switch to host's stack. */ \ + movl LGUEST_PAGES_host_sp(%eax), %esp; \ + /* Switch to host's TSS */ \ + movl $(GDT_ENTRY_TSS*8), %edx; \ + ltr %dx; \ + popl %ebp; \ + popl %fs; \ + popl %gs; \ + popl %ds; \ + popl %es + +/* Return to run_guest_once. */ +return_to_host: + SWITCH_TO_HOST + iret + +deliver_to_host: + SWITCH_TO_HOST + /* Decode IDT and jump to hosts' irq handler. When that does iret, it + * will return to run_guest_once. This is a feature. */ + movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx + leal (%edx,%ebx,8), %eax + movzwl (%eax),%edx + movl 4(%eax), %eax + xorw %ax, %ax + orl %eax, %edx + jmp *%edx + +/* Real hardware interrupts are delivered straight to the host. Others + cause us to return to run_guest_once so it can decide what to do. Note + that some of these are overridden by the guest to deliver directly, and + never enter here (see load_guest_idt_entry). */ +.macro IRQ_STUB N TARGET + .data; .long 1f; .text; 1: + /* Make an error number for most traps, which don't have one. */ + .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) + pushl $0 + .endif + pushl $\N + jmp \TARGET + ALIGN +.endm + +.macro IRQ_STUBS FIRST LAST TARGET + irq=\FIRST + .rept \LAST-\FIRST+1 + IRQ_STUB irq \TARGET + irq=irq+1 + .endr +.endm + +/* We intercept every interrupt, because we may need to switch back to + * host. Unfortunately we can't tell them apart except by entry + * point, so we need 256 entry points. + */ +.data +.global default_idt_entries +default_idt_entries: +.text + IRQ_STUBS 0 1 return_to_host /* First two traps */ + IRQ_STUB 2 handle_nmi /* NMI */ + IRQ_STUBS 3 31 return_to_host /* Rest of traps */ + IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */ + IRQ_STUB 128 return_to_host /* System call (overridden) */ + IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */ + +/* We ignore NMI and return. 
*/ +handle_nmi: + addl $8, %esp + iret + +ENTRY(end_switcher_text) diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h index 62c091ffccc..a4d806610b7 100644 --- a/include/asm-i386/tsc.h +++ b/include/asm-i386/tsc.h @@ -63,6 +63,7 @@ extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern void init_tsc_clocksource(void); +int check_tsc_unstable(void); /* * Boot-time check whether the TSCs are synchronized across diff --git a/include/linux/lguest.h b/include/linux/lguest.h index f30c04fc22b..500aace21ca 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -3,11 +3,6 @@ #ifndef _ASM_LGUEST_H #define _ASM_LGUEST_H -/* These are randomly chosen numbers which indicate we're an lguest at boot */ -#define LGUEST_MAGIC_EBP 0x4C687970 -#define LGUEST_MAGIC_EDI 0x652D4D65 -#define LGUEST_MAGIC_ESI 0xFFFFFFFF - #ifndef __ASSEMBLY__ #include @@ -20,7 +15,7 @@ #define LHCALL_LOAD_IDT_ENTRY 6 #define LHCALL_SET_STACK 7 #define LHCALL_TS 8 -#define LHCALL_TIMER_READ 9 +#define LHCALL_SET_CLOCKEVENT 9 #define LHCALL_HALT 10 #define LHCALL_GET_WALLCLOCK 11 #define LHCALL_BIND_DMA 12 @@ -29,6 +24,9 @@ #define LHCALL_SET_PMD 15 #define LHCALL_LOAD_TLS 16 +#define LG_CLOCK_MIN_DELTA 100UL +#define LG_CLOCK_MAX_DELTA ULONG_MAX + #define LGUEST_TRAP_ENTRY 0x1F static inline unsigned long @@ -75,6 +73,8 @@ struct lguest_data unsigned long reserve_mem; /* ID of this guest (used by network driver to set ethernet address) */ u16 guestid; + /* KHz for the TSC clock. */ + u32 tsc_khz; /* Fields initialized by the guest at boot: */ /* Instruction range to suppress interrupts even if enabled */ diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h new file mode 100644 index 00000000000..0ba414a40c8 --- /dev/null +++ b/include/linux/lguest_launcher.h @@ -0,0 +1,73 @@ +#ifndef _ASM_LGUEST_USER +#define _ASM_LGUEST_USER +/* Everything the "lguest" userspace program needs to know. */ +/* They can register up to 32 arrays of lguest_dma. */ +#define LGUEST_MAX_DMA 32 +/* At most we can dma 16 lguest_dma in one op. */ +#define LGUEST_MAX_DMA_SECTIONS 16 + +/* How many devices? Assume each one wants up to two dma arrays per device. */ +#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2) + +struct lguest_dma +{ + /* 0 if free to be used, filled by hypervisor. */ + u32 used_len; + unsigned long addr[LGUEST_MAX_DMA_SECTIONS]; + u16 len[LGUEST_MAX_DMA_SECTIONS]; +}; + +struct lguest_block_page +{ + /* 0 is a read, 1 is a write. */ + int type; + u32 sector; /* Offset in device = sector * 512. */ + u32 bytes; /* Length expected to be read/written in bytes */ + /* 0 = pending, 1 = done, 2 = done, error */ + int result; + u32 num_sectors; /* Disk length = num_sectors * 512 */ +}; + +/* There is a shared page of these. */ +struct lguest_net +{ + /* Simply the mac address (with multicast bit meaning promisc). */ + unsigned char mac[6]; +}; + +/* Where the Host expects the Guest to SEND_DMA console output to. */ +#define LGUEST_CONSOLE_DMA_KEY 0 + +/* We have a page of these descriptors in the lguest_device page. */ +struct lguest_device_desc { + u16 type; +#define LGUEST_DEVICE_T_CONSOLE 1 +#define LGUEST_DEVICE_T_NET 2 +#define LGUEST_DEVICE_T_BLOCK 3 + + u16 features; +#define LGUEST_NET_F_NOCSUM 0x4000 /* Don't bother checksumming */ +#define LGUEST_DEVICE_F_RANDOMNESS 0x8000 /* IRQ is fairly random */ + + u16 status; +/* 256 and above are device specific. */ +#define LGUEST_DEVICE_S_ACKNOWLEDGE 1 /* We have seen device. 
*/ +#define LGUEST_DEVICE_S_DRIVER 2 /* We have found a driver */ +#define LGUEST_DEVICE_S_DRIVER_OK 4 /* Driver says OK! */ +#define LGUEST_DEVICE_S_REMOVED 8 /* Device has gone away. */ +#define LGUEST_DEVICE_S_REMOVED_ACK 16 /* Driver has been told. */ +#define LGUEST_DEVICE_S_FAILED 128 /* Something actually failed */ + + u16 num_pages; + u32 pfn; +}; + +/* Write command first word is a request. */ +enum lguest_req +{ + LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */ + LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */ + LHREQ_IRQ, /* + irq */ + LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ +}; +#endif /* _ASM_LGUEST_USER */ diff --git a/kernel/fork.c b/kernel/fork.c index e7a2d995b08..46983899822 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -127,7 +127,6 @@ void __put_task_struct(struct task_struct *tsk) if (!profile_handoff_task(tsk)) free_task(tsk); } -EXPORT_SYMBOL_GPL(__put_task_struct); void __init fork_init(unsigned long mempages) { -- cgit v1.2.3-70-g09d2 From ed2c12f323e8fafbc94f9bcfb924f9df36e64dc7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Jul 2007 01:50:35 -0700 Subject: kernel/sysctl.c: finish off the warning comments I've been chasing these comments around this file all week. Hopefully we're straight now. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e69179b1809..222299844ad 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -748,7 +748,10 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, - +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ { .ctl_name = 0 } }; -- cgit v1.2.3-70-g09d2 From 9439aab8dbc33c2c03c3a19dba267360383ba38c Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 19 Jul 2007 21:28:35 +0200 Subject: [PATCH] sched: fix newly idle load balance in case of SMT In the presence of SMT, newly idle balance was never happening for multi-core and SMP domains (even when both the logical siblings are idle). If thread 0 is already idle and when thread 1 is about to go to idle, newly idle load balance always think that one of the threads is not idle and skips doing the newly idle load balance for multi-core and SMP domains. This is because of the idle_cpu() macro, which checks if the current process on a cpu is an idle process. But this is not the case for the thread doing the load_balance_newidle(). Fix this by using runqueue's nr_running field instead of idle_cpu(). And also skip the logic of 'only one idle cpu in the group will be doing load balancing' during newly idle case. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 645256b228c..e36d99d1ddb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2235,7 +2235,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, rq = cpu_rq(i); - if (*sd_idle && !idle_cpu(i)) + if (*sd_idle && rq->nr_running) *sd_idle = 0; /* Bias balancing toward cpus of our domain */ @@ -2257,9 +2257,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* * First idle cpu or the first cpu(busiest) in this sched group * is eligible for doing load balancing at this and above - * domains. + * domains. 
In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. */ - if (local_group && balance_cpu != this_cpu && balance) { + if (idle != CPU_NEWLY_IDLE && local_group && + balance_cpu != this_cpu && balance) { *balance = 0; goto ret; } -- cgit v1.2.3-70-g09d2 From 969bb4e4032dac67287951d8f6642a3b5119694e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 19 Jul 2007 21:28:35 +0200 Subject: [PATCH] sched: fix the all pinned logic in load_balance_newidle() nr_moved is not the correct check for triggering all pinned logic. Fix the all pinned logic in the case of load_balance_newidle(). Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e36d99d1ddb..a35a92ff38f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2679,6 +2679,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) unsigned long imbalance; int nr_moved = 0; int sd_idle = 0; + int all_pinned = 0; cpumask_t cpus = CPU_MASK_ALL; /* @@ -2717,10 +2718,11 @@ redo: double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, minus_1_or_zero(busiest->nr_running), - imbalance, sd, CPU_NEWLY_IDLE, NULL); + imbalance, sd, CPU_NEWLY_IDLE, + &all_pinned); spin_unlock(&busiest->lock); - if (!nr_moved) { + if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), cpus); if (!cpus_empty(cpus)) goto redo; -- cgit v1.2.3-70-g09d2 From e436d80085133858bf2613a630365e8a0459fd58 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 Jul 2007 21:28:35 +0200 Subject: [PATCH] sched: implement cpu_clock(cpu) high-speed time source Implement the cpu_clock(cpu) interface for kernel-internal use: high-speed (but slightly incorrect) per-cpu clock constructed from sched_clock(). This API, unused at the moment, will be used in the future by blktrace, by the softlockup-watchdog, by printk and by lockstat. 
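As a rough usage sketch (not part of this change; do_work() is a hypothetical
placeholder), a kernel-internal caller would read the clock of the CPU it is
interested in and treat the result as nanoseconds that may drift slightly
between CPUs, since the value is derived from sched_clock():

	int cpu = raw_smp_processor_id();
	unsigned long long t0, t1;

	t0 = cpu_clock(cpu);
	do_work();		/* hypothetical section being timed */
	t1 = cpu_clock(cpu);
	printk(KERN_DEBUG "section took %llu ns\n", t1 - t0);
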
Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 +++++++ kernel/sched.c | 17 +++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 94f624aef01..33b9b4841ee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1348,6 +1348,13 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) #endif extern unsigned long long sched_clock(void); + +/* + * For kernel-internal use: high-speed (but slightly incorrect) per-cpu + * clock constructed from sched_clock(): + */ +extern unsigned long long cpu_clock(int cpu); + extern unsigned long long task_sched_runtime(struct task_struct *task); diff --git a/kernel/sched.c b/kernel/sched.c index a35a92ff38f..93cf241cfbe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -379,6 +379,23 @@ static inline unsigned long long rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +/* + * For kernel-internal use: high-speed (but slightly incorrect) per-cpu + * clock constructed from sched_clock(): + */ +unsigned long long cpu_clock(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long long now; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + now = rq_clock(rq); + spin_unlock_irqrestore(&rq->lock, flags); + + return now; +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* Change a task's ->cfs_rq if it moves across CPUs */ static inline void set_task_cfs_rq(struct task_struct *p) -- cgit v1.2.3-70-g09d2 From 20c2df83d25c6a95affe6157a4c9cac4cf5ffaac Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 20 Jul 2007 10:11:58 +0900 Subject: mm: Remove slab destructors from kmem_cache_create(). Slab destructors were no longer supported after Christoph's c59def9f222d44bb7e2f0a559f2906191a0862d7 change. They've been BUGs for both slab and slub, and slob never supported them either. This rips out support for the dtor pointer from kmem_cache_create() completely and fixes up every single callsite in the kernel (there were about 224, not including the slab allocator definitions themselves, or the documentation references). 
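For a call site the conversion is mechanical: the trailing destructor argument
(which was NULL at essentially every caller) is dropped, so kmem_cache_create()
now ends with the constructor. A hypothetical example ("foo_cache", struct foo
and foo_ctor are illustrative only, not taken from the patch):

	/* before: name, size, align, flags, ctor, dtor */
	cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				   0, SLAB_HWCACHE_ALIGN, foo_ctor, NULL);

	/* after: the dtor argument is gone */
	cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				   0, SLAB_HWCACHE_ALIGN, foo_ctor);
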
Signed-off-by: Paul Mundt --- arch/arm/plat-s3c24xx/dma.c | 2 +- arch/arm26/mm/memc.c | 4 ++-- arch/i386/mm/init.c | 3 +-- arch/ia64/ia32/ia32_support.c | 2 +- arch/powerpc/kernel/rtas_flash.c | 2 +- arch/powerpc/mm/hugetlbpage.c | 2 +- arch/powerpc/mm/init_64.c | 3 +-- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- arch/sh/kernel/cpu/sh4/sq.c | 3 +-- arch/sh/mm/pmb.c | 2 +- arch/sparc64/mm/tsb.c | 3 +-- block/bsg.c | 2 +- block/ll_rw_blk.c | 6 +++--- drivers/acpi/osl.c | 2 +- drivers/block/aoe/aoeblk.c | 4 ++-- drivers/ieee1394/eth1394.c | 2 +- drivers/infiniband/core/mad.c | 1 - drivers/infiniband/hw/amso1100/c2_vq.c | 2 +- drivers/infiniband/hw/ehca/ehca_av.c | 2 +- drivers/infiniband/hw/ehca/ehca_cq.c | 2 +- drivers/infiniband/hw/ehca/ehca_main.c | 2 +- drivers/infiniband/hw/ehca/ehca_mrmw.c | 4 ++-- drivers/infiniband/hw/ehca/ehca_pd.c | 2 +- drivers/infiniband/hw/ehca/ehca_qp.c | 2 +- drivers/infiniband/ulp/iser/iscsi_iser.c | 2 +- drivers/kvm/mmu.c | 8 ++++---- drivers/md/raid5.c | 4 ++-- drivers/message/i2o/i2o_block.c | 3 +-- drivers/mtd/ubi/eba.c | 2 +- drivers/mtd/ubi/wl.c | 2 +- drivers/s390/block/dasd_devmap.c | 2 +- drivers/s390/scsi/zfcp_aux.c | 6 +++--- drivers/scsi/aic94xx/aic94xx_init.c | 4 ++-- drivers/scsi/libsas/sas_init.c | 2 +- drivers/scsi/qla2xxx/qla_os.c | 2 +- drivers/scsi/qla4xxx/ql4_os.c | 2 +- drivers/scsi/scsi.c | 2 +- drivers/scsi/scsi_lib.c | 4 ++-- drivers/scsi/scsi_tgt_lib.c | 2 +- drivers/usb/host/uhci-hcd.c | 2 +- drivers/usb/mon/mon_text.c | 2 +- fs/adfs/super.c | 4 ++-- fs/affs/super.c | 2 +- fs/afs/super.c | 3 +-- fs/befs/linuxvfs.c | 4 ++-- fs/bfs/inode.c | 4 ++-- fs/bio.c | 2 +- fs/block_dev.c | 2 +- fs/cifs/cifsfs.c | 10 +++++----- fs/coda/inode.c | 4 ++-- fs/configfs/mount.c | 2 +- fs/dcache.c | 4 ++-- fs/dcookies.c | 2 +- fs/dlm/lowcomms.c | 2 +- fs/dlm/memory.c | 2 +- fs/dnotify.c | 2 +- fs/dquot.c | 4 ++-- fs/ecryptfs/main.c | 2 +- fs/efs/super.c | 4 ++-- fs/eventpoll.c | 4 ++-- fs/ext2/super.c | 4 ++-- fs/ext3/super.c | 2 +- fs/ext4/super.c | 2 +- fs/fat/cache.c | 2 +- fs/fat/inode.c | 2 +- fs/fcntl.c | 2 +- fs/freevxfs/vxfs_super.c | 4 ++-- fs/fuse/dev.c | 2 +- fs/fuse/inode.c | 2 +- fs/gfs2/main.c | 6 +++--- fs/hfs/super.c | 2 +- fs/hfsplus/super.c | 2 +- fs/hpfs/super.c | 4 ++-- fs/hugetlbfs/inode.c | 2 +- fs/inode.c | 3 +-- fs/inotify_user.c | 4 ++-- fs/isofs/inode.c | 2 +- fs/jbd/journal.c | 8 +++----- fs/jbd/revoke.c | 4 ++-- fs/jbd2/journal.c | 8 +++----- fs/jbd2/revoke.c | 4 ++-- fs/jffs2/malloc.c | 18 +++++++++--------- fs/jffs2/super.c | 2 +- fs/jfs/jfs_metapage.c | 2 +- fs/jfs/super.c | 2 +- fs/locks.c | 2 +- fs/mbcache.c | 2 +- fs/minix/inode.c | 4 ++-- fs/namespace.c | 2 +- fs/ncpfs/inode.c | 4 ++-- fs/nfs/direct.c | 2 +- fs/nfs/inode.c | 4 ++-- fs/nfs/pagelist.c | 2 +- fs/nfs/read.c | 2 +- fs/nfs/write.c | 2 +- fs/nfsd/nfs4state.c | 8 ++++---- fs/ntfs/super.c | 10 +++++----- fs/ocfs2/dlm/dlmfs.c | 2 +- fs/ocfs2/dlm/dlmmaster.c | 2 +- fs/ocfs2/super.c | 2 +- fs/ocfs2/uptodate.c | 2 +- fs/openpromfs/inode.c | 2 +- fs/proc/inode.c | 4 ++-- fs/qnx4/inode.c | 2 +- fs/reiserfs/super.c | 2 +- fs/romfs/inode.c | 4 ++-- fs/smbfs/inode.c | 4 ++-- fs/smbfs/request.c | 2 +- fs/sysfs/mount.c | 2 +- fs/sysv/inode.c | 2 +- fs/udf/super.c | 2 +- fs/ufs/super.c | 4 ++-- fs/xfs/linux-2.6/kmem.h | 4 ++-- include/linux/i2o.h | 3 +-- include/linux/slab.h | 3 +-- ipc/mqueue.c | 2 +- kernel/fork.c | 18 +++++++++--------- kernel/nsproxy.c | 2 +- kernel/posix-timers.c | 2 +- kernel/user.c | 2 +- lib/idr.c | 2 +- lib/radix-tree.c | 2 +- mm/mempolicy.c 
| 4 ++-- mm/rmap.c | 2 +- mm/shmem.c | 2 +- mm/slab.c | 17 +++++++---------- mm/slob.c | 3 +-- mm/slub.c | 4 +--- net/bridge/br_fdb.c | 2 +- net/core/flow.c | 2 +- net/core/neighbour.c | 2 +- net/core/skbuff.c | 4 ++-- net/core/sock.c | 6 +++--- net/dccp/ackvec.c | 4 ++-- net/dccp/ccid.c | 2 +- net/dccp/ccids/lib/loss_interval.c | 2 +- net/dccp/ccids/lib/packet_history.c | 4 ++-- net/dccp/proto.c | 2 +- net/decnet/dn_route.c | 2 +- net/decnet/dn_table.c | 2 +- net/ipv4/fib_hash.c | 4 ++-- net/ipv4/fib_trie.c | 2 +- net/ipv4/inetpeer.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv4/ipvs/ip_vs_conn.c | 2 +- net/ipv4/route.c | 2 +- net/ipv4/tcp.c | 2 +- net/ipv6/ip6_fib.c | 2 +- net/ipv6/route.c | 2 +- net/ipv6/xfrm6_tunnel.c | 2 +- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_expect.c | 2 +- net/netfilter/xt_hashlimit.c | 2 +- net/rxrpc/af_rxrpc.c | 2 +- net/sctp/protocol.c | 4 ++-- net/socket.c | 3 +-- net/sunrpc/rpc_pipe.c | 2 +- net/sunrpc/sched.c | 4 ++-- net/tipc/handler.c | 2 +- net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- security/keys/key.c | 2 +- security/selinux/avc.c | 2 +- security/selinux/hooks.c | 2 +- security/selinux/ss/avtab.c | 2 +- 165 files changed, 247 insertions(+), 268 deletions(-) (limited to 'kernel') diff --git a/arch/arm/plat-s3c24xx/dma.c b/arch/arm/plat-s3c24xx/dma.c index 08d80f2f51f..6d048490c55 100644 --- a/arch/arm/plat-s3c24xx/dma.c +++ b/arch/arm/plat-s3c24xx/dma.c @@ -1333,7 +1333,7 @@ int __init s3c24xx_dma_init(unsigned int channels, unsigned int irq, dma_kmem = kmem_cache_create("dma_desc", sizeof(struct s3c2410_dma_buf), 0, SLAB_HWCACHE_ALIGN, - s3c2410_dma_cache_ctor, NULL); + s3c2410_dma_cache_ctor); if (dma_kmem == NULL) { printk(KERN_ERR "dma failed to make kmem cache\n"); diff --git a/arch/arm26/mm/memc.c b/arch/arm26/mm/memc.c index 42505541a9b..ffecd857824 100644 --- a/arch/arm26/mm/memc.c +++ b/arch/arm26/mm/memc.c @@ -176,9 +176,9 @@ void __init pgtable_cache_init(void) { pte_cache = kmem_cache_create("pte-cache", sizeof(pte_t) * PTRS_PER_PTE, - 0, SLAB_PANIC, pte_cache_ctor, NULL); + 0, SLAB_PANIC, pte_cache_ctor); pgd_cache = kmem_cache_create("pgd-cache", MEMC_TABLE_SIZE + sizeof(pgd_t) * PTRS_PER_PGD, - 0, SLAB_PANIC, pgd_cache_ctor, NULL); + 0, SLAB_PANIC, pgd_cache_ctor); } diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 6a68b1ae061..6e72f22e6bb 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -752,8 +752,7 @@ void __init pgtable_cache_init(void) PTRS_PER_PMD*sizeof(pmd_t), PTRS_PER_PMD*sizeof(pmd_t), SLAB_PANIC, - pmd_ctor, - NULL); + pmd_ctor); if (!SHARED_KERNEL_PMD) { /* If we're in PAE mode and have a non-shared kernel pmd, then the pgd size must be a diff --git a/arch/ia64/ia32/ia32_support.c b/arch/ia64/ia32/ia32_support.c index beea7a0b9dc..e13a1a1db4b 100644 --- a/arch/ia64/ia32/ia32_support.c +++ b/arch/ia64/ia32/ia32_support.c @@ -253,7 +253,7 @@ ia32_init (void) partial_page_cachep = kmem_cache_create("partial_page_cache", sizeof(struct partial_page), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); } #endif return 0; diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index f72118c0844..62b7bf2f3ea 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -804,7 +804,7 @@ int __init rtas_flash_init(void) flash_block_cache = kmem_cache_create("rtas_flash_cache", RTAS_BLK_SIZE, RTAS_BLK_SIZE, 0, - rtas_block_ctor, NULL); + rtas_block_ctor); if (!flash_block_cache) { printk(KERN_ERR "%s: failed to create block 
cache\n", __FUNCTION__); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 92a1b16fb7e..4835f73af30 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -542,7 +542,7 @@ static int __init hugetlbpage_init(void) HUGEPTE_TABLE_SIZE, HUGEPTE_TABLE_SIZE, 0, - zero_ctor, NULL); + zero_ctor); if (! huge_pgtable_cache) panic("hugetlbpage_init(): could not create hugepte cache\n"); diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 1d6edf724c8..9f27bb56a61 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -178,7 +178,6 @@ void pgtable_cache_init(void) pgtable_cache[i] = kmem_cache_create(name, size, size, SLAB_PANIC, - zero_ctor, - NULL); + zero_ctor); } } diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index f37460e5bfd..7eb4d6cbcb7 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -654,7 +654,7 @@ static int __init spufs_init(void) ret = -ENOMEM; spufs_inode_cache = kmem_cache_create("spufs_inode_cache", sizeof(struct spufs_inode_info), 0, - SLAB_HWCACHE_ALIGN, spufs_init_once, NULL); + SLAB_HWCACHE_ALIGN, spufs_init_once); if (!spufs_inode_cache) goto out; diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c index d7fff752e56..b98d6c3e6f3 100644 --- a/arch/sh/kernel/cpu/sh4/sq.c +++ b/arch/sh/kernel/cpu/sh4/sq.c @@ -371,8 +371,7 @@ static int __init sq_api_init(void) printk(KERN_NOTICE "sq: Registering store queue API.\n"); sq_cache = kmem_cache_create("store_queue_cache", - sizeof(struct sq_mapping), 0, 0, - NULL, NULL); + sizeof(struct sq_mapping), 0, 0, NULL); if (unlikely(!sq_cache)) return ret; diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c index b6a5a338145..a08a4a958ad 100644 --- a/arch/sh/mm/pmb.c +++ b/arch/sh/mm/pmb.c @@ -310,7 +310,7 @@ static int __init pmb_init(void) BUG_ON(unlikely(nr_entries >= NR_PMB_ENTRIES)); pmb_cache = kmem_cache_create("pmb", sizeof(struct pmb_entry), 0, - SLAB_PANIC, pmb_cache_ctor, NULL); + SLAB_PANIC, pmb_cache_ctor); jump_to_P2(); diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c index 8eb8a7c76ec..7ff0a02f581 100644 --- a/arch/sparc64/mm/tsb.c +++ b/arch/sparc64/mm/tsb.c @@ -262,8 +262,7 @@ void __init pgtable_cache_init(void) tsb_caches[i] = kmem_cache_create(name, size, size, - 0, - NULL, NULL); + 0, NULL); if (!tsb_caches[i]) { prom_printf("Could not create %s cache\n", name); prom_halt(); diff --git a/block/bsg.c b/block/bsg.c index baa04e7adf1..f2992e72b84 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -1043,7 +1043,7 @@ static int __init bsg_init(void) dev_t devid; bsg_cmd_cachep = kmem_cache_create("bsg_cmd", - sizeof(struct bsg_command), 0, 0, NULL, NULL); + sizeof(struct bsg_command), 0, 0, NULL); if (!bsg_cmd_cachep) { printk(KERN_ERR "bsg: failed creating slab cache\n"); return -ENOMEM; diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index d7cadf30416..66056ca5e63 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3698,13 +3698,13 @@ int __init blk_dev_init(void) panic("Failed to create kblockd\n"); request_cachep = kmem_cache_create("blkdev_requests", - sizeof(struct request), 0, SLAB_PANIC, NULL, NULL); + sizeof(struct request), 0, SLAB_PANIC, NULL); requestq_cachep = kmem_cache_create("blkdev_queue", - sizeof(request_queue_t), 0, SLAB_PANIC, NULL, NULL); + sizeof(request_queue_t), 0, SLAB_PANIC, NULL); iocontext_cachep = kmem_cache_create("blkdev_ioc", - sizeof(struct io_context), 0, 
SLAB_PANIC, NULL, NULL); + sizeof(struct io_context), 0, SLAB_PANIC, NULL); for_each_possible_cpu(i) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 2e7ba615d76..00d53c2fd1e 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -1098,7 +1098,7 @@ void acpi_os_release_lock(acpi_spinlock lockp, acpi_cpu_flags flags) acpi_status acpi_os_create_cache(char *name, u16 size, u16 depth, acpi_cache_t ** cache) { - *cache = kmem_cache_create(name, size, 0, 0, NULL, NULL); + *cache = kmem_cache_create(name, size, 0, 0, NULL); if (*cache == NULL) return AE_ERROR; else diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 478489c568a..4f598270fa3 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -257,9 +257,9 @@ aoeblk_exit(void) int __init aoeblk_init(void) { - buf_pool_cache = kmem_cache_create("aoe_bufs", + buf_pool_cache = kmem_cache_create("aoe_bufs", sizeof(struct buf), - 0, 0, NULL, NULL); + 0, 0, NULL); if (buf_pool_cache == NULL) return -ENOMEM; diff --git a/drivers/ieee1394/eth1394.c b/drivers/ieee1394/eth1394.c index 93362eed94e..3a9d7e2d4de 100644 --- a/drivers/ieee1394/eth1394.c +++ b/drivers/ieee1394/eth1394.c @@ -1729,7 +1729,7 @@ static int __init ether1394_init_module(void) packet_task_cache = kmem_cache_create("packet_task", sizeof(struct packet_task), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!packet_task_cache) return -ENOMEM; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 6b8faca02f8..bc547f1d34b 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2998,7 +2998,6 @@ static int __init ib_mad_init_module(void) sizeof(struct ib_mad_private), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); if (!ib_mad_cache) { printk(KERN_ERR PFX "Couldn't create ib_mad cache\n"); diff --git a/drivers/infiniband/hw/amso1100/c2_vq.c b/drivers/infiniband/hw/amso1100/c2_vq.c index 36620a22413..cfdacb1ec27 100644 --- a/drivers/infiniband/hw/amso1100/c2_vq.c +++ b/drivers/infiniband/hw/amso1100/c2_vq.c @@ -85,7 +85,7 @@ int vq_init(struct c2_dev *c2dev) (char) ('0' + c2dev->devnum)); c2dev->host_msg_cache = kmem_cache_create(c2dev->vq_cache_name, c2dev->rep_vq.msg_size, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (c2dev->host_msg_cache == NULL) { return -ENOMEM; } diff --git a/drivers/infiniband/hw/ehca/ehca_av.c b/drivers/infiniband/hw/ehca/ehca_av.c index e53a97af126..97d108634c5 100644 --- a/drivers/infiniband/hw/ehca/ehca_av.c +++ b/drivers/infiniband/hw/ehca/ehca_av.c @@ -259,7 +259,7 @@ int ehca_init_av_cache(void) av_cache = kmem_cache_create("ehca_cache_av", sizeof(struct ehca_av), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (!av_cache) return -ENOMEM; return 0; diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c index 9e87883b561..1e8ca3fca4a 100644 --- a/drivers/infiniband/hw/ehca/ehca_cq.c +++ b/drivers/infiniband/hw/ehca/ehca_cq.c @@ -387,7 +387,7 @@ int ehca_init_cq_cache(void) cq_cache = kmem_cache_create("ehca_cache_cq", sizeof(struct ehca_cq), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (!cq_cache) return -ENOMEM; return 0; diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index 36377c6db3d..04c324330b7 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -163,7 +163,7 @@ static int ehca_create_slab_caches(void) ctblk_cache = kmem_cache_create("ehca_cache_ctblk", 
EHCA_PAGESIZE, H_CB_ALIGNMENT, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (!ctblk_cache) { ehca_gen_err("Cannot create ctblk SLAB cache."); ehca_cleanup_mrmw_cache(); diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index 6262c5462d5..9f4c9d46e8e 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -1950,13 +1950,13 @@ int ehca_init_mrmw_cache(void) mr_cache = kmem_cache_create("ehca_cache_mr", sizeof(struct ehca_mr), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (!mr_cache) return -ENOMEM; mw_cache = kmem_cache_create("ehca_cache_mw", sizeof(struct ehca_mw), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (!mw_cache) { kmem_cache_destroy(mr_cache); mr_cache = NULL; diff --git a/drivers/infiniband/hw/ehca/ehca_pd.c b/drivers/infiniband/hw/ehca/ehca_pd.c index 79d0591a804..c85312ad292 100644 --- a/drivers/infiniband/hw/ehca/ehca_pd.c +++ b/drivers/infiniband/hw/ehca/ehca_pd.c @@ -100,7 +100,7 @@ int ehca_init_pd_cache(void) pd_cache = kmem_cache_create("ehca_cache_pd", sizeof(struct ehca_pd), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (!pd_cache) return -ENOMEM; return 0; diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 48e9ceacd6f..a3146e696c5 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -1760,7 +1760,7 @@ int ehca_init_qp_cache(void) qp_cache = kmem_cache_create("ehca_cache_qp", sizeof(struct ehca_qp), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (!qp_cache) return -ENOMEM; return 0; diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index effdee299b0..5db31438027 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -637,7 +637,7 @@ static int __init iser_init(void) ig.desc_cache = kmem_cache_create("iser_descriptors", sizeof (struct iser_desc), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (ig.desc_cache == NULL) return -ENOMEM; diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index b297a6b111a..1199d3f32ac 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1332,24 +1332,24 @@ int kvm_mmu_module_init(void) { pte_chain_cache = kmem_cache_create("kvm_pte_chain", sizeof(struct kvm_pte_chain), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!pte_chain_cache) goto nomem; rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", sizeof(struct kvm_rmap_desc), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!rmap_desc_cache) goto nomem; mmu_page_cache = kmem_cache_create("kvm_mmu_page", PAGE_SIZE, - PAGE_SIZE, 0, NULL, NULL); + PAGE_SIZE, 0, NULL); if (!mmu_page_cache) goto nomem; mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", sizeof(struct kvm_mmu_page), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!mmu_page_header_cache) goto nomem; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0b66afef2d8..c8dfdb30291 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -951,7 +951,7 @@ static int grow_stripes(raid5_conf_t *conf, int num) conf->active_name = 0; sc = kmem_cache_create(conf->cache_name[conf->active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!sc) return 1; conf->slab_cache = sc; @@ -1003,7 +1003,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), - 0, 0, NULL, 
NULL); + 0, 0, NULL); if (!sc) return -ENOMEM; diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index 64a52bd7544..988c8ce47f5 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -1171,8 +1171,7 @@ static int __init i2o_block_init(void) /* Allocate request mempool and slab */ size = sizeof(struct i2o_block_request); i2o_blk_req_pool.slab = kmem_cache_create("i2o_block_req", size, 0, - SLAB_HWCACHE_ALIGN, NULL, - NULL); + SLAB_HWCACHE_ALIGN, NULL); if (!i2o_blk_req_pool.slab) { osm_err("can't init request slab\n"); rc = -ENOMEM; diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c index 8aff9385613..7c5e29eaf11 100644 --- a/drivers/mtd/ubi/eba.c +++ b/drivers/mtd/ubi/eba.c @@ -1149,7 +1149,7 @@ int ubi_eba_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si) if (ubi_devices_cnt == 0) { ltree_slab = kmem_cache_create("ubi_ltree_slab", sizeof(struct ltree_entry), 0, - 0, <ree_entry_ctor, NULL); + 0, <ree_entry_ctor); if (!ltree_slab) return -ENOMEM; } diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 9de95376209..a5a9b8d8730 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -1452,7 +1452,7 @@ int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si) if (ubi_devices_cnt == 0) { wl_entries_slab = kmem_cache_create("ubi_wl_entry_slab", sizeof(struct ubi_wl_entry), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!wl_entries_slab) return -ENOMEM; } diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 6a89cefe99b..0c67258fb9e 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -291,7 +291,7 @@ dasd_parse_keyword( char *parsestring ) { dasd_page_cache = kmem_cache_create("dasd_page_cache", PAGE_SIZE, PAGE_SIZE, SLAB_CACHE_DMA, - NULL, NULL ); + NULL); if (!dasd_page_cache) MESSAGE(KERN_WARNING, "%s", "Failed to create slab, " "fixed buffer mode disabled."); diff --git a/drivers/s390/scsi/zfcp_aux.c b/drivers/s390/scsi/zfcp_aux.c index a1db9592513..9726261c367 100644 --- a/drivers/s390/scsi/zfcp_aux.c +++ b/drivers/s390/scsi/zfcp_aux.c @@ -259,21 +259,21 @@ zfcp_module_init(void) size = sizeof(struct zfcp_fsf_req_qtcb); align = calc_alignment(size); zfcp_data.fsf_req_qtcb_cache = - kmem_cache_create("zfcp_fsf", size, align, 0, NULL, NULL); + kmem_cache_create("zfcp_fsf", size, align, 0, NULL); if (!zfcp_data.fsf_req_qtcb_cache) goto out; size = sizeof(struct fsf_status_read_buffer); align = calc_alignment(size); zfcp_data.sr_buffer_cache = - kmem_cache_create("zfcp_sr", size, align, 0, NULL, NULL); + kmem_cache_create("zfcp_sr", size, align, 0, NULL); if (!zfcp_data.sr_buffer_cache) goto out_sr_cache; size = sizeof(struct zfcp_gid_pn_data); align = calc_alignment(size); zfcp_data.gid_pn_cache = - kmem_cache_create("zfcp_gid", size, align, 0, NULL, NULL); + kmem_cache_create("zfcp_gid", size, align, 0, NULL); if (!zfcp_data.gid_pn_cache) goto out_gid_cache; diff --git a/drivers/scsi/aic94xx/aic94xx_init.c b/drivers/scsi/aic94xx/aic94xx_init.c index 1c0d7578e79..b8c6810090d 100644 --- a/drivers/scsi/aic94xx/aic94xx_init.c +++ b/drivers/scsi/aic94xx/aic94xx_init.c @@ -462,7 +462,7 @@ static int asd_create_global_caches(void) sizeof(struct asd_dma_tok), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (!asd_dma_token_cache) { asd_printk("couldn't create dma token cache\n"); return -ENOMEM; @@ -474,7 +474,7 @@ static int asd_create_global_caches(void) sizeof(struct asd_ascb), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if 
(!asd_ascb_cache) { asd_printk("couldn't create ascb cache\n"); goto Err; diff --git a/drivers/scsi/libsas/sas_init.c b/drivers/scsi/libsas/sas_init.c index 965698c8b7b..1396c83b0c9 100644 --- a/drivers/scsi/libsas/sas_init.c +++ b/drivers/scsi/libsas/sas_init.c @@ -292,7 +292,7 @@ EXPORT_SYMBOL_GPL(sas_domain_release_transport); static int __init sas_class_init(void) { sas_task_cache = kmem_cache_create("sas_task", sizeof(struct sas_task), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN, NULL); if (!sas_task_cache) return -ENOMEM; diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index b5a77b0c0de..92376f9dfdd 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -2723,7 +2723,7 @@ qla2x00_module_init(void) /* Allocate cache for SRBs. */ srb_cachep = kmem_cache_create("qla2xxx_srbs", sizeof(srb_t), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (srb_cachep == NULL) { printk(KERN_ERR "qla2xxx: Unable to allocate SRB cache...Failing load!\n"); diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index e69160a7bc6..b1d565c12c5 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -1677,7 +1677,7 @@ static int __init qla4xxx_module_init(void) /* Allocate cache for SRBs. */ srb_cachep = kmem_cache_create("qla4xxx_srbs", sizeof(struct srb), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (srb_cachep == NULL) { printk(KERN_ERR "%s: Unable to allocate SRB cache..." diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index a691dda40d2..a5de1a829a7 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -288,7 +288,7 @@ int scsi_setup_command_freelist(struct Scsi_Host *shost) if (!pool->users) { pool->slab = kmem_cache_create(pool->name, sizeof(struct scsi_cmnd), 0, - pool->slab_flags, NULL, NULL); + pool->slab_flags, NULL); if (!pool->slab) goto fail; } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 1f5a07bf2a7..da63c544919 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1661,7 +1661,7 @@ int __init scsi_init_queue(void) scsi_io_context_cache = kmem_cache_create("scsi_io_context", sizeof(struct scsi_io_context), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!scsi_io_context_cache) { printk(KERN_ERR "SCSI: can't init scsi io context cache\n"); return -ENOMEM; @@ -1672,7 +1672,7 @@ int __init scsi_init_queue(void) int size = sgp->size * sizeof(struct scatterlist); sgp->slab = kmem_cache_create(sgp->name, size, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (!sgp->slab) { printk(KERN_ERR "SCSI: can't init sg slab %s\n", sgp->name); diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c index 2570f48a69c..371b69c110b 100644 --- a/drivers/scsi/scsi_tgt_lib.c +++ b/drivers/scsi/scsi_tgt_lib.c @@ -585,7 +585,7 @@ static int __init scsi_tgt_init(void) scsi_tgt_cmd_cache = kmem_cache_create("scsi_tgt_cmd", sizeof(struct scsi_tgt_cmd), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!scsi_tgt_cmd_cache) return -ENOMEM; diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c index 76c555a67da..805e5fc5f5d 100644 --- a/drivers/usb/host/uhci-hcd.c +++ b/drivers/usb/host/uhci-hcd.c @@ -933,7 +933,7 @@ static int __init uhci_hcd_init(void) } uhci_up_cachep = kmem_cache_create("uhci_urb_priv", - sizeof(struct urb_priv), 0, 0, NULL, NULL); + sizeof(struct urb_priv), 0, 0, NULL); if (!uhci_up_cachep) goto up_failed; diff --git a/drivers/usb/mon/mon_text.c 
b/drivers/usb/mon/mon_text.c index 982b773d71e..8f27a9e1c36 100644 --- a/drivers/usb/mon/mon_text.c +++ b/drivers/usb/mon/mon_text.c @@ -340,7 +340,7 @@ static int mon_text_open(struct inode *inode, struct file *file) snprintf(rp->slab_name, SLAB_NAME_SZ, "mon_text_%p", rp); rp->e_slab = kmem_cache_create(rp->slab_name, sizeof(struct mon_event_text), sizeof(long), 0, - mon_text_ctor, NULL); + mon_text_ctor); if (rp->e_slab == NULL) { rc = -ENOMEM; goto err_slab; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index de2ed5ca335..1c9fd302949 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -234,14 +234,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag inode_init_once(&ei->vfs_inode); } - + static int init_inodecache(void) { adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", sizeof(struct adfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (adfs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/affs/super.c b/fs/affs/super.c index 6d0ebc32153..c80191ae205 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -99,7 +99,7 @@ static int init_inodecache(void) sizeof(struct affs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (affs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/afs/super.c b/fs/afs/super.c index 993cdf1cce3..b8808b40f82 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -89,8 +89,7 @@ int __init afs_fs_init(void) sizeof(struct afs_vnode), 0, SLAB_HWCACHE_ALIGN, - afs_i_init_once, - NULL); + afs_i_init_once); if (!afs_inode_cachep) { printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n"); return ret; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index a5c5171c282..a4514182768 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -414,7 +414,7 @@ befs_read_inode(struct inode *inode) } /* Initialize the inode cache. Called at fs setup. - * + * * Taken from NFS implementation by Al Viro. 
*/ static int @@ -424,7 +424,7 @@ befs_init_inodecache(void) sizeof (struct befs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (befs_inode_cachep == NULL) { printk(KERN_ERR "befs_init_inodecache: " "Couldn't initalize inode slabcache\n"); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 58c7bd9f530..f346eb14e86 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -250,14 +250,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag inode_init_once(&bi->vfs_inode); } - + static int init_inodecache(void) { bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (bfs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/bio.c b/fs/bio.c index 33e46340a76..0d2c2d38b7b 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1187,7 +1187,7 @@ static void __init biovec_init_slabs(void) size = bvs->nr_vecs * sizeof(struct bio_vec); bvs->slab = kmem_cache_create(bvs->name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } } diff --git a/fs/block_dev.c b/fs/block_dev.c index 3635315e3b9..2980eabe577 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -517,7 +517,7 @@ void __init bdev_cache_init(void) bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_PANIC), - init_once, NULL); + init_once); err = register_filesystem(&bd_type); if (err) panic("Cannot register bdev pseudo-fs"); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 1fd0dc85f53..cabb6a55d7d 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -719,7 +719,7 @@ cifs_init_inodecache(void) sizeof (struct cifsInodeInfo), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - cifs_init_once, NULL); + cifs_init_once); if (cifs_inode_cachep == NULL) return -ENOMEM; @@ -748,7 +748,7 @@ cifs_init_request_bufs(void) cifs_req_cachep = kmem_cache_create("cifs_request", CIFSMaxBufSize + MAX_CIFS_HDR_SIZE, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (cifs_req_cachep == NULL) return -ENOMEM; @@ -776,7 +776,7 @@ cifs_init_request_bufs(void) alloc of large cifs buffers even when page debugging is on */ cifs_sm_req_cachep = kmem_cache_create("cifs_small_rq", MAX_CIFS_SMALL_BUFFER_SIZE, 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (cifs_sm_req_cachep == NULL) { mempool_destroy(cifs_req_poolp); kmem_cache_destroy(cifs_req_cachep); @@ -817,7 +817,7 @@ cifs_init_mids(void) { cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids", sizeof (struct mid_q_entry), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (cifs_mid_cachep == NULL) return -ENOMEM; @@ -830,7 +830,7 @@ cifs_init_mids(void) cifs_oplock_cachep = kmem_cache_create("cifs_oplock_structs", sizeof (struct oplock_q_entry), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (cifs_oplock_cachep == NULL) { mempool_destroy(cifs_mid_poolp); kmem_cache_destroy(cifs_mid_cachep); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 6771a4271e3..342f4e0d582 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -64,13 +64,13 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag inode_init_once(&ei->vfs_inode); } - + int coda_init_inodecache(void) { coda_inode_cachep = kmem_cache_create("coda_inode_cache", sizeof(struct coda_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once, NULL); + 
init_once); if (coda_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index b00d962de83..871b0cb6183 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -136,7 +136,7 @@ static int __init configfs_init(void) configfs_dir_cachep = kmem_cache_create("configfs_dir_cache", sizeof(struct configfs_dirent), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!configfs_dir_cachep) goto out; diff --git a/fs/dcache.c b/fs/dcache.c index cb9d05056b5..678d39deb60 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2165,10 +2165,10 @@ void __init vfs_caches_init(unsigned long mempages) mempages -= reserve; names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); dcache_init(mempages); inode_init(mempages); diff --git a/fs/dcookies.c b/fs/dcookies.c index 21af1629f9b..c1208f53bd7 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -205,7 +205,7 @@ static int dcookie_init(void) dcookie_cache = kmem_cache_create("dcookie_cache", sizeof(struct dcookie_struct), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!dcookie_cache) goto out; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 0553a6158dc..dd362739d29 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1449,7 +1449,7 @@ int dlm_lowcomms_start(void) error = -ENOMEM; con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), __alignof__(struct connection), 0, - NULL, NULL); + NULL); if (!con_cache) goto out; diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index fb9e2ee998a..ecf0e5cb203 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -23,7 +23,7 @@ int dlm_memory_init(void) int ret = 0; lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), - __alignof__(struct dlm_lkb), 0, NULL, NULL); + __alignof__(struct dlm_lkb), 0, NULL); if (!lkb_cache) ret = -ENOMEM; return ret; diff --git a/fs/dnotify.c b/fs/dnotify.c index 936409fcd93..28d01ed66de 100644 --- a/fs/dnotify.c +++ b/fs/dnotify.c @@ -176,7 +176,7 @@ EXPORT_SYMBOL_GPL(dnotify_parent); static int __init dnotify_init(void) { dn_cache = kmem_cache_create("dnotify_cache", - sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL, NULL); + sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL); return 0; } diff --git a/fs/dquot.c b/fs/dquot.c index 7e273151f58..de9a29f64ff 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -1848,11 +1848,11 @@ static int __init dquot_init(void) register_sysctl_table(sys_table); - dquot_cachep = kmem_cache_create("dquot", + dquot_cachep = kmem_cache_create("dquot", sizeof(struct dquot), sizeof(unsigned long) * 4, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_PANIC), - NULL, NULL); + NULL); order = 0; dquot_hash = (struct hlist_head *)__get_free_pages(GFP_ATOMIC, order); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 02ca6f1e55d..e557a676692 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -677,7 +677,7 @@ static int ecryptfs_init_kmem_caches(void) info = &ecryptfs_cache_infos[i]; *(info->cache) = kmem_cache_create(info->name, info->size, - 0, SLAB_HWCACHE_ALIGN, info->ctor, NULL); + 0, SLAB_HWCACHE_ALIGN, info->ctor); if (!*(info->cache)) { ecryptfs_free_kmem_caches(); ecryptfs_printk(KERN_WARNING, "%s: " diff --git a/fs/efs/super.c b/fs/efs/super.c index d360c81f3a7..ce4acb8ff81 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -75,13 
+75,13 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag inode_init_once(&ei->vfs_inode); } - + static int init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", sizeof(struct efs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once, NULL); + init_once); if (efs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 0b73cd45a06..77b9953624f 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1324,12 +1324,12 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC, - NULL, NULL); + NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, - EPI_SLAB_DEBUG|SLAB_PANIC, NULL, NULL); + EPI_SLAB_DEBUG|SLAB_PANIC, NULL); return 0; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index a6b1072daea..68579a0ed3f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -167,14 +167,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag #endif inode_init_once(&ei->vfs_inode); } - + static int init_inodecache(void) { ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", sizeof(struct ext2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (ext2_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 4f84dc86628..f0614e3f1fe 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -490,7 +490,7 @@ static int init_inodecache(void) sizeof(struct ext3_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (ext3_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6dcbb28dc06..75adbb64e02 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -541,7 +541,7 @@ static int init_inodecache(void) sizeof(struct ext4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 3c9c8a15ec7..be6f89b152c 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -48,7 +48,7 @@ int __init fat_cache_init(void) fat_cache_cachep = kmem_cache_create("fat_cache", sizeof(struct fat_cache), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once, NULL); + init_once); if (fat_cache_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 0a7ddb39a59..4baa5f20536 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -514,7 +514,7 @@ static int __init fat_init_inodecache(void) sizeof(struct msdos_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (fat_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/fcntl.c b/fs/fcntl.c index 3f22e9f4f69..78b2ff04405 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -638,7 +638,7 @@ EXPORT_SYMBOL(kill_fasync); static int __init fasync_init(void) { fasync_cache = kmem_cache_create("fasync_cache", - sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL, NULL); + sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); return 0; } diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 647d600f0bc..4f95572d272 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -263,8 +263,8 @@ 
vxfs_init(void) int rv; vxfs_inode_cachep = kmem_cache_create("vxfs_inode", - sizeof(struct vxfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL, NULL); + sizeof(struct vxfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); if (!vxfs_inode_cachep) return -ENOMEM; rv = register_filesystem(&vxfs_fs_type); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 357764d85ff..3ad22beb24c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1044,7 +1044,7 @@ int __init fuse_dev_init(void) int err = -ENOMEM; fuse_req_cachep = kmem_cache_create("fuse_request", sizeof(struct fuse_req), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!fuse_req_cachep) goto out; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cc5efc13496..5448f625ab5 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -706,7 +706,7 @@ static int __init fuse_fs_init(void) fuse_inode_cachep = kmem_cache_create("fuse_inode", sizeof(struct fuse_inode), 0, SLAB_HWCACHE_ALIGN, - fuse_inode_init_once, NULL); + fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) goto out_unreg2; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 787a0edef10..d5d4e68b880 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -72,7 +72,7 @@ static int __init init_gfs2_fs(void) gfs2_glock_cachep = kmem_cache_create("gfs2_glock", sizeof(struct gfs2_glock), 0, 0, - gfs2_init_glock_once, NULL); + gfs2_init_glock_once); if (!gfs2_glock_cachep) goto fail; @@ -80,13 +80,13 @@ static int __init init_gfs2_fs(void) sizeof(struct gfs2_inode), 0, SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD, - gfs2_init_inode_once, NULL); + gfs2_init_inode_once); if (!gfs2_inode_cachep) goto fail; gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata", sizeof(struct gfs2_bufdata), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!gfs2_bufdata_cachep) goto fail; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 92cf8751e42..6c5f92dfb50 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -443,7 +443,7 @@ static int __init init_hfs_fs(void) hfs_inode_cachep = kmem_cache_create("hfs_inode_cache", sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN, - hfs_init_once, NULL); + hfs_init_once); if (!hfs_inode_cachep) return -ENOMEM; err = register_filesystem(&hfs_fs_type); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 6d87a2a9534..7b0f2e5a44e 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -479,7 +479,7 @@ static int __init init_hfsplus_fs(void) hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache", HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN, - hfsplus_init_once, NULL); + hfsplus_init_once); if (!hfsplus_inode_cachep) return -ENOMEM; err = register_filesystem(&hfsplus_fs_type); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 29cc34abb2e..89612ee7c80 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -181,14 +181,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag mutex_init(&ei->i_parent_mutex); inode_init_once(&ei->vfs_inode); } - + static int init_inodecache(void) { hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", sizeof(struct hpfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (hpfs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d145cb79c30..c848a191525 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -848,7 +848,7 @@ static int __init init_hugetlbfs_fs(void) hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), - 0, 0, init_once, 
NULL); + 0, 0, init_once); if (hugetlbfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/inode.c b/fs/inode.c index 320e088d0b2..29f5068f819 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1388,8 +1388,7 @@ void __init inode_init(unsigned long mempages) 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| SLAB_MEM_SPREAD), - init_once, - NULL); + init_once); register_shrinker(&icache_shrinker); /* Hash may have been set up in inode_init_early */ diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 9f2224f65a1..9bf2f6c09df 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -716,10 +716,10 @@ static int __init inotify_user_setup(void) watch_cachep = kmem_cache_create("inotify_watch_cache", sizeof(struct inotify_user_watch), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); event_cachep = kmem_cache_create("inotify_event_cache", sizeof(struct inotify_kernel_event), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); return 0; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 4f5418be059..95c72aa8186 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -86,7 +86,7 @@ static int init_inodecache(void) sizeof(struct iso_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (isofs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 46fe7439fb9..06ab3c10b1b 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1668,7 +1668,7 @@ static int journal_create_jbd_slab(size_t slab_size) * boundary. */ jbd_slab[i] = kmem_cache_create(jbd_slab_names[i], - slab_size, slab_size, 0, NULL, NULL); + slab_size, slab_size, 0, NULL); if (!jbd_slab[i]) { printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n"); return -ENOMEM; @@ -1711,8 +1711,7 @@ static int journal_init_journal_head_cache(void) sizeof(struct journal_head), 0, /* offset */ 0, /* flags */ - NULL, /* ctor */ - NULL); /* dtor */ + NULL); /* ctor */ retval = 0; if (journal_head_cache == 0) { retval = -ENOMEM; @@ -2008,8 +2007,7 @@ static int __init journal_init_handle_cache(void) sizeof(handle_t), 0, /* offset */ 0, /* flags */ - NULL, /* ctor */ - NULL); /* dtor */ + NULL); /* ctor */ if (jbd_handle_cache == NULL) { printk(KERN_EMERG "JBD: failed to create handle cache\n"); return -ENOMEM; diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 8db2fa25170..62e13c8db13 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -170,13 +170,13 @@ int __init journal_init_revoke_caches(void) { revoke_record_cache = kmem_cache_create("revoke_record", sizeof(struct jbd_revoke_record_s), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN, NULL); if (revoke_record_cache == 0) return -ENOMEM; revoke_table_cache = kmem_cache_create("revoke_table", sizeof(struct jbd_revoke_table_s), - 0, 0, NULL, NULL); + 0, 0, NULL); if (revoke_table_cache == 0) { kmem_cache_destroy(revoke_record_cache); revoke_record_cache = NULL; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f290cb7cb83..f37324aee81 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1680,7 +1680,7 @@ static int jbd2_journal_create_jbd_slab(size_t slab_size) * boundary. 
*/ jbd_slab[i] = kmem_cache_create(jbd_slab_names[i], - slab_size, slab_size, 0, NULL, NULL); + slab_size, slab_size, 0, NULL); if (!jbd_slab[i]) { printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n"); return -ENOMEM; @@ -1723,8 +1723,7 @@ static int journal_init_jbd2_journal_head_cache(void) sizeof(struct journal_head), 0, /* offset */ 0, /* flags */ - NULL, /* ctor */ - NULL); /* dtor */ + NULL); /* ctor */ retval = 0; if (jbd2_journal_head_cache == 0) { retval = -ENOMEM; @@ -2006,8 +2005,7 @@ static int __init journal_init_handle_cache(void) sizeof(handle_t), 0, /* offset */ 0, /* flags */ - NULL, /* ctor */ - NULL); /* dtor */ + NULL); /* ctor */ if (jbd2_handle_cache == NULL) { printk(KERN_EMERG "JBD: failed to create handle cache\n"); return -ENOMEM; diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 28cac049a56..01d88975e0c 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -171,13 +171,13 @@ int __init jbd2_journal_init_revoke_caches(void) { jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", sizeof(struct jbd2_revoke_record_s), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN, NULL); if (jbd2_revoke_record_cache == 0) return -ENOMEM; jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", sizeof(struct jbd2_revoke_table_s), - 0, 0, NULL, NULL); + 0, 0, NULL); if (jbd2_revoke_table_cache == 0) { kmem_cache_destroy(jbd2_revoke_record_cache); jbd2_revoke_record_cache = NULL; diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index 35c1a5e30ba..f9211252b5f 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -33,56 +33,56 @@ int __init jffs2_create_slab_caches(void) { full_dnode_slab = kmem_cache_create("jffs2_full_dnode", sizeof(struct jffs2_full_dnode), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!full_dnode_slab) goto err; raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent", sizeof(struct jffs2_raw_dirent), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!raw_dirent_slab) goto err; raw_inode_slab = kmem_cache_create("jffs2_raw_inode", sizeof(struct jffs2_raw_inode), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!raw_inode_slab) goto err; tmp_dnode_info_slab = kmem_cache_create("jffs2_tmp_dnode", sizeof(struct jffs2_tmp_dnode_info), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!tmp_dnode_info_slab) goto err; raw_node_ref_slab = kmem_cache_create("jffs2_refblock", sizeof(struct jffs2_raw_node_ref) * (REFS_PER_BLOCK + 1), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!raw_node_ref_slab) goto err; node_frag_slab = kmem_cache_create("jffs2_node_frag", sizeof(struct jffs2_node_frag), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!node_frag_slab) goto err; inode_cache_slab = kmem_cache_create("jffs2_inode_cache", sizeof(struct jffs2_inode_cache), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!inode_cache_slab) goto err; #ifdef CONFIG_JFFS2_FS_XATTR xattr_datum_cache = kmem_cache_create("jffs2_xattr_datum", sizeof(struct jffs2_xattr_datum), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!xattr_datum_cache) goto err; xattr_ref_cache = kmem_cache_create("jffs2_xattr_ref", sizeof(struct jffs2_xattr_ref), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!xattr_ref_cache) goto err; #endif diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index e220d3bd610..be2b70c2ec1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -192,7 +192,7 @@ static int __init init_jffs2_fs(void) sizeof(struct jffs2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - jffs2_i_init_once, NULL); + jffs2_i_init_once); if (!jffs2_inode_cachep) { printk(KERN_ERR "JFFS2 error: Failed to initialise inode cache\n"); 
return -ENOMEM; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 77c7f1129dd..62e96be02ac 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -213,7 +213,7 @@ int __init metapage_init(void) * Allocate the metapage structures */ metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage), - 0, 0, init_once, NULL); + 0, 0, init_once); if (metapage_cache == NULL) return -ENOMEM; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 929fceca799..4b372f55065 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -776,7 +776,7 @@ static int __init init_jfs_fs(void) jfs_inode_cachep = kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once, NULL); + init_once); if (jfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/locks.c b/fs/locks.c index 4f2d749ac62..31051063724 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2276,7 +2276,7 @@ static int __init filelock_init(void) { filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, - init_once, NULL); + init_once); return 0; } diff --git a/fs/mbcache.c b/fs/mbcache.c index fbb1d02f879..1046cbefbfb 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -292,7 +292,7 @@ mb_cache_create(const char *name, struct mb_cache_op *cache_op, INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]); } cache->c_entry_cache = kmem_cache_create(name, entry_size, 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL, NULL); + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); if (!cache->c_entry_cache) goto fail; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index be4044614ac..43668d7d668 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -75,14 +75,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag inode_init_once(&ei->vfs_inode); } - + static int init_inodecache(void) { minix_inode_cachep = kmem_cache_create("minix_inode_cache", sizeof(struct minix_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (minix_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/namespace.c b/fs/namespace.c index 4198003d7e1..ddbda13c2d3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1801,7 +1801,7 @@ void __init mnt_init(unsigned long mempages) init_rwsem(&namespace_sem); mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index cf06eb9f050..7f8536dbded 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -63,14 +63,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag mutex_init(&ei->open_mutex); inode_init_once(&ei->vfs_inode); } - + static int init_inodecache(void) { ncp_inode_cachep = kmem_cache_create("ncp_inode_cache", sizeof(struct ncp_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (ncp_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index a5c82b6f3b4..fcf4d384610 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -875,7 +875,7 @@ int __init nfs_init_directcache(void) sizeof(struct nfs_direct_req), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - NULL, NULL); + NULL); if (nfs_direct_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3d9fccf4ef9..bca6cdcb9f0 100644 --- a/fs/nfs/inode.c +++ 
b/fs/nfs/inode.c @@ -1165,14 +1165,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag nfsi->npages = 0; nfs4_init_once(nfsi); } - + static int __init nfs_init_inodecache(void) { nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (nfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index f56dae5216f..345bb9b4765 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -442,7 +442,7 @@ int __init nfs_init_nfspagecache(void) nfs_page_cachep = kmem_cache_create("nfs_page", sizeof(struct nfs_page), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (nfs_page_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 6ae2e58ed05..19e05633f4e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -598,7 +598,7 @@ int __init nfs_init_readpagecache(void) nfs_rdata_cachep = kmem_cache_create("nfs_read_data", sizeof(struct nfs_read_data), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (nfs_rdata_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 73ac992ece8..ef97e0c0f5b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1467,7 +1467,7 @@ int __init nfs_init_writepagecache(void) nfs_wdata_cachep = kmem_cache_create("nfs_write_data", sizeof(struct nfs_write_data), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (nfs_wdata_cachep == NULL) return -ENOMEM; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6284807bd37..3f559700788 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1032,19 +1032,19 @@ static int nfsd4_init_slabs(void) { stateowner_slab = kmem_cache_create("nfsd4_stateowners", - sizeof(struct nfs4_stateowner), 0, 0, NULL, NULL); + sizeof(struct nfs4_stateowner), 0, 0, NULL); if (stateowner_slab == NULL) goto out_nomem; file_slab = kmem_cache_create("nfsd4_files", - sizeof(struct nfs4_file), 0, 0, NULL, NULL); + sizeof(struct nfs4_file), 0, 0, NULL); if (file_slab == NULL) goto out_nomem; stateid_slab = kmem_cache_create("nfsd4_stateids", - sizeof(struct nfs4_stateid), 0, 0, NULL, NULL); + sizeof(struct nfs4_stateid), 0, 0, NULL); if (stateid_slab == NULL) goto out_nomem; deleg_slab = kmem_cache_create("nfsd4_delegations", - sizeof(struct nfs4_delegation), 0, 0, NULL, NULL); + sizeof(struct nfs4_delegation), 0, 0, NULL); if (deleg_slab == NULL) goto out_nomem; return 0; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 4566b918255..90c4e3a2970 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3143,7 +3143,7 @@ static int __init init_ntfs_fs(void) ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name, sizeof(ntfs_index_context), 0 /* offset */, - SLAB_HWCACHE_ALIGN, NULL /* ctor */, NULL /* dtor */); + SLAB_HWCACHE_ALIGN, NULL /* ctor */); if (!ntfs_index_ctx_cache) { printk(KERN_CRIT "NTFS: Failed to create %s!\n", ntfs_index_ctx_cache_name); @@ -3151,7 +3151,7 @@ static int __init init_ntfs_fs(void) } ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, sizeof(ntfs_attr_search_ctx), 0 /* offset */, - SLAB_HWCACHE_ALIGN, NULL /* ctor */, NULL /* dtor */); + SLAB_HWCACHE_ALIGN, NULL /* ctor */); if (!ntfs_attr_ctx_cache) { printk(KERN_CRIT "NTFS: Failed to create %s!\n", ntfs_attr_ctx_cache_name); @@ -3160,7 +3160,7 @@ static int __init init_ntfs_fs(void) ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name, (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + 
SLAB_HWCACHE_ALIGN, NULL); if (!ntfs_name_cache) { printk(KERN_CRIT "NTFS: Failed to create %s!\n", ntfs_name_cache_name); @@ -3169,7 +3169,7 @@ static int __init init_ntfs_fs(void) ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name, sizeof(ntfs_inode), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL, NULL); + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); if (!ntfs_inode_cache) { printk(KERN_CRIT "NTFS: Failed to create %s!\n", ntfs_inode_cache_name); @@ -3179,7 +3179,7 @@ static int __init init_ntfs_fs(void) ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, sizeof(big_ntfs_inode), 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - ntfs_big_inode_init_once, NULL); + ntfs_big_inode_init_once); if (!ntfs_big_inode_cache) { printk(KERN_CRIT "NTFS: Failed to create %s!\n", ntfs_big_inode_cache_name); diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index fd8cb1badc9..7418dc83de1 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -592,7 +592,7 @@ static int __init init_dlmfs_fs(void) sizeof(struct dlmfs_inode_private), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - dlmfs_init_once, NULL); + dlmfs_init_once); if (!dlmfs_inode_cache) return -ENOMEM; cleanup_inode = 1; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 65b2b9b9268..62e4a7daa28 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -510,7 +510,7 @@ int dlm_init_mle_cache(void) dlm_mle_cache = kmem_cache_create("dlm_mle_cache", sizeof(struct dlm_master_list_entry), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (dlm_mle_cache == NULL) return -ENOMEM; return 0; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3a5a1ed09ac..200c7d4790d 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -984,7 +984,7 @@ static int ocfs2_initialize_mem_caches(void) 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - ocfs2_inode_init_once, NULL); + ocfs2_inode_init_once); if (!ocfs2_inode_cachep) return -ENOMEM; diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 39814b900fc..4da8851f2b2 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -548,7 +548,7 @@ int __init init_ocfs2_uptodate_cache(void) { ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", sizeof(struct ocfs2_meta_cache_item), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN, NULL); if (!ocfs2_uptodate_cachep) return -ENOMEM; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index e62397341c3..dd86be2aa6c 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -431,7 +431,7 @@ static int __init init_openprom_fs(void) 0, (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD), - op_inode_init_once, NULL); + op_inode_init_once); if (!op_inode_cachep) return -ENOMEM; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index dd28e86ab42..94e2c1adf18 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -112,14 +112,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag inode_init_once(&ei->vfs_inode); } - + int __init proc_init_inodecache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (proc_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 8d256eb1181..1bc8d873a9e 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -545,7 +545,7 @@ static int init_inodecache(void) sizeof(struct qnx4_inode_info), 0, 
(SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (qnx4_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 5a93cfe1a03..5b68dd3f191 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -527,7 +527,7 @@ static int init_inodecache(void) reiserfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (reiserfs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 2284e03342c..dae7945f90e 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -572,14 +572,14 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags) inode_init_once(&ei->vfs_inode); } - + static int init_inodecache(void) { romfs_inode_cachep = kmem_cache_create("romfs_inode_cache", sizeof(struct romfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (romfs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 6724a6cf01f..73d1450a95d 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -73,14 +73,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag inode_init_once(&ei->vfs_inode); } - + static int init_inodecache(void) { smb_inode_cachep = kmem_cache_create("smb_inode_cache", sizeof(struct smb_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (smb_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c index 3f54a0f80fa..ca4b2d59c0c 100644 --- a/fs/smbfs/request.c +++ b/fs/smbfs/request.c @@ -40,7 +40,7 @@ int smb_init_request_cache(void) req_cachep = kmem_cache_create("smb_request", sizeof(struct smb_request), 0, SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (req_cachep == NULL) return -ENOMEM; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 60714d075c2..fbc7b65fe26 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -86,7 +86,7 @@ int __init sysfs_init(void) sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", sizeof(struct sysfs_dirent), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!sysfs_dir_cachep) goto out; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 56441169339..7c4e5d302ab 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -342,7 +342,7 @@ int __init sysv_init_icache(void) sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", sizeof(struct sysv_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once, NULL); + init_once); if (!sysv_inode_cachep) return -ENOMEM; return 0; diff --git a/fs/udf/super.c b/fs/udf/super.c index 911387aa181..72097ee6b75 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -149,7 +149,7 @@ static int init_inodecache(void) sizeof(struct udf_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (udf_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 2b3011689e8..73402c5eeb8 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1240,14 +1240,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag inode_init_once(&ei->vfs_inode); } - + static int init_inodecache(void) { ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", sizeof(struct ufs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (ufs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git 
a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index 4b6470cf87f..b4acc7f3c37 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -74,14 +74,14 @@ extern void kmem_free(void *, size_t); static inline kmem_zone_t * kmem_zone_init(int size, char *zone_name) { - return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL); + return kmem_cache_create(zone_name, size, 0, 0, NULL); } static inline kmem_zone_t * kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, void (*construct)(void *, kmem_zone_t *, unsigned long)) { - return kmem_cache_create(zone_name, size, 0, flags, construct, NULL); + return kmem_cache_create(zone_name, size, 0, flags, construct); } static inline void diff --git a/include/linux/i2o.h b/include/linux/i2o.h index 333a370a3bd..9752307d16b 100644 --- a/include/linux/i2o.h +++ b/include/linux/i2o.h @@ -946,8 +946,7 @@ static inline int i2o_pool_alloc(struct i2o_pool *pool, const char *name, strcpy(pool->name, name); pool->slab = - kmem_cache_create(pool->name, size, 0, SLAB_HWCACHE_ALIGN, NULL, - NULL); + kmem_cache_create(pool->name, size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!pool->slab) goto free_name; diff --git a/include/linux/slab.h b/include/linux/slab.h index 0e1d0daef6a..7d0ecc1659f 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -51,7 +51,6 @@ int slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, - void (*)(void *, struct kmem_cache *, unsigned long), void (*)(void *, struct kmem_cache *, unsigned long)); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); @@ -70,7 +69,7 @@ int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr); */ #define KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\ sizeof(struct __struct), __alignof__(struct __struct),\ - (__flags), NULL, NULL) + (__flags), NULL) /* * The largest kmalloc size supported by the slab allocators is diff --git a/ipc/mqueue.c b/ipc/mqueue.c index a242c83d89d..145d5a0d299 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1253,7 +1253,7 @@ static int __init init_mqueue_fs(void) mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache", sizeof(struct mqueue_inode_info), 0, - SLAB_HWCACHE_ALIGN, init_once, NULL); + SLAB_HWCACHE_ALIGN, init_once); if (mqueue_inode_cachep == NULL) return -ENOMEM; diff --git a/kernel/fork.c b/kernel/fork.c index 46983899822..7332e236d36 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -137,7 +137,7 @@ void __init fork_init(unsigned long mempages) /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", sizeof(struct task_struct), - ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); + ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); #endif /* @@ -1446,22 +1446,22 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, - sighand_ctor, NULL); + sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); - files_cachep = kmem_cache_create("files_cache", + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); - fs_cachep = kmem_cache_create("fs_cache", + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - 
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL, NULL); + SLAB_PANIC, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 10f0bbba382..a4fb7d46971 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -193,7 +193,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, static int __init nsproxy_cache_init(void) { nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); return 0; } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 329ce017207..55b3761edaa 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -241,7 +241,7 @@ static __init int init_posix_timers(void) register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, 0, NULL, NULL); + sizeof (struct k_itimer), 0, 0, NULL); idr_init(&posix_timers_id); return 0; } diff --git a/kernel/user.c b/kernel/user.c index 98b82507797..e7d11cef699 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -208,7 +208,7 @@ static int __init uid_cache_init(void) int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_LIST_HEAD(init_user_ns.uidhash_table + n); diff --git a/lib/idr.c b/lib/idr.c index 5ca67b3cfd3..ffd61941e75 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -590,7 +590,7 @@ static int init_id_cache(void) { if (!idr_layer_cache) idr_layer_cache = kmem_cache_create("idr_layer_cache", - sizeof(struct idr_layer), 0, 0, idr_cache_ctor, NULL); + sizeof(struct idr_layer), 0, 0, idr_cache_ctor); return 0; } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 9927cca14cb..514efb200be 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1021,7 +1021,7 @@ void __init radix_tree_init(void) { radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, - SLAB_PANIC, radix_tree_node_ctor, NULL); + SLAB_PANIC, radix_tree_node_ctor); radix_tree_init_maxindex(); hotcpu_notifier(radix_tree_callback, 0); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9f4e9b95e8f..71b84b45154 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1605,11 +1605,11 @@ void __init numa_policy_init(void) policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); /* * Set interleaving policy for system init. 
Interleaving is only diff --git a/mm/rmap.c b/mm/rmap.c index fede5c7910b..41ac39749ef 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -149,7 +149,7 @@ static void anon_vma_ctor(void *data, struct kmem_cache *cachep, void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); } /* diff --git a/mm/shmem.c b/mm/shmem.c index ad155c7745d..fcd19d323f9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2322,7 +2322,7 @@ static int init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, 0, init_once, NULL); + 0, 0, init_once); if (shmem_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/mm/slab.c b/mm/slab.c index c3feeaab387..bde271c001b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1484,7 +1484,7 @@ void __init kmem_cache_init(void) sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); if (INDEX_AC != INDEX_L3) { sizes[INDEX_L3].cs_cachep = @@ -1492,7 +1492,7 @@ void __init kmem_cache_init(void) sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); } slab_early_init = 0; @@ -1510,7 +1510,7 @@ void __init kmem_cache_init(void) sizes->cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); } #ifdef CONFIG_ZONE_DMA sizes->cs_dmacachep = kmem_cache_create( @@ -1519,7 +1519,7 @@ void __init kmem_cache_init(void) ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC, - NULL, NULL); + NULL); #endif sizes++; names++; @@ -2101,12 +2101,10 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) * @align: The required alignment for the objects. * @flags: SLAB flags * @ctor: A constructor for the objects. - * @dtor: A destructor for the objects (not implemented anymore). * * Returns a ptr to the cache on success, NULL on failure. * Cannot be called within a int, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache - * and the @dtor is run before the pages are handed back. + * The @ctor is run when new pages are allocated by the cache. * * @name must be valid until the cache is destroyed. This implies that * the module calling this has to destroy the cache before getting unloaded. @@ -2126,8 +2124,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void*, struct kmem_cache *, unsigned long), - void (*dtor)(void*, struct kmem_cache *, unsigned long)) + void (*ctor)(void*, struct kmem_cache *, unsigned long)) { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; @@ -2136,7 +2133,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * Sanity checks... these are all serious usage bugs. 
*/ if (!name || in_interrupt() || (size < BYTES_PER_WORD) || - size > KMALLOC_MAX_SIZE || dtor) { + size > KMALLOC_MAX_SIZE) { printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, name); BUG(); diff --git a/mm/slob.c b/mm/slob.c index c89ef116d7a..d50920ecc02 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -492,8 +492,7 @@ struct kmem_cache { struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void*, struct kmem_cache *, unsigned long), - void (*dtor)(void*, struct kmem_cache *, unsigned long)) + void (*ctor)(void*, struct kmem_cache *, unsigned long)) { struct kmem_cache *c; diff --git a/mm/slub.c b/mm/slub.c index 322f3a5d72c..9b2d6178d06 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2668,12 +2668,10 @@ static struct kmem_cache *find_mergeable(size_t size, struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void *, struct kmem_cache *, unsigned long), - void (*dtor)(void *, struct kmem_cache *, unsigned long)) + void (*ctor)(void *, struct kmem_cache *, unsigned long)) { struct kmem_cache *s; - BUG_ON(dtor); down_write(&slub_lock); s = find_mergeable(size, align, flags, ctor); if (s) { diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 3fc69729381..69b70977f00 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -36,7 +36,7 @@ int __init br_fdb_init(void) br_fdb_cache = kmem_cache_create("bridge_fdb_cache", sizeof(struct net_bridge_fdb_entry), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (!br_fdb_cache) return -ENOMEM; diff --git a/net/core/flow.c b/net/core/flow.c index 051430545a0..0ab5234b17d 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -350,7 +350,7 @@ static int __init flow_cache_init(void) flow_cachep = kmem_cache_create("flow_cache", sizeof(struct flow_cache_entry), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL, NULL); + NULL); flow_hash_shift = 10; flow_lwm = 2 * flow_hash_size; flow_hwm = 4 * flow_hash_size; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9df26a07f06..ca2a1533138 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1347,7 +1347,7 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl) tbl->kmem_cachep = kmem_cache_create(tbl->id, tbl->entry_size, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL, NULL); + NULL); tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) panic("cannot create neighbour cache statistics"); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0583e8498f1..35021eb3ed0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2021,13 +2021,13 @@ void __init skb_init(void) sizeof(struct sk_buff), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL, NULL); + NULL); skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", (2*sizeof(struct sk_buff)) + sizeof(atomic_t), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL, NULL); + NULL); } /** diff --git a/net/core/sock.c b/net/core/sock.c index 239a08a6ff2..bd209c4477a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1767,7 +1767,7 @@ int proto_register(struct proto *prot, int alloc_slab) if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", @@ -1785,7 +1785,7 @@ int proto_register(struct proto *prot, int alloc_slab) sprintf(request_sock_slab_name, mask, prot->name); prot->rsk_prot->slab = 
kmem_cache_create(request_sock_slab_name, prot->rsk_prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (prot->rsk_prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", @@ -1807,7 +1807,7 @@ int proto_register(struct proto *prot, int alloc_slab) kmem_cache_create(timewait_sock_slab_name, prot->twsk_prot->twsk_obj_size, 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab_name; } diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 01030f34617..7ac775f9a64 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -481,14 +481,14 @@ int __init dccp_ackvec_init(void) { dccp_ackvec_slab = kmem_cache_create("dccp_ackvec", sizeof(struct dccp_ackvec), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (dccp_ackvec_slab == NULL) goto out_err; dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", sizeof(struct dccp_ackvec_record), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN, NULL); if (dccp_ackvec_record_slab == NULL) goto out_destroy_slab; diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index d8cf92f09e6..ccbf72c793b 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -69,7 +69,7 @@ static struct kmem_cache *ccid_kmem_cache_create(int obj_size, const char *fmt,. if (slab_name == NULL) return NULL; slab = kmem_cache_create(slab_name, sizeof(struct ccid) + obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (slab == NULL) kfree(slab_name); return slab; diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index dd0fc992b04..174d3f13d93 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -282,7 +282,7 @@ static __init int dccp_li_init(void) { dccp_li_cachep = kmem_cache_create("dccp_li_hist", sizeof(struct dccp_li_hist_entry), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN, NULL); return dccp_li_cachep == NULL ? 
-ENOBUFS : 0; } diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 2e8ef42721e..34c4f604772 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -59,7 +59,7 @@ struct dccp_tx_hist *dccp_tx_hist_new(const char *name) hist->dccptxh_slab = kmem_cache_create(slab_name, sizeof(struct dccp_tx_hist_entry), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (hist->dccptxh_slab == NULL) goto out_free_slab_name; out: @@ -148,7 +148,7 @@ struct dccp_rx_hist *dccp_rx_hist_new(const char *name) hist->dccprxh_slab = kmem_cache_create(slab_name, sizeof(struct dccp_rx_hist_entry), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (hist->dccprxh_slab == NULL) goto out_free_slab_name; out: diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 6607b7b14f3..04b59ec4f51 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1003,7 +1003,7 @@ static int __init dccp_init(void) dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (!dccp_hashinfo.bind_bucket_cachep) goto out; diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 82622fb6f68..f2a61ef2af9 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1770,7 +1770,7 @@ void __init dn_route_init(void) dn_dst_ops.kmem_cachep = kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); init_timer(&dn_route_timer); dn_route_timer.function = dn_dst_check_expire; dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index d6615c9361e..fda0772fa21 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -881,7 +881,7 @@ void __init dn_fib_table_init(void) dn_hash_kmem = kmem_cache_create("dn_fib_info_cache", sizeof(struct dn_fib_info), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); } void __exit dn_fib_table_cleanup(void) diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 07e843a47dd..9ad1d9ff9ce 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -771,13 +771,13 @@ struct fib_table * __init fib_hash_init(u32 id) fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (fn_alias_kmem == NULL) fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), GFP_KERNEL); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 30e332ade61..9ca786a6fd3 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1970,7 +1970,7 @@ struct fib_table * __init fib_hash_init(u32 id) fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie), GFP_KERNEL); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 2f44e612806..6cbce96a54c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -123,7 +123,7 @@ void __init inet_initpeers(void) peer_cachep = kmem_cache_create("inet_peer_cache", sizeof(struct inet_peer), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL, NULL); + NULL); /* All the timers, started at system startup tend to synchronize. Perturb it a bit. 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d96582acdf6..7003cc1b7fe 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1917,7 +1917,7 @@ void __init ip_mr_init(void) mrt_cachep = kmem_cache_create("ip_mrt_cache", sizeof(struct mfc_cache), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL, NULL); + NULL); init_timer(&ipmr_expire_timer); ipmr_expire_timer.function=ipmr_expire_process; register_netdevice_notifier(&ip_mr_notifier); diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 3b446b1a6b9..d612a6a5d95 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -901,7 +901,7 @@ int ip_vs_conn_init(void) /* Allocate ip_vs_conn slab cache */ ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (!ip_vs_conn_cachep) { vfree(ip_vs_conn_tab); return -ENOMEM; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 88fa648d7ba..df42b7fb326 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2967,7 +2967,7 @@ int __init ip_rt_init(void) ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 987b94403be..da4c0b6ab79 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2430,7 +2430,7 @@ void __init tcp_init(void) tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); /* Size and allocate the main established and bind bucket * hash tables. diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 662a7d9681f..6a612a701ea 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1474,7 +1474,7 @@ void __init fib6_init(void) fib6_node_kmem = kmem_cache_create("fib6_nodes", sizeof(struct fib6_node), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL, NULL); + NULL); fib6_tables_init(); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fe8d9837f9f..919de682b33 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2555,7 +2555,7 @@ void __init ip6_route_init(void) #endif ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep; fib6_init(); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 6f87dd568de..30f3236c402 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -84,7 +84,7 @@ static int xfrm6_tunnel_spi_init(void) xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi", sizeof(struct xfrm6_tunnel_spi), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (!xfrm6_tunnel_spi_kmem) return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 8cce814f6be..aa086c83af8 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1108,7 +1108,7 @@ int __init nf_conntrack_init(void) nf_conntrack_cachep = kmem_cache_create("nf_conntrack", sizeof(struct nf_conn), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!nf_conntrack_cachep) { printk(KERN_ERR "Unable to create nf_conn slab cache\n"); goto err_free_hash; diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 2191fe008f6..1aa6229ca99 
100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -540,7 +540,7 @@ int __init nf_conntrack_expect_init(void) nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", sizeof(struct nf_conntrack_expect), - 0, 0, NULL, NULL); + 0, 0, NULL); if (!nf_ct_expect_cachep) goto err2; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index d6b3d01975b..bd45f9d3f7d 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -738,7 +738,7 @@ static int __init xt_hashlimit_init(void) err = -ENOMEM; hashlimit_cachep = kmem_cache_create("xt_hashlimit", sizeof(struct dsthash_ent), 0, 0, - NULL, NULL); + NULL); if (!hashlimit_cachep) { printk(KERN_ERR "xt_hashlimit: unable to create slab cache\n"); goto err2; diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 46f6d572ad2..16a68df4e36 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -792,7 +792,7 @@ static int __init af_rxrpc_init(void) ret = -ENOMEM; rxrpc_call_jar = kmem_cache_create( "rxrpc_call_jar", sizeof(struct rxrpc_call), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (!rxrpc_call_jar) { printk(KERN_NOTICE "RxRPC: Failed to allocate call jar\n"); goto error_call_jar; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 34bab36637a..e98579b788b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -980,14 +980,14 @@ SCTP_STATIC __init int sctp_init(void) sctp_bucket_cachep = kmem_cache_create("sctp_bind_bucket", sizeof(struct sctp_bind_bucket), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (!sctp_bucket_cachep) goto out; sctp_chunk_cachep = kmem_cache_create("sctp_chunk", sizeof(struct sctp_chunk), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (!sctp_chunk_cachep) goto err_chunk_cachep; diff --git a/net/socket.c b/net/socket.c index b7111425004..ec077037f53 100644 --- a/net/socket.c +++ b/net/socket.c @@ -272,8 +272,7 @@ static int init_inodecache(void) (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD), - init_once, - NULL); + init_once); if (sock_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 5b2b6fb244f..650af064ff8 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -867,7 +867,7 @@ int register_rpc_pipefs(void) sizeof(struct rpc_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), - init_once, NULL); + init_once); if (!rpc_inode_cachep) return -ENOMEM; err = register_filesystem(&rpc_pipe_fs_type); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 2ac43c41c3a..b5723c262a3 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1031,13 +1031,13 @@ rpc_init_mempool(void) rpc_task_slabp = kmem_cache_create("rpc_tasks", sizeof(struct rpc_task), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (!rpc_task_slabp) goto err_nomem; rpc_buffer_slabp = kmem_cache_create("rpc_buffers", RPC_BUFFER_MAXSIZE, 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (!rpc_buffer_slabp) goto err_nomem; rpc_task_mempool = mempool_create_slab_pool(RPC_TASK_POOLSIZE, diff --git a/net/tipc/handler.c b/net/tipc/handler.c index e1dcf663f8a..0c70010a7df 100644 --- a/net/tipc/handler.c +++ b/net/tipc/handler.c @@ -97,7 +97,7 @@ int tipc_handler_start(void) { tipc_queue_item_cache = kmem_cache_create("tipc_queue_items", sizeof(struct queue_item), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN, NULL); if (!tipc_queue_item_cache) return -ENOMEM; diff --git a/net/xfrm/xfrm_input.c 
b/net/xfrm/xfrm_input.c index 5c4695840c5..113f4442998 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -83,5 +83,5 @@ void __init xfrm_input_init(void) secpath_cachep = kmem_cache_create("secpath_cache", sizeof(struct sec_path), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL, NULL); + NULL); } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index cfaf17c8851..c3a4b0a1868 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2378,7 +2378,7 @@ static void __init xfrm_policy_init(void) xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache", sizeof(struct xfrm_dst), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL, NULL); + NULL); hmask = 8 - 1; sz = (hmask+1) * sizeof(struct hlist_head); diff --git a/security/keys/key.c b/security/keys/key.c index 700400d801d..01bbc6d9d19 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -1001,7 +1001,7 @@ void __init key_init(void) { /* allocate a slab in which we can store keys */ key_jar = kmem_cache_create("key_jar", sizeof(struct key), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); /* add the special key types */ list_add_tail(&key_type_keyring.link, &key_types_list); diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 78c408fd2b0..ecd06738453 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -239,7 +239,7 @@ void __init avc_init(void) atomic_set(&avc_cache.lru_hint, 0); avc_node_cachep = kmem_cache_create("avc_node", sizeof(struct avc_node), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); audit_log(current->audit_context, GFP_KERNEL, AUDIT_KERNEL, "AVC INITIALIZED\n"); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 26356e67108..0fac6829c63 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4913,7 +4913,7 @@ static __init int selinux_init(void) sel_inode_cache = kmem_cache_create("selinux_inode_security", sizeof(struct inode_security_struct), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); avc_init(); original_ops = secondary_ops = security_ops; diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c index 3122908afdc..85705eb289e 100644 --- a/security/selinux/ss/avtab.c +++ b/security/selinux/ss/avtab.c @@ -445,7 +445,7 @@ void avtab_cache_init(void) { avtab_node_cachep = kmem_cache_create("avtab_node", sizeof(struct avtab_node), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); } void avtab_cache_destroy(void) -- cgit v1.2.3-70-g09d2 From 1f564ad6d4182859612cbae452122e5eb2d62a76 Mon Sep 17 00:00:00 2001 From: Bob Picco Date: Wed, 18 Jul 2007 15:51:28 -0700 Subject: [IA64] remove time interpolator Remove time_interpolator code (This is generic code, but only user was ia64. It has been superseded by the CONFIG_GENERIC_TIME code). 
Signed-off-by: Bob Picco Signed-off-by: John Stultz Signed-off-by: Peter Keilty Signed-off-by: Tony Luck --- Documentation/time_interpolators.txt | 41 -------- include/linux/timex.h | 60 ----------- kernel/time.c | 88 ---------------- kernel/time/ntp.c | 10 -- kernel/time/timekeeping.c | 4 - kernel/timer.c | 188 ----------------------------------- 6 files changed, 391 deletions(-) delete mode 100644 Documentation/time_interpolators.txt (limited to 'kernel') diff --git a/Documentation/time_interpolators.txt b/Documentation/time_interpolators.txt deleted file mode 100644 index e3b60854fbc..00000000000 --- a/Documentation/time_interpolators.txt +++ /dev/null @@ -1,41 +0,0 @@ -Time Interpolators ------------------- - -Time interpolators are a base of time calculation between timer ticks and -allow an accurate determination of time down to the accuracy of the time -source in nanoseconds. - -The architecture specific code typically provides gettimeofday and -settimeofday under Linux. The time interpolator provides both if an arch -defines CONFIG_TIME_INTERPOLATION. The arch still must set up timer tick -operations and call the necessary functions to advance the clock. - -With the time interpolator a standardized interface exists for time -interpolation between ticks. The provided logic is highly scalable -and has been tested in SMP situations of up to 512 CPUs. - -If CONFIG_TIME_INTERPOLATION is defined then the architecture specific code -(or the device drivers - like HPET) may register time interpolators. -These are typically defined in the following way: - -static struct time_interpolator my_interpolator { - .frequency = MY_FREQUENCY, - .source = TIME_SOURCE_MMIO32, - .shift = 8, /* scaling for higher accuracy */ - .drift = -1, /* Unknown drift */ - .jitter = 0 /* time source is stable */ -}; - -void time_init(void) -{ - .... - /* Initialization of the timer *. - my_interpolator.address = &my_timer; - register_time_interpolator(&my_interpolator); - .... -} - -For more details see include/linux/timex.h and kernel/timer.c. - -Christoph Lameter , October 31, 2004 - diff --git a/include/linux/timex.h b/include/linux/timex.h index da929dbbea2..37ac3ff90fa 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -224,66 +224,6 @@ static inline int ntp_synced(void) __x < 0 ? -(-__x >> __s) : __x >> __s; \ }) - -#ifdef CONFIG_TIME_INTERPOLATION - -#define TIME_SOURCE_CPU 0 -#define TIME_SOURCE_MMIO64 1 -#define TIME_SOURCE_MMIO32 2 -#define TIME_SOURCE_FUNCTION 3 - -/* For proper operations time_interpolator clocks must run slightly slower - * than the standard clock since the interpolator may only correct by having - * time jump forward during a tick. A slower clock is usually a side effect - * of the integer divide of the nanoseconds in a second by the frequency. - * The accuracy of the division can be increased by specifying a shift. - * However, this may cause the clock not to be slow enough. - * The interpolator will self-tune the clock by slowing down if no - * resets occur or speeding up if the time jumps per analysis cycle - * become too high. - * - * Setting jitter compensates for a fluctuating timesource by comparing - * to the last value read from the timesource to insure that an earlier value - * is not returned by a later call. The price to pay - * for the compensation is that the timer routines are not as scalable anymore. - */ - -struct time_interpolator { - u16 source; /* time source flags */ - u8 shift; /* increases accuracy of multiply by shifting. 
*/ - /* Note that bits may be lost if shift is set too high */ - u8 jitter; /* if set compensate for fluctuations */ - u32 nsec_per_cyc; /* set by register_time_interpolator() */ - void *addr; /* address of counter or function */ - cycles_t mask; /* mask the valid bits of the counter */ - unsigned long offset; /* nsec offset at last update of interpolator */ - u64 last_counter; /* counter value in units of the counter at last update */ - cycles_t last_cycle; /* Last timer value if TIME_SOURCE_JITTER is set */ - u64 frequency; /* frequency in counts/second */ - long drift; /* drift in parts-per-million (or -1) */ - unsigned long skips; /* skips forward */ - unsigned long ns_skipped; /* nanoseconds skipped */ - struct time_interpolator *next; -}; - -extern void register_time_interpolator(struct time_interpolator *); -extern void unregister_time_interpolator(struct time_interpolator *); -extern void time_interpolator_reset(void); -extern unsigned long time_interpolator_get_offset(void); -extern void time_interpolator_update(long delta_nsec); - -#else /* !CONFIG_TIME_INTERPOLATION */ - -static inline void time_interpolator_reset(void) -{ -} - -static inline void time_interpolator_update(long delta_nsec) -{ -} - -#endif /* !CONFIG_TIME_INTERPOLATION */ - #define TICK_LENGTH_SHIFT 32 #ifdef CONFIG_NO_HZ diff --git a/kernel/time.c b/kernel/time.c index ffe19149d77..e325597f5bf 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -136,7 +136,6 @@ static inline void warp_clock(void) write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; - time_interpolator_reset(); write_sequnlock_irq(&xtime_lock); clock_was_set(); } @@ -309,92 +308,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) } EXPORT_SYMBOL(timespec_trunc); -#ifdef CONFIG_TIME_INTERPOLATION -void getnstimeofday (struct timespec *tv) -{ - unsigned long seq,sec,nsec; - - do { - seq = read_seqbegin(&xtime_lock); - sec = xtime.tv_sec; - nsec = xtime.tv_nsec+time_interpolator_get_offset(); - } while (unlikely(read_seqretry(&xtime_lock, seq))); - - while (unlikely(nsec >= NSEC_PER_SEC)) { - nsec -= NSEC_PER_SEC; - ++sec; - } - tv->tv_sec = sec; - tv->tv_nsec = nsec; -} -EXPORT_SYMBOL_GPL(getnstimeofday); - -int do_settimeofday (struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - { - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; - time_interpolator_reset(); - } - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} -EXPORT_SYMBOL(do_settimeofday); - -void do_gettimeofday (struct timeval *tv) -{ - unsigned long seq, nsec, usec, sec, offset; - do { - seq = read_seqbegin(&xtime_lock); - offset = time_interpolator_get_offset(); - sec = xtime.tv_sec; - nsec = xtime.tv_nsec; - } while (unlikely(read_seqretry(&xtime_lock, seq))); - - usec = (nsec + offset) / 1000; - - while (unlikely(usec >= USEC_PER_SEC)) { - usec -= USEC_PER_SEC; - ++sec; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; - - /* - * Make sure xtime.tv_sec [returned by sys_time()] always - * follows 
the gettimeofday() result precisely. This - * condition is extremely unlikely, it can hit at most - * once per second: - */ - if (unlikely(xtime.tv_sec != tv->tv_sec)) { - unsigned long flags; - - write_seqlock_irqsave(&xtime_lock, flags); - update_wall_time(); - write_sequnlock_irqrestore(&xtime_lock, flags); - } -} -EXPORT_SYMBOL(do_gettimeofday); - -#else /* CONFIG_TIME_INTERPOLATION */ - #ifndef CONFIG_GENERIC_TIME /* * Simulate gettimeofday using do_gettimeofday which only allows a timeval @@ -410,7 +323,6 @@ void getnstimeofday(struct timespec *tv) } EXPORT_SYMBOL_GPL(getnstimeofday); #endif -#endif /* CONFIG_TIME_INTERPOLATION */ /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 438c6b723ee..b5e352597cb 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -116,11 +116,6 @@ void second_overflow(void) if (xtime.tv_sec % 86400 == 0) { xtime.tv_sec--; wall_to_monotonic.tv_sec++; - /* - * The timer interpolator will make time change - * gradually instead of an immediate jump by one second - */ - time_interpolator_update(-NSEC_PER_SEC); time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second " "23:59:60 UTC\n"); @@ -130,11 +125,6 @@ void second_overflow(void) if ((xtime.tv_sec + 1) % 86400 == 0) { xtime.tv_sec++; wall_to_monotonic.tv_sec--; - /* - * Use of time interpolator for a gradual change of - * time - */ - time_interpolator_update(NSEC_PER_SEC); time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second " "23:59:59 UTC\n"); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 728cedfd3cb..027d46c906e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -466,10 +466,6 @@ void update_wall_time(void) second_overflow(); } - /* interpolator bits */ - time_interpolator_update(clock->xtime_interval - >> clock->shift); - /* accumulate error between NTP and clock interval */ clock->error += current_tick_length(); clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); diff --git a/kernel/timer.c b/kernel/timer.c index b7792fb0338..dbc03ab14ee 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1349,194 +1349,6 @@ void __init init_timers(void) open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); } -#ifdef CONFIG_TIME_INTERPOLATION - -struct time_interpolator *time_interpolator __read_mostly; -static struct time_interpolator *time_interpolator_list __read_mostly; -static DEFINE_SPINLOCK(time_interpolator_lock); - -static inline cycles_t time_interpolator_get_cycles(unsigned int src) -{ - unsigned long (*x)(void); - - switch (src) - { - case TIME_SOURCE_FUNCTION: - x = time_interpolator->addr; - return x(); - - case TIME_SOURCE_MMIO64 : - return readq_relaxed((void __iomem *)time_interpolator->addr); - - case TIME_SOURCE_MMIO32 : - return readl_relaxed((void __iomem *)time_interpolator->addr); - - default: return get_cycles(); - } -} - -static inline u64 time_interpolator_get_counter(int writelock) -{ - unsigned int src = time_interpolator->source; - - if (time_interpolator->jitter) - { - cycles_t lcycle; - cycles_t now; - - do { - lcycle = time_interpolator->last_cycle; - now = time_interpolator_get_cycles(src); - if (lcycle && time_after(lcycle, now)) - return lcycle; - - /* When holding the xtime write lock, there's no need - * to add the overhead of the cmpxchg. Readers are - * force to retry until the write lock is released. 
- */ - if (writelock) { - time_interpolator->last_cycle = now; - return now; - } - /* Keep track of the last timer value returned. The use of cmpxchg here - * will cause contention in an SMP environment. - */ - } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle)); - return now; - } - else - return time_interpolator_get_cycles(src); -} - -void time_interpolator_reset(void) -{ - time_interpolator->offset = 0; - time_interpolator->last_counter = time_interpolator_get_counter(1); -} - -#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) - -unsigned long time_interpolator_get_offset(void) -{ - /* If we do not have a time interpolator set up then just return zero */ - if (!time_interpolator) - return 0; - - return time_interpolator->offset + - GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); -} - -#define INTERPOLATOR_ADJUST 65536 -#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST - -void time_interpolator_update(long delta_nsec) -{ - u64 counter; - unsigned long offset; - - /* If there is no time interpolator set up then do nothing */ - if (!time_interpolator) - return; - - /* - * The interpolator compensates for late ticks by accumulating the late - * time in time_interpolator->offset. A tick earlier than expected will - * lead to a reset of the offset and a corresponding jump of the clock - * forward. Again this only works if the interpolator clock is running - * slightly slower than the regular clock and the tuning logic insures - * that. - */ - - counter = time_interpolator_get_counter(1); - offset = time_interpolator->offset + - GET_TI_NSECS(counter, time_interpolator); - - if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) - time_interpolator->offset = offset - delta_nsec; - else { - time_interpolator->skips++; - time_interpolator->ns_skipped += delta_nsec - offset; - time_interpolator->offset = 0; - } - time_interpolator->last_counter = counter; - - /* Tuning logic for time interpolator invoked every minute or so. - * Decrease interpolator clock speed if no skips occurred and an offset is carried. - * Increase interpolator clock speed if we skip too much time. 
- */ - if (jiffies % INTERPOLATOR_ADJUST == 0) - { - if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec) - time_interpolator->nsec_per_cyc--; - if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) - time_interpolator->nsec_per_cyc++; - time_interpolator->skips = 0; - time_interpolator->ns_skipped = 0; - } -} - -static inline int -is_better_time_interpolator(struct time_interpolator *new) -{ - if (!time_interpolator) - return 1; - return new->frequency > 2*time_interpolator->frequency || - (unsigned long)new->drift < (unsigned long)time_interpolator->drift; -} - -void -register_time_interpolator(struct time_interpolator *ti) -{ - unsigned long flags; - - /* Sanity check */ - BUG_ON(ti->frequency == 0 || ti->mask == 0); - - ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; - spin_lock(&time_interpolator_lock); - write_seqlock_irqsave(&xtime_lock, flags); - if (is_better_time_interpolator(ti)) { - time_interpolator = ti; - time_interpolator_reset(); - } - write_sequnlock_irqrestore(&xtime_lock, flags); - - ti->next = time_interpolator_list; - time_interpolator_list = ti; - spin_unlock(&time_interpolator_lock); -} - -void -unregister_time_interpolator(struct time_interpolator *ti) -{ - struct time_interpolator *curr, **prev; - unsigned long flags; - - spin_lock(&time_interpolator_lock); - prev = &time_interpolator_list; - for (curr = *prev; curr; curr = curr->next) { - if (curr == ti) { - *prev = curr->next; - break; - } - prev = &curr->next; - } - - write_seqlock_irqsave(&xtime_lock, flags); - if (ti == time_interpolator) { - /* we lost the best time-interpolator: */ - time_interpolator = NULL; - /* find the next-best interpolator */ - for (curr = time_interpolator_list; curr; curr = curr->next) - if (is_better_time_interpolator(curr)) - time_interpolator = curr; - time_interpolator_reset(); - } - write_sequnlock_irqrestore(&xtime_lock, flags); - spin_unlock(&time_interpolator_lock); -} -#endif /* CONFIG_TIME_INTERPOLATION */ - /** * msleep - sleep safely even with waitqueue interruptions * @msecs: Time in milliseconds to sleep for -- cgit v1.2.3-70-g09d2 From 0b1937ac0ef1541c0ea44e6f81c33d2f59803957 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Jul 2007 17:02:04 +0100 Subject: FRV: Fix linkage problems Make it possible to use __start_notes and __stop_notes without getting a GPREL overflow error from the FRV linker. Small variables that would otherwise be in .data or .bss may, depending on the arch, be placed in special sections (.sdata or .sbss) that permit single instruction references on fixed instruction width machines. __start_notes and __stop_notes aren't really char variables, and certainly don't refer to data in .data or .bss. Making them type "void" fools the compiler into not assuming anything about them. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 2565e1b6dd7..d0e5c48e18c 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -65,8 +65,8 @@ KERNEL_ATTR_RO(kexec_crash_loaded); /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
*/ -extern const char __start_notes __attribute__((weak)); -extern const char __stop_notes __attribute__((weak)); +extern const void __start_notes __attribute__((weak)); +extern const void __stop_notes __attribute__((weak)); #define notes_size (&__stop_notes - &__start_notes) static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, -- cgit v1.2.3-70-g09d2 From 2008220879af095d00ca27eb168a55c8595fbc0b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 20 Jul 2007 13:28:54 -0700 Subject: Revert "sys_time() speedup" This basically reverts commit 4e44f3497d41db4c3b9051c61410dee8ae4fb49c, while waiting for it to be re-done more completely. There are cases of people mixing "time()" with higher-resolution time sources, and we need to take the nanosecond offsets into account. Ingo has a patch that does that, but it's still under some discussion. In the meantime, just revert back to the old simple situation of just doing the whole exact timesource calculations. But rather than using do_gettimeofday(), use the internal nanosecond resolution getnstimeofday(), which at least avoids one unnecessary conversion (since we really don't care about whether the fractional seconds are nanoseconds or microseconds - we'll just throw them away). Signed-off-by: Linus Torvalds --- kernel/time.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index e325597f5bf..5b81da08bbd 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -57,17 +57,14 @@ EXPORT_SYMBOL(sys_tz); */ asmlinkage long sys_time(time_t __user * tloc) { - /* - * We read xtime.tv_sec atomically - it's updated - * atomically by update_wall_time(), so no need to - * even read-lock the xtime seqlock: - */ - time_t i = xtime.tv_sec; + time_t i; + struct timespec tv; - smp_rmb(); /* sys_time() results are coherent */ + getnstimeofday(&tv); + i = tv.tv_sec; if (tloc) { - if (put_user(i, tloc)) + if (put_user(i,tloc)) i = -EFAULT; } return i; -- cgit v1.2.3-70-g09d2 From 18de5bc4c1f1f1fa5e14f354a7603bd6e9d4e3b6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 21 Jul 2007 04:37:34 -0700 Subject: clockevents: fix resume logic We need to make sure, that the clockevent devices are resumed, before the tick is resumed. The current resume logic does not guarantee this. Add CLOCK_EVT_MODE_RESUME and call the set mode functions of the clock event devices before resuming the tick / oneshot functionality. Fixup the existing users. Thanks to Nigel Cunningham for tracking down a long standing thinko, which affected the jinxed VAIO. 
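As an illustrative sketch (not part of this patch; the foo_* names are made up), the pattern the per-driver fixups below follow is simply an explicit RESUME case in every set_mode handler, so the core can call clockevents_set_mode(dev, CLOCK_EVT_MODE_RESUME) before the periodic/oneshot tick is restarted:

    static void foo_timer_set_mode(enum clock_event_mode mode,
                                   struct clock_event_device *evt)
    {
            switch (mode) {
            case CLOCK_EVT_MODE_PERIODIC:
            case CLOCK_EVT_MODE_ONESHOT:
                    /* program the hardware as before */
                    break;
            case CLOCK_EVT_MODE_UNUSED:
            case CLOCK_EVT_MODE_SHUTDOWN:
                    /* stop the timer as before */
                    break;
            case CLOCK_EVT_MODE_RESUME:
                    /* new: re-enable the hardware after suspend; often a no-op */
                    break;
            }
    }
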
[akpm@linux-foundation.org: xen build fix] Signed-off-by: Thomas Gleixner Cc: john stultz Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-davinci/time.c | 2 ++ arch/arm/mach-imx/time.c | 1 + arch/arm/mach-ixp4xx/common.c | 2 ++ arch/arm/mach-omap1/time.c | 1 + arch/arm/plat-omap/timer32k.c | 2 ++ arch/i386/kernel/apic.c | 3 ++ arch/i386/kernel/hpet.c | 71 +++------------------------------------ arch/i386/kernel/i8253.c | 26 +++++++------- arch/i386/kernel/vmiclock.c | 1 + arch/i386/xen/time.c | 3 ++ arch/sh/kernel/timers/timer-tmu.c | 1 + arch/sparc64/kernel/time.c | 1 + drivers/lguest/lguest.c | 2 ++ include/linux/clockchips.h | 1 + kernel/time/tick-broadcast.c | 6 ++-- kernel/time/tick-common.c | 16 +++++---- 16 files changed, 51 insertions(+), 88 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mach-davinci/time.c b/arch/arm/mach-davinci/time.c index 4d8425de692..e96a3dcdc1a 100644 --- a/arch/arm/mach-davinci/time.c +++ b/arch/arm/mach-davinci/time.c @@ -285,6 +285,8 @@ static void davinci_set_mode(enum clock_event_mode mode, case CLOCK_EVT_MODE_SHUTDOWN: t->opts = TIMER_OPTS_DISABLED; break; + case CLOCK_EVT_MODE_RESUME: + break; } } diff --git a/arch/arm/mach-imx/time.c b/arch/arm/mach-imx/time.c index 010f6fa984a..d86d124aea2 100644 --- a/arch/arm/mach-imx/time.c +++ b/arch/arm/mach-imx/time.c @@ -159,6 +159,7 @@ static void imx_set_mode(enum clock_event_mode mode, struct clock_event_device * break; case CLOCK_EVT_MODE_SHUTDOWN: case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_RESUME: /* Left event sources disabled, no more interrupts appears */ break; } diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c index 8112f726ffa..23e7fba6d3e 100644 --- a/arch/arm/mach-ixp4xx/common.c +++ b/arch/arm/mach-ixp4xx/common.c @@ -459,6 +459,8 @@ static void ixp4xx_set_mode(enum clock_event_mode mode, default: osrt = opts = 0; break; + case CLOCK_EVT_MODE_RESUME: + break; } *IXP4XX_OSRT1 = osrt | opts; diff --git a/arch/arm/mach-omap1/time.c b/arch/arm/mach-omap1/time.c index 3705d20c4e5..237651ebae5 100644 --- a/arch/arm/mach-omap1/time.c +++ b/arch/arm/mach-omap1/time.c @@ -156,6 +156,7 @@ static void omap_mpu_set_mode(enum clock_event_mode mode, break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_RESUME: break; } } diff --git a/arch/arm/plat-omap/timer32k.c b/arch/arm/plat-omap/timer32k.c index 2feceec8ecc..b0af014b0e2 100644 --- a/arch/arm/plat-omap/timer32k.c +++ b/arch/arm/plat-omap/timer32k.c @@ -156,6 +156,8 @@ static void omap_32k_timer_set_mode(enum clock_event_mode mode, case CLOCK_EVT_MODE_SHUTDOWN: omap_32k_timer_stop(); break; + case CLOCK_EVT_MODE_RESUME: + break; } } diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 67824f3bb97..610f44b2436 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -263,6 +263,9 @@ static void lapic_timer_setup(enum clock_event_mode mode, v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write_around(APIC_LVTT, v); break; + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; } local_irq_restore(flags); diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c index 17d73459fc5..cfbf792a0bf 100644 --- a/arch/i386/kernel/hpet.c +++ b/arch/i386/kernel/hpet.c @@ -187,6 +187,10 @@ static void hpet_set_mode(enum clock_event_mode mode, cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_T0_CFG); break; + + case CLOCK_EVT_MODE_RESUME: + hpet_enable_int(); + break; } } @@ -217,6 +221,7 @@ static struct 
clocksource clocksource_hpet = { .mask = HPET_MASK, .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = hpet_start_counter, }; /* @@ -291,7 +296,6 @@ int __init hpet_enable(void) clocksource_register(&clocksource_hpet); - if (id & HPET_ID_LEGSUP) { hpet_enable_int(); hpet_reserve_platform_timers(id); @@ -524,68 +528,3 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } #endif - - -/* - * Suspend/resume part - */ - -#ifdef CONFIG_PM - -static int hpet_suspend(struct sys_device *sys_device, pm_message_t state) -{ - unsigned long cfg = hpet_readl(HPET_CFG); - - cfg &= ~(HPET_CFG_ENABLE|HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static int hpet_resume(struct sys_device *sys_device) -{ - unsigned int id; - - hpet_start_counter(); - - id = hpet_readl(HPET_ID); - - if (id & HPET_ID_LEGSUP) - hpet_enable_int(); - - return 0; -} - -static struct sysdev_class hpet_class = { - set_kset_name("hpet"), - .suspend = hpet_suspend, - .resume = hpet_resume, -}; - -static struct sys_device hpet_device = { - .id = 0, - .cls = &hpet_class, -}; - - -static __init int hpet_register_sysfs(void) -{ - int err; - - if (!is_hpet_capable()) - return 0; - - err = sysdev_class_register(&hpet_class); - - if (!err) { - err = sysdev_register(&hpet_device); - if (err) - sysdev_class_unregister(&hpet_class); - } - - return err; -} - -device_initcall(hpet_register_sysfs); - -#endif diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c index f8a3c4054c7..931eabe1e56 100644 --- a/arch/i386/kernel/i8253.c +++ b/arch/i386/kernel/i8253.c @@ -3,11 +3,11 @@ * */ #include -#include +#include +#include #include -#include #include -#include +#include #include #include @@ -41,26 +41,24 @@ static void init_pit_timer(enum clock_event_mode mode, case CLOCK_EVT_MODE_PERIODIC: /* binary, mode 2, LSB/MSB, ch 0 */ outb_p(0x34, PIT_MODE); - udelay(10); outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); outb(LATCH >> 8 , PIT_CH0); /* MSB */ break; - /* - * Avoid unnecessary state transitions, as it confuses - * Geode / Cyrix based boxen. 
- */ case CLOCK_EVT_MODE_SHUTDOWN: - if (evt->mode == CLOCK_EVT_MODE_UNUSED) - break; case CLOCK_EVT_MODE_UNUSED: - if (evt->mode == CLOCK_EVT_MODE_SHUTDOWN) - break; + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); /* LSB */ + outb_p(0, PIT_CH0); /* MSB */ + break; + case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ outb_p(0x38, PIT_MODE); - udelay(10); + break; + + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ break; } spin_unlock_irqrestore(&i8253_lock, flags); diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c index f9b845f4e69..d23077cca45 100644 --- a/arch/i386/kernel/vmiclock.c +++ b/arch/i386/kernel/vmiclock.c @@ -142,6 +142,7 @@ static void vmi_timer_set_mode(enum clock_event_mode mode, switch (mode) { case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: break; case CLOCK_EVT_MODE_PERIODIC: cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c index 51fdabf1fd4..dfd6db69ead 100644 --- a/arch/i386/xen/time.c +++ b/arch/i386/xen/time.c @@ -412,6 +412,7 @@ static void xen_timerop_set_mode(enum clock_event_mode mode, break; case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: break; case CLOCK_EVT_MODE_UNUSED: @@ -474,6 +475,8 @@ static void xen_vcpuop_set_mode(enum clock_event_mode mode, HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) BUG(); break; + case CLOCK_EVT_MODE_RESUME: + break; } } diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c index 097ebd49f1b..7aca37d7976 100644 --- a/arch/sh/kernel/timers/timer-tmu.c +++ b/arch/sh/kernel/timers/timer-tmu.c @@ -80,6 +80,7 @@ static void tmu_set_mode(enum clock_event_mode mode, break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_RESUME: break; } } diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c index e340eb401fb..87c10a7544d 100644 --- a/arch/sparc64/kernel/time.c +++ b/arch/sparc64/kernel/time.c @@ -931,6 +931,7 @@ static void sparc64_timer_setup(enum clock_event_mode mode, { switch (mode) { case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: break; case CLOCK_EVT_MODE_SHUTDOWN: diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c index 434fea1e82f..18dade06d4a 100644 --- a/drivers/lguest/lguest.c +++ b/drivers/lguest/lguest.c @@ -398,6 +398,8 @@ static void lguest_clockevent_set_mode(enum clock_event_mode mode, break; case CLOCK_EVT_MODE_PERIODIC: BUG(); + case CLOCK_EVT_MODE_RESUME: + break; } } diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 8d7a39019ac..e0bd46eb241 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -23,6 +23,7 @@ enum clock_event_mode { CLOCK_EVT_MODE_SHUTDOWN, CLOCK_EVT_MODE_PERIODIC, CLOCK_EVT_MODE_ONESHOT, + CLOCK_EVT_MODE_RESUME, }; /* Clock event notification values */ diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 8001d37071f..8339af229cb 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -49,7 +49,7 @@ cpumask_t *tick_get_broadcast_mask(void) */ static void tick_broadcast_start_periodic(struct clock_event_device *bc) { - if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (bc) tick_setup_periodic(bc, 1); } @@ -299,7 +299,7 @@ void tick_suspend_broadcast(void) spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - if (bc && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + if (bc) clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 
spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -316,6 +316,8 @@ int tick_resume_broadcast(void) bc = tick_broadcast_device.evtdev; if (bc) { + clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); + switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: if(!cpus_empty(tick_broadcast_mask)) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index a96ec9ab345..77a21abc871 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -318,12 +318,17 @@ static void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); unsigned long flags; + int broadcast = tick_resume_broadcast(); spin_lock_irqsave(&tick_device_lock, flags); - if (td->mode == TICKDEV_MODE_PERIODIC) - tick_setup_periodic(td->evtdev, 0); - else - tick_resume_oneshot(); + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); + + if (!broadcast) { + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(td->evtdev, 0); + else + tick_resume_oneshot(); + } spin_unlock_irqrestore(&tick_device_lock, flags); } @@ -360,8 +365,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, break; case CLOCK_EVT_NOTIFY_RESUME: - if (!tick_resume_broadcast()) - tick_resume(); + tick_resume(); break; default: -- cgit v1.2.3-70-g09d2 From 5590a536c0bc403fc73908c66c1c88cbed735ecb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 21 Jul 2007 04:37:35 -0700 Subject: clockevents: fix device replacement When a device is replaced by a better rated device, then the broadcast mode needs to be evaluated again. When the new device has no requirement for broadcasting, then the broadcast bits for the CPU must be cleared. Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/tick-broadcast.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 8339af229cb..db8e0f3d409 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -31,6 +31,12 @@ struct tick_device tick_broadcast_device; static cpumask_t tick_broadcast_mask; static DEFINE_SPINLOCK(tick_broadcast_lock); +#ifdef CONFIG_TICK_ONESHOT +static void tick_broadcast_clear_oneshot(int cpu); +#else +static inline void tick_broadcast_clear_oneshot(int cpu) { } +#endif + /* * Debugging: see timer_list.c */ @@ -99,8 +105,19 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) cpu_set(cpu, tick_broadcast_mask); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; - } + } else { + /* + * When the new device is not affected by the stop + * feature and the cpu is marked in the broadcast mask + * then clear the broadcast bit. 
+ */ + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { + int cpu = smp_processor_id(); + cpu_clear(cpu, tick_broadcast_mask); + tick_broadcast_clear_oneshot(cpu); + } + } spin_unlock_irqrestore(&tick_broadcast_lock, flags); return ret; } @@ -487,6 +504,16 @@ out: spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +/* + * Reset the one shot broadcast for a cpu + * + * Called with tick_broadcast_lock held + */ +static void tick_broadcast_clear_oneshot(int cpu) +{ + cpu_clear(cpu, tick_broadcast_oneshot_mask); +} + /** * tick_broadcast_setup_highres - setup the broadcast device for highres */ -- cgit v1.2.3-70-g09d2 From 3704540b48295253bd9c87a5e7ff545f9d47a3b8 Mon Sep 17 00:00:00 2001 From: john stultz Date: Sat, 21 Jul 2007 04:37:35 -0700 Subject: tick management: spread timer interrupt After discussing w/ Thomas over IRC, it seems the issue is the sched tick fires on every cpu at the same time, causing extra lock contention. This smaller change, adds an extra offset per cpu so the ticks don't line up. This patch also drops the idle latency from 40us down to under 20us. Signed-off-by: john stultz Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/tick-sched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 52db9e3c526..b416995b975 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -546,6 +546,7 @@ void tick_setup_sched_timer(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now = ktime_get(); + u64 offset; /* * Emulate tick processing via per-CPU hrtimers: @@ -554,8 +555,12 @@ void tick_setup_sched_timer(void) ts->sched_timer.function = tick_sched_timer; ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; - /* Get the next period */ + /* Get the next period (per cpu) */ ts->sched_timer.expires = tick_init_jiffy_update(); + offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, NR_CPUS); + offset *= smp_processor_id(); + ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); -- cgit v1.2.3-70-g09d2 From 820de5c39ef7f6866d2c9e6c7d208bcd2a6e1942 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 21 Jul 2007 04:37:36 -0700 Subject: highres: improve debug output Add some more debug information to the hrtimer and clock events code. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/apic.c | 3 +++ kernel/hrtimer.c | 5 ++++- kernel/time/tick-oneshot.c | 15 ++++++++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 610f44b2436..83988c3c8e2 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -524,6 +524,9 @@ void __init setup_boot_APIC_clock(void) */ if (nmi_watchdog != NMI_IO_APIC) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=1!\n"); } /* Setup the lapic or request the broadcast */ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 72d034258ba..065a8978662 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -558,7 +558,8 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, */ static int hrtimer_switch_to_hres(void) { - struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + int cpu = smp_processor_id(); + struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; if (base->hres_active) @@ -568,6 +569,8 @@ static int hrtimer_switch_to_hres(void) if (tick_init_highres()) { local_irq_restore(flags); + printk(KERN_WARNING "Could not switch to high resolution " + "mode on CPU %d\n", cpu); return 0; } base->hres_active = 1; diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index f6997ab0c3c..0258d3115d5 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -73,8 +73,21 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) struct clock_event_device *dev = td->evtdev; if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || - !tick_device_is_functional(dev)) + !tick_device_is_functional(dev)) { + + printk(KERN_INFO "Clockevents: " + "could not switch to one-shot mode:"); + if (!dev) { + printk(" no tick device\n"); + } else { + if (!tick_device_is_functional(dev)) + printk(" %s is not functional.\n", dev->name); + else + printk(" %s does not support one-shot mode.\n", + dev->name); + } return -EINVAL; + } td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; -- cgit v1.2.3-70-g09d2 From 99bc2fcb283852931fb6bbef40f3df8316b59000 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 21 Jul 2007 04:37:36 -0700 Subject: hrtimer: speedup hrtimer_enqueue Speedup hrtimer_enqueue by evaluating the rbtree insertion result. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 065a8978662..eb1ddebd2c0 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -686,6 +686,7 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; struct hrtimer *entry; + int leftmost = 1; /* * Find the right place in the rbtree: @@ -697,18 +698,19 @@ static void enqueue_hrtimer(struct hrtimer *timer, * We dont care about collisions. Nodes with * the same expiry time stay together. 
*/ - if (timer->expires.tv64 < entry->expires.tv64) + if (timer->expires.tv64 < entry->expires.tv64) { link = &(*link)->rb_left; - else + } else { link = &(*link)->rb_right; + leftmost = 0; + } } /* * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - if (!base->first || timer->expires.tv64 < - rb_entry(base->first, struct hrtimer, node)->expires.tv64) { + if (leftmost) { /* * Reprogram the clock event device. When the timer is already * expired hrtimer_enqueue_reprogram has either called the -- cgit v1.2.3-70-g09d2 From 82644459c592a28a3eab682f9b88d81019ddfe8b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 21 Jul 2007 04:37:37 -0700 Subject: NTP: move the cmos update code into ntp.c i386 and sparc64 have the identical code to update the cmos clock. Move it into kernel/time/ntp.c as there are other architectures coming along with the same requirements. [akpm@linux-foundation.org: build fixes] Signed-off-by: Thomas Gleixner Cc: Chris Wright Cc: Ingo Molnar Cc: john stultz Cc: David Miller Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 4 ++++ arch/i386/kernel/time.c | 50 ++------------------------------------- arch/sparc64/Kconfig | 4 ++++ arch/sparc64/kernel/time.c | 53 ++--------------------------------------- include/asm-i386/timer.h | 1 - include/linux/time.h | 3 +++ kernel/time/ntp.c | 59 +++++++++++++++++++++++++++++++++++++++++++--- 7 files changed, 71 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 7a11b905ef4..361aca8b3ec 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -18,6 +18,10 @@ config GENERIC_TIME bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config CLOCKSOURCE_WATCHDOG bool default y diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index a665df61f08..19a6c678d02 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -207,55 +207,9 @@ unsigned long read_persistent_clock(void) return retval; } -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); -int no_sync_cmos_clock; - -static void sync_cmos_clock(unsigned long dummy) -{ - struct timeval now, next; - int fail = 1; - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). 
- */ - return; - - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; - } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) +int update_persistent_clock(struct timespec now) { - if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); + return set_rtc_mmss(now.tv_sec); } extern void (*late_time_init)(void); diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index f1cc55677ff..33dabf588bd 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -23,6 +23,10 @@ config GENERIC_TIME bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config GENERIC_CLOCKEVENTS bool default y diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c index 87c10a7544d..49063ca2efc 100644 --- a/arch/sparc64/kernel/time.c +++ b/arch/sparc64/kernel/time.c @@ -403,58 +403,9 @@ static struct sparc64_tick_ops hbtick_operations __read_mostly = { static unsigned long timer_ticks_per_nsec_quotient __read_mostly; -#define TICK_SIZE (tick_nsec / 1000) - -#define USEC_AFTER 500000 -#define USEC_BEFORE 500000 - -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); - -static void sync_cmos_clock(unsigned long dummy) -{ - struct timeval now, next; - int fail = 1; - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). - */ - return; - - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; - } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) +int update_persistent_clock(struct timespec now) { - mod_timer(&sync_cmos_timer, jiffies + 1); + return set_rtc_mmss(now.tv_sec); } /* Kick start a stopped clock (procedure from the Sun NVRAM/hostid FAQ). 
*/ diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h index 51a713e33a9..b371667cfde 100644 --- a/include/asm-i386/timer.h +++ b/include/asm-i386/timer.h @@ -11,7 +11,6 @@ unsigned long native_calculate_cpu_khz(void); extern int timer_ack; extern int no_timer_check; -extern int no_sync_cmos_clock; extern int recalibrate_cpu_khz(void); #ifndef CONFIG_PARAVIRT diff --git a/include/linux/time.h b/include/linux/time.h index ec3b0ced0af..e6aea5146e5 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -4,6 +4,7 @@ #include #ifdef __KERNEL__ +# include # include #endif @@ -94,6 +95,8 @@ extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock __attribute__((weak)); extern unsigned long read_persistent_clock(void); +extern int update_persistent_clock(struct timespec now); +extern int no_sync_cmos_clock __read_mostly; void timekeeping_init(void); static inline unsigned long get_seconds(void) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b5e352597cb..cd91237dbfe 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -175,12 +176,64 @@ u64 current_tick_length(void) return tick_length; } +#ifdef CONFIG_GENERIC_CMOS_UPDATE -void __attribute__ ((weak)) notify_arch_cmos_timer(void) +/* Disable the cmos update - used by virtualization and embedded */ +int no_sync_cmos_clock __read_mostly; + +static void sync_cmos_clock(unsigned long dummy); + +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); + +static void sync_cmos_clock(unsigned long dummy) +{ + struct timespec now, next; + int fail = 1; + + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be + * called as close as possible to 500 ms before the new second starts. + * This code is run on a timer. If the clock is set, that timer + * may not expire at the correct time. Thus, we adjust... + */ + if (!ntp_synced()) + /* + * Not synced, exit, do not restart a timer (if one is + * running, let it run out). + */ + return; + + getnstimeofday(&now); + if (abs(xtime.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) + fail = update_persistent_clock(now); + + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; + if (next.tv_nsec <= 0) + next.tv_nsec += NSEC_PER_SEC; + + if (!fail) + next.tv_sec = 659; + else + next.tv_sec = 0; + + if (next.tv_nsec >= NSEC_PER_SEC) { + next.tv_sec++; + next.tv_nsec -= NSEC_PER_SEC; + } + mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); +} + +static void notify_cmos_timer(void) { - return; + if (no_sync_cmos_clock) + mod_timer(&sync_cmos_timer, jiffies + 1); } +#else +static inline void notify_cmos_timer(void) { } +#endif + /* adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ @@ -345,6 +398,6 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) txc->stbcnt = 0; write_sequnlock_irq(&xtime_lock); do_gettimeofday(&txc->time); - notify_arch_cmos_timer(); + notify_cmos_timer(); return(result); } -- cgit v1.2.3-70-g09d2 From 42ee2b74140b69fa24da1c671b03c9f8019e6f62 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 21 Jul 2007 17:09:54 +0200 Subject: x86_64: Report the pending irq if available in smp_affinity Otherwise smp_affinity would only update after the next interrupt on x86 systems. 
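In essence (a condensed, illustrative reading of the kernel/irq/proc.c hunk below, not additional code): when an affinity update is still queued for the next interrupt, the proc read reports the queued mask rather than the stale current one:

    struct irq_desc *desc = irq_desc + (long)data;
    cpumask_t *mask = &desc->affinity;

    #ifdef CONFIG_GENERIC_PENDING_IRQ
    if (desc->status & IRQ_MOVE_PENDING)    /* change not applied yet */
            mask = &desc->pending_mask;     /* show what was requested */
    #endif
    len = cpumask_scnprintf(page, count, *mask);
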
Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/irq/proc.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index b4f1674fca7..50b81b98046 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,7 +19,15 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity); + struct irq_desc *desc = irq_desc + (long)data; + cpumask_t *mask = &desc->affinity; + int len; + +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (desc->status & IRQ_MOVE_PENDING) + mask = &desc->pending_mask; +#endif + len = cpumask_scnprintf(page, count, *mask); if (count - len < 2) return -EINVAL; -- cgit v1.2.3-70-g09d2 From 44bf4cea43816d43deab73c1c16361e899996eaa Mon Sep 17 00:00:00 2001 From: Nigel Cunningham Date: Sat, 21 Jul 2007 17:10:41 +0200 Subject: x86: PM_TRACE support Signed-off-by: Nigel Cunningham Cc: Randy Dunlap Cc: "Rafael J. Wysocki" Cc: Pavel Machek Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/vmlinux.lds.S | 7 +++++++ drivers/base/power/trace.c | 5 ++++- include/asm-i386/resume-trace.h | 13 +++++++++++++ include/asm-x86_64/resume-trace.h | 13 +++++++++++++ include/linux/resume-trace.h | 19 +++++-------------- kernel/power/Kconfig | 2 +- 6 files changed, 43 insertions(+), 16 deletions(-) create mode 100644 include/asm-i386/resume-trace.h create mode 100644 include/asm-x86_64/resume-trace.h (limited to 'kernel') diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index c2d5a840cb1..8778848000c 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -54,6 +54,13 @@ SECTIONS RODATA + . = ALIGN(4); + .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { + __tracedata_start = .; + *(.tracedata) + __tracedata_end = .; + } + . 
= ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index a9ab30fefff..2b0c601e422 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -142,6 +142,7 @@ void set_trace_device(struct device *dev) { dev_hash_value = hash_string(DEVSEED, dev->bus_id, DEVHASH); } +EXPORT_SYMBOL(set_trace_device); /* * We could just take the "tracedata" index into the .tracedata @@ -162,6 +163,7 @@ void generate_resume_trace(void *tracedata, unsigned int user) file_hash_value = hash_string(lineno, file, FILEHASH); set_magic_time(user_hash_value, file_hash_value, dev_hash_value); } +EXPORT_SYMBOL(generate_resume_trace); extern char __tracedata_start, __tracedata_end; static int show_file_hash(unsigned int value) @@ -170,7 +172,8 @@ static int show_file_hash(unsigned int value) char *tracedata; match = 0; - for (tracedata = &__tracedata_start ; tracedata < &__tracedata_end ; tracedata += 6) { + for (tracedata = &__tracedata_start ; tracedata < &__tracedata_end ; + tracedata += 2 + sizeof(unsigned long)) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); unsigned int hash = hash_string(lineno, file, FILEHASH); diff --git a/include/asm-i386/resume-trace.h b/include/asm-i386/resume-trace.h new file mode 100644 index 00000000000..ec9cfd65623 --- /dev/null +++ b/include/asm-i386/resume-trace.h @@ -0,0 +1,13 @@ +#define TRACE_RESUME(user) do { \ + if (pm_trace_enabled) { \ + void *tracedata; \ + asm volatile("movl $1f,%0\n" \ + ".section .tracedata,\"a\"\n" \ + "1:\t.word %c1\n" \ + "\t.long %c2\n" \ + ".previous" \ + :"=r" (tracedata) \ + : "i" (__LINE__), "i" (__FILE__)); \ + generate_resume_trace(tracedata, user); \ + } \ +} while (0) diff --git a/include/asm-x86_64/resume-trace.h b/include/asm-x86_64/resume-trace.h new file mode 100644 index 00000000000..34bf998fdf6 --- /dev/null +++ b/include/asm-x86_64/resume-trace.h @@ -0,0 +1,13 @@ +#define TRACE_RESUME(user) do { \ + if (pm_trace_enabled) { \ + void *tracedata; \ + asm volatile("movq $1f,%0\n" \ + ".section .tracedata,\"a\"\n" \ + "1:\t.word %c1\n" \ + "\t.quad %c2\n" \ + ".previous" \ + :"=r" (tracedata) \ + : "i" (__LINE__), "i" (__FILE__)); \ + generate_resume_trace(tracedata, user); \ + } \ +} while (0) diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h index 81e9299ca14..f3f4f28c696 100644 --- a/include/linux/resume-trace.h +++ b/include/linux/resume-trace.h @@ -2,6 +2,7 @@ #define RESUME_TRACE_H #ifdef CONFIG_PM_TRACE +#include extern int pm_trace_enabled; @@ -9,20 +10,10 @@ struct device; extern void set_trace_device(struct device *); extern void generate_resume_trace(void *tracedata, unsigned int user); -#define TRACE_DEVICE(dev) set_trace_device(dev) -#define TRACE_RESUME(user) do { \ - if (pm_trace_enabled) { \ - void *tracedata; \ - asm volatile("movl $1f,%0\n" \ - ".section .tracedata,\"a\"\n" \ - "1:\t.word %c1\n" \ - "\t.long %c2\n" \ - ".previous" \ - :"=r" (tracedata) \ - : "i" (__LINE__), "i" (__FILE__)); \ - generate_resume_trace(tracedata, user); \ - } \ -} while (0) +#define TRACE_DEVICE(dev) do { \ + if (pm_trace_enabled) \ + set_trace_device(dev); \ + } while(0) #else diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 7358609e473..c1a106d87d9 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -57,7 +57,7 @@ config DISABLE_CONSOLE_SUSPEND config PM_TRACE bool "Suspend/resume event 
tracing" - depends on PM_DEBUG && X86_32 && EXPERIMENTAL + depends on PM_DEBUG && X86 && EXPERIMENTAL default n ---help--- This enables some cheesy code to save the last PM event point in the -- cgit v1.2.3-70-g09d2 From 5b9a4262232d632c28990fcdf4f36d0e0ade5f18 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Tue, 29 May 2007 10:38:18 -0400 Subject: [PATCH] Make IPC mode consistent The mode fields for IPC records are not consistent. Some are hex, others are octal. This patch makes them all octal. Signed-off-by: Steve Grubb Signed-off-by: Al Viro --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 145cbb79c4b..f5e917e60ac 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -995,7 +995,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - "ouid=%u ogid=%u mode=%x", + "ouid=%u ogid=%u mode=%#o", axi->uid, axi->gid, axi->mode); if (axi->osid != 0) { char *ctx = NULL; @@ -1014,7 +1014,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_IPC_SET_PERM: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - "qbytes=%lx ouid=%u ogid=%u mode=%x", + "qbytes=%lx ouid=%u ogid=%u mode=%#o", axi->qbytes, axi->uid, axi->gid, axi->mode); break; } -- cgit v1.2.3-70-g09d2 From c926e4f432af0f61ac2b9b637fb51a4871a3fc91 Mon Sep 17 00:00:00 2001 From: Klaus Weidner Date: Wed, 16 May 2007 17:45:42 -0500 Subject: [PATCH] audit: fix broken class-based syscall audit The sanity check in audit_match_class() is wrong. We are able to audit 2048 syscalls but in audit_match_class() we were accidentally using sizeof(_u32) instead of number of bits in _u32 when deciding how many syscalls were valid. On ia64 in particular we were hitting syscall numbers over the (wrong) limit of 256. Fixing the audit_match_class check takes care of the problem. Signed-off-by: Klaus Weidner Signed-off-by: Al Viro --- kernel/auditfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 1bf093dcffe..0ea96bab91c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -304,7 +304,7 @@ int __init audit_register_class(int class, unsigned *list) int audit_match_class(int class, unsigned syscall) { - if (unlikely(syscall >= AUDIT_BITMASK_SIZE * sizeof(__u32))) + if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32)) return 0; if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class])) return 0; -- cgit v1.2.3-70-g09d2 From 74f2345b6be1410f824cb7dd638d2c10a9709379 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 4 Jun 2007 17:00:14 -0400 Subject: [PATCH] allow audit filtering on bit & operations Right now the audit filter can match on = != > < >= blah blah blah. This allow the filter to also look at bitwise AND operations, & Signed-off-by: Eric Paris Signed-off-by: Al Viro --- include/linux/audit.h | 30 +++++++++++++++++------------- kernel/auditfilter.c | 11 +++++++++++ 2 files changed, 28 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 8ca7ca0b47f..a35859ab2fd 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -161,7 +161,7 @@ * are currently used in an audit field constant understood by the kernel. 
* If you are adding a new #define AUDIT_, please ensure that * AUDIT_UNUSED_BITS is updated if need be. */ -#define AUDIT_UNUSED_BITS 0x0FFFFC00 +#define AUDIT_UNUSED_BITS 0x07FFFC00 /* Rule fields */ @@ -213,25 +213,29 @@ #define AUDIT_NEGATE 0x80000000 /* These are the supported operators. - * 4 2 1 - * = > < - * ------- - * 0 0 0 0 nonsense - * 0 0 1 1 < - * 0 1 0 2 > - * 0 1 1 3 != - * 1 0 0 4 = - * 1 0 1 5 <= - * 1 1 0 6 >= - * 1 1 1 7 all operators + * 4 2 1 8 + * = > < ? + * ---------- + * 0 0 0 0 00 nonsense + * 0 0 0 1 08 & bit mask + * 0 0 1 0 10 < + * 0 1 0 0 20 > + * 0 1 1 0 30 != + * 1 0 0 0 40 = + * 1 0 0 1 48 &= bit test + * 1 0 1 0 50 <= + * 1 1 0 0 60 >= + * 1 1 1 1 78 all operators */ +#define AUDIT_BIT_MASK 0x08000000 #define AUDIT_LESS_THAN 0x10000000 #define AUDIT_GREATER_THAN 0x20000000 #define AUDIT_NOT_EQUAL 0x30000000 #define AUDIT_EQUAL 0x40000000 +#define AUDIT_BIT_TEST (AUDIT_BIT_MASK|AUDIT_EQUAL) #define AUDIT_LESS_THAN_OR_EQUAL (AUDIT_LESS_THAN|AUDIT_EQUAL) #define AUDIT_GREATER_THAN_OR_EQUAL (AUDIT_GREATER_THAN|AUDIT_EQUAL) -#define AUDIT_OPERATORS (AUDIT_EQUAL|AUDIT_NOT_EQUAL) +#define AUDIT_OPERATORS (AUDIT_EQUAL|AUDIT_NOT_EQUAL|AUDIT_BIT_MASK) /* Status symbols */ /* Mask values */ diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0ea96bab91c..359645cff5b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -456,6 +456,13 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_DEVMINOR: case AUDIT_EXIT: case AUDIT_SUCCESS: + /* bit ops are only useful on syscall args */ + if (f->op == AUDIT_BIT_MASK || + f->op == AUDIT_BIT_TEST) { + err = -EINVAL; + goto exit_free; + } + break; case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: @@ -1566,6 +1573,10 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) return (left > right); case AUDIT_GREATER_THAN_OR_EQUAL: return (left >= right); + case AUDIT_BIT_MASK: + return (left & right); + case AUDIT_BIT_TEST: + return ((left & right) == right); } BUG(); return 0; -- cgit v1.2.3-70-g09d2 From 4259fa01a2d2aa3e589b34ba7624080232d9c1ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Jun 2007 11:13:31 -0400 Subject: [PATCH] get rid of AVC_PATH postponed treatment Selinux folks had been complaining about the lack of AVC_PATH records when audit is disabled. I must admit my stupidity - I assumed that avc_audit() really couldn't use audit_log_d_path() because of deadlocks (== could be called with dcache_lock or vfsmount_lock held). Shouldn't have made that assumption - it never gets called that way. It _is_ called under spinlocks, but not those. Since audit_log_d_path() uses ab->gfp_mask for allocations, kmalloc() in there is not a problem. IOW, the simple fix is sufficient: let's rip AUDIT_AVC_PATH out and simply generate pathname as part of main record. It's trivial to do. 
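The core of the change (condensed from the security/selinux/avc.c hunk below) is to emit the pathname inline in the main record whenever a vfsmount is available, instead of queueing a separate postponed AVC_PATH record:

    if (a->u.fs.mnt) {
            /* full path, logged directly in this record */
            audit_log_d_path(ab, "path=", dentry, a->u.fs.mnt);
    } else {
            audit_log_format(ab, " name=");
            audit_log_untrustedstring(ab, dentry->d_name.name);
    }
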
Signed-off-by: Al Viro Acked-by: James Morris --- include/linux/audit.h | 2 -- kernel/auditsc.c | 47 ----------------------------------------------- security/selinux/avc.c | 15 ++++++++------- 3 files changed, 8 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index a35859ab2fd..4bbd8601b8f 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -411,7 +411,6 @@ extern int audit_bprm(struct linux_binprm *bprm); extern int audit_socketcall(int nargs, unsigned long *args); extern int audit_sockaddr(int len, void *addr); extern int __audit_fd_pair(int fd1, int fd2); -extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt); extern int audit_set_macxattr(const char *name); extern int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr); extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec __user *u_abs_timeout); @@ -491,7 +490,6 @@ extern int audit_signals; #define audit_socketcall(n,a) ({ 0; }) #define audit_fd_pair(n,a) ({ 0; }) #define audit_sockaddr(len, addr) ({ 0; }) -#define audit_avc_path(dentry, mnt) ({ 0; }) #define audit_set_macxattr(n) do { ; } while (0) #define audit_mq_open(o,m,a) ({ 0; }) #define audit_mq_timedsend(d,l,p,t) ({ 0; }) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f5e917e60ac..bde1124d590 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -173,12 +173,6 @@ struct audit_aux_data_fd_pair { int fd[2]; }; -struct audit_aux_data_path { - struct audit_aux_data d; - struct dentry *dentry; - struct vfsmount *mnt; -}; - struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; @@ -654,12 +648,6 @@ static inline void audit_free_aux(struct audit_context *context) struct audit_aux_data *aux; while ((aux = context->aux)) { - if (aux->type == AUDIT_AVC_PATH) { - struct audit_aux_data_path *axi = (void *)aux; - dput(axi->dentry); - mntput(axi->mnt); - } - context->aux = aux->next; kfree(aux); } @@ -1038,11 +1026,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_hex(ab, axs->a, axs->len); break; } - case AUDIT_AVC_PATH: { - struct audit_aux_data_path *axi = (void *)aux; - audit_log_d_path(ab, "path=", axi->dentry, axi->mnt); - break; } - case AUDIT_FD_PAIR: { struct audit_aux_data_fd_pair *axs = (void *)aux; audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); @@ -1990,36 +1973,6 @@ void __audit_ptrace(struct task_struct *t) selinux_get_task_sid(t, &context->target_sid); } -/** - * audit_avc_path - record the granting or denial of permissions - * @dentry: dentry to record - * @mnt: mnt to record - * - * Returns 0 for success or NULL context or < 0 on error. 
- * - * Called from security/selinux/avc.c::avc_audit() - */ -int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) -{ - struct audit_aux_data_path *ax; - struct audit_context *context = current->audit_context; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->dentry = dget(dentry); - ax->mnt = mntget(mnt); - - ax->d.type = AUDIT_AVC_PATH; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - /** * audit_signal_info - record signal info for shutting down audit subsystem * @sig: signal value diff --git a/security/selinux/avc.c b/security/selinux/avc.c index ecd06738453..0e69adf63bd 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -570,10 +570,12 @@ void avc_audit(u32 ssid, u32 tsid, case AVC_AUDIT_DATA_FS: if (a->u.fs.dentry) { struct dentry *dentry = a->u.fs.dentry; - if (a->u.fs.mnt) - audit_avc_path(dentry, a->u.fs.mnt); - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, dentry->d_name.name); + if (a->u.fs.mnt) { + audit_log_d_path(ab, "path=", dentry, a->u.fs.mnt); + } else { + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, dentry->d_name.name); + } inode = dentry->d_inode; } else if (a->u.fs.inode) { struct dentry *dentry; @@ -624,9 +626,8 @@ void avc_audit(u32 ssid, u32 tsid, case AF_UNIX: u = unix_sk(sk); if (u->dentry) { - audit_avc_path(u->dentry, u->mnt); - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, u->dentry->d_name.name); + audit_log_d_path(ab, "path=", + u->dentry, u->mnt); break; } if (!u->addr) -- cgit v1.2.3-70-g09d2 From abd4f7505bafdd6c5319fe3cb5caf9af6104e17a Mon Sep 17 00:00:00 2001 From: Masoud Asgharifard Sharbiani Date: Sun, 22 Jul 2007 11:12:28 +0200 Subject: x86: i386-show-unhandled-signals-v3 This patch makes the i386 behave the same way that x86_64 does when a segfault happens. A line gets printed to the kernel log so that tools that need to check for failures can behave more uniformly between debug.show_unhandled_signals sysctl variable to 0 (or by doing echo 0 > /proc/sys/debug/exception-trace) Also, all of the lines being printed are now using printk_ratelimit() to deny the ability of DoS from a local user with a program like the following: main() { while (1) if (!fork()) *(int *)0 = 0; } This new revision also includes the fix that Andrew did which got rid of new sysctl that was added to the system in earlier versions of this. Also, 'show-unhandled-signals' sysctl has been renamed back to the old 'exception-trace' to avoid breakage of people's scripts. AK: Enabling by default for i386 will be likely controversal, but let's see what happens AK: Really folks, before complaining just fix your segfaults AK: I bet this will find a lot of silent issues Signed-off-by: Masoud Sharbiani Signed-off-by: Andi Kleen [ Personally, I've found the complaints useful on x86-64, so I'm all for this. That said, I wonder if we could do it more prettily.. 
-Linus ] Signed-off-by: Linus Torvalds --- arch/i386/kernel/signal.c | 7 +++++++ arch/i386/kernel/traps.c | 7 +++++++ arch/i386/mm/fault.c | 10 ++++++++++ arch/x86_64/kernel/signal.c | 2 +- arch/x86_64/kernel/traps.c | 6 ++++-- arch/x86_64/mm/fault.c | 15 +++------------ arch/x86_64/mm/init.c | 33 --------------------------------- include/asm-x86_64/proto.h | 2 -- include/linux/signal.h | 3 +++ kernel/signal.c | 10 ++++++++++ kernel/sysctl.c | 10 ++++++++++ 11 files changed, 55 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index d574e38f0f7..f5dd85656c1 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -199,6 +199,13 @@ asmlinkage int sys_sigreturn(unsigned long __unused) return eax; badframe: + if (show_unhandled_signals && printk_ratelimit()) + printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" + " esp:%lx oeax:%lx\n", + current->pid > 1 ? KERN_INFO : KERN_EMERG, + current->comm, current->pid, frame, regs->eip, + regs->esp, regs->orig_eax); + force_sig(SIGSEGV, current); return 0; } diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 57772a18c39..438949da3b6 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -618,6 +618,13 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, current->thread.error_code = error_code; current->thread.trap_no = 13; + if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && + printk_ratelimit()) + printk(KERN_INFO + "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", + current->comm, current->pid, + regs->eip, regs->esp, error_code); + force_sig(SIGSEGV, current); return; diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index e92a1012493..01ffdd4964f 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -283,6 +283,8 @@ static inline int vmalloc_fault(unsigned long address) return 0; } +int show_unhandled_signals = 1; + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -469,6 +471,14 @@ bad_area_nosemaphore: if (is_prefetch(regs, address, error_code)) return; + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk("%s%s[%d]: segfault at %08lx eip %08lx " + "esp %08lx error %lx\n", + tsk->pid > 1 ? 
KERN_INFO : KERN_EMERG, + tsk->comm, tsk->pid, address, regs->eip, + regs->esp, error_code); + } tsk->thread.cr2 = address; /* Kernel addresses are always protection faults */ tsk->thread.error_code = error_code | (address >= TASK_SIZE); diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 4886afcd628..739175b01e0 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -487,7 +487,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; - if (exception_trace) + if (show_unhandled_signals && printk_ratelimit()) printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 8713ad4a4db..03888420775 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -584,7 +584,8 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; - if (exception_trace && unhandled_signal(tsk, signr)) + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) printk(KERN_INFO "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", tsk->comm, tsk->pid, str, @@ -688,7 +689,8 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, tsk->thread.error_code = error_code; tsk->thread.trap_no = 13; - if (exception_trace && unhandled_signal(tsk, SIGSEGV)) + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) printk(KERN_INFO "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", tsk->comm, tsk->pid, diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 2074bddd4f0..5e9ac70c135 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -221,16 +221,6 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) return 0; } -int unhandled_signal(struct task_struct *tsk, int sig) -{ - if (is_init(tsk)) - return 1; - if (tsk->ptrace & PT_PTRACED) - return 0; - return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || - (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); -} - static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, unsigned long error_code) { @@ -302,7 +292,7 @@ static int vmalloc_fault(unsigned long address) } static int page_fault_trace; -int exception_trace = 1; +int show_unhandled_signals = 1; /* * This routine handles page faults. It determines the address, @@ -494,7 +484,8 @@ bad_area_nosemaphore: (address >> 32)) return; - if (exception_trace && unhandled_signal(tsk, SIGSEGV)) { + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { printk( "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", tsk->pid > 1 ? 
KERN_INFO : KERN_EMERG, diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 381c2ecd407..88678e82e23 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -697,39 +697,6 @@ int kern_addr_valid(unsigned long addr) return pfn_valid(pte_pfn(*pte)); } -#ifdef CONFIG_SYSCTL -#include - -static ctl_table debug_table2[] = { - { - .ctl_name = 99, - .procname = "exception-trace", - .data = &exception_trace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - {} -}; - -static ctl_table debug_root_table2[] = { - { - .ctl_name = CTL_DEBUG, - .procname = "debug", - .mode = 0555, - .child = debug_table2 - }, - {} -}; - -static __init int x8664_sysctl_init(void) -{ - register_sysctl_table(debug_root_table2); - return 0; -} -__initcall(x8664_sysctl_init); -#endif - /* A pseudo VMA to allow ptrace access for the vsyscall page. This only covers the 64bit vsyscall page now. 32bit has a real VMA now and does not need special handling anymore. */ diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index d6e3225549c..31f20ad6587 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -75,8 +75,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long en extern void early_quirks(void); extern void check_efer(void); -extern int unhandled_signal(struct task_struct *tsk, int sig); - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern unsigned long table_start, table_end; diff --git a/include/linux/signal.h b/include/linux/signal.h index ea91abe740d..0ae33886624 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -237,12 +237,15 @@ extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); extern long do_sigpending(void __user *, unsigned long); extern int sigprocmask(int, sigset_t *, sigset_t *); +extern int show_unhandled_signals; struct pt_regs; extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); extern struct kmem_cache *sighand_cachep; +int unhandled_signal(struct task_struct *tsk, int sig); + /* * In POSIX a signal is sent either to a specific thread (Linux task) * or to the process as a whole (Linux thread group). 
How the signal diff --git a/kernel/signal.c b/kernel/signal.c index 39d122753ba..ef8156a6aad 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -255,6 +255,16 @@ flush_signal_handlers(struct task_struct *t, int force_default) } } +int unhandled_signal(struct task_struct *tsk, int sig) +{ + if (is_init(tsk)) + return 1; + if (tsk->ptrace & PT_PTRACED) + return 0; + return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || + (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); +} + /* Notify the system that a driver wants to block all signals for this * process, and wants to be notified if any signals at all were to be diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 222299844ad..ddebf3f2aff 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1203,6 +1203,16 @@ static ctl_table fs_table[] = { }; static ctl_table debug_table[] = { +#ifdef CONFIG_X86 + { + .ctl_name = CTL_UNNUMBERED, + .procname = "exception-trace", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif { .ctl_name = 0 } }; -- cgit v1.2.3-70-g09d2 From e8b2fd01228f690c3e0cb3f14facfa8d93d4adae Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 24 Jul 2007 22:26:33 -0400 Subject: ACPI: Kconfig: remove CONFIG_ACPI_SLEEP from source As it was a synonym for (CONFIG_ACPI && CONFIG_X86), the ifdefs for it were more clutter than they were worth. For ia64, just add a few stubs in anticipation of future S3 or S4 support. Signed-off-by: Len Brown --- arch/i386/kernel/acpi/Makefile | 2 +- arch/i386/kernel/setup.c | 2 +- arch/i386/mm/init.c | 2 +- arch/ia64/kernel/acpi.c | 19 +++++++++++++++++++ arch/x86_64/kernel/acpi/Makefile | 2 +- arch/x86_64/kernel/acpi/sleep.c | 4 ---- arch/x86_64/kernel/head.S | 2 +- arch/x86_64/kernel/setup.c | 2 +- drivers/acpi/Kconfig | 10 +++------- drivers/acpi/sleep/Makefile | 4 ++-- drivers/acpi/sleep/main.c | 2 ++ drivers/acpi/sleep/poweroff.c | 2 -- drivers/acpi/sleep/wakeup.c | 2 -- include/acpi/acpi_drivers.h | 4 ---- include/asm-i386/acpi.h | 23 +++++++++-------------- include/asm-i386/suspend.h | 2 +- include/asm-ia64/acpi.h | 5 +++++ include/asm-x86_64/acpi.h | 22 +++++++++------------- include/asm-x86_64/suspend.h | 2 -- kernel/sysctl.c | 2 +- 20 files changed, 57 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile index 7f7be01f44e..223f58fc9f4 100644 --- a/arch/i386/kernel/acpi/Makefile +++ b/arch/i386/kernel/acpi/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_ACPI) += boot.o ifneq ($(CONFIG_PCI),) obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o endif -obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o +obj-$(CONFIG_ACPI) += sleep.o wakeup.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o processor.o diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index d474cd639bc..7fe5da3c932 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -422,7 +422,7 @@ void __init setup_bootmem_allocator(void) */ reserve_bootmem(PAGE_SIZE, PAGE_SIZE); #endif -#ifdef CONFIG_ACPI_SLEEP +#ifdef CONFIG_ACPI /* * Reserve low memory region for sleep support. 
*/ diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index c3b9905af2d..1b1a1e66d09 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -432,7 +432,7 @@ static void __init pagetable_init (void) paravirt_pagetable_setup_done(pgd_base); } -#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP) +#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI) /* * Swap suspend & friends need this for resume because things like the intel-agp * driver might have split up a kernel 4MB mapping. diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 103dd8edda7..c6ede8780de 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -67,6 +67,8 @@ EXPORT_SYMBOL(pm_power_off); unsigned int acpi_cpei_override; unsigned int acpi_cpei_phys_cpuid; +unsigned long acpi_wakeup_address = 0; + const char __init * acpi_get_sysname(void) { @@ -986,4 +988,21 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) EXPORT_SYMBOL(acpi_unregister_ioapic); +/* + * acpi_save_state_mem() - save kernel state + * + * TBD when when IA64 starts to support suspend... + */ +int acpi_save_state_mem(void) { return 0; } + +/* + * acpi_restore_state() + */ +void acpi_restore_state_mem(void) {} + +/* + * do_suspend_lowlevel() + */ +void do_suspend_lowlevel(void) {} + #endif /* CONFIG_ACPI */ diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile index 080b9963f1b..17595d23fee 100644 --- a/arch/x86_64/kernel/acpi/Makefile +++ b/arch/x86_64/kernel/acpi/Makefile @@ -1,6 +1,6 @@ obj-y := boot.o boot-y := ../../../i386/kernel/acpi/boot.o -obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o +obj-y += sleep.o wakeup.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += processor.o diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index 4277f2b27e6..79475d23707 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -51,8 +51,6 @@ Low-Level Sleep Support -------------------------------------------------------------------------- */ -#ifdef CONFIG_ACPI_SLEEP - /* address in low memory of the wakeup routine. */ unsigned long acpi_wakeup_address = 0; unsigned long acpi_realmode_flags; @@ -117,8 +115,6 @@ static int __init acpi_sleep_setup(char *str) __setup("acpi_sleep=", acpi_sleep_setup); -#endif /*CONFIG_ACPI_SLEEP */ - void acpi_pci_link_exit(void) { } diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index e89abcdbdde..3a16e417dd8 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -120,7 +120,7 @@ ident_complete: addq %rbp, trampoline_level4_pgt + 0(%rip) addq %rbp, trampoline_level4_pgt + (511*8)(%rip) #endif -#ifdef CONFIG_ACPI_SLEEP +#ifdef CONFIG_ACPI addq %rbp, wakeup_level4_pgt + 0(%rip) addq %rbp, wakeup_level4_pgt + (511*8)(%rip) #endif diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index af838f6b0b7..0f400f3c469 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -333,7 +333,7 @@ void __init setup_arch(char **cmdline_p) reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); #endif -#ifdef CONFIG_ACPI_SLEEP +#ifdef CONFIG_ACPI /* * Reserve low memory region for sleep support. 
*/ diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 524cbf151fc..251344cb29a 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -11,6 +11,9 @@ menuconfig ACPI depends on PCI depends on PM select PNP + # for sleep + select HOTPLUG_CPU if X86 && SMP + select SUSPEND_SMP if X86 && SMP default y ---help--- Advanced Configuration and Power Interface (ACPI) support for @@ -42,13 +45,6 @@ menuconfig ACPI if ACPI -config ACPI_SLEEP - bool - depends on X86 - select HOTPLUG_CPU if SMP - select SUSPEND_SMP if SMP - default y - config ACPI_PROCFS bool "Deprecated /proc/acpi files" depends on PROC_FS diff --git a/drivers/acpi/sleep/Makefile b/drivers/acpi/sleep/Makefile index 195a4f69c0f..01a993a1d08 100644 --- a/drivers/acpi/sleep/Makefile +++ b/drivers/acpi/sleep/Makefile @@ -1,5 +1,5 @@ obj-y := poweroff.o wakeup.o -obj-$(CONFIG_ACPI_SLEEP) += main.o -obj-$(CONFIG_ACPI_SLEEP) += proc.o +obj-y += main.o +obj-$(CONFIG_X86) += proc.o EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c index 3279e72a94f..54c2dfcf865 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -136,10 +136,12 @@ static int acpi_pm_finish(suspend_state_t pm_state) /* reset firmware waking vector */ acpi_set_firmware_waking_vector((acpi_physical_address) 0); +#ifdef CONFIG_X86 if (init_8259A_after_S1) { printk("Broken toshiba laptop -> kicking interrupts\n"); init_8259A(0); } +#endif return 0; } diff --git a/drivers/acpi/sleep/poweroff.c b/drivers/acpi/sleep/poweroff.c index 39e40d56b03..b3f68ef0669 100644 --- a/drivers/acpi/sleep/poweroff.c +++ b/drivers/acpi/sleep/poweroff.c @@ -18,7 +18,6 @@ int acpi_sleep_prepare(u32 acpi_state) { -#ifdef CONFIG_ACPI_SLEEP /* do we have a wakeup address for S2 and S3? */ if (acpi_state == ACPI_STATE_S3) { if (!acpi_wakeup_address) { @@ -31,7 +30,6 @@ int acpi_sleep_prepare(u32 acpi_state) } ACPI_FLUSH_CPU_CACHE(); acpi_enable_wakeup_device_prep(acpi_state); -#endif acpi_gpe_sleep_prepare(acpi_state); acpi_enter_sleep_state_prep(acpi_state); return 0; diff --git a/drivers/acpi/sleep/wakeup.c b/drivers/acpi/sleep/wakeup.c index fab8f2694f0..97c27ddb144 100644 --- a/drivers/acpi/sleep/wakeup.c +++ b/drivers/acpi/sleep/wakeup.c @@ -17,7 +17,6 @@ ACPI_MODULE_NAME("wakeup_devices") extern struct list_head acpi_wakeup_device_list; extern spinlock_t acpi_device_lock; -#ifdef CONFIG_ACPI_SLEEP /** * acpi_enable_wakeup_device_prep - prepare wakeup devices * @sleep_state: ACPI state @@ -180,7 +179,6 @@ static int __init acpi_wakeup_device_init(void) } late_initcall(acpi_wakeup_device_init); -#endif /* * Disable all wakeup GPEs before entering requested sleep state. 
diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index 553515912c0..07b5d76b92c 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -142,10 +142,6 @@ static inline void unregister_hotplug_dock_device(acpi_handle handle) /*-------------------------------------------------------------------------- Suspend/Resume -------------------------------------------------------------------------- */ -#ifdef CONFIG_ACPI_SLEEP extern int acpi_sleep_init(void); -#else -#define acpi_sleep_init() do {} while (0) -#endif #endif /*__ACPI_DRIVERS_H__*/ diff --git a/include/asm-i386/acpi.h b/include/asm-i386/acpi.h index 449f3f272e0..125179adf04 100644 --- a/include/asm-i386/acpi.h +++ b/include/asm-i386/acpi.h @@ -121,19 +121,6 @@ static inline void acpi_disable_pci(void) } extern int acpi_irq_balance_set(char *str); -#else /* !CONFIG_ACPI */ - -#define acpi_lapic 0 -#define acpi_ioapic 0 -static inline void acpi_noirq_set(void) { } -static inline void acpi_disable_pci(void) { } -static inline void disable_acpi(void) { } - -#endif /* !CONFIG_ACPI */ - - -#ifdef CONFIG_ACPI_SLEEP - /* routines for saving/restoring kernel state */ extern int acpi_save_state_mem(void); extern void acpi_restore_state_mem(void); @@ -143,7 +130,15 @@ extern unsigned long acpi_wakeup_address; /* early initialization routine */ extern void acpi_reserve_bootmem(void); -#endif /*CONFIG_ACPI_SLEEP*/ +#else /* !CONFIG_ACPI */ + +#define acpi_lapic 0 +#define acpi_ioapic 0 +static inline void acpi_noirq_set(void) { } +static inline void acpi_disable_pci(void) { } +static inline void disable_acpi(void) { } + +#endif /* !CONFIG_ACPI */ #define ARCH_HAS_POWER_INIT 1 diff --git a/include/asm-i386/suspend.h b/include/asm-i386/suspend.h index 8dbaafe611f..a2520732ffd 100644 --- a/include/asm-i386/suspend.h +++ b/include/asm-i386/suspend.h @@ -21,7 +21,7 @@ struct saved_context { unsigned long return_address; } __attribute__((packed)); -#ifdef CONFIG_ACPI_SLEEP +#ifdef CONFIG_ACPI extern unsigned long saved_eip; extern unsigned long saved_esp; extern unsigned long saved_ebp; diff --git a/include/asm-ia64/acpi.h b/include/asm-ia64/acpi.h index 5b526357d17..49730ffbbae 100644 --- a/include/asm-ia64/acpi.h +++ b/include/asm-ia64/acpi.h @@ -100,6 +100,11 @@ const char *acpi_get_sysname (void); int acpi_request_vector (u32 int_type); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); +/* routines for saving/restoring kernel state */ +extern int acpi_save_state_mem(void); +extern void acpi_restore_state_mem(void); +extern unsigned long acpi_wakeup_address; + /* * Record the cpei override flag and current logical cpu. This is * useful for CPU removal. 
diff --git a/include/asm-x86_64/acpi.h b/include/asm-x86_64/acpi.h index 1da8f49c0fe..98173357dd8 100644 --- a/include/asm-x86_64/acpi.h +++ b/include/asm-x86_64/acpi.h @@ -108,6 +108,15 @@ static inline void acpi_disable_pci(void) } extern int acpi_irq_balance_set(char *str); +/* routines for saving/restoring kernel state */ +extern int acpi_save_state_mem(void); +extern void acpi_restore_state_mem(void); + +extern unsigned long acpi_wakeup_address; + +/* early initialization routine */ +extern void acpi_reserve_bootmem(void); + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 @@ -121,19 +130,6 @@ extern int acpi_numa; extern int acpi_scan_nodes(unsigned long start, unsigned long end); #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) -#ifdef CONFIG_ACPI_SLEEP - -/* routines for saving/restoring kernel state */ -extern int acpi_save_state_mem(void); -extern void acpi_restore_state_mem(void); - -extern unsigned long acpi_wakeup_address; - -/* early initialization routine */ -extern void acpi_reserve_bootmem(void); - -#endif /*CONFIG_ACPI_SLEEP*/ - extern int acpi_disabled; extern int acpi_pci_disabled; diff --git a/include/asm-x86_64/suspend.h b/include/asm-x86_64/suspend.h index 9c3f8de90d2..b897e8cb55f 100644 --- a/include/asm-x86_64/suspend.h +++ b/include/asm-x86_64/suspend.h @@ -44,7 +44,6 @@ extern unsigned long saved_context_eflags; extern void fix_processor_context(void); -#ifdef CONFIG_ACPI_SLEEP extern unsigned long saved_rip; extern unsigned long saved_rsp; extern unsigned long saved_rbp; @@ -54,4 +53,3 @@ extern unsigned long saved_rdi; /* routines for saving/restoring kernel state */ extern int acpi_save_state_mem(void); -#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ddebf3f2aff..eb26f2ba51e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -689,7 +689,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_ACPI_SLEEP +#if defined(CONFIG_ACPI) && defined(CONFIG_X86) { .ctl_name = KERN_ACPI_VIDEO_FLAGS, .procname = "acpi_video_flags", -- cgit v1.2.3-70-g09d2 From 2c6b47de17c75d553de3e2fb426d8298d2074585 Mon Sep 17 00:00:00 2001 From: john stultz Date: Tue, 24 Jul 2007 17:47:43 -0700 Subject: Cleanup non-arch xtime uses, use get_seconds() or current_kernel_time(). This avoids use of the kernel-internal "xtime" variable directly outside of the actual time-related functions. Instead, use the helper functions that we already have available to us. This doesn't actually change any behaviour, but this will allow us to fix the fact that "xtime" isn't updated very often with CONFIG_NO_HZ (because much of the realtime information is maintained as separate offsets to 'xtime'), which has caused interfaces that use xtime directly to get a time that is out of sync with the real-time clock by up to a third of a second or so. 
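For illustration only (this sketch is not part of the patch, and the local variables are hypothetical), the conversion pattern applied throughout the patch looks like:

	struct timespec now;
	time_t stamp;

	/* before: read the timekeeping variable directly */
	stamp = xtime.tv_sec;

	/* after: go through the existing helpers */
	stamp = get_seconds();
	now = current_kernel_time();

current_kernel_time() samples xtime inside the xtime_lock seqlock, so callers get a coherent snapshot without reaching into timekeeping internals.
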
Signed-off-by: John Stultz Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- drivers/rtc/class.c | 5 +++-- drivers/s390/net/ctcmain.c | 6 +++--- drivers/s390/net/netiucv.c | 4 ++-- include/linux/time.h | 2 +- kernel/acct.c | 2 +- kernel/hrtimer.c | 2 +- kernel/time.c | 16 ---------------- kernel/time/timekeeping.c | 16 ++++++++++++++++ kernel/tsacct.c | 2 +- net/rxrpc/af_rxrpc.c | 2 +- net/rxrpc/ar-connection.c | 4 ++-- net/rxrpc/ar-transport.c | 4 ++-- net/rxrpc/rxkad.c | 2 +- 13 files changed, 34 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 8b3cd31d6a6..10ab3b71ffc 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -46,6 +46,7 @@ static int rtc_suspend(struct device *dev, pm_message_t mesg) { struct rtc_device *rtc = to_rtc_device(dev); struct rtc_time tm; + struct timespec ts = current_kernel_time(); if (strncmp(rtc->dev.bus_id, CONFIG_RTC_HCTOSYS_DEVICE, @@ -57,8 +58,8 @@ static int rtc_suspend(struct device *dev, pm_message_t mesg) /* RTC precision is 1 second; adjust delta for avg 1/2 sec err */ set_normalized_timespec(&delta, - xtime.tv_sec - oldtime, - xtime.tv_nsec - (NSEC_PER_SEC >> 1)); + ts.tv_sec - oldtime, + ts.tv_nsec - (NSEC_PER_SEC >> 1)); return 0; } diff --git a/drivers/s390/net/ctcmain.c b/drivers/s390/net/ctcmain.c index b20fd068173..92e8a37b502 100644 --- a/drivers/s390/net/ctcmain.c +++ b/drivers/s390/net/ctcmain.c @@ -674,7 +674,7 @@ ch_action_txdone(fsm_instance * fi, int event, void *arg) int first = 1; int i; unsigned long duration; - struct timespec done_stamp = xtime; + struct timespec done_stamp = current_kernel_time(); DBF_TEXT(trace, 4, __FUNCTION__); @@ -730,7 +730,7 @@ ch_action_txdone(fsm_instance * fi, int event, void *arg) spin_unlock(&ch->collect_lock); ch->ccw[1].count = ch->trans_skb->len; fsm_addtimer(&ch->timer, CTC_TIMEOUT_5SEC, CH_EVENT_TIMER, ch); - ch->prof.send_stamp = xtime; + ch->prof.send_stamp = current_kernel_time(); rc = ccw_device_start(ch->cdev, &ch->ccw[0], (unsigned long) ch, 0xff, 0); ch->prof.doios_multi++; @@ -2281,7 +2281,7 @@ transmit_skb(struct channel *ch, struct sk_buff *skb) fsm_newstate(ch->fsm, CH_STATE_TX); fsm_addtimer(&ch->timer, CTC_TIMEOUT_5SEC, CH_EVENT_TIMER, ch); spin_lock_irqsave(get_ccwdev_lock(ch->cdev), saveflags); - ch->prof.send_stamp = xtime; + ch->prof.send_stamp = current_kernel_time(); rc = ccw_device_start(ch->cdev, &ch->ccw[ccw_idx], (unsigned long) ch, 0xff, 0); spin_unlock_irqrestore(get_ccwdev_lock(ch->cdev), saveflags); diff --git a/drivers/s390/net/netiucv.c b/drivers/s390/net/netiucv.c index 3d28e1a5bf7..26888947433 100644 --- a/drivers/s390/net/netiucv.c +++ b/drivers/s390/net/netiucv.c @@ -753,7 +753,7 @@ static void conn_action_txdone(fsm_instance *fi, int event, void *arg) header.next = 0; memcpy(skb_put(conn->tx_buff, NETIUCV_HDRLEN), &header, NETIUCV_HDRLEN); - conn->prof.send_stamp = xtime; + conn->prof.send_stamp = current_kernel_time(); txmsg.class = 0; txmsg.tag = 0; rc = iucv_message_send(conn->path, &txmsg, 0, 0, @@ -1185,7 +1185,7 @@ static int netiucv_transmit_skb(struct iucv_connection *conn, memcpy(skb_put(nskb, NETIUCV_HDRLEN), &header, NETIUCV_HDRLEN); fsm_newstate(conn->fsm, CONN_STATE_TX); - conn->prof.send_stamp = xtime; + conn->prof.send_stamp = current_kernel_time(); msg.tag = 1; msg.class = 0; diff --git a/include/linux/time.h b/include/linux/time.h index e6aea5146e5..71181df8b74 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -107,7 +107,7 @@ static inline 
unsigned long get_seconds(void) struct timespec current_kernel_time(void); #define CURRENT_TIME (current_kernel_time()) -#define CURRENT_TIME_SEC ((struct timespec) { xtime.tv_sec, 0 }) +#define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) extern void do_gettimeofday(struct timeval *tv); extern int do_settimeofday(struct timespec *tv); diff --git a/kernel/acct.c b/kernel/acct.c index 70d0d88e555..24f0f8b2ba7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -468,7 +468,7 @@ static void do_acct_process(struct file *file) } #endif do_div(elapsed, AHZ); - ac.ac_btime = xtime.tv_sec - elapsed; + ac.ac_btime = get_seconds() - elapsed; /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index eb1ddebd2c0..a7bb05e6cb6 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -144,7 +144,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) #ifdef CONFIG_NO_HZ getnstimeofday(&xts); #else - xts = xtime; + xts = current_kernel_time(); #endif tom = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); diff --git a/kernel/time.c b/kernel/time.c index 5b81da08bbd..2289a8d6831 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -215,22 +215,6 @@ asmlinkage long sys_adjtimex(struct timex __user *txc_p) return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; } -inline struct timespec current_kernel_time(void) -{ - struct timespec now; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - - now = xtime; - } while (read_seqretry(&xtime_lock, seq)); - - return now; -} - -EXPORT_SYMBOL(current_kernel_time); - /** * current_fs_time - Return FS time * @sb: Superblock. diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 88c81026e00..07a3f1420c2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -509,3 +509,19 @@ void monotonic_to_bootbased(struct timespec *ts) { ts->tv_sec += total_sleep_time; } + +struct timespec current_kernel_time(void) +{ + struct timespec now; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + + now = xtime; + } while (read_seqretry(&xtime_lock, seq)); + + return now; +} + +EXPORT_SYMBOL(current_kernel_time); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 658f638c402..c122131a122 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -39,7 +39,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) ac_etime = timespec_to_ns(&ts); do_div(ac_etime, NSEC_PER_USEC); stats->ac_etime = ac_etime; - stats->ac_btime = xtime.tv_sec - ts.tv_sec; + stats->ac_btime = get_seconds() - ts.tv_sec; if (thread_group_leader(tsk)) { stats->ac_exitcode = tsk->exit_code; if (tsk->flags & PF_FORKNOEXEC) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 16a68df4e36..c58fa0d1be2 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -787,7 +787,7 @@ static int __init af_rxrpc_init(void) BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > sizeof(dummy_skb->cb)); - rxrpc_epoch = htonl(xtime.tv_sec); + rxrpc_epoch = htonl(get_seconds()); ret = -ENOMEM; rxrpc_call_jar = kmem_cache_create( diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 482750efc23..372b24466dc 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -791,7 +791,7 @@ void rxrpc_put_connection(struct rxrpc_connection *conn) ASSERTCMP(atomic_read(&conn->usage), >, 0); - conn->put_time = xtime.tv_sec; + conn->put_time = get_seconds(); if 
(atomic_dec_and_test(&conn->usage)) { _debug("zombie"); rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); @@ -835,7 +835,7 @@ void rxrpc_connection_reaper(struct work_struct *work) _enter(""); - now = xtime.tv_sec; + now = get_seconds(); earliest = ULONG_MAX; write_lock_bh(&rxrpc_connection_lock); diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c index d43d78f1930..bb282a6a19f 100644 --- a/net/rxrpc/ar-transport.c +++ b/net/rxrpc/ar-transport.c @@ -183,7 +183,7 @@ void rxrpc_put_transport(struct rxrpc_transport *trans) ASSERTCMP(atomic_read(&trans->usage), >, 0); - trans->put_time = xtime.tv_sec; + trans->put_time = get_seconds(); if (unlikely(atomic_dec_and_test(&trans->usage))) _debug("zombie"); /* let the reaper determine the timeout to avoid a race with @@ -219,7 +219,7 @@ static void rxrpc_transport_reaper(struct work_struct *work) _enter(""); - now = xtime.tv_sec; + now = get_seconds(); earliest = ULONG_MAX; /* extract all the transports that have been dead too long */ diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 5ec705144e1..ac3cabdca78 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -916,7 +916,7 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, issue = be32_to_cpu(stamp); } p += 4; - now = xtime.tv_sec; + now = get_seconds(); _debug("KIV ISSUE: %lx [%lx]", issue, now); /* check the ticket is in date */ -- cgit v1.2.3-70-g09d2 From 17c38b7490b3f0300c7812aefdae2ddda7ab4112 Mon Sep 17 00:00:00 2001 From: john stultz Date: Tue, 24 Jul 2007 18:38:34 -0700 Subject: Cache xtime every call to update_wall_time This avoids xtime lag seen with dynticks, because while 'xtime' itself is still not updated often, we keep a 'xtime_cache' variable around that contains the approximate real-time that _is_ updated each time we do a 'update_wall_time()', and is thus never off by more than one tick. IOW, this restores the original semantics for 'xtime' users, as long as you use the proper abstraction functions (ie 'current_kernel_time()' or 'get_seconds()' depending on whether you want a timespec or just the seconds field). [ Updated Patch. As penance for my sins I've also yanked another #ifdef that was added to avoid the xtime lag w/ hrtimers. 
] Signed-off-by: John Stultz Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- include/linux/time.h | 6 +----- kernel/hrtimer.c | 4 ---- kernel/time/timekeeping.c | 26 +++++++++++++++++++++++--- 3 files changed, 24 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index 71181df8b74..6a5f503b4f1 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -99,11 +99,7 @@ extern int update_persistent_clock(struct timespec now); extern int no_sync_cmos_clock __read_mostly; void timekeeping_init(void); -static inline unsigned long get_seconds(void) -{ - return xtime.tv_sec; -} - +unsigned long get_seconds(void); struct timespec current_kernel_time(void); #define CURRENT_TIME (current_kernel_time()) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index a7bb05e6cb6..c21ca6bfaa6 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -141,11 +141,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) do { seq = read_seqbegin(&xtime_lock); -#ifdef CONFIG_NO_HZ - getnstimeofday(&xts); -#else xts = current_kernel_time(); -#endif tom = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 07a3f1420c2..acc417b5a9b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -47,10 +47,22 @@ EXPORT_SYMBOL(xtime_lock); struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ - EXPORT_SYMBOL(xtime); +#ifdef CONFIG_NO_HZ +static struct timespec xtime_cache __attribute__ ((aligned (16))); +static inline void update_xtime_cache(u64 nsec) +{ + xtime_cache = xtime; + timespec_add_ns(&xtime_cache, nsec); +} +#else +#define xtime_cache xtime +/* We do *not* want to evaluate the argument for this case */ +#define update_xtime_cache(n) do { } while (0) +#endif + static struct clocksource *clock; /* pointer to current clocksource */ @@ -478,6 +490,8 @@ void update_wall_time(void) xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + update_xtime_cache(cyc2ns(clock, offset)); + /* check to see if there is a new clocksource to use */ change_clocksource(); update_vsyscall(&xtime, clock); @@ -510,6 +524,13 @@ void monotonic_to_bootbased(struct timespec *ts) ts->tv_sec += total_sleep_time; } +unsigned long get_seconds(void) +{ + return xtime_cache.tv_sec; +} +EXPORT_SYMBOL(get_seconds); + + struct timespec current_kernel_time(void) { struct timespec now; @@ -518,10 +539,9 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime; + now = xtime_cache; } while (read_seqretry(&xtime_lock, seq)); return now; } - EXPORT_SYMBOL(current_kernel_time); -- cgit v1.2.3-70-g09d2 From e107be36efb2a233833e8c9899039a370e4b2318 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Jul 2007 13:40:43 +0200 Subject: [PATCH] sched: arch preempt notifier mechanism This adds a general mechanism whereby a task can request the scheduler to notify it whenever it is preempted or scheduled back in. This allows the task to swap any special-purpose registers like the fpu or Intel's VT registers. 
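A minimal sketch of how a client might hook in (the my_vcpu structure and function names are made up for illustration; they are not part of this patch):

	struct my_vcpu {
		struct preempt_notifier pn;
		/* special-purpose register state lives here */
	};

	static void my_sched_in(struct preempt_notifier *pn, int cpu)
	{
		struct my_vcpu *vcpu = container_of(pn, struct my_vcpu, pn);
		/* reload FPU/VT state now that we run on 'cpu' again */
	}

	static void my_sched_out(struct preempt_notifier *pn,
				 struct task_struct *next)
	{
		struct my_vcpu *vcpu = container_of(pn, struct my_vcpu, pn);
		/* save state before 'next' takes over the cpu */
	}

	static struct preempt_ops my_preempt_ops = {
		.sched_in  = my_sched_in,
		.sched_out = my_sched_out,
	};

	/* from the task that owns the state: */
	preempt_notifier_init(&vcpu->pn, &my_preempt_ops);
	preempt_notifier_register(&vcpu->pn);
	...
	preempt_notifier_unregister(&vcpu->pn);

Registration only affects the calling task, since the notifier is added to current->preempt_notifiers, and a client is expected to select CONFIG_PREEMPT_NOTIFIERS itself: the Kconfig symbol below is a bare bool with no prompt.
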
Signed-off-by: Avi Kivity [ mingo@elte.hu: fixes, cleanups ] Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 44 +++++++++++++++++++++++++++++ include/linux/sched.h | 5 ++++ kernel/Kconfig.preempt | 3 ++ kernel/sched.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 123 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index d0926d63406..484988ed301 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -8,6 +8,7 @@ #include #include +#include #ifdef CONFIG_DEBUG_PREEMPT extern void fastcall add_preempt_count(int val); @@ -60,4 +61,47 @@ do { \ #endif +#ifdef CONFIG_PREEMPT_NOTIFIERS + +struct preempt_notifier; + +/** + * preempt_ops - notifiers called when a task is preempted and rescheduled + * @sched_in: we're about to be rescheduled: + * notifier: struct preempt_notifier for the task being scheduled + * cpu: cpu we're scheduled on + * @sched_out: we've just been preempted + * notifier: struct preempt_notifier for the task being preempted + * next: the task that's kicking us out + */ +struct preempt_ops { + void (*sched_in)(struct preempt_notifier *notifier, int cpu); + void (*sched_out)(struct preempt_notifier *notifier, + struct task_struct *next); +}; + +/** + * preempt_notifier - key for installing preemption notifiers + * @link: internal use + * @ops: defines the notifier functions to be called + * + * Usually used in conjunction with container_of(). + */ +struct preempt_notifier { + struct hlist_node link; + struct preempt_ops *ops; +}; + +void preempt_notifier_register(struct preempt_notifier *notifier); +void preempt_notifier_unregister(struct preempt_notifier *notifier); + +static inline void preempt_notifier_init(struct preempt_notifier *notifier, + struct preempt_ops *ops) +{ + INIT_HLIST_NODE(¬ifier->link); + notifier->ops = ops; +} + +#endif + #endif /* __LINUX_PREEMPT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 7c61b50823f..7a4de876874 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -935,6 +935,11 @@ struct task_struct { struct sched_class *sched_class; struct sched_entity se; +#ifdef CONFIG_PREEMPT_NOTIFIERS + /* list of struct preempt_notifier: */ + struct hlist_head preempt_notifiers; +#endif + unsigned short ioprio; #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c64ce9c1420..6b066632e40 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -63,3 +63,6 @@ config PREEMPT_BKL Say Y here if you are building a kernel for a desktop system. Say N if you are unsure. +config PREEMPT_NOTIFIERS + bool + diff --git a/kernel/sched.c b/kernel/sched.c index 93cf241cfbe..e901aa59f20 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1592,6 +1592,10 @@ static void __sched_fork(struct task_struct *p) INIT_LIST_HEAD(&p->run_list); p->se.on_rq = 0; +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. 
This guarantees that @@ -1673,6 +1677,63 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) task_rq_unlock(rq, &flags); } +#ifdef CONFIG_PREEMPT_NOTIFIERS + +/** + * preempt_notifier_register - tell me when current is being being preempted + * and rescheduled + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); +} +EXPORT_SYMBOL_GPL(preempt_notifier_register); + +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * + * This is safe to call from within a preemption notifier. + */ +void preempt_notifier_unregister(struct preempt_notifier *notifier) +{ + hlist_del(¬ifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_out(notifier, next); +} + +#else + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ +} + +#endif + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -1685,8 +1746,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * prepare_task_switch sets up locking and calls architecture specific * hooks. */ -static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) { + fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -1728,6 +1792,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); + fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { @@ -1768,7 +1833,7 @@ context_switch(struct rq *rq, struct task_struct *prev, { struct mm_struct *mm, *oldmm; - prepare_task_switch(rq, next); + prepare_task_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -6335,6 +6400,10 @@ void __init sched_init(void) set_load_weight(&init_task); +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + #ifdef CONFIG_SMP nr_cpu_ids = highest_cpu + 1; open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); -- cgit v1.2.3-70-g09d2 From 018a2212950457b1093e504cd834aa0fe749da6c Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Thu, 26 Jul 2007 13:40:43 +0200 Subject: [PATCH] sched: remove unused rq->load_balance_class Remove unused rq->load_balance_class. 
Signed-off-by: Satoru Takeuchi Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e901aa59f20..cc6c1192c44 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -263,8 +263,6 @@ struct rq { unsigned int clock_warps, clock_overflows; unsigned int clock_unstable_events; - struct sched_class *load_balance_class; - atomic_t nr_iowait; #ifdef CONFIG_SMP -- cgit v1.2.3-70-g09d2 From 2cd4d0ea19713304963dbb2de5073700bfe253f5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Jul 2007 13:40:43 +0200 Subject: [PATCH] sched: make cpu_clock() not use the rq clock it is enough to disable interrupts to get the precise rq-clock of the local CPU. this also solves an NMI watchdog regression: the NMI watchdog calls touch_softlockup_watchdog(), which might deadlock on rq->lock if the NMI hits an rq-locked critical section. Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cc6c1192c44..3eed860cf29 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -383,13 +383,12 @@ static inline unsigned long long rq_clock(struct rq *rq) */ unsigned long long cpu_clock(int cpu) { - struct rq *rq = cpu_rq(cpu); unsigned long long now; unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); - now = rq_clock(rq); - spin_unlock_irqrestore(&rq->lock, flags); + local_irq_save(flags); + now = rq_clock(cpu_rq(cpu)); + local_irq_restore(flags); return now; } -- cgit v1.2.3-70-g09d2 From f33734619371ae40f34bbce001938408e6634f05 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 26 Jul 2007 13:40:43 +0200 Subject: [PATCH] sched: mark sysrq_sched_debug_show() static Only sched.c uses sysrq_sched_debug_show, and sched.c includes sched_debug.c, so all uses of sysrq_sched_debug_show occur in the same source file. Eliminates a sparse warning: warning: symbol 'sysrq_sched_debug_show' was not declared. Should it be static? Signed-off-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 29f2c21e7da..42970f723a9 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -186,7 +186,7 @@ static int sched_debug_show(struct seq_file *m, void *v) return 0; } -void sysrq_sched_debug_show(void) +static void sysrq_sched_debug_show(void) { sched_debug_show(NULL, NULL); } -- cgit v1.2.3-70-g09d2 From e692ab53473c93c0d0820618c97aa74a62ab67da Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 26 Jul 2007 13:40:43 +0200 Subject: [PATCH] sched: debug feature - make the sched-domains tree runtime-tweakable debugging feature: make the sched-domains tree runtime-tweakable. 
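The resulting sysctl layout, sketched from the table names set up below (the exact file list follows the set_table_entry() calls and may differ per domain), is roughly:

	/proc/sys/kernel/sched_domain/
		cpu0/
			domain0/
				min_interval
				max_interval
				busy_idx
				idle_idx
				newidle_idx
				wake_idx
				forkexec_idx
				busy_factor
				imbalance_pct
				cache_hot_time
				cache_nice_tries
				flags
			domain1/
				...
		cpu1/
			...

The directories are created with mode 0755 and the leaves with mode 0644, each backed by proc_dointvec_minmax() or proc_doulongvec_minmax(), so the per-domain balancing parameters can be tuned at runtime without rebuilding the kernel.
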
Signed-off-by: Andrew Morton [ mingo@elte.hu: made it depend on CONFIG_SCHED_DEBUG & small updates ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3eed860cf29..5c51d7e5dcc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -5202,10 +5203,129 @@ static void migrate_dead_tasks(unsigned int dead_cpu) if (!next) break; migrate_dead(dead_cpu, next); + } } #endif /* CONFIG_HOTPLUG_CPU */ +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) + +static struct ctl_table sd_ctl_dir[] = { + {CTL_UNNUMBERED, "sched_domain", NULL, 0, 0755, NULL, }, + {0,}, +}; + +static struct ctl_table sd_ctl_root[] = { + {CTL_UNNUMBERED, "kernel", NULL, 0, 0755, sd_ctl_dir, }, + {0,}, +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); + + BUG_ON(!entry); + memset(entry, 0, n * sizeof(struct ctl_table)); + + return entry; +} + +static void +set_table_entry(struct ctl_table *entry, int ctl_name, + const char *procname, void *data, int maxlen, + mode_t mode, proc_handler *proc_handler) +{ + entry->ctl_name = ctl_name; + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + set_table_entry(&table[0], 1, "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], 2, "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[9], 10, "cache_hot_time", &sd->cache_hot_time, + sizeof(long long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[10], 11, "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[12], 13, "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax); + + return table; +} + +static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->ctl_name = i + 1; + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0755; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; 
+static void init_sched_domain_sysctl(void) +{ + int i, cpu_num = num_online_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + sd_ctl_dir[0].child = entry; + + for (i = 0; i < cpu_num; i++, entry++) { + snprintf(buf, 32, "cpu%d", i); + entry->ctl_name = i + 1; + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0755; + entry->child = sd_alloc_ctl_cpu_table(i); + } + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} +#else +static void init_sched_domain_sysctl(void) +{ +} +#endif + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. @@ -6311,6 +6431,8 @@ void __init sched_init_smp(void) /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + init_sched_domain_sysctl(); + /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); -- cgit v1.2.3-70-g09d2 From 61df47c8da1b4ba0f243975f11efc8956de0cba6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 26 Jul 2007 10:40:56 -0700 Subject: kernel-doc fix for kmod.c Fix kmod.c: Warning(linux-2.6.23-rc1//kernel/kmod.c:364): No description found for parameter 'envp' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index beedbdc6460..9809cc1f33d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -351,11 +351,11 @@ static inline void register_pm_notifier_callback(void) {} /** * call_usermodehelper_setup - prepare to call a usermode helper - * @path - path to usermode executable - * @argv - arg vector for process - * @envp - environment for process + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process * - * Returns either NULL on allocation failure, or a subprocess_info + * Returns either %NULL on allocation failure, or a subprocess_info * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. */ -- cgit v1.2.3-70-g09d2 From 58b3b71dfaaecbf7cff1fe10c049d663f0313e5f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 26 Jul 2007 16:29:55 +0200 Subject: Fix ThinkPad T42 poweroff failure introduced by by "PM: Introduce pm_power_off_prepare" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit bd804eba1c8597cbb7cd5a5f9fe886aae16a079a ("PM: Introduce pm_power_off_prepare") caused problems in the poweroff path, as reported by YOSHIFUJI Hideaki / 吉藤英明. Generally, sysdev_shutdown() should be called after the ACPI preparation for powering the system off. To make it happen, we can separate sysdev_shutdown() from device_shutdown() and call it directly wherever necessary. Signed-off-by: Rafael J. 
Wysocki Tested-by: YOSHIFUJI Hideaki / 吉藤英明 Signed-off-by: Linus Torvalds --- drivers/base/power/shutdown.c | 2 -- include/linux/device.h | 3 +++ kernel/power/disk.c | 1 + kernel/sys.c | 3 +++ 4 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/drivers/base/power/shutdown.c b/drivers/base/power/shutdown.c index a47ee1b70d2..56e8eaaac01 100644 --- a/drivers/base/power/shutdown.c +++ b/drivers/base/power/shutdown.c @@ -44,7 +44,5 @@ void device_shutdown(void) dev->driver->shutdown(dev); } } - - sysdev_shutdown(); } diff --git a/include/linux/device.h b/include/linux/device.h index d9f0a57f5a2..3a38d1f70cb 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -551,6 +551,9 @@ extern void put_device(struct device * dev); /* drivers/base/power/shutdown.c */ extern void device_shutdown(void); +/* drivers/base/sys.c */ +extern void sysdev_shutdown(void); + /* drivers/base/firmware.c */ extern int __must_check firmware_register(struct kset *); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 324ac0188ce..eb72255b5c8 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -216,6 +216,7 @@ int hibernation_platform_enter(void) * sleep state after all */ error = hibernation_ops->prepare(); + sysdev_shutdown(); if (!error) error = hibernation_ops->enter(); } else { diff --git a/kernel/sys.c b/kernel/sys.c index 08562f41976..14f8adcfffd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -804,6 +804,7 @@ static void kernel_restart_prepare(char *cmd) blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; device_shutdown(); + sysdev_shutdown(); } /** @@ -860,6 +861,7 @@ void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); + sysdev_shutdown(); printk(KERN_EMERG "System halted.\n"); machine_halt(); } @@ -876,6 +878,7 @@ void kernel_power_off(void) kernel_shutdown_prepare(SYSTEM_POWER_OFF); if (pm_power_off_prepare) pm_power_off_prepare(); + sysdev_shutdown(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); } -- cgit v1.2.3-70-g09d2 From 0af3678f7c5872836d1cc8d7c659abd62c3c5ae7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 27 Jul 2007 14:24:33 +0100 Subject: rip some includes from linux/interrupt.h Signed-off-by: Al Viro Acked-by: Jeff Garzik Signed-off-by: Linus Torvalds --- include/linux/interrupt.h | 4 ++-- kernel/irq/devres.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0a3c2ebf200..5523f19d88d 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -11,8 +11,6 @@ #include #include #include -#include -#include #include #include #include @@ -97,6 +95,8 @@ extern int __must_check request_irq(unsigned int, irq_handler_t handler, unsigned long, const char *, void *); extern void free_irq(unsigned int, void *); +struct device; + extern int __must_check devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index d8ee241115f..6d9204f3a37 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -1,5 +1,6 @@ #include #include +#include /* * Device resource management aware IRQ request/free implementation. 
-- cgit v1.2.3-70-g09d2 From 040b3a2df2dd26c3e401823f3b0ce3fe99e966c5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 28 Jul 2007 00:55:18 +0200 Subject: audit: fix two bugs in the new execve audit code copy_from_user() returns the number of bytes not copied, hence 0 is the expected output. axi->mm might not be valid anymore when not equal to current->mm, do not dereference before checking that - thanks to Al for spotting that. Signed-off-by: Peter Zijlstra Tested-by: Steve Grubb Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bde1124d590..a777d376141 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -824,12 +824,14 @@ static void audit_log_execve_info(struct audit_buffer *ab, { int i; long len, ret; - const char __user *p = (const char __user *)axi->mm->arg_start; + const char __user *p; char *buf; if (axi->mm != current->mm) return; /* execve failed, no additional info */ + p = (const char __user *)axi->mm->arg_start; + for (i = 0; i < axi->argc; i++, p += len) { len = strnlen_user(p, MAX_ARG_STRLEN); /* @@ -855,7 +857,7 @@ static void audit_log_execve_info(struct audit_buffer *ab, * copied them here, and the mm hasn't been exposed to user- * space yet. */ - if (!ret) { + if (ret) { WARN_ON(1); send_sig(SIGKILL, current, 0); } -- cgit v1.2.3-70-g09d2 From b0cb1a19d05b8ea8611a9ef48a17fe417f1832e6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 29 Jul 2007 23:24:36 +0200 Subject: Replace CONFIG_SOFTWARE_SUSPEND with CONFIG_HIBERNATION Replace CONFIG_SOFTWARE_SUSPEND with CONFIG_HIBERNATION to avoid confusion (among other things, with CONFIG_SUSPEND introduced in the next patch). Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- arch/i386/Kconfig.debug | 4 ++-- arch/i386/kernel/e820.c | 2 +- arch/i386/mm/init.c | 2 +- arch/i386/power/Makefile | 2 +- arch/powerpc/Kconfig.debug | 2 +- arch/powerpc/configs/lite5200_defconfig | 2 +- arch/powerpc/configs/pmac32_defconfig | 2 +- arch/powerpc/kernel/Makefile | 6 +++--- arch/ppc/configs/TQM8540_defconfig | 2 +- arch/ppc/configs/TQM8541_defconfig | 2 +- arch/ppc/configs/TQM8555_defconfig | 2 +- arch/ppc/configs/TQM8560_defconfig | 2 +- arch/ppc/configs/ev64360_defconfig | 2 +- arch/ppc/configs/ml300_defconfig | 2 +- arch/ppc/configs/ml403_defconfig | 2 +- arch/ppc/configs/mpc834x_sys_defconfig | 2 +- arch/ppc/configs/prep_defconfig | 2 +- arch/sparc64/Kconfig.debug | 2 +- arch/x86_64/defconfig | 2 +- arch/x86_64/kernel/Makefile | 2 +- arch/x86_64/kernel/suspend.c | 4 ++-- drivers/acpi/sleep/main.c | 6 +++--- drivers/acpi/sleep/proc.c | 2 +- drivers/i2c/chips/tps65010.c | 2 +- include/asm-i386/e820.h | 2 +- include/linux/suspend.h | 8 ++++---- kernel/power/Kconfig | 6 +++--- kernel/power/Makefile | 2 +- kernel/power/main.c | 2 +- kernel/power/power.h | 2 +- kernel/sys.c | 2 +- mm/Kconfig | 4 ++-- mm/swapfile.c | 6 +++--- 33 files changed, 47 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug index b31c0802e1c..f03531eacdf 100644 --- a/arch/i386/Kconfig.debug +++ b/arch/i386/Kconfig.debug @@ -36,11 +36,11 @@ config DEBUG_STACK_USAGE This option will slow down process creation somewhat. 
comment "Page alloc debug is incompatible with Software Suspend on i386" - depends on DEBUG_KERNEL && SOFTWARE_SUSPEND + depends on DEBUG_KERNEL && HIBERNATION config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on DEBUG_KERNEL && !SOFTWARE_SUSPEND && !HUGETLBFS + depends on DEBUG_KERNEL && !HIBERNATION && !HUGETLBFS help Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index e60cddbc4cf..3c86b979a40 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -321,7 +321,7 @@ static int __init request_standard_resources(void) subsys_initcall(request_standard_resources); -#if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) +#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) /** * e820_mark_nosave_regions - Find the ranges of physical addresses that do not * correspond to e820 RAM areas and mark the corresponding pages as nosave for diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 4c4809f13cb..730a5b177b1 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -432,7 +432,7 @@ static void __init pagetable_init (void) paravirt_pagetable_setup_done(pgd_base); } -#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI) +#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI) /* * Swap suspend & friends need this for resume because things like the intel-agp * driver might have split up a kernel 4MB mapping. diff --git a/arch/i386/power/Makefile b/arch/i386/power/Makefile index 2de7bbf03cd..d764ec95006 100644 --- a/arch/i386/power/Makefile +++ b/arch/i386/power/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_PM) += cpu.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o suspend.o +obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 5c71624ee38..22acece95b1 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -20,7 +20,7 @@ config DEBUG_STACK_USAGE config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on DEBUG_KERNEL && !SOFTWARE_SUSPEND + depends on DEBUG_KERNEL && !HIBERNATION help Unmap pages from the kernel linear mapping after free_pages(). 
This results in a large slowdown, but helps to find certain types diff --git a/arch/powerpc/configs/lite5200_defconfig b/arch/powerpc/configs/lite5200_defconfig index d12a981398b..9c30ca45161 100644 --- a/arch/powerpc/configs/lite5200_defconfig +++ b/arch/powerpc/configs/lite5200_defconfig @@ -196,7 +196,7 @@ CONFIG_PM=y # CONFIG_PM_LEGACY is not set # CONFIG_PM_DEBUG is not set # CONFIG_PM_SYSFS_DEPRECATED is not set -# CONFIG_SOFTWARE_SUSPEND is not set +# CONFIG_HIBERNATION is not set CONFIG_SECCOMP=y # CONFIG_WANT_DEVICE_TREE is not set CONFIG_ISA_DMA_API=y diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 0d8ba623e29..08525d6fb1f 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -218,7 +218,7 @@ CONFIG_PM=y CONFIG_PM_DEBUG=y # CONFIG_DISABLE_CONSOLE_SUSPEND is not set CONFIG_PM_SYSFS_DEPRECATED=y -CONFIG_SOFTWARE_SUSPEND=y +CONFIG_HIBERNATION=y CONFIG_PM_STD_PARTITION="" CONFIG_APM_EMULATION=y CONFIG_SECCOMP=y diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 42c42ecad00..f39a72f30aa 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -37,9 +37,9 @@ obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_6xx) += idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o obj-$(CONFIG_TAU) += tau_6xx.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o suspend.o -obj32-$(CONFIG_SOFTWARE_SUSPEND) += swsusp_32.o -obj64-$(CONFIG_SOFTWARE_SUSPEND) += swsusp_64.o swsusp_asm64.o +obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o +obj32-$(CONFIG_HIBERNATION) += swsusp_32.o +obj64-$(CONFIG_HIBERNATION) += swsusp_64.o swsusp_asm64.o obj32-$(CONFIG_MODULES) += module_32.o ifeq ($(CONFIG_PPC_MERGE),y) diff --git a/arch/ppc/configs/TQM8540_defconfig b/arch/ppc/configs/TQM8540_defconfig index 99bf3b7a276..f33f0e772dc 100644 --- a/arch/ppc/configs/TQM8540_defconfig +++ b/arch/ppc/configs/TQM8540_defconfig @@ -136,7 +136,7 @@ CONFIG_BINFMT_ELF=y # CONFIG_BINFMT_MISC is not set # CONFIG_CMDLINE_BOOL is not set # CONFIG_PM is not set -# CONFIG_SOFTWARE_SUSPEND is not set +# CONFIG_HIBERNATION is not set CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y diff --git a/arch/ppc/configs/TQM8541_defconfig b/arch/ppc/configs/TQM8541_defconfig index 0ff56695d34..e00cd62daa3 100644 --- a/arch/ppc/configs/TQM8541_defconfig +++ b/arch/ppc/configs/TQM8541_defconfig @@ -138,7 +138,7 @@ CONFIG_BINFMT_ELF=y # CONFIG_BINFMT_MISC is not set # CONFIG_CMDLINE_BOOL is not set # CONFIG_PM is not set -# CONFIG_SOFTWARE_SUSPEND is not set +# CONFIG_HIBERNATION is not set CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y diff --git a/arch/ppc/configs/TQM8555_defconfig b/arch/ppc/configs/TQM8555_defconfig index 730b3db2e47..43a0d9df1e2 100644 --- a/arch/ppc/configs/TQM8555_defconfig +++ b/arch/ppc/configs/TQM8555_defconfig @@ -138,7 +138,7 @@ CONFIG_BINFMT_ELF=y # CONFIG_BINFMT_MISC is not set # CONFIG_CMDLINE_BOOL is not set # CONFIG_PM is not set -# CONFIG_SOFTWARE_SUSPEND is not set +# CONFIG_HIBERNATION is not set CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y diff --git a/arch/ppc/configs/TQM8560_defconfig b/arch/ppc/configs/TQM8560_defconfig index 1d902072825..a814d17a2be 100644 --- a/arch/ppc/configs/TQM8560_defconfig +++ b/arch/ppc/configs/TQM8560_defconfig @@ -137,7 +137,7 @@ CONFIG_BINFMT_ELF=y # CONFIG_BINFMT_MISC is not set # CONFIG_CMDLINE_BOOL is not set # CONFIG_PM is not set -# CONFIG_SOFTWARE_SUSPEND is not set +# CONFIG_HIBERNATION is not set CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y 
diff --git a/arch/ppc/configs/ev64360_defconfig b/arch/ppc/configs/ev64360_defconfig index d471e578dcb..f297c4bb632 100644 --- a/arch/ppc/configs/ev64360_defconfig +++ b/arch/ppc/configs/ev64360_defconfig @@ -142,7 +142,7 @@ CONFIG_BINFMT_MISC=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=ttyMM0,115200 root=/dev/mtdblock1 rw rootfstype=jffs2" # CONFIG_PM is not set -# CONFIG_SOFTWARE_SUSPEND is not set +# CONFIG_HIBERNATION is not set CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y diff --git a/arch/ppc/configs/ml300_defconfig b/arch/ppc/configs/ml300_defconfig index 4a33aca948c..69bad91a6b6 100644 --- a/arch/ppc/configs/ml300_defconfig +++ b/arch/ppc/configs/ml300_defconfig @@ -148,7 +148,7 @@ CONFIG_BINFMT_ELF=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=ttyS0,9600" # CONFIG_PM is not set -# CONFIG_SOFTWARE_SUSPEND is not set +# CONFIG_HIBERNATION is not set CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y diff --git a/arch/ppc/configs/ml403_defconfig b/arch/ppc/configs/ml403_defconfig index fafd2516fa5..a78896ea456 100644 --- a/arch/ppc/configs/ml403_defconfig +++ b/arch/ppc/configs/ml403_defconfig @@ -149,7 +149,7 @@ CONFIG_BINFMT_ELF=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=ttyS0,9600" # CONFIG_PM is not set -# CONFIG_SOFTWARE_SUSPEND is not set +# CONFIG_HIBERNATION is not set CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y diff --git a/arch/ppc/configs/mpc834x_sys_defconfig b/arch/ppc/configs/mpc834x_sys_defconfig index b96a6d6dad0..d90c8a7e060 100644 --- a/arch/ppc/configs/mpc834x_sys_defconfig +++ b/arch/ppc/configs/mpc834x_sys_defconfig @@ -130,7 +130,7 @@ CONFIG_BINFMT_ELF=y # CONFIG_BINFMT_MISC is not set # CONFIG_CMDLINE_BOOL is not set # CONFIG_PM is not set -# CONFIG_SOFTWARE_SUSPEND is not set +# CONFIG_HIBERNATION is not set CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y diff --git a/arch/ppc/configs/prep_defconfig b/arch/ppc/configs/prep_defconfig index 0aa333178b2..b7cee2d7140 100644 --- a/arch/ppc/configs/prep_defconfig +++ b/arch/ppc/configs/prep_defconfig @@ -166,7 +166,7 @@ CONFIG_PROC_PREPRESIDUAL=y CONFIG_PM=y # CONFIG_PM_LEGACY is not set # CONFIG_PM_DEBUG is not set -CONFIG_SOFTWARE_SUSPEND=y +CONFIG_HIBERNATION=y CONFIG_PM_STD_PARTITION="" # CONFIG_SECCOMP is not set CONFIG_ISA_DMA_API=y diff --git a/arch/sparc64/Kconfig.debug b/arch/sparc64/Kconfig.debug index 1f130f3b6c2..a5faa3683bd 100644 --- a/arch/sparc64/Kconfig.debug +++ b/arch/sparc64/Kconfig.debug @@ -29,7 +29,7 @@ config DEBUG_BOOTMEM config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on DEBUG_KERNEL && !SOFTWARE_SUSPEND + depends on DEBUG_KERNEL && !HIBERNATION help Unmap pages from the kernel linear mapping after free_pages(). 
This results in a large slowdown, but helps to find certain types diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index b7c4cd04bfc..e64f65c9d90 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -199,7 +199,7 @@ CONFIG_GENERIC_PENDING_IRQ=y CONFIG_PM=y # CONFIG_PM_LEGACY is not set # CONFIG_PM_DEBUG is not set -CONFIG_SOFTWARE_SUSPEND=y +CONFIG_HIBERNATION=y CONFIG_PM_STD_PARTITION="" CONFIG_SUSPEND_SMP=y diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 47f1dc30bf5..d1d18c1ea0f 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -26,7 +26,7 @@ obj-y += io_apic.o mpparse.o genapic.o genapic_flat.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_PM) += suspend.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o +obj-$(CONFIG_HIBERNATION) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_IOMMU) += pci-gart.o aperture.o diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index ea83a9f9196..573c0a6e0ac 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -146,7 +146,7 @@ void fix_processor_context(void) } -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION /* Defined in arch/x86_64/kernel/suspend_asm.S */ extern int restore_image(void); @@ -236,4 +236,4 @@ int pfn_is_nosave(unsigned long pfn) unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); } -#endif /* CONFIG_SOFTWARE_SUSPEND */ +#endif /* CONFIG_HIBERNATION */ diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c index ab21357c5c7..b4e94c893c8 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -202,7 +202,7 @@ static struct pm_ops acpi_pm_ops = { .finish = acpi_pm_finish, }; -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION static int acpi_hibernation_prepare(void) { return acpi_sleep_prepare(ACPI_STATE_S4); @@ -254,7 +254,7 @@ static struct hibernation_ops acpi_hibernation_ops = { .pre_restore = acpi_hibernation_pre_restore, .restore_cleanup = acpi_hibernation_restore_cleanup, }; -#endif /* CONFIG_SOFTWARE_SUSPEND */ +#endif /* CONFIG_HIBERNATION */ /** * acpi_pm_device_sleep_state - return preferred power state of ACPI device @@ -374,7 +374,7 @@ int __init acpi_sleep_init(void) pm_set_ops(&acpi_pm_ops); -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION if (sleep_states[ACPI_STATE_S4]) hibernation_set_ops(&acpi_hibernation_ops); #else diff --git a/drivers/acpi/sleep/proc.c b/drivers/acpi/sleep/proc.c index ed58e1168ae..1b7bbb5ba62 100644 --- a/drivers/acpi/sleep/proc.c +++ b/drivers/acpi/sleep/proc.c @@ -66,7 +66,7 @@ acpi_system_write_sleep(struct file *file, goto Done; } state = simple_strtoul(str, NULL, 0); -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION if (state == 4) { error = hibernate(); goto Done; diff --git a/drivers/i2c/chips/tps65010.c b/drivers/i2c/chips/tps65010.c index 3c3f2ebf3fc..503ffec2ce0 100644 --- a/drivers/i2c/chips/tps65010.c +++ b/drivers/i2c/chips/tps65010.c @@ -352,7 +352,7 @@ static void tps65010_interrupt(struct tps65010 *tps) /* REVISIT: this might need its own workqueue * plus tweaks including deadlock avoidance ... 
* also needs to get error handling and probably - * an #ifdef CONFIG_SOFTWARE_SUSPEND + * an #ifdef CONFIG_HIBERNATION */ hibernate(); #endif diff --git a/include/asm-i386/e820.h b/include/asm-i386/e820.h index 43114c82460..cf67dbb1db7 100644 --- a/include/asm-i386/e820.h +++ b/include/asm-i386/e820.h @@ -47,7 +47,7 @@ extern void e820_register_memory(void); extern void limit_regions(unsigned long long size); extern void print_memory_map(char *who); -#if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) +#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) extern void e820_mark_nosave_regions(void); #else static inline void e820_mark_nosave_regions(void) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 618f93c32b7..d16c1b85d51 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -55,7 +55,7 @@ struct hibernation_ops { }; #ifdef CONFIG_PM -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ extern void __register_nosave_region(unsigned long b, unsigned long e, int km); static inline void register_nosave_region(unsigned long b, unsigned long e) @@ -73,14 +73,14 @@ extern unsigned long get_safe_page(gfp_t gfp_mask); extern void hibernation_set_ops(struct hibernation_ops *ops); extern int hibernate(void); -#else /* CONFIG_SOFTWARE_SUSPEND */ +#else /* CONFIG_HIBERNATION */ static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} static inline void swsusp_unset_page_free(struct page *p) {} static inline void hibernation_set_ops(struct hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } -#endif /* CONFIG_SOFTWARE_SUSPEND */ +#endif /* CONFIG_HIBERNATION */ void save_processor_state(void); void restore_processor_state(void); @@ -121,7 +121,7 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) #define pm_notifier(fn, pri) do { (void)(fn); } while (0) #endif /* CONFIG_PM */ -#if !defined CONFIG_SOFTWARE_SUSPEND || !defined(CONFIG_PM) +#if !defined CONFIG_HIBERNATION || !defined(CONFIG_PM) static inline void register_nosave_region(unsigned long b, unsigned long e) { } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index c1a106d87d9..c2582a4a537 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -72,8 +72,8 @@ config PM_TRACE CAUTION: this option will cause your machine's real-time clock to be set to an invalid time after a resume. 
-config SOFTWARE_SUSPEND - bool "Software Suspend (Hibernation)" +config HIBERNATION + bool "Hibernation" depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) ---help--- Enable the suspend to disk (STD) functionality, which is usually @@ -112,7 +112,7 @@ config SOFTWARE_SUSPEND config PM_STD_PARTITION string "Default resume partition" - depends on SOFTWARE_SUSPEND + depends on HIBERNATION default "" ---help--- The default resume partition is the partition that the suspend- diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 38725f526af..c6b03764512 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -5,6 +5,6 @@ endif obj-y := main.o process.o console.o obj-$(CONFIG_PM_LEGACY) += pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/main.c b/kernel/power/main.c index 32147b57c3b..cfba6987ae7 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -292,7 +292,7 @@ static ssize_t state_show(struct kset *kset, char *buf) if (pm_states[i] && valid_state(i)) s += sprintf(s,"%s ", pm_states[i]); } -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION s += sprintf(s, "%s\n", "disk"); #else if (s != buf) diff --git a/kernel/power/power.h b/kernel/power/power.h index 5f24c786f8e..9080914796f 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -13,7 +13,7 @@ struct swsusp_info { -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION /* * Keep some memory free so that I/O operations can succeed without paging * [Might this be more than 4 MB?] diff --git a/kernel/sys.c b/kernel/sys.c index 14f8adcfffd..449b81b98b3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -954,7 +954,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user unlock_kernel(); return -EINVAL; -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION case LINUX_REBOOT_CMD_SW_SUSPEND: { int ret = hibernate(); diff --git a/mm/Kconfig b/mm/Kconfig index 86187221e78..e24d348083c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -116,11 +116,11 @@ config SPARSEMEM_EXTREME config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA - depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG + depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG depends on (IA64 || X86 || PPC64 || SUPERH) comment "Memory hotplug is currently incompatible with Software Suspend" - depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND + depends on SPARSEMEM && HOTPLUG && HIBERNATION config MEMORY_HOTPLUG_SPARSE def_bool y diff --git a/mm/swapfile.c b/mm/swapfile.c index 7ff0a81c7b0..f071648e136 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -425,7 +425,7 @@ void free_swap_and_cache(swp_entry_t entry) } } -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION /* * Find the swap type that corresponds to given device (if any). * @@ -951,7 +951,7 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) } } -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION /* * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev * corresponding to given index in swap_info (swap type). @@ -966,7 +966,7 @@ sector_t swapdev_block(int swap_type, pgoff_t offset) sis = swap_info + swap_type; return (sis->flags & SWP_WRITEOK) ? 
map_swap_page(sis, offset) : 0; } -#endif /* CONFIG_SOFTWARE_SUSPEND */ +#endif /* CONFIG_HIBERNATION */ /* * Free all of a swapdev's extent information -- cgit v1.2.3-70-g09d2 From 296699de6bdc717189a331ab6bbe90e05c94db06 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 29 Jul 2007 23:27:18 +0200 Subject: Introduce CONFIG_SUSPEND for suspend-to-Ram and standby Introduce CONFIG_SUSPEND representing the ability to enter system sleep states, such as the ACPI S3 state, and allow the user to choose SUSPEND and HIBERNATION independently of each other. Make HOTPLUG_CPU be selected automatically if SUSPEND or HIBERNATION has been chosen and the kernel is intended for SMP systems. Also, introduce CONFIG_PM_SLEEP which is automatically selected if CONFIG_SUSPEND or CONFIG_HIBERNATION is set and use it to select the code needed for both suspend and hibernation. The top-level power management headers and the ACPI code related to suspend and hibernation are modified to use the new definitions (the changes in drivers/acpi/sleep/main.c are, mostly, moving code to reduce the number of ifdefs). There are many other files in which CONFIG_PM can be replaced with CONFIG_PM_SLEEP or even with CONFIG_SUSPEND, but they can be updated in the future. Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- drivers/acpi/Kconfig | 8 ++++ drivers/acpi/sleep/Makefile | 2 +- drivers/acpi/sleep/main.c | 94 ++++++++++++++++++++++++--------------------- drivers/acpi/sleep/proc.c | 10 ++--- drivers/acpi/sleep/sleep.h | 2 + drivers/base/power/Makefile | 2 +- drivers/base/power/power.h | 4 +- include/acpi/acpi_bus.h | 9 +++++ include/acpi/acpi_drivers.h | 4 ++ include/linux/freezer.h | 6 +-- include/linux/pm.h | 15 ++++++-- include/linux/suspend.h | 10 ++--- kernel/power/Kconfig | 41 +++++++++++++++----- kernel/power/Makefile | 3 +- kernel/power/main.c | 26 +++++++++---- kernel/power/power.h | 10 ++++- mm/page_alloc.c | 4 +- 17 files changed, 164 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 22b401b2e08..66e78d52a09 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -63,6 +63,14 @@ config ACPI_PROCFS Say N to delete /proc/acpi/ files that have moved to /sys/ +config ACPI_PROCFS_SLEEP + bool "/proc/acpi/sleep (deprecated)" + depends on PM_SLEEP && ACPI_PROCFS + default n + ---help--- + Create /proc/acpi/sleep + Deprecated by /sys/power/state + config ACPI_AC tristate "AC Adapter" depends on X86 diff --git a/drivers/acpi/sleep/Makefile b/drivers/acpi/sleep/Makefile index 01a993a1d08..2bec897ab1e 100644 --- a/drivers/acpi/sleep/Makefile +++ b/drivers/acpi/sleep/Makefile @@ -1,5 +1,5 @@ obj-y := poweroff.o wakeup.o -obj-y += main.o +obj-$(CONFIG_PM_SLEEP) += main.o obj-$(CONFIG_X86) += proc.o EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c index b4e94c893c8..e8cff5dd4cb 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -21,6 +21,9 @@ u8 sleep_states[ACPI_S_STATE_COUNT]; +static u32 acpi_target_sleep_state = ACPI_STATE_S0; + +#ifdef CONFIG_SUSPEND static struct pm_ops acpi_pm_ops; extern void do_suspend_lowlevel(void); @@ -34,11 +37,6 @@ static u32 acpi_suspend_states[] = { static int init_8259A_after_S1; -extern int acpi_sleep_prepare(u32 acpi_state); -extern void acpi_power_off(void); - -static u32 acpi_target_sleep_state = ACPI_STATE_S0; - /** * acpi_pm_set_target - Set the target system sleep state to the state * associated with given @pm_state, if 
supported. @@ -163,21 +161,6 @@ static int acpi_pm_finish(suspend_state_t pm_state) return 0; } -int acpi_suspend(u32 acpi_state) -{ - suspend_state_t states[] = { - [1] = PM_SUSPEND_STANDBY, - [3] = PM_SUSPEND_MEM, - [5] = PM_SUSPEND_MAX - }; - - if (acpi_state < 6 && states[acpi_state]) - return pm_suspend(states[acpi_state]); - if (acpi_state == 4) - return hibernate(); - return -EINVAL; -} - static int acpi_pm_state_valid(suspend_state_t pm_state) { u32 acpi_state; @@ -202,6 +185,27 @@ static struct pm_ops acpi_pm_ops = { .finish = acpi_pm_finish, }; +/* + * Toshiba fails to preserve interrupts over S1, reinitialization + * of 8259 is needed after S1 resume. + */ +static int __init init_ints_after_s1(struct dmi_system_id *d) +{ + printk(KERN_WARNING "%s with broken S1 detected.\n", d->ident); + init_8259A_after_S1 = 1; + return 0; +} + +static struct dmi_system_id __initdata acpisleep_dmi_table[] = { + { + .callback = init_ints_after_s1, + .ident = "Toshiba Satellite 4030cdt", + .matches = {DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),}, + }, + {}, +}; +#endif /* CONFIG_SUSPEND */ + #ifdef CONFIG_HIBERNATION static int acpi_hibernation_prepare(void) { @@ -256,6 +260,21 @@ static struct hibernation_ops acpi_hibernation_ops = { }; #endif /* CONFIG_HIBERNATION */ +int acpi_suspend(u32 acpi_state) +{ + suspend_state_t states[] = { + [1] = PM_SUSPEND_STANDBY, + [3] = PM_SUSPEND_MEM, + [5] = PM_SUSPEND_MAX + }; + + if (acpi_state < 6 && states[acpi_state]) + return pm_suspend(states[acpi_state]); + if (acpi_state == 4) + return hibernate(); + return -EINVAL; +} + /** * acpi_pm_device_sleep_state - return preferred power state of ACPI device * in the system sleep state given by %acpi_target_sleep_state @@ -331,39 +350,22 @@ int acpi_pm_device_sleep_state(struct device *dev, int wake, int *d_min_p) return d_max; } -/* - * Toshiba fails to preserve interrupts over S1, reinitialization - * of 8259 is needed after S1 resume. 
- */ -static int __init init_ints_after_s1(struct dmi_system_id *d) -{ - printk(KERN_WARNING "%s with broken S1 detected.\n", d->ident); - init_8259A_after_S1 = 1; - return 0; -} - -static struct dmi_system_id __initdata acpisleep_dmi_table[] = { - { - .callback = init_ints_after_s1, - .ident = "Toshiba Satellite 4030cdt", - .matches = {DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),}, - }, - {}, -}; - int __init acpi_sleep_init(void) { + acpi_status status; + u8 type_a, type_b; +#ifdef CONFIG_SUSPEND int i = 0; dmi_check_system(acpisleep_dmi_table); +#endif if (acpi_disabled) return 0; +#ifdef CONFIG_SUSPEND printk(KERN_INFO PREFIX "(supports"); - for (i = 0; i < ACPI_S_STATE_COUNT; i++) { - acpi_status status; - u8 type_a, type_b; + for (i = ACPI_STATE_S0; i < ACPI_STATE_S4; i++) { status = acpi_get_sleep_type_data(i, &type_a, &type_b); if (ACPI_SUCCESS(status)) { sleep_states[i] = 1; @@ -373,10 +375,14 @@ int __init acpi_sleep_init(void) printk(")\n"); pm_set_ops(&acpi_pm_ops); +#endif #ifdef CONFIG_HIBERNATION - if (sleep_states[ACPI_STATE_S4]) + status = acpi_get_sleep_type_data(ACPI_STATE_S4, &type_a, &type_b); + if (ACPI_SUCCESS(status)) { hibernation_set_ops(&acpi_hibernation_ops); + sleep_states[ACPI_STATE_S4] = 1; + } #else sleep_states[ACPI_STATE_S4] = 0; #endif diff --git a/drivers/acpi/sleep/proc.c b/drivers/acpi/sleep/proc.c index 1b7bbb5ba62..5dfe8b78963 100644 --- a/drivers/acpi/sleep/proc.c +++ b/drivers/acpi/sleep/proc.c @@ -23,7 +23,7 @@ */ ACPI_MODULE_NAME("sleep") -#ifdef CONFIG_ACPI_PROCFS +#ifdef CONFIG_ACPI_PROCFS_SLEEP static int acpi_system_sleep_seq_show(struct seq_file *seq, void *offset) { int i; @@ -76,7 +76,7 @@ acpi_system_write_sleep(struct file *file, Done: return error ? error : count; } -#endif /* CONFIG_ACPI_PROCFS */ +#endif /* CONFIG_ACPI_PROCFS_SLEEP */ #if defined(CONFIG_RTC_DRV_CMOS) || defined(CONFIG_RTC_DRV_CMOS_MODULE) /* use /sys/class/rtc/rtcX/wakealarm instead; it's not ACPI-specific */ @@ -471,7 +471,7 @@ static const struct file_operations acpi_system_wakeup_device_fops = { .release = single_release, }; -#ifdef CONFIG_ACPI_PROCFS +#ifdef CONFIG_ACPI_PROCFS_SLEEP static const struct file_operations acpi_system_sleep_fops = { .open = acpi_system_sleep_open_fs, .read = seq_read, @@ -479,7 +479,7 @@ static const struct file_operations acpi_system_sleep_fops = { .llseek = seq_lseek, .release = single_release, }; -#endif /* CONFIG_ACPI_PROCFS */ +#endif /* CONFIG_ACPI_PROCFS_SLEEP */ #ifdef HAVE_ACPI_LEGACY_ALARM static const struct file_operations acpi_system_alarm_fops = { @@ -506,7 +506,7 @@ static int __init acpi_sleep_proc_init(void) if (acpi_disabled) return 0; -#ifdef CONFIG_ACPI_PROCFS +#ifdef CONFIG_ACPI_PROCFS_SLEEP /* 'sleep' [R/W] */ entry = create_proc_entry("sleep", S_IFREG | S_IRUGO | S_IWUSR, diff --git a/drivers/acpi/sleep/sleep.h b/drivers/acpi/sleep/sleep.h index f3e70397a7d..ff1f8504f49 100644 --- a/drivers/acpi/sleep/sleep.h +++ b/drivers/acpi/sleep/sleep.h @@ -6,3 +6,5 @@ extern void acpi_enable_wakeup_device_prep(u8 sleep_state); extern void acpi_enable_wakeup_device(u8 sleep_state); extern void acpi_disable_wakeup_device(u8 sleep_state); extern void acpi_gpe_sleep_prepare(u32 sleep_state); + +extern int acpi_sleep_prepare(u32 acpi_state); diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index 966a5e28741..9caeaea753a 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -1,5 +1,5 @@ obj-y := shutdown.o -obj-$(CONFIG_PM) += main.o suspend.o resume.o sysfs.o +obj-$(CONFIG_PM_SLEEP) 
+= main.o suspend.o resume.o sysfs.o obj-$(CONFIG_PM_TRACE) += trace.o ifeq ($(CONFIG_DEBUG_DRIVER),y) diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index 591a0dd5dee..8ba0830cbc0 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -5,7 +5,7 @@ extern void device_shutdown(void); -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP /* * main.c @@ -62,7 +62,7 @@ extern int resume_device(struct device *); */ extern int suspend_device(struct device *, pm_message_t); -#else /* CONFIG_PM */ +#else /* CONFIG_PM_SLEEP */ static inline int device_pm_add(struct device * dev) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 533ef40f7cc..3d0fea235bf 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -366,7 +366,16 @@ acpi_handle acpi_get_child(acpi_handle, acpi_integer); acpi_handle acpi_get_pci_rootbridge_handle(unsigned int, unsigned int); #define DEVICE_ACPI_HANDLE(dev) ((acpi_handle)((dev)->archdata.acpi_handle)) +#ifdef CONFIG_PM_SLEEP int acpi_pm_device_sleep_state(struct device *, int, int *); +#else /* !CONFIG_PM_SLEEP */ +static inline int acpi_pm_device_sleep_state(struct device *d, int w, int *p) +{ + if (p) + *p = ACPI_STATE_D0; + return ACPI_STATE_D3; +} +#endif /* !CONFIG_PM_SLEEP */ #endif /* CONFIG_ACPI */ diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index f85f77a538a..777d37ae81a 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -147,6 +147,10 @@ static inline void unregister_hotplug_dock_device(acpi_handle handle) /*-------------------------------------------------------------------------- Suspend/Resume -------------------------------------------------------------------------- */ +#ifdef CONFIG_PM_SLEEP extern int acpi_sleep_init(void); +#else +static inline int acpi_sleep_init(void) { return 0; } +#endif #endif /*__ACPI_DRIVERS_H__*/ diff --git a/include/linux/freezer.h b/include/linux/freezer.h index c8e02de737f..efded00ad08 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -5,7 +5,7 @@ #include -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP /* * Check if a process has been frozen */ @@ -126,7 +126,7 @@ static inline void set_freezable(void) current->flags &= ~PF_NOFREEZE; } -#else +#else /* !CONFIG_PM_SLEEP */ static inline int frozen(struct task_struct *p) { return 0; } static inline int freezing(struct task_struct *p) { return 0; } static inline void set_freeze_flag(struct task_struct *p) {} @@ -143,6 +143,6 @@ static inline void freezer_do_not_count(void) {} static inline void freezer_count(void) {} static inline int freezer_should_skip(struct task_struct *p) { return 0; } static inline void set_freezable(void) {} -#endif +#endif /* !CONFIG_PM_SLEEP */ #endif /* FREEZER_H_INCLUDED */ diff --git a/include/linux/pm.h b/include/linux/pm.h index ad3cc2eb0d3..e52f6f83c06 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -165,6 +165,7 @@ struct pm_ops { int (*finish)(suspend_state_t state); }; +#ifdef CONFIG_SUSPEND extern struct pm_ops *pm_ops; /** @@ -193,6 +194,12 @@ extern void arch_suspend_disable_irqs(void); extern void arch_suspend_enable_irqs(void); extern int pm_suspend(suspend_state_t state); +#else /* !CONFIG_SUSPEND */ +#define suspend_valid_only_mem NULL + +static inline void pm_set_ops(struct pm_ops *pm_ops) {} +static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } +#endif /* !CONFIG_SUSPEND */ /* * Device power management @@ -266,7 +273,7 @@ typedef struct pm_message { struct dev_pm_info { 
pm_message_t power_state; unsigned can_wakeup:1; -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP unsigned should_wakeup:1; struct list_head entry; #endif @@ -276,7 +283,7 @@ extern int device_power_down(pm_message_t state); extern void device_power_up(void); extern void device_resume(void); -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP extern int device_suspend(pm_message_t state); extern int device_prepare_suspend(pm_message_t state); @@ -306,7 +313,7 @@ static inline int call_platform_enable_wakeup(struct device *dev, int is_on) return 0; } -#else /* !CONFIG_PM */ +#else /* !CONFIG_PM_SLEEP */ static inline int device_suspend(pm_message_t state) { @@ -323,7 +330,7 @@ static inline int call_platform_enable_wakeup(struct device *dev, int is_on) return 0; } -#endif +#endif /* !CONFIG_PM_SLEEP */ /* changes to device_may_wakeup take effect on the next pm state change. * by default, devices should wakeup if they can. diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d16c1b85d51..388cace9751 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -24,7 +24,7 @@ struct pbe { extern void drain_local_pages(void); extern void mark_free_pages(struct zone *zone); -#if defined(CONFIG_PM) && defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) extern int pm_prepare_console(void); extern void pm_restore_console(void); #else @@ -54,7 +54,6 @@ struct hibernation_ops { void (*restore_cleanup)(void); }; -#ifdef CONFIG_PM #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ extern void __register_nosave_region(unsigned long b, unsigned long e, int km); @@ -82,6 +81,7 @@ static inline void hibernation_set_ops(struct hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } #endif /* CONFIG_HIBERNATION */ +#ifdef CONFIG_PM_SLEEP void save_processor_state(void); void restore_processor_state(void); struct saved_context; @@ -106,7 +106,7 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) { .notifier_call = fn, .priority = pri }; \ register_pm_notifier(&fn##_nb); \ } -#else /* CONFIG_PM */ +#else /* !CONFIG_PM_SLEEP */ static inline int register_pm_notifier(struct notifier_block *nb) { @@ -119,9 +119,9 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) } #define pm_notifier(fn, pri) do { (void)(fn); } while (0) -#endif /* CONFIG_PM */ +#endif /* !CONFIG_PM_SLEEP */ -#if !defined CONFIG_HIBERNATION || !defined(CONFIG_PM) +#ifndef CONFIG_HIBERNATION static inline void register_nosave_region(unsigned long b, unsigned long e) { } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index c2582a4a537..412859f8d94 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -46,7 +46,7 @@ config PM_VERBOSE config DISABLE_CONSOLE_SUSPEND bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" - depends on PM_DEBUG + depends on PM_DEBUG && PM_SLEEP default n ---help--- This option turns off the console suspend mechanism that prevents @@ -57,7 +57,7 @@ config DISABLE_CONSOLE_SUSPEND config PM_TRACE bool "Suspend/resume event tracing" - depends on PM_DEBUG && X86 && EXPERIMENTAL + depends on PM_DEBUG && X86 && PM_SLEEP && EXPERIMENTAL default n ---help--- This enables some cheesy code to save the last PM event point in the @@ -72,9 +72,37 @@ config PM_TRACE CAUTION: this option will cause your machine's real-time clock to be set to an invalid time after a resume. 
+config SUSPEND_SMP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) || (PPC64 && (PPC_PSERIES || PPC_PMAC)) + depends on SMP + default y + +config SUSPEND_SMP + bool + depends on SUSPEND_SMP_POSSIBLE && PM_SLEEP + select HOTPLUG_CPU + default y + +config PM_SLEEP + bool + depends on SUSPEND || HIBERNATION + default y + +config SUSPEND + bool "Suspend to RAM and standby" + depends on PM + depends on !SMP || SUSPEND_SMP_POSSIBLE + default y + ---help--- + Allow the system to enter sleep states in which main memory is + powered and thus its contents are preserved, such as the + suspend-to-RAM state (i.e. the ACPI S3 state). + config HIBERNATION - bool "Hibernation" - depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) + bool "Hibernation (aka 'suspend to disk')" + depends on PM && SWAP + depends on ((X86 || PPC64_SWSUSP || FRV || PPC32) && !SMP) || SUSPEND_SMP_POSSIBLE ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the @@ -132,11 +160,6 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. -config SUSPEND_SMP - bool - depends on HOTPLUG_CPU && (X86 || PPC64) && PM - default y - config APM_EMULATION tristate "Advanced Power Management Emulation" depends on PM && SYS_SUPPORTS_APM_EMULATION diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c6b03764512..f7dfff28ecd 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -3,8 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y) EXTRA_CFLAGS += -DDEBUG endif -obj-y := main.o process.o console.o +obj-y := main.o obj-$(CONFIG_PM_LEGACY) += pm.o +obj-$(CONFIG_PM_SLEEP) += process.o console.o obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/main.c b/kernel/power/main.c index cfba6987ae7..350b485b3b6 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -25,11 +25,13 @@ BLOCKING_NOTIFIER_HEAD(pm_chain_head); -/*This is just an arbitrary number */ -#define FREE_PAGE_NUMBER (100) - DEFINE_MUTEX(pm_mutex); +#ifdef CONFIG_SUSPEND + +/* This is just an arbitrary number */ +#define FREE_PAGE_NUMBER (100) + struct pm_ops *pm_ops; /** @@ -269,6 +271,8 @@ int pm_suspend(suspend_state_t state) EXPORT_SYMBOL(pm_suspend); +#endif /* CONFIG_SUSPEND */ + decl_subsys(power,NULL,NULL); @@ -285,13 +289,15 @@ decl_subsys(power,NULL,NULL); static ssize_t state_show(struct kset *kset, char *buf) { + char *s = buf; +#ifdef CONFIG_SUSPEND int i; - char * s = buf; for (i = 0; i < PM_SUSPEND_MAX; i++) { if (pm_states[i] && valid_state(i)) s += sprintf(s,"%s ", pm_states[i]); } +#endif #ifdef CONFIG_HIBERNATION s += sprintf(s, "%s\n", "disk"); #else @@ -304,11 +310,13 @@ static ssize_t state_show(struct kset *kset, char *buf) static ssize_t state_store(struct kset *kset, const char *buf, size_t n) { +#ifdef CONFIG_SUSPEND suspend_state_t state = PM_SUSPEND_STANDBY; const char * const *s; +#endif char *p; - int error; int len; + int error = -EINVAL; p = memchr(buf, '\n', n); len = p ? p - buf : n; @@ -316,17 +324,19 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n) /* First, check if we are requested to hibernate */ if (len == 4 && !strncmp(buf, "disk", len)) { error = hibernate(); - return error ? 
error : n; + goto Exit; } +#ifdef CONFIG_SUSPEND for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) break; } if (state < PM_SUSPEND_MAX && *s) error = enter_state(state); - else - error = -EINVAL; +#endif + + Exit: return error ? error : n; } diff --git a/kernel/power/power.h b/kernel/power/power.h index 9080914796f..95fbf2dd3fe 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -176,9 +176,17 @@ struct timeval; extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); +#ifdef CONFIG_SUSPEND /* kernel/power/main.c */ -extern int suspend_enter(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); +#else /* !CONFIG_SUSPEND */ +static inline int suspend_devices_and_enter(suspend_state_t state) +{ + return -ENOSYS; +} +#endif /* !CONFIG_SUSPEND */ + +/* kernel/power/common.c */ extern struct blocking_notifier_head pm_chain_head; static inline int pm_notifier_call_chain(unsigned long val) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d3550ca028..0bd4d82ddff 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -726,7 +726,7 @@ static void __drain_pages(unsigned int cpu) } } -#ifdef CONFIG_PM +#ifdef CONFIG_HIBERNATION void mark_free_pages(struct zone *zone) { @@ -772,7 +772,7 @@ void drain_local_pages(void) __drain_pages(smp_processor_id()); local_irq_restore(flags); } -#endif /* CONFIG_PM */ +#endif /* CONFIG_HIBERNATION */ /* * Free a 0-order page -- cgit v1.2.3-70-g09d2 From 673d5b43daa00b42759cecc6b0760b8bf6be80d2 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 28 Jul 2007 03:33:16 -0400 Subject: ACPI: restore CONFIG_ACPI_SLEEP Restore the 2.6.22 CONFIG_ACPI_SLEEP build option, but now shadowing the new CONFIG_PM_SLEEP option. Signed-off-by: Len Brown [ Modified to work with the PM config setup changes. ] Signed-off-by: Linus Torvalds --- arch/i386/kernel/acpi/Makefile | 2 +- arch/i386/kernel/setup.c | 2 +- arch/x86_64/kernel/acpi/Makefile | 2 +- arch/x86_64/kernel/head.S | 2 +- arch/x86_64/kernel/setup.c | 2 +- drivers/acpi/Kconfig | 5 +++++ drivers/acpi/sleep/Makefile | 4 ++-- drivers/acpi/sleep/poweroff.c | 2 ++ drivers/acpi/sleep/proc.c | 2 +- drivers/pci/pci-acpi.c | 4 ++++ drivers/pnp/pnpacpi/core.c | 4 ++++ include/acpi/acpi_drivers.h | 2 +- kernel/sysctl.c | 2 +- 13 files changed, 25 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile index 223f58fc9f4..7f7be01f44e 100644 --- a/arch/i386/kernel/acpi/Makefile +++ b/arch/i386/kernel/acpi/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_ACPI) += boot.o ifneq ($(CONFIG_PCI),) obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o endif -obj-$(CONFIG_ACPI) += sleep.o wakeup.o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o processor.o diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 7fe5da3c932..d474cd639bc 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -422,7 +422,7 @@ void __init setup_bootmem_allocator(void) */ reserve_bootmem(PAGE_SIZE, PAGE_SIZE); #endif -#ifdef CONFIG_ACPI +#ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. 
*/ diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile index 17595d23fee..080b9963f1b 100644 --- a/arch/x86_64/kernel/acpi/Makefile +++ b/arch/x86_64/kernel/acpi/Makefile @@ -1,6 +1,6 @@ obj-y := boot.o boot-y := ../../../i386/kernel/acpi/boot.o -obj-y += sleep.o wakeup.o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += processor.o diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 3a16e417dd8..e89abcdbdde 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -120,7 +120,7 @@ ident_complete: addq %rbp, trampoline_level4_pgt + 0(%rip) addq %rbp, trampoline_level4_pgt + (511*8)(%rip) #endif -#ifdef CONFIG_ACPI +#ifdef CONFIG_ACPI_SLEEP addq %rbp, wakeup_level4_pgt + 0(%rip) addq %rbp, wakeup_level4_pgt + (511*8)(%rip) #endif diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0f400f3c469..af838f6b0b7 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -333,7 +333,7 @@ void __init setup_arch(char **cmdline_p) reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); #endif -#ifdef CONFIG_ACPI +#ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. */ diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 66e78d52a09..934d639b368 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -42,6 +42,11 @@ menuconfig ACPI if ACPI +config ACPI_SLEEP + bool + depends on PM_SLEEP + default y + config ACPI_PROCFS bool "Deprecated /proc/acpi files" depends on PROC_FS diff --git a/drivers/acpi/sleep/Makefile b/drivers/acpi/sleep/Makefile index 2bec897ab1e..195a4f69c0f 100644 --- a/drivers/acpi/sleep/Makefile +++ b/drivers/acpi/sleep/Makefile @@ -1,5 +1,5 @@ obj-y := poweroff.o wakeup.o -obj-$(CONFIG_PM_SLEEP) += main.o -obj-$(CONFIG_X86) += proc.o +obj-$(CONFIG_ACPI_SLEEP) += main.o +obj-$(CONFIG_ACPI_SLEEP) += proc.o EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/sleep/poweroff.c b/drivers/acpi/sleep/poweroff.c index b3f68ef0669..39e40d56b03 100644 --- a/drivers/acpi/sleep/poweroff.c +++ b/drivers/acpi/sleep/poweroff.c @@ -18,6 +18,7 @@ int acpi_sleep_prepare(u32 acpi_state) { +#ifdef CONFIG_ACPI_SLEEP /* do we have a wakeup address for S2 and S3? */ if (acpi_state == ACPI_STATE_S3) { if (!acpi_wakeup_address) { @@ -30,6 +31,7 @@ int acpi_sleep_prepare(u32 acpi_state) } ACPI_FLUSH_CPU_CACHE(); acpi_enable_wakeup_device_prep(acpi_state); +#endif acpi_gpe_sleep_prepare(acpi_state); acpi_enter_sleep_state_prep(acpi_state); return 0; diff --git a/drivers/acpi/sleep/proc.c b/drivers/acpi/sleep/proc.c index 5dfe8b78963..66b62b0d360 100644 --- a/drivers/acpi/sleep/proc.c +++ b/drivers/acpi/sleep/proc.c @@ -78,7 +78,7 @@ acpi_system_write_sleep(struct file *file, } #endif /* CONFIG_ACPI_PROCFS_SLEEP */ -#if defined(CONFIG_RTC_DRV_CMOS) || defined(CONFIG_RTC_DRV_CMOS_MODULE) +#if defined(CONFIG_RTC_DRV_CMOS) || defined(CONFIG_RTC_DRV_CMOS_MODULE) || !defined(CONFIG_X86) /* use /sys/class/rtc/rtcX/wakealarm instead; it's not ACPI-specific */ #else #define HAVE_ACPI_LEGACY_ALARM diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 67c63d1f158..5c6a5d04300 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -220,6 +220,7 @@ acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) } EXPORT_SYMBOL(pci_osc_control_set); +#ifdef CONFIG_ACPI_SLEEP /* * _SxD returns the D-state with the highest power * (lowest D-state number) supported in the S-state "x". 
@@ -267,6 +268,7 @@ static pci_power_t acpi_pci_choose_state(struct pci_dev *pdev, } return PCI_POWER_ERROR; } +#endif static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) { @@ -340,7 +342,9 @@ static int __init acpi_pci_init(void) ret = register_acpi_bus_type(&acpi_pci_bus); if (ret) return 0; +#ifdef CONFIG_ACPI_SLEEP platform_pci_choose_state = acpi_pci_choose_state; +#endif platform_pci_set_power_state = acpi_pci_set_power_state; return 0; } diff --git a/drivers/pnp/pnpacpi/core.c b/drivers/pnp/pnpacpi/core.c index 6a2a3c2f4d5..616fc72190b 100644 --- a/drivers/pnp/pnpacpi/core.c +++ b/drivers/pnp/pnpacpi/core.c @@ -127,6 +127,7 @@ static int pnpacpi_disable_resources(struct pnp_dev *dev) return ACPI_FAILURE(status) ? -ENODEV : 0; } +#ifdef CONFIG_ACPI_SLEEP static int pnpacpi_suspend(struct pnp_dev *dev, pm_message_t state) { return acpi_bus_set_power((acpi_handle) dev->data, @@ -140,14 +141,17 @@ static int pnpacpi_resume(struct pnp_dev *dev) { return acpi_bus_set_power((acpi_handle) dev->data, ACPI_STATE_D0); } +#endif static struct pnp_protocol pnpacpi_protocol = { .name = "Plug and Play ACPI", .get = pnpacpi_get_resources, .set = pnpacpi_set_resources, .disable = pnpacpi_disable_resources, +#ifdef CONFIG_ACPI_SLEEP .suspend = pnpacpi_suspend, .resume = pnpacpi_resume, +#endif }; static int __init pnpacpi_add_device(struct acpi_device *device) diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index 777d37ae81a..202acb9ff4d 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -147,7 +147,7 @@ static inline void unregister_hotplug_dock_device(acpi_handle handle) /*-------------------------------------------------------------------------- Suspend/Resume -------------------------------------------------------------------------- */ -#ifdef CONFIG_PM_SLEEP +#ifdef CONFIG_ACPI_SLEEP extern int acpi_sleep_init(void); #else static inline int acpi_sleep_init(void) { return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index eb26f2ba51e..79c891e6266 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -689,7 +689,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#if defined(CONFIG_ACPI) && defined(CONFIG_X86) +#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) { .ctl_name = KERN_ACPI_VIDEO_FLAGS, .procname = "acpi_video_flags", -- cgit v1.2.3-70-g09d2 From 74c5b597e9c2fc728c61582afdea4971a5c8ed8f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 30 Jul 2007 11:26:38 -0700 Subject: modules: better error messages when modules fail to load due to a sysfs problem. This helps people when debugging problems like the ones that were in the recent -mm releases. 
Signed-off-by: Greg Kroah-Hartman --- kernel/params.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index effbaaedd7f..4e57732fcfb 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -567,7 +567,12 @@ static void __init kernel_param_sysfs_setup(const char *name, kobject_set_name(&mk->kobj, name); kobject_init(&mk->kobj); ret = kobject_add(&mk->kobj); - BUG_ON(ret < 0); + if (ret) { + printk(KERN_ERR "Module '%s' failed to be added to sysfs, " + "error number %d\n", name, ret); + printk(KERN_ERR "The system will be unstable now.\n"); + return; + } param_sysfs_setup(mk, kparam, num_params, name_skip); kobject_uevent(&mk->kobj, KOBJ_ADD); } -- cgit v1.2.3-70-g09d2 From 421cee293587081efef165b137514884b8472565 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 31 Jul 2007 00:37:50 -0700 Subject: sched: fix kernel-doc warnings Fix kernel-doc warnings in sched.c: Warning(linux-2623-rc1g4//kernel/sched.c:1685): No description found for parameter 'notifier' Warning(linux-2623-rc1g4//kernel/sched.c:1696): No description found for parameter 'notifier' Warning(linux-2623-rc1g4//kernel/sched.c:1750): No description found for parameter 'prev' Signed-off-by: Randy Dunlap Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5c51d7e5dcc..238a76957e8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1678,8 +1678,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) #ifdef CONFIG_PREEMPT_NOTIFIERS /** - * preempt_notifier_register - tell me when current is being being preempted - * and rescheduled + * preempt_notifier_register - tell me when current is being being preempted & rescheduled + * @notifier: notifier struct to register */ void preempt_notifier_register(struct preempt_notifier *notifier) { @@ -1689,6 +1689,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register); /** * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister * * This is safe to call from within a preemption notifier. */ @@ -1735,6 +1736,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch + * @prev: the current task that is being switched out * @next: the task we are going to switch to. * * This is called with the rq lock held and interrupts off. It must -- cgit v1.2.3-70-g09d2 From 5ea473a1dfeca2ee38c5dd458c1174d129e6b64e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Jul 2007 00:38:50 -0700 Subject: Fix leaks on /proc/{*/sched,sched_debug,timer_list,timer_stats} On every open/close one struct seq_operations leaks. Kudos to /proc/slab_allocators. 
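The mechanism behind this class of leak: single_open() kmallocs a one-shot struct seq_operations wrapping the show callback and parks it in the seq_file, and only single_release() frees that allocation before tearing the seq_file down, so pairing single_open() with plain seq_release() leaks one seq_operations per open()/close() cycle. A minimal sketch of the correct pairing follows; the demo_* names are hypothetical and not code from this patch:

#include <linux/fs.h>
#include <linux/seq_file.h>

/* One-shot show callback; single_open() allocates a temporary
 * struct seq_operations that wraps it. */
static int demo_show(struct seq_file *m, void *unused)
{
	seq_printf(m, "state: %d\n", 42);
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, NULL);
}

static const struct file_operations demo_fops = {
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	/* Must be single_release(), not seq_release(): only the former
	 * frees the seq_operations allocated by single_open(). */
	.release = single_release,
};

With .release = seq_release here, every open/close of the file would leave the kmalloc'd seq_operations behind, which is exactly the pattern /proc/slab_allocators exposed for the four files fixed below.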
Signed-off-by: Alexey Dobriyan Acked-by: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- kernel/sched_debug.c | 2 +- kernel/time/timer_list.c | 2 +- kernel/time/timer_stats.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 3c77d5a64e7..19489b0d555 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -927,7 +927,7 @@ static const struct file_operations proc_pid_sched_operations = { .read = seq_read, .write = sched_write, .llseek = seq_lseek, - .release = seq_release, + .release = single_release, }; #endif diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 42970f723a9..0eca442b779 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -200,7 +200,7 @@ static struct file_operations sched_debug_fops = { .open = sched_debug_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = single_release, }; static int __init init_sched_debug_procfs(void) diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index e5edc3a22a0..fdb2e03d4fe 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -267,7 +267,7 @@ static struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = single_release, }; static int __init init_timer_list_procfs(void) diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 8ed62fda16c..3c38fb5eae1 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -399,7 +399,7 @@ static struct file_operations tstats_fops = { .read = seq_read, .write = tstats_write, .llseek = seq_lseek, - .release = seq_release, + .release = single_release, }; void __init init_timer_stats(void) -- cgit v1.2.3-70-g09d2 From c0f3358621dc746219d49a9dee1799704d3a32f8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 31 Jul 2007 00:38:50 -0700 Subject: Fix leak on /proc/lockdep_stats Signed-off-by: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 9f17af4a249..c851b2dcc68 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -346,7 +346,7 @@ static const struct file_operations proc_lockdep_stats_operations = { .open = lockdep_stats_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = single_release, }; #ifdef CONFIG_LOCK_STAT -- cgit v1.2.3-70-g09d2 From f54f098612d7f86463b5fb4763d03533d634de73 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 31 Jul 2007 00:38:51 -0700 Subject: futex: pass nr_wake2 to futex_wake_op The fourth argument of sys_futex is ignored when op == FUTEX_WAKE_OP, but futex_wake_op expects it as its nr_wake2 parameter. The only user of this operation in glibc is always passing 1, so this bug had no consequences so far. Signed-off-by: Andreas Schwab Cc: Ingo Molnar Signed-off-by: Ulrich Drepper Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index a12425051ee..3415e9ad139 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2060,8 +2060,10 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, } /* * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. 
+ * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. */ - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -- cgit v1.2.3-70-g09d2 From e804a4a4dd596d853f6d6f814fbdcf97b8efcdea Mon Sep 17 00:00:00 2001 From: Satyam Sharma Date: Tue, 31 Jul 2007 00:39:16 -0700 Subject: kthread: silence bogus section mismatch warning WARNING: kernel/built-in.o(.text+0x16910): Section mismatch: reference to .init.text: (between 'kthreadd' and 'init_waitqueue_head') comes because kernel/kthread.c:kthreadd() is not __init but calls kthreadd_setup() which is __init. But this is ok, because kthreadd_setup() is only ever called at init time, and then kthreadd() proceeds into its "for (;;)" loop. We could mark kthreadd __init_refok, but kthreadd_setup() with just one callsite and 4 lines in it (it's been that small since 10ab825bdef8df51) doesn't need to be a separate function at all -- so let's just move those four lines at beginning of kthreadd() itself. Signed-off-by: Satyam Sharma Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index a404f7ee739..dcfe724300e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -214,23 +214,15 @@ int kthread_stop(struct task_struct *k) } EXPORT_SYMBOL(kthread_stop); - -static noinline __init_refok void kthreadd_setup(void) +int kthreadd(void *unused) { struct task_struct *tsk = current; + /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); - ignore_signals(tsk); - set_user_nice(tsk, -5); set_cpus_allowed(tsk, CPU_MASK_ALL); -} - -int kthreadd(void *unused) -{ - /* Setup a clean context for our children to inherit. */ - kthreadd_setup(); current->flags |= PF_NOFREEZE; -- cgit v1.2.3-70-g09d2 From c9b3febc5b9c55a76b838c977b078195ec8bb95e Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Tue, 31 Jul 2007 00:39:18 -0700 Subject: Fix a use after free bug in kernel->userspace relay file support Coverity spotted what looks like a real possible case of using a variable after it has been freed. The problem is in kernel/relay.c::relay_open_buf() If the code hits "goto free_buf;" it ends up in this code : free_buf: relay_destroy_buf(buf); <--- calls kfree() on 'buf'. free_name: kfree(tmpname); end: return buf; <-- use after free of 'buf'. I read through the callers and they all handle a NULL return from this function as an error (and hitting the 'free_buf' label only happens on failure to chan->cb->create_buf_file(), so that looks like a clear error to me). The patch simply sets 'buf' to NULL after the call to relay_destroy_buf(buf); - as far as I can see that should take care of the problem. The patch also corrects a reference to a documentation file while I was at it. Note from Mathieu: the documentation reference change should have been done in a separate patch, but I guess no one will really care. Signed-off-by: Jesper Juhl Acked-by: "David J. Wilder" Tested-by: "David J. 
Wilder" Signed-off-by: Mathieu Desnoyers Cc: Tom Zanussi Cc: Karim Yaghmour Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 510fbbd7b50..ad855017bc5 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1,7 +1,7 @@ /* * Public API and common code for kernel->userspace relay file support. * - * See Documentation/filesystems/relayfs.txt for an overview of relayfs. + * See Documentation/filesystems/relay.txt for an overview. * * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) @@ -426,6 +426,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) free_buf: relay_destroy_buf(buf); + buf = NULL; free_name: kfree(tmpname); end: -- cgit v1.2.3-70-g09d2 From 0fc4969b866671dfe39b1a9119d0fdc7ea0f63e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 1 Aug 2007 17:13:19 +0200 Subject: genirq: temporary fix for level-triggered IRQ resend Marcin Slusarz reported a ne2k-pci "hung network interface" regression. delayed disable relies on the ability to re-trigger the interrupt in the case that a real interrupt happens after the software disable was set. In this case we actually disable the interrupt on the hardware level _after_ it occurred. On enable_irq, we need to re-trigger the interrupt. On i386 this relies on a hardware resend mechanism (send_IPI_self()). Actually we only need the resend for edge type interrupts. Level type interrupts come back once enable_irq() re-enables the interrupt line. I assume that the interrupt in question is level triggered because it is shared and above the legacy irqs 0-15: 17: 12 IO-APIC-fasteoi eth1, eth0 Looking into the IO_APIC code, the resend via send_IPI_self() happens unconditionally. So the resend is done for level and edge interrupts. This makes the problem more mysterious. The code in question lib8390.c does disable_irq(); fiddle_with_the_network_card_hardware() enable_irq(); The fiddle_with_the_network_card_hardware() might cause interrupts, which are cleared in the same code path again, Marcin found that when he disables the irq line on the hardware level (removing the delayed disable) the card is kept alive. So the difference is that we can get a resend on enable_irq, when an interrupt happens during the time, where we are in the disabled region. Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/irq/resend.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 5bfeaed7e48..c3827274688 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -62,6 +62,15 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) */ desc->chip->enable(irq); + /* + * Temporary hack to figure out more about the problem, which + * is causing the ancient network cards to die. + */ + if (desc->handle_irq != handle_edge_irq) { + WARN_ON_ONCE(1); + return; + } + if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; -- cgit v1.2.3-70-g09d2 From 362a7016637648c6aefc98b706298baedfaa1543 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Aug 2007 17:41:40 +0200 Subject: [PATCH] sched: remove cache_hot_time remove the last unused remains of cache_hot_time. 
Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - include/linux/topology.h | 1 - kernel/sched.c | 2 -- 3 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2e490271acf..81eec7e36c8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -734,7 +734,6 @@ struct sched_domain { unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ unsigned int imbalance_pct; /* No balance until over watermark */ - unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ unsigned int busy_idx; unsigned int idle_idx; diff --git a/include/linux/topology.h b/include/linux/topology.h index d0890a7e5ba..525d437b125 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -185,7 +185,6 @@ .max_interval = 64*num_online_cpus(), \ .busy_factor = 128, \ .imbalance_pct = 133, \ - .cache_hot_time = (10*1000000), \ .cache_nice_tries = 1, \ .busy_idx = 3, \ .idle_idx = 3, \ diff --git a/kernel/sched.c b/kernel/sched.c index 238a76957e8..1641235f8e9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5269,8 +5269,6 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[9], 10, "cache_hot_time", &sd->cache_hot_time, - sizeof(long long), 0644, proc_doulongvec_minmax); set_table_entry(&table[10], 11, "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); -- cgit v1.2.3-70-g09d2 From 5a4f3ea77e1b0c72a3ec136c881eb0d64aa1d25e Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Thu, 2 Aug 2007 17:41:40 +0200 Subject: [PATCH] sched: tidy up left over smpnice code 1. The only place that RTPRIO_TO_LOAD_WEIGHT() is used is in the call to move_tasks() in the function active_load_balance() and its purpose here is just to make sure that the load to be moved is big enough to ensure that exactly one task is moved (if there's one available). This can be accomplished by using ULONG_MAX instead and this allows RTPRIO_TO_LOAD_WEIGHT() to be deleted. 2. This, in turn, allows PRIO_TO_LOAD_WEIGHT() to be deleted. 3. This allows load_weight() to be deleted which allows TIME_SLICE_NICE_ZERO to be deleted along with the comment above it. Signed-off-by: Peter Williams Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1641235f8e9..ed8cebf5328 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -727,19 +727,6 @@ static void update_curr_load(struct rq *rq, u64 now) * slice expiry etc. 
*/ -/* - * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE - * If static_prio_timeslice() is ever changed to break this assumption then - * this code will need modification - */ -#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE -#define load_weight(lp) \ - (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) -#define PRIO_TO_LOAD_WEIGHT(prio) \ - load_weight(static_prio_timeslice(prio)) -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp)) - #define WEIGHT_IDLEPRIO 2 #define WMULT_IDLEPRIO (1 << 31) @@ -2908,8 +2895,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) schedstat_inc(sd, alb_cnt); if (move_tasks(target_rq, target_cpu, busiest_rq, 1, - RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE, - NULL)) + ULONG_MAX, sd, CPU_IDLE, NULL)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); -- cgit v1.2.3-70-g09d2 From ecf691daf7afb418537ba459290191a0a5853be5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Aug 2007 17:41:40 +0200 Subject: [PATCH] sched: calc_delta_mine(): use fixed limit use fixed limit in calc_delta_mine() - this saves an instruction :) Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ed8cebf5328..b2bc8fa24ba 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -657,7 +657,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; } - return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit); + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } static inline unsigned long -- cgit v1.2.3-70-g09d2 From cb1c4fc924d7eeb3fb723ad72705d4a70e9781fd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Aug 2007 17:41:40 +0200 Subject: [PATCH] sched: uninline calc_delta_mine() uninline calc_delta_mine(): text data bss dec hex filename 29162 4162 24 33348 8244 sched.o.before 29039 4162 24 33225 81c9 sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b2bc8fa24ba..ff4aa17d65c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -637,7 +637,7 @@ static u64 div64_likely32(u64 divident, unsigned long divisor) #define WMULT_SHIFT 32 -static inline unsigned long +static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) { -- cgit v1.2.3-70-g09d2 From 4e6f96f313561d86d248edf0eaff2336d8217e1b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Aug 2007 17:41:40 +0200 Subject: [PATCH] sched: uninline inc/dec_nr_running() uninline inc_nr_running() and dec_nr_running(): text data bss dec hex filename 29039 4162 24 33225 81c9 sched.o.before 29027 4162 24 33213 81bd sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ff4aa17d65c..7bed2c58b98 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -782,13 +782,13 @@ dec_load(struct rq *rq, const struct task_struct *p, u64 now) update_load_sub(&rq->ls.load, p->se.load.weight); } -static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) +static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) { rq->nr_running++; inc_load(rq, p, now); } -static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 
now) +static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) { rq->nr_running--; dec_load(rq, p, now); -- cgit v1.2.3-70-g09d2 From cad60d93e18ba52b6f069b2edb031c89bf603b07 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Aug 2007 17:41:40 +0200 Subject: [PATCH] sched: ->task_new cleanup make sched_class.task_new == NULL a 'default method', this allows the removal of task_rt_new. Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched.c | 11 ++++++++--- kernel/sched_fair.c | 4 +--- kernel/sched_rt.c | 10 ---------- 4 files changed, 10 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 81eec7e36c8..c9e0c2a6a95 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -874,7 +874,7 @@ struct sched_class { void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); - void (*task_new) (struct rq *rq, struct task_struct *p); + void (*task_new) (struct rq *rq, struct task_struct *p, u64 now); }; struct load_weight { diff --git a/kernel/sched.c b/kernel/sched.c index 7bed2c58b98..915c75e5a27 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1641,22 +1641,27 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) unsigned long flags; struct rq *rq; int this_cpu; + u64 now; rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); this_cpu = smp_processor_id(); /* parent's CPU */ + now = rq_clock(rq); p->prio = effective_prio(p); - if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || - task_cpu(p) != this_cpu || !current->se.on_rq) { + if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || + (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || + !current->se.on_rq) { + activate_task(rq, p, 0); } else { /* * Let the scheduling class do new task startup * management (if any): */ - p->sched_class->task_new(rq, p); + p->sched_class->task_new(rq, p, now); + inc_nr_running(p, rq, now); } check_preempt_curr(rq, p); task_rq_unlock(rq, &flags); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6971db0a716..243da6cae71 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1041,11 +1041,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) * monopolize the CPU. Note: the parent runqueue is locked, * the child is not running yet. 
*/ -static void task_new_fair(struct rq *rq, struct task_struct *p) +static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now) { struct cfs_rq *cfs_rq = task_cfs_rq(p); struct sched_entity *se = &p->se; - u64 now = rq_clock(rq); sched_info_queued(p); @@ -1072,7 +1071,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) p->se.wait_runtime = -(sysctl_sched_granularity / 2); __enqueue_entity(cfs_rq, se); - inc_nr_running(p, rq, now); } #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1192a2741b9..ade20dc422f 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -229,15 +229,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) requeue_task_rt(rq, p); } -/* - * No parent/child timeslice management necessary for RT tasks, - * just activate them: - */ -static void task_new_rt(struct rq *rq, struct task_struct *p) -{ - activate_task(rq, p, 1); -} - static struct sched_class rt_sched_class __read_mostly = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, @@ -251,5 +242,4 @@ static struct sched_class rt_sched_class __read_mostly = { .load_balance = load_balance_rt, .task_tick = task_tick_rt, - .task_new = task_new_rt, }; -- cgit v1.2.3-70-g09d2 From 9c2172459a47c99adf9c968180a8a57d9ff84efa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Aug 2007 17:41:40 +0200 Subject: [PATCH] sched: move load-calculation functions move load-calculation functions so that they can use the per-policy declarations and methods. Signed-off-by: Ingo Molnar --- kernel/sched.c | 132 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 66 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 915c75e5a27..a9d374061a4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -678,46 +678,6 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec) lw->inv_weight = 0; } -static void __update_curr_load(struct rq *rq, struct load_stat *ls) -{ - if (rq->curr != rq->idle && ls->load.weight) { - ls->delta_exec += ls->delta_stat; - ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); - ls->delta_stat = 0; - } -} - -/* - * Update delta_exec, delta_fair fields for rq. - * - * delta_fair clock advances at a rate inversely proportional to - * total load (rq->ls.load.weight) on the runqueue, while - * delta_exec advances at the same rate as wall-clock (provided - * cpu is not idle). - * - * delta_exec / delta_fair is a measure of the (smoothened) load on this - * runqueue over any given interval. This (smoothened) load is used - * during load balance. - * - * This function is called /before/ updating rq->ls.load - * and when switching tasks. - */ -static void update_curr_load(struct rq *rq, u64 now) -{ - struct load_stat *ls = &rq->ls; - u64 start; - - start = ls->load_update_start; - ls->load_update_start = now; - ls->delta_stat += now - start; - /* - * Stagger updates to ls->delta_fair. Very frequent updates - * can be expensive. 
- */ - if (ls->delta_stat >= sysctl_sched_stat_granularity) - __update_curr_load(rq, ls); -} - /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that @@ -768,32 +728,6 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -static inline void -inc_load(struct rq *rq, const struct task_struct *p, u64 now) -{ - update_curr_load(rq, now); - update_load_add(&rq->ls.load, p->se.load.weight); -} - -static inline void -dec_load(struct rq *rq, const struct task_struct *p, u64 now) -{ - update_curr_load(rq, now); - update_load_sub(&rq->ls.load, p->se.load.weight); -} - -static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) -{ - rq->nr_running++; - inc_load(rq, p, now); -} - -static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) -{ - rq->nr_running--; - dec_load(rq, p, now); -} - static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); /* @@ -824,6 +758,72 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, #define sched_class_highest (&rt_sched_class) +static void __update_curr_load(struct rq *rq, struct load_stat *ls) +{ + if (rq->curr != rq->idle && ls->load.weight) { + ls->delta_exec += ls->delta_stat; + ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); + ls->delta_stat = 0; + } +} + +/* + * Update delta_exec, delta_fair fields for rq. + * + * delta_fair clock advances at a rate inversely proportional to + * total load (rq->ls.load.weight) on the runqueue, while + * delta_exec advances at the same rate as wall-clock (provided + * cpu is not idle). + * + * delta_exec / delta_fair is a measure of the (smoothened) load on this + * runqueue over any given interval. This (smoothened) load is used + * during load balance. + * + * This function is called /before/ updating rq->ls.load + * and when switching tasks. + */ +static void update_curr_load(struct rq *rq, u64 now) +{ + struct load_stat *ls = &rq->ls; + u64 start; + + start = ls->load_update_start; + ls->load_update_start = now; + ls->delta_stat += now - start; + /* + * Stagger updates to ls->delta_fair. Very frequent updates + * can be expensive. + */ + if (ls->delta_stat >= sysctl_sched_stat_granularity) + __update_curr_load(rq, ls); +} + +static inline void +inc_load(struct rq *rq, const struct task_struct *p, u64 now) +{ + update_curr_load(rq, now); + update_load_add(&rq->ls.load, p->se.load.weight); +} + +static inline void +dec_load(struct rq *rq, const struct task_struct *p, u64 now) +{ + update_curr_load(rq, now); + update_load_sub(&rq->ls.load, p->se.load.weight); +} + +static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) +{ + rq->nr_running++; + inc_load(rq, p, now); +} + +static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) +{ + rq->nr_running--; + dec_load(rq, p, now); +} + static void set_load_weight(struct task_struct *p) { task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; -- cgit v1.2.3-70-g09d2 From c3c7011969274768818842b0a08ec45d88f45b4f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Aug 2007 17:41:40 +0200 Subject: [PATCH] sched: add schedstat_set() API add the schedstat_set() API, to allow the reduction of CONFIG_SCHEDSTAT related #ifdefs. No code changed. 
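The intended call-site pattern is a straight substitution, as the follow-up "use schedstat_set() API" patch further down shows. The open-coded form

#ifdef CONFIG_SCHEDSTATS
	if (unlikely(delta_exec > curr->exec_max))
		curr->exec_max = delta_exec;
#endif

becomes a single line that compiles to nothing when CONFIG_SCHEDSTATS is off:

	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
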
Signed-off-by: Ingo Molnar --- kernel/sched_stats.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index c63c38f6fa6..c20a94dda61 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -116,6 +116,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) } # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) +# define schedstat_set(var, val) do { var = (val); } while (0) #else /* !CONFIG_SCHEDSTATS */ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -125,6 +126,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) +# define schedstat_set(var, val) do { } while (0) #endif #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) -- cgit v1.2.3-70-g09d2 From 8179ca23d513717cc5e3dc81a1ffe01af0955468 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Aug 2007 17:41:40 +0200 Subject: [PATCH] sched: use schedstat_set() API make use of the new schedstat_set() API to eliminate two #ifdef sections. No functional changes: text data bss dec hex filename 29009 4122 28 33159 8187 sched.o.before 29009 4122 28 33159 8187 sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 243da6cae71..5bf7285ad02 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -292,10 +292,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) return; delta_exec = curr->delta_exec; -#ifdef CONFIG_SCHEDSTATS - if (unlikely(delta_exec > curr->exec_max)) - curr->exec_max = delta_exec; -#endif + schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; cfs_rq->exec_clock += delta_exec; @@ -425,13 +422,7 @@ __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) { unsigned long delta_fair = se->delta_fair_run; -#ifdef CONFIG_SCHEDSTATS - { - s64 delta_wait = now - se->wait_start; - if (unlikely(delta_wait > se->wait_max)) - se->wait_max = delta_wait; - } -#endif + schedstat_set(se->wait_max, max(se->wait_max, now - se->wait_start)); if (unlikely(se->load.weight != NICE_0_LOAD)) delta_fair = calc_weighted(delta_fair, se->load.weight, -- cgit v1.2.3-70-g09d2 From 6cfb0d5d06bea2b8791f32145eae539d524e5f6c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Aug 2007 17:41:40 +0200 Subject: [PATCH] sched: reduce debug code move the rest of the debugging/instrumentation code to under CONFIG_SCHEDSTATS too. 
This reduces code size and speeds code up: text data bss dec hex filename 33044 4122 28 37194 914a sched.o.before 32708 4122 28 36858 8ffa sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched.c | 28 ++++++++++++++++++---------- kernel/sched_debug.c | 22 ++++++++++++++++------ kernel/sched_fair.c | 4 ++-- kernel/sched_rt.c | 4 ++-- 4 files changed, 38 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a9d374061a4..72bb9483d94 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -983,18 +983,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) u64 clock_offset, fair_clock_offset; clock_offset = old_rq->clock - new_rq->clock; - fair_clock_offset = old_rq->cfs.fair_clock - - new_rq->cfs.fair_clock; - if (p->se.wait_start) - p->se.wait_start -= clock_offset; + fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; + if (p->se.wait_start_fair) p->se.wait_start_fair -= fair_clock_offset; + if (p->se.sleep_start_fair) + p->se.sleep_start_fair -= fair_clock_offset; + +#ifdef CONFIG_SCHEDSTATS + if (p->se.wait_start) + p->se.wait_start -= clock_offset; if (p->se.sleep_start) p->se.sleep_start -= clock_offset; if (p->se.block_start) p->se.block_start -= clock_offset; - if (p->se.sleep_start_fair) - p->se.sleep_start_fair -= fair_clock_offset; +#endif __set_task_cpu(p, new_cpu); } @@ -1555,17 +1558,19 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) static void __sched_fork(struct task_struct *p) { p->se.wait_start_fair = 0; - p->se.wait_start = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.delta_exec = 0; p->se.delta_fair_run = 0; p->se.delta_fair_sleep = 0; p->se.wait_runtime = 0; + p->se.sleep_start_fair = 0; + +#ifdef CONFIG_SCHEDSTATS + p->se.wait_start = 0; p->se.sum_wait_runtime = 0; p->se.sum_sleep_runtime = 0; p->se.sleep_start = 0; - p->se.sleep_start_fair = 0; p->se.block_start = 0; p->se.sleep_max = 0; p->se.block_max = 0; @@ -1573,6 +1578,7 @@ static void __sched_fork(struct task_struct *p) p->se.wait_max = 0; p->se.wait_runtime_overruns = 0; p->se.wait_runtime_underruns = 0; +#endif INIT_LIST_HEAD(&p->run_list); p->se.on_rq = 0; @@ -6579,12 +6585,14 @@ void normalize_rt_tasks(void) do_each_thread(g, p) { p->se.fair_key = 0; p->se.wait_runtime = 0; + p->se.exec_start = 0; p->se.wait_start_fair = 0; + p->se.sleep_start_fair = 0; +#ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; - p->se.exec_start = 0; p->se.sleep_start = 0; - p->se.sleep_start_fair = 0; p->se.block_start = 0; +#endif task_rq(p)->cfs.fair_clock = 0; task_rq(p)->clock = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 0eca442b779..1c61e5315ad 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -44,11 +44,16 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now) (long long)p->se.wait_runtime, (long long)(p->nvcsw + p->nivcsw), p->prio, +#ifdef CONFIG_SCHEDSTATS (long long)p->se.sum_exec_runtime, (long long)p->se.sum_wait_runtime, (long long)p->se.sum_sleep_runtime, (long long)p->se.wait_runtime_overruns, - (long long)p->se.wait_runtime_underruns); + (long long)p->se.wait_runtime_underruns +#else + 0LL, 0LL, 0LL, 0LL, 0LL +#endif + ); } static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now) @@ -171,7 +176,7 @@ static int sched_debug_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.05, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n", 
init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); @@ -235,21 +240,24 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #define P(F) \ SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) - P(se.wait_start); + P(se.wait_runtime); P(se.wait_start_fair); P(se.exec_start); - P(se.sleep_start); P(se.sleep_start_fair); + P(se.sum_exec_runtime); + +#ifdef CONFIG_SCHEDSTATS + P(se.wait_start); + P(se.sleep_start); P(se.block_start); P(se.sleep_max); P(se.block_max); P(se.exec_max); P(se.wait_max); - P(se.wait_runtime); P(se.wait_runtime_overruns); P(se.wait_runtime_underruns); P(se.sum_wait_runtime); - P(se.sum_exec_runtime); +#endif SEQ_printf(m, "%-25s:%20Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); P(se.load.weight); @@ -269,7 +277,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) void proc_sched_set_task(struct task_struct *p) { +#ifdef CONFIG_SCHEDSTATS p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; +#endif p->se.sum_exec_runtime = 0; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5bf7285ad02..6f579ff5a9b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -349,7 +349,7 @@ static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) { se->wait_start_fair = cfs_rq->fair_clock; - se->wait_start = now; + schedstat_set(se->wait_start, now); } /* @@ -447,7 +447,7 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) } se->wait_start_fair = 0; - se->wait_start = 0; + schedstat_set(se->wait_start, 0); } static inline void diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index ade20dc422f..002fcf8d3f6 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -18,8 +18,8 @@ static inline void update_curr_rt(struct rq *rq, u64 now) delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - if (unlikely(delta_exec > curr->se.exec_max)) - curr->se.exec_max = delta_exec; + + schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; curr->se.exec_start = now; -- cgit v1.2.3-70-g09d2 From b6b1d87785712474d0ed80689c17107d616a1171 Mon Sep 17 00:00:00 2001 From: Daniel Ritz Date: Fri, 3 Aug 2007 16:07:43 +0200 Subject: serial: fix 8250 early console setup the early setup function serial8250_console_early_setup() can be called from non __init code (eg. hotpluggable serial ports like serial_cs) so remove the __init from the call chain to avoid crashes. 
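A minimal sketch of the rule being enforced (the function names below are illustrative, not from the patch): code marked __init sits in a section that free_initmem() discards once boot completes, so anything reachable from a later hotplug event must stay unannotated.

#include <linux/init.h>

static int __init boot_time_setup(void)	/* discarded after boot */
{
	return 0;
}

static int hotplug_time_setup(void)	/* resident; safe when serial_cs adds a port later */
{
	return 0;
}
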
Signed-off-by: Daniel Ritz Cc: Yinghai Lu Signed-off-by: Linus Torvalds --- drivers/serial/8250.c | 2 +- drivers/serial/8250_early.c | 2 +- kernel/printk.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index 2f5a5ac1b27..301313002f6 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -2514,7 +2514,7 @@ static int __init serial8250_console_setup(struct console *co, char *options) return uart_set_options(port, co, baud, parity, bits, flow); } -static int __init serial8250_console_early_setup(void) +static int serial8250_console_early_setup(void) { return serial8250_find_port_for_earlycon(); } diff --git a/drivers/serial/8250_early.c b/drivers/serial/8250_early.c index 150cad5c2eb..4d4c9f01be8 100644 --- a/drivers/serial/8250_early.c +++ b/drivers/serial/8250_early.c @@ -227,7 +227,7 @@ int __init setup_early_serial8250_console(char *cmdline) return 0; } -int __init serial8250_find_port_for_earlycon(void) +int serial8250_find_port_for_earlycon(void) { struct early_serial8250_device *device = &early_device; struct uart_port *port = &device->port; diff --git a/kernel/printk.c b/kernel/printk.c index 051d27e36a6..bd2cd062878 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -732,7 +732,7 @@ int __init add_preferred_console(char *name, int idx, char *options) return 0; } -int __init update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) +int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) { struct console_cmdline *c; int i; -- cgit v1.2.3-70-g09d2 From 247284481ca40288bd120cf0707681c3bdbee78f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 4 Aug 2007 01:04:41 +0400 Subject: Kill some obsolete sub-thread-ptrace stuff There is a couple of subtle checks which were needed to handle ptracing from the same thread group. This was deprecated a long ago, imho this code just complicates the understanding. And, the "->parent->signal->flags & SIGNAL_GROUP_EXIT" check in exit_notify() is not right. SIGNAL_GROUP_EXIT can mean exec(), not exit_group(). This means ptracer can lose a ptraced zombie on exec(). Minor problem, but still the bug. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++------ kernel/signal.c | 4 ---- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 464c2b172f0..9578c1ae19c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -813,7 +813,7 @@ static void exit_notify(struct task_struct *tsk) __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } - /* Let father know we died + /* Let father know we died * * Thread signals are configurable, but you aren't going to use * that to send signals to arbitary processes. @@ -826,9 +826,7 @@ static void exit_notify(struct task_struct *tsk) * If our self_exec id doesn't match our parent_exec_id then * we have changed execution domain as these two values started * the same after a fork. 
- * */ - if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && ( tsk->parent_exec_id != t->self_exec_id || tsk->self_exec_id != tsk->parent_exec_id) @@ -848,9 +846,7 @@ static void exit_notify(struct task_struct *tsk) } state = EXIT_ZOMBIE; - if (tsk->exit_signal == -1 && - (likely(tsk->ptrace == 0) || - unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT))) + if (tsk->exit_signal == -1 && likely(!tsk->ptrace)) state = EXIT_DEAD; tsk->exit_state = state; diff --git a/kernel/signal.c b/kernel/signal.c index ef8156a6aad..b27c01a6644 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1561,10 +1561,6 @@ static inline int may_ptrace_stop(void) (current->ptrace & PT_ATTACHED))) return 0; - if (unlikely(current->signal == current->parent->signal) && - unlikely(current->signal->flags & SIGNAL_GROUP_EXIT)) - return 0; - /* * Are we in the middle of do_coredump? * If so and our tracer is also part of the coredump stopping -- cgit v1.2.3-70-g09d2 From 6f605d83dd3906bcf69280f8754df85f80538471 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 6 Aug 2007 04:26:59 +0100 Subject: take sched_debug.c out of nasal demon territory C99 6.10.3[11]: preprocessing directive within the argument list of macro invocation => undefined behaviour. Don't do that... Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/sched_debug.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 1c61e5315ad..8421b9399e1 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -36,24 +36,24 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now) else SEQ_printf(m, " "); - SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d " - "%15Ld %15Ld %15Ld %15Ld %15Ld\n", + SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ", p->comm, p->pid, (long long)p->se.fair_key, (long long)(p->se.fair_key - rq->cfs.fair_clock), (long long)p->se.wait_runtime, (long long)(p->nvcsw + p->nivcsw), - p->prio, + p->prio); #ifdef CONFIG_SCHEDSTATS + SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", (long long)p->se.sum_exec_runtime, (long long)p->se.sum_wait_runtime, (long long)p->se.sum_sleep_runtime, (long long)p->se.wait_runtime_overruns, - (long long)p->se.wait_runtime_underruns + (long long)p->se.wait_runtime_underruns); #else - 0LL, 0LL, 0LL, 0LL, 0LL + SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", + 0LL, 0LL, 0LL, 0LL, 0LL); #endif - ); } static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now) -- cgit v1.2.3-70-g09d2 From 175fc484256e9c85e043f599ec2f6bc0d2e6c443 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 8 Aug 2007 00:01:46 +0100 Subject: fix oops in __audit_signal_info() The check for audit_signals is misplaced and the check for audit_dummy_context() is missing; as the result, if we send a signal to auditd from task with NULL ->audit_context while we have audit_signals != 0 we end up with an oops. 
Signed-off-by: Al Viro Acked-by: James Morris Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a777d376141..3401293359e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1992,19 +1992,19 @@ int __audit_signal_info(int sig, struct task_struct *t) extern uid_t audit_sig_uid; extern u32 audit_sig_sid; - if (audit_pid && t->tgid == audit_pid && - (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1)) { - audit_sig_pid = tsk->pid; - if (ctx) - audit_sig_uid = ctx->loginuid; - else - audit_sig_uid = tsk->uid; - selinux_get_task_sid(tsk, &audit_sig_sid); + if (audit_pid && t->tgid == audit_pid) { + if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { + audit_sig_pid = tsk->pid; + if (ctx) + audit_sig_uid = ctx->loginuid; + else + audit_sig_uid = tsk->uid; + selinux_get_task_sid(tsk, &audit_sig_sid); + } + if (!audit_signals || audit_dummy_context()) + return 0; } - if (!audit_signals) /* audit_context checked in wrapper */ - return 0; - /* optimize the common case by putting first signal recipient directly * in audit_context */ if (!ctx->target_pid) { -- cgit v1.2.3-70-g09d2 From 0915c4e89d311948b67cdd4c183a2efbcafcc9f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:45 +0200 Subject: sched: batch sleeper bonus batch up the sleeper bonus sum a bit more. Anything below sched-granularity is too small to make a practical difference anyway. this optimization reduces the math in high-frequency scheduling scenarios. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6f579ff5a9b..9f401588d50 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -300,7 +300,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) delta_fair = calc_delta_fair(delta_exec, lw); delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) { + if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { delta = calc_delta_mine(cfs_rq->sleeper_bonus, curr->load.weight, lw); if (unlikely(delta > cfs_rq->sleeper_bonus)) -- cgit v1.2.3-70-g09d2 From f1a438d813d416fa9f4be4e6dbd10b54c5938d89 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:45 +0200 Subject: sched: reorder update_cpu_load(rq) with the ->task_tick() call Peter Williams suggested to flip the order of update_cpu_load(rq) with the ->task_tick() call. This is a NOP for the current scheduler (the two functions are independent of each other), ->task_tick() might create some state for update_cpu_load() in the future (or in PlugSched). Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 72bb9483d94..4680f52974e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3298,9 +3298,9 @@ void scheduler_tick(void) struct task_struct *curr = rq->curr; spin_lock(&rq->lock); + update_cpu_load(rq); if (curr != rq->idle) /* FIXME: needed? 
*/ curr->sched_class->task_tick(rq, curr); - update_cpu_load(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP -- cgit v1.2.3-70-g09d2 From 4301065920b0cbde3986519582347e883b166f3e Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Thu, 9 Aug 2007 11:16:46 +0200 Subject: sched: simplify move_tasks() The move_tasks() function is currently multiplexed with two distinct capabilities: 1. attempt to move a specified amount of weighted load from one run queue to another; and 2. attempt to move a specified number of tasks from one run queue to another. The first of these capabilities is used in two places, load_balance() and load_balance_idle(), and in both of these cases the return value of move_tasks() is used purely to decide if tasks/load were moved and no notice of the actual number of tasks moved is taken. The second capability is used in exactly one place, active_load_balance(), to attempt to move exactly one task and, as before, the return value is only used as an indicator of success or failure. This multiplexing of sched_task() was introduced, by me, as part of the smpnice patches and was motivated by the fact that the alternative, one function to move specified load and one to move a single task, would have led to two functions of roughly the same complexity as the old move_tasks() (or the new balance_tasks()). However, the new modular design of the new CFS scheduler allows a simpler solution to be adopted and this patch addresses that solution by: 1. adding a new function, move_one_task(), to be used by active_load_balance(); and 2. making move_tasks() a single purpose function that tries to move a specified weighted load and returns 1 for success and 0 for failure. One of the consequences of these changes is that neither move_one_task() or the new move_tasks() care how many tasks sched_class.load_balance() moves and this enables its interface to be simplified by returning the amount of load moved as its result and removing the load_moved pointer from the argument list. This helps simplify the new move_tasks() and slightly reduces the amount of work done in each of sched_class.load_balance()'s implementations. Further simplification, e.g. changes to balance_tasks(), are possible but (slightly) complicated by the special needs of load_balance_fair() so I've left them to a later patch (if this one gets accepted). NB Since move_tasks() gets called with two run queue locks held even small reductions in overhead are worthwhile. 
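Condensed from the diff that follows, the two call sites end up looking like this (a sketch of the caller's view, not an additional change):

	/* load_balance() / load_balance_newidle(): move a weighted load;
	 * the result only says whether anything moved */
	ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance,
			      sd, idle, &all_pinned);

	/* active_load_balance(): move exactly one task */
	if (move_one_task(target_rq, target_cpu, busiest_rq, sd, CPU_IDLE))
		schedstat_inc(sd, alb_pushed);
	else
		schedstat_inc(sd, alb_failed);
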
[ mingo@elte.hu ] this change also reduces code size nicely: text data bss dec hex filename 39216 3618 24 42858 a76a sched.o.before 39173 3618 24 42815 a73f sched.o.after Signed-off-by: Peter Williams Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 +-- kernel/sched.c | 82 +++++++++++++++++++++++++++---------------------- kernel/sched_fair.c | 8 ++--- kernel/sched_idletask.c | 4 +-- kernel/sched_rt.c | 9 +++--- 5 files changed, 58 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 17249fae501..24bce423f10 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -866,11 +866,11 @@ struct sched_class { struct task_struct * (*pick_next_task) (struct rq *rq, u64 now); void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now); - int (*load_balance) (struct rq *this_rq, int this_cpu, + unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *total_load_moved); + int *all_pinned); void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index 4680f52974e..42029634ef5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2231,32 +2231,49 @@ out: } /* - * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted - * load from busiest to this_rq, as part of a balancing operation within - * "domain". Returns the number of tasks moved. + * move_tasks tries to move up to max_load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. * * Called with both runqueues locked. */ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, + unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { struct sched_class *class = sched_class_highest; - unsigned long load_moved, total_nr_moved = 0, nr_moved; - long rem_load_move = max_load_move; + unsigned long total_load_moved = 0; do { - nr_moved = class->load_balance(this_rq, this_cpu, busiest, - max_nr_move, (unsigned long)rem_load_move, - sd, idle, all_pinned, &load_moved); - total_nr_moved += nr_moved; - max_nr_move -= nr_moved; - rem_load_move -= load_moved; + total_load_moved += + class->load_balance(this_rq, this_cpu, busiest, + ULONG_MAX, max_load_move - total_load_moved, + sd, idle, all_pinned); class = class->next; - } while (class && max_nr_move && rem_load_move > 0); + } while (class && max_load_move > total_load_moved); - return total_nr_moved; + return total_load_moved > 0; +} + +/* + * move_one_task tries to move exactly one task from busiest to this_rq, as + * part of active balancing operations within "domain". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. 
+ */ +static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + struct sched_class *class; + + for (class = sched_class_highest; class; class = class->next) + if (class->load_balance(this_rq, this_cpu, busiest, + 1, ULONG_MAX, sd, idle, NULL)) + return 1; + + return 0; } /* @@ -2588,11 +2605,6 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ #define MAX_PINNED_INTERVAL 512 -static inline unsigned long minus_1_or_zero(unsigned long n) -{ - return n > 0 ? n - 1 : 0; -} - /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -2601,7 +2613,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -2642,18 +2654,17 @@ redo: schedstat_add(sd, lb_imbalance[idle], imbalance); - nr_moved = 0; + ld_moved = 0; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is + * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ local_irq_save(flags); double_rq_lock(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - minus_1_or_zero(busiest->nr_running), + ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -2661,7 +2672,7 @@ redo: /* * some other cpu did the load balance for us. 
*/ - if (nr_moved && this_cpu != smp_processor_id()) + if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); /* All tasks on this runqueue were pinned by CPU affinity */ @@ -2673,7 +2684,7 @@ redo: } } - if (!nr_moved) { + if (!ld_moved) { schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; @@ -2722,10 +2733,10 @@ redo: sd->balance_interval *= 2; } - if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && + if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; - return nr_moved; + return ld_moved; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -2757,7 +2768,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) struct sched_group *group; struct rq *busiest = NULL; unsigned long imbalance; - int nr_moved = 0; + int ld_moved = 0; int sd_idle = 0; int all_pinned = 0; cpumask_t cpus = CPU_MASK_ALL; @@ -2792,12 +2803,11 @@ redo: schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); - nr_moved = 0; + ld_moved = 0; if (busiest->nr_running > 1) { /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - minus_1_or_zero(busiest->nr_running), + ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, CPU_NEWLY_IDLE, &all_pinned); spin_unlock(&busiest->lock); @@ -2809,7 +2819,7 @@ redo: } } - if (!nr_moved) { + if (!ld_moved) { schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) @@ -2817,7 +2827,7 @@ redo: } else sd->nr_balance_failed = 0; - return nr_moved; + return ld_moved; out_balanced: schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); @@ -2905,8 +2915,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) if (likely(sd)) { schedstat_inc(sd, alb_cnt); - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, - ULONG_MAX, sd, CPU_IDLE, NULL)) + if (move_one_task(target_rq, target_cpu, busiest_rq, + sd, CPU_IDLE)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9f401588d50..7307a37cf26 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -944,11 +944,11 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) return p->prio; } -static int +static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *total_load_moved) + int *all_pinned) { struct cfs_rq *busy_cfs_rq; unsigned long load_moved, total_nr_moved = 0, nr_moved; @@ -1006,9 +1006,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, break; } - *total_load_moved = max_load_move - rem_load_move; - - return total_nr_moved; + return max_load_move - rem_load_move; } /* diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 41841e741c4..1d8d9e13d95 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -37,11 +37,11 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now) { } -static int +static unsigned long load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *total_load_moved) + int *all_pinned) { return 0; } diff --git a/kernel/sched_rt.c 
b/kernel/sched_rt.c index 002fcf8d3f6..2b0626a43cb 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -172,15 +172,16 @@ static struct task_struct *load_balance_next_rt(void *arg) return p; } -static int +static unsigned long load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *load_moved) + int *all_pinned) { int this_best_prio, best_prio, best_prio_seen = 0; int nr_moved; struct rq_iterator rt_rq_iterator; + unsigned long load_moved; best_prio = sched_find_first_bit(busiest->rt.active.bitmap); this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap); @@ -203,11 +204,11 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, rt_rq_iterator.arg = busiest; nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, - max_load_move, sd, idle, all_pinned, load_moved, + max_load_move, sd, idle, all_pinned, &load_moved, this_best_prio, best_prio, best_prio_seen, &rt_rq_iterator); - return nr_moved; + return load_moved; } static void task_tick_rt(struct rq *rq, struct task_struct *p) -- cgit v1.2.3-70-g09d2 From 9531b62f5ebf2b693bf85129d20328188f685c44 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 9 Aug 2007 11:16:46 +0200 Subject: sched: clean up sched_getaffinity() here's another tiny cleanup. The generated code is not affected (gcc is smart enough) but for people looking over the code it is just irritating to have the extra conditional. Signed-off-by: Ulrich Drepper Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 42029634ef5..50c3587b06c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4473,10 +4473,8 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) out_unlock: read_unlock(&tasklist_lock); mutex_unlock(&sched_hotcpu_mutex); - if (retval) - return retval; - return 0; + return retval; } /** -- cgit v1.2.3-70-g09d2 From 291ae5a12088e1aa87aae4899a818498be3d18eb Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 9 Aug 2007 11:16:46 +0200 Subject: sched: mark print_cfs_stats static sched_fair.c defines print_cfs_stats, and sched_debug.c uses it, but sched.c includes both sched_fair.c and sched_debug.c, so all the references to print_cfs_stats occur in the same compilation unit. Thus, mark print_cfs_stats static. Eliminates a sparse warning: warning: symbol 'print_cfs_stats' was not declared. Should it be static? 
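The structure behind the warning, sketched: sched.c builds the whole scheduler as one translation unit, so a helper defined in one included .c file and used in another never needs external linkage.

/* kernel/sched_fair.c: the definition becomes file-local */
static void print_cfs_stats(struct seq_file *m, int cpu, u64 now);

/* kernel/sched.c: both files share one translation unit, so the
 * static symbol is still visible to sched_debug.c and sparse no
 * longer asks for a declaration */
#include "sched_fair.c"
#include "sched_debug.c"
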
Signed-off-by: Josh Triplett Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7307a37cf26..edcb4b542bc 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1107,7 +1107,7 @@ struct sched_class fair_sched_class __read_mostly = { }; #ifdef CONFIG_SCHED_DEBUG -void print_cfs_stats(struct seq_file *m, int cpu, u64 now) +static void print_cfs_stats(struct seq_file *m, int cpu, u64 now) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq; -- cgit v1.2.3-70-g09d2 From 7bfd0485871df01764ca89d5679f128d870aef1a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:46 +0200 Subject: sched: uninline rq_clock() uninline rq_clock() to save 263 bytes of code: text data bss dec hex filename 39561 3642 24 43227 a8db sched.o.before 39298 3642 24 42964 a7d4 sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 50c3587b06c..0112f63ad37 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -353,7 +353,7 @@ static unsigned long long __rq_clock(struct rq *rq) return clock; } -static inline unsigned long long rq_clock(struct rq *rq) +static unsigned long long rq_clock(struct rq *rq) { int this_cpu = smp_processor_id(); -- cgit v1.2.3-70-g09d2 From 8e717b194ce3f3ac9e6acc63f66fe274cdf9cde1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:46 +0200 Subject: sched: schedule() speedup speed up schedule(): share the 'now' parameter that deactivate_task() was calculating internally. ( this also fixes the small accounting window between the deactivate call and the pick_next_task() call. ) Signed-off-by: Ingo Molnar --- kernel/sched.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0112f63ad37..49f5b281c56 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -940,10 +940,9 @@ static inline void activate_idle_task(struct task_struct *p, struct rq *rq) /* * deactivate_task - remove a task from the runqueue. 
*/ -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) +static void +deactivate_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) { - u64 now = rq_clock(rq); - if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; @@ -2122,7 +2121,7 @@ void sched_exec(void) static void pull_task(struct rq *src_rq, struct task_struct *p, struct rq *this_rq, int this_cpu) { - deactivate_task(src_rq, p, 0); + deactivate_task(src_rq, p, 0, rq_clock(src_rq)); set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); /* @@ -3446,13 +3445,14 @@ need_resched_nonpreemptible: spin_lock_irq(&rq->lock); clear_tsk_need_resched(prev); + now = __rq_clock(rq); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && unlikely(signal_pending(prev)))) { prev->state = TASK_RUNNING; } else { - deactivate_task(rq, prev, 1); + deactivate_task(rq, prev, 1, now); } switch_count = &prev->nvcsw; } @@ -3460,7 +3460,6 @@ need_resched_nonpreemptible: if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); - now = __rq_clock(rq); prev->sched_class->put_prev_task(rq, prev, now); next = pick_next_task(rq, prev, now); @@ -4220,7 +4219,7 @@ recheck: } on_rq = p->se.on_rq; if (on_rq) - deactivate_task(rq, p, 0); + deactivate_task(rq, p, 0, rq_clock(rq)); oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); if (on_rq) { @@ -4973,7 +4972,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) on_rq = p->se.on_rq; if (on_rq) - deactivate_task(rq_src, p, 0); + deactivate_task(rq_src, p, 0, rq_clock(rq_src)); set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); @@ -5387,7 +5386,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ rq = task_rq_lock(rq->idle, &flags); - deactivate_task(rq, rq->idle, 0); + deactivate_task(rq, rq->idle, 0, rq_clock(rq)); rq->idle->static_prio = MAX_PRIO; __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; @@ -6626,7 +6625,7 @@ void normalize_rt_tasks(void) on_rq = p->se.on_rq; if (on_rq) - deactivate_task(task_rq(p), p, 0); + deactivate_task(task_rq(p), p, 0, rq_clock(task_rq(p))); __setscheduler(rq, p, SCHED_NORMAL, 0); if (on_rq) { activate_task(task_rq(p), p, 0); -- cgit v1.2.3-70-g09d2 From c5dcfe72aa8d26e924cccca9725a9f7be0d4ab01 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:46 +0200 Subject: sched: clean up delta_mine cleanup: delta_mine is an unsigned value. 
no code impact: text data bss dec hex filename 27823 2726 16 30565 7765 sched.o.before 27823 2726 16 30565 7765 sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index edcb4b542bc..037b8245e53 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -283,8 +283,7 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) static inline void __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) { - unsigned long delta, delta_exec, delta_fair; - long delta_mine; + unsigned long delta, delta_exec, delta_fair, delta_mine; struct load_weight *lw = &cfs_rq->load; unsigned long load = lw->weight; -- cgit v1.2.3-70-g09d2 From fd8bb43e27bbba1b6d49552c3d588cf741dd44af Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:46 +0200 Subject: sched: delta_exec accounting fix small delta_exec accounting fix: increase delta_exec and increase sum_exec_runtime even if the task is not on the runqueue anymore. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 037b8245e53..16511e9e552 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -287,15 +287,15 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) struct load_weight *lw = &cfs_rq->load; unsigned long load = lw->weight; - if (unlikely(!load)) - return; - delta_exec = curr->delta_exec; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; cfs_rq->exec_clock += delta_exec; + if (unlikely(!load)) + return; + delta_fair = calc_delta_fair(delta_exec, lw); delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); -- cgit v1.2.3-70-g09d2 From e0361851e5647cdd62fd5c367df5d7e145769d04 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 9 Aug 2007 11:16:46 +0200 Subject: sched: remove binary sysctls from kernel.sched_domain kernel.sched_domain hierarchy is under CTL_UNNUMBERED and thus unreachable to sysctl(2). Generating .ctl_number's in such situation is not useful. 
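The resulting table style, shown as a compact before/after using the entry from the diff below: dropping .ctl_name leaves the directory reachable through /proc/sys but gives the binary sysctl(2) path nothing to bind a number to.

	/* before: positional initializer carrying an unusable CTL_UNNUMBERED */
	{CTL_UNNUMBERED, "sched_domain", NULL, 0, 0755, NULL, },

	/* after: designated initializers only, no .ctl_name at all */
	{
		.procname	= "sched_domain",
		.mode		= 0755,
	},
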
Signed-off-by: Alexey Dobriyan Signed-off-by: Ingo Molnar --- kernel/sched.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 49f5b281c56..85b93118d24 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5217,12 +5217,19 @@ static void migrate_dead_tasks(unsigned int dead_cpu) #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) static struct ctl_table sd_ctl_dir[] = { - {CTL_UNNUMBERED, "sched_domain", NULL, 0, 0755, NULL, }, + { + .procname = "sched_domain", + .mode = 0755, + }, {0,}, }; static struct ctl_table sd_ctl_root[] = { - {CTL_UNNUMBERED, "kernel", NULL, 0, 0755, sd_ctl_dir, }, + { + .procname = "kernel", + .mode = 0755, + .child = sd_ctl_dir, + }, {0,}, }; @@ -5238,11 +5245,10 @@ static struct ctl_table *sd_alloc_ctl_entry(int n) } static void -set_table_entry(struct ctl_table *entry, int ctl_name, +set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, mode_t mode, proc_handler *proc_handler) { - entry->ctl_name = ctl_name; entry->procname = procname; entry->data = data; entry->maxlen = maxlen; @@ -5255,28 +5261,28 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) { struct ctl_table *table = sd_alloc_ctl_entry(14); - set_table_entry(&table[0], 1, "min_interval", &sd->min_interval, + set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[1], 2, "max_interval", &sd->max_interval, + set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx, + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx, + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx, + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx, + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx, + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor, + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct, + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[10], 11, "cache_nice_tries", + set_table_entry(&table[10], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[12], 13, "flags", &sd->flags, + set_table_entry(&table[12], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); return table; @@ -5296,7 +5302,6 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) i = 0; for_each_domain(cpu, sd) { snprintf(buf, 32, "domain%d", i); - entry->ctl_name = i + 1; entry->procname = kstrdup(buf, GFP_KERNEL); entry->mode = 0755; entry->child = sd_alloc_ctl_domain_table(sd); @@ -5317,7 +5322,6 @@ static void init_sched_domain_sysctl(void) for (i = 0; i < cpu_num; i++, entry++) { snprintf(buf, 32, 
"cpu%d", i); - entry->ctl_name = i + 1; entry->procname = kstrdup(buf, GFP_KERNEL); entry->mode = 0755; entry->child = sd_alloc_ctl_cpu_table(i); -- cgit v1.2.3-70-g09d2 From a4ac01c36e286dd1b9a1d5cd7422c5af51dc55f8 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Thu, 9 Aug 2007 11:16:46 +0200 Subject: sched: fix bug in balance_tasks() There are two problems with balance_tasks() and how it used: 1. The variables best_prio and best_prio_seen (inherited from the old move_tasks()) were only required to handle problems caused by the active/expired arrays, the order in which they were processed and the possibility that the task with the highest priority could be on either. These issues are no longer present and the extra overhead associated with their use is unnecessary (and possibly wrong). 2. In the absence of CONFIG_FAIR_GROUP_SCHED being set, the same this_best_prio variable needs to be used by all scheduling classes or there is a risk of moving too much load. E.g. if the highest priority task on this at the beginning is a fairly low priority task and the rt class migrates a task (during its turn) then that moved task becomes the new highest priority task on this_rq but when the sched_fair class initializes its copy of this_best_prio it will get the priority of the original highest priority task as, due to the run queue locks being held, the reschedule triggered by pull_task() will not have taken place. This could result in inappropriate overriding of skip_for_load and excessive load being moved. The attached patch addresses these problems by deleting all reference to best_prio and best_prio_seen and making this_best_prio a reference parameter to the various functions involved. load_balance_fair() has also been modified so that this_best_prio is only reset (in the loop) if CONFIG_FAIR_GROUP_SCHED is set. This should preserve the effect of helping spread groups' higher priority tasks around the available CPUs while improving system performance when CONFIG_FAIR_GROUP_SCHED isn't set. 
Signed-off-by: Peter Williams Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched.c | 26 +++++++++++--------------- kernel/sched_fair.c | 32 ++++++++++++-------------------- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 19 ++----------------- 5 files changed, 27 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 24bce423f10..513b81c60e8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -870,7 +870,7 @@ struct sched_class { struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned); + int *all_pinned, int *this_best_prio); void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index 85b93118d24..1fa07c14624 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -745,8 +745,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator); + int *this_best_prio, struct rq_iterator *iterator); #include "sched_stats.h" #include "sched_rt.c" @@ -2165,8 +2164,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator) + int *this_best_prio, struct rq_iterator *iterator) { int pulled = 0, pinned = 0, skip_for_load; struct task_struct *p; @@ -2191,12 +2189,8 @@ next: */ skip_for_load = (p->se.load.weight >> 1) > rem_load_move + SCHED_LOAD_SCALE_FUZZ; - if (skip_for_load && p->prio < this_best_prio) - skip_for_load = !best_prio_seen && p->prio == best_prio; - if (skip_for_load || + if ((skip_for_load && p->prio >= *this_best_prio) || !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { - - best_prio_seen |= p->prio == best_prio; p = iterator->next(iterator->arg); goto next; } @@ -2210,8 +2204,8 @@ next: * and the prescribed amount of weighted load. 
*/ if (pulled < max_nr_move && rem_load_move > 0) { - if (p->prio < this_best_prio) - this_best_prio = p->prio; + if (p->prio < *this_best_prio) + *this_best_prio = p->prio; p = iterator->next(iterator->arg); goto next; } @@ -2243,12 +2237,13 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, { struct sched_class *class = sched_class_highest; unsigned long total_load_moved = 0; + int this_best_prio = this_rq->curr->prio; do { total_load_moved += class->load_balance(this_rq, this_cpu, busiest, ULONG_MAX, max_load_move - total_load_moved, - sd, idle, all_pinned); + sd, idle, all_pinned, &this_best_prio); class = class->next; } while (class && max_load_move > total_load_moved); @@ -2266,10 +2261,12 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle) { struct sched_class *class; + int this_best_prio = MAX_PRIO; for (class = sched_class_highest; class; class = class->next) if (class->load_balance(this_rq, this_cpu, busiest, - 1, ULONG_MAX, sd, idle, NULL)) + 1, ULONG_MAX, sd, idle, NULL, + &this_best_prio)) return 1; return 0; @@ -3184,8 +3181,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator) + int *this_best_prio, struct rq_iterator *iterator) { *load_moved = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 16511e9e552..923bed0b0c4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -929,6 +929,7 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); } +#ifdef CONFIG_FAIR_GROUP_SCHED static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) { struct sched_entity *curr; @@ -942,12 +943,13 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) return p->prio; } +#endif static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) { struct cfs_rq *busy_cfs_rq; unsigned long load_moved, total_nr_moved = 0, nr_moved; @@ -958,10 +960,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, cfs_rq_iterator.next = load_balance_next_fair; for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { +#ifdef CONFIG_FAIR_GROUP_SCHED struct cfs_rq *this_cfs_rq; - long imbalance; + long imbalances; unsigned long maxload; - int this_best_prio, best_prio, best_prio_seen = 0; this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); @@ -975,27 +977,17 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, imbalance /= 2; maxload = min(rem_load_move, imbalance); - this_best_prio = cfs_rq_best_prio(this_cfs_rq); - best_prio = cfs_rq_best_prio(busy_cfs_rq); - - /* - * Enable handling of the case where there is more than one task - * with the best priority. If the current running task is one - * of those with prio==best_prio we know it won't be moved - * and therefore it's safe to override the skip (based on load) - * of any task we find with that prio. 
- */ - if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se) - best_prio_seen = 1; - + *this_best_prio = cfs_rq_best_prio(this_cfs_rq); +#else +#define maxload rem_load_move +#endif /* pass busy_cfs_rq argument into * load_balance_[start|next]_fair iterators */ cfs_rq_iterator.arg = busy_cfs_rq; nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, maxload, sd, idle, all_pinned, - &load_moved, this_best_prio, best_prio, - best_prio_seen, &cfs_rq_iterator); + &load_moved, this_best_prio, &cfs_rq_iterator); total_nr_moved += nr_moved; max_nr_move -= nr_moved; diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 1d8d9e13d95..dc9e1068911 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -41,7 +41,7 @@ static unsigned long load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + int *all_pinned, int *this_best_prio) { return 0; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2b0626a43cb..5b559e8c8aa 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -176,26 +176,12 @@ static unsigned long load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + int *all_pinned, int *this_best_prio) { - int this_best_prio, best_prio, best_prio_seen = 0; int nr_moved; struct rq_iterator rt_rq_iterator; unsigned long load_moved; - best_prio = sched_find_first_bit(busiest->rt.active.bitmap); - this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap); - - /* - * Enable handling of the case where there is more than one task - * with the best priority. If the current running task is one - * of those with prio==best_prio we know it won't be moved - * and therefore it's safe to override the skip (based on load) - * of any task we find with that prio. - */ - if (busiest->curr->prio == best_prio) - best_prio_seen = 1; - rt_rq_iterator.start = load_balance_start_rt; rt_rq_iterator.next = load_balance_next_rt; /* pass 'busiest' rq argument into @@ -205,8 +191,7 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, max_load_move, sd, idle, all_pinned, &load_moved, - this_best_prio, best_prio, best_prio_seen, - &rt_rq_iterator); + this_best_prio, &rt_rq_iterator); return load_moved; } -- cgit v1.2.3-70-g09d2 From b04a0f4c1651a553ee1a03dc70297d66ec74db5c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:46 +0200 Subject: sched: add [__]update_rq_clock(rq) add the [__]update_rq_clock(rq) functions. (No change in functionality, just reorganization to prepare for elimination of the heavy 64-bit timestamp-passing in the scheduler.) 
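A small stand-alone sketch of the clock-update rule the new __update_rq_clock() implements (a user-space toy with invented names such as rq_sketch and sketch_update_rq_clock, not the kernel code): the per-runqueue clock accumulates only positive deltas of the raw clock, so an occasional backwards step of the raw source cannot make the runqueue clock go backwards.

	#include <stdint.h>
	#include <stdio.h>

	struct rq_sketch {
		uint64_t prev_clock_raw;	/* last raw timestamp sampled */
		uint64_t clock;			/* monotonic per-rq clock */
	};

	/* fold a new raw timestamp into the clock, never moving it backwards */
	static void sketch_update_rq_clock(struct rq_sketch *rq, uint64_t raw_now)
	{
		int64_t delta = (int64_t)(raw_now - rq->prev_clock_raw);

		if (delta > 0)		/* protect against the raw clock going backwards */
			rq->clock += (uint64_t)delta;

		rq->prev_clock_raw = raw_now;
	}

	int main(void)
	{
		struct rq_sketch rq = { 1000, 0 };

		sketch_update_rq_clock(&rq, 1500);	/* +500 */
		sketch_update_rq_clock(&rq, 1400);	/* raw went backwards: ignored */
		sketch_update_rq_clock(&rq, 1600);	/* +200 */
		printf("clock = %llu\n", (unsigned long long)rq.clock);	/* 700 */
		return 0;
	}

As the diff below shows, the kernel version additionally wraps this in update_rq_clock(), which only performs the update when running on the runqueue's own CPU, and warns under CONFIG_SCHED_DEBUG if __update_rq_clock() is called from another CPU.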
Signed-off-by: Ingo Molnar --- kernel/sched.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1fa07c14624..d613723f324 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -318,15 +318,19 @@ static inline int cpu_of(struct rq *rq) } /* - * Per-runqueue clock, as finegrained as the platform can give us: + * Update the per-runqueue clock, as finegrained as the platform can give + * us, but without assuming monotonicity, etc.: */ -static unsigned long long __rq_clock(struct rq *rq) +static void __update_rq_clock(struct rq *rq) { u64 prev_raw = rq->prev_clock_raw; u64 now = sched_clock(); s64 delta = now - prev_raw; u64 clock = rq->clock; +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); +#endif /* * Protect against sched_clock() occasionally going backwards: */ @@ -349,17 +353,24 @@ static unsigned long long __rq_clock(struct rq *rq) rq->prev_clock_raw = now; rq->clock = clock; +} - return clock; +static void update_rq_clock(struct rq *rq) +{ + if (likely(smp_processor_id() == cpu_of(rq))) + __update_rq_clock(rq); } -static unsigned long long rq_clock(struct rq *rq) +static u64 __rq_clock(struct rq *rq) { - int this_cpu = smp_processor_id(); + __update_rq_clock(rq); - if (this_cpu == cpu_of(rq)) - return __rq_clock(rq); + return rq->clock; +} +static u64 rq_clock(struct rq *rq) +{ + update_rq_clock(rq); return rq->clock; } @@ -386,9 +397,12 @@ unsigned long long cpu_clock(int cpu) { unsigned long long now; unsigned long flags; + struct rq *rq; local_irq_save(flags); - now = rq_clock(cpu_rq(cpu)); + rq = cpu_rq(cpu); + update_rq_clock(rq); + now = rq->clock; local_irq_restore(flags); return now; -- cgit v1.2.3-70-g09d2 From a8e504d2a57ecd3f905b402072cdd1903f963bef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:47 +0200 Subject: sched: eliminate rq_clock() use eliminate rq_clock() use by changing it to: update_rq_clock(rq) now = rq->clock; identity transformation - no change in behavior. 
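Schematically, every caller is converted by the same two-line substitution; a toy illustration with stand-in names (rq_sketch, update_rq_clock_sketch, activate_sketch), since the real change below is purely mechanical:

	#include <stdio.h>

	/* stand-ins, not kernel code */
	struct rq_sketch { unsigned long long clock; };
	static void update_rq_clock_sketch(struct rq_sketch *rq) { rq->clock++; }

	static unsigned long long activate_sketch(struct rq_sketch *rq)
	{
		unsigned long long now;

		/* old style:  now = rq_clock(rq);            */
		/* new style:  update the clock, then read it  */
		update_rq_clock_sketch(rq);
		now = rq->clock;

		return now;
	}

	int main(void)
	{
		struct rq_sketch rq = { 0 };

		printf("%llu\n", activate_sketch(&rq));
		return 0;
	}
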
Signed-off-by: Ingo Molnar --- kernel/sched.c | 49 ++++++++++++++++++++++++++++++++++--------------- kernel/sched_fair.c | 8 ++++++-- 2 files changed, 40 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d613723f324..fe3c152d0c6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -927,7 +927,10 @@ static int effective_prio(struct task_struct *p) */ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { - u64 now = rq_clock(rq); + u64 now; + + update_rq_clock(rq); + now = rq->clock; if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; @@ -941,7 +944,10 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) */ static inline void activate_idle_task(struct task_struct *p, struct rq *rq) { - u64 now = rq_clock(rq); + u64 now; + + update_rq_clock(rq); + now = rq->clock; if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; @@ -1664,7 +1670,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); this_cpu = smp_processor_id(); /* parent's CPU */ - now = rq_clock(rq); + update_rq_clock(rq); + now = rq->clock; p->prio = effective_prio(p); @@ -2134,7 +2141,8 @@ void sched_exec(void) static void pull_task(struct rq *src_rq, struct task_struct *p, struct rq *this_rq, int this_cpu) { - deactivate_task(src_rq, p, 0, rq_clock(src_rq)); + update_rq_clock(src_rq); + deactivate_task(src_rq, p, 0, src_rq->clock); set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); /* @@ -3221,7 +3229,8 @@ unsigned long long task_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime; if (rq->curr == p) { - delta_exec = rq_clock(rq) - p->se.exec_start; + update_rq_clock(rq); + delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) ns += delta_exec; } @@ -3919,7 +3928,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); - now = rq_clock(rq); + update_rq_clock(rq); + now = rq->clock; oldprio = p->prio; on_rq = p->se.on_rq; @@ -3966,7 +3976,8 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. 
*/ rq = task_rq_lock(p, &flags); - now = rq_clock(rq); + update_rq_clock(rq); + now = rq->clock; /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -4228,8 +4239,10 @@ recheck: goto recheck; } on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(rq, p, 0, rq_clock(rq)); + if (on_rq) { + update_rq_clock(rq); + deactivate_task(rq, p, 0, rq->clock); + } oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); if (on_rq) { @@ -4981,8 +4994,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) goto out; on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(rq_src, p, 0, rq_clock(rq_src)); + if (on_rq) { + update_rq_clock(rq_src); + deactivate_task(rq_src, p, 0, rq_src->clock); + } set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); @@ -5215,7 +5230,8 @@ static void migrate_dead_tasks(unsigned int dead_cpu) for ( ; ; ) { if (!rq->nr_running) break; - next = pick_next_task(rq, rq->curr, rq_clock(rq)); + update_rq_clock(rq); + next = pick_next_task(rq, rq->curr, rq->clock); if (!next) break; migrate_dead(dead_cpu, next); @@ -5400,7 +5416,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ rq = task_rq_lock(rq->idle, &flags); - deactivate_task(rq, rq->idle, 0, rq_clock(rq)); + update_rq_clock(rq); + deactivate_task(rq, rq->idle, 0, rq->clock); rq->idle->static_prio = MAX_PRIO; __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; @@ -6638,8 +6655,10 @@ void normalize_rt_tasks(void) #endif on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(task_rq(p), p, 0, rq_clock(task_rq(p))); + if (on_rq) { + update_rq_clock(task_rq(p)); + deactivate_task(task_rq(p), p, 0, task_rq(p)->clock); + } __setscheduler(rq, p, SCHED_NORMAL, 0); if (on_rq) { activate_task(task_rq(p), p, 0); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 923bed0b0c4..969f08c8bd3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -844,7 +844,8 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) unsigned long gran; if (unlikely(rt_prio(p->prio))) { - update_curr(cfs_rq, rq_clock(rq)); + update_rq_clock(rq); + update_curr(cfs_rq, rq->clock); resched_task(curr); return; } @@ -1063,9 +1064,12 @@ static void set_curr_task_fair(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se; - u64 now = rq_clock(rq); + u64 now; struct cfs_rq *cfs_rq; + update_rq_clock(rq); + now = rq->clock; + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); set_next_entity(cfs_rq, se, now); -- cgit v1.2.3-70-g09d2 From 2ab81159fa426bd09c21faf7c25fba13bc9d2902 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:47 +0200 Subject: sched: remove rq_clock() remove the now unused rq_clock() function. Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index fe3c152d0c6..89321105479 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -368,12 +368,6 @@ static u64 __rq_clock(struct rq *rq) return rq->clock; } -static u64 rq_clock(struct rq *rq) -{ - update_rq_clock(rq); - return rq->clock; -} - /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. 
-- cgit v1.2.3-70-g09d2 From c1b3da3ecdbf9e9f377474c11ba988b8821f86c8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:47 +0200 Subject: sched: eliminate __rq_clock() use eliminate __rq_clock() use by changing it to: __update_rq_clock(rq) now = rq->clock; identity transformation - no change in behavior. Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- kernel/sched_fair.c | 9 +++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 89321105479..d6734517517 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1967,9 +1967,12 @@ static void update_cpu_load(struct rq *this_rq) unsigned long total_load = this_rq->ls.load.weight; unsigned long this_load = total_load; struct load_stat *ls = &this_rq->ls; - u64 now = __rq_clock(this_rq); + u64 now; int i, scale; + __update_rq_clock(this_rq); + now = this_rq->clock; + this_rq->nr_load_updates++; if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) goto do_avg; @@ -3458,7 +3461,8 @@ need_resched_nonpreemptible: spin_lock_irq(&rq->lock); clear_tsk_need_resched(prev); - now = __rq_clock(rq); + __update_rq_clock(rq); + now = rq->clock; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 969f08c8bd3..bd20fad3def 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -672,7 +672,10 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { struct rq *rq = rq_of(cfs_rq); struct sched_entity *next; - u64 now = __rq_clock(rq); + u64 now; + + __update_rq_clock(rq); + now = rq->clock; /* * Dequeue and enqueue the task to update its @@ -824,8 +827,10 @@ dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) static void yield_task_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - u64 now = __rq_clock(rq); + u64 now; + __update_rq_clock(rq); + now = rq->clock; /* * Dequeue and enqueue the task to update its * position within the tree: -- cgit v1.2.3-70-g09d2 From eb59449400f1e5984509e502711141302a2867ab Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:47 +0200 Subject: sched: remove __rq_clock() remove the (now unused) __rq_clock() function. Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d6734517517..65eb484dc26 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -361,13 +361,6 @@ static void update_rq_clock(struct rq *rq) __update_rq_clock(rq); } -static u64 __rq_clock(struct rq *rq) -{ - __update_rq_clock(rq); - - return rq->clock; -} - /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. -- cgit v1.2.3-70-g09d2 From d281918d7c135c555d9cebcf73d4320efa8177dc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:47 +0200 Subject: sched: remove 'now' use from assignments change all 'now' timestamp uses in assignments to rq->clock. ( this is an identity transformation that causes no functionality change: all such new rq->clock is necessarily preceded by an update_rq_clock() call. 
) Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- kernel/sched_fair.c | 19 ++++++++++--------- kernel/sched_rt.c | 6 +++--- 3 files changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 65eb484dc26..49a5fb0cdea 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -788,8 +788,8 @@ static void update_curr_load(struct rq *rq, u64 now) u64 start; start = ls->load_update_start; - ls->load_update_start = now; - ls->delta_stat += now - start; + ls->load_update_start = rq->clock; + ls->delta_stat += rq->clock - start; /* * Stagger updates to ls->delta_fair. Very frequent updates * can be expensive. @@ -1979,8 +1979,8 @@ static void update_cpu_load(struct rq *this_rq) exec_delta64 = ls->delta_exec + 1; ls->delta_exec = 0; - sample_interval64 = now - ls->load_update_last; - ls->load_update_last = now; + sample_interval64 = this_rq->clock - ls->load_update_last; + ls->load_update_last = this_rq->clock; if ((s64)sample_interval64 < (s64)TICK_NSEC) sample_interval64 = TICK_NSEC; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bd20fad3def..bcf5fc59e8e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -333,7 +333,7 @@ static void update_curr(struct cfs_rq *cfs_rq, u64 now) * since the last time we changed load (this cannot * overflow on 32 bits): */ - delta_exec = (unsigned long)(now - curr->exec_start); + delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); curr->delta_exec += delta_exec; @@ -341,14 +341,14 @@ static void update_curr(struct cfs_rq *cfs_rq, u64 now) __update_curr(cfs_rq, curr, now); curr->delta_exec = 0; } - curr->exec_start = now; + curr->exec_start = rq_of(cfs_rq)->clock; } static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) { se->wait_start_fair = cfs_rq->fair_clock; - schedstat_set(se->wait_start, now); + schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); } /* @@ -421,7 +421,8 @@ __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) { unsigned long delta_fair = se->delta_fair_run; - schedstat_set(se->wait_max, max(se->wait_max, now - se->wait_start)); + schedstat_set(se->wait_max, max(se->wait_max, + rq_of(cfs_rq)->clock - se->wait_start)); if (unlikely(se->load.weight != NICE_0_LOAD)) delta_fair = calc_weighted(delta_fair, se->load.weight, @@ -470,7 +471,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) /* * We are starting a new run period: */ - se->exec_start = now; + se->exec_start = rq_of(cfs_rq)->clock; } /* @@ -545,7 +546,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) #ifdef CONFIG_SCHEDSTATS if (se->sleep_start) { - u64 delta = now - se->sleep_start; + u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; if ((s64)delta < 0) delta = 0; @@ -557,7 +558,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) se->sum_sleep_runtime += delta; } if (se->block_start) { - u64 delta = now - se->block_start; + u64 delta = rq_of(cfs_rq)->clock - se->block_start; if ((s64)delta < 0) delta = 0; @@ -599,9 +600,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - se->sleep_start = now; + se->sleep_start = rq_of(cfs_rq)->clock; if (tsk->state & TASK_UNINTERRUPTIBLE) - se->block_start = now; + se->block_start = rq_of(cfs_rq)->clock; } cfs_rq->wait_runtime -= se->wait_runtime; #endif diff --git a/kernel/sched_rt.c 
b/kernel/sched_rt.c index 5b559e8c8aa..5fbd87ad0f5 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -15,14 +15,14 @@ static inline void update_curr_rt(struct rq *rq, u64 now) if (!task_has_rt_policy(curr)) return; - delta_exec = now - curr->se.exec_start; + delta_exec = rq->clock - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; - curr->se.exec_start = now; + curr->se.exec_start = rq->clock; } static void @@ -89,7 +89,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) queue = array->queue + idx; next = list_entry(queue->next, struct task_struct, run_list); - next->se.exec_start = now; + next->se.exec_start = rq->clock; return next; } -- cgit v1.2.3-70-g09d2 From 5cef9eca3837a8dcf605a360e213c4179a07c41a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:47 +0200 Subject: sched: remove the 'u64 now' parameter from print_cfs_rq() remove the 'u64 now' parameter from print_cfs_rq(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- kernel/sched_debug.c | 4 ++-- kernel/sched_fair.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 513b81c60e8..62ddddb49db 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -139,7 +139,7 @@ struct cfs_rq; extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); extern void -print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now); +print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); #else static inline void proc_sched_show_task(struct task_struct *p, struct seq_file *m) @@ -149,7 +149,7 @@ static inline void proc_sched_set_task(struct task_struct *p) { } static inline void -print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) +print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { } #endif diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 8421b9399e1..f977ee53f8c 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -106,7 +106,7 @@ print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) (long long)wait_runtime_rq_sum); } -void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) +void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq); @@ -166,7 +166,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P(cpu_load[4]); #undef P - print_cfs_stats(m, cpu, now); + print_cfs_stats(m, cpu); print_rq(m, rq, cpu, now); } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bcf5fc59e8e..025ac532b27 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1108,12 +1108,12 @@ struct sched_class fair_sched_class __read_mostly = { }; #ifdef CONFIG_SCHED_DEBUG -static void print_cfs_stats(struct seq_file *m, int cpu, u64 now) +static void print_cfs_stats(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq; for_each_leaf_cfs_rq(rq, cfs_rq) - print_cfs_rq(m, cpu, cfs_rq, now); + print_cfs_rq(m, cpu, cfs_rq); } #endif -- cgit v1.2.3-70-g09d2 From b7cc089657c12340077fe937380f9e54bbd6b300 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:47 +0200 Subject: sched: remove the 'u64 
now' parameter from update_curr() remove the 'u64 now' parameter from update_curr(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 025ac532b27..79875988282 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -281,7 +281,7 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) * are not in our scheduling class. */ static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) +__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) { unsigned long delta, delta_exec, delta_fair, delta_mine; struct load_weight *lw = &cfs_rq->load; @@ -320,7 +320,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); } -static void update_curr(struct cfs_rq *cfs_rq, u64 now) +static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq_curr(cfs_rq); unsigned long delta_exec; @@ -338,7 +338,7 @@ static void update_curr(struct cfs_rq *cfs_rq, u64 now) curr->delta_exec += delta_exec; if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { - __update_curr(cfs_rq, curr, now); + __update_curr(cfs_rq, curr); curr->delta_exec = 0; } curr->exec_start = rq_of(cfs_rq)->clock; @@ -453,7 +453,7 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) { - update_curr(cfs_rq, now); + update_curr(cfs_rq); /* * Mark the end of the wait period if dequeueing a * waiting task: @@ -579,7 +579,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, /* * Update the fair clock. */ - update_curr(cfs_rq, now); + update_curr(cfs_rq); if (wakeup) enqueue_sleeper(cfs_rq, se, now); @@ -660,7 +660,7 @@ put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) * was not called and update_curr() has to be done: */ if (prev->on_rq) - update_curr(cfs_rq, now); + update_curr(cfs_rq); update_stats_curr_end(cfs_rq, prev, now); @@ -851,7 +851,7 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); - update_curr(cfs_rq, rq->clock); + update_curr(cfs_rq); resched_task(curr); return; } -- cgit v1.2.3-70-g09d2 From 5870db5b83932bea0deac3c68e3c40f377d0b8f7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:47 +0200 Subject: sched: remove the 'u64 now' parameter from update_stats_wait_start() remove the 'u64 now' parameter from update_stats_wait_start(). ( identity transformation that causes no change in functionality. 
) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 79875988282..e48f32e99a0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -345,7 +345,7 @@ static void update_curr(struct cfs_rq *cfs_rq) } static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { se->wait_start_fair = cfs_rq->fair_clock; schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); @@ -386,7 +386,7 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) * a dequeue/enqueue event is a NOP) */ if (se != cfs_rq_curr(cfs_rq)) - update_stats_wait_start(cfs_rq, se, now); + update_stats_wait_start(cfs_rq, se); /* * Update the key: */ @@ -665,7 +665,7 @@ put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) update_stats_curr_end(cfs_rq, prev, now); if (prev->on_rq) - update_stats_wait_start(cfs_rq, prev, now); + update_stats_wait_start(cfs_rq, prev); set_cfs_rq_curr(cfs_rq, NULL); } -- cgit v1.2.3-70-g09d2 From d2417e5a3e6c79e79f982c7553301dc3539873b0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:47 +0200 Subject: sched: remove the 'u64 now' parameter from update_stats_enqueue() remove the 'u64 now' parameter from update_stats_enqueue(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e48f32e99a0..66209d68845 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -376,8 +376,7 @@ calc_weighted(unsigned long delta, unsigned long weight, int shift) /* * Task is being enqueued - update stats: */ -static void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { s64 key; @@ -584,7 +583,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, if (wakeup) enqueue_sleeper(cfs_rq, se, now); - update_stats_enqueue(cfs_rq, se, now); + update_stats_enqueue(cfs_rq, se); __enqueue_entity(cfs_rq, se); } @@ -1035,7 +1034,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now) sched_info_queued(p); - update_stats_enqueue(cfs_rq, se, now); + update_stats_enqueue(cfs_rq, se); /* * Child runs first: we let it run before the parent * until it reschedules once. We set up the key so that -- cgit v1.2.3-70-g09d2 From eac55ea37642163e6bdd899ac319c413c1f1b7cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:47 +0200 Subject: sched: remove the 'u64 now' parameter from __update_stats_wait_end() remove the 'u64 now' parameter from __update_stats_wait_end(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 66209d68845..cfaf2b18f28 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -416,7 +416,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) * Note: must be called with a freshly updated rq->fair_clock. 
*/ static inline void -__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long delta_fair = se->delta_fair_run; @@ -441,7 +441,7 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) se->delta_fair_run += delta_fair; if (unlikely(abs(se->delta_fair_run) >= sysctl_sched_stat_granularity)) { - __update_stats_wait_end(cfs_rq, se, now); + __update_stats_wait_end(cfs_rq, se); se->delta_fair_run = 0; } -- cgit v1.2.3-70-g09d2 From 9ef0a9615b0d9cd29c6bc0e8898f1bc3145e44c6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:47 +0200 Subject: sched: remove the 'u64 now' parameter from update_stats_wait_end() remove the 'u64 now' parameter from update_stats_wait_end(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cfaf2b18f28..0cfa1d68241 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -431,7 +431,7 @@ __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long delta_fair; @@ -458,7 +458,7 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) * waiting task: */ if (se != cfs_rq_curr(cfs_rq)) - update_stats_wait_end(cfs_rq, se, now); + update_stats_wait_end(cfs_rq, se); } /* @@ -637,7 +637,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) * done a put_prev_task_fair() shortly before this, which * updated rq->fair_clock - used by update_stats_wait_end()) */ - update_stats_wait_end(cfs_rq, se, now); + update_stats_wait_end(cfs_rq, se); update_stats_curr_start(cfs_rq, se, now); set_cfs_rq_curr(cfs_rq, se); } -- cgit v1.2.3-70-g09d2 From 79303e9e0219a23f8757af99393b21ecb35231bf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:47 +0200 Subject: sched: remove the 'u64 now' parameter from update_stats_curr_start() remove the 'u64 now' parameter from update_stats_curr_start(). ( identity transformation that causes no change in functionality. 
) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0cfa1d68241..1c73073be4c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -465,7 +465,7 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) * We are picking a new current task - update its stats: */ static inline void -update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * We are starting a new run period: @@ -638,7 +638,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) * updated rq->fair_clock - used by update_stats_wait_end()) */ update_stats_wait_end(cfs_rq, se); - update_stats_curr_start(cfs_rq, se, now); + update_stats_curr_start(cfs_rq, se); set_cfs_rq_curr(cfs_rq, se); } -- cgit v1.2.3-70-g09d2 From 19b6a2e3706675eea4d74729114e36968fa43577 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:48 +0200 Subject: sched: remove the 'u64 now' parameter from update_stats_dequeue() remove the 'u64 now' parameter from update_stats_dequeue(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1c73073be4c..9ec912e3398 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -450,7 +450,7 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) } static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_curr(cfs_rq); /* @@ -591,7 +591,7 @@ static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep, u64 now) { - update_stats_dequeue(cfs_rq, se, now); + update_stats_dequeue(cfs_rq, se); if (sleep) { se->sleep_start_fair = cfs_rq->fair_clock; #ifdef CONFIG_SCHEDSTATS -- cgit v1.2.3-70-g09d2 From c7e9b5b293106c8dd6b1ca968d24f10fa919f6fd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:48 +0200 Subject: sched: remove the 'u64 now' parameter from update_stats_curr_end() remove the 'u64 now' parameter from update_stats_curr_end(). ( identity transformation that causes no change in functionality. 
) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9ec912e3398..41a37daba2d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -477,7 +477,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * We are descheduling a task - update its stats: */ static inline void -update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { se->exec_start = 0; } @@ -661,7 +661,7 @@ put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) if (prev->on_rq) update_curr(cfs_rq); - update_stats_curr_end(cfs_rq, prev, now); + update_stats_curr_end(cfs_rq, prev); if (prev->on_rq) update_stats_wait_start(cfs_rq, prev); -- cgit v1.2.3-70-g09d2 From dfdc119e54f44cba70ebe1f565767d3d0640d19f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:48 +0200 Subject: sched: remove the 'u64 now' parameter from __enqueue_sleeper() remove the 'u64 now' parameter from __enqueue_sleeper(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 41a37daba2d..f4dbc7e1ce4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -486,8 +486,7 @@ update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -static void -__enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long load = cfs_rq->load.weight, delta_fair; long prev_runtime; @@ -537,7 +536,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) se->delta_fair_sleep += delta_fair; if (unlikely(abs(se->delta_fair_sleep) >= sysctl_sched_stat_granularity)) { - __enqueue_sleeper(cfs_rq, se, now); + __enqueue_sleeper(cfs_rq, se); se->delta_fair_sleep = 0; } -- cgit v1.2.3-70-g09d2 From 2396af69bec0ba3274383c20de7a31acf7c74b7a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:48 +0200 Subject: sched: remove the 'u64 now' parameter from enqueue_sleeper() remove the 'u64 now' parameter from enqueue_sleeper(). ( identity transformation that causes no change in functionality. 
) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f4dbc7e1ce4..ca62f1973e2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -520,8 +520,7 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } -static void -enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *tsk = task_of(se); unsigned long delta_fair; @@ -580,7 +579,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_curr(cfs_rq); if (wakeup) - enqueue_sleeper(cfs_rq, se, now); + enqueue_sleeper(cfs_rq, se); update_stats_enqueue(cfs_rq, se); __enqueue_entity(cfs_rq, se); -- cgit v1.2.3-70-g09d2 From 668031ca8fa2cc565f325f4fb69f131af449b7a7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:48 +0200 Subject: sched: remove the 'u64 now' parameter from enqueue_entity() remove the 'u64 now' parameter from enqueue_entity(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ca62f1973e2..5576ead0dfd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -570,8 +570,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - int wakeup, u64 now) +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { /* * Update the fair clock. @@ -680,7 +679,7 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * position within the tree: */ dequeue_entity(cfs_rq, curr, 0, now); - enqueue_entity(cfs_rq, curr, 0, now); + enqueue_entity(cfs_rq, curr, 0); /* * Reschedule if another task tops the current one. @@ -795,7 +794,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup, now); + enqueue_entity(cfs_rq, se, wakeup); } } @@ -834,7 +833,7 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p) * position within the tree: */ dequeue_entity(cfs_rq, &p->se, 0, now); - enqueue_entity(cfs_rq, &p->se, 0, now); + enqueue_entity(cfs_rq, &p->se, 0); } /* -- cgit v1.2.3-70-g09d2 From 525c2716a41d3e87387b32c5b0868acb52cbb559 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:48 +0200 Subject: sched: remove the 'u64 now' parameter from dequeue_entity() remove the 'u64 now' parameter from dequeue_entity(). ( identity transformation that causes no change in functionality. 
) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5576ead0dfd..da92b78570c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -585,8 +585,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) } static void -dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - int sleep, u64 now) +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { update_stats_dequeue(cfs_rq, se); if (sleep) { @@ -678,7 +677,7 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * Dequeue and enqueue the task to update its * position within the tree: */ - dequeue_entity(cfs_rq, curr, 0, now); + dequeue_entity(cfs_rq, curr, 0); enqueue_entity(cfs_rq, curr, 0); /* @@ -811,7 +810,7 @@ dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, sleep, now); + dequeue_entity(cfs_rq, se, sleep); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; @@ -832,7 +831,7 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p) * Dequeue and enqueue the task to update its * position within the tree: */ - dequeue_entity(cfs_rq, &p->se, 0, now); + dequeue_entity(cfs_rq, &p->se, 0); enqueue_entity(cfs_rq, &p->se, 0); } -- cgit v1.2.3-70-g09d2 From 8494f412edecbdbc36105e0a08f80d05a14dde2c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:48 +0200 Subject: sched: remove the 'u64 now' parameter from set_next_entity() remove the 'u64 now' parameter from set_next_entity(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index da92b78570c..538e09f17d7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -624,7 +624,7 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, } static inline void -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * Any task has to be enqueued before it get to execute on @@ -642,7 +642,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now) { struct sched_entity *se = __pick_next_entity(cfs_rq); - set_next_entity(cfs_rq, se, now); + set_next_entity(cfs_rq, se); return se; } @@ -1073,7 +1073,7 @@ static void set_curr_task_fair(struct rq *rq) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - set_next_entity(cfs_rq, se, now); + set_next_entity(cfs_rq, se); } } #else -- cgit v1.2.3-70-g09d2 From 9948f4b2a728e9ca4928a9a97eb09df955f5b17c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:48 +0200 Subject: sched: remove the 'u64 now' parameter from pick_next_entity() remove the 'u64 now' parameter from pick_next_entity(). ( identity transformation that causes no change in functionality. 
) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 538e09f17d7..54afe804538 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -638,7 +638,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) set_cfs_rq_curr(cfs_rq, se); } -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now) +static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); @@ -871,7 +871,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) return NULL; do { - se = pick_next_entity(cfs_rq, now); + se = pick_next_entity(cfs_rq); cfs_rq = group_cfs_rq(se); } while (cfs_rq); -- cgit v1.2.3-70-g09d2 From ab6cde2692c76b88ea430aa188ea50303216a955 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:48 +0200 Subject: sched: remove the 'u64 now' parameter from put_prev_entity() remove the 'u64 now' parameter from put_prev_entity(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 54afe804538..a11d18861a3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -647,8 +647,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) return se; } -static void -put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) +static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { /* * If still on the runqueue then deactivate_task() @@ -888,7 +887,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - put_prev_entity(cfs_rq, se, now); + put_prev_entity(cfs_rq, se); } } -- cgit v1.2.3-70-g09d2 From f1e14ef64d3e1bdcb3437f1e926fe5a7f861aa0a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:48 +0200 Subject: sched: remove the 'u64 now' parameter from update_curr_rt() remove the 'u64 now' parameter from update_curr_rt(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 5fbd87ad0f5..fa5a46273b7 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -7,7 +7,7 @@ * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. 
*/ -static inline void update_curr_rt(struct rq *rq, u64 now) +static inline void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; u64 delta_exec; @@ -42,7 +42,7 @@ dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now) { struct rt_prio_array *array = &rq->rt.active; - update_curr_rt(rq, now); + update_curr_rt(rq); list_del(&p->run_list); if (list_empty(array->queue + p->prio)) @@ -96,7 +96,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) { - update_curr_rt(rq, now); + update_curr_rt(rq); p->se.exec_start = 0; } -- cgit v1.2.3-70-g09d2 From fd390f6a04f22fb457d6fd1855964f79536525de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:48 +0200 Subject: sched: remove the 'u64 now' parameter from ->enqueue_task() remove the 'u64 now' parameter from ->enqueue_task(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +-- kernel/sched.c | 2 +- kernel/sched_fair.c | 3 +-- kernel/sched_rt.c | 3 +-- 4 files changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 62ddddb49db..b11dedfbab6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -855,8 +855,7 @@ struct sched_domain; struct sched_class { struct sched_class *next; - void (*enqueue_task) (struct rq *rq, struct task_struct *p, - int wakeup, u64 now); + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep, u64 now); void (*yield_task) (struct rq *rq, struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index 49a5fb0cdea..43ae1566b8f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -852,7 +852,7 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) { sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup, now); + p->sched_class->enqueue_task(rq, p, wakeup); p->se.on_rq = 1; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a11d18861a3..81db9626b7e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -782,8 +782,7 @@ static inline int is_same_group(struct task_struct *curr, struct task_struct *p) * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: */ -static void -enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fa5a46273b7..1edaa99e0d3 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -25,8 +25,7 @@ static inline void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; } -static void -enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) +static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) { struct rt_prio_array *array = &rq->rt.active; -- cgit v1.2.3-70-g09d2 From f02231e51a280f1a0fee4d03ad8f50048e06cced Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:48 +0200 Subject: sched: remove the 'u64 now' parameter from ->dequeue_task() remove the 'u64 now' parameter from ->dequeue_task(). ( identity transformation that causes no change in functionality. 
) Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +-- kernel/sched.c | 2 +- kernel/sched_fair.c | 3 +-- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 3 +-- 5 files changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b11dedfbab6..c7815a6b70e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -856,8 +856,7 @@ struct sched_class { struct sched_class *next; void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, - int sleep, u64 now); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); void (*yield_task) (struct rq *rq, struct task_struct *p); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index 43ae1566b8f..e51d75f4b4d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -859,7 +859,7 @@ enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) { - p->sched_class->dequeue_task(rq, p, sleep, now); + p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 81db9626b7e..fb4d614af2c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -800,8 +800,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) * decreased. We remove the task from the rbtree and * update the fair scheduling stats: */ -static void -dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index dc9e1068911..f69e083e0d9 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -25,7 +25,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) * message if some code attempts to do it: */ static void -dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) +dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) { spin_unlock_irq(&rq->lock); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1edaa99e0d3..60591e2512b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -36,8 +36,7 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) /* * Adding/removing a task to/from a priority array: */ -static void -dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now) +static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) { struct rt_prio_array *array = &rq->rt.active; -- cgit v1.2.3-70-g09d2 From fb8d47240246e20f864f0724a23a7220cd1c59ac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:48 +0200 Subject: sched: remove the 'u64 now' parameter from ->pick_next_task() remove the 'u64 now' parameter from ->pick_next_task(). ( identity transformation that causes no change in functionality. 
) Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched.c | 4 ++-- kernel/sched_fair.c | 2 +- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index c7815a6b70e..c6ad4071c79 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -861,7 +861,7 @@ struct sched_class { void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); - struct task_struct * (*pick_next_task) (struct rq *rq, u64 now); + struct task_struct * (*pick_next_task) (struct rq *rq); void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now); unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, diff --git a/kernel/sched.c b/kernel/sched.c index e51d75f4b4d..b67a288a0f1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3410,14 +3410,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) * the fair class we can call that function directly: */ if (likely(rq->nr_running == rq->cfs.nr_running)) { - p = fair_sched_class.pick_next_task(rq, now); + p = fair_sched_class.pick_next_task(rq); if (likely(p)) return p; } class = sched_class_highest; for ( ; ; ) { - p = class->pick_next_task(rq, now); + p = class->pick_next_task(rq); if (p) return p; /* diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fb4d614af2c..0b23aaf074f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -859,7 +859,7 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); } -static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) +static struct task_struct *pick_next_task_fair(struct rq *rq) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index f69e083e0d9..9f4c28f858f 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -13,7 +13,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) resched_task(rq->idle); } -static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) +static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 60591e2512b..c0b0d6237bb 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -73,7 +73,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } -static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) +static struct task_struct *pick_next_task_rt(struct rq *rq) { struct rt_prio_array *array = &rq->rt.active; struct task_struct *next; -- cgit v1.2.3-70-g09d2 From ff95f3df54609d9d4b9572f8a67d09922a645043 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:49 +0200 Subject: sched: remove the 'u64 now' parameter from pick_next_task() remove the 'u64 now' parameter from pick_next_task(). ( identity transformation that causes no change in functionality. 
) Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b67a288a0f1..4f9f9e9d726 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3400,7 +3400,7 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) +pick_next_task(struct rq *rq, struct task_struct *prev) { struct sched_class *class; struct task_struct *p; @@ -3471,7 +3471,7 @@ need_resched_nonpreemptible: idle_balance(cpu, rq); prev->sched_class->put_prev_task(rq, prev, now); - next = pick_next_task(rq, prev, now); + next = pick_next_task(rq, prev); sched_info_switch(prev, next); @@ -5222,7 +5222,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) if (!rq->nr_running) break; update_rq_clock(rq); - next = pick_next_task(rq, rq->curr, rq->clock); + next = pick_next_task(rq, rq->curr); if (!next) break; migrate_dead(dead_cpu, next); -- cgit v1.2.3-70-g09d2 From 31ee529cc2254e8b62880535ec8f21a4c5e1c091 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:49 +0200 Subject: sched: remove the 'u64 now' parameter from ->put_prev_task() remove the 'u64 now' parameter from ->put_prev_task(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched.c | 2 +- kernel/sched_fair.c | 2 +- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index c6ad4071c79..9afb66a4935 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -862,7 +862,7 @@ struct sched_class { void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); struct task_struct * (*pick_next_task) (struct rq *rq); - void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now); + void (*put_prev_task) (struct rq *rq, struct task_struct *p); unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, struct rq *busiest, diff --git a/kernel/sched.c b/kernel/sched.c index 4f9f9e9d726..66444016048 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3470,7 +3470,7 @@ need_resched_nonpreemptible: if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); - prev->sched_class->put_prev_task(rq, prev, now); + prev->sched_class->put_prev_task(rq, prev); next = pick_next_task(rq, prev); sched_info_switch(prev, next); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0b23aaf074f..103327b4275 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -878,7 +878,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 9f4c28f858f..3503fb2d9f9 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -33,7 +33,7 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) spin_lock_irq(&rq->lock); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c 
index c0b0d6237bb..dcdcad632fd 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -92,7 +92,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) return next; } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); p->se.exec_start = 0; -- cgit v1.2.3-70-g09d2 From ee0827d8b5271094380410cf21d8c48c109a773a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:49 +0200 Subject: sched: remove the 'u64 now' parameter from ->task_new() remove the 'u64 now' parameter from ->task_new(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched.c | 2 +- kernel/sched_fair.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9afb66a4935..682ef87da6e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -872,7 +872,7 @@ struct sched_class { void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); - void (*task_new) (struct rq *rq, struct task_struct *p, u64 now); + void (*task_new) (struct rq *rq, struct task_struct *p); }; struct load_weight { diff --git a/kernel/sched.c b/kernel/sched.c index 66444016048..0619178efa0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1672,7 +1672,7 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * Let the scheduling class do new task startup * management (if any): */ - p->sched_class->task_new(rq, p, now); + p->sched_class->task_new(rq, p); inc_nr_running(p, rq, now); } check_preempt_curr(rq, p); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 103327b4275..4a2cbde1057 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1020,7 +1020,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) * monopolize the CPU. Note: the parent runqueue is locked, * the child is not running yet. */ -static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now) +static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); struct sched_entity *se = &p->se; -- cgit v1.2.3-70-g09d2 From 84a1d7a2f91d2f26d21026973dbf3023d17c701f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:49 +0200 Subject: sched: remove the 'u64 now' parameter from update_curr_load() remove the 'u64 now' parameter from update_curr_load(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0619178efa0..5d5859c2e01 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -782,7 +782,7 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls) * This function is called /before/ updating rq->ls.load * and when switching tasks. 
*/ -static void update_curr_load(struct rq *rq, u64 now) +static void update_curr_load(struct rq *rq) { struct load_stat *ls = &rq->ls; u64 start; @@ -801,14 +801,14 @@ static void update_curr_load(struct rq *rq, u64 now) static inline void inc_load(struct rq *rq, const struct task_struct *p, u64 now) { - update_curr_load(rq, now); + update_curr_load(rq); update_load_add(&rq->ls.load, p->se.load.weight); } static inline void dec_load(struct rq *rq, const struct task_struct *p, u64 now) { - update_curr_load(rq, now); + update_curr_load(rq); update_load_sub(&rq->ls.load, p->se.load.weight); } @@ -1971,7 +1971,7 @@ static void update_cpu_load(struct rq *this_rq) goto do_avg; /* Update delta_fair/delta_exec fields first */ - update_curr_load(this_rq, now); + update_curr_load(this_rq); fair_delta64 = ls->delta_fair + 1; ls->delta_fair = 0; -- cgit v1.2.3-70-g09d2 From 29b4b623fe8163ca3c1da125da81234d41c8a3db Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:49 +0200 Subject: sched: remove the 'u64 now' parameter from inc_load() remove the 'u64 now' parameter from inc_load(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5d5859c2e01..aa8cac4ae54 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -798,8 +798,7 @@ static void update_curr_load(struct rq *rq) __update_curr_load(rq, ls); } -static inline void -inc_load(struct rq *rq, const struct task_struct *p, u64 now) +static inline void inc_load(struct rq *rq, const struct task_struct *p) { update_curr_load(rq); update_load_add(&rq->ls.load, p->se.load.weight); @@ -815,7 +814,7 @@ dec_load(struct rq *rq, const struct task_struct *p, u64 now) static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) { rq->nr_running++; - inc_load(rq, p, now); + inc_load(rq, p); } static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) @@ -3993,7 +3992,7 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0, now); - inc_load(rq, p, now); + inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: -- cgit v1.2.3-70-g09d2 From 79b5dddf831b4719b7ec8dfcfb9bf9c619805b9c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:49 +0200 Subject: sched: remove the 'u64 now' parameter from dec_load() remove the 'u64 now' parameter from dec_load(). ( identity transformation that causes no change in functionality. 
) Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index aa8cac4ae54..23583bb9327 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -804,8 +804,7 @@ static inline void inc_load(struct rq *rq, const struct task_struct *p) update_load_add(&rq->ls.load, p->se.load.weight); } -static inline void -dec_load(struct rq *rq, const struct task_struct *p, u64 now) +static inline void dec_load(struct rq *rq, const struct task_struct *p) { update_curr_load(rq); update_load_sub(&rq->ls.load, p->se.load.weight); @@ -820,7 +819,7 @@ static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) { rq->nr_running--; - dec_load(rq, p, now); + dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -3981,7 +3980,7 @@ void set_user_nice(struct task_struct *p, long nice) on_rq = p->se.on_rq; if (on_rq) { dequeue_task(rq, p, 0, now); - dec_load(rq, p, now); + dec_load(rq, p); } p->static_prio = NICE_TO_PRIO(nice); -- cgit v1.2.3-70-g09d2 From e5fa2237b53d751c59f773a68e1b12c411f0b19b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:49 +0200 Subject: sched: remove the 'u64 now' parameter from inc_nr_running() remove the 'u64 now' parameter from inc_nr_running(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 23583bb9327..bdb683464c0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -810,7 +810,7 @@ static inline void dec_load(struct rq *rq, const struct task_struct *p) update_load_sub(&rq->ls.load, p->se.load.weight); } -static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) +static void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; inc_load(rq, p); @@ -921,7 +921,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup, now); - inc_nr_running(p, rq, now); + inc_nr_running(p, rq); } /* @@ -938,7 +938,7 @@ static inline void activate_idle_task(struct task_struct *p, struct rq *rq) rq->nr_uninterruptible--; enqueue_task(rq, p, 0, now); - inc_nr_running(p, rq, now); + inc_nr_running(p, rq); } /* @@ -1671,7 +1671,7 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(p, rq, now); + inc_nr_running(p, rq); } check_preempt_curr(rq, p); task_rq_unlock(rq, &flags); -- cgit v1.2.3-70-g09d2 From db53181e41728cfd58336925422dc17f1d2c655c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:49 +0200 Subject: sched: remove the 'u64 now' parameter from dec_nr_running() remove the 'u64 now' parameter from dec_nr_running(). ( identity transformation that causes no change in functionality. 
) Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index bdb683464c0..86e751a19d6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -816,7 +816,7 @@ static void inc_nr_running(struct task_struct *p, struct rq *rq) inc_load(rq, p); } -static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) +static void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; dec_load(rq, p); @@ -951,7 +951,7 @@ deactivate_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep, now); - dec_nr_running(p, rq, now); + dec_nr_running(p, rq); } /** -- cgit v1.2.3-70-g09d2 From 8159f87e2bfeeba8887b8ef34f7b523958910132 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:49 +0200 Subject: sched: remove the 'u64 now' parameter from enqueue_task() remove the 'u64 now' parameter from enqueue_task(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 86e751a19d6..0ecfdd134f7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -846,8 +846,7 @@ static void set_load_weight(struct task_struct *p) p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; } -static void -enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) +static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { sched_info_queued(p); p->sched_class->enqueue_task(rq, p, wakeup); @@ -920,7 +919,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - enqueue_task(rq, p, wakeup, now); + enqueue_task(rq, p, wakeup); inc_nr_running(p, rq); } @@ -937,7 +936,7 @@ static inline void activate_idle_task(struct task_struct *p, struct rq *rq) if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - enqueue_task(rq, p, 0, now); + enqueue_task(rq, p, 0); inc_nr_running(p, rq); } @@ -3933,7 +3932,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; if (on_rq) { - enqueue_task(rq, p, 0, now); + enqueue_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -3990,7 +3989,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (on_rq) { - enqueue_task(rq, p, 0, now); + enqueue_task(rq, p, 0); inc_load(rq, p); /* * If the task increased its priority or is running and -- cgit v1.2.3-70-g09d2 From 69be72c13db0e9165796422b544f989033146171 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:49 +0200 Subject: sched: remove the 'u64 now' parameter from dequeue_task() remove the 'u64 now' parameter from dequeue_task(). ( identity transformation that causes no change in functionality. 
) Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0ecfdd134f7..05ce3f54e81 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -853,8 +853,7 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) p->se.on_rq = 1; } -static void -dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) +static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; @@ -949,7 +948,7 @@ deactivate_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; - dequeue_task(rq, p, sleep, now); + dequeue_task(rq, p, sleep); dec_nr_running(p, rq); } @@ -3922,7 +3921,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; if (on_rq) - dequeue_task(rq, p, 0, now); + dequeue_task(rq, p, 0); if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -3978,7 +3977,7 @@ void set_user_nice(struct task_struct *p, long nice) } on_rq = p->se.on_rq; if (on_rq) { - dequeue_task(rq, p, 0, now); + dequeue_task(rq, p, 0); dec_load(rq, p); } -- cgit v1.2.3-70-g09d2 From 2e1cb74a501c4b1bca5e55dabff24f267349193c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:49 +0200 Subject: sched: remove the 'u64 now' parameter from deactivate_task() remove the 'u64 now' parameter from deactivate_task(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 05ce3f54e81..2dc5d2f7b39 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -942,8 +942,7 @@ static inline void activate_idle_task(struct task_struct *p, struct rq *rq) /* * deactivate_task - remove a task from the runqueue. 
*/ -static void -deactivate_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) { if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; @@ -2128,7 +2127,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, struct rq *this_rq, int this_cpu) { update_rq_clock(src_rq); - deactivate_task(src_rq, p, 0, src_rq->clock); + deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); /* @@ -3458,7 +3457,7 @@ need_resched_nonpreemptible: unlikely(signal_pending(prev)))) { prev->state = TASK_RUNNING; } else { - deactivate_task(rq, prev, 1, now); + deactivate_task(rq, prev, 1); } switch_count = &prev->nvcsw; } @@ -4228,7 +4227,7 @@ recheck: on_rq = p->se.on_rq; if (on_rq) { update_rq_clock(rq); - deactivate_task(rq, p, 0, rq->clock); + deactivate_task(rq, p, 0); } oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); @@ -4983,7 +4982,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) on_rq = p->se.on_rq; if (on_rq) { update_rq_clock(rq_src); - deactivate_task(rq_src, p, 0, rq_src->clock); + deactivate_task(rq_src, p, 0); } set_task_cpu(p, dest_cpu); if (on_rq) { @@ -5404,7 +5403,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Idle task back to normal (off runqueue, low prio) */ rq = task_rq_lock(rq->idle, &flags); update_rq_clock(rq); - deactivate_task(rq, rq->idle, 0, rq->clock); + deactivate_task(rq, rq->idle, 0); rq->idle->static_prio = MAX_PRIO; __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; @@ -6644,7 +6643,7 @@ void normalize_rt_tasks(void) on_rq = p->se.on_rq; if (on_rq) { update_rq_clock(task_rq(p)); - deactivate_task(task_rq(p), p, 0, task_rq(p)->clock); + deactivate_task(task_rq(p), p, 0); } __setscheduler(rq, p, SCHED_NORMAL, 0); if (on_rq) { -- cgit v1.2.3-70-g09d2 From bdd4dfa89c1e3e1379729b9edec1526b3ecc25ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:51 +0200 Subject: sched: remove the 'u64 now' local variables final step: remove all (now superfluous) 'u64 now' variables. ( identity transformation that causes no change in functionality. 
) Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ---------------- kernel/sched_fair.c | 6 ------ 2 files changed, 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2dc5d2f7b39..b78b9d9ffd1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -910,10 +910,7 @@ static int effective_prio(struct task_struct *p) */ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { - u64 now; - update_rq_clock(rq); - now = rq->clock; if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; @@ -927,10 +924,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) */ static inline void activate_idle_task(struct task_struct *p, struct rq *rq) { - u64 now; - update_rq_clock(rq); - now = rq->clock; if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; @@ -1647,13 +1641,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) unsigned long flags; struct rq *rq; int this_cpu; - u64 now; rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); this_cpu = smp_processor_id(); /* parent's CPU */ update_rq_clock(rq); - now = rq->clock; p->prio = effective_prio(p); @@ -1955,11 +1947,9 @@ static void update_cpu_load(struct rq *this_rq) unsigned long total_load = this_rq->ls.load.weight; unsigned long this_load = total_load; struct load_stat *ls = &this_rq->ls; - u64 now; int i, scale; __update_rq_clock(this_rq); - now = this_rq->clock; this_rq->nr_load_updates++; if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) @@ -3431,7 +3421,6 @@ asmlinkage void __sched schedule(void) struct task_struct *prev, *next; long *switch_count; struct rq *rq; - u64 now; int cpu; need_resched: @@ -3450,7 +3439,6 @@ need_resched_nonpreemptible: spin_lock_irq(&rq->lock); clear_tsk_need_resched(prev); __update_rq_clock(rq); - now = rq->clock; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && @@ -3909,13 +3897,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) unsigned long flags; int oldprio, on_rq; struct rq *rq; - u64 now; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); update_rq_clock(rq); - now = rq->clock; oldprio = p->prio; on_rq = p->se.on_rq; @@ -3953,7 +3939,6 @@ void set_user_nice(struct task_struct *p, long nice) int old_prio, delta, on_rq; unsigned long flags; struct rq *rq; - u64 now; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3963,7 +3948,6 @@ void set_user_nice(struct task_struct *p, long nice) */ rq = task_rq_lock(p, &flags); update_rq_clock(rq); - now = rq->clock; /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4a2cbde1057..eb7ca49c326 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -667,10 +667,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { struct rq *rq = rq_of(cfs_rq); struct sched_entity *next; - u64 now; __update_rq_clock(rq); - now = rq->clock; /* * Dequeue and enqueue the task to update its @@ -820,10 +818,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) static void yield_task_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - u64 now; __update_rq_clock(rq); - now = rq->clock; /* * Dequeue and enqueue the task to update its * position within the tree: @@ -1062,11 +1058,9 @@ static void 
set_curr_task_fair(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se; - u64 now; struct cfs_rq *cfs_rq; update_rq_clock(rq); - now = rq->clock; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); -- cgit v1.2.3-70-g09d2 From a48da48b403319918a587be8b5d46fe1d186c2ac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:51 +0200 Subject: sched debug: remove the 'u64 now' parameter from print_task()/_rq() remove the 'u64 now' parameter from sched_debug.c:print_task()/_rq(). ( identity transformation that causes no change in functionality. ) Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index f977ee53f8c..3da32156394 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -29,7 +29,7 @@ } while (0) static void -print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now) +print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { if (rq->curr == p) SEQ_printf(m, "R"); @@ -56,7 +56,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now) #endif } -static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now) +static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) { struct task_struct *g, *p; @@ -77,7 +77,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now) if (!p->se.on_rq || task_cpu(p) != rq_cpu) continue; - print_task(m, rq, p, now); + print_task(m, rq, p); } while_each_thread(g, p); read_unlock_irq(&tasklist_lock); @@ -124,7 +124,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) print_cfs_rq_runtime_sum(m, cpu, cfs_rq); } -static void print_cpu(struct seq_file *m, int cpu, u64 now) +static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = &per_cpu(runqueues, cpu); @@ -168,7 +168,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) print_cfs_stats(m, cpu); - print_rq(m, rq, cpu, now); + print_rq(m, rq, cpu); } static int sched_debug_show(struct seq_file *m, void *v) @@ -184,7 +184,7 @@ static int sched_debug_show(struct seq_file *m, void *v) SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); for_each_online_cpu(cpu) - print_cpu(m, cpu, now); + print_cpu(m, cpu); SEQ_printf(m, "\n"); -- cgit v1.2.3-70-g09d2 From 546fe3c909b0a4235c7237c210da483eaaac1edc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:51 +0200 Subject: sched: move the __update_rq_clock() call to scheduler_tick() move the __update_rq_clock() call from update_cpu_load() to scheduler_tick(). ( identity transformation that causes no change in functionality. ) this allows the direct use of rq->clock in ->task_tick() functions. Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b78b9d9ffd1..3f5d5294999 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1949,8 +1949,6 @@ static void update_cpu_load(struct rq *this_rq) struct load_stat *ls = &this_rq->ls; int i, scale; - __update_rq_clock(this_rq); - this_rq->nr_load_updates++; if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) goto do_avg; @@ -3301,6 +3299,7 @@ void scheduler_tick(void) struct task_struct *curr = rq->curr; spin_lock(&rq->lock); + __update_rq_clock(rq); update_cpu_load(rq); if (curr != rq->idle) /* FIXME: needed? 
*/ curr->sched_class->task_tick(rq, curr); -- cgit v1.2.3-70-g09d2 From d9e0e6aa6d72df21ff190962c842e027fca0e009 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:51 +0200 Subject: sched: remove __update_rq_clock() call from entity_tick() remove __update_rq_clock() call from entity_tick(). no change in functionality because scheduler_tick() already calls __update_rq_clock(). Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index eb7ca49c326..e62d5b9b158 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -665,11 +665,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct rq *rq = rq_of(cfs_rq); struct sched_entity *next; - __update_rq_clock(rq); - /* * Dequeue and enqueue the task to update its * position within the tree: -- cgit v1.2.3-70-g09d2 From c3b64f1e4f772418a649bb8e3b39fcea6c358330 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:51 +0200 Subject: sched: clean up set_curr_task_fair() clean up set_curr_task_fair(). ( identity transformation that causes no change in functionality. ) text data bss dec hex filename 39170 3750 36 42956 a7cc sched.o.before 39170 3750 36 42956 a7cc sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e62d5b9b158..b885b3c85bb 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1053,16 +1053,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) */ static void set_curr_task_fair(struct rq *rq) { - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se; - struct cfs_rq *cfs_rq; - - update_rq_clock(rq); + struct sched_entity *se = &rq->curr.se; - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - set_next_entity(cfs_rq, se); - } + for_each_sched_entity(se) + set_next_entity(cfs_rq_of(se), se); } #else static void set_curr_task_fair(struct rq *rq) @@ -1093,10 +1087,9 @@ struct sched_class fair_sched_class __read_mostly = { #ifdef CONFIG_SCHED_DEBUG static void print_cfs_stats(struct seq_file *m, int cpu) { - struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq; - for_each_leaf_cfs_rq(rq, cfs_rq) + for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); } #endif -- cgit v1.2.3-70-g09d2 From 2daa357705bfe68788132cf9079930ca948a90af Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:51 +0200 Subject: sched: optimize activate_task() optimize activate_task() by removing update_rq_clock() from it. (and add update_rq_clock() to all callsites of activate_task() that did not have it before.) Signed-off-by: Ingo Molnar --- kernel/sched.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3f5d5294999..9ccd91e5b65 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -910,8 +910,6 @@ static int effective_prio(struct task_struct *p) */ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { - update_rq_clock(rq); - if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; @@ -1510,6 +1508,7 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ + update_rq_clock(rq); activate_task(rq, p, 1); /* * Sync wakeups (i.e. 
those types of wakeups where the waker @@ -2117,6 +2116,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, update_rq_clock(src_rq); deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); + __update_rq_clock(this_rq); activate_task(this_rq, p, 0); /* * Note that idle threads have a prio of MAX_PRIO, for this test @@ -4207,11 +4207,10 @@ recheck: spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } + update_rq_clock(rq); on_rq = p->se.on_rq; - if (on_rq) { - update_rq_clock(rq); + if (on_rq) deactivate_task(rq, p, 0); - } oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); if (on_rq) { @@ -4969,6 +4968,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) } set_task_cpu(p, dest_cpu); if (on_rq) { + update_rq_clock(rq_dest); activate_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p); } @@ -6623,14 +6623,13 @@ void normalize_rt_tasks(void) goto out_unlock; #endif + update_rq_clock(rq); on_rq = p->se.on_rq; - if (on_rq) { - update_rq_clock(task_rq(p)); - deactivate_task(task_rq(p), p, 0); - } + if (on_rq) + deactivate_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); if (on_rq) { - activate_task(task_rq(p), p, 0); + activate_task(rq, p, 0); resched_task(rq->curr); } #ifdef CONFIG_SMP -- cgit v1.2.3-70-g09d2 From 6e82a3befe91423e501c2124312bd805be0048eb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:51 +0200 Subject: sched: optimize update_rq_clock() calls in the load-balancer optimize update_rq_clock() calls in the load-balancer: update them right after locking the runqueue(s) so that the pull functions do not have to call it. Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9ccd91e5b65..afc59f274e5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2017,6 +2017,8 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) spin_lock(&rq1->lock); } } + update_rq_clock(rq1); + update_rq_clock(rq2); } /* @@ -2113,10 +2115,8 @@ void sched_exec(void) static void pull_task(struct rq *src_rq, struct task_struct *p, struct rq *this_rq, int this_cpu) { - update_rq_clock(src_rq); deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); - __update_rq_clock(this_rq); activate_task(this_rq, p, 0); /* * Note that idle threads have a prio of MAX_PRIO, for this test @@ -2798,6 +2798,8 @@ redo: if (busiest->nr_running > 1) { /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); + /* this_rq->clock is already updated */ + update_rq_clock(busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, CPU_NEWLY_IDLE, &all_pinned); @@ -2895,6 +2897,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); + update_rq_clock(busiest_rq); + update_rq_clock(target_rq); /* Search for an sd spanning us and the target CPU. 
*/ for_each_domain(target_cpu, sd) { @@ -4962,13 +4966,11 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) goto out; on_rq = p->se.on_rq; - if (on_rq) { - update_rq_clock(rq_src); + if (on_rq) deactivate_task(rq_src, p, 0); - } + set_task_cpu(p, dest_cpu); if (on_rq) { - update_rq_clock(rq_dest); activate_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p); } -- cgit v1.2.3-70-g09d2 From 254753dc321ea2b753ca9bc58ac329557a20efac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:51 +0200 Subject: sched: make the multiplication table more accurate do small deltas in the weight and multiplication constant table so that the worst-case numeric error is better than 1:100000000. (8 digits) the current error table is: nice mult * inv_mult error ------------------------------------------ -20: 88761 * 48388 -0.0000000065 -19: 71755 * 59856 -0.0000000037 -18: 56483 * 76040 0.0000000056 -17: 46273 * 92818 0.0000000042 -16: 36291 * 118348 -0.0000000065 -15: 29154 * 147320 -0.0000000037 -14: 23254 * 184698 -0.0000000009 -13: 18705 * 229616 -0.0000000037 -12: 14949 * 287308 -0.0000000009 -11: 11916 * 360437 -0.0000000009 -10: 9548 * 449829 -0.0000000009 -9: 7620 * 563644 -0.0000000037 -8: 6100 * 704093 0.0000000009 -7: 4904 * 875809 0.0000000093 -6: 3906 * 1099582 -0.0000000009 -5: 3121 * 1376151 -0.0000000058 -4: 2501 * 1717300 0.0000000009 -3: 1991 * 2157191 -0.0000000035 -2: 1586 * 2708050 0.0000000009 -1: 1277 * 3363326 0.0000000014 0: 1024 * 4194304 0.0000000000 1: 820 * 5237765 0.0000000009 2: 655 * 6557202 0.0000000033 3: 526 * 8165337 -0.0000000079 4: 423 * 10153587 0.0000000012 5: 335 * 12820798 0.0000000079 6: 272 * 15790321 0.0000000037 7: 215 * 19976592 -0.0000000037 8: 172 * 24970740 -0.0000000037 9: 137 * 31350126 -0.0000000079 10: 110 * 39045157 -0.0000000061 11: 87 * 49367440 -0.0000000037 12: 70 * 61356676 0.0000000056 13: 56 * 76695844 -0.0000000075 14: 45 * 95443717 -0.0000000072 15: 36 * 119304647 -0.0000000009 16: 29 * 148102320 -0.0000000037 17: 23 * 186737708 -0.0000000028 18: 18 * 238609294 -0.0000000009 19: 15 * 286331153 -0.0000000002 Signed-off-by: Ingo Molnar --- kernel/sched.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index afc59f274e5..5470ab0258a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -704,11 +704,14 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec) * the relative distance between them is ~25%.) 
*/ static const int prio_to_weight[40] = { -/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, -/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, -/* 0 */ NICE_0_LOAD /* 1024 */, -/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, -/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, }; /* @@ -719,14 +722,14 @@ static const int prio_to_weight[40] = { * into multiplications: */ static const u32 prio_to_wmult[40] = { -/* -20 */ 48356, 60446, 75558, 94446, 118058, -/* -15 */ 147573, 184467, 230589, 288233, 360285, -/* -10 */ 450347, 562979, 703746, 879575, 1099582, -/* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443, -/* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518, -/* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126, -/* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717, -/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); -- cgit v1.2.3-70-g09d2 From 194081ebfaa8c7d16133e08dd79254910c20c6ff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:51 +0200 Subject: sched: round a bit better round a tiny bit better in high-frequency rescheduling scenarios, by rounding around zero instead of rounding down. 
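To make the effect of the rounding change visible in isolation, here is a small user-space sketch (not part of the patch; the shift width and the sample values are made up):

/*
 * Stand-alone sketch: RSR() adds half of the shifted-out range before
 * shifting, so the result is rounded to nearest instead of always down.
 */
#include <stdio.h>

#define RSR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

int main(void)
{
	unsigned long long vals[] = { 50000ULL, 100000ULL, 4294967295ULL };
	int i, shift = 16;

	for (i = 0; i < 3; i++)
		printf("%llu >> %d: truncated=%llu rounded=%llu\n",
		       vals[i], shift, vals[i] >> shift,
		       RSR(vals[i], shift));
	return 0;
}

Applied to the two shifts in calc_delta_mine(), rounding to nearest roughly halves the worst-case truncation error of each shift.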
(this is pretty theoretical though) Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5470ab0258a..b0afd8db139 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -638,6 +638,11 @@ static u64 div64_likely32(u64 divident, unsigned long divisor) #define WMULT_SHIFT 32 +/* + * Shift right and round: + */ +#define RSR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) + static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) @@ -645,18 +650,17 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, u64 tmp; if (unlikely(!lw->inv_weight)) - lw->inv_weight = WMULT_CONST / lw->weight; + lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; tmp = (u64)delta_exec * weight; /* * Check whether we'd overflow the 64-bit multiplication: */ - if (unlikely(tmp > WMULT_CONST)) { - tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) - >> (WMULT_SHIFT/2); - } else { - tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; - } + if (unlikely(tmp > WMULT_CONST)) + tmp = RSR(RSR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + WMULT_SHIFT/2); + else + tmp = RSR(tmp * lw->inv_weight, WMULT_SHIFT); return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } -- cgit v1.2.3-70-g09d2 From a69edb55605117cc0f20aa36c49c20b96590774d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:52 +0200 Subject: sched: fix update_stats_enqueue() reniced codepath the key has to be rescaled to /weight even if it has a positive value. (this change only affects the scheduling of reniced tasks) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b885b3c85bb..7a632c534ce 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -405,7 +405,8 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) (WMULT_SHIFT - NICE_0_SHIFT); } else { tmp = se->wait_runtime; - key -= (tmp * se->load.weight) >> NICE_0_SHIFT; + key -= (tmp * se->load.inv_weight) >> + (WMULT_SHIFT - NICE_0_SHIFT); } } -- cgit v1.2.3-70-g09d2 From 7cff8cf61cac15fa29a1ca802826d2bcbca66152 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Aug 2007 11:16:52 +0200 Subject: sched: refine negative nice level granularity refine the granularity of negative nice level tasks: let them reschedule more often to offset the effect of them consuming their wait_runtime proportionately slower. (This makes nice-0 task scheduling smoother in the presence of negatively reniced tasks.) 
Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7a632c534ce..e91db32cadf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -222,21 +222,25 @@ niced_granularity(struct sched_entity *curr, unsigned long granularity) { u64 tmp; + if (likely(curr->load.weight == NICE_0_LOAD)) + return granularity; /* - * Negative nice levels get the same granularity as nice-0: + * Positive nice levels get the same granularity as nice-0: */ - if (likely(curr->load.weight >= NICE_0_LOAD)) - return granularity; + if (likely(curr->load.weight < NICE_0_LOAD)) { + tmp = curr->load.weight * (u64)granularity; + return (long) (tmp >> NICE_0_SHIFT); + } /* - * Positive nice level tasks get linearly finer + * Negative nice level tasks get linearly finer * granularity: */ - tmp = curr->load.weight * (u64)granularity; + tmp = curr->load.inv_weight * (u64)granularity; /* * It will always fit into 'long': */ - return (long) (tmp >> NICE_0_SHIFT); + return (long) (tmp >> WMULT_SHIFT); } static inline void -- cgit v1.2.3-70-g09d2 From 88ffc3505988196ef5cfdc0278ad89025c2a7b1a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 9 Aug 2007 08:10:16 -0700 Subject: Revert "genirq: temporary fix for level-triggered IRQ resend" This reverts commit 0fc4969b866671dfe39b1a9119d0fdc7ea0f63e5. It was always meant to be temporary, but it's generating more useless noise than anything else, and we probably should never have done it in the generic kernel (only had the people involved test it on their own). Signed-off-by: Linus Torvalds --- kernel/irq/resend.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index c3827274688..5bfeaed7e48 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -62,15 +62,6 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) */ desc->chip->enable(irq); - /* - * Temporary hack to figure out more about the problem, which - * is causing the ancient network cards to die. - */ - if (desc->handle_irq != handle_edge_irq) { - WARN_ON_ONCE(1); - return; - } - if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; -- cgit v1.2.3-70-g09d2 From 529c77261bccd9d37f110f58b0753d95beaa9fa2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 10 Aug 2007 23:05:11 +0200 Subject: sched: improve rq-clock overflow logic improve the rq-clock overflow logic: limit the absolute rq->clock delta since the last scheduler tick, instead of limiting the delta itself. tested by Arjan van de Ven - whole laptop was misbehaving due to an incorrectly calibrated cpu_khz confusing sched_clock(). 
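The clamp itself can be read in isolation. A simplified user-space sketch (illustrative only: TICK_NSEC and the timestamps are made up, and the warp/overflow statistics kept by the real __update_rq_clock() are left out):

#include <stdio.h>

#define TICK_NSEC 1000000ULL	/* pretend HZ=1000 for the sketch */

/* mirror of the new clamp: never run more than one tick past the last tick */
static unsigned long long advance_clock(unsigned long long clock,
					unsigned long long tick_timestamp,
					unsigned long long delta)
{
	if (clock + delta > tick_timestamp + TICK_NSEC) {
		if (clock < tick_timestamp + TICK_NSEC)
			clock = tick_timestamp + TICK_NSEC;
		else
			clock++;
	} else {
		clock += delta;
	}
	return clock;
}

int main(void)
{
	/* sched_clock() jumps 50 ticks ahead due to a bad cpu_khz calibration */
	printf("clock advances to %llu\n",
	       advance_clock(5000000ULL, 5000000ULL, 50 * TICK_NSEC));
	return 0;
}

With the old logic the same 50-tick jump would have been collapsed into a single nanosecond increment; with the new logic the clock is allowed to catch up to the next tick boundary.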
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven --- kernel/sched.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b0afd8db139..6247e4a8350 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -263,6 +263,7 @@ struct rq { unsigned int clock_warps, clock_overflows; unsigned int clock_unstable_events; + u64 tick_timestamp; atomic_t nr_iowait; @@ -341,8 +342,11 @@ static void __update_rq_clock(struct rq *rq) /* * Catch too large forward jumps too: */ - if (unlikely(delta > 2*TICK_NSEC)) { - clock++; + if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { + if (clock < rq->tick_timestamp + TICK_NSEC) + clock = rq->tick_timestamp + TICK_NSEC; + else + clock++; rq->clock_overflows++; } else { if (unlikely(delta > rq->clock_max_delta)) @@ -3308,9 +3312,16 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; + u64 next_tick = rq->tick_timestamp + TICK_NSEC; spin_lock(&rq->lock); __update_rq_clock(rq); + /* + * Let rq->clock advance by at least TICK_NSEC: + */ + if (unlikely(rq->clock < next_tick)) + rq->clock = next_tick; + rq->tick_timestamp = rq->clock; update_cpu_load(rq); if (curr != rq->idle) /* FIXME: needed? */ curr->sched_class->task_tick(rq, curr); -- cgit v1.2.3-70-g09d2 From e56f31aad9d8c0102bc074cdab4e3ee76b38600d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 10 Aug 2007 23:05:11 +0200 Subject: sched: fix typo in the FAIR_GROUP_SCHED branch while there's no in-tree way to turn group scheduling at the moment, fix a typo in it nevertheless. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e91db32cadf..c5af38948a1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -959,13 +959,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { #ifdef CONFIG_FAIR_GROUP_SCHED struct cfs_rq *this_cfs_rq; - long imbalances; + long imbalance; unsigned long maxload; this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); - imbalance = busy_cfs_rq->load.weight - - this_cfs_rq->load.weight; + imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ if (imbalance <= 0) continue; @@ -976,7 +975,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, *this_best_prio = cfs_rq_best_prio(this_cfs_rq); #else -#define maxload rem_load_move +# define maxload rem_load_move #endif /* pass busy_cfs_rq argument into * load_balance_[start|next]_fair iterators -- cgit v1.2.3-70-g09d2 From 5167e75f4d2d10bff6afee1f358313e87b4df246 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 10 Aug 2007 23:05:11 +0200 Subject: sched debug: dont print kernel address in /proc/sched_debug Arjan van de Ven pointed out that we should not print kernel addresses in world-readable /proc files - fix that. 
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven --- kernel/sched_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 3da32156394..87e524762b8 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -108,7 +108,7 @@ print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq); + SEQ_printf(m, "\ncfs_rq\n"); #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) -- cgit v1.2.3-70-g09d2 From 8daec965e7035bbf8d364fe7585bffac7222b87a Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Fri, 10 Aug 2007 13:00:51 -0700 Subject: Fix missing numa_zonelist_order sysctl Misplaced #endif is hiding the numa_zonelist_order sysctl when !SECURITY. Signed-off-by: Lee Schermerhorn Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 79c891e6266..8bdb8c07e04 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1023,6 +1023,7 @@ static ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, +#endif #ifdef CONFIG_NUMA { .ctl_name = CTL_UNNUMBERED, @@ -1034,7 +1035,6 @@ static ctl_table vm_table[] = { .strategy = &sysctl_string, }, #endif -#endif #if defined(CONFIG_X86_32) || \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { -- cgit v1.2.3-70-g09d2 From c5a69adff920ddf138c3ea9886574b195d9e3d52 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 10 Aug 2007 13:00:57 -0700 Subject: Hibernation: do not try to mark invalid PFNs as nosave On some systems some PFNs reported by the early initialization code as 'nosave' may be invalid. If we try to set the corresponding bits in the hibernation bitmap, BUG_ON() in memory_bm_find_bit() will be triggered and the system won't be able to boot (cf. https://bugzilla.novell.com/show_bug.cgi?id=296242). Prevent this from happening by verifying if the 'nosave' PFNs are valid in mark_nosave_pages(). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index a3b7854b8f7..a686590d88c 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -709,7 +709,8 @@ static void mark_nosave_pages(struct memory_bitmap *bm) region->end_pfn << PAGE_SHIFT); for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - memory_bm_set_bit(bm, pfn); + if (pfn_valid(pfn)) + memory_bm_set_bit(bm, pfn); } } -- cgit v1.2.3-70-g09d2 From 6ddfca9548d8ecc26096a30667423ba919109533 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 10 Aug 2007 13:01:09 -0700 Subject: timer: remove clockevents_unregister_notifier I find a function(clockevents_unregister_notifier) which is not called by anything in tree. 
Signed-off-by: Miao Xie Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/clockchips.h | 1 - kernel/time/clockevents.c | 10 ---------- 2 files changed, 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index e0bd46eb241..def5a659b8a 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -123,7 +123,6 @@ extern void clockevents_exchange_device(struct clock_event_device *old, extern void clockevents_set_mode(struct clock_event_device *dev, enum clock_event_mode mode); extern int clockevents_register_notifier(struct notifier_block *nb); -extern void clockevents_unregister_notifier(struct notifier_block *nb); extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, ktime_t now); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 2ad1c37b8df..41dd3105ce7 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -113,16 +113,6 @@ int clockevents_register_notifier(struct notifier_block *nb) return ret; } -/** - * clockevents_unregister_notifier - unregister a clock events change listener - */ -void clockevents_unregister_notifier(struct notifier_block *nb) -{ - spin_lock(&clockevents_lock); - raw_notifier_chain_unregister(&clockevents_chain, nb); - spin_unlock(&clockevents_lock); -} - /* * Notify about a clock event change. Called with clockevents_lock * held. -- cgit v1.2.3-70-g09d2 From cd5bfea278987ebfe60f3ff92a01696b17c4f978 Mon Sep 17 00:00:00 2001 From: Peter Chubb Date: Fri, 10 Aug 2007 13:01:10 -0700 Subject: fix compilation with gcc 4.2 gcc-4.2 is a lot more picky about its symbol handling. EXPORT_SYMBOL no longer works on symbols that are undefined or defined with static scope. For example, with CONFIG_PROFILE off, I see: kernel/profile.c:206: error: __ksymtab_profile_event_unregister causes a section type conflict kernel/profile.c:205: error: __ksymtab_profile_event_register causes a section type conflict This patch moves the EXPORTs inside the #ifdef CONFIG_PROFILE, so we only try to export symbols that are defined. Also, in kernel/kprobes.c there's an EXPORT_SYMBOL_GPL() for jprobe_return, which if CONFIG_JPROBES is undefined is a static inline and gives the same error. And in drivers/acpi/resources/rsxface.c, there's an ACPI_EXPORT_SYMBOL() for a static symbol. If it's static, it's not accessible from outside the compilation unit, so should not be exported. These three changes allow building a zx1_defconfig kernel with gcc 4.2 on IA64.
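The shape of the fix is easier to see in a reduced form. The sketch below is not kernel code (EXPORT_SYMBOL_GPL() is replaced by a dummy macro and the config symbol is hard-coded); it only illustrates why an export has to sit inside the same #ifdef as the definition it references:

#include <stdio.h>

/* crude stand-in for EXPORT_SYMBOL_GPL(): it references the symbol,
 * so it can only appear where the symbol is actually defined */
#define EXPORT_SYMBOL_GPL(sym) void *__ksymtab_##sym = (void *)&(sym)

#define CONFIG_PROFILING	/* comment out to mimic the failing config */

#ifdef CONFIG_PROFILING
int profile_event_register(void) { return 0; }
EXPORT_SYMBOL_GPL(profile_event_register);	/* ok: same #ifdef block */
#endif
/* placing EXPORT_SYMBOL_GPL(profile_event_register) here instead, outside
 * the #ifdef, leaves the export referring to a missing symbol once
 * CONFIG_PROFILING is disabled, which is the situation gcc 4.2 rejects */

int main(void)
{
#ifdef CONFIG_PROFILING
	printf("export entry at %p\n", __ksymtab_profile_event_register);
#endif
	return 0;
}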
[akpm@linux-foundation.org: export jpobe_return properly] Signed-off-by: Peter Chubb Cc: Prasanna S Panchamukhi Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: "Luck, Tony" Cc: Len Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/acpi/resources/rsxface.c | 2 -- kernel/kprobes.c | 5 +++++ kernel/profile.c | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/resources/rsxface.c b/drivers/acpi/resources/rsxface.c index f63813a358c..4c3fd4cdaf7 100644 --- a/drivers/acpi/resources/rsxface.c +++ b/drivers/acpi/resources/rsxface.c @@ -474,8 +474,6 @@ acpi_rs_match_vendor_resource(struct acpi_resource *resource, void *context) return (AE_CTRL_TERMINATE); } -ACPI_EXPORT_SYMBOL(acpi_rs_match_vendor_resource) - /******************************************************************************* * * FUNCTION: acpi_walk_resources diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3e9f513a728..4b8a4493c54 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1063,6 +1063,11 @@ EXPORT_SYMBOL_GPL(register_kprobe); EXPORT_SYMBOL_GPL(unregister_kprobe); EXPORT_SYMBOL_GPL(register_jprobe); EXPORT_SYMBOL_GPL(unregister_jprobe); +#ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(jprobe_return); +#endif + +#ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(register_kretprobe); EXPORT_SYMBOL_GPL(unregister_kretprobe); +#endif diff --git a/kernel/profile.c b/kernel/profile.c index 5b20fe977be..cb1e37d2dac 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -199,11 +199,11 @@ EXPORT_SYMBOL_GPL(register_timer_hook); EXPORT_SYMBOL_GPL(unregister_timer_hook); EXPORT_SYMBOL_GPL(task_handoff_register); EXPORT_SYMBOL_GPL(task_handoff_unregister); +EXPORT_SYMBOL_GPL(profile_event_register); +EXPORT_SYMBOL_GPL(profile_event_unregister); #endif /* CONFIG_PROFILING */ -EXPORT_SYMBOL_GPL(profile_event_register); -EXPORT_SYMBOL_GPL(profile_event_unregister); #ifdef CONFIG_SMP /* -- cgit v1.2.3-70-g09d2 From 6707de00fdec3e3225192fe3dcd21323a8936b1f Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 12 Aug 2007 18:08:19 +0200 Subject: sched: make global code static This patch makes the following needlessly global code static: - arch_reinit_sched_domains() - struct attr_sched_mc_power_savings - struct attr_sched_smt_power_savings Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/cpu.h | 2 -- kernel/sched.c | 46 +++++++++++++++++++++++----------------------- 2 files changed, 23 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index c2236bbff41..1d5ded0836e 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -41,8 +41,6 @@ extern void cpu_remove_sysdev_attr(struct sysdev_attribute *attr); extern int cpu_add_sysdev_attr_group(struct attribute_group *attrs); extern void cpu_remove_sysdev_attr_group(struct attribute_group *attrs); -extern struct sysdev_attribute attr_sched_mc_power_savings; -extern struct sysdev_attribute attr_sched_smt_power_savings; extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/sched.c b/kernel/sched.c index 6247e4a8350..c02659f1bd0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6328,7 +6328,7 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -int arch_reinit_sched_domains(void) +static int arch_reinit_sched_domains(void) { int err; @@ -6357,24 
+6357,6 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) return ret ? ret : count; } -int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) -{ - int err = 0; - -#ifdef CONFIG_SCHED_SMT - if (smt_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_smt_power_savings.attr); -#endif -#ifdef CONFIG_SCHED_MC - if (!err && mc_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_mc_power_savings.attr); -#endif - return err; -} -#endif - #ifdef CONFIG_SCHED_MC static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) { @@ -6385,8 +6367,8 @@ static ssize_t sched_mc_power_savings_store(struct sys_device *dev, { return sched_power_savings_store(buf, count, 0); } -SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, - sched_mc_power_savings_store); +static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT @@ -6399,8 +6381,26 @@ static ssize_t sched_smt_power_savings_store(struct sys_device *dev, { return sched_power_savings_store(buf, count, 1); } -SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, - sched_smt_power_savings_store); +static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, + sched_smt_power_savings_store); +#endif + +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +{ + int err = 0; + +#ifdef CONFIG_SCHED_SMT + if (smt_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_smt_power_savings.attr); +#endif +#ifdef CONFIG_SCHED_MC + if (!err && mc_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_mc_power_savings.attr); +#endif + return err; +} #endif /* -- cgit v1.2.3-70-g09d2 From 5d2b3d3695a841231b65b5536a70dc29961c5611 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 12 Aug 2007 18:08:19 +0200 Subject: sched: fix sleeper bonus Peter Ziljstra noticed that the sleeper bonus deduction code was not properly rate-limited: a task that scheduled more frequently would get a disproportionately large deduction. So limit the deduction to delta_exec. 
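With made-up numbers, the effect of the limit looks like this (user-space sketch; the calc_delta_mine() weighting done by the real code is omitted for brevity):

#include <stdio.h>

static unsigned long long min_u64(unsigned long long a, unsigned long long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long long sleeper_bonus = 4000000ULL;	/* 4 ms of outstanding bonus */
	unsigned long long delta_exec = 100000ULL;	/* the task ran only 0.1 ms */

	/* before: the deduction was derived from the whole outstanding bonus */
	unsigned long long before = sleeper_bonus;
	/* after: it is capped by what the task actually executed this time */
	unsigned long long after = min_u64(sleeper_bonus, delta_exec);

	printf("deduction before: %llu ns, after: %llu ns\n", before, after);
	return 0;
}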
Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c5af38948a1..fedbb51bba9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -75,7 +75,7 @@ enum { unsigned int sysctl_sched_features __read_mostly = SCHED_FEAT_FAIR_SLEEPERS *1 | - SCHED_FEAT_SLEEPER_AVG *1 | + SCHED_FEAT_SLEEPER_AVG *0 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | SCHED_FEAT_PRECISE_CPU_LOAD *1 | SCHED_FEAT_START_DEBIT *1 | @@ -304,11 +304,9 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { - delta = calc_delta_mine(cfs_rq->sleeper_bonus, - curr->load.weight, lw); - if (unlikely(delta > cfs_rq->sleeper_bonus)) - delta = cfs_rq->sleeper_bonus; - + delta = min(cfs_rq->sleeper_bonus, (u64)delta_exec); + delta = calc_delta_mine(delta, curr->load.weight, lw); + delta = min((u64)delta, cfs_rq->sleeper_bonus); cfs_rq->sleeper_bonus -= delta; delta_mine -= delta; } @@ -521,6 +519,8 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) * Track the amount of bonus we've given to sleepers: */ cfs_rq->sleeper_bonus += delta_fair; + if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) + cfs_rq->sleeper_bonus = sysctl_sched_runtime_limit; schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } -- cgit v1.2.3-70-g09d2 From de0cf899bbf06b6f64a5dce9c59d74c41b6b4232 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 12 Aug 2007 18:08:19 +0200 Subject: sched: run_rebalance_domains: s/SCHED_IDLE/CPU_IDLE/ rebalance_domains(SCHED_IDLE) looks strange (typo), change it to CPU_IDLE. the effect of this bug was slightly more agressive idle-balancing on SMP than intended. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c02659f1bd0..45e17b83b7f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3106,7 +3106,7 @@ static void run_rebalance_domains(struct softirq_action *h) if (need_resched()) break; - rebalance_domains(balance_cpu, SCHED_IDLE); + rebalance_domains(balance_cpu, CPU_IDLE); rq = cpu_rq(balance_cpu); if (time_after(this_rq->next_balance, rq->next_balance)) -- cgit v1.2.3-70-g09d2 From 496634217e5671ed876a0348e9f5b7165e830b20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 12 Aug 2007 15:46:34 +0000 Subject: genirq: cleanup mismerge artifact Commit 5a43a066b11ac2fe84cf67307f20b83bea390f83: "genirq: Allow fasteoi handler to retrigger disabled interrupts" was erroneously applied to handle_level_irq(). This added the irq retrigger / resend functionality to the level irq handler. Revert the offending bits. 
Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 615ce97c6cf..f1a73f0b54e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -352,13 +352,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * keep it masked and get out of here */ action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) { - desc->status |= IRQ_PENDING; + if (unlikely(!action || (desc->status & IRQ_DISABLED))) goto out_unlock; - } desc->status |= IRQ_INPROGRESS; - desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); -- cgit v1.2.3-70-g09d2 From 2464286ace55b3abddfb9cc30ab95e2dac1de9a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 12 Aug 2007 15:46:35 +0000 Subject: genirq: suppress resend of level interrupts Level type interrupts are resent by the interrupt hardware when they are still active at irq_enable(). Suppress the resend mechanism for interrupts marked as level. Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/irq/resend.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 5bfeaed7e48..a8046791ba2 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -62,7 +62,12 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) */ desc->chip->enable(irq); - if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { + /* + * We do not resend level type interrupts. Level type + * interrupts are resent by hardware when they are still + * active. + */ + if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; if (!desc->chip || !desc->chip->retrigger || -- cgit v1.2.3-70-g09d2 From e598fbaabdb6608915cbc5e80409d70f4f857e5c Mon Sep 17 00:00:00 2001 From: Christian Heim Date: Sun, 19 Aug 2007 13:07:59 +0200 Subject: Remove double inclusion of linux/capability.h Remove the second inclusion of linux/capability.h, which has been introduced with "[PATCH] move capable() to capability.h" (commit c59ede7b78db329949d9cdcd7064e22d357560ef) Signed-off-by: Christian Heim Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8bdb8c07e04..9029690f4fa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-70-g09d2 From 0c5564bd91ad237212871d52deaf79ffe06bcc64 Mon Sep 17 00:00:00 2001 From: Robin Getz Date: Mon, 20 Aug 2007 15:22:47 -0400 Subject: ensure we don't use bootconsoles after init has been released This is a followup to the cleanups for earlyprintk patch from Gerd Hoffmann http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=69331af79cf29e26d1231152a172a1a10c2df511 This ensures that a bootconsole is unregistered if it is not replaced. The current implementation spews garbage out the bootconsole in this case, since the bootconsole structure is normally in the init section, and is freed, but still used. 
Signed-off-by: Robin Getz Acked-by: Gerd Hoffmann Acked-by: Paul Mundt Cc: Mike Frysinger Signed-off-by: Linus Torvalds --- kernel/printk.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index bd2cd062878..5c7c325b29c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1083,6 +1083,17 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); +static int __init disable_boot_consoles(void) +{ + if (console_drivers->flags & CON_BOOT) { + printk(KERN_INFO "turn off boot console %s%d\n", + console_drivers->name, console_drivers->index); + return unregister_console(console_drivers); + } + return 0; +} +late_initcall(disable_boot_consoles); + /** * tty_write_message - write a message to a certain tty, not just the console. * @tty: the destination tty_struct -- cgit v1.2.3-70-g09d2 From cb00e99c0abd844b884c64c6b54aa3b7d345ebb1 Mon Sep 17 00:00:00 2001 From: Robin Getz Date: Tue, 21 Aug 2007 23:14:58 -0400 Subject: fix - ensure we don't use bootconsoles after init has been released Gerd Hoffmann pointed out that my patch from yesterday can lead to a null pointer dereference if the kernel is booted with no console, and no earlyprintk defined. This fixes that issue. Signed-off-by: Robin Getz Signed-off-by: Linus Torvalds --- kernel/printk.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 5c7c325b29c..8451dfc31d2 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1085,10 +1085,12 @@ EXPORT_SYMBOL(unregister_console); static int __init disable_boot_consoles(void) { - if (console_drivers->flags & CON_BOOT) { - printk(KERN_INFO "turn off boot console %s%d\n", - console_drivers->name, console_drivers->index); - return unregister_console(console_drivers); + if (console_drivers != NULL) { + if (console_drivers->flags & CON_BOOT) { + printk(KERN_INFO "turn off boot console %s%d\n", + console_drivers->name, console_drivers->index); + return unregister_console(console_drivers); + } } return 0; } -- cgit v1.2.3-70-g09d2 From 256e2fdf033f5c8b5093cd817d44cea3a11a4e6f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 6 Aug 2007 23:47:45 +0400 Subject: Fix Off-by-one in /sys/module/*/refcnt sysfs internals were changed to not pin module in question. Signed-off-by: Alexey Dobriyan Acked-by: Kay Sievers Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 33c04ad5117..db0ead0363e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -784,8 +784,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module *mod, char *buffer) { - /* sysfs holds a reference */ - return sprintf(buffer, "%u\n", module_refcount(mod)-1); + return sprintf(buffer, "%u\n", module_refcount(mod)); } static struct module_attribute refcnt = { -- cgit v1.2.3-70-g09d2 From 88ae704c2aba150372e3d5c2f017c816773d09a7 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 22 Aug 2007 14:01:05 -0700 Subject: kernel/auditsc.c: fix an off-by-one This patch fixes an off-by-one in a BUG_ON() spotted by the Coverity checker. 
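The bug is the classic capacity-versus-last-valid-index confusion; a minimal standalone sketch (hypothetical names, not the audit code) of why the guard must use >= when the counter is later used as an array index:

#include <assert.h>
#include <stdio.h>

#define AUX_PIDS 16			/* stand-in for AUDIT_AUX_PIDS */

static int target_pid[AUX_PIDS];

static void record_pid(unsigned int count, int pid)
{
	/* a guard of "count > AUX_PIDS" still lets count == AUX_PIDS through;
	 * the corrected check below rejects the first out-of-range index */
	assert(count < AUX_PIDS);	/* same as BUG_ON(count >= AUX_PIDS) */
	target_pid[count] = pid;	/* valid indices are 0 .. AUX_PIDS - 1 */
}

int main(void)
{
	unsigned int i;

	for (i = 0; i < AUX_PIDS; i++)
		record_pid(i, 1000 + (int)i);
	printf("recorded %u pids\n", i);
	return 0;
}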
Signed-off-by: Adrian Bunk Cc: Amy Griffis Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3401293359e..04f3ffb8d9d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2023,7 +2023,7 @@ int __audit_signal_info(int sig, struct task_struct *t) axp->d.next = ctx->aux_pids; ctx->aux_pids = (void *)axp; } - BUG_ON(axp->pid_count > AUDIT_AUX_PIDS); + BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); axp->target_pid[axp->pid_count] = t->tgid; selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); -- cgit v1.2.3-70-g09d2 From 187226f57f1381cfc63216979b4375f30e593795 Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 22 Aug 2007 14:01:10 -0700 Subject: futex_unlock_pi() hurts my brain and may cause application deadlock Avoid futex_unlock_pi returning -EFAULT (which results in deadlock), by clearing uval before jumping to retry_locked. Signed-off-by: John Stultz Acked-by: Steven Rostedt Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 3415e9ad139..e8935b195e8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1670,6 +1670,7 @@ pi_faulted: attempt); if (ret) goto out; + uval = 0; goto retry_unlocked; } -- cgit v1.2.3-70-g09d2 From 8b7f07155f8ee1536da2f9590f1aa9383afefb6b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 22 Aug 2007 14:01:20 -0700 Subject: free_irq(): fix DEBUG_SHIRQ handling If we're going to run the handler from free_irq() then we must do it with local irq's disabled. Otherwise lockdep complains that the handler is taking irq-safe spinlocks in a non-irq-safe fashion. Cc: Ingo Molnar Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 203a518b6f1..853aefbd184 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -462,7 +462,9 @@ void free_irq(unsigned int irq, void *dev_id) * We do this after actually deregistering it, to make sure that * a 'real' IRQ doesn't run in parallel with our fake */ + local_irq_save(flags); handler(irq, dev_id); + local_irq_restore(flags); } #endif } -- cgit v1.2.3-70-g09d2 From 179394af7a2baa1d0a3cb1670075310d72247d38 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Aug 2007 14:01:37 -0700 Subject: posix-timers: fix deletion race timer_delete does: lock_timer(); timer->it_process = NULL; unlock_timer(); release_posix_timer(); timer->it_process is checked in lock_timer() to prevent access to a timer, which is on the way to be deleted, but the check happens after idr_lock is dropped. This allows release_posix_timer() to delete the timer before the lock code can check the timer: CPU 0 CPU 1 lock_timer(); timer->it_process = NULL; unlock_timer(); lock_timer() spin_lock(idr_lock); timer = idr_find(); spin_lock(timer->lock); spin_unlock(idr_lock); release_posix_timer(); spin_lock(idr_lock); idr_remove(timer); spin_unlock(idr_lock); free_timer(timer); if (timer->......) Change the locking to prevent this. 
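A userspace sketch of the corrected ordering (pthread mutexes stand in for the spinlocks and a flat array stands in for the idr; all names are simplified stand-ins, not the kernel implementation): the identity and liveness checks are done while the lookup lock is still held, so the delete-and-free path can no longer slip in between the lookup and the check.

#include <pthread.h>
#include <stdio.h>

#define NTIMERS 4

struct k_itimer {
	pthread_mutex_t it_lock;
	int it_id;
	int it_valid;			/* stands in for the it_process check */
};

static pthread_mutex_t idr_lock = PTHREAD_MUTEX_INITIALIZER;
static struct k_itimer timers[NTIMERS];

static struct k_itimer *lock_timer(int timer_id)
{
	struct k_itimer *timr = NULL;

	pthread_mutex_lock(&idr_lock);
	if (timer_id >= 0 && timer_id < NTIMERS)
		timr = &timers[timer_id];
	if (timr) {
		pthread_mutex_lock(&timr->it_lock);
		if (timr->it_id != timer_id || !timr->it_valid) {
			/* stale entry: drop both locks and fail */
			pthread_mutex_unlock(&timr->it_lock);
			pthread_mutex_unlock(&idr_lock);
			return NULL;
		}
		/* only a validated timer releases the lookup lock */
		pthread_mutex_unlock(&idr_lock);
		return timr;		/* returned with it_lock held */
	}
	pthread_mutex_unlock(&idr_lock);
	return NULL;
}

int main(void)
{
	struct k_itimer *t;
	int i;

	for (i = 0; i < NTIMERS; i++) {
		pthread_mutex_init(&timers[i].it_lock, NULL);
		timers[i].it_id = i;
		timers[i].it_valid = 1;
	}
	timers[3].it_valid = 0;		/* pretend timer 3 is being deleted */

	t = lock_timer(1);
	printf("lock_timer(1): %s\n", t ? "locked" : "gone");
	if (t)
		pthread_mutex_unlock(&t->it_lock);

	t = lock_timer(3);
	printf("lock_timer(3): %s\n", t ? "locked" : "gone");
	return 0;
}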
Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 55b3761edaa..6923ad8a598 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -605,13 +605,14 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); if (timr) { spin_lock(&timr->it_lock); - spin_unlock(&idr_lock); if ((timr->it_id != timer_id) || !(timr->it_process) || timr->it_process->tgid != current->tgid) { - unlock_timer(timr, *flags); + spin_unlock(&timr->it_lock); + spin_unlock_irqrestore(&idr_lock, *flags); timr = NULL; - } + } else + spin_unlock(&idr_lock); } else spin_unlock_irqrestore(&idr_lock, *flags); -- cgit v1.2.3-70-g09d2 From d02479bdeb1c9b037892061cdcf4e730183391fa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Aug 2007 14:01:37 -0700 Subject: posix-timers: fix creation race sys_timer_create() sets ->it_process and unlocks ->siglock, then checks tmr->it_sigev_notify to define if get_task_struct() is needed. We already passed ->it_id to the caller, another thread can delete this timer and free its memory in between. As a minimal fix, move this code under ->siglock, sys_timer_delete() takes it too before calling release_posix_timer(). A proper serialization would be to take ->it_lock, we add a partly initialized timer on posix_timers_id, not good. Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 6923ad8a598..7a15afb73ed 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -547,9 +547,9 @@ sys_timer_create(const clockid_t which_clock, new_timer->it_process = process; list_add(&new_timer->list, &process->signal->posix_timers); - spin_unlock_irqrestore(&process->sighand->siglock, flags); if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) get_task_struct(process); + spin_unlock_irqrestore(&process->sighand->siglock, flags); } else { spin_unlock_irqrestore(&process->sighand->siglock, flags); process = NULL; -- cgit v1.2.3-70-g09d2 From 834d216e1f804560bd1421c511ad168d7c24b01d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Aug 2007 14:01:42 -0700 Subject: signalfd: fix interaction with posix-timers dequeue_signal: if (__SI_TIMER) { spin_unlock(&tsk->sighand->siglock); do_schedule_next_timer(info); spin_lock(&tsk->sighand->siglock); } Unless tsk == curent, this is absolutely unsafe: nothing prevents tsk from exiting. If signalfd was passed to another process, do_schedule_next_timer() is just wrong. Add yet another "tsk == current" check into dequeue_signal(). This patch fixes an oopsable bug, but breaks the scheduling of posix timers if the shared __SI_TIMER signal was fetched via signalfd attached to another sub-thread. Mostly fixed by the next patch. 
Signed-off-by: Oleg Nesterov Cc: Benjamin Herrenschmidt Cc: Davide Libenzi Cc: Ingo Molnar Cc: Michael Kerrisk Cc: Roland McGrath Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index b27c01a6644..ad63109e413 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -378,7 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ - if (tsk == current) + if (likely(tsk == current)) signr = __dequeue_signal(&tsk->pending, mask, info); if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, @@ -425,7 +425,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; } - if ( signr && + if (signr && likely(tsk == current) && ((info->si_code & __SI_MASK) == __SI_TIMER) && info->si_sys_private){ /* -- cgit v1.2.3-70-g09d2 From 2aa44d0567ed21b47b87d68819415d48194cb923 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Aug 2007 15:18:02 +0200 Subject: sched: sched_clock_idle_[sleep|wakeup]_event() construct a more or less wall-clock time out of sched_clock(), by using ACPI-idle's existing knowledge about how much time we spent idling. This allows the rq clock to work around TSC-stops-in-C2, TSC-gets-corrupted-in-C3 type of problems. ( Besides the scheduler's statistics this also benefits blktrace and printk-timestamps as well. ) Furthermore, the precise before-C2/C3-sleep and after-C2/C3-wakeup callbacks allow the scheduler to get out the most of the period where the CPU has a reliable TSC. This results in slightly more precise task statistics. the ACPI bits were acked by Len. Signed-off-by: Ingo Molnar Acked-by: Len Brown --- arch/i386/kernel/tsc.c | 1 - drivers/acpi/processor_idle.c | 32 +++++++++++++++++++++++++------- include/linux/sched.h | 3 ++- kernel/sched.c | 41 ++++++++++++++++++++++++++++++++--------- kernel/sched_debug.c | 3 ++- 5 files changed, 61 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index debd7dbb415..a39280b4dd3 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -292,7 +292,6 @@ static struct clocksource clocksource_tsc = { void mark_tsc_unstable(char *reason) { - sched_clock_unstable_event(); if (!tsc_unstable) { tsc_unstable = 1; tsc_enabled = 0; diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index a8634a0655f..d9b8af763e1 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -63,6 +63,7 @@ ACPI_MODULE_NAME("processor_idle"); #define ACPI_PROCESSOR_FILE_POWER "power" #define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000) +#define PM_TIMER_TICK_NS (1000000000ULL/PM_TIMER_FREQUENCY) #define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */ #define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */ static void (*pm_idle_save) (void) __read_mostly; @@ -462,6 +463,9 @@ static void acpi_processor_idle(void) * TBD: Can't get time duration while in C1, as resumes * go to an ISR rather than here. Need to instrument * base interrupt handler. + * + * Note: the TSC better not stop in C1, sched_clock() will + * skew otherwise. 
*/ sleep_ticks = 0xFFFFFFFF; break; @@ -469,6 +473,8 @@ static void acpi_processor_idle(void) case ACPI_STATE_C2: /* Get start time (ticks) */ t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); + /* Tell the scheduler that we are going deep-idle: */ + sched_clock_idle_sleep_event(); /* Invoke C2 */ acpi_state_timer_broadcast(pr, cx, 1); acpi_cstate_enter(cx); @@ -479,17 +485,22 @@ static void acpi_processor_idle(void) /* TSC halts in C2, so notify users */ mark_tsc_unstable("possible TSC halt in C2"); #endif + /* Compute time (ticks) that we were actually asleep */ + sleep_ticks = ticks_elapsed(t1, t2); + + /* Tell the scheduler how much we idled: */ + sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS); + /* Re-enable interrupts */ local_irq_enable(); + /* Do not account our idle-switching overhead: */ + sleep_ticks -= cx->latency_ticks + C2_OVERHEAD; + current_thread_info()->status |= TS_POLLING; - /* Compute time (ticks) that we were actually asleep */ - sleep_ticks = - ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; acpi_state_timer_broadcast(pr, cx, 0); break; case ACPI_STATE_C3: - /* * disable bus master * bm_check implies we need ARB_DIS @@ -518,6 +529,8 @@ static void acpi_processor_idle(void) t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); /* Invoke C3 */ acpi_state_timer_broadcast(pr, cx, 1); + /* Tell the scheduler that we are going deep-idle: */ + sched_clock_idle_sleep_event(); acpi_cstate_enter(cx); /* Get end time (ticks) */ t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); @@ -531,12 +544,17 @@ static void acpi_processor_idle(void) /* TSC halts in C3, so notify users */ mark_tsc_unstable("TSC halts in C3"); #endif + /* Compute time (ticks) that we were actually asleep */ + sleep_ticks = ticks_elapsed(t1, t2); + /* Tell the scheduler how much we idled: */ + sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS); + /* Re-enable interrupts */ local_irq_enable(); + /* Do not account our idle-switching overhead: */ + sleep_ticks -= cx->latency_ticks + C3_OVERHEAD; + current_thread_info()->status |= TS_POLLING; - /* Compute time (ticks) that we were actually asleep */ - sleep_ticks = - ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD; acpi_state_timer_broadcast(pr, cx, 0); break; diff --git a/include/linux/sched.h b/include/linux/sched.h index 682ef87da6e..1845b2e99a8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1388,7 +1388,8 @@ extern void sched_exec(void); #define sched_exec() {} #endif -extern void sched_clock_unstable_event(void); +extern void sched_clock_idle_sleep_event(void); +extern void sched_clock_idle_wakeup_event(u64 delta_ns); #ifdef CONFIG_HOTPLUG_CPU extern void idle_task_exit(void); diff --git a/kernel/sched.c b/kernel/sched.c index 45e17b83b7f..48e7586168e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -262,7 +262,8 @@ struct rq { s64 clock_max_delta; unsigned int clock_warps, clock_overflows; - unsigned int clock_unstable_events; + u64 idle_clock; + unsigned int clock_deep_idle_events; u64 tick_timestamp; atomic_t nr_iowait; @@ -556,18 +557,40 @@ static inline struct rq *this_rq_lock(void) } /* - * CPU frequency is/was unstable - start new by setting prev_clock_raw: + * We are going deep-idle (irqs are disabled): */ -void sched_clock_unstable_event(void) +void sched_clock_idle_sleep_event(void) { - unsigned long flags; - struct rq *rq; + struct rq *rq = cpu_rq(smp_processor_id()); - rq = task_rq_lock(current, &flags); - rq->prev_clock_raw = sched_clock(); - rq->clock_unstable_events++; - task_rq_unlock(rq, 
&flags); + spin_lock(&rq->lock); + __update_rq_clock(rq); + spin_unlock(&rq->lock); + rq->clock_deep_idle_events++; +} +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + struct rq *rq = cpu_rq(smp_processor_id()); + u64 now = sched_clock(); + + rq->idle_clock += delta_ns; + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the + * rq clock: + */ + spin_lock(&rq->lock); + rq->prev_clock_raw = now; + rq->clock += delta_ns; + spin_unlock(&rq->lock); } +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); /* * resched_task - mark a task 'to be rescheduled now'. diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 87e524762b8..ab18f45f2ab 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu) P(next_balance); P(curr->pid); P(clock); + P(idle_clock); P(prev_clock_raw); P(clock_warps); P(clock_overflows); - P(clock_unstable_events); + P(clock_deep_idle_events); P(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); -- cgit v1.2.3-70-g09d2 From c57baf1e1e24b004b57d282267542baab802753c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 23 Aug 2007 15:18:02 +0200 Subject: sched: fix sysctl directory permissions There are two remaining gotchas: - The directories have impossible permissions (writeable). - The ctl_name for the kernel directory is inconsistent with everything else. It should be CTL_KERN. Signed-off-by: Eric W. Biederman Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 48e7586168e..5fecbbba12a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5257,15 +5257,16 @@ static void migrate_dead_tasks(unsigned int dead_cpu) static struct ctl_table sd_ctl_dir[] = { { .procname = "sched_domain", - .mode = 0755, + .mode = 0555, }, {0,}, }; static struct ctl_table sd_ctl_root[] = { { + .ctl_name = CTL_KERN, .procname = "kernel", - .mode = 0755, + .mode = 0555, .child = sd_ctl_dir, }, {0,}, @@ -5341,7 +5342,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) for_each_domain(cpu, sd) { snprintf(buf, 32, "domain%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0755; + entry->mode = 0555; entry->child = sd_alloc_ctl_domain_table(sd); entry++; i++; @@ -5361,7 +5362,7 @@ static void init_sched_domain_sysctl(void) for (i = 0; i < cpu_num; i++, entry++) { snprintf(buf, 32, "cpu%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0755; + entry->mode = 0555; entry->child = sd_alloc_ctl_cpu_table(i); } sd_sysctl_header = register_sysctl_table(sd_ctl_root); -- cgit v1.2.3-70-g09d2 From f8700df7c419781efb34696de7e7f49717f8ede7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 23 Aug 2007 15:18:02 +0200 Subject: sched: fix broken SMT/MC optimizations On a four package system with HT - HT load balancing optimizations were broken. For example, if two tasks end up running on two logical threads of one of the packages, scheduler is not able to pull one of the tasks to a completely idle package. In this scenario, for nice-0 tasks, imbalance calculated by scheduler will be 512 and find_busiest_queue() will return 0 (as each cpu's load is 1024 > imbalance and has only one task running). 
Similarly MC scheduler optimizations also get fixed with this patch. [ mingo@elte.hu: restored fair balancing by increasing the fuzz and adding it back to the power decision, without the /2 factor. ] Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1845b2e99a8..ba78807eab9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -681,7 +681,7 @@ enum cpu_idle_type { #define SCHED_LOAD_SHIFT 10 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) -#define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 1) +#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE #ifdef CONFIG_SMP #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ diff --git a/kernel/sched.c b/kernel/sched.c index 5fecbbba12a..d96030db8ff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2517,7 +2517,7 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { + if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; -- cgit v1.2.3-70-g09d2 From f549da848eca595abca14ebc5e1bf00fd72aa53d Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 23 Aug 2007 15:18:02 +0200 Subject: sched: skip updating rq's next_balance under null SD Was playing with sched_smt_power_savings/sched_mc_power_savings and found out that while the scheduler domains are reconstructed when sysfs settings change, rebalance_domains() can get triggered with null domain on other cpus, which is setting next_balance to jiffies + 60*HZ. Resulting in no idle/busy balancing for 60 seconds. Fix this. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d96030db8ff..a4b22d93e00 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3043,6 +3043,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3079,8 +3080,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) if (sd->flags & SD_SERIALIZE) spin_unlock(&balancing); out: - if (time_after(next_balance, sd->last_balance + interval)) + if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; + update_next_balance = 1; + } /* * Stop the load balance at this level. There is another @@ -3090,7 +3093,14 @@ out: if (!balance) break; } - rq->next_balance = next_balance; + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + rq->next_balance = next_balance; } /* -- cgit v1.2.3-70-g09d2 From 505c0efd58031923ae01deac16d896607cafa70e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Aug 2007 15:18:02 +0200 Subject: sched: tweak the sched_runtime_limit tunable Michael Gerdau reported reniced task CPU usage weirdnesses. Such symptoms can be caused by limit underruns so double the sched_runtime_limit. 
Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a4b22d93e00..96e9b82246d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4923,7 +4923,7 @@ static inline void sched_init_granularity(void) if (sysctl_sched_granularity > gran_limit) sysctl_sched_granularity = gran_limit; - sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; + sysctl_sched_runtime_limit = sysctl_sched_granularity * 8; sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; } -- cgit v1.2.3-70-g09d2 From 7c6c16f354cde4a48bd305b2587fc78257bcb936 Mon Sep 17 00:00:00 2001 From: Bruce Ashfield Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: CONFIG_SCHED_GROUP_FAIR=y fixlet when I built with CONFIG_FAIR_GROUP_SCHED=y, I need the following change to make things right. [ From: mingo@elte.hu ] this config option is not upstream-configurable right now but lets fix this for completeness. Signed-off-by: Bruce Ashfield Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fedbb51bba9..b5270dc98be 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1057,7 +1057,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) */ static void set_curr_task_fair(struct rq *rq) { - struct sched_entity *se = &rq->curr.se; + struct sched_entity *se = &rq->curr->se; for_each_sched_entity(se) set_next_entity(cfs_rq_of(se), se); -- cgit v1.2.3-70-g09d2 From 71fd37146385c8255bfd370f33ca81fe8c81e5a5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: remove HZ dependency from the granularity default remove HZ dependency from the granularity default. Use 10 msec for the base granularity, 1 msec for wakeup granularity and 25 msec for batch wakeup granularity. (These defaults are close to the values that the default HZ=250 setting got previously, and thus it's the most common setting.) Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 96e9b82246d..e95ff22ed17 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4923,7 +4923,7 @@ static inline void sched_init_granularity(void) if (sysctl_sched_granularity > gran_limit) sysctl_sched_granularity = gran_limit; - sysctl_sched_runtime_limit = sysctl_sched_granularity * 8; + sysctl_sched_runtime_limit = sysctl_sched_granularity * 5; sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b5270dc98be..6b0974c3fb6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -19,7 +19,7 @@ /* * Preemption granularity: - * (default: 2 msec, units: nanoseconds) + * (default: 10 msec, units: nanoseconds) * * NOTE: this granularity value is not the same as the concept of * 'timeslice length' - timeslices in CFS will typically be somewhat @@ -31,18 +31,17 @@ * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) */ -unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; +unsigned int sysctl_sched_granularity __read_mostly = 10000000UL; /* * SCHED_BATCH wake-up granularity. 
- * (default: 10 msec, units: nanoseconds) + * (default: 25 msec, units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = - 10000000000ULL/HZ; +unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; /* * SCHED_OTHER wake-up granularity. @@ -52,12 +51,12 @@ unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; +unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; unsigned int sysctl_sched_stat_granularity __read_mostly; /* - * Initialized in sched_init_granularity(): + * Initialized in sched_init_granularity() [to 5 times the base granularity]: */ unsigned int sysctl_sched_runtime_limit __read_mostly; -- cgit v1.2.3-70-g09d2 From deac4ee65af4befb66b542e4a782e63da93b51a0 Mon Sep 17 00:00:00 2001 From: Sven-Thorsten Dietrich Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: simplify can_migrate_task() Remove trivial conditional branch in Linux scheduler's can_migrate_task() function. text data bss dec hex filename 34770 2998 24 37792 93a0 sched.o.before 34757 2998 24 37779 9393 sched.o.after Signed-off-by: Sven-Thorsten Dietrich Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e95ff22ed17..6798328a2e0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2180,12 +2180,6 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, if (task_running(rq, p)) return 0; - /* - * Aggressive migration if too many balance attempts have failed: - */ - if (sd->nr_balance_failed > sd->cache_nice_tries) - return 1; - return 1; } -- cgit v1.2.3-70-g09d2 From 98fbc798533339be802c6dcd48c2293c712e87db Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: optimize task_tick_rt() a bit Mitchell Erblich suggested a quality-of-implementation change to not requeue SCHED_RR tasks if there's only a single task on the runqueue, by checking for rq->nr_running == 1. provide a more efficient implementation of that, to check that particular RT priority-queue only. [ From: mingo@elte.hu ] Also first requeue the task then set need_resched - results in slightly better machine-instruction ordering. Also clean up the code a bit. 
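The "only element on the queue" test relies on a property of the circular doubly-linked run list: when an entry is the sole element, its prev and next both point back at the list head. A small standalone sketch (simplified list helpers in the spirit of list.h, not the kernel's implementation):

#include <stdio.h>

struct list_head {
	struct list_head *prev, *next;
};

static void list_init(struct list_head *h)
{
	h->prev = h->next = h;
}

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

int main(void)
{
	struct list_head queue, a, b;

	list_init(&queue);
	list_add_tail(&a, &queue);
	/* sole element: both neighbours are the head, requeueing is a no-op */
	printf("one task:  prev != next -> %d\n", a.prev != a.next);

	list_add_tail(&b, &queue);
	/* a second element appears on one side, so requeueing now matters */
	printf("two tasks: prev != next -> %d\n", a.prev != a.next);
	return 0;
}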
Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index dcdcad632fd..4b87476a02d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -207,10 +207,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) return; p->time_slice = static_prio_timeslice(p->static_prio); - set_tsk_need_resched(p); - /* put it at the end of the queue: */ - requeue_task_rt(rq, p); + /* + * Requeue to the end of queue if we are not the only element + * on the queue: + */ + if (p->run_list.prev != p->run_list.next) { + requeue_task_rt(rq, p); + set_tsk_need_resched(p); + } } static struct sched_class rt_sched_class __read_mostly = { -- cgit v1.2.3-70-g09d2 From b2133c8b1e270b4a7c36f70e29be8738d09e850b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: tidy up and simplify the bonus balance make the bonus balance more consistent: do not hand out a bonus if there's too much in flight already, and only deduct as much from a runner as it has the capacity. This makes the bonus engine a zero-sum game (as intended). this also simplifies the code: text data bss dec hex filename 34770 2998 24 37792 93a0 sched.o.before 34749 2998 24 37771 938b sched.o.after and it also avoids overscheduling in sleep-happy workloads like hackbench.c. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6b0974c3fb6..c578370cd69 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -306,6 +306,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta = min(cfs_rq->sleeper_bonus, (u64)delta_exec); delta = calc_delta_mine(delta, curr->load.weight, lw); delta = min((u64)delta, cfs_rq->sleeper_bonus); + delta = min(delta, (unsigned long)( + (long)sysctl_sched_runtime_limit - curr->wait_runtime)); cfs_rq->sleeper_bonus -= delta; delta_mine -= delta; } @@ -493,6 +495,13 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) unsigned long load = cfs_rq->load.weight, delta_fair; long prev_runtime; + /* + * Do not boost sleepers if there's too much bonus 'in flight' + * already: + */ + if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) + return; + if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) load = rq_of(cfs_rq)->cpu_load[2]; @@ -512,16 +521,13 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) prev_runtime = se->wait_runtime; __add_wait_runtime(cfs_rq, se, delta_fair); + schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); delta_fair = se->wait_runtime - prev_runtime; /* * Track the amount of bonus we've given to sleepers: */ cfs_rq->sleeper_bonus += delta_fair; - if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) - cfs_rq->sleeper_bonus = sysctl_sched_runtime_limit; - - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -- cgit v1.2.3-70-g09d2 From a6f2994042cc2db9e507dc702ed0b5e2cc5890fe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: simplify bonus calculation #1 current code: delta = min(cfs_rq->sleeper_bonus, (u64)delta_exec); delta = calc_delta_mine(delta, curr->load.weight, lw); delta = min((u64)delta, 
cfs_rq->sleeper_bonus); drop the first min(), because we clip against sleeper_bonus in the 3rd line again. That gives: delta = calc_delta_mine(delta_exec, curr->load.weight, lw); delta = min((u64)delta, cfs_rq->sleeper_bonus); Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c578370cd69..5b2d97fcd80 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -303,8 +303,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { - delta = min(cfs_rq->sleeper_bonus, (u64)delta_exec); - delta = calc_delta_mine(delta, curr->load.weight, lw); + delta = calc_delta_mine(delta_exec, curr->load.weight, lw); delta = min((u64)delta, cfs_rq->sleeper_bonus); delta = min(delta, (unsigned long)( (long)sysctl_sched_runtime_limit - curr->wait_runtime)); -- cgit v1.2.3-70-g09d2 From ea0aa3b23a193d1fc5c982286edecd071af67d94 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: simplify bonus calculation #2 current code: delta = calc_delta_mine(delta_exec, curr->load.weight, lw); delta = min((u64)delta, cfs_rq->sleeper_bonus); Notice that this calc_delta_mine() line is exactly delta_mine, which gives: delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5b2d97fcd80..c078f1af721 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -303,8 +303,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { - delta = calc_delta_mine(delta_exec, curr->load.weight, lw); - delta = min((u64)delta, cfs_rq->sleeper_bonus); + delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); delta = min(delta, (unsigned long)( (long)sysctl_sched_runtime_limit - curr->wait_runtime)); cfs_rq->sleeper_bonus -= delta; -- cgit v1.2.3-70-g09d2 From 095e56c7036fe97bc3ebcd80ed6e121be0847656 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Aug 2007 20:39:10 +0200 Subject: sched: fix startup penalty calculation fix task startup penalty miscalculation: sysctl_sched_granularity is unsigned int and wait_runtime is long so we first have to convert it to long before turning it negative ... 
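A minimal standalone illustration of that sign bug (not the scheduler code; the constant mirrors the 2 ms default): negating an unsigned int wraps around, so without the cast the intended penalty shows up as a huge positive credit once it lands in a 64-bit signed field.

#include <stdio.h>

int main(void)
{
	unsigned int granularity = 2000000;	/* 2 ms in ns, like the tunable */

	/* unsigned negation wraps: about 4.29e9 when long is 64-bit (LP64) */
	long bad  = -(granularity / 2);

	/* cast first, then negate: the intended -1000000 */
	long good = -((long)granularity / 2);

	printf("without cast: %ld\nwith cast:    %ld\n", bad, good);
	return 0;
}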
Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c078f1af721..4d6b7e2df2a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1047,7 +1047,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * -granularity/2, so initialize the task with that: */ if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - p->se.wait_runtime = -(sysctl_sched_granularity / 2); + p->se.wait_runtime = -((long)sysctl_sched_granularity / 2); __enqueue_entity(cfs_rq, se); } -- cgit v1.2.3-70-g09d2 From 1fc84aaae3bae9646dd4c7798b8c0ff934338909 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Aug 2007 18:41:52 +0200 Subject: sched: fix CONFIG_SCHED_DEBUG dependency of lockdep sysctls Make the lockdep sysctls not depend on CONFIG_SCHED_DEBUG. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9029690f4fa..ea90ef51085 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -283,6 +283,15 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_features", + .data = &sysctl_sched_features, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .ctl_name = CTL_UNNUMBERED, @@ -302,15 +311,6 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#endif - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_features", - .data = &sysctl_sched_features, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, #endif { .ctl_name = KERN_PANIC, -- cgit v1.2.3-70-g09d2 From 218050855ece4e923106ab614ac65afa0f618df3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Aug 2007 18:41:53 +0200 Subject: sched: adaptive scheduler granularity Instead of specifying the preemption granularity, specify the wanted latency. By fixing the granularity to a constant, the wakeup latency is a function of the number of running tasks on the rq. Invert this relation. sysctl_sched_granularity becomes a minimum for the dynamic granularity computed from the new sysctl_sched_latency. Then use this latency to do more intelligent granularity decisions: if there are fewer tasks running then we can schedule coarser. This helps performance while still always keeping the latency target.
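Read as a standalone sketch (a hedged userspace re-statement of the sched_granularity() helper this patch adds, not the patch itself), the dynamic rule divides the latency target among the runnable tasks and clamps the result at the minimum granularity:

#include <stdio.h>

static unsigned int dyn_granularity(unsigned int latency,
				    unsigned int min_gran,
				    unsigned int nr_running)
{
	unsigned int gran = latency;

	if (nr_running > 1) {
		/* gran = lat/nr - lat/nr/nr, floored at the minimum */
		gran = gran / nr_running - gran / nr_running / nr_running;
		if (gran < min_gran)
			gran = min_gran;
	}
	return gran;
}

int main(void)
{
	unsigned int latency = 20000000, min_gran = 2000000;	/* 20 ms, 2 ms */
	unsigned int nr;

	for (nr = 1; nr <= 16; nr *= 2)
		printf("nr_running=%2u -> granularity=%u ns\n",
		       nr, dyn_granularity(latency, min_gran, nr));
	return 0;
}

With few runnable tasks the effective granularity stays coarse; as the run queue grows it shrinks toward the 2 ms floor, so every task is still scheduled within roughly one latency period.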
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 14 ++++++---- kernel/sched_fair.c | 77 +++++++++++++++++++++++++++++++++++++++++++-------- kernel/sysctl.c | 11 ++++++++ 4 files changed, 86 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index ba78807eab9..322764e0405 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1399,6 +1399,7 @@ static inline void idle_task_exit(void) {} extern void sched_idle_next(void); +extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; diff --git a/kernel/sched.c b/kernel/sched.c index 6798328a2e0..da26f46d50d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4911,14 +4911,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; static inline void sched_init_granularity(void) { unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long gran_limit = 100000000; + const unsigned long limit = 100000000; sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > gran_limit) - sysctl_sched_granularity = gran_limit; + if (sysctl_sched_granularity > limit) + sysctl_sched_granularity = limit; - sysctl_sched_runtime_limit = sysctl_sched_granularity * 5; - sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; + + sysctl_sched_runtime_limit = sysctl_sched_latency * 5; + sysctl_sched_wakeup_granularity = sysctl_sched_latency / 2; } #ifdef CONFIG_SMP diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4d6b7e2df2a..0ba1e60f08d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -15,23 +15,32 @@ * * Scaled math optimizations by Thomas Gleixner * Copyright (C) 2007, Thomas Gleixner + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ /* - * Preemption granularity: - * (default: 10 msec, units: nanoseconds) + * Targeted preemption latency for CPU-bound tasks: + * (default: 20ms, units: nanoseconds) * - * NOTE: this granularity value is not the same as the concept of - * 'timeslice length' - timeslices in CFS will typically be somewhat - * larger than this value. (to see the precise effective timeslice - * length of your workload, run vmstat and monitor the context-switches - * field) + * NOTE: this latency value is not the same as the concept of + * 'timeslice length' - timeslices in CFS are of variable length. + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches field) * * On SMP systems the value of this is multiplied by the log2 of the * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) + * Targeted preemption latency for CPU-bound tasks: */ -unsigned int sysctl_sched_granularity __read_mostly = 10000000UL; +unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; + +/* + * Minimal preemption granularity for CPU-bound tasks: + * (default: 2 msec, units: nanoseconds) + */ +unsigned int sysctl_sched_granularity __read_mostly = 2000000ULL; /* * SCHED_BATCH wake-up granularity. 
@@ -212,6 +221,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ +/* + * Calculate the preemption granularity needed to schedule every + * runnable task once per sysctl_sched_latency amount of time. + * (down to a sensible low limit on granularity) + * + * For example, if there are 2 tasks running and latency is 10 msecs, + * we switch tasks every 5 msecs. If we have 3 tasks running, we have + * to switch tasks every 3.33 msecs to get a 10 msecs observed latency + * for each task. We do finer and finer scheduling up to until we + * reach the minimum granularity value. + * + * To achieve this we use the following dynamic-granularity rule: + * + * gran = lat/nr - lat/nr/nr + * + * This comes out of the following equations: + * + * kA1 + gran = kB1 + * kB2 + gran = kA2 + * kA2 = kA1 + * kB2 = kB1 - d + d/nr + * lat = d * nr + * + * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), + * '1' is start of time, '2' is end of time, 'd' is delay between + * 1 and 2 (during which task B was running), 'nr' is number of tasks + * running, 'lat' is the the period of each task. ('lat' is the + * sched_latency that we aim for.) + */ +static long +sched_granularity(struct cfs_rq *cfs_rq) +{ + unsigned int gran = sysctl_sched_latency; + unsigned int nr = cfs_rq->nr_running; + + if (nr > 1) { + gran = gran/nr - gran/nr/nr; + gran = max(gran, sysctl_sched_granularity); + } + + return gran; +} + /* * We rescale the rescheduling granularity of tasks according to their * nice level, but only linearly, not exponentially: @@ -302,7 +354,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_fair = calc_delta_fair(delta_exec, lw); delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { + if (cfs_rq->sleeper_bonus > sysctl_sched_latency) { delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); delta = min(delta, (unsigned long)( (long)sysctl_sched_runtime_limit - curr->wait_runtime)); @@ -689,7 +741,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); + __check_preempt_curr_fair(cfs_rq, next, curr, + sched_granularity(cfs_rq)); } /************************************************** @@ -1034,7 +1087,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * it will preempt the parent: */ p->se.fair_key = current->se.fair_key - - niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; + niced_granularity(&rq->curr->se, sched_granularity(cfs_rq)) - 1; /* * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: @@ -1047,7 +1100,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * -granularity/2, so initialize the task with that: */ if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - p->se.wait_runtime = -((long)sysctl_sched_granularity / 2); + p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2); __enqueue_entity(cfs_rq, se); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ea90ef51085..9e3d2960faf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -231,6 +231,17 @@ static ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, + .maxlen = sizeof(unsigned int), + .mode = 0644, + 
.proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_wakeup_granularity_ns", -- cgit v1.2.3-70-g09d2 From 172ac3dbb7d3e528ac53d08a34df88d1ac53c534 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Aug 2007 18:41:53 +0200 Subject: sched: cleanup, sched_granularity -> sched_min_granularity due to adaptive granularity scheduling the role of sched_granularity has changed to "minimum granularity", so rename the variable (and the tunable) accordingly. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- include/linux/sched.h | 2 +- kernel/sched.c | 6 +++--- kernel/sched_fair.c | 4 ++-- kernel/sysctl.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 322764e0405..bd6a0320a77 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1400,7 +1400,7 @@ static inline void idle_task_exit(void) {} extern void sched_idle_next(void); extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_granularity; +extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_stat_granularity; diff --git a/kernel/sched.c b/kernel/sched.c index da26f46d50d..a40ab657ad1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4913,9 +4913,9 @@ static inline void sched_init_granularity(void) unsigned int factor = 1 + ilog2(num_online_cpus()); const unsigned long limit = 100000000; - sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > limit) - sysctl_sched_granularity = limit; + sysctl_sched_min_granularity *= factor; + if (sysctl_sched_min_granularity > limit) + sysctl_sched_min_granularity = limit; sysctl_sched_latency *= factor; if (sysctl_sched_latency > limit) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0ba1e60f08d..ee3771850aa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -40,7 +40,7 @@ unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; * Minimal preemption granularity for CPU-bound tasks: * (default: 2 msec, units: nanoseconds) */ -unsigned int sysctl_sched_granularity __read_mostly = 2000000ULL; +unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; /* * SCHED_BATCH wake-up granularity. @@ -258,7 +258,7 @@ sched_granularity(struct cfs_rq *cfs_rq) if (nr > 1) { gran = gran/nr - gran/nr/nr; - gran = max(gran, sysctl_sched_granularity); + gran = max(gran, sysctl_sched_min_granularity); } return gran; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9e3d2960faf..6ace893c17c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -222,8 +222,8 @@ static ctl_table kern_table[] = { #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_granularity_ns", - .data = &sysctl_sched_granularity, + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, -- cgit v1.2.3-70-g09d2 From 50c46637aa894f904e2fb39086a3d7732f68bd50 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Aug 2007 22:17:19 +0200 Subject: sched: s/sched_latency/sched_min_granularity runtime limit and wakeup granularity used to be a function of granularity and that was incorrect changed to sched_latency. 
Fix this to make wakeup granularity a function of min-granularity, and the runtime limit equal to latency. Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a40ab657ad1..9fe473a190d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4921,8 +4921,8 @@ static inline void sched_init_granularity(void) if (sysctl_sched_latency > limit) sysctl_sched_latency = limit; - sysctl_sched_runtime_limit = sysctl_sched_latency * 5; - sysctl_sched_wakeup_granularity = sysctl_sched_latency / 2; + sysctl_sched_runtime_limit = sysctl_sched_latency; + sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; } #ifdef CONFIG_SMP -- cgit v1.2.3-70-g09d2 From d243769d3f83b318813a04a9592bb7cfedc6c280 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 27 Aug 2007 16:06:19 +0100 Subject: fix bogus hotplug cpu warning Fix bogus DEBUG_PREEMPT warning on x86_64, when cpu brought online after bootup: current_is_keventd is right to note its use of smp_processor_id is preempt-safe, but should use raw_smp_processor_id to avoid the warning. Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 58e5c152a6b..e080d1d744c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -635,7 +635,7 @@ int keventd_up(void) int current_is_keventd(void) { struct cpu_workqueue_struct *cwq; - int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */ + int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ int ret = 0; BUG_ON(!keventd_wq); -- cgit v1.2.3-70-g09d2 From 5f01d519e60a6ca1a7d9be9f2d73c5f521383992 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: fix sleeper bonus limit There is an Amarok song switch time increase (regression) under hefty load. What is happening is that sleeper_bonus is never consumed, and only rarely goes below runtime_limit, so for the most part, Amarok isn't getting any bonus at all. We're keeping sleeper_bonus right at runtime_limit (sched_latency == sched_runtime_limit == 40ms) forever, ie we don't consume if we're lower that that, and don't add if we're above it. One Amarok thread waking (or anybody else) will push us past the threshold, so the next thread waking gets nada, but will reap pain from the previous thread waking until we drop back to runtime_limit. It looks to me like under load, some random task gets a bonus, and everybody else pays, whether deserving or not. This diff fixed the regression for me at any load rate. 
Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ee3771850aa..9f53d49f3aa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -354,7 +354,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_fair = calc_delta_fair(delta_exec, lw); delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - if (cfs_rq->sleeper_bonus > sysctl_sched_latency) { + if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); delta = min(delta, (unsigned long)( (long)sysctl_sched_runtime_limit - curr->wait_runtime)); -- cgit v1.2.3-70-g09d2 From f6cf891c4d7128f9f91243fc0b9ce99e10fa1586 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: make the scheduler converge to the ideal latency de-HZ-ification of the granularity defaults unearthed a pre-existing property of CFS: while it correctly converges to the granularity goal, it does not prevent run-time fluctuations in the range of [-gran ... 0 ... +gran]. With the increase of the granularity due to the removal of HZ dependencies, this becomes visible in chew-max output (with 5 tasks running): out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40 out: 27 . 27. 32 | flu: 0 . 0 | ran: 17 . 13 | per: 44 . 40 out: 27 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 36 . 40 out: 29 . 27. 32 | flu: 2 . 0 | ran: 17 . 13 | per: 46 . 40 out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40 out: 29 . 27. 32 | flu: 0 . 0 | ran: 18 . 13 | per: 47 . 40 out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40 average slice is the ideal 13 msecs and the period is picture-perfect 40 msecs. But the 'ran' field fluctuates around 13.33 msecs and there's no mechanism in CFS to keep that from happening: it's a perfectly valid solution that CFS finds. to fix this we add a granularity/preemption rule that knows about the "target latency", which makes tasks that run longer than the ideal latency run a bit less. The simplest approach is to simply decrease the preemption granularity when a task overruns its ideal latency. For this we have to track how much the task executed since its last preemption. ( this adds a new field to task_struct, but we can eliminate that overhead in 2.6.24 by putting all the scheduler timestamps into an anonymous union. ) with this change in place, chew-max output is fluctuation-less all around: out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 1 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 1 | ran: 13 . 13 | per: 41 . 40 this patch has no impact on any fastpath or on any globally observable scheduling property. 
(unless you have sharp enough eyes to see millisecond-level ruckles in glxgears smoothness :-) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith --- include/linux/sched.h | 1 + kernel/sched.c | 1 + kernel/sched_fair.c | 26 ++++++++++++++++++++++---- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index bd6a0320a77..f4e324ed2e4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -904,6 +904,7 @@ struct sched_entity { u64 exec_start; u64 sum_exec_runtime; + u64 prev_sum_exec_runtime; u64 wait_start_fair; u64 sleep_start_fair; diff --git a/kernel/sched.c b/kernel/sched.c index 9fe473a190d..b533d6db78a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1587,6 +1587,7 @@ static void __sched_fork(struct task_struct *p) p->se.wait_start_fair = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; p->se.delta_exec = 0; p->se.delta_fair_run = 0; p->se.delta_fair_sleep = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9f53d49f3aa..721fe774487 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -668,7 +668,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) /* * Preempt the current task with a newly woken task if needed: */ -static void +static int __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *curr, unsigned long granularity) { @@ -679,8 +679,11 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, * preempt the current task unless the best task has * a larger than sched_granularity fairness advantage: */ - if (__delta > niced_granularity(curr, granularity)) + if (__delta > niced_granularity(curr, granularity)) { resched_task(rq_of(cfs_rq)->curr); + return 1; + } + return 0; } static inline void @@ -725,6 +728,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { + unsigned long gran, ideal_runtime, delta_exec; struct sched_entity *next; /* @@ -741,8 +745,22 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr, - sched_granularity(cfs_rq)); + gran = sched_granularity(cfs_rq); + ideal_runtime = niced_granularity(curr, + max(sysctl_sched_latency / cfs_rq->nr_running, + (unsigned long)sysctl_sched_min_granularity)); + /* + * If we executed more than what the latency constraint suggests, + * reduce the rescheduling granularity. This way the total latency + * of how much a task is not scheduled converges to + * sysctl_sched_latency: + */ + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) + gran = 0; + + if (__check_preempt_curr_fair(cfs_rq, next, curr, gran)) + curr->prev_sum_exec_runtime = curr->sum_exec_runtime; } /************************************************** -- cgit v1.2.3-70-g09d2 From 7109c4429af3640f79a638f177fc5d05b9807149 Mon Sep 17 00:00:00 2001 From: Ting Yang Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: call update_curr() in task_tick_fair() update the fair-clock before using it for the key value. [ mingo@elte.hu: small cleanups. 
] Signed-off-by: Ting Yang Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra --- kernel/sched_fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 721fe774487..9f06094e527 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1094,10 +1094,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct sched_entity *se = &p->se; + struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); sched_info_queued(p); + update_curr(cfs_rq); update_stats_enqueue(cfs_rq, se); /* * Child runs first: we let it run before the parent @@ -1105,7 +1106,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * it will preempt the parent: */ p->se.fair_key = current->se.fair_key - - niced_granularity(&rq->curr->se, sched_granularity(cfs_rq)) - 1; + niced_granularity(curr, sched_granularity(cfs_rq)) - 1; /* * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: -- cgit v1.2.3-70-g09d2 From b77d69db9f4ba03b2ed17e383c2d73ca89f5ab14 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: fix wait_start_fair condition in update_stats_wait_end() Peter Zijlstra noticed the following bug in SCHED_FEAT_SKIP_INITIAL (which is disabled by default at the moment): it relies on se.wait_start_fair being 0 while update_stats_wait_end() did not recognize a 0 value, so instead of 'skipping' the initial interval we gave the new child a maximum boost of +runtime-limit ... (No impact on the default kernel, but nice to fix for completeness.) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9f06094e527..0c718857176 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -489,6 +489,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long delta_fair; + if (unlikely(!se->wait_start_fair)) + return; + delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), (u64)(cfs_rq->fair_clock - se->wait_start_fair)); -- cgit v1.2.3-70-g09d2 From 213c8af67f21c1dc0d50940b159d9521c95f3c89 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: small schedstat fix small schedstat fix: the cfs_rq->wait_runtime 'sum of all runtimes' statistics counters missed newly forked tasks and thus had a constant negative skew. Fix this. 
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith --- kernel/sched_fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c718857176..75f025da6f7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1121,8 +1121,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * The statistical average of wait_runtime is about * -granularity/2, so initialize the task with that: */ - if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) + if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) { p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2); + schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); + } __enqueue_entity(cfs_rq, se); } -- cgit v1.2.3-70-g09d2 From 9f508f8258e18e9333f18daf1f0860df48d49ed2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: clean up task_new_fair() cleanup: we have the 'se' and 'curr' entity-pointers already, no need to use p->se and current->se. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith --- kernel/sched_fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 75f025da6f7..ce39282d9c0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1108,21 +1108,21 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * until it reschedules once. We set up the key so that * it will preempt the parent: */ - p->se.fair_key = current->se.fair_key - + se->fair_key = curr->fair_key - niced_granularity(curr, sched_granularity(cfs_rq)) - 1; /* * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: */ if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) - p->se.wait_start_fair = 0; + se->wait_start_fair = 0; /* * The statistical average of wait_runtime is about * -granularity/2, so initialize the task with that: */ if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) { - p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2); + se->wait_runtime = -(sched_granularity(cfs_rq) / 2); schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } -- cgit v1.2.3-70-g09d2 From f2ab6d8889422c1f5354f014e8bef337b1d1bade Mon Sep 17 00:00:00 2001 From: Jonathan Lim Date: Thu, 30 Aug 2007 23:56:23 -0700 Subject: Assign task_struct.exit_code before taskstats_exit() taskstats.ac_exitcode is assigned to task_struct.exit_code in bacct_add_tsk() through the following kernel function calls: do_exit() taskstats_exit() fill_pid() bacct_add_tsk() The problem is that in do_exit(), task_struct.exit_code is set to 'code' only after taskstats_exit() has been called. So we need to move the assignment before taskstats_exit(). 
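To make the ordering dependency concrete, here is a small stand-alone model (mocked-up structures, illustrative only; the real code lives in kernel/exit.c and the taskstats accounting path): the accounting hook copies exit_code, so the assignment has to happen before the hook runs.

#include <stdio.h>

/* Mock types: field names follow the kernel, everything else is a stand-in. */
struct taskstats   { long ac_exitcode; };
struct task_struct { long exit_code; };

static struct taskstats stats;

static void bacct_add_tsk(struct taskstats *ts, struct task_struct *tsk)
{
	ts->ac_exitcode = tsk->exit_code;	/* reads exit_code */
}

static void taskstats_exit(struct task_struct *tsk)
{
	/* kernel path: taskstats_exit() -> fill_pid() -> bacct_add_tsk() */
	bacct_add_tsk(&stats, tsk);
}

int main(void)
{
	struct task_struct tsk = { .exit_code = 0 };
	long code = 0x100;			/* pretend exit status */

	/* Old order: calling taskstats_exit(&tsk) first would record 0. */
	/* Fixed order, as in the patch below:                           */
	tsk.exit_code = code;
	taskstats_exit(&tsk);

	printf("ac_exitcode = %ld\n", stats.ac_exitcode);
	return 0;
}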
Signed-off-by: Jonathan Lim Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9578c1ae19c..06b24b3aa37 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -975,6 +975,7 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(tsk->audit_context)) audit_free(tsk); + tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); @@ -996,7 +997,6 @@ fastcall NORET_TYPE void do_exit(long code) if (tsk->binfmt) module_put(tsk->binfmt->module); - tsk->exit_code = code; proc_exit_connector(tsk); exit_task_namespaces(tsk); exit_notify(tsk); -- cgit v1.2.3-70-g09d2 From b07e35f94a7b6a059f889b904529ee907dc0634d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 30 Aug 2007 23:56:27 -0700 Subject: setpgid(child) fails if the child was forked by sub-thread Spotted by Marcin Kowalczyk . sys_setpgid(child) fails if the child was forked by sub-thread. Fix the "is it our child" check. The previous commit ee0acf90d320c29916ba8c5c1b2e908d81f5057d was not complete. (this patch asks for the new same_thread_group() helper, but mainline doesn't have it yet). Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Tested-by: "Marcin 'Qrczak' Kowalczyk" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 449b81b98b3..1b33b05d346 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1442,7 +1442,6 @@ asmlinkage long sys_times(struct tms __user * tbuf) * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. * LBT 04.03.94 */ - asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) { struct task_struct *p; @@ -1470,7 +1469,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (!thread_group_leader(p)) goto out; - if (p->real_parent == group_leader) { + if (p->real_parent->tgid == group_leader->tgid) { err = -EPERM; if (task_session(p) != task_session(group_leader)) goto out; -- cgit v1.2.3-70-g09d2 From f3de4be9d5f8551d7880a1f1f5231a30e0161b1f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 30 Aug 2007 23:56:29 -0700 Subject: PM: Fix dependencies of CONFIG_SUSPEND and CONFIG_HIBERNATION Dependencies of CONFIG_SUSPEND and CONFIG_HIBERNATION introduced by commit 296699de6bdc717189a331ab6bbe90e05c94db06 "Introduce CONFIG_SUSPEND for suspend-to-Ram and standby" are incorrect, as they don't cover the facts that (1) not all architectures support suspend and (2) SMP hibernation is only possible on X86 and PPC64 (if CONFIG_PPC64_SWSUSP is set). Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/defconfig | 1 - include/linux/cpu.h | 6 +++--- kernel/cpu.c | 4 ++-- kernel/power/Kconfig | 41 +++++++++++++++++++++++++++++++---------- 4 files changed, 36 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index e64f65c9d90..b091c5e3555 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -201,7 +201,6 @@ CONFIG_PM=y # CONFIG_PM_DEBUG is not set CONFIG_HIBERNATION=y CONFIG_PM_STD_PARTITION="" -CONFIG_SUSPEND_SMP=y # # ACPI (Advanced Configuration and Power Interface) Support diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 1d5ded0836e..0ad72c4cf31 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -126,16 +126,16 @@ static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) static inline int cpu_is_offline(int cpu) { return 0; } #endif /* CONFIG_HOTPLUG_CPU */ -#ifdef CONFIG_SUSPEND_SMP +#ifdef CONFIG_PM_SLEEP_SMP extern int suspend_cpu_hotplug; extern int disable_nonboot_cpus(void); extern void enable_nonboot_cpus(void); -#else +#else /* !CONFIG_PM_SLEEP_SMP */ #define suspend_cpu_hotplug 0 static inline int disable_nonboot_cpus(void) { return 0; } static inline void enable_nonboot_cpus(void) {} -#endif +#endif /* !CONFIG_PM_SLEEP_SMP */ #endif /* _LINUX_CPU_H_ */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 181ae708602..38033db8d8e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -273,7 +273,7 @@ int __cpuinit cpu_up(unsigned int cpu) return err; } -#ifdef CONFIG_SUSPEND_SMP +#ifdef CONFIG_PM_SLEEP_SMP static cpumask_t frozen_cpus; int disable_nonboot_cpus(void) @@ -334,4 +334,4 @@ void enable_nonboot_cpus(void) out: mutex_unlock(&cpu_add_remove_lock); } -#endif +#endif /* CONFIG_PM_SLEEP_SMP */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 412859f8d94..c8580a1e687 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -72,15 +72,10 @@ config PM_TRACE CAUTION: this option will cause your machine's real-time clock to be set to an invalid time after a resume. -config SUSPEND_SMP_POSSIBLE - bool - depends on (X86 && !X86_VOYAGER) || (PPC64 && (PPC_PSERIES || PPC_PMAC)) - depends on SMP - default y - -config SUSPEND_SMP +config PM_SLEEP_SMP bool - depends on SUSPEND_SMP_POSSIBLE && PM_SLEEP + depends on SUSPEND_SMP_POSSIBLE || HIBERNATION_SMP_POSSIBLE + depends on PM_SLEEP select HOTPLUG_CPU default y @@ -89,20 +84,46 @@ config PM_SLEEP depends on SUSPEND || HIBERNATION default y +config SUSPEND_UP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) || PPC || ARM || BLACKFIN || MIPS \ + || SUPERH || FRV + depends on !SMP + default y + +config SUSPEND_SMP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) \ + || (PPC && (PPC_PSERIES || PPC_PMAC)) || ARM + depends on SMP + default y + config SUSPEND bool "Suspend to RAM and standby" depends on PM - depends on !SMP || SUSPEND_SMP_POSSIBLE + depends on SUSPEND_UP_POSSIBLE || SUSPEND_SMP_POSSIBLE default y ---help--- Allow the system to enter sleep states in which main memory is powered and thus its contents are preserved, such as the suspend-to-RAM state (i.e. the ACPI S3 state). 
+config HIBERNATION_UP_POSSIBLE + bool + depends on X86 || PPC64_SWSUSP || FRV || PPC32 + depends on !SMP + default y + +config HIBERNATION_SMP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) || PPC64_SWSUSP + depends on SMP + default y + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP - depends on ((X86 || PPC64_SWSUSP || FRV || PPC32) && !SMP) || SUSPEND_SMP_POSSIBLE + depends on HIBERNATION_UP_POSSIBLE || HIBERNATION_SMP_POSSIBLE ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the -- cgit v1.2.3-70-g09d2 From 59845b1ffd9121e5ef474ea5f27405fd7a83c85b Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Thu, 30 Aug 2007 23:56:34 -0700 Subject: request_irq: fix DEBUG_SHIRQ handling Mariusz Kozlowski reported lockdep's warning: > ================================= > [ INFO: inconsistent lock state ] > 2.6.23-rc2-mm1 #7 > --------------------------------- > inconsistent {in-hardirq-W} -> {hardirq-on-W} usage. > ifconfig/5492 [HC0[0]:SC0[0]:HE1:SE1] takes: > (&tp->lock){+...}, at: [] rtl8139_interrupt+0x27/0x46b [8139too] > {in-hardirq-W} state was registered at: > [] __lock_acquire+0x949/0x11ac > [] lock_acquire+0x99/0xb2 > [] _spin_lock+0x35/0x42 > [] rtl8139_interrupt+0x27/0x46b [8139too] > [] handle_IRQ_event+0x28/0x59 > [] handle_level_irq+0xad/0x10b > [] do_IRQ+0x93/0xd0 > [] common_interrupt+0x2e/0x34 ... > other info that might help us debug this: > 1 lock held by ifconfig/5492: > #0: (rtnl_mutex){--..}, at: [] mutex_lock+0x1c/0x1f > > stack backtrace: ... > [] _spin_lock+0x35/0x42 > [] rtl8139_interrupt+0x27/0x46b [8139too] > [] free_irq+0x11b/0x146 > [] rtl8139_close+0x8a/0x14a [8139too] > [] dev_close+0x57/0x74 ... This shows that a driver's irq handler was running both in hard interrupt and process contexts with irqs enabled. The latter was done during free_irq() call and was possible only with CONFIG_DEBUG_SHIRQ enabled. This was fixed by another patch. But similar problem is possible with request_irq(): any locks taken from irq handler could be vulnerable - especially with soft interrupts. This patch fixes it by disabling local interrupts during handler's run. (It seems, disabling softirqs should be enough, but it needs more checking on possible races or other special cases). 
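To illustrate the class of deadlock this guards against, a hedged sketch with made-up names (not the 8139too code): a handler that takes a plain spin_lock is fine when it only ever runs as a hard interrupt, but if DEBUG_SHIRQ calls it from process context with interrupts still enabled, the real IRQ can arrive on the same CPU while the lock is held and spin on it forever.

/* Illustrative sketch only; mydev_lock, mydev_interrupt and
 * debug_shirq_probe are hypothetical names. */
static DEFINE_SPINLOCK(mydev_lock);

static irqreturn_t mydev_interrupt(int irq, void *dev_id)
{
	spin_lock(&mydev_lock);		/* fine in hardirq: IRQs already off */
	/* ... poke hardware, update driver state ... */
	spin_unlock(&mydev_lock);
	return IRQ_HANDLED;
}

static void debug_shirq_probe(unsigned int irq, void *dev_id)
{
	unsigned long flags;

	/*
	 * Process context: without disabling interrupts, the real IRQ
	 * could preempt the probe call while mydev_lock is held and
	 * deadlock on it - the inconsistency lockdep reported above.
	 */
	local_irq_save(flags);
	mydev_interrupt(irq, dev_id);
	local_irq_restore(flags);
}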
Reported-by: Mariusz Kozlowski Signed-off-by: Jarek Poplawski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 853aefbd184..7230d914eaa 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -547,14 +547,11 @@ int request_irq(unsigned int irq, irq_handler_t handler, * We do this before actually registering it, to make sure that * a 'real' IRQ doesn't run in parallel with our fake */ - if (irqflags & IRQF_DISABLED) { - unsigned long flags; + unsigned long flags; - local_irq_save(flags); - handler(irq, dev_id); - local_irq_restore(flags); - } else - handler(irq, dev_id); + local_irq_save(flags); + handler(irq, dev_id); + local_irq_restore(flags); } #endif -- cgit v1.2.3-70-g09d2 From 99db67bc04af0f2e8cb710ac92aaeb9af135a7c6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 30 Aug 2007 23:56:34 -0700 Subject: userns: don't leak root user Signed-off-by: Alexey Dobriyan Acked-by: Cedric Le Goater Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user_namespace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index d055d987850..85af9422ea6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -81,6 +81,7 @@ void free_user_ns(struct kref *kref) struct user_namespace *ns; ns = container_of(kref, struct user_namespace, kref); + free_uid(ns->root_user); kfree(ns); } -- cgit v1.2.3-70-g09d2 From 60187d2708caa870f0825d753df1612ea688eb9e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 30 Aug 2007 23:56:35 -0700 Subject: sigqueue_free: fix the race with collect_signal() Spotted by taoyue and Jeremy Katz . collect_signal: sigqueue_free: list_del_init(&first->list); if (!list_empty(&q->list)) { // not taken } q->flags &= ~SIGQUEUE_PREALLOC; __sigqueue_free(first); __sigqueue_free(q); Now, __sigqueue_free() is called twice on the same "struct sigqueue" with the obviously bad implications. In particular, this double free breaks the array_cache->avail logic, so the same sigqueue could be "allocated" twice, and the bug can manifest itself via the "impossible" BUG_ON(!SIGQUEUE_PREALLOC) in sigqueue_free/send_sigqueue. Hopefully this can explain these mysterious bug-reports, see http://marc.info/?t=118766926500003 http://marc.info/?t=118466273000005 Alexey Dobriyan reports this patch makes the difference for the testcase, but nobody has an access to the application which opened the problems originally. Also, this patch removes tasklist lock/unlock, ->siglock is enough. Signed-off-by: Oleg Nesterov Cc: taoyue Cc: Jeremy Katz Cc: Sukadev Bhattiprolu Cc: Alexey Dobriyan Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Roland McGrath Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ad63109e413..3169bed0b4d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1300,20 +1300,19 @@ struct sigqueue *sigqueue_alloc(void) void sigqueue_free(struct sigqueue *q) { unsigned long flags; + spinlock_t *lock = ¤t->sighand->siglock; + BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); /* * If the signal is still pending remove it from the - * pending queue. + * pending queue. 
We must hold ->siglock while testing + * q->list to serialize with collect_signal(). */ - if (unlikely(!list_empty(&q->list))) { - spinlock_t *lock = ¤t->sighand->siglock; - read_lock(&tasklist_lock); - spin_lock_irqsave(lock, flags); - if (!list_empty(&q->list)) - list_del_init(&q->list); - spin_unlock_irqrestore(lock, flags); - read_unlock(&tasklist_lock); - } + spin_lock_irqsave(lock, flags); + if (!list_empty(&q->list)) + list_del_init(&q->list); + spin_unlock_irqrestore(lock, flags); + q->flags &= ~SIGQUEUE_PREALLOC; __sigqueue_free(q); } -- cgit v1.2.3-70-g09d2 From 7fd0d2dde929ead79901e389e70dbfb3c6c06986 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 5 Sep 2007 14:32:48 +0200 Subject: sched: fix MC/HT scheduler optimization, without breaking the FUZZ logic. First fix the check if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) with this if (*imbalance < busiest_load_per_task) As the current check is always false for nice 0 tasks (as SCHED_LOAD_SCALE_FUZZ is same as busiest_load_per_task for nice 0 tasks). With the above change, imbalance was getting reset to 0 in the corner case condition, making the FUZZ logic fail. Fix it by not corrupting the imbalance and change the imbalance, only when it finds that the HT/MC optimization is needed. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b533d6db78a..c8759ec6d8a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2512,7 +2512,7 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) { + if (*imbalance < busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; @@ -2564,10 +2564,8 @@ small_imbalance: pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ - if (pwr_move <= pwr_now) - goto out_balanced; - - *imbalance = busiest_load_per_task; + if (pwr_move > pwr_now) + *imbalance = busiest_load_per_task; } return busiest; -- cgit v1.2.3-70-g09d2 From a0dc72601d48b171b4870dfdd0824901a2b2b1a9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: fix niced_granularity() shift fix niced_granularity(). This resulted in under-scheduling for CPU-bound negative nice level tasks (and this in turn caused higher than necessary latencies in nice-0 tasks). Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ce39282d9c0..810b52d994e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -291,7 +291,7 @@ niced_granularity(struct sched_entity *curr, unsigned long granularity) /* * It will always fit into 'long': */ - return (long) (tmp >> WMULT_SHIFT); + return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); } static inline void -- cgit v1.2.3-70-g09d2 From a206c07213cf6372289f189c3774c4c3255a7ae1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: debug: fix cfs_rq->wait_runtime accounting the cfs_rq->wait_runtime debug/statistics counter was not maintained properly - fix this. 
this also removes some code: text data bss dec hex filename 13420 228 1204 14852 3a04 sched.o.before 13404 228 1204 14836 39f4 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 1 - kernel/sched_fair.c | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c8759ec6d8a..97986f1f0be 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -858,7 +858,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq) static void set_load_weight(struct task_struct *p) { - task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; p->se.wait_runtime = 0; if (task_has_rt_policy(p)) { diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 810b52d994e..bac2aff8273 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -194,6 +194,8 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; + + schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } static inline void @@ -205,6 +207,8 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; + + schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); } static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) @@ -574,7 +578,6 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) prev_runtime = se->wait_runtime; __add_wait_runtime(cfs_rq, se, delta_fair); - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); delta_fair = se->wait_runtime - prev_runtime; /* @@ -662,7 +665,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) if (tsk->state & TASK_UNINTERRUPTIBLE) se->block_start = rq_of(cfs_rq)->clock; } - cfs_rq->wait_runtime -= se->wait_runtime; #endif } __dequeue_entity(cfs_rq, se); @@ -1121,10 +1123,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * The statistical average of wait_runtime is about * -granularity/2, so initialize the task with that: */ - if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) { + if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) se->wait_runtime = -(sched_granularity(cfs_rq) / 2); - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); - } __enqueue_entity(cfs_rq, se); } -- cgit v1.2.3-70-g09d2 From 2491b2b89d4646e02ab51c90ab7012d124924ddc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: debug: fix sum_exec_runtime clearing when cleaning sched-stats also clear prev_sum_exec_runtime. Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index ab18f45f2ab..c3ee38bd342 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -283,4 +283,5 @@ void proc_sched_set_task(struct task_struct *p) p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; #endif p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; } -- cgit v1.2.3-70-g09d2 From cf2ab4696ee42f895eed88c2b6e432fe03dda0db Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: fix xtensa build warning rename RSR to SRR - 'RSR' is already defined on xtensa. found by Adrian Bunk. 
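The rename is purely mechanical; for reference, a small stand-alone demo of what the (unchanged) macro computes - shift right, rounding to nearest instead of truncating:

#include <stdio.h>

/* Same definition as the kernel macro in the patch below. */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))

int main(void)
{
	unsigned long v = 1000;

	printf("1000 >> 6    = %lu\n", v >> 6);	/* 15 (15.625 truncated) */
	printf("SRR(1000, 6) = %lu\n", SRR(v, 6));	/* 16 (rounded)          */
	return 0;
}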
Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 97986f1f0be..deeb1f8e0c3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -668,7 +668,7 @@ static u64 div64_likely32(u64 divident, unsigned long divisor) /* * Shift right and round: */ -#define RSR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, @@ -684,10 +684,10 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, * Check whether we'd overflow the 64-bit multiplication: */ if (unlikely(tmp > WMULT_CONST)) - tmp = RSR(RSR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, WMULT_SHIFT/2); else - tmp = RSR(tmp * lw->inv_weight, WMULT_SHIFT); + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } -- cgit v1.2.3-70-g09d2 From 7c92e54f6f9601cfa9d8894ee248abcf62ed9a1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: simplify __check_preempt_curr_fair() Preparatory patch for fix-ideal-runtime: simplify __check_preempt_curr_fair(): get rid of the integer return. text data bss dec hex filename 13404 228 1204 14836 39f4 sched.o.before 13393 228 1204 14825 39e9 sched.o.after functionality is unchanged. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bac2aff8273..f0dd4be1a3a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -673,7 +673,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) /* * Preempt the current task with a newly woken task if needed: */ -static int +static void __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *curr, unsigned long granularity) { @@ -686,9 +686,8 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, */ if (__delta > niced_granularity(curr, granularity)) { resched_task(rq_of(cfs_rq)->curr); - return 1; + curr->prev_sum_exec_runtime = curr->sum_exec_runtime; } - return 0; } static inline void @@ -764,8 +763,7 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (delta_exec > ideal_runtime) gran = 0; - if (__check_preempt_curr_fair(cfs_rq, next, curr, gran)) - curr->prev_sum_exec_runtime = curr->sum_exec_runtime; + __check_preempt_curr_fair(cfs_rq, next, curr, gran); } /************************************************** -- cgit v1.2.3-70-g09d2 From 4a55b45036a677fac43fe81ddf7fdcd007aaaee7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: improve prev_sum_exec_runtime setting Second preparatory patch for fix-ideal runtime: Mark prev_sum_exec_runtime at the beginning of our run, the same spot that adds our wait period to wait_runtime. 
This seems a more natural location to do this, and it also reduces the code a bit: text data bss dec hex filename 13397 228 1204 14829 39ed sched.o.before 13391 228 1204 14823 39e7 sched.o.after Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f0dd4be1a3a..2d01bbc2d04 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -684,10 +684,8 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, * preempt the current task unless the best task has * a larger than sched_granularity fairness advantage: */ - if (__delta > niced_granularity(curr, granularity)) { + if (__delta > niced_granularity(curr, granularity)) resched_task(rq_of(cfs_rq)->curr); - curr->prev_sum_exec_runtime = curr->sum_exec_runtime; - } } static inline void @@ -703,6 +701,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end(cfs_rq, se); update_stats_curr_start(cfs_rq, se); set_cfs_rq_curr(cfs_rq, se); + se->prev_sum_exec_runtime = se->sum_exec_runtime; } static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) -- cgit v1.2.3-70-g09d2 From 1169783085adb9ac969d21103a6885e8435f7ed3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: fix ideal_runtime calculations for reniced tasks fix ideal_runtime: - do not scale it using niced_granularity() it is against sum_exec_delta, so its wall-time, not fair-time. - move the whole check into __check_preempt_curr_fair() so that wakeup preemption can also benefit from the new logic. this also results in code size reduction: text data bss dec hex filename 13391 228 1204 14823 39e7 sched.o.before 13369 228 1204 14801 39d1 sched.o.after Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2d01bbc2d04..892616bf2c7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -678,11 +678,31 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *curr, unsigned long granularity) { s64 __delta = curr->fair_key - se->fair_key; + unsigned long ideal_runtime, delta_exec; + + /* + * ideal_runtime is compared against sum_exec_runtime, which is + * walltime, hence do not scale. + */ + ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, + (unsigned long)sysctl_sched_min_granularity); + + /* + * If we executed more than what the latency constraint suggests, + * reduce the rescheduling granularity. This way the total latency + * of how much a task is not scheduled converges to + * sysctl_sched_latency: + */ + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) + granularity = 0; /* * Take scheduling granularity into account - do not * preempt the current task unless the best task has * a larger than sched_granularity fairness advantage: + * + * scale granularity as key space is in fair_clock. 
*/ if (__delta > niced_granularity(curr, granularity)) resched_task(rq_of(cfs_rq)->curr); @@ -731,7 +751,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - unsigned long gran, ideal_runtime, delta_exec; struct sched_entity *next; /* @@ -748,21 +767,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - gran = sched_granularity(cfs_rq); - ideal_runtime = niced_granularity(curr, - max(sysctl_sched_latency / cfs_rq->nr_running, - (unsigned long)sysctl_sched_min_granularity)); - /* - * If we executed more than what the latency constraint suggests, - * reduce the rescheduling granularity. This way the total latency - * of how much a task is not scheduled converges to - * sysctl_sched_latency: - */ - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) - gran = 0; - - __check_preempt_curr_fair(cfs_rq, next, curr, gran); + __check_preempt_curr_fair(cfs_rq, next, curr, + sched_granularity(cfs_rq)); } /************************************************** -- cgit v1.2.3-70-g09d2 From 7d94143291e4e625e2bc3b1ebdc7143ee7a9a2f1 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 5 Sep 2007 03:05:56 -0700 Subject: Fix spurious syscall tracing after PTRACE_DETACH + PTRACE_ATTACH When PTRACE_SYSCALL was used and then PTRACE_DETACH is used, the TIF_SYSCALL_TRACE flag is left set on the formerly-traced task. This means that when a new tracer comes along and does PTRACE_ATTACH, it's possible he gets a syscall tracing stop even though he's never used PTRACE_SYSCALL. This happens if the task was in the middle of a system call when the second PTRACE_ATTACH was done. The symptom is an unexpected SIGTRAP when the tracer thinks that only SIGSTOP should have been provoked by his ptrace calls so far. A few machines already fixed this in ptrace_disable (i386, ia64, m68k). But all other machines do not, and still have this bug. On x86_64, this constitutes a regression in IA32 compatibility support. Since all machines now use TIF_SYSCALL_TRACE for this, I put the clearing of TIF_SYSCALL_TRACE in the generic ptrace_detach code rather than adding it to every other machine's ptrace_disable. Signed-off-by: Roland McGrath Signed-off-by: Linus Torvalds --- arch/i386/kernel/ptrace.c | 1 - arch/ia64/kernel/ptrace.c | 1 - arch/m68k/kernel/ptrace.c | 1 - kernel/ptrace.c | 1 + 4 files changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 0c8f00e69c4..7c1b92522e9 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -274,7 +274,6 @@ static void clear_singlestep(struct task_struct *child) void ptrace_disable(struct task_struct *child) { clear_singlestep(child); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); } diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 122444a9789..2e96f17b2f3 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1577,7 +1577,6 @@ sys_ptrace (long request, pid_t pid, unsigned long addr, unsigned long data) case PTRACE_DETACH: /* detach a process that was attached. 
*/ - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); ret = ptrace_detach(child, data); goto out_tsk; diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c index 2cf0690b788..e792d3cba4c 100644 --- a/arch/m68k/kernel/ptrace.c +++ b/arch/m68k/kernel/ptrace.c @@ -116,7 +116,6 @@ static inline void singlestep_disable(struct task_struct *child) void ptrace_disable(struct task_struct *child) { singlestep_disable(child); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); } long arch_ptrace(struct task_struct *child, long request, long addr, long data) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 82a558b655d..3eca7a55f2e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -233,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. */ ptrace_disable(child); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); write_lock_irq(&tasklist_lock); /* protect against de_thread()->release_task() */ -- cgit v1.2.3-70-g09d2 From 179c85ea53bef807621f335767e41e23f86f01df Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 11 Sep 2007 15:23:49 -0700 Subject: futex_compat: fix list traversal bugs The futex list traversal on the compat side appears to have a bug. It's loop termination condition compares: while (compat_ptr(uentry) != &head->list) But that can't be right because "uentry" has the special "pi" indicator bit still potentially set at bit 0. This is cleared by fetch_robust_entry() into the "entry" return value. What this seems to mean is that the list won't terminate when list iteration gets back to the the head. And we'll also process the list head like a normal entry, which could cause all kinds of problems. So we should check for equality with "entry". That pointer is of the non-compat type so we have to do a little casting to keep the compiler and sparse happy. The same problem can in theory occur with the 'pending' variable, although that has not been reported from users so far. Based on the original patch from David Miller. Acked-by: Ingo Molnar Cc: Thomas Gleixner Cc: David Miller Signed-off-by: Arnd Bergmann Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex_compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index f7921360efa..7e52eb051f2 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -61,10 +61,10 @@ void compat_exit_robust_list(struct task_struct *curr) if (fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; - if (upending) + if (pending) handle_futex_death((void __user *)pending + futex_offset, curr, pip); - while (compat_ptr(uentry) != &head->list) { + while (entry != (struct robust_list __user *) &head->list) { /* * A pending lock might already be on the list, so * dont process it twice: -- cgit v1.2.3-70-g09d2 From 3210f0ecdba6a81c3f8efe6f442d2e1f57db98f9 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 11 Sep 2007 15:23:51 -0700 Subject: Restore call_usermodehelper_pipe() behaviour The semantics of call_usermodehelper_pipe() used to be that it would fork the helper, and wait for the kernel thread to be started. This was implemented by setting sub_info.wait to 0 (implicitly), and doing a wait_for_completion(). As part of the cleanup done in 0ab4dc92278a0f3816e486d6350c6652a72e06c8, call_usermodehelper_pipe() was changed to pass 1 as the value for wait to call_usermodehelper_exec(). 
This is equivalent to setting sub_info.wait to 1, which is a change from the previous behaviour. Using 1 instead of 0 causes __call_usermodehelper() to start the kernel thread running wait_for_helper(), rather than directly calling ____call_usermodehelper(). The end result is that the calling kernel code blocks until the user mode helper finishes. As the helper is expecting input on stdin, and now no one is writing anything, everything locks up (observed in do_coredump). The fix is to change the 1 to UMH_WAIT_EXEC (aka 0), indicating that we want to wait for the kernel thread to be started, but not for the helper to finish. Signed-off-by: Michael Ellerman Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 9809cc1f33d..c6a4f8aebeb 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -505,7 +505,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, if (ret < 0) goto out; - return call_usermodehelper_exec(sub_info, 1); + return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); out: call_usermodehelper_freeinfo(sub_info); -- cgit v1.2.3-70-g09d2 From 298a5df45d497e66064fda22ef0abf13766d3333 Mon Sep 17 00:00:00 2001 From: Tony Breeds Date: Tue, 11 Sep 2007 15:24:03 -0700 Subject: Fix "no_sync_cmos_clock" logic inversion in kernel/time/ntp.c Seems to me that this timer will only get started on platforms that say they don't want it? Signed-off-by: Tony Breeds Cc: Paul Mackerras Cc: Gabriel Paubert Cc: Zachary Amsden Acked-by: Thomas Gleixner Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index cd91237dbfe..de6a2d6b3eb 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -226,7 +226,7 @@ static void sync_cmos_clock(unsigned long dummy) static void notify_cmos_timer(void) { - if (no_sync_cmos_clock) + if (!no_sync_cmos_clock) mod_timer(&sync_cmos_timer, jiffies + 1); } -- cgit v1.2.3-70-g09d2 From 3be9095063885d482b87d3875ea7f28e635882d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 16 Sep 2007 15:36:43 +0200 Subject: timekeeping: access rtc outside of xtime lock Lockdep complains about the access of rtc in timekeeping_suspend inside the interrupt disabled region of the write locked xtime lock. Move the access outside. 
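A minimal sketch of the pattern being applied (slow_read() and saved are hypothetical stand-ins for the persistent-clock read and the saved timestamp): do the slow, potentially lock-taking read first, and keep only cheap assignments inside the irq-disabled seqlock write section.

/* Sketch only, not the timekeeping code. */
static unsigned long saved;

static void suspend_path(void)
{
	unsigned long flags, v;

	v = slow_read();	/* may take other locks; done with irqs on */

	write_seqlock_irqsave(&xtime_lock, flags);
	saved = v;		/* only trivial work while irqs are off */
	write_sequnlock_irqrestore(&xtime_lock, flags);
}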
Signed-off-by: Thomas Gleixner Cc: John Stultz --- kernel/time/timekeeping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index acc417b5a9b..f682091fa89 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -325,9 +325,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; + timekeeping_suspend_time = read_persistent_clock(); + write_seqlock_irqsave(&xtime_lock, flags); timekeeping_suspended = 1; - timekeeping_suspend_time = read_persistent_clock(); write_sequnlock_irqrestore(&xtime_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); -- cgit v1.2.3-70-g09d2 From 6a669ee8a790487b7ec1edda762d39615a78264b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 16 Sep 2007 15:36:43 +0200 Subject: timekeeping: Prevent time going backwards on resume Timekeeping resume adjusts xtime by adding the slept time in seconds and resets the reference value of the clock source (clock->cycle_last). clock->cycle last is used to calculate the delta between the last xtime update and the readout of the clock source in __get_nsec_offset(). xtime plus the offset is the current time. The resume code ignores the delta which had already elapsed between the last xtime update and the actual time of suspend. If the suspend time is short, then we can see time going backwards on resume. Suspend: offs_s = clock->read() - clock->cycle_last; now = xtime + offs_s; timekeeping_suspend_time = read_rtc(); Resume: sleep_time = read_rtc() - timekeeping_suspend_time; xtime.tv_sec += sleep_time; clock->cycle_last = clock->read(); offs_r = clock->read() - clock->cycle_last; now = xtime + offs_r; if sleep_time_seconds == 0 and offs_r < offs_s, then time goes backwards. Fix this by storing the offset from the last xtime update and add it to xtime during resume, when we reset clock->cycle_last: sleep_time = read_rtc() - timekeeping_suspend_time; xtime.tv_sec += sleep_time; xtime += offs_s; /* Fixup xtime offset at suspend time */ clock->cycle_last = clock->read(); offs_r = clock->read() - clock->cycle_last; now = xtime + offs_r; Thanks to Marcelo for tracking this down on the OLPC and providing the necessary details to analyze the root cause. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Tosatti --- kernel/time/timekeeping.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f682091fa89..4ad79f6bdec 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -217,6 +217,7 @@ static void change_clocksource(void) } #else static inline void change_clocksource(void) { } +static inline s64 __get_nsec_offset(void) { return 0; } #endif /** @@ -280,6 +281,8 @@ void __init timekeeping_init(void) static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; +/* xtime offset when we went into suspend */ +static s64 timekeeping_suspend_nsecs; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. 
@@ -305,6 +308,8 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic.tv_sec -= sleep_length; total_sleep_time += sleep_length; } + /* Make sure that we have the correct xtime reference */ + timespec_add_ns(&xtime, timekeeping_suspend_nsecs); /* re-base the last cycle value */ clock->cycle_last = clocksource_read(clock); clock->error = 0; @@ -328,6 +333,8 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) timekeeping_suspend_time = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); + /* Get the current xtime offset */ + timekeeping_suspend_nsecs = __get_nsec_offset(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); -- cgit v1.2.3-70-g09d2 From 07eec6af448d13a6a520d9c6f06f2e87f61b567a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 16 Sep 2007 15:36:43 +0200 Subject: clockevents: Enforce oneshot broadcast when broadcast mask is set on resume The jinxed VAIO refuses to resume without hitting keys on the keyboard when this is not enforced. It is unclear why the cpu ends up in a lower C State without notifying the clock events layer, but enforcing the oneshot broadcast here is safe. Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index db8e0f3d409..947959fb2bb 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -382,12 +382,23 @@ static int tick_broadcast_set_event(ktime_t expires, int force) int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { + int cpu = smp_processor_id(); + + /* + * If the CPU is marked for broadcast, enforce oneshot + * broadcast mode. The jinxed VAIO does not resume otherwise. + * No idea why it ends up in a lower C State during resume + * without notifying the clock events layer. + */ + if (cpu_isset(cpu, tick_broadcast_mask)) + cpu_set(cpu, tick_broadcast_oneshot_mask); + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); if(!cpus_empty(tick_broadcast_oneshot_mask)) tick_broadcast_set_event(ktime_get(), 1); - return cpu_isset(smp_processor_id(), tick_broadcast_oneshot_mask); + return cpu_isset(cpu, tick_broadcast_oneshot_mask); } /* -- cgit v1.2.3-70-g09d2 From 31d9b3938c0459e5e9755ce0a98ac1e24eeff972 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 16 Sep 2007 15:36:43 +0200 Subject: clockevents: do not shutdown the oneshot broadcast device When a cpu goes offline it is removed from the broadcast masks. If the mask becomes empty the code shuts down the broadcast device. This is wrong, because the broadcast device needs to be ready for the online cpu going idle (into a c-state, which stops the local apic timer). Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 947959fb2bb..aab881c86a1 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -560,20 +560,17 @@ void tick_broadcast_switch_to_oneshot(void) */ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { - struct clock_event_device *bc; unsigned long flags; unsigned int cpu = *cpup; spin_lock_irqsave(&tick_broadcast_lock, flags); - bc = tick_broadcast_device.evtdev; + /* + * Clear the broadcast mask flag for the dead cpu, but do not + * stop the broadcast device! 
+ */ cpu_clear(cpu, tick_broadcast_oneshot_mask); - if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) { - if (bc && cpus_empty(tick_broadcast_oneshot_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); - } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -- cgit v1.2.3-70-g09d2 From 5e41d0d60a534d2a5dc9772600a58f44c8d12506 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 16 Sep 2007 15:36:43 +0200 Subject: clockevents: prevent stale tick update on offline cpu Taking a cpu offline removes the cpu from the online mask before the CPU_DEAD notification is done. The clock events layer does the cleanup of the dead CPU from the CPU_DEAD notifier chain. tick_do_timer_cpu is used to avoid xtime lock contention by assigning the task of jiffies xtime updates to one CPU. If a CPU is taken offline, then this assignment becomes stale. This went unnoticed because most of the time the offline CPU went dead before the online CPU reached __cpu_die(), where the CPU_DEAD state is checked. In the case that the offline CPU did not reach the DEAD state before we reach __cpu_die(), the code in there goes to sleep for 100ms. Due to the stale time update assignment, the system is stuck forever. Take the assignment away when a cpu is not longer in the cpu_online_mask. We do this in the last call to tick_nohz_stop_sched_tick() when the offline CPU is on the way to the final play_dead() idle entry. Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b416995b975..8c3fef1db09 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -160,6 +160,18 @@ void tick_nohz_stop_sched_tick(void) cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); + /* + * If this cpu is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the cpu which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. + */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = -1; + } + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; -- cgit v1.2.3-70-g09d2 From efc63c4fb0f95865907472d1c6bc0cfea9ee156b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 18 Sep 2007 22:46:27 -0700 Subject: Fix UTS corruption during clone(CLONE_NEWUTS) struct utsname is copied from master one without any exclusion. Here is sample output from one proggie doing sethostname("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); sethostname("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"); and another clone(,, CLONE_NEWUTS, ...) uname() hostname = 'aaaaaaaaaaaaaaaaaaaaaaaaabbbbb' hostname = 'bbbaaaaaaaaaaaaaaaaaaaaaaaaaaa' hostname = 'aaaaaaaabbbbbbbbbbbbbbbbbbbbbb' hostname = 'aaaaaaaaaaaaaaaaaaaaaaaaaabbbb' hostname = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaabb' hostname = 'aaabbbbbbbbbbbbbbbbbbbbbbbbbbb' hostname = 'bbbbbbbbbbbbbbbbaaaaaaaaaaaaaa' Hostname is sometimes corrupted. Yes, even _the_ simplest namespace activity had bug in it. 
:-( Signed-off-by: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/utsname.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/utsname.c b/kernel/utsname.c index 9d8180a0f0d..816d7b24fa0 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -28,7 +28,9 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) if (!ns) return ERR_PTR(-ENOMEM); + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + up_read(&uts_sem); kref_init(&ns->kref); return ns; } -- cgit v1.2.3-70-g09d2 From d8a4821dca693867a7953104c1e3cc830eb9191f Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 18 Sep 2007 22:46:43 -0700 Subject: kernel/user.c: Use list_for_each_entry instead of list_for_each kernel/user.c: Convert list_for_each to list_for_each_entry in uid_hash_find() Signed-off-by: Matthias Kaehlcke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index e7d11cef699..e080ba863ae 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -67,13 +67,9 @@ static inline void uid_hash_remove(struct user_struct *up) static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) { - struct list_head *up; - - list_for_each(up, hashent) { - struct user_struct *user; - - user = list_entry(up, struct user_struct, uidhash_list); + struct user_struct *user; + list_for_each_entry(user, hashent, uidhash_list) { if(user->uid == uid) { atomic_inc(&user->__count); return user; -- cgit v1.2.3-70-g09d2 From 735de2230f09741077a645a913de0a04b10208bf Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 18 Sep 2007 22:46:44 -0700 Subject: Convert uid hash to hlist Surprisingly, but (spotted by Alexey Dobriyan) the uid hash still uses list_heads, thus occupying twice as much place as it could. Convert it to hlist_heads. Signed-off-by: Pavel Emelyanov Signed-off-by: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- include/linux/user_namespace.h | 2 +- kernel/user.c | 15 ++++++++------- kernel/user_namespace.c | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index f4e324ed2e4..6239bc2c2ba 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -593,7 +593,7 @@ struct user_struct { #endif /* Hash table maintenance information */ - struct list_head uidhash_list; + struct hlist_node uidhash_node; uid_t uid; }; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 1101b0ce878..b5f41d4c2ee 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -11,7 +11,7 @@ struct user_namespace { struct kref kref; - struct list_head uidhash_table[UIDHASH_SZ]; + struct hlist_head uidhash_table[UIDHASH_SZ]; struct user_struct *root_user; }; diff --git a/kernel/user.c b/kernel/user.c index e080ba863ae..add57c7e4c0 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -55,21 +55,22 @@ struct user_struct root_user = { /* * These routines must be called with the uidhash spinlock held! 
*/ -static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) +static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) { - list_add(&up->uidhash_list, hashent); + hlist_add_head(&up->uidhash_node, hashent); } static inline void uid_hash_remove(struct user_struct *up) { - list_del(&up->uidhash_list); + hlist_del(&up->uidhash_node); } -static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) +static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) { struct user_struct *user; + struct hlist_node *h; - list_for_each_entry(user, hashent, uidhash_list) { + hlist_for_each_entry(user, h, hashent, uidhash_node) { if(user->uid == uid) { atomic_inc(&user->__count); return user; @@ -118,7 +119,7 @@ void free_uid(struct user_struct *up) struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) { - struct list_head *hashent = uidhashentry(ns, uid); + struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up; spin_lock_irq(&uidhash_lock); @@ -207,7 +208,7 @@ static int __init uid_cache_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) - INIT_LIST_HEAD(init_user_ns.uidhash_table + n); + INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 85af9422ea6..e7ba1bf8457 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -39,7 +39,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) kref_init(&ns->kref); for (n = 0; n < UIDHASH_SZ; ++n) - INIT_LIST_HEAD(ns->uidhash_table + n); + INIT_HLIST_HEAD(ns->uidhash_table + n); /* Insert new root user. */ ns->root_user = alloc_uid(ns, 0); -- cgit v1.2.3-70-g09d2 From 28f300d23674fa01ae747c66ce861d4ee6aebe8c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 18 Sep 2007 22:46:45 -0700 Subject: Fix user namespace exiting OOPs It turned out, that the user namespace is released during the do_exit() in exit_task_namespaces(), but the struct user_struct is released only during the put_task_struct(), i.e. MUCH later. On debug kernels with poisoned slabs this will cause the oops in uid_hash_remove() because the head of the chain, which resides inside the struct user_namespace, will be already freed and poisoned. Since the uid hash itself is required only when someone can search it, i.e. when the namespace is alive, we can safely unhash all the user_struct-s from it during the namespace exiting. The subsequent free_uid() will complete the user_struct destruction. For example simple program #include char stack[2 * 1024 * 1024]; int f(void *foo) { return 0; } int main(void) { clone(f, stack + 1 * 1024 * 1024, 0x10000000, 0); return 0; } run on kernel with CONFIG_USER_NS turned on will oops the kernel immediately. This was spotted during OpenVZ kernel testing. Signed-off-by: Pavel Emelyanov Signed-off-by: Alexey Dobriyan Acked-by: "Serge E. 
Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/user.c | 26 +++++++++++++++++++++++++- kernel/user_namespace.c | 2 +- 3 files changed, 27 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6239bc2c2ba..5445eaec690 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1472,6 +1472,7 @@ static inline struct user_struct *get_uid(struct user_struct *u) } extern void free_uid(struct user_struct *); extern void switch_uid(struct user_struct *); +extern void release_uids(struct user_namespace *ns); #include diff --git a/kernel/user.c b/kernel/user.c index add57c7e4c0..9ca2848fc35 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -62,7 +62,7 @@ static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *ha static inline void uid_hash_remove(struct user_struct *up) { - hlist_del(&up->uidhash_node); + hlist_del_init(&up->uidhash_node); } static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) @@ -199,6 +199,30 @@ void switch_uid(struct user_struct *new_user) suid_keys(current); } +void release_uids(struct user_namespace *ns) +{ + int i; + unsigned long flags; + struct hlist_head *head; + struct hlist_node *nd; + + spin_lock_irqsave(&uidhash_lock, flags); + /* + * collapse the chains so that the user_struct-s will + * be still alive, but not in hashes. subsequent free_uid() + * will free them. + */ + for (i = 0; i < UIDHASH_SZ; i++) { + head = ns->uidhash_table + i; + while (!hlist_empty(head)) { + nd = head->first; + hlist_del_init(nd); + } + } + spin_unlock_irqrestore(&uidhash_lock, flags); + + free_uid(ns->root_user); +} static int __init uid_cache_init(void) { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e7ba1bf8457..7af90fc4f0f 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -81,7 +81,7 @@ void free_user_ns(struct kref *kref) struct user_namespace *ns; ns = container_of(kref, struct user_namespace, kref); - free_uid(ns->root_user); + release_uids(ns); kfree(ns); } -- cgit v1.2.3-70-g09d2 From 1799e35d5baab6e06168b46cc78b968e728ea3d1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 19 Sep 2007 23:34:46 +0200 Subject: sched: add /proc/sys/kernel/sched_compat_yield add /proc/sys/kernel/sched_compat_yield to make sys_sched_yield() more agressive, by moving the yielding task to the last position in the rbtree. 
with sched_compat_yield=0: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 2539 mingo 20 0 1576 252 204 R 50 0.0 0:02.03 loop_yield 2541 mingo 20 0 1576 244 196 R 50 0.0 0:02.05 loop with sched_compat_yield=1: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 2584 mingo 20 0 1576 248 196 R 99 0.0 0:52.45 loop 2582 mingo 20 0 1576 256 204 R 0 0.0 0:00.00 loop_yield Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- include/linux/sched.h | 1 + kernel/sched.c | 5 +--- kernel/sched_fair.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++----- kernel/sysctl.c | 8 +++++++ 4 files changed, 67 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5445eaec690..3de79016f2a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1406,6 +1406,7 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_stat_granularity; extern unsigned int sysctl_sched_runtime_limit; +extern unsigned int sysctl_sched_compat_yield; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; diff --git a/kernel/sched.c b/kernel/sched.c index deeb1f8e0c3..63e0971c8fb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4550,10 +4550,7 @@ asmlinkage long sys_sched_yield(void) struct rq *rq = this_rq_lock(); schedstat_inc(rq, yld_cnt); - if (unlikely(rq->nr_running == 1)) - schedstat_inc(rq, yld_act_empty); - else - current->sched_class->yield_task(rq, current); + current->sched_class->yield_task(rq, current); /* * Since we are going to call schedule() anyway, there's diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 892616bf2c7..c9fbe8e73a4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -42,6 +42,14 @@ unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; */ unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; +/* + * sys_sched_yield() compat mode + * + * This option switches the agressive yield implementation of the + * old scheduler back on. + */ +unsigned int __read_mostly sysctl_sched_compat_yield; + /* * SCHED_BATCH wake-up granularity. * (default: 25 msec, units: nanoseconds) @@ -897,19 +905,62 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) } /* - * sched_yield() support is very simple - we dequeue and enqueue + * sched_yield() support is very simple - we dequeue and enqueue. + * + * If compat_yield is turned on then we requeue to the end of the tree. */ static void yield_task_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct sched_entity *rightmost, *se = &p->se; + struct rb_node *parent; - __update_rq_clock(rq); /* - * Dequeue and enqueue the task to update its - * position within the tree: + * Are we the only task in the tree? 
+ */ + if (unlikely(cfs_rq->nr_running == 1)) + return; + + if (likely(!sysctl_sched_compat_yield)) { + __update_rq_clock(rq); + /* + * Dequeue and enqueue the task to update its + * position within the tree: + */ + dequeue_entity(cfs_rq, &p->se, 0); + enqueue_entity(cfs_rq, &p->se, 0); + + return; + } + /* + * Find the rightmost entry in the rbtree: */ - dequeue_entity(cfs_rq, &p->se, 0); - enqueue_entity(cfs_rq, &p->se, 0); + do { + parent = *link; + link = &parent->rb_right; + } while (*link); + + rightmost = rb_entry(parent, struct sched_entity, run_node); + /* + * Already in the rightmost position? + */ + if (unlikely(rightmost == se)) + return; + + /* + * Minimally necessary key value to be last in the tree: + */ + se->fair_key = rightmost->fair_key + 1; + + if (cfs_rq->rb_leftmost == &se->run_node) + cfs_rq->rb_leftmost = rb_next(&se->run_node); + /* + * Relink the task to the rightmost position: + */ + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + rb_link_node(&se->run_node, parent, link); + rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6ace893c17c..53a456ebf6d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -303,6 +303,14 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_compat_yield", + .data = &sysctl_sched_compat_yield, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_PROVE_LOCKING { .ctl_name = CTL_UNNUMBERED, -- cgit v1.2.3-70-g09d2 From 9c95e7319ba98585ebb6d304eca2d56f401ed70c Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 19 Sep 2007 23:34:46 +0200 Subject: sched: fix invalid sched_class use When using rt_mutex, a NULL pointer dereference is occurred at enqueue_task_rt. Here is a scenario; 1) there are two threads, the thread A is fair_sched_class and thread B is rt_sched_class. 2) Thread A is boosted up to rt_sched_class, because the thread A has a rt_mutex lock and the thread B is waiting the lock. 3) At this time, when thread A create a new thread C, the thread C has a rt_sched_class. 4) When doing wake_up_new_task() for the thread C, the priority of the thread C is out of the RT priority range, because the normal priority of thread A is not the RT priority. It makes data corruption by overflowing the rt_prio_array. The new thread C should be fair_sched_class. The new thread should be valid scheduler class before queuing. This patch fixes to set the suitable scheduler class. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 63e0971c8fb..6107a0cd632 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1682,6 +1682,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->prio = effective_prio(p); + if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || !current->se.on_rq) { -- cgit v1.2.3-70-g09d2 From b8fceee17a310f189188599a8fa5e9beaff57eb0 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Thu, 20 Sep 2007 12:40:16 -0700 Subject: signalfd simplification This simplifies signalfd code, by avoiding it to remain attached to the sighand during its lifetime. 
In this way, the signalfd remain attached to the sighand only during poll(2) (and select and epoll) and read(2). This also allows to remove all the custom "tsk == current" checks in kernel/signal.c, since dequeue_signal() will only be called by "current". I think this is also what Ben was suggesting time ago. The external effect of this, is that a thread can extract only its own private signals and the group ones. I think this is an acceptable behaviour, in that those are the signals the thread would be able to fetch w/out signalfd. Signed-off-by: Davide Libenzi Signed-off-by: Linus Torvalds --- fs/exec.c | 3 - fs/signalfd.c | 190 +++++++--------------------------------------- include/linux/init_task.h | 2 +- include/linux/sched.h | 2 +- include/linux/signalfd.h | 40 +--------- kernel/exit.c | 9 --- kernel/fork.c | 2 +- kernel/signal.c | 8 +- 8 files changed, 39 insertions(+), 217 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index c21a8cc0627..073b0b8c6d0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include @@ -784,7 +783,6 @@ static int de_thread(struct task_struct *tsk) * and we can just re-use it all. */ if (atomic_read(&oldsighand->count) <= 1) { - signalfd_detach(tsk); exit_itimers(sig); return 0; } @@ -923,7 +921,6 @@ static int de_thread(struct task_struct *tsk) sig->flags = 0; no_thread_group: - signalfd_detach(tsk); exit_itimers(sig); if (leader) release_task(leader); diff --git a/fs/signalfd.c b/fs/signalfd.c index a8e293d3003..aefb0be0794 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -11,8 +11,10 @@ * Now using anonymous inode source. * Thanks to Oleg Nesterov for useful code review and suggestions. * More comments and suggestions from Arnd Bergmann. - * Sat May 19, 2007: Davi E. M. Arnaut + * Sat May 19, 2007: Davi E. M. Arnaut * Retrieve multiple signals with one read() call + * Sun Jul 15, 2007: Davide Libenzi + * Attach to the sighand only during read() and poll(). */ #include @@ -27,102 +29,12 @@ #include struct signalfd_ctx { - struct list_head lnk; - wait_queue_head_t wqh; sigset_t sigmask; - struct task_struct *tsk; }; -struct signalfd_lockctx { - struct task_struct *tsk; - unsigned long flags; -}; - -/* - * Tries to acquire the sighand lock. We do not increment the sighand - * use count, and we do not even pin the task struct, so we need to - * do it inside an RCU read lock, and we must be prepared for the - * ctx->tsk going to NULL (in signalfd_deliver()), and for the sighand - * being detached. We return 0 if the sighand has been detached, or - * 1 if we were able to pin the sighand lock. - */ -static int signalfd_lock(struct signalfd_ctx *ctx, struct signalfd_lockctx *lk) -{ - struct sighand_struct *sighand = NULL; - - rcu_read_lock(); - lk->tsk = rcu_dereference(ctx->tsk); - if (likely(lk->tsk != NULL)) - sighand = lock_task_sighand(lk->tsk, &lk->flags); - rcu_read_unlock(); - - if (!sighand) - return 0; - - if (!ctx->tsk) { - unlock_task_sighand(lk->tsk, &lk->flags); - return 0; - } - - if (lk->tsk->tgid == current->tgid) - lk->tsk = current; - - return 1; -} - -static void signalfd_unlock(struct signalfd_lockctx *lk) -{ - unlock_task_sighand(lk->tsk, &lk->flags); -} - -/* - * This must be called with the sighand lock held. 
- */ -void signalfd_deliver(struct task_struct *tsk, int sig) -{ - struct sighand_struct *sighand = tsk->sighand; - struct signalfd_ctx *ctx, *tmp; - - BUG_ON(!sig); - list_for_each_entry_safe(ctx, tmp, &sighand->signalfd_list, lnk) { - /* - * We use a negative signal value as a way to broadcast that the - * sighand has been orphaned, so that we can notify all the - * listeners about this. Remember the ctx->sigmask is inverted, - * so if the user is interested in a signal, that corresponding - * bit will be zero. - */ - if (sig < 0) { - if (ctx->tsk == tsk) { - ctx->tsk = NULL; - list_del_init(&ctx->lnk); - wake_up(&ctx->wqh); - } - } else { - if (!sigismember(&ctx->sigmask, sig)) - wake_up(&ctx->wqh); - } - } -} - -static void signalfd_cleanup(struct signalfd_ctx *ctx) -{ - struct signalfd_lockctx lk; - - /* - * This is tricky. If the sighand is gone, we do not need to remove - * context from the list, the list itself won't be there anymore. - */ - if (signalfd_lock(ctx, &lk)) { - list_del(&ctx->lnk); - signalfd_unlock(&lk); - } - kfree(ctx); -} - static int signalfd_release(struct inode *inode, struct file *file) { - signalfd_cleanup(file->private_data); + kfree(file->private_data); return 0; } @@ -130,23 +42,15 @@ static unsigned int signalfd_poll(struct file *file, poll_table *wait) { struct signalfd_ctx *ctx = file->private_data; unsigned int events = 0; - struct signalfd_lockctx lk; - poll_wait(file, &ctx->wqh, wait); + poll_wait(file, ¤t->sighand->signalfd_wqh, wait); - /* - * Let the caller get a POLLIN in this case, ala socket recv() when - * the peer disconnects. - */ - if (signalfd_lock(ctx, &lk)) { - if ((lk.tsk == current && - next_signal(&lk.tsk->pending, &ctx->sigmask) > 0) || - next_signal(&lk.tsk->signal->shared_pending, - &ctx->sigmask) > 0) - events |= POLLIN; - signalfd_unlock(&lk); - } else + spin_lock_irq(¤t->sighand->siglock); + if (next_signal(¤t->pending, &ctx->sigmask) || + next_signal(¤t->signal->shared_pending, + &ctx->sigmask)) events |= POLLIN; + spin_unlock_irq(¤t->sighand->siglock); return events; } @@ -219,59 +123,46 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info, int nonblock) { ssize_t ret; - struct signalfd_lockctx lk; DECLARE_WAITQUEUE(wait, current); - if (!signalfd_lock(ctx, &lk)) - return 0; - - ret = dequeue_signal(lk.tsk, &ctx->sigmask, info); + spin_lock_irq(¤t->sighand->siglock); + ret = dequeue_signal(current, &ctx->sigmask, info); switch (ret) { case 0: if (!nonblock) break; ret = -EAGAIN; default: - signalfd_unlock(&lk); + spin_unlock_irq(¤t->sighand->siglock); return ret; } - add_wait_queue(&ctx->wqh, &wait); + add_wait_queue(¤t->sighand->signalfd_wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); - ret = dequeue_signal(lk.tsk, &ctx->sigmask, info); - signalfd_unlock(&lk); + ret = dequeue_signal(current, &ctx->sigmask, info); if (ret != 0) break; if (signal_pending(current)) { ret = -ERESTARTSYS; break; } + spin_unlock_irq(¤t->sighand->siglock); schedule(); - ret = signalfd_lock(ctx, &lk); - if (unlikely(!ret)) { - /* - * Let the caller read zero byte, ala socket - * recv() when the peer disconnect. This test - * must be done before doing a dequeue_signal(), - * because if the sighand has been orphaned, - * the dequeue_signal() call is going to crash - * because ->sighand will be long gone. 
- */ - break; - } + spin_lock_irq(¤t->sighand->siglock); } + spin_unlock_irq(¤t->sighand->siglock); - remove_wait_queue(&ctx->wqh, &wait); + remove_wait_queue(¤t->sighand->signalfd_wqh, &wait); __set_current_state(TASK_RUNNING); return ret; } /* - * Returns either the size of a "struct signalfd_siginfo", or zero if the - * sighand we are attached to, has been orphaned. The "count" parameter - * must be at least the size of a "struct signalfd_siginfo". + * Returns a multiple of the size of a "struct signalfd_siginfo", or a negative + * error code. The "count" parameter must be at least the size of a + * "struct signalfd_siginfo". */ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -287,7 +178,6 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, return -EINVAL; siginfo = (struct signalfd_siginfo __user *) buf; - do { ret = signalfd_dequeue(ctx, &info, nonblock); if (unlikely(ret <= 0)) @@ -300,7 +190,7 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, nonblock = 1; } while (--count); - return total ? total : ret; + return total ? total: ret; } static const struct file_operations signalfd_fops = { @@ -309,20 +199,13 @@ static const struct file_operations signalfd_fops = { .read = signalfd_read, }; -/* - * Create a file descriptor that is associated with our signal - * state. We can pass it around to others if we want to, but - * it will always be _our_ signal state. - */ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask) { int error; sigset_t sigmask; struct signalfd_ctx *ctx; - struct sighand_struct *sighand; struct file *file; struct inode *inode; - struct signalfd_lockctx lk; if (sizemask != sizeof(sigset_t) || copy_from_user(&sigmask, user_mask, sizeof(sigmask))) @@ -335,17 +218,7 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas if (!ctx) return -ENOMEM; - init_waitqueue_head(&ctx->wqh); ctx->sigmask = sigmask; - ctx->tsk = current->group_leader; - - sighand = current->sighand; - /* - * Add this fd to the list of signal listeners. - */ - spin_lock_irq(&sighand->siglock); - list_add_tail(&ctx->lnk, &sighand->signalfd_list); - spin_unlock_irq(&sighand->siglock); /* * When we call this, the initialization must be complete, since @@ -364,23 +237,18 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas fput(file); return -EINVAL; } - /* - * We need to be prepared of the fact that the sighand this fd - * is attached to, has been detched. In that case signalfd_lock() - * will return 0, and we'll just skip setting the new mask. 
- */ - if (signalfd_lock(ctx, &lk)) { - ctx->sigmask = sigmask; - signalfd_unlock(&lk); - } - wake_up(&ctx->wqh); + spin_lock_irq(¤t->sighand->siglock); + ctx->sigmask = sigmask; + spin_unlock_irq(¤t->sighand->siglock); + + wake_up(¤t->sighand->signalfd_wqh); fput(file); } return ufd; err_fdalloc: - signalfd_cleanup(ctx); + kfree(ctx); return error; } diff --git a/include/linux/init_task.h b/include/linux/init_task.h index cab741c2d60..f8abfa349ef 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -86,7 +86,7 @@ extern struct nsproxy init_nsproxy; .count = ATOMIC_INIT(1), \ .action = { { { .sa_handler = NULL, } }, }, \ .siglock = __SPIN_LOCK_UNLOCKED(sighand.siglock), \ - .signalfd_list = LIST_HEAD_INIT(sighand.signalfd_list), \ + .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(sighand.signalfd_wqh), \ } extern struct group_info init_groups; diff --git a/include/linux/sched.h b/include/linux/sched.h index 3de79016f2a..a01ac6dd5f5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -438,7 +438,7 @@ struct sighand_struct { atomic_t count; struct k_sigaction action[_NSIG]; spinlock_t siglock; - struct list_head signalfd_list; + wait_queue_head_t signalfd_wqh; }; struct pacct_struct { diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h index 51042949569..4c9ff0910ae 100644 --- a/include/linux/signalfd.h +++ b/include/linux/signalfd.h @@ -45,49 +45,17 @@ struct signalfd_siginfo { #ifdef CONFIG_SIGNALFD /* - * Deliver the signal to listening signalfd. This must be called - * with the sighand lock held. Same are the following that end up - * calling signalfd_deliver(). - */ -void signalfd_deliver(struct task_struct *tsk, int sig); - -/* - * No need to fall inside signalfd_deliver() if no signal listeners - * are available. + * Deliver the signal to listening signalfd. */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { - if (unlikely(!list_empty(&tsk->sighand->signalfd_list))) - signalfd_deliver(tsk, sig); -} - -/* - * The signal -1 is used to notify the signalfd that the sighand - * is on its way to be detached. - */ -static inline void signalfd_detach_locked(struct task_struct *tsk) -{ - if (unlikely(!list_empty(&tsk->sighand->signalfd_list))) - signalfd_deliver(tsk, -1); -} - -static inline void signalfd_detach(struct task_struct *tsk) -{ - struct sighand_struct *sighand = tsk->sighand; - - if (unlikely(!list_empty(&sighand->signalfd_list))) { - spin_lock_irq(&sighand->siglock); - signalfd_deliver(tsk, -1); - spin_unlock_irq(&sighand->siglock); - } + if (unlikely(waitqueue_active(&tsk->sighand->signalfd_wqh))) + wake_up(&tsk->sighand->signalfd_wqh); } #else /* CONFIG_SIGNALFD */ -#define signalfd_deliver(t, s) do { } while (0) -#define signalfd_notify(t, s) do { } while (0) -#define signalfd_detach_locked(t) do { } while (0) -#define signalfd_detach(t) do { } while (0) +static inline void signalfd_notify(struct task_struct *tsk, int sig) { } #endif /* CONFIG_SIGNALFD */ diff --git a/kernel/exit.c b/kernel/exit.c index 06b24b3aa37..993369ee94d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -86,14 +85,6 @@ static void __exit_signal(struct task_struct *tsk) sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); - /* - * Notify that this sighand has been detached. This must - * be called with the tsk->sighand lock held. Also, this - * access tsk->sighand internally, so it must be called - * before tsk->sighand is reset. 
- */ - signalfd_detach_locked(tsk); - posix_cpu_timers_exit(tsk); if (atomic_dec_and_test(&sig->count)) posix_cpu_timers_exit_group(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 7332e236d36..33f12f48684 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1438,7 +1438,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep, struct sighand_struct *sighand = data; spin_lock_init(&sighand->siglock); - INIT_LIST_HEAD(&sighand->signalfd_list); + init_waitqueue_head(&sighand->signalfd_wqh); } void __init proc_caches_init(void) diff --git a/kernel/signal.c b/kernel/signal.c index 3169bed0b4d..9fb91a32edd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -378,8 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ - if (likely(tsk == current)) - signr = __dequeue_signal(&tsk->pending, mask, info); + signr = __dequeue_signal(&tsk->pending, mask, info); if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); @@ -407,8 +406,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) } } } - if (likely(tsk == current)) - recalc_sigpending(); + recalc_sigpending(); if (signr && unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. Our @@ -425,7 +423,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; } - if (signr && likely(tsk == current) && + if (signr && ((info->si_code & __SI_MASK) == __SI_TIMER) && info->si_sys_private){ /* -- cgit v1.2.3-70-g09d2 From b7e113dc9d52c4a37d2da6fafe77959f3a28eccf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 22 Sep 2007 22:29:06 +0000 Subject: clockevents: remove the suspend/resume workaround^Wthinko In a desparate attempt to fix the suspend/resume problem on Andrews VAIO I added a workaround which enforced the broadcast of the oneshot timer on resume. This was actually resolving the problem on the VAIO but was just a stupid workaround, which was not tackling the root cause: the assignement of lower idle C-States in the ACPI processor_idle code. The cpuidle patches, which utilize the dynamic tick feature and go faster into deeper C-states exposed the problem again. The correct solution is the previous patch, which prevents lower C-states across the suspend/resume. Remove the enforcement code, including the conditional broadcast timer arming, which helped to pamper over the real problem for quite a time. The oneshot broadcast flag for the cpu, which runs the resume code can never be set at the time when this code is executed. It only gets set, when the CPU is entering a lower idle C-State. Signed-off-by: Thomas Gleixner Tested-by: Andrew Morton Cc: Len Brown Cc: Venkatesh Pallipadi Cc: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- kernel/time/tick-broadcast.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index aab881c86a1..0962e057766 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -382,23 +382,8 @@ static int tick_broadcast_set_event(ktime_t expires, int force) int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { - int cpu = smp_processor_id(); - - /* - * If the CPU is marked for broadcast, enforce oneshot - * broadcast mode. 
The jinxed VAIO does not resume otherwise. - * No idea why it ends up in a lower C State during resume - * without notifying the clock events layer. - */ - if (cpu_isset(cpu, tick_broadcast_mask)) - cpu_set(cpu, tick_broadcast_oneshot_mask); - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - - if(!cpus_empty(tick_broadcast_oneshot_mask)) - tick_broadcast_set_event(ktime_get(), 1); - - return cpu_isset(cpu, tick_broadcast_oneshot_mask); + return 0; } /* -- cgit v1.2.3-70-g09d2 From 459685c75b82a0431da102365d507fdb72858b84 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Sep 2007 01:54:12 +0100 Subject: hibernation doesn't even build on frv - tons of helpers are missing Signed-off-by: Al Viro Acked-By: David Howells Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index c8580a1e687..14b0e10dc95 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -110,7 +110,7 @@ config SUSPEND config HIBERNATION_UP_POSSIBLE bool - depends on X86 || PPC64_SWSUSP || FRV || PPC32 + depends on X86 || PPC64_SWSUSP || PPC32 depends on !SMP default y -- cgit v1.2.3-70-g09d2 From 4047727e5ae33f9b8d2b7766d1994ea6e5ec2991 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Mon, 1 Oct 2007 01:20:10 -0700 Subject: Fix SMP poweroff hangs We need to disable all CPUs other than the boot CPU (usually 0) before attempting to power-off modern SMP machines. This fixes the hang-on-poweroff issue on my MythTV SMP box, and also on Thomas Gleixner's new toybox. Signed-off-by: Mark Lord Acked-by: Thomas Gleixner Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 1b33b05d346..8ae2e636eb1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -878,6 +879,7 @@ void kernel_power_off(void) kernel_shutdown_prepare(SYSTEM_POWER_OFF); if (pm_power_off_prepare) pm_power_off_prepare(); + disable_nonboot_cpus(); sysdev_shutdown(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); -- cgit v1.2.3-70-g09d2 From 9f96cb1e8bca179a92afa40dfc3c49990f1cfc71 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 1 Oct 2007 01:20:13 -0700 Subject: robust futex thread exit race Calling handle_futex_death in exit_robust_list for the different robust mutexes of a thread basically frees the mutex. Another thread might grab the lock immediately which updates the next pointer of the mutex. fetch_robust_entry over the next pointer might therefore branch into the robust mutex list of a different thread. This can cause two problems: 1) some mutexes held by the dead thread are not getting freed and 2) some mutexs held by a different thread are freed. The next point need to be read before calling handle_futex_death. 
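The idiom the fix relies on can be shown with a small stand-alone sketch. This is plain user-space C with a made-up struct node, release() and walk_and_release(); it illustrates only the ordering ("read the successor before handing the current element away"), not the kernel's futex code:

#include <stdlib.h>

struct node {
	struct node *next;
};

/*
 * After release(), the node may immediately be reused by another owner,
 * who is free to rewrite ->next.
 */
static void release(struct node *n)
{
	free(n);
}

static void walk_and_release(struct node *entry)
{
	struct node *next;

	while (entry) {
		/*
		 * Snapshot the successor first; once release() has run,
		 * entry->next can no longer be trusted.
		 */
		next = entry->next;
		release(entry);
		entry = next;
	}
}

The pre-fix code did the equivalent of reading entry->next after release(), which is exactly the window this patch closes.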
Signed-off-by: Martin Schwidefsky Acked-by: Ingo Molnar Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 26 ++++++++++++++++---------- kernel/futex_compat.c | 28 ++++++++++++++++++---------- 2 files changed, 34 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e8935b195e8..fcc94e7b408 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1943,9 +1943,10 @@ static inline int fetch_robust_entry(struct robust_list __user **entry, void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; - struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; unsigned long futex_offset; + int rc; /* * Fetch the list head (which was registered earlier, via @@ -1965,11 +1966,13 @@ void exit_robust_list(struct task_struct *curr) if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; - if (pending) - handle_futex_death((void __user *)pending + futex_offset, - curr, pip); - + next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); /* * A pending lock might already be on the list, so * don't process it twice: @@ -1978,11 +1981,10 @@ void exit_robust_list(struct task_struct *curr) if (handle_futex_death((void __user *)entry + futex_offset, curr, pi)) return; - /* - * Fetch the next entry in the list: - */ - if (fetch_robust_entry(&entry, &entry->next, &pi)) + if (rc) return; + entry = next_entry; + pi = next_pi; /* * Avoid excessively long or circular lists: */ @@ -1991,6 +1993,10 @@ void exit_robust_list(struct task_struct *curr) cond_resched(); } + + if (pending) + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 7e52eb051f2..2c2e2954b71 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -38,10 +38,11 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; - struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - compat_uptr_t uentry, upending; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; + compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; + int rc; /* * Fetch the list head (which was registered earlier, via @@ -61,10 +62,15 @@ void compat_exit_robust_list(struct task_struct *curr) if (fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; - if (pending) - handle_futex_death((void __user *)pending + futex_offset, curr, pip); + next_entry = NULL; /* avoid warning with gcc */ while (entry != (struct robust_list __user *) &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_uentry, &next_entry, + (compat_uptr_t __user *)&entry->next, &next_pi); /* * A pending lock might already be on the list, so * dont process it twice: @@ -74,12 +80,11 @@ void 
compat_exit_robust_list(struct task_struct *curr) curr, pi)) return; - /* - * Fetch the next entry in the list: - */ - if (fetch_robust_entry(&uentry, &entry, - (compat_uptr_t __user *)&entry->next, &pi)) + if (rc) return; + uentry = next_uentry; + entry = next_entry; + pi = next_pi; /* * Avoid excessively long or circular lists: */ @@ -88,6 +93,9 @@ void compat_exit_robust_list(struct task_struct *curr) cond_resched(); } + if (pending) + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); } asmlinkage long -- cgit v1.2.3-70-g09d2 From 30084fbd1caa4b2e1a336fcdef60b68129d1d8f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 2 Oct 2007 14:13:08 +0200 Subject: sched: fix profile=sleep fix sleep profiling - we lost this chunk in the CFS merge. Found-by: Mel Gorman Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c9fbe8e73a4..67c67a87146 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -639,6 +639,16 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->block_start = 0; se->sum_sleep_runtime += delta; + + /* + * Blocking time is in units of nanosecs, so shift by 20 to + * get a milliseconds-range estimation of the amount of + * time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), + delta >> 20); + } } #endif } -- cgit v1.2.3-70-g09d2 From 74922be1485818ed368c4cf4f0b100f70bf01e08 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 7 Oct 2007 00:24:31 -0700 Subject: Fix timer_stats printout of events/sec When using /proc/timer_stats on ppc64 I noticed the events/sec field wasnt accurate. Sometimes the integer part was incorrect due to rounding (we werent taking the fractional seconds into consideration). The fraction part is also wrong, we need to pad the printf statement and take the bottom three digits of 1000 times the value. Signed-off-by: Anton Blanchard Acked-by: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/timer_stats.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 3c38fb5eae1..c36bb7ed030 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -327,8 +327,9 @@ static int tstats_show(struct seq_file *m, void *v) ms = 1; if (events && period.tv_sec) - seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, - events / period.tv_sec, events * 1000 / ms); + seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", + events, events * 1000 / ms, + (events * 1000000 / ms) % 1000); else seq_printf(m, "%ld total events\n", events); -- cgit v1.2.3-70-g09d2 From 291041e935e6d0513f2b7e4a300aa9f02ec1d925 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Oct 2007 00:24:36 -0700 Subject: fix bogus reporting of signals by audit Async signals should not be reported as sent by current in audit log. As it is, we call audit_signal_info() too early in check_kill_permission(). Note that check_kill_permission() has that test already - it needs to know if it should apply current-based permission checks. So the solution is to move the call of audit_signal_info() between those. Bogosity in question is easily reproduced - add a rule watching for e.g. 
kill(2) from specific process (so that audit_signal_info() would not short-circuit to nothing), say load_policy, watch the bogus OBJ_PID entry in audit logs claiming that write(2) on selinuxfs file issued by load_policy(8) had somehow managed to send a signal to syslogd... Signed-off-by: Al Viro Acked-by: Steve Grubb Acked-by: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9fb91a32edd..79295238109 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -531,18 +531,18 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return error; - error = audit_signal_info(sig, t); /* Let audit system see the signal */ - if (error) - return error; - - error = -EPERM; - if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) - && ((sig != SIGCONT) || - (process_session(current) != process_session(t))) - && (current->euid ^ t->suid) && (current->euid ^ t->uid) - && (current->uid ^ t->suid) && (current->uid ^ t->uid) - && !capable(CAP_KILL)) + if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) { + error = audit_signal_info(sig, t); /* Let audit system see the signal */ + if (error) + return error; + error = -EPERM; + if (((sig != SIGCONT) || + (process_session(current) != process_session(t))) + && (current->euid ^ t->suid) && (current->euid ^ t->uid) + && (current->uid ^ t->suid) && (current->uid ^ t->uid) + && !capable(CAP_KILL)) return error; + } return security_task_kill(t, info, sig, 0); } -- cgit v1.2.3-70-g09d2 From f5ff8422bbdd59f8c1f699df248e1b7a11073027 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 21 Sep 2007 09:19:54 +0200 Subject: Fix warnings with !CONFIG_BLOCK Hide everything in blkdev.h with CONFIG_BLOCK isn't set, and fixup the (few) files that fail to build because they were relying on blkdev.h pulling in extra includes for them. 
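The shape of the change can be sketched with a hypothetical header; example_blk.h and its contents are invented for illustration and stand in for the real blkdev.h, where everything block-specific now sits inside the CONFIG_BLOCK guard:

/*
 * Illustrative only: with !CONFIG_BLOCK this header is (almost) empty,
 * so nothing can lean on it for unrelated declarations.
 */
#ifndef _LINUX_EXAMPLE_BLK_H
#define _LINUX_EXAMPLE_BLK_H

#ifdef CONFIG_BLOCK

#include <linux/types.h>

struct example_queue;

int example_queue_congested(struct example_queue *q);

#endif /* CONFIG_BLOCK */

#endif /* _LINUX_EXAMPLE_BLK_H */

The other half of the patch is then mechanical: files that previously got transitive includes through this header add the headers they actually need themselves.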
Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 1 + include/linux/blkdev.h | 4 ++-- include/linux/writeback.h | 1 + kernel/sched.c | 1 + mm/readahead.c | 1 + 5 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a4b142a6a2c..8d23b0b3871 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -14,6 +14,7 @@ */ #include +#include #include #include #include diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 492ac946391..a0a99814044 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1,6 +1,8 @@ #ifndef _LINUX_BLKDEV_H #define _LINUX_BLKDEV_H +#ifdef CONFIG_BLOCK + #include #include #include @@ -32,8 +34,6 @@ ) #endif -#ifdef CONFIG_BLOCK - struct scsi_ioctl_command; struct request_queue; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b4af6bcb7b7..c7c3337c3a8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -5,6 +5,7 @@ #define WRITEBACK_H #include +#include struct backing_dev_info; diff --git a/kernel/sched.c b/kernel/sched.c index 6107a0cd632..6c10fa796ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -61,6 +61,7 @@ #include #include #include +#include #include diff --git a/mm/readahead.c b/mm/readahead.c index 39bf45d4332..be20c9d699d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -15,6 +15,7 @@ #include #include #include +#include void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { -- cgit v1.2.3-70-g09d2 From a272378d1128d1c60a463a315646c86d174ff74c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 19 Aug 2007 17:16:05 -0700 Subject: [KTIME]: Introduce ktime_sub_ns and ktime_sub_us First user will be the DCCP transport networking protocol. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/ktime.h | 21 +++++++++++++++++++++ kernel/hrtimer.c | 24 ++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'kernel') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index dae7143644f..a6ddec141f9 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -102,6 +102,13 @@ static inline ktime_t ktime_set(const long secs, const unsigned long nsecs) #define ktime_add_ns(kt, nsval) \ ({ (ktime_t){ .tv64 = (kt).tv64 + (nsval) }; }) +/* + * Subtract a scalar nanosecod from a ktime_t variable + * res = kt - nsval: + */ +#define ktime_sub_ns(kt, nsval) \ + ({ (ktime_t){ .tv64 = (kt).tv64 - (nsval) }; }) + /* convert a timespec to ktime_t format: */ static inline ktime_t timespec_to_ktime(struct timespec ts) { @@ -199,6 +206,15 @@ static inline ktime_t ktime_add(const ktime_t add1, const ktime_t add2) */ extern ktime_t ktime_add_ns(const ktime_t kt, u64 nsec); +/** + * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable + * @kt: minuend + * @nsec: the scalar nsec value to subtract + * + * Returns the subtraction of @nsec from @kt in ktime_t format + */ +extern ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec); + /** * timespec_to_ktime - convert a timespec to ktime_t format * @ts: the timespec variable to convert @@ -289,6 +305,11 @@ static inline ktime_t ktime_add_us(const ktime_t kt, const u64 usec) return ktime_add_ns(kt, usec * 1000); } +static inline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec) +{ + return ktime_sub_ns(kt, usec * 1000); +} + /* * The resolution of the clocks. 
The resolution value is returned in * the clock_getres() system call to give application programmers an diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c21ca6bfaa6..dc8a4451d79 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -277,6 +277,30 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) } EXPORT_SYMBOL_GPL(ktime_add_ns); + +/** + * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable + * @kt: minuend + * @nsec: the scalar nsec value to subtract + * + * Returns the subtraction of @nsec from @kt in ktime_t format + */ +ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) +{ + ktime_t tmp; + + if (likely(nsec < NSEC_PER_SEC)) { + tmp.tv64 = nsec; + } else { + unsigned long rem = do_div(nsec, NSEC_PER_SEC); + + tmp = ktime_set((long)nsec, rem); + } + + return ktime_sub(kt, tmp); +} + +EXPORT_SYMBOL_GPL(ktime_sub_ns); # endif /* !CONFIG_KTIME_SCALAR */ /* -- cgit v1.2.3-70-g09d2 From c45248c70125cc374fdf264659643276c72801bf Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Mon, 17 Sep 2007 11:47:12 -0700 Subject: [SOFTIRQ]: Remove do_softirq() symbol export. As noted by Christoph Hellwig, pktgen was the only user so it can now be removed. [ Add missing cases caught by Adrian Bunk. -DaveM ] Signed-off-by: Robert Olsson Signed-off-by: David S. Miller --- arch/i386/kernel/irq.c | 2 -- arch/powerpc/kernel/irq.c | 1 - arch/s390/kernel/irq.c | 1 - arch/sh/kernel/irq.c | 1 - arch/x86_64/kernel/irq.c | 1 - kernel/softirq.c | 2 -- 6 files changed, 8 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index dd2b97fc00b..4f681bcdb1f 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -231,8 +231,6 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } - -EXPORT_SYMBOL(do_softirq); #endif /* diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 24bea97c736..9bf63d5256d 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -395,7 +395,6 @@ void do_softirq(void) local_irq_restore(flags); } -EXPORT_SYMBOL(do_softirq); /* diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 8f0cbca3120..c36d8123ca1 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -95,7 +95,6 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } -EXPORT_SYMBOL(do_softirq); void init_irq_proc(void) { diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 03404987528..4b49d03ffbd 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -245,7 +245,6 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } -EXPORT_SYMBOL(do_softirq); #endif void __init init_IRQ(void) diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 39cb3fa83eb..bd11e42b22b 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -210,4 +210,3 @@ asmlinkage void do_softirq(void) } local_irq_restore(flags); } -EXPORT_SYMBOL(do_softirq); diff --git a/kernel/softirq.c b/kernel/softirq.c index 0f546ddea43..dbbdcd7f3c2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -271,8 +271,6 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } -EXPORT_SYMBOL(do_softirq); - #endif /* -- cgit v1.2.3-70-g09d2 From b4b510290b056b86611757ce1175a230f1080f53 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 12 Sep 2007 13:05:38 +0200 Subject: [NET]: Support multiple network namespaces with netlink Each netlink socket will live in exactly one network namespace, this includes the controlling kernel sockets. 
This patch updates all of the existing netlink protocols to only support the initial network namespace. Request by clients in other namespaces will get -ECONREFUSED. As they would if the kernel did not have the support for that netlink protocol compiled in. As each netlink protocol is updated to be multiple network namespace safe it can register multiple kernel sockets to acquire a presence in the rest of the network namespaces. The implementation in af_netlink is a simple filter implementation at hash table insertion and hash table look up time. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- drivers/connector/connector.c | 2 +- drivers/scsi/scsi_netlink.c | 2 +- drivers/scsi/scsi_transport_iscsi.c | 2 +- fs/ecryptfs/netlink.c | 2 +- include/linux/netlink.h | 6 ++- kernel/audit.c | 4 +- lib/kobject_uevent.c | 5 +- net/bridge/netfilter/ebt_ulog.c | 5 +- net/core/rtnetlink.c | 4 +- net/decnet/netfilter/dn_rtmsg.c | 3 +- net/ipv4/fib_frontend.c | 4 +- net/ipv4/inet_diag.c | 4 +- net/ipv4/netfilter/ip_queue.c | 6 +-- net/ipv4/netfilter/ipt_ULOG.c | 3 +- net/ipv6/netfilter/ip6_queue.c | 6 +-- net/netfilter/nfnetlink.c | 2 +- net/netfilter/nfnetlink_log.c | 3 +- net/netfilter/nfnetlink_queue.c | 3 +- net/netlink/af_netlink.c | 104 +++++++++++++++++++++++++++--------- net/netlink/genetlink.c | 4 +- net/xfrm/xfrm_user.c | 2 +- security/selinux/netlink.c | 5 +- 22 files changed, 121 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index a7b9e9bb3e8..569070997cc 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -446,7 +446,7 @@ static int __devinit cn_init(void) dev->id.idx = cn_idx; dev->id.val = cn_val; - dev->nls = netlink_kernel_create(NETLINK_CONNECTOR, + dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, CN_NETLINK_USERS + 0xf, dev->input, NULL, THIS_MODULE); if (!dev->nls) diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c index 4bf9aa547c7..163acf6ad2d 100644 --- a/drivers/scsi/scsi_netlink.c +++ b/drivers/scsi/scsi_netlink.c @@ -167,7 +167,7 @@ scsi_netlink_init(void) return; } - scsi_nl_sock = netlink_kernel_create(NETLINK_SCSITRANSPORT, + scsi_nl_sock = netlink_kernel_create(&init_net, NETLINK_SCSITRANSPORT, SCSI_NL_GRP_CNT, scsi_nl_rcv, NULL, THIS_MODULE); if (!scsi_nl_sock) { diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 34c1860a259..4916f01230d 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1523,7 +1523,7 @@ static __init int iscsi_transport_init(void) if (err) goto unregister_conn_class; - nls = netlink_kernel_create(NETLINK_ISCSI, 1, iscsi_if_rx, NULL, + nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, 1, iscsi_if_rx, NULL, THIS_MODULE); if (!nls) { err = -ENOBUFS; diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c index fe9186312d7..056519cd92b 100644 --- a/fs/ecryptfs/netlink.c +++ b/fs/ecryptfs/netlink.c @@ -227,7 +227,7 @@ int ecryptfs_init_netlink(void) { int rc; - ecryptfs_nl_sock = netlink_kernel_create(NETLINK_ECRYPTFS, 0, + ecryptfs_nl_sock = netlink_kernel_create(&init_net, NETLINK_ECRYPTFS, 0, ecryptfs_receive_nl_message, NULL, THIS_MODULE); if (!ecryptfs_nl_sock) { diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 83d8239f0cc..d2843ae4a83 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -27,6 +27,8 @@ #define MAX_LINKS 32 +struct net; + struct sockaddr_nl { 
sa_family_t nl_family; /* AF_NETLINK */ @@ -157,7 +159,8 @@ struct netlink_skb_parms #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) -extern struct sock *netlink_kernel_create(int unit, unsigned int groups, +extern struct sock *netlink_kernel_create(struct net *net, + int unit,unsigned int groups, void (*input)(struct sock *sk, int len), struct mutex *cb_mutex, struct module *module); @@ -206,6 +209,7 @@ struct netlink_callback struct netlink_notify { + struct net *net; int pid; int protocol; }; diff --git a/kernel/audit.c b/kernel/audit.c index eb0f9165b40..f3c390f6c0b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -876,8 +876,8 @@ static int __init audit_init(void) printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, - NULL, THIS_MODULE); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, + audit_receive, NULL, THIS_MODULE); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index df02814699d..e06a8dcec0f 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -280,9 +280,8 @@ EXPORT_SYMBOL_GPL(add_uevent_var); #if defined(CONFIG_NET) static int __init kobject_uevent_init(void) { - uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, 1, NULL, - NULL, THIS_MODULE); - + uevent_sock = netlink_kernel_create(&init_net, NETLINK_KOBJECT_UEVENT, + 1, NULL, NULL, THIS_MODULE); if (!uevent_sock) { printk(KERN_ERR "kobject_uevent: unable to create netlink socket!\n"); diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 204c968fa86..e7cfd30bac7 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -300,8 +300,9 @@ static int __init ebt_ulog_init(void) spin_lock_init(&ulog_buffers[i].lock); } - ebtulognl = netlink_kernel_create(NETLINK_NFLOG, EBT_ULOG_MAXNLGROUPS, - NULL, NULL, THIS_MODULE); + ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, + EBT_ULOG_MAXNLGROUPS, NULL, NULL, + THIS_MODULE); if (!ebtulognl) ret = -ENOMEM; else if ((ret = ebt_register_watcher(&ulog))) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 41859508bed..416768d1e0c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1327,8 +1327,8 @@ void __init rtnetlink_init(void) if (!rta_buf) panic("rtnetlink_init: cannot allocate rta_buf\n"); - rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv, - &rtnl_mutex, THIS_MODULE); + rtnl = netlink_kernel_create(&init_net, NETLINK_ROUTE, RTNLGRP_MAX, + rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); if (rtnl == NULL) panic("rtnetlink_init: cannot initialize rtnetlink\n"); netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 696234688cf..ebb38feb4df 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -137,7 +137,8 @@ static int __init dn_rtmsg_init(void) { int rv = 0; - dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, DNRNG_NLGRP_MAX, + dnrmg = netlink_kernel_create(&init_net, + NETLINK_DNRTMSG, DNRNG_NLGRP_MAX, dnrmg_receive_user_sk, NULL, THIS_MODULE); if (dnrmg == NULL) { printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cefb55ec3d6..140bf7a8d87 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -816,8 +816,8 @@ static void 
nl_fib_input(struct sock *sk, int len) static void nl_fib_lookup_init(void) { - netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, NULL, - THIS_MODULE); + netlink_kernel_create(&init_net, NETLINK_FIB_LOOKUP, 0, nl_fib_input, + NULL, THIS_MODULE); } static void fib_disable_ip(struct net_device *dev, int force) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 686ddd62f71..031cc4856b4 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -897,8 +897,8 @@ static int __init inet_diag_init(void) if (!inet_diag_table) goto out; - idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv, - NULL, THIS_MODULE); + idiagnl = netlink_kernel_create(&init_net, NETLINK_INET_DIAG, 0, + inet_diag_rcv, NULL, THIS_MODULE); if (idiagnl == NULL) goto out_free_table; err = 0; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index d91856097f2..82fda92e6b9 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -579,7 +579,7 @@ ipq_rcv_nl_event(struct notifier_block *this, if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL && n->pid) { write_lock_bh(&queue_lock); - if (n->pid == peer_pid) + if ((n->net == &init_net) && (n->pid == peer_pid)) __ipq_reset(); write_unlock_bh(&queue_lock); } @@ -671,8 +671,8 @@ static int __init ip_queue_init(void) struct proc_dir_entry *proc; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk, - NULL, THIS_MODULE); + ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0, + ipq_rcv_sk, NULL, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 6ca43e4ca7e..c636d6d6357 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -409,7 +409,8 @@ static int __init ipt_ulog_init(void) for (i = 0; i < ULOG_MAXNLGROUPS; i++) setup_timer(&ulog_buffers[i].timer, ulog_timer, i); - nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, + nflognl = netlink_kernel_create(&init_net, + NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, NULL, THIS_MODULE); if (!nflognl) return -ENOMEM; diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 64536a3ef2f..2f5a5245383 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -569,7 +569,7 @@ ipq_rcv_nl_event(struct notifier_block *this, if (event == NETLINK_URELEASE && n->protocol == NETLINK_IP6_FW && n->pid) { write_lock_bh(&queue_lock); - if (n->pid == peer_pid) + if ((n->net == &init_net) && (n->pid == peer_pid)) __ipq_reset(); write_unlock_bh(&queue_lock); } @@ -661,8 +661,8 @@ static int __init ip6_queue_init(void) struct proc_dir_entry *proc; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_IP6_FW, 0, ipq_rcv_sk, NULL, - THIS_MODULE); + ipqnl = netlink_kernel_create(&init_net, NETLINK_IP6_FW, 0, ipq_rcv_sk, + NULL, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 8797e6953ef..fa974e8e0ce 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -264,7 +264,7 @@ static int __init nfnetlink_init(void) { printk("Netfilter messages via NETLINK v%s.\n", nfversion); - nfnl = netlink_kernel_create(NETLINK_NETFILTER, NFNLGRP_MAX, + nfnl = 
netlink_kernel_create(&init_net, NETLINK_NETFILTER, NFNLGRP_MAX, nfnetlink_rcv, NULL, THIS_MODULE); if (!nfnl) { printk(KERN_ERR "cannot initialize nfnetlink!\n"); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 2351533a850..8e4001b8f76 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -706,7 +706,8 @@ nfulnl_rcv_nl_event(struct notifier_block *this, hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { UDEBUG("node = %p\n", inst); - if (n->pid == inst->peer_pid) + if ((n->net == &init_net) && + (n->pid == inst->peer_pid)) __instance_destroy(inst); } } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 5a8e8ff7664..c97369f48db 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -765,7 +765,8 @@ nfqnl_rcv_nl_event(struct notifier_block *this, struct hlist_head *head = &instance_table[i]; hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { - if (n->pid == inst->peer_pid) + if ((n->net == &init_net) && + (n->pid == inst->peer_pid)) __instance_destroy(inst); } } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 406a493300d..3029f865cd6 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -211,7 +211,7 @@ netlink_unlock_table(void) wake_up(&nl_table_wait); } -static __inline__ struct sock *netlink_lookup(int protocol, u32 pid) +static __inline__ struct sock *netlink_lookup(struct net *net, int protocol, u32 pid) { struct nl_pid_hash *hash = &nl_table[protocol].hash; struct hlist_head *head; @@ -221,7 +221,7 @@ static __inline__ struct sock *netlink_lookup(int protocol, u32 pid) read_lock(&nl_table_lock); head = nl_pid_hashfn(hash, pid); sk_for_each(sk, node, head) { - if (nlk_sk(sk)->pid == pid) { + if ((sk->sk_net == net) && (nlk_sk(sk)->pid == pid)) { sock_hold(sk); goto found; } @@ -328,7 +328,7 @@ netlink_update_listeners(struct sock *sk) * makes sure updates are visible before bind or setsockopt return. */ } -static int netlink_insert(struct sock *sk, u32 pid) +static int netlink_insert(struct sock *sk, struct net *net, u32 pid) { struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; struct hlist_head *head; @@ -341,7 +341,7 @@ static int netlink_insert(struct sock *sk, u32 pid) head = nl_pid_hashfn(hash, pid); len = 0; sk_for_each(osk, node, head) { - if (nlk_sk(osk)->pid == pid) + if ((osk->sk_net == net) && (nlk_sk(osk)->pid == pid)) break; len++; } @@ -419,9 +419,6 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol) struct netlink_sock *nlk; int err = 0; - if (net != &init_net) - return -EAFNOSUPPORT; - sock->state = SS_UNCONNECTED; if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) @@ -481,6 +478,7 @@ static int netlink_release(struct socket *sock) if (nlk->pid && !nlk->subscriptions) { struct netlink_notify n = { + .net = sk->sk_net, .protocol = sk->sk_protocol, .pid = nlk->pid, }; @@ -509,6 +507,7 @@ static int netlink_release(struct socket *sock) static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; struct hlist_head *head; struct sock *osk; @@ -522,6 +521,8 @@ retry: netlink_table_grab(); head = nl_pid_hashfn(hash, pid); sk_for_each(osk, node, head) { + if ((osk->sk_net != net)) + continue; if (nlk_sk(osk)->pid == pid) { /* Bind collision, search negative pid values. 
*/ pid = rover--; @@ -533,7 +534,7 @@ retry: } netlink_table_ungrab(); - err = netlink_insert(sk, pid); + err = netlink_insert(sk, net, pid); if (err == -EADDRINUSE) goto retry; @@ -598,6 +599,7 @@ static int netlink_realloc_groups(struct sock *sk) static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; @@ -619,7 +621,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len return -EINVAL; } else { err = nladdr->nl_pid ? - netlink_insert(sk, nladdr->nl_pid) : + netlink_insert(sk, net, nladdr->nl_pid) : netlink_autobind(sock); if (err) return err; @@ -703,10 +705,12 @@ static void netlink_overrun(struct sock *sk) static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) { int protocol = ssk->sk_protocol; + struct net *net; struct sock *sock; struct netlink_sock *nlk; - sock = netlink_lookup(protocol, pid); + net = ssk->sk_net; + sock = netlink_lookup(net, protocol, pid); if (!sock) return ERR_PTR(-ECONNREFUSED); @@ -887,6 +891,7 @@ static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff struct netlink_broadcast_data { struct sock *exclude_sk; + struct net *net; u32 pid; u32 group; int failure; @@ -909,6 +914,9 @@ static inline int do_one_broadcast(struct sock *sk, !test_bit(p->group - 1, nlk->groups)) goto out; + if ((sk->sk_net != p->net)) + goto out; + if (p->failure) { netlink_overrun(sk); goto out; @@ -947,6 +955,7 @@ out: int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, u32 group, gfp_t allocation) { + struct net *net = ssk->sk_net; struct netlink_broadcast_data info; struct hlist_node *node; struct sock *sk; @@ -954,6 +963,7 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, skb = netlink_trim(skb, allocation); info.exclude_sk = ssk; + info.net = net; info.pid = pid; info.group = group; info.failure = 0; @@ -1002,6 +1012,9 @@ static inline int do_one_set_err(struct sock *sk, if (sk == p->exclude_sk) goto out; + if (sk->sk_net != p->exclude_sk->sk_net) + goto out; + if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) goto out; @@ -1304,7 +1317,7 @@ static void netlink_data_ready(struct sock *sk, int len) */ struct sock * -netlink_kernel_create(int unit, unsigned int groups, +netlink_kernel_create(struct net *net, int unit, unsigned int groups, void (*input)(struct sock *sk, int len), struct mutex *cb_mutex, struct module *module) { @@ -1321,7 +1334,7 @@ netlink_kernel_create(int unit, unsigned int groups, if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0) + if (__netlink_create(net, sock, cb_mutex, unit) < 0) goto out_sock_release; if (groups < 32) @@ -1336,18 +1349,20 @@ netlink_kernel_create(int unit, unsigned int groups, if (input) nlk_sk(sk)->data_ready = input; - if (netlink_insert(sk, 0)) + if (netlink_insert(sk, net, 0)) goto out_sock_release; nlk = nlk_sk(sk); nlk->flags |= NETLINK_KERNEL_SOCKET; netlink_table_grab(); - nl_table[unit].groups = groups; - nl_table[unit].listeners = listeners; - nl_table[unit].cb_mutex = cb_mutex; - nl_table[unit].module = module; - nl_table[unit].registered = 1; + if (!nl_table[unit].registered) { + nl_table[unit].groups = groups; + nl_table[unit].listeners = listeners; + nl_table[unit].cb_mutex = cb_mutex; + 
nl_table[unit].module = module; + nl_table[unit].registered = 1; + } netlink_table_ungrab(); return sk; @@ -1513,7 +1528,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, atomic_inc(&skb->users); cb->skb = skb; - sk = netlink_lookup(ssk->sk_protocol, NETLINK_CB(skb).pid); + sk = netlink_lookup(ssk->sk_net, ssk->sk_protocol, NETLINK_CB(skb).pid); if (sk == NULL) { netlink_destroy_callback(cb); return -ECONNREFUSED; @@ -1555,7 +1570,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) if (!skb) { struct sock *sk; - sk = netlink_lookup(in_skb->sk->sk_protocol, + sk = netlink_lookup(in_skb->sk->sk_net, + in_skb->sk->sk_protocol, NETLINK_CB(in_skb).pid); if (sk) { sk->sk_err = ENOBUFS; @@ -1706,6 +1722,7 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid, #ifdef CONFIG_PROC_FS struct nl_seq_iter { + struct net *net; int link; int hash_idx; }; @@ -1723,6 +1740,8 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) for (j = 0; j <= hash->mask; j++) { sk_for_each(s, node, &hash->table[j]) { + if (iter->net != s->sk_net) + continue; if (off == pos) { iter->link = i; iter->hash_idx = j; @@ -1752,11 +1771,14 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (v == SEQ_START_TOKEN) return netlink_seq_socket_idx(seq, 0); - s = sk_next(v); + iter = seq->private; + s = v; + do { + s = sk_next(s); + } while (s && (iter->net != s->sk_net)); if (s) return s; - iter = seq->private; i = iter->link; j = iter->hash_idx + 1; @@ -1765,6 +1787,8 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) for (; j <= hash->mask; j++) { s = sk_head(&hash->table[j]); + while (s && (iter->net != s->sk_net)) + s = sk_next(s); if (s) { iter->link = i; iter->hash_idx = j; @@ -1835,15 +1859,24 @@ static int netlink_seq_open(struct inode *inode, struct file *file) seq = file->private_data; seq->private = iter; + iter->net = get_net(PROC_NET(inode)); return 0; } +static int netlink_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct nl_seq_iter *iter = seq->private; + put_net(iter->net); + return seq_release_private(inode, file); +} + static const struct file_operations netlink_seq_fops = { .owner = THIS_MODULE, .open = netlink_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = netlink_seq_release, }; #endif @@ -1885,6 +1918,27 @@ static struct net_proto_family netlink_family_ops = { .owner = THIS_MODULE, /* for consistency 8) */ }; +static int netlink_net_init(struct net *net) +{ +#ifdef CONFIG_PROC_FS + if (!proc_net_fops_create(net, "netlink", 0, &netlink_seq_fops)) + return -ENOMEM; +#endif + return 0; +} + +static void netlink_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_net_remove(net, "netlink"); +#endif +} + +static struct pernet_operations netlink_net_ops = { + .init = netlink_net_init, + .exit = netlink_net_exit, +}; + static int __init netlink_proto_init(void) { struct sk_buff *dummy_skb; @@ -1930,9 +1984,7 @@ static int __init netlink_proto_init(void) } sock_register(&netlink_family_ops); -#ifdef CONFIG_PROC_FS - proc_net_fops_create(&init_net, "netlink", 0, &netlink_seq_fops); -#endif + register_pernet_subsys(&netlink_net_ops); /* The netlink device handler may be needed early. 
*/ rtnetlink_init(); out: diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 8c11ca4a212..af8fe26815f 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -782,8 +782,8 @@ static int __init genl_init(void) netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV); /* we'll bump the group number right afterwards */ - genl_sock = netlink_kernel_create(NETLINK_GENERIC, 0, genl_rcv, - NULL, THIS_MODULE); + genl_sock = netlink_kernel_create(&init_net, NETLINK_GENERIC, 0, + genl_rcv, NULL, THIS_MODULE); if (genl_sock == NULL) panic("GENL: Cannot initialize generic netlink\n"); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0d81c0f2391..1f8e7c22ddb 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2399,7 +2399,7 @@ static int __init xfrm_user_init(void) printk(KERN_INFO "Initializing XFRM netlink socket\n"); - nlsk = netlink_kernel_create(NETLINK_XFRM, XFRMNLGRP_MAX, + nlsk = netlink_kernel_create(&init_net, NETLINK_XFRM, XFRMNLGRP_MAX, xfrm_netlink_rcv, NULL, THIS_MODULE); if (nlsk == NULL) return -ENOMEM; diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c index f49046de63a..b59871d74da 100644 --- a/security/selinux/netlink.c +++ b/security/selinux/netlink.c @@ -17,6 +17,7 @@ #include #include #include +#include static struct sock *selnl; @@ -104,8 +105,8 @@ void selnl_notify_policyload(u32 seqno) static int __init selnl_init(void) { - selnl = netlink_kernel_create(NETLINK_SELINUX, SELNLGRP_MAX, NULL, NULL, - THIS_MODULE); + selnl = netlink_kernel_create(&init_net, NETLINK_SELINUX, + SELNLGRP_MAX, NULL, NULL, THIS_MODULE); if (selnl == NULL) panic("SELinux: Cannot create netlink socket."); netlink_set_nonroot(NETLINK_SELINUX, NL_NONROOT_RECV); -- cgit v1.2.3-70-g09d2 From 464771fe4743afd00ebff65aee0983fa1aa1da4f Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 12 Sep 2007 15:14:45 +0200 Subject: [KERNEL]: Unexport raise_softirq_irqoff raise_softirq_irqoff no longer has any modular user. Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- kernel/softirq.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index dbbdcd7f3c2..bd89bc4eb0b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -330,8 +330,6 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr) wakeup_softirqd(); } -EXPORT_SYMBOL(raise_softirq_irqoff); - void fastcall raise_softirq(unsigned int nr) { unsigned long flags; -- cgit v1.2.3-70-g09d2 From 9dd776b6d7b0b85966b6ddd03e2b2aae59012ab1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 26 Sep 2007 22:04:26 -0700 Subject: [NET]: Add network namespace clone & unshare support. This patch allows you to create a new network namespace using sys_clone, or sys_unshare. As the network namespace is still experimental and under development clone and unshare support is only made available when CONFIG_NET_NS is selected at compile time. As this patch introduces network namespace support into code paths that exist when the CONFIG_NET is not selected there are a few additions made to net_namespace.h to allow a few more functions to be used when the networking stack is not compiled in. Signed-off-by: Eric W. Biederman Signed-off-by: David S. 
Miller --- include/linux/sched.h | 1 + include/net/net_namespace.h | 18 ++++++++++++++++++ kernel/fork.c | 3 ++- kernel/nsproxy.c | 15 +++++++++++++-- net/Kconfig | 8 ++++++++ net/core/net_namespace.c | 43 +++++++++++++++++++++++++++++++++++++++++-- 6 files changed, 83 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 313c6b6e774..a4a141055c4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -27,6 +27,7 @@ #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ #define CLONE_NEWIPC 0x08000000 /* New ipcs */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#define CLONE_NEWNET 0x20000000 /* New network namespace */ /* * Scheduling policies diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index ac8f8304094..3ea4194613e 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -38,11 +38,23 @@ extern struct net init_net; extern struct list_head net_namespace_list; +#ifdef CONFIG_NET +extern struct net *copy_net_ns(unsigned long flags, struct net *net_ns); +#else +static inline struct net *copy_net_ns(unsigned long flags, struct net *net_ns) +{ + /* There is nothing to copy so this is a noop */ + return net_ns; +} +#endif + extern void __put_net(struct net *net); static inline struct net *get_net(struct net *net) { +#ifdef CONFIG_NET atomic_inc(&net->count); +#endif return net; } @@ -60,19 +72,25 @@ static inline struct net *maybe_get_net(struct net *net) static inline void put_net(struct net *net) { +#ifdef CONFIG_NET if (atomic_dec_and_test(&net->count)) __put_net(net); +#endif } static inline struct net *hold_net(struct net *net) { +#ifdef CONFIG_NET atomic_inc(&net->use_count); +#endif return net; } static inline void release_net(struct net *net) { +#ifdef CONFIG_NET atomic_dec(&net->use_count); +#endif } extern void net_lock(void); diff --git a/kernel/fork.c b/kernel/fork.c index 33f12f48684..5e67f90a169 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1608,7 +1608,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| + CLONE_NEWNET)) goto bad_unshare_out; if ((err = unshare_thread(unshare_flags))) diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index a4fb7d46971..f1decd21a53 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -20,6 +20,7 @@ #include #include #include +#include static struct kmem_cache *nsproxy_cachep; @@ -98,8 +99,17 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_user; } + new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); + if (IS_ERR(new_nsp->net_ns)) { + err = PTR_ERR(new_nsp->net_ns); + goto out_net; + } + return new_nsp; +out_net: + if (new_nsp->user_ns) + put_user_ns(new_nsp->user_ns); out_user: if (new_nsp->pid_ns) put_pid_ns(new_nsp->pid_ns); @@ -132,7 +142,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) get_nsproxy(old_ns); - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER))) + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) { @@ -164,6 +174,7 @@ void free_nsproxy(struct nsproxy *ns) put_pid_ns(ns->pid_ns); if (ns->user_ns) put_user_ns(ns->user_ns); + put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } @@ -177,7 +188,7 @@ int 
unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWUSER))) + CLONE_NEWUSER | CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) diff --git a/net/Kconfig b/net/Kconfig index cdba08ca2ef..ab4e6da5012 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -27,6 +27,14 @@ if NET menu "Networking options" +config NET_NS + bool "Network namespace support" + default n + depends on EXPERIMENTAL && !SYSFS + help + Allow user space to create what appear to be multiple instances + of the network stack. + source "net/packet/Kconfig" source "net/unix/Kconfig" source "net/xfrm/Kconfig" diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 0e6cb02d7b7..e478e353ea6 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -4,6 +4,7 @@ #include #include #include +#include #include /* @@ -32,12 +33,10 @@ void net_unlock(void) mutex_unlock(&net_list_mutex); } -#if 0 static struct net *net_alloc(void) { return kmem_cache_alloc(net_cachep, GFP_KERNEL); } -#endif static void net_free(struct net *net) { @@ -128,6 +127,46 @@ out_undo: goto out; } +struct net *copy_net_ns(unsigned long flags, struct net *old_net) +{ + struct net *new_net = NULL; + int err; + + get_net(old_net); + + if (!(flags & CLONE_NEWNET)) + return old_net; + +#ifndef CONFIG_NET_NS + return ERR_PTR(-EINVAL); +#endif + + err = -ENOMEM; + new_net = net_alloc(); + if (!new_net) + goto out; + + mutex_lock(&net_mutex); + err = setup_net(new_net); + if (err) + goto out_unlock; + + net_lock(); + list_add_tail(&new_net->list, &net_namespace_list); + net_unlock(); + + +out_unlock: + mutex_unlock(&net_mutex); +out: + put_net(old_net); + if (err) { + net_free(new_net); + new_net = ERR_PTR(err); + } + return new_net; +} + static int __init net_ns_init(void) { int err; -- cgit v1.2.3-70-g09d2 From cd40b7d3983c708aabe3d3008ec64ffce56d33b0 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 10 Oct 2007 21:15:29 -0700 Subject: [NET]: make netlink user -> kernel interface synchronious This patch make processing netlink user -> kernel messages synchronious. This change was inspired by the talk with Alexey Kuznetsov about current netlink messages processing. He says that he was badly wrong when introduced asynchronious user -> kernel communication. The call netlink_unicast is the only path to send message to the kernel netlink socket. But, unfortunately, it is also used to send data to the user. Before this change the user message has been attached to the socket queue and sk->sk_data_ready was called. The process has been blocked until all pending messages were processed. The bad thing is that this processing may occur in the arbitrary process context. This patch changes nlk->data_ready callback to get 1 skb and force packet processing right in the netlink_unicast. Kernel -> user path in netlink_unicast remains untouched. EINTR processing for in netlink_run_queue was changed. It forces rtnl_lock drop, but the process remains in the cycle until the message will be fully processed. So, there is no need to use this kludges now. Signed-off-by: Denis V. Lunev Acked-by: Alexey Kuznetsov Signed-off-by: David S. 
Miller --- drivers/connector/connector.c | 14 +--- drivers/scsi/scsi_netlink.c | 25 +----- drivers/scsi/scsi_transport_iscsi.c | 82 +++++++++---------- fs/ecryptfs/netlink.c | 14 +--- include/linux/connector.h | 2 +- include/linux/netlink.h | 2 +- include/net/netlink.h | 6 +- kernel/audit.c | 12 +-- net/core/rtnetlink.c | 12 +-- net/decnet/netfilter/dn_rtmsg.c | 14 +--- net/ipv4/fib_frontend.c | 9 ++- net/ipv4/inet_diag.c | 12 +-- net/ipv4/netfilter/ip_queue.c | 17 +--- net/ipv6/netfilter/ip6_queue.c | 19 ++--- net/netfilter/nfnetlink.c | 12 +-- net/netlink/af_netlink.c | 152 +++++++++++------------------------- net/netlink/genetlink.c | 12 +-- net/xfrm/xfrm_user.c | 13 +-- 18 files changed, 130 insertions(+), 299 deletions(-) (limited to 'kernel') diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 569070997cc..0e328d387af 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -234,18 +234,6 @@ out: kfree_skb(__skb); } -/* - * Netlink socket input callback - dequeues the skbs and calls the - * main netlink receiving function. - */ -static void cn_input(struct sock *sk, int len) -{ - struct sk_buff *skb; - - while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) - cn_rx_skb(skb); -} - /* * Notification routing. * @@ -442,7 +430,7 @@ static int __devinit cn_init(void) struct cn_dev *dev = &cdev; int err; - dev->input = cn_input; + dev->input = cn_rx_skb; dev->id.idx = cn_idx; dev->id.val = cn_val; diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c index 163acf6ad2d..40579edca10 100644 --- a/drivers/scsi/scsi_netlink.c +++ b/drivers/scsi/scsi_netlink.c @@ -64,7 +64,7 @@ scsi_nl_rcv_msg(struct sk_buff *skb) if (nlh->nlmsg_type != SCSI_TRANSPORT_MSG) { err = -EBADMSG; - goto next_msg; + return; } hdr = NLMSG_DATA(nlh); @@ -98,27 +98,6 @@ next_msg: } -/** - * scsi_nl_rcv_msg - - * Receive handler for a socket. Extracts a received message buffer from - * the socket, and starts message processing. - * - * @sk: socket - * @len: unused - * - **/ -static void -scsi_nl_rcv(struct sock *sk, int len) -{ - struct sk_buff *skb; - - while ((skb = skb_dequeue(&sk->sk_receive_queue))) { - scsi_nl_rcv_msg(skb); - kfree_skb(skb); - } -} - - /** * scsi_nl_rcv_event - * Event handler for a netlink socket. @@ -168,7 +147,7 @@ scsi_netlink_init(void) } scsi_nl_sock = netlink_kernel_create(&init_net, NETLINK_SCSITRANSPORT, - SCSI_NL_GRP_CNT, scsi_nl_rcv, NULL, + SCSI_NL_GRP_CNT, scsi_nl_rcv_msg, NULL, THIS_MODULE); if (!scsi_nl_sock) { printk(KERN_ERR "%s: register of recieve handler failed\n", diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 4916f01230d..5428d15f23c 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1097,61 +1097,49 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* - * Get message from skb (based on rtnetlink_rcv_skb). Each message is - * processed by iscsi_if_recv_msg. Malformed skbs with wrong lengths or - * invalid creds are discarded silently. + * Get message from skb. Each message is processed by iscsi_if_recv_msg. + * Malformed skbs with wrong lengths or invalid creds are not processed. 
*/ static void -iscsi_if_rx(struct sock *sk, int len) +iscsi_if_rx(struct sk_buff *skb) { - struct sk_buff *skb; - mutex_lock(&rx_queue_mutex); - while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { - if (NETLINK_CREDS(skb)->uid) { - skb_pull(skb, skb->len); - goto free_skb; + while (skb->len >= NLMSG_SPACE(0)) { + int err; + uint32_t rlen; + struct nlmsghdr *nlh; + struct iscsi_uevent *ev; + + nlh = nlmsg_hdr(skb); + if (nlh->nlmsg_len < sizeof(*nlh) || + skb->len < nlh->nlmsg_len) { + break; } - while (skb->len >= NLMSG_SPACE(0)) { - int err; - uint32_t rlen; - struct nlmsghdr *nlh; - struct iscsi_uevent *ev; + ev = NLMSG_DATA(nlh); + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; - nlh = nlmsg_hdr(skb); - if (nlh->nlmsg_len < sizeof(*nlh) || - skb->len < nlh->nlmsg_len) { - break; - } - - ev = NLMSG_DATA(nlh); - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - - err = iscsi_if_recv_msg(skb, nlh); - if (err) { - ev->type = ISCSI_KEVENT_IF_ERROR; - ev->iferror = err; - } - do { - /* - * special case for GET_STATS: - * on success - sending reply and stats from - * inside of if_recv_msg(), - * on error - fall through. - */ - if (ev->type == ISCSI_UEVENT_GET_STATS && !err) - break; - err = iscsi_if_send_reply( - NETLINK_CREDS(skb)->pid, nlh->nlmsg_seq, - nlh->nlmsg_type, 0, 0, ev, sizeof(*ev)); - } while (err < 0 && err != -ECONNREFUSED); - skb_pull(skb, rlen); + err = iscsi_if_recv_msg(skb, nlh); + if (err) { + ev->type = ISCSI_KEVENT_IF_ERROR; + ev->iferror = err; } -free_skb: - kfree_skb(skb); + do { + /* + * special case for GET_STATS: + * on success - sending reply and stats from + * inside of if_recv_msg(), + * on error - fall through. + */ + if (ev->type == ISCSI_UEVENT_GET_STATS && !err) + break; + err = iscsi_if_send_reply( + NETLINK_CREDS(skb)->pid, nlh->nlmsg_seq, + nlh->nlmsg_type, 0, 0, ev, sizeof(*ev)); + } while (err < 0 && err != -ECONNREFUSED); + skb_pull(skb, rlen); } mutex_unlock(&rx_queue_mutex); } diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c index 056519cd92b..9aa345121e0 100644 --- a/fs/ecryptfs/netlink.c +++ b/fs/ecryptfs/netlink.c @@ -165,22 +165,10 @@ static int ecryptfs_process_nl_quit(struct sk_buff *skb) * it to its desired netlink context element and wake up the process * that is waiting for a response. 
*/ -static void ecryptfs_receive_nl_message(struct sock *sk, int len) +static void ecryptfs_receive_nl_message(struct sk_buff *skb) { - struct sk_buff *skb; struct nlmsghdr *nlh; - int rc = 0; /* skb_recv_datagram requires this */ -receive: - skb = skb_recv_datagram(sk, 0, 0, &rc); - if (rc == -EINTR) - goto receive; - else if (rc < 0) { - ecryptfs_printk(KERN_ERR, "Error occurred while " - "receiving eCryptfs netlink message; " - "rc = [%d]\n", rc); - return; - } nlh = nlmsg_hdr(skb); if (!NLMSG_OK(nlh, skb->len)) { ecryptfs_printk(KERN_ERR, "Received corrupt netlink " diff --git a/include/linux/connector.h b/include/linux/connector.h index 10eb56b2940..b62f823e90c 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -153,7 +153,7 @@ struct cn_dev { u32 seq, groups; struct sock *nls; - void (*input) (struct sock * sk, int len); + void (*input) (struct sk_buff *skb); struct cn_queue_dev *cbdev; }; diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 7b552b6c2c1..7c1f3b1d2ee 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -175,7 +175,7 @@ struct netlink_skb_parms extern struct sock *netlink_kernel_create(struct net *net, int unit,unsigned int groups, - void (*input)(struct sock *sk, int len), + void (*input)(struct sk_buff *skb), struct mutex *cb_mutex, struct module *module); extern int netlink_change_ngroups(struct sock *sk, unsigned int groups); diff --git a/include/net/netlink.h b/include/net/netlink.h index 1afd3e837d2..9298218c07f 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -220,9 +220,9 @@ struct nl_info { u32 pid; }; -extern unsigned int netlink_run_queue(struct sock *sk, unsigned int qlen, - int (*cb)(struct sk_buff *, - struct nlmsghdr *)); +extern int netlink_rcv_skb(struct sk_buff *skb, + int (*cb)(struct sk_buff *, + struct nlmsghdr *)); extern int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid, unsigned int group, int report, gfp_t flags); diff --git a/kernel/audit.c b/kernel/audit.c index f3c390f6c0b..2924251a654 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -847,18 +847,10 @@ static void audit_receive_skb(struct sk_buff *skb) } /* Receive messages from netlink socket. 
*/ -static void audit_receive(struct sock *sk, int length) +static void audit_receive(struct sk_buff *skb) { - struct sk_buff *skb; - unsigned int qlen; - mutex_lock(&audit_cmd_mutex); - - for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - audit_receive_skb(skb); - kfree_skb(skb); - } + audit_receive_skb(skb); mutex_unlock(&audit_cmd_mutex); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 471d2d9f8ea..1072d16696c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1312,15 +1312,11 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return doit(skb, nlh, (void *)&rta_buf[0]); } -static void rtnetlink_rcv(struct sock *sk, int len) +static void rtnetlink_rcv(struct sk_buff *skb) { - unsigned int qlen = 0; - - do { - rtnl_lock(); - qlen = netlink_run_queue(sk, qlen, &rtnetlink_rcv_msg); - rtnl_unlock(); - } while (qlen); + rtnl_lock(); + netlink_rcv_skb(skb, &rtnetlink_rcv_msg); + rtnl_unlock(); } static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index ebb38feb4df..f7fba7721e6 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -115,17 +115,6 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb) RCV_SKB_FAIL(-EINVAL); } -static void dnrmg_receive_user_sk(struct sock *sk, int len) -{ - struct sk_buff *skb; - unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); - - for (; qlen && (skb = skb_dequeue(&sk->sk_receive_queue)); qlen--) { - dnrmg_receive_user_skb(skb); - kfree_skb(skb); - } -} - static struct nf_hook_ops dnrmg_ops = { .hook = dnrmg_hook, .pf = PF_DECnet, @@ -139,7 +128,8 @@ static int __init dn_rtmsg_init(void) dnrmg = netlink_kernel_create(&init_net, NETLINK_DNRTMSG, DNRNG_NLGRP_MAX, - dnrmg_receive_user_sk, NULL, THIS_MODULE); + dnrmg_receive_user_skb, + NULL, THIS_MODULE); if (dnrmg == NULL) { printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); return -ENOMEM; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f823ca34cb1..a5cba234960 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -62,6 +62,9 @@ static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; #define FIB_TABLE_HASHSZ 256 static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; +static struct sock *fibnl = NULL; + + struct fib_table *fib_new_table(u32 id) { struct fib_table *tb; @@ -811,13 +814,13 @@ static void nl_fib_input(struct sock *sk, int len) pid = NETLINK_CB(skb).pid; /* pid of sending process */ NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ - netlink_unicast(sk, skb, pid, MSG_DONTWAIT); + netlink_unicast(fibnl, skb, pid, MSG_DONTWAIT); } static void nl_fib_lookup_init(void) { - netlink_kernel_create(&init_net, NETLINK_FIB_LOOKUP, 0, nl_fib_input, - NULL, THIS_MODULE); + fibnl = netlink_kernel_create(&init_net, NETLINK_FIB_LOOKUP, 0, + nl_fib_input, NULL, THIS_MODULE); } static void fib_disable_ip(struct net_device *dev, int force) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index b04a6ee5a9a..7eb83ebed2e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -839,15 +839,11 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) static DEFINE_MUTEX(inet_diag_mutex); -static void inet_diag_rcv(struct sock *sk, int len) +static void inet_diag_rcv(struct sk_buff *skb) { - unsigned int qlen = 0; - - do { - 
mutex_lock(&inet_diag_mutex); - qlen = netlink_run_queue(sk, qlen, &inet_diag_rcv_msg); - mutex_unlock(&inet_diag_mutex); - } while (qlen); + mutex_lock(&inet_diag_mutex); + netlink_rcv_skb(skb, &inet_diag_rcv_msg); + mutex_unlock(&inet_diag_mutex); } static DEFINE_SPINLOCK(inet_diag_register_lock); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index aaa3f5c5676..23cbfc7c80f 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -475,7 +475,7 @@ ipq_dev_drop(int ifindex) #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) static inline void -ipq_rcv_skb(struct sk_buff *skb) +__ipq_rcv_skb(struct sk_buff *skb) { int status, type, pid, flags, nlmsglen, skblen; struct nlmsghdr *nlh; @@ -533,19 +533,10 @@ ipq_rcv_skb(struct sk_buff *skb) } static void -ipq_rcv_sk(struct sock *sk, int len) +ipq_rcv_skb(struct sk_buff *skb) { - struct sk_buff *skb; - unsigned int qlen; - mutex_lock(&ipqnl_mutex); - - for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - ipq_rcv_skb(skb); - kfree_skb(skb); - } - + __ipq_rcv_skb(skb); mutex_unlock(&ipqnl_mutex); } @@ -670,7 +661,7 @@ static int __init ip_queue_init(void) netlink_register_notifier(&ipq_nl_notifier); ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0, - ipq_rcv_sk, NULL, THIS_MODULE); + ipq_rcv_skb, NULL, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index c75f467a8f5..0473145ac53 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -464,7 +464,7 @@ ipq_dev_drop(int ifindex) #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) static inline void -ipq_rcv_skb(struct sk_buff *skb) +__ipq_rcv_skb(struct sk_buff *skb) { int status, type, pid, flags, nlmsglen, skblen; struct nlmsghdr *nlh; @@ -522,19 +522,10 @@ ipq_rcv_skb(struct sk_buff *skb) } static void -ipq_rcv_sk(struct sock *sk, int len) +ipq_rcv_skb(struct sk_buff *skb) { - struct sk_buff *skb; - unsigned int qlen; - mutex_lock(&ipqnl_mutex); - - for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - ipq_rcv_skb(skb); - kfree_skb(skb); - } - + __ipq_rcv_skb(skb); mutex_unlock(&ipqnl_mutex); } @@ -658,8 +649,8 @@ static int __init ip6_queue_init(void) struct proc_dir_entry *proc; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(&init_net, NETLINK_IP6_FW, 0, ipq_rcv_sk, - NULL, THIS_MODULE); + ipqnl = netlink_kernel_create(&init_net, NETLINK_IP6_FW, 0, + ipq_rcv_skb, NULL, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 99775af19ff..2128542995f 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -169,15 +169,11 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } } -static void nfnetlink_rcv(struct sock *sk, int len) +static void nfnetlink_rcv(struct sk_buff *skb) { - unsigned int qlen = 0; - - do { - nfnl_lock(); - qlen = netlink_run_queue(sk, qlen, nfnetlink_rcv_msg); - nfnl_unlock(); - } while (qlen); + nfnl_lock(); + netlink_rcv_skb(skb, &nfnetlink_rcv_msg); + nfnl_unlock(); } static void __exit nfnetlink_exit(void) diff --git 
a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4ce7dcbcb6e..c776bcd9f82 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -80,7 +80,7 @@ struct netlink_sock { struct netlink_callback *cb; struct mutex *cb_mutex; struct mutex cb_def_mutex; - void (*data_ready)(struct sock *sk, int bytes); + void (*netlink_rcv)(struct sk_buff *skb); struct module *module; }; @@ -127,7 +127,6 @@ static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); static int netlink_dump(struct sock *sk); static void netlink_destroy_callback(struct netlink_callback *cb); -static void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb); static DEFINE_RWLOCK(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); @@ -709,21 +708,17 @@ static void netlink_overrun(struct sock *sk) static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) { - int protocol = ssk->sk_protocol; - struct net *net; struct sock *sock; struct netlink_sock *nlk; - net = ssk->sk_net; - sock = netlink_lookup(net, protocol, pid); + sock = netlink_lookup(ssk->sk_net, ssk->sk_protocol, pid); if (!sock) return ERR_PTR(-ECONNREFUSED); /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); - if ((netlink_is_kernel(sock) && !nlk->data_ready) || - (sock->sk_state == NETLINK_CONNECTED && - nlk->dst_pid != nlk_sk(ssk)->pid)) { + if (sock->sk_state == NETLINK_CONNECTED && + nlk->dst_pid != nlk_sk(ssk)->pid) { sock_put(sock); return ERR_PTR(-ECONNREFUSED); } @@ -837,7 +832,34 @@ static inline struct sk_buff *netlink_trim(struct sk_buff *skb, return skb; } -int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock) +static inline void netlink_rcv_wake(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (skb_queue_empty(&sk->sk_receive_queue)) + clear_bit(0, &nlk->state); + if (!test_bit(0, &nlk->state)) + wake_up_interruptible(&nlk->wait); +} + +static inline int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb) +{ + int ret; + struct netlink_sock *nlk = nlk_sk(sk); + + ret = -ECONNREFUSED; + if (nlk->netlink_rcv != NULL) { + ret = skb->len; + skb_set_owner_r(skb, sk); + nlk->netlink_rcv(skb); + } + kfree_skb(skb); + sock_put(sk); + return ret; +} + +int netlink_unicast(struct sock *ssk, struct sk_buff *skb, + u32 pid, int nonblock) { struct sock *sk; int err; @@ -852,6 +874,9 @@ retry: kfree_skb(skb); return PTR_ERR(sk); } + if (netlink_is_kernel(sk)) + return netlink_unicast_kernel(sk, skb); + err = netlink_attachskb(sk, skb, nonblock, timeo, ssk); if (err == 1) goto retry; @@ -1151,16 +1176,6 @@ static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } -static inline void netlink_rcv_wake(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (skb_queue_empty(&sk->sk_receive_queue)) - clear_bit(0, &nlk->state); - if (!test_bit(0, &nlk->state)) - wake_up_interruptible(&nlk->wait); -} - static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, struct msghdr *msg, size_t len) { @@ -1308,11 +1323,7 @@ out: static void netlink_data_ready(struct sock *sk, int len) { - struct netlink_sock *nlk = nlk_sk(sk); - - if (nlk->data_ready) - nlk->data_ready(sk, len); - netlink_rcv_wake(sk); + BUG(); } /* @@ -1323,7 +1334,7 @@ static void netlink_data_ready(struct sock *sk, int len) struct sock * netlink_kernel_create(struct net *net, int unit, unsigned int groups, - void (*input)(struct sock *sk, int len), + void (*input)(struct 
sk_buff *skb), struct mutex *cb_mutex, struct module *module) { struct socket *sock; @@ -1352,7 +1363,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, sk = sock->sk; sk->sk_data_ready = netlink_data_ready; if (input) - nlk_sk(sk)->data_ready = input; + nlk_sk(sk)->netlink_rcv = input; if (netlink_insert(sk, net, 0)) goto out_sock_release; @@ -1552,12 +1563,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, netlink_dump(sk); sock_put(sk); - - /* We successfully started a dump, by returning -EINTR we - * signal the queue mangement to interrupt processing of - * any netlink messages so userspace gets a chance to read - * the results. */ - return -EINTR; + return 0; } void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) @@ -1594,13 +1600,15 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); } -static int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, +int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *)) { struct nlmsghdr *nlh; int err; while (skb->len >= nlmsg_total_size(0)) { + int msglen; + nlh = nlmsg_hdr(skb); err = 0; @@ -1616,85 +1624,19 @@ static int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, goto skip; err = cb(skb, nlh); - if (err == -EINTR) { - /* Not an error, but we interrupt processing */ - netlink_queue_skip(nlh, skb); - return err; - } skip: if (nlh->nlmsg_flags & NLM_F_ACK || err) netlink_ack(skb, nlh, err); - netlink_queue_skip(nlh, skb); + msglen = NLMSG_ALIGN(nlh->nlmsg_len); + if (msglen > skb->len) + msglen = skb->len; + skb_pull(skb, msglen); } return 0; } -/** - * nelink_run_queue - Process netlink receive queue. - * @sk: Netlink socket containing the queue - * @qlen: Initial queue length - * @cb: Callback function invoked for each netlink message found - * - * Processes as much as there was in the queue upon entry and invokes - * a callback function for each netlink message found. The callback - * function may refuse a message by returning a negative error code - * but setting the error pointer to 0 in which case this function - * returns with a qlen != 0. - * - * qlen must be initialized to 0 before the initial entry, afterwards - * the function may be called repeatedly until the returned qlen is 0. - * - * The callback function may return -EINTR to signal that processing - * of netlink messages shall be interrupted. In this case the message - * currently being processed will NOT be requeued onto the receive - * queue. - */ -unsigned int netlink_run_queue(struct sock *sk, unsigned int qlen, - int (*cb)(struct sk_buff *, struct nlmsghdr *)) -{ - struct sk_buff *skb; - - if (!qlen || qlen > skb_queue_len(&sk->sk_receive_queue)) - qlen = skb_queue_len(&sk->sk_receive_queue); - - for (; qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - if (netlink_rcv_skb(skb, cb)) { - if (skb->len) - skb_queue_head(&sk->sk_receive_queue, skb); - else { - kfree_skb(skb); - qlen--; - } - break; - } - - kfree_skb(skb); - } - - return qlen; -} - -/** - * netlink_queue_skip - Skip netlink message while processing queue. - * @nlh: Netlink message to be skipped - * @skb: Socket buffer containing the netlink messages. - * - * Pulls the given netlink message off the socket buffer so the next - * call to netlink_queue_run() will not reconsider the message. 
- */ -static void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb) -{ - int msglen = NLMSG_ALIGN(nlh->nlmsg_len); - - if (msglen > skb->len) - msglen = skb->len; - - skb_pull(skb, msglen); -} - /** * nlmsg_notify - send a notification netlink message * @sk: netlink socket to use @@ -1998,7 +1940,7 @@ panic: core_initcall(netlink_proto_init); EXPORT_SYMBOL(netlink_ack); -EXPORT_SYMBOL(netlink_run_queue); +EXPORT_SYMBOL(netlink_rcv_skb); EXPORT_SYMBOL(netlink_broadcast); EXPORT_SYMBOL(netlink_dump_start); EXPORT_SYMBOL(netlink_kernel_create); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 3f1104dc128..150579a2146 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -470,15 +470,11 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return ops->doit(skb, &info); } -static void genl_rcv(struct sock *sk, int len) +static void genl_rcv(struct sk_buff *skb) { - unsigned int qlen = 0; - - do { - genl_lock(); - qlen = netlink_run_queue(sk, qlen, genl_rcv_msg); - genl_unlock(); - } while (qlen && genl_sock && genl_sock->sk_receive_queue.qlen); + genl_lock(); + netlink_rcv_skb(skb, &genl_rcv_msg); + genl_unlock(); } /************************************************************************** diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5238f6a8dfa..d41588d101d 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1895,16 +1895,11 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return link->doit(skb, nlh, attrs); } -static void xfrm_netlink_rcv(struct sock *sk, int len) +static void xfrm_netlink_rcv(struct sk_buff *skb) { - unsigned int qlen = 0; - - do { - mutex_lock(&xfrm_cfg_mutex); - qlen = netlink_run_queue(sk, qlen, &xfrm_user_rcv_msg); - mutex_unlock(&xfrm_cfg_mutex); - - } while (qlen); + mutex_lock(&xfrm_cfg_mutex); + netlink_rcv_skb(skb, &xfrm_user_rcv_msg); + mutex_unlock(&xfrm_cfg_mutex); } static inline size_t xfrm_expire_msgsize(void) -- cgit v1.2.3-70-g09d2 From d0c3d534a4388a465101b634a95f2ec586415254 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Fri, 12 Oct 2007 10:20:07 +1000 Subject: [POWERPC] Implement logging of unhandled signals Implement show_unhandled_signals sysctl + support to print when a process is killed due to unhandled signals just as i386 and x86_64 does. Default to having it off, unlike x86 that defaults on. Signed-off-by: Olof Johansson Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/signal.c | 6 ++++++ arch/powerpc/kernel/signal_32.c | 38 ++++++++++++++++++++++++++++++++++++++ arch/powerpc/kernel/signal_64.c | 15 +++++++++++++++ arch/powerpc/kernel/traps.c | 12 +++++++++++- kernel/sysctl.c | 2 +- 5 files changed, 71 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index c434d6c4e4e..a65a44fbe52 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -16,6 +16,12 @@ #include "signal.h" +/* Log an error when sending an unhandled signal to a process. Controlled + * through debug.exception-trace sysctl. 
+ */ + +int show_unhandled_signals = 0; + /* * Allocate space for the signal frame */ diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 590057e9e98..6126bca8b70 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -705,11 +705,13 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, { struct rt_sigframe __user *rt_sf; struct mcontext __user *frame; + void __user *addr; unsigned long newsp = 0; /* Set up Signal Frame */ /* Put a Real Time Context onto stack */ rt_sf = get_sigframe(ka, regs, sizeof(*rt_sf)); + addr = rt_sf; if (unlikely(rt_sf == NULL)) goto badframe; @@ -728,6 +730,7 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, /* Save user registers on the stack */ frame = &rt_sf->uc.uc_mcontext; + addr = frame; if (vdso32_rt_sigtramp && current->mm->context.vdso_base) { if (save_user_regs(regs, frame, 0)) goto badframe; @@ -742,6 +745,7 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, /* create a stack frame for the caller of the handler */ newsp = ((unsigned long)rt_sf) - (__SIGNAL_FRAMESIZE + 16); + addr = (void __user *)regs->gpr[1]; if (put_user(regs->gpr[1], (u32 __user *)newsp)) goto badframe; @@ -762,6 +766,12 @@ badframe: printk("badframe in handle_rt_signal, regs=%p frame=%p newsp=%lx\n", regs, frame, newsp); #endif + if (show_unhandled_signals && printk_ratelimit()) + printk(KERN_INFO "%s[%d]: bad frame in handle_rt_signal32: " + "%p nip %08lx lr %08lx\n", + current->comm, current->pid, + addr, regs->nip, regs->link); + force_sigsegv(sig, current); return 0; } @@ -886,6 +896,12 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, return 0; bad: + if (show_unhandled_signals && printk_ratelimit()) + printk(KERN_INFO "%s[%d]: bad frame in sys_rt_sigreturn: " + "%p nip %08lx lr %08lx\n", + current->comm, current->pid, + rt_sf, regs->nip, regs->link); + force_sig(SIGSEGV, current); return 0; } @@ -967,6 +983,13 @@ int sys_debug_setcontext(struct ucontext __user *ctx, * We kill the task with a SIGSEGV in this situation. 
*/ if (do_setcontext(ctx, regs, 1)) { + if (show_unhandled_signals && printk_ratelimit()) + printk(KERN_INFO "%s[%d]: bad frame in " + "sys_debug_setcontext: %p nip %08lx " + "lr %08lx\n", + current->comm, current->pid, + ctx, regs->nip, regs->link); + force_sig(SIGSEGV, current); goto out; } @@ -1048,6 +1071,12 @@ badframe: printk("badframe in handle_signal, regs=%p frame=%p newsp=%lx\n", regs, frame, newsp); #endif + if (show_unhandled_signals && printk_ratelimit()) + printk(KERN_INFO "%s[%d]: bad frame in handle_signal32: " + "%p nip %08lx lr %08lx\n", + current->comm, current->pid, + frame, regs->nip, regs->link); + force_sigsegv(sig, current); return 0; } @@ -1061,12 +1090,14 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, struct sigcontext __user *sc; struct sigcontext sigctx; struct mcontext __user *sr; + void __user *addr; sigset_t set; /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; sc = (struct sigcontext __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); + addr = sc; if (copy_from_user(&sigctx, sc, sizeof(sigctx))) goto badframe; @@ -1083,6 +1114,7 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, restore_sigmask(&set); sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); + addr = sr; if (!access_ok(VERIFY_READ, sr, sizeof(*sr)) || restore_user_regs(regs, sr, 1)) goto badframe; @@ -1091,6 +1123,12 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, return 0; badframe: + if (show_unhandled_signals && printk_ratelimit()) + printk(KERN_INFO "%s[%d]: bad frame in sys_sigreturn: " + "%p nip %08lx lr %08lx\n", + current->comm, current->pid, + addr, regs->nip, regs->link); + force_sig(SIGSEGV, current); return 0; } diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index de895e6d8c6..faeb8f207ea 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -64,6 +64,11 @@ struct rt_sigframe { char abigap[288]; } __attribute__ ((aligned (16))); +static const char fmt32[] = KERN_INFO \ + "%s[%d]: bad frame in %s: %08lx nip %08lx lr %08lx\n"; +static const char fmt64[] = KERN_INFO \ + "%s[%d]: bad frame in %s: %016lx nip %016lx lr %016lx\n"; + /* * Set up the sigcontext for the signal frame. */ @@ -315,6 +320,11 @@ badframe: printk("badframe in sys_rt_sigreturn, regs=%p uc=%p &uc->uc_mcontext=%p\n", regs, uc, &uc->uc_mcontext); #endif + if (show_unhandled_signals && printk_ratelimit()) + printk(regs->msr & MSR_SF ? fmt64 : fmt32, + current->comm, current->pid, "rt_sigreturn", + (long)uc, regs->nip, regs->link); + force_sig(SIGSEGV, current); return 0; } @@ -398,6 +408,11 @@ badframe: printk("badframe in setup_rt_frame, regs=%p frame=%p newsp=%lx\n", regs, frame, newsp); #endif + if (show_unhandled_signals && printk_ratelimit()) + printk(regs->msr & MSR_SF ? 
fmt64 : fmt32, + current->comm, current->pid, "setup_rt_frame", + (long)frame, regs->nip, regs->link); + force_sigsegv(signr, current); return 0; } diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 2f1857c9819..bf9e39c6e29 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -172,11 +172,21 @@ int die(const char *str, struct pt_regs *regs, long err) void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) { siginfo_t info; + const char fmt32[] = KERN_INFO "%s[%d]: unhandled signal %d " \ + "at %08lx nip %08lx lr %08lx code %x\n"; + const char fmt64[] = KERN_INFO "%s[%d]: unhandled signal %d " \ + "at %016lx nip %016lx lr %016lx code %x\n"; if (!user_mode(regs)) { if (die("Exception in kernel mode", regs, signr)) return; - } + } else if (show_unhandled_signals && + unhandled_signal(current, signr) && + printk_ratelimit()) { + printk(regs->msr & MSR_SF ? fmt64 : fmt32, + current->comm, current->pid, signr, + addr, regs->nip, regs->link, code); + } memset(&info, 0, sizeof(info)); info.si_signo = signr; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 53a456ebf6d..c7314f95264 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1221,7 +1221,7 @@ static ctl_table fs_table[] = { }; static ctl_table debug_table[] = { -#ifdef CONFIG_X86 +#if defined(CONFIG_X86) || defined(CONFIG_PPC) { .ctl_name = CTL_UNNUMBERED, .procname = "exception-trace", -- cgit v1.2.3-70-g09d2 From de68d9b173ee657115dd0e584c2365b7954253a5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Oct 2007 23:04:05 +0200 Subject: clockevents: Allow build w/o run-tine usage for migration purposes Migration aid to allow preparatory patches which introduce not yet used parts of clock events code. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven --- include/linux/clockchips.h | 8 ++++++-- kernel/time/Kconfig | 5 +++++ kernel/time/Makefile | 2 +- kernel/time/clockevents.c | 3 ++- 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index f368a6fe8cc..d2ddea92689 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -8,7 +8,7 @@ #ifndef _LINUX_CLOCKCHIPS_H #define _LINUX_CLOCKCHIPS_H -#ifdef CONFIG_GENERIC_CLOCKEVENTS +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #include #include @@ -126,9 +126,13 @@ extern int clockevents_register_notifier(struct notifier_block *nb); extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, ktime_t now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS extern void clockevents_notify(unsigned long reason, void *arg); - #else +# define clockevents_notify(reason, arg) do { } while (0) +#endif + +#else /* CONFIG_GENERIC_CLOCKEVENTS_BUILD */ #define clockevents_notify(reason, arg) do { } while (0) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f6635112654..8d53106a0a9 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -23,3 +23,8 @@ config HIGH_RES_TIMERS hardware is not capable then this option only increases the size of the kernel image. 
+config GENERIC_CLOCKEVENTS_BUILD + bool + default y + depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR + diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 99b6034fc86..905b0b50792 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,6 +1,6 @@ obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 41dd3105ce7..822beebe664 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -194,6 +194,7 @@ void clockevents_exchange_device(struct clock_event_device *old, local_irq_restore(flags); } +#ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events */ @@ -222,4 +223,4 @@ void clockevents_notify(unsigned long reason, void *arg) spin_unlock(&clockevents_lock); } EXPORT_SYMBOL_GPL(clockevents_notify); - +#endif -- cgit v1.2.3-70-g09d2 From c8a1d398de70a7774359b4720c392891cdd485f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Oct 2007 23:04:06 +0200 Subject: clockevents: fix periodic broadcast for oneshot devices The next_event member of the clock event device is used to keep track of the next periodic event. For one shot only devices it is wrong to clear the variable, as the next event will be based on it. Pointed out by Ralf Baechle Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven --- kernel/time/tick-broadcast.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0962e057766..acf15b49e55 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -176,8 +176,6 @@ static void tick_do_periodic_broadcast(void) */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { - dev->next_event.tv64 = KTIME_MAX; - tick_do_periodic_broadcast(); /* -- cgit v1.2.3-70-g09d2 From 4a93232dab0a07074bcc5291a0f1f39919916f31 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Fri, 12 Oct 2007 23:04:23 +0200 Subject: clock events: allow replacement of broadcast timer Change the broadcast timer, if a timer with higher rating becomes available. 
Signed-off-by: Venkatesh Pallipadi Cc: Andi Kleen Cc: john stultz Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 13 ++++++------- kernel/time/tick-common.c | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index acf15b49e55..298bc7c6f09 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -64,8 +64,9 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) */ int tick_check_broadcast_device(struct clock_event_device *dev) { - if (tick_broadcast_device.evtdev || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) + if ((tick_broadcast_device.evtdev && + tick_broadcast_device.evtdev->rating >= dev->rating) || + (dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; clockevents_exchange_device(NULL, dev); @@ -513,11 +514,9 @@ static void tick_broadcast_clear_oneshot(int cpu) */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { - if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { - bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - bc->next_event.tv64 = KTIME_MAX; - } + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + bc->next_event.tv64 = KTIME_MAX; } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 77a21abc871..3f3ae390783 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -200,7 +200,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) cpu = smp_processor_id(); if (!cpu_isset(cpu, newdev->cpumask)) - goto out; + goto out_bc; td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; @@ -265,7 +265,7 @@ out_bc: */ if (tick_check_broadcast_device(newdev)) ret = NOTIFY_STOP; -out: + spin_unlock_irqrestore(&tick_device_lock, flags); return ret; -- cgit v1.2.3-70-g09d2
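
Editor's note: the netlink patches above change the kernel-socket API in two steps: netlink_kernel_create() gains an explicit struct net * argument, and the input callback later becomes a synchronous per-skb handler fed through netlink_rcv_skb(). The following is a minimal sketch, not taken from any patch in this series, of what a caller looks like after both changes; the protocol number 16, the example_* names, and the mutex are hypothetical and for illustration only.

/*
 * Sketch of a kernel netlink user after this series:
 * - socket is created against an explicit namespace (&init_net here)
 * - the input callback receives one skb synchronously, in the context
 *   of the sending process, instead of a (sock, len) pair
 */
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/netlink.h>
#include <net/netlink.h>
#include <net/net_namespace.h>

static struct sock *example_nlsk;
static DEFINE_MUTEX(example_mutex);

/* Per-message handler; netlink_rcv_skb() calls this for each nlmsghdr. */
static int example_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	/* Validate and dispatch the message here; return 0 or -errno. */
	return 0;
}

/*
 * Socket input callback: runs synchronously from netlink_unicast(),
 * so it only needs its own serializing lock around message processing.
 */
static void example_rcv(struct sk_buff *skb)
{
	mutex_lock(&example_mutex);
	netlink_rcv_skb(skb, &example_rcv_msg);
	mutex_unlock(&example_mutex);
}

static int __init example_init(void)
{
	/* 16 is an arbitrary protocol number used only for this sketch. */
	example_nlsk = netlink_kernel_create(&init_net, 16, 0,
					     example_rcv, NULL, THIS_MODULE);
	if (example_nlsk == NULL)
		return -ENOMEM;
	return 0;
}

static void __exit example_exit(void)
{
	/* Idiom of this era for releasing a kernel netlink socket. */
	sock_release(example_nlsk->sk_socket);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

Compared with the pre-series code, note that the caller no longer dequeues skbs from sk->sk_receive_queue itself; the loop over messages and the skb lifetime are handled by netlink_rcv_skb() and netlink_unicast_kernel() as shown in the af_netlink.c diff above.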