From a52f5c5620673c292cb159205bf0e1eb5af1985b Mon Sep 17 00:00:00 2001
From: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Date: Fri, 1 May 2009 13:10:21 -0700
Subject: clockevents: tick_broadcast_device can become static

The variable tick_broadcast_device is not used outside of the
file where it is defined, so let's make it static.

Signed-off-by: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 118a3b3b3f9..877dbedc311 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -27,7 +27,7 @@
  * timer stops in C3 state.
  */
 
-struct tick_device tick_broadcast_device;
+static struct tick_device tick_broadcast_device;
 /* FIXME: Use cpumask_var_t. */
 static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
 static DECLARE_BITMAP(tmpmask, NR_CPUS);
-- 
cgit v1.2.3-70-g09d2


From a04198887658e1d8ae25f5420035c057cb170e67 Mon Sep 17 00:00:00 2001
From: Jon Hunter <jon-hunter@ti.com>
Date: Fri, 1 May 2009 13:10:23 -0700
Subject: timers: allow deferrable timers for intervals tv2-tv5 to be deferred

In the current kernel implementation only kernel timers for time interval
tv1 are being deferred. This patch allows any timer that is configured as
deferrable to be deferred regardless of time interval.

This patch was previously discussed in
http://marc.info/?l=linux-kernel&m=123196343531966&w=2 and was acked by
Venki Pallipadi, the author of the original deferrable timer patch.

Signed-off-by: Jon Hunter <jon-hunter@ti.com>
Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/timer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c3..5c1e84beaf4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1015,6 +1015,9 @@ cascade:
 		index = slot = timer_jiffies & TVN_MASK;
 		do {
 			list_for_each_entry(nte, varp->vec + slot, entry) {
+				if (tbase_get_deferrable(nte->base))
+					continue;
+
 				found = 1;
 				if (time_before(nte->expires, expires))
 					expires = nte->expires;
-- 
cgit v1.2.3-70-g09d2


From c81fc2c331b8514ad112054cd2d87e6ec132286b Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Fri, 1 May 2009 14:52:47 +0900
Subject: clockevent: export register_device and delta2ns

Export the following symbols using EXPORT_SYMBOL_GPL:
 - clockevent_delta2ns
 - clockevents_register_device

This allows us to build SuperH clockevent and clocksource
drivers as modules, see drivers/clocksource/sh_*.c

[ Impact: allow modular build of clockevent drivers ]

Signed-off-by: Magnus Damm <damm@igel.co.jp>
LKML-Reference: <20090501055247.8286.64067.sendpatchset@rx1.opensource.se>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clockevents.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d13be216a79..3948fa644a2 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -54,6 +54,7 @@ unsigned long clockevent_delta2ns(unsigned long latch,
 
 	return (unsigned long) clc;
 }
+EXPORT_SYMBOL_GPL(clockevent_delta2ns);
 
 /**
  * clockevents_set_mode - set the operating mode of a clock event device
@@ -187,6 +188,7 @@ void clockevents_register_device(struct clock_event_device *dev)
 
 	spin_unlock(&clockevents_lock);
 }
+EXPORT_SYMBOL_GPL(clockevents_register_device);
 
 /*
  * Noop handler when we shut down an event device
-- 
cgit v1.2.3-70-g09d2


From 597d0275736dad9c3bda6f0a00a1c477dc0f37b1 Mon Sep 17 00:00:00 2001
From: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
Date: Thu, 16 Apr 2009 12:13:26 +0530
Subject: timers: Framework for identifying pinned timers

* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-04-16 12:11:36]:

This patch creates a new framework for identifying cpu-pinned timers
and hrtimers.

This framework is needed because pinned timers are expected to fire on
the same CPU on which they are queued. So it is essential to identify
these and not migrate them, in case there are any.

For regular timers, the currently existing add_timer_on() can be used
to queue pinned timers and subsequently mod_timer_pinned() can be used
to modify the 'expires' field.

For hrtimers, new modes HRTIMER_MODE_ABS_PINNED and HRTIMER_MODE_REL_PINNED
are added to queue cpu-pinned hrtimers.

[ tglx: use .._PINNED mode argument instead of creating tons of new
functions ]

Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  7 +++++--
 include/linux/timer.h   |  3 +++
 kernel/hrtimer.c        |  7 ++++---
 kernel/timer.c          | 31 +++++++++++++++++++++++++++----
 4 files changed, 39 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0d2f7c8a33d..7400900de94 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -30,8 +30,11 @@ struct hrtimer_cpu_base;
  * Mode arguments of xxx_hrtimer functions:
  */
 enum hrtimer_mode {
-	HRTIMER_MODE_ABS,	/* Time value is absolute */
-	HRTIMER_MODE_REL,	/* Time value is relative to now */
+	HRTIMER_MODE_ABS = 0x0,		/* Time value is absolute */
+	HRTIMER_MODE_REL = 0x1,		/* Time value is relative to now */
+	HRTIMER_MODE_PINNED = 0x02,	/* Timer is bound to CPU */
+	HRTIMER_MODE_ABS_PINNED = 0x02,
+	HRTIMER_MODE_REL_PINNED = 0x03,
 };
 
 /*
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 6cdb6f3331f..ccf882eed8f 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -163,7 +163,10 @@ extern void add_timer_on(struct timer_list *timer, int cpu);
 extern int del_timer(struct timer_list * timer);
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
 extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
+extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires);
 
+#define TIMER_NOT_PINNED	0
+#define TIMER_PINNED		1
 /*
  * The jiffies value which is added to now, when there is no timer
  * in the timer wheel:
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cb8a15c1958..c71bcd54924 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -193,7 +193,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
  * Switch the timer base to the current CPU when possible.
  */
 static inline struct hrtimer_clock_base *
-switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
+		    int pinned)
 {
 	struct hrtimer_clock_base *new_base;
 	struct hrtimer_cpu_base *new_cpu_base;
@@ -907,9 +908,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	ret = remove_hrtimer(timer, base);
 
 	/* Switch the timer base, if necessary: */
-	new_base = switch_hrtimer_base(timer, base);
+	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
 
-	if (mode == HRTIMER_MODE_REL) {
+	if (mode & HRTIMER_MODE_REL) {
 		tim = ktime_add_safe(tim, new_base->get_time());
 		/*
 		 * CONFIG_TIME_LOW_RES is a temporary way for architectures
diff --git a/kernel/timer.c b/kernel/timer.c
index 5c1e84beaf4..3424dfd11d5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -604,7 +604,8 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
 }
 
 static inline int
-__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
+__mod_timer(struct timer_list *timer, unsigned long expires,
+						bool pending_only, int pinned)
 {
 	struct tvec_base *base, *new_base;
 	unsigned long flags;
@@ -668,7 +669,7 @@ out_unlock:
  */
 int mod_timer_pending(struct timer_list *timer, unsigned long expires)
 {
-	return __mod_timer(timer, expires, true);
+	return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
 }
 EXPORT_SYMBOL(mod_timer_pending);
 
@@ -702,10 +703,32 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
 	if (timer->expires == expires && timer_pending(timer))
 		return 1;
 
-	return __mod_timer(timer, expires, false);
+	return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
 }
 EXPORT_SYMBOL(mod_timer);
 
+/**
+ * mod_timer_pinned - modify a timer's timeout
+ * @timer: the timer to be modified
+ * @expires: new timeout in jiffies
+ *
+ * mod_timer_pinned() is a way to update the expire field of an
+ * active timer (if the timer is inactive it will be activated)
+ * and not allow the timer to be migrated to a different CPU.
+ *
+ * mod_timer_pinned(timer, expires) is equivalent to:
+ *
+ *     del_timer(timer); timer->expires = expires; add_timer(timer);
+ */
+int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
+{
+	if (timer->expires == expires && timer_pending(timer))
+		return 1;
+
+	return __mod_timer(timer, expires, false, TIMER_PINNED);
+}
+EXPORT_SYMBOL(mod_timer_pinned);
+
 /**
  * add_timer - start a timer
  * @timer: the timer to be added
@@ -1356,7 +1379,7 @@ signed long __sched schedule_timeout(signed long timeout)
 	expire = timeout + jiffies;
 
 	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-	__mod_timer(&timer, expire, false);
+	__mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
 	schedule();
 	del_singleshot_timer_sync(&timer);
 
-- 
cgit v1.2.3-70-g09d2


From 5c333864a6ba811052d52ef14fbed056b9ac3512 Mon Sep 17 00:00:00 2001
From: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
Date: Thu, 16 Apr 2009 12:14:37 +0530
Subject: timers: Identifying the existing pinned timers

* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-04-16 12:11:36]:

The following pinned hrtimers have been identified and marked:
1)sched_rt_period_timer
2)tick_sched_timer
3)stack_trace_timer_fn

[ tglx: fixup the hrtimer pinned mode ]

Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 2 +-
 kernel/sched.c                     | 4 ++--
 kernel/time/tick-sched.c           | 7 ++++---
 kernel/trace/trace_sysprof.c       | 3 ++-
 4 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 2bda6935297..a9cad1b00d6 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -463,7 +463,7 @@ static void uv_heartbeat(unsigned long ignored)
 	uv_set_scir_bits(bits);
 
 	/* enable next timer period */
-	mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+	mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
 }
 
 static void __cpuinit uv_heartbeat_enable(int cpu)
diff --git a/kernel/sched.c b/kernel/sched.c
index b902e587a3a..9c5b4d3f97a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -244,7 +244,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
 		delta = ktime_to_ns(ktime_sub(hard, soft));
 		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
-				HRTIMER_MODE_ABS, 0);
+				HRTIMER_MODE_ABS_PINNED, 0);
 	}
 	spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -1154,7 +1154,7 @@ static __init void init_hrtick(void)
 static void hrtick_start(struct rq *rq, u64 delay)
 {
 	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-			HRTIMER_MODE_REL, 0);
+			HRTIMER_MODE_REL_PINNED, 0);
 }
 
 static inline void init_hrtick(void)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d3f1ef4d5cb..2aff39c6f10 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -349,7 +349,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 
 		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
 			hrtimer_start(&ts->sched_timer, expires,
-				      HRTIMER_MODE_ABS);
+				      HRTIMER_MODE_ABS_PINNED);
 			/* Check, if the timer was already in the past */
 			if (hrtimer_active(&ts->sched_timer))
 				goto out;
@@ -395,7 +395,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 
 		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
 			hrtimer_start_expires(&ts->sched_timer,
-				      HRTIMER_MODE_ABS);
+					      HRTIMER_MODE_ABS_PINNED);
 			/* Check, if the timer was already in the past */
 			if (hrtimer_active(&ts->sched_timer))
 				break;
@@ -698,7 +698,8 @@ void tick_setup_sched_timer(void)
 
 	for (;;) {
 		hrtimer_forward(&ts->sched_timer, now, tick_period);
-		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
+		hrtimer_start_expires(&ts->sched_timer,
+				      HRTIMER_MODE_ABS_PINNED);
 		/* Check, if the timer was already in the past */
 		if (hrtimer_active(&ts->sched_timer))
 			break;
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 91fd19c2149..d180554bc93 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -203,7 +203,8 @@ static void start_stack_timer(void *unused)
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = stack_trace_timer_fn;
 
-	hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
+	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+		      HRTIMER_MODE_REL_PINNED);
 }
 
 static void start_stack_timers(void)
-- 
cgit v1.2.3-70-g09d2


From cd1bb94b4a0531e8211a3774f17de831f8285f76 Mon Sep 17 00:00:00 2001
From: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
Date: Thu, 16 Apr 2009 12:15:34 +0530
Subject: timers: /proc/sys sysctl hook to enable timer migration

* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-04-16 12:11:36]:

This patch creates the /proc/sys sysctl interface at
/proc/sys/kernel/timer_migration

Timer migration is enabled by default.

To disable timer migration, when CONFIG_SCHED_DEBUG = y,

echo 0 > /proc/sys/kernel/timer_migration

Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h | 1 +
 kernel/sched.c        | 2 ++
 kernel/sysctl.c       | 8 ++++++++
 3 files changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc8049..61850401040 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1766,6 +1766,7 @@ extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_timer_migration;
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,
diff --git a/kernel/sched.c b/kernel/sched.c
index 9c5b4d3f97a..7f1dd56af86 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8731,6 +8731,8 @@ void __init sched_init_smp(void)
 }
 #endif /* CONFIG_SMP */
 
+const_debug unsigned int sysctl_timer_migration = 1;
+
 int in_sched_functions(unsigned long addr)
 {
 	return in_lock_functions(addr) ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e3d2c7dd59b..b3ce5813730 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -324,6 +324,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "timer_migration",
+		.data		= &sysctl_timer_migration,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-- 
cgit v1.2.3-70-g09d2


From eea08f32adb3f97553d49a4f79a119833036000a Mon Sep 17 00:00:00 2001
From: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
Date: Thu, 16 Apr 2009 12:16:41 +0530
Subject: timers: Logic to move non pinned timers

* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-04-16 12:11:36]:

This patch migrates all non pinned timers and hrtimers to the current
idle load balancer, from all the idle CPUs. Timers firing on busy CPUs
are not migrated.

While migrating hrtimers, care should be taken to check if migrating
a hrtimer would result in a latency or not. So we compare the expiry of the
hrtimer with the next timer interrupt on the target cpu and migrate the
hrtimer only if it expires *after* the next interrupt on the target cpu.
So, added a clockevents_get_next_event() helper function to return the
next_event on the target cpu's clock_event_device.

[ tglx: cleanups and simplifications ]

Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clockchips.h |  9 ++++++++
 include/linux/sched.h      | 12 +++++++++++
 kernel/hrtimer.c           | 51 ++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched.c             |  5 +++++
 kernel/time/clockevents.c  | 12 +++++++++++
 kernel/timer.c             | 17 +++++++++++++---
 6 files changed, 101 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 3a1dbba4d3a..20a100fe2b4 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -143,3 +143,12 @@ extern void clockevents_notify(unsigned long reason, void *arg);
 #endif
 
 #endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+extern ktime_t clockevents_get_next_event(int cpu);
+#else
+static inline ktime_t clockevents_get_next_event(int cpu)
+{
+	return (ktime_t) { .tv64 = KTIME_MAX };
+}
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 61850401040..311dec12397 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -257,6 +257,7 @@ extern void task_rq_unlock_wait(struct task_struct *p);
 extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
+extern int get_nohz_load_balancer(void);
 #else
 static inline int select_nohz_load_balancer(int cpu)
 {
@@ -1772,6 +1773,17 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,
 		loff_t *ppos);
 #endif
+#ifdef CONFIG_SCHED_DEBUG
+static inline unsigned int get_sysctl_timer_migration(void)
+{
+	return sysctl_timer_migration;
+}
+#else
+static inline unsigned int get_sysctl_timer_migration(void)
+{
+	return 1;
+}
+#endif
 extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c71bcd54924..b675a67c9ac 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,8 @@
 #include <linux/seq_file.h>
 #include <linux/err.h>
 #include <linux/debugobjects.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
 
 #include <asm/uaccess.h>
 
@@ -198,8 +200,19 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 {
 	struct hrtimer_clock_base *new_base;
 	struct hrtimer_cpu_base *new_cpu_base;
+	int cpu, preferred_cpu = -1;
+
+	cpu = smp_processor_id();
+#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
+		preferred_cpu = get_nohz_load_balancer();
+		if (preferred_cpu >= 0)
+			cpu = preferred_cpu;
+	}
+#endif
 
-	new_cpu_base = &__get_cpu_var(hrtimer_bases);
+again:
+	new_cpu_base = &per_cpu(hrtimer_bases, cpu);
 	new_base = &new_cpu_base->clock_base[base->index];
 
 	if (base != new_base) {
@@ -219,6 +232,40 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 		timer->base = NULL;
 		spin_unlock(&base->cpu_base->lock);
 		spin_lock(&new_base->cpu_base->lock);
+
+		/* Optimized away for NOHZ=n SMP=n */
+		if (cpu == preferred_cpu) {
+			/* Calculate clock monotonic expiry time */
+#ifdef CONFIG_HIGH_RES_TIMERS
+			ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
+							new_base->offset);
+#else
+			ktime_t expires = hrtimer_get_expires(timer);
+#endif
+
+			/*
+			 * Get the next event on target cpu from the
+			 * clock events layer.
+			 * This covers the highres=off nohz=on case as well.
+			 */
+			ktime_t next = clockevents_get_next_event(cpu);
+
+			ktime_t delta = ktime_sub(expires, next);
+
+			/*
+			 * We do not migrate the timer when it is expiring
+			 * before the next event on the target cpu because
+			 * we cannot reprogram the target cpu hardware and
+			 * we would cause it to fire late.
+			 */
+			if (delta.tv64 < 0) {
+				cpu = smp_processor_id();
+				spin_unlock(&new_base->cpu_base->lock);
+				spin_lock(&base->cpu_base->lock);
+				timer->base = base;
+				goto again;
+			}
+		}
 		timer->base = new_base;
 	}
 	return new_base;
@@ -236,7 +283,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 	return base;
 }
 
-# define switch_hrtimer_base(t, b)	(b)
+# define switch_hrtimer_base(t, b, p)	(b)
 
 #endif	/* !CONFIG_SMP */
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 7f1dd56af86..9fe3774a0fd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4244,6 +4244,11 @@ static struct {
 	.load_balancer = ATOMIC_INIT(-1),
 };
 
+int get_nohz_load_balancer(void)
+{
+	return atomic_read(&nohz.load_balancer);
+}
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d13be216a79..ab20ded013b 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
 #include <linux/notifier.h>
 #include <linux/smp.h>
 #include <linux/sysdev.h>
+#include <linux/tick.h>
 
 /* The registered clock event devices */
 static LIST_HEAD(clockevent_devices);
@@ -251,4 +252,15 @@ void clockevents_notify(unsigned long reason, void *arg)
 	spin_unlock(&clockevents_lock);
 }
 EXPORT_SYMBOL_GPL(clockevents_notify);
+
+ktime_t clockevents_get_next_event(int cpu)
+{
+	struct tick_device *td;
+	struct clock_event_device *dev;
+
+	td = &per_cpu(tick_cpu_device, cpu);
+	dev = td->evtdev;
+
+	return dev->next_event;
+}
 #endif
diff --git a/kernel/timer.c b/kernel/timer.c
index 3424dfd11d5..3f841db5edf 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
+#include <linux/sched.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -609,9 +610,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 {
 	struct tvec_base *base, *new_base;
 	unsigned long flags;
-	int ret;
-
-	ret = 0;
+	int ret = 0 , cpu;
 
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(!timer->function);
@@ -630,6 +629,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 
 	new_base = __get_cpu_var(tvec_bases);
 
+	cpu = smp_processor_id();
+
+#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
+		int preferred_cpu = get_nohz_load_balancer();
+
+		if (preferred_cpu >= 0)
+			cpu = preferred_cpu;
+	}
+#endif
+	new_base = per_cpu(tvec_bases, cpu);
+
 	if (base != new_base) {
 		/*
 		 * We are trying to schedule the timer on the local CPU.
-- 
cgit v1.2.3-70-g09d2


From a9862e0560866eadbc59b84867492004da436516 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 19 May 2009 22:49:07 +0200
Subject: Export add_timer_on for modules

Needed in a follow-on patch.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 kernel/timer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c3..e2c47b82ac3 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -756,6 +756,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
 	wake_up_idle_cpu(cpu);
 	spin_unlock_irqrestore(&base->lock, flags);
 }
+EXPORT_SYMBOL_GPL(add_timer_on);
 
 /**
  * del_timer - deactive a timer.
-- 
cgit v1.2.3-70-g09d2


From ad6ccfad6f759a5d657dabe2071a8f2a503fcc84 Mon Sep 17 00:00:00 2001
From: Manish Katiyar <mkatiyar@gmail.com>
Date: Tue, 12 May 2009 13:43:35 -0700
Subject: kernel/kallsyms.c: replace deprecated __initcall with device_initcall
 and fix whitespace

Fix coding style whitespace issues and replace __initcall with
device_initcall.  Fixed multi-line comments as per coding style.

Errors as reported by checkpatch.pl :-
Before:
total: 14 errors, 14 warnings, 487 lines checked
After :
total: 0 errors, 8 warnings, 507 lines checked

Compile tested binary verified as :-
Before:
 text    data     bss     dec     hex filename
 2405       4       0    2409     969 kernel/kallsyms.o
After :
 text     data     bss     dec     hex filename
 2405       4       0    2409     969 kernel/kallsyms.o

Signed-off-by: Manish Katiyar <mkatiyar@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
---
 kernel/kallsyms.c | 134 +++++++++++++++++++++++++++++++-----------------------
 1 file changed, 78 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 374faf9bfdc..3a29dbe7898 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,12 +30,16 @@
 #define all_var 0
 #endif
 
-/* These will be re-linked against their real values during the second link stage */
+/*
+ * These will be re-linked against their real values
+ * during the second link stage.
+ */
 extern const unsigned long kallsyms_addresses[] __attribute__((weak));
 extern const u8 kallsyms_names[] __attribute__((weak));
 
-/* tell the compiler that the count isn't in the small data section if the arch
- * has one (eg: FRV)
+/*
+ * Tell the compiler that the count isn't in the small data section if the arch
+ * has one (eg: FRV).
  */
 extern const unsigned long kallsyms_num_syms
 __attribute__((weak, section(".rodata")));
@@ -75,31 +79,37 @@ static int is_ksym_addr(unsigned long addr)
 	return is_kernel_text(addr) || is_kernel_inittext(addr);
 }
 
-/* expand a compressed symbol data into the resulting uncompressed string,
-   given the offset to where the symbol is in the compressed stream */
+/*
+ * Expand a compressed symbol data into the resulting uncompressed string,
+ * given the offset to where the symbol is in the compressed stream.
+ */
 static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
 {
 	int len, skipped_first = 0;
 	const u8 *tptr, *data;
 
-	/* get the compressed symbol length from the first symbol byte */
+	/* Get the compressed symbol length from the first symbol byte. */
 	data = &kallsyms_names[off];
 	len = *data;
 	data++;
 
-	/* update the offset to return the offset for the next symbol on
-	 * the compressed stream */
+	/*
+	 * Update the offset to return the offset for the next symbol on
+	 * the compressed stream.
+	 */
 	off += len + 1;
 
-	/* for every byte on the compressed symbol data, copy the table
-	   entry for that byte */
-	while(len) {
-		tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ];
+	/*
+	 * For every byte on the compressed symbol data, copy the table
+	 * entry for that byte.
+	 */
+	while (len) {
+		tptr = &kallsyms_token_table[kallsyms_token_index[*data]];
 		data++;
 		len--;
 
 		while (*tptr) {
-			if(skipped_first) {
+			if (skipped_first) {
 				*result = *tptr;
 				result++;
 			} else
@@ -110,36 +120,46 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
 
 	*result = '\0';
 
-	/* return to offset to the next symbol */
+	/* Return the offset to the next symbol. */
 	return off;
 }
 
-/* get symbol type information. This is encoded as a single char at the
- * begining of the symbol name */
+/*
+ * Get symbol type information. This is encoded as a single char at the
+ * beginning of the symbol name.
+ */
 static char kallsyms_get_symbol_type(unsigned int off)
 {
-	/* get just the first code, look it up in the token table, and return the
-	 * first char from this token */
-	return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ];
+	/*
+	 * Get just the first code, look it up in the token table,
+	 * and return the first char from this token.
+	 */
+	return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
 }
 
 
-/* find the offset on the compressed stream given and index in the
- * kallsyms array */
+/*
+ * Find the offset on the compressed stream given an index in the
+ * kallsyms array.
+ */
 static unsigned int get_symbol_offset(unsigned long pos)
 {
 	const u8 *name;
 	int i;
 
-	/* use the closest marker we have. We have markers every 256 positions,
-	 * so that should be close enough */
-	name = &kallsyms_names[ kallsyms_markers[pos>>8] ];
+	/*
+	 * Use the closest marker we have. We have markers every 256 positions,
+	 * so that should be close enough.
+	 */
+	name = &kallsyms_names[kallsyms_markers[pos >> 8]];
 
-	/* sequentially scan all the symbols up to the point we're searching for.
-	 * Every symbol is stored in a [<len>][<len> bytes of data] format, so we
-	 * just need to add the len to the current pointer for every symbol we
-	 * wish to skip */
-	for(i = 0; i < (pos&0xFF); i++)
+	/*
+	 * Sequentially scan all the symbols up to the point we're searching
+	 * for. Every symbol is stored in a [<len>][<len> bytes of data] format,
+	 * so we just need to add the len to the current pointer for every
+	 * symbol we wish to skip.
+	 */
+	for (i = 0; i < (pos & 0xFF); i++)
 		name = name + (*name) + 1;
 
 	return name - kallsyms_names;
@@ -190,7 +210,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
 	/* This kernel should never had been booted. */
 	BUG_ON(!kallsyms_addresses);
 
-	/* do a binary search on the sorted kallsyms_addresses array */
+	/* Do a binary search on the sorted kallsyms_addresses array. */
 	low = 0;
 	high = kallsyms_num_syms;
 
@@ -203,15 +223,15 @@ static unsigned long get_symbol_pos(unsigned long addr,
 	}
 
 	/*
-	 * search for the first aliased symbol. Aliased
-	 * symbols are symbols with the same address
+	 * Search for the first aliased symbol. Aliased
+	 * symbols are symbols with the same address.
 	 */
 	while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
 		--low;
 
 	symbol_start = kallsyms_addresses[low];
 
-	/* Search for next non-aliased symbol */
+	/* Search for next non-aliased symbol. */
 	for (i = low + 1; i < kallsyms_num_syms; i++) {
 		if (kallsyms_addresses[i] > symbol_start) {
 			symbol_end = kallsyms_addresses[i];
@@ -219,7 +239,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
 		}
 	}
 
-	/* if we found no next symbol, we use the end of the section */
+	/* If we found no next symbol, we use the end of the section. */
 	if (!symbol_end) {
 		if (is_kernel_inittext(addr))
 			symbol_end = (unsigned long)_einittext;
@@ -252,10 +272,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
 
 /*
  * Lookup an address
- * - modname is set to NULL if it's in the kernel
- * - we guarantee that the returned name is valid until we reschedule even if
- *   it resides in a module
- * - we also guarantee that modname will be valid until rescheduled
+ * - modname is set to NULL if it's in the kernel.
+ * - We guarantee that the returned name is valid until we reschedule even if
+ *   it resides in a module.
+ * - We also guarantee that modname will be valid until rescheduled.
  */
 const char *kallsyms_lookup(unsigned long addr,
 			    unsigned long *symbolsize,
@@ -276,7 +296,7 @@ const char *kallsyms_lookup(unsigned long addr,
 		return namebuf;
 	}
 
-	/* see if it's in a module */
+	/* See if it's in a module. */
 	return module_address_lookup(addr, symbolsize, offset, modname,
 				     namebuf);
 }
@@ -294,7 +314,7 @@ int lookup_symbol_name(unsigned long addr, char *symname)
 		kallsyms_expand_symbol(get_symbol_offset(pos), symname);
 		return 0;
 	}
-	/* see if it's in a module */
+	/* See if it's in a module. */
 	return lookup_module_symbol_name(addr, symname);
 }
 
@@ -313,7 +333,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
 		modname[0] = '\0';
 		return 0;
 	}
-	/* see if it's in a module */
+	/* See if it's in a module. */
 	return lookup_module_symbol_attrs(addr, size, offset, modname, name);
 }
 
@@ -342,6 +362,7 @@ int sprint_symbol(char *buffer, unsigned long address)
 
 	return len;
 }
+EXPORT_SYMBOL_GPL(sprint_symbol);
 
 /* Look up a kernel symbol and print it to the kernel messages. */
 void __print_symbol(const char *fmt, unsigned long address)
@@ -352,13 +373,13 @@ void __print_symbol(const char *fmt, unsigned long address)
 
 	printk(fmt, buffer);
 }
+EXPORT_SYMBOL(__print_symbol);
 
 /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
-struct kallsym_iter
-{
+struct kallsym_iter {
 	loff_t pos;
 	unsigned long value;
-	unsigned int nameoff; /* If iterating in core kernel symbols */
+	unsigned int nameoff; /* If iterating in core kernel symbols. */
 	char type;
 	char name[KSYM_NAME_LEN];
 	char module_name[MODULE_NAME_LEN];
@@ -404,7 +425,7 @@ static int update_iter(struct kallsym_iter *iter, loff_t pos)
 		iter->pos = pos;
 		return get_ksymbol_mod(iter);
 	}
-	
+
 	/* If we're not on the desired position, reset to new position. */
 	if (pos != iter->pos)
 		reset_iter(iter, pos);
@@ -439,23 +460,25 @@ static int s_show(struct seq_file *m, void *p)
 {
 	struct kallsym_iter *iter = m->private;
 
-	/* Some debugging symbols have no name.  Ignore them. */ 
+	/* Some debugging symbols have no name.  Ignore them. */
 	if (!iter->name[0])
 		return 0;
 
 	if (iter->module_name[0]) {
 		char type;
 
-		/* Label it "global" if it is exported,
-		 * "local" if not exported. */
+		/*
+		 * Label it "global" if it is exported,
+		 * "local" if not exported.
+		 */
 		type = iter->exported ? toupper(iter->type) :
 					tolower(iter->type);
 		seq_printf(m, "%0*lx %c %s\t[%s]\n",
-			   (int)(2*sizeof(void*)),
+			   (int)(2 * sizeof(void *)),
 			   iter->value, type, iter->name, iter->module_name);
 	} else
 		seq_printf(m, "%0*lx %c %s\n",
-			   (int)(2*sizeof(void*)),
+			   (int)(2 * sizeof(void *)),
 			   iter->value, iter->type, iter->name);
 	return 0;
 }
@@ -469,9 +492,11 @@ static const struct seq_operations kallsyms_op = {
 
 static int kallsyms_open(struct inode *inode, struct file *file)
 {
-	/* We keep iterator in m->private, since normal case is to
+	/*
+	 * We keep iterator in m->private, since normal case is to
 	 * s_start from where we left off, so we avoid doing
-	 * using get_symbol_offset for every symbol */
+	 * using get_symbol_offset for every symbol.
+	 */
 	struct kallsym_iter *iter;
 	int ret;
 
@@ -500,7 +525,4 @@ static int __init kallsyms_init(void)
 	proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
 	return 0;
 }
-__initcall(kallsyms_init);
-
-EXPORT_SYMBOL(__print_symbol);
-EXPORT_SYMBOL_GPL(sprint_symbol);
+device_initcall(kallsyms_init);
-- 
cgit v1.2.3-70-g09d2


From 3f68535adad8dd89499505a65fb25d0e02d118cc Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Wed, 21 Jan 2009 22:53:22 -0700
Subject: clocksource: sanity check sysfs clocksource changes

Thomas, Andrew and Ingo pointed out that we don't have any safety checks
in the clocksource sysfs entries to make sure sysadmins don't try to
change the clocksource to a non high-res timer capable clocksource (such
as jiffies) when high-res timers (HRT) is enabled.  Doing so will likely
hang a system.

Correct this by filtering non HRT clocksources from available_clocksources
and not accepting non HRT clocksources with HRT enabled.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h   |  2 +-
 kernel/hrtimer.c          |  4 ++--
 kernel/time/clocksource.c | 18 +++++++++++++++++-
 3 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0d2f7c8a33d..58021b0c396 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -305,7 +305,7 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 
 extern ktime_t ktime_get(void);
 extern ktime_t ktime_get_real(void);
-
+extern int hrtimer_hres_active(void);
 
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cb8a15c1958..1a70c18cdff 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -476,7 +476,7 @@ static inline int hrtimer_is_hres_enabled(void)
 /*
  * Is the high resolution mode active ?
  */
-static inline int hrtimer_hres_active(void)
+int hrtimer_hres_active(void)
 {
 	return __get_cpu_var(hrtimer_bases).hres_active;
 }
@@ -704,7 +704,7 @@ static int hrtimer_switch_to_hres(void)
 
 #else
 
-static inline int hrtimer_hres_active(void) { return 0; }
+int hrtimer_hres_active(void) { return 0; }
 static inline int hrtimer_is_hres_enabled(void) { return 0; }
 static inline int hrtimer_switch_to_hres(void) { return 0; }
 static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 80189f6f1c5..18b9f5da4ee 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -30,6 +30,7 @@
 #include <linux/module.h>
 #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
 #include <linux/tick.h>
+#include <linux/hrtimer.h>
 
 void timecounter_init(struct timecounter *tc,
 		      const struct cyclecounter *cc,
@@ -509,6 +510,18 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
 		}
 	}
 
+	/*
+	 * Check to make sure we don't switch to a non-HRT usable
+	 * clocksource if HRT is enabled and running
+	 */
+	if (hrtimer_hres_active() &&
+	    !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
+		printk(KERN_WARNING "%s clocksource is not HRT compatible. "
+			"Cannot switch while in HRT mode\n", ovr->name);
+		ovr = NULL;
+		override_name[0] = 0;
+	}
+
 	/* Reselect, when the override name has changed */
 	if (ovr != clocksource_override) {
 		clocksource_override = ovr;
@@ -537,7 +550,10 @@ sysfs_show_available_clocksources(struct sys_device *dev,
 
 	spin_lock_irq(&clocksource_lock);
 	list_for_each_entry(src, &clocksource_list, list) {
-		count += snprintf(buf + count,
+		/* Don't show non-HRES clocksource if HRES is enabled */
+		if (!hrtimer_hres_active() ||
+				(src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
+			count += snprintf(buf + count,
 				  max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
 				  "%s ", src->name);
 	}
-- 
cgit v1.2.3-70-g09d2


From 589ff870ed60a9ebdd5ec99ec3f5afe1282fe151 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 03:28:19 -0400
Subject: Switch collect_mounts() to struct path

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c      | 4 ++--
 include/linux/fs.h  | 2 +-
 kernel/audit_tree.c | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/fs/namespace.c b/fs/namespace.c
index 88a904d5aa2..c85962206aa 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1253,11 +1253,11 @@ Enomem:
 	return NULL;
 }
 
-struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *collect_mounts(struct path *path)
 {
 	struct vfsmount *tree;
 	down_write(&namespace_sem);
-	tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
+	tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE);
 	up_write(&namespace_sem);
 	return tree;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 323b5ce474c..03fb2102b8f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1800,7 +1800,7 @@ extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
 extern long do_mount(char *, char *, char *, unsigned long, void *);
-extern struct vfsmount *collect_mounts(struct vfsmount *, struct dentry *);
+extern struct vfsmount *collect_mounts(struct path *);
 extern void drop_collected_mounts(struct vfsmount *);
 
 extern int vfs_statfs(struct dentry *, struct kstatfs *);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 6e7351739a8..1f6396d7668 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -568,7 +568,7 @@ void audit_trim_trees(void)
 		if (err)
 			goto skip_it;
 
-		root_mnt = collect_mounts(path.mnt, path.dentry);
+		root_mnt = collect_mounts(&path);
 		path_put(&path);
 		if (!root_mnt)
 			goto skip_it;
@@ -660,7 +660,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
 	err = kern_path(tree->pathname, 0, &path);
 	if (err)
 		goto Err;
-	mnt = collect_mounts(path.mnt, path.dentry);
+	mnt = collect_mounts(&path);
 	path_put(&path);
 	if (!mnt) {
 		err = -ENOMEM;
@@ -720,7 +720,7 @@ int audit_tag_tree(char *old, char *new)
 	err = kern_path(new, 0, &path);
 	if (err)
 		return err;
-	tagged = collect_mounts(path.mnt, path.dentry);
+	tagged = collect_mounts(&path);
 	path_put(&path);
 	if (!tagged)
 		return -ENOMEM;
-- 
cgit v1.2.3-70-g09d2


From 337eb00a2c3a421999c39c94ce7e33545ee8baa7 Mon Sep 17 00:00:00 2001
From: Alessio Igor Bogani <abogani@texware.it>
Date: Tue, 12 May 2009 15:10:54 +0200
Subject: Push BKL down into ->remount_fs()

[xfs, btrfs, capifs, shmem don't need BKL, exempt]

Signed-off-by: Alessio Igor Bogani <abogani@texware.it>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/usb/core/inode.c |  5 +++++
 fs/affs/super.c          |  7 ++++++-
 fs/ext2/super.c          | 12 ++++++++++--
 fs/ext3/super.c          |  4 ++++
 fs/ext4/super.c          |  4 ++++
 fs/hpfs/super.c          |  4 ++++
 fs/jffs2/fs.c            |  3 +++
 fs/jfs/super.c           | 22 ++++++++++++++++++----
 fs/nfs/super.c           |  2 ++
 fs/nilfs2/super.c        |  4 ++++
 fs/ntfs/super.c          | 15 ++++++++++++++-
 fs/ocfs2/super.c         |  4 ++++
 fs/reiserfs/super.c      |  4 ++++
 fs/super.c               |  2 --
 fs/ubifs/super.c         |  9 ++++++++-
 fs/udf/super.c           |  6 +++++-
 fs/ufs/super.c           | 11 ++++++++++-
 kernel/cgroup.c          |  3 +++
 18 files changed, 108 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index dff5760a37f..ffe75e83787 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -39,6 +39,7 @@
 #include <linux/parser.h>
 #include <linux/notifier.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include <asm/byteorder.h>
 #include "usb.h"
 #include "hcd.h"
@@ -265,9 +266,13 @@ static int remount(struct super_block *sb, int *flags, char *data)
 		return -EINVAL;
 	}
 
+	lock_kernel();
+
 	if (usbfs_mount && usbfs_mount->mnt_sb)
 		update_sb(usbfs_mount->mnt_sb);
 
+	unlock_kernel();
+
 	return 0;
 }
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 280d361af41..c4814937c96 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,6 +16,7 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
@@ -512,6 +513,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 		kfree(new_opts);
 		return -EINVAL;
 	}
+	lock_kernel();
 	replace_mount_options(sb, new_opts);
 
 	sbi->s_flags = mount_flags;
@@ -519,8 +521,10 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	sbi->s_uid   = uid;
 	sbi->s_gid   = gid;
 
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+		unlock_kernel();
 		return 0;
+	}
 	if (*flags & MS_RDONLY) {
 		sb->s_dirt = 1;
 		while (sb->s_dirt)
@@ -529,6 +533,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	} else
 		res = affs_init_bitmap(sb, flags);
 
+	unlock_kernel();
 	return res;
 }
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a44963d8edb..f8cbdf56919 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1162,6 +1162,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	unsigned long old_sb_flags;
 	int err;
 
+	lock_kernel();
+
 	/* Store the old options */
 	old_sb_flags = sb->s_flags;
 	old_opts.s_mount_opt = sbi->s_mount_opt;
@@ -1197,12 +1199,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
 		sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
 	}
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+		unlock_kernel();
 		return 0;
+	}
 	if (*flags & MS_RDONLY) {
 		if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
-		    !(sbi->s_mount_state & EXT2_VALID_FS))
+		    !(sbi->s_mount_state & EXT2_VALID_FS)) {
+			unlock_kernel();
 			return 0;
+		}
 		/*
 		 * OK, we are remounting a valid rw partition rdonly, so set
 		 * the rdonly flag and then mark the partition as valid again.
@@ -1229,12 +1235,14 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 			sb->s_flags &= ~MS_RDONLY;
 	}
 	ext2_sync_super(sb, es);
+	unlock_kernel();
 	return 0;
 restore_opts:
 	sbi->s_mount_opt = old_opts.s_mount_opt;
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sb->s_flags = old_sb_flags;
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e213a2613a5..26aa64dee6a 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2490,6 +2490,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 	int i;
 #endif
 
+	lock_kernel();
+
 	/* Store the original options */
 	lock_super(sb);
 	old_sb_flags = sb->s_flags;
@@ -2600,6 +2602,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	unlock_super(sb);
+	unlock_kernel();
 	return 0;
 restore_opts:
 	sb->s_flags = old_sb_flags;
@@ -2617,6 +2620,7 @@ restore_opts:
 	}
 #endif
 	unlock_super(sb);
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c17200a4230..012c4251397 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3422,6 +3422,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	int i;
 #endif
 
+	lock_kernel();
+
 	/* Store the original options */
 	lock_super(sb);
 	old_sb_flags = sb->s_flags;
@@ -3558,6 +3560,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	unlock_super(sb);
+	unlock_kernel();
 	return 0;
 
 restore_opts:
@@ -3578,6 +3581,7 @@ restore_opts:
 	}
 #endif
 	unlock_super(sb);
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f68193cf081..f2feaa06bf2 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/statfs.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
@@ -398,6 +399,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 	
 	*flags |= MS_NOATIME;
 	
+	lock_kernel();
 	lock_super(s);
 	uid = sbi->sb_uid; gid = sbi->sb_gid;
 	umask = 0777 & ~sbi->sb_mode;
@@ -432,10 +434,12 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 	replace_mount_options(s, new_opts);
 
 	unlock_super(s);
+	unlock_kernel();
 	return 0;
 
 out_err:
 	unlock_super(s);
+	unlock_kernel();
 	kfree(new_opts);
 	return -EINVAL;
 }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 237b27a3d57..3451a81b214 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -20,6 +20,7 @@
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
 #include <linux/crc32.h>
+#include <linux/smp_lock.h>
 #include "nodelist.h"
 
 static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -387,6 +388,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 	   This also catches the case where it was stopped and this
 	   is just a remount to restart it.
 	   Flush the writebuffer, if neccecary, else we loose it */
+	lock_kernel();
 	if (!(sb->s_flags & MS_RDONLY)) {
 		jffs2_stop_garbage_collect_thread(c);
 		mutex_lock(&c->alloc_sem);
@@ -399,6 +401,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 
 	*flags |= MS_NOATIME;
 
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 3eb13adf386..09b1b6ee218 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -32,6 +32,7 @@
 #include <linux/crc32.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -375,19 +376,24 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 	s64 newLVSize = 0;
 	int rc = 0;
 	int flag = JFS_SBI(sb)->flag;
+	int ret;
 
 	if (!parse_options(data, sb, &newLVSize, &flag)) {
 		return -EINVAL;
 	}
+	lock_kernel();
 	if (newLVSize) {
 		if (sb->s_flags & MS_RDONLY) {
 			printk(KERN_ERR
 		  "JFS: resize requires volume to be mounted read-write\n");
+			unlock_kernel();
 			return -EROFS;
 		}
 		rc = jfs_extendfs(sb, newLVSize, 0);
-		if (rc)
+		if (rc) {
+			unlock_kernel();
 			return rc;
+		}
 	}
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -398,23 +404,31 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 		truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0);
 
 		JFS_SBI(sb)->flag = flag;
-		return jfs_mount_rw(sb, 1);
+		ret = jfs_mount_rw(sb, 1);
+		unlock_kernel();
+		return ret;
 	}
 	if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
 		rc = jfs_umount_rw(sb);
 		JFS_SBI(sb)->flag = flag;
+		unlock_kernel();
 		return rc;
 	}
 	if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
 		if (!(sb->s_flags & MS_RDONLY)) {
 			rc = jfs_umount_rw(sb);
-			if (rc)
+			if (rc) {
+				unlock_kernel();
 				return rc;
+			}
 			JFS_SBI(sb)->flag = flag;
-			return jfs_mount_rw(sb, 1);
+			ret = jfs_mount_rw(sb, 1);
+			unlock_kernel();
+			return ret;
 		}
 	JFS_SBI(sb)->flag = flag;
 
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d2d67781c57..26127b69a27 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1813,6 +1813,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	if (data == NULL)
 		return -ENOMEM;
 
+	lock_kernel();
 	/* fill out struct with values from existing mount */
 	data->flags = nfss->flags;
 	data->rsize = nfss->rsize;
@@ -1837,6 +1838,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	error = nfs_compare_remount_data(nfss, data);
 out:
 	kfree(data);
+	unlock_kernel();
 	return error;
 }
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7262e8427c2..11151eaa2c4 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -906,6 +906,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	struct nilfs_mount_options old_opts;
 	int err;
 
+	lock_kernel();
+
 	old_sb_flags = sb->s_flags;
 	old_opts.mount_opt = sbi->s_mount_opt;
 	old_opts.snapshot_cno = sbi->s_snapshot_cno;
@@ -985,6 +987,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		up(&sb->s_bdev->bd_mount_sem);
 	}
  out:
+	unlock_kernel();
 	return 0;
 
  rw_remount_failed:
@@ -993,6 +996,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	sb->s_flags = old_sb_flags;
 	sbi->s_mount_opt = old_opts.mount_opt;
 	sbi->s_snapshot_cno = old_opts.snapshot_cno;
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 7a7b0d32639..abaaa1cbf8d 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -443,6 +443,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 	ntfs_volume *vol = NTFS_SB(sb);
 
 	ntfs_debug("Entering with remount options string: %s", opt);
+
+	lock_kernel();
 #ifndef NTFS_RW
 	/* For read-only compiled driver, enforce read-only flag. */
 	*flags |= MS_RDONLY;
@@ -466,15 +468,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 		if (NVolErrors(vol)) {
 			ntfs_error(sb, "Volume has errors and is read-only%s",
 					es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_IS_DIRTY) {
 			ntfs_error(sb, "Volume is dirty and read-only%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
 			ntfs_error(sb, "Volume has been modified by chkdsk "
 					"and is read-only%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -482,11 +487,13 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 					"(0x%x) and is read-only%s",
 					(unsigned)le16_to_cpu(vol->vol_flags),
 					es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
 			ntfs_error(sb, "Failed to set dirty bit in volume "
 					"information flags%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 #if 0
@@ -506,18 +513,21 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 			ntfs_error(sb, "Failed to empty journal $LogFile%s",
 					es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_mark_quotas_out_of_date(vol)) {
 			ntfs_error(sb, "Failed to mark quotas out of date%s",
 					es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_stamp_usnjrnl(vol)) {
 			ntfs_error(sb, "Failed to stamp transation log "
 					"($UsnJrnl)%s", es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -533,8 +543,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 
 	// TODO: Deal with *flags.
 
-	if (!parse_options(vol, opt))
+	if (!parse_options(vol, opt)) {
+		unlock_kernel();
 		return -EINVAL;
+	}
+	unlock_kernel();
 	ntfs_debug("Done.");
 	return 0;
 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 02737596b59..201b40a441f 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -42,6 +42,7 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/quotaops.h>
+#include <linux/smp_lock.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -581,6 +582,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 	struct mount_options parsed_options;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
+	lock_kernel();
+
 	if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
 		ret = -EINVAL;
 		goto out;
@@ -684,6 +687,7 @@ unlock_osb:
 			ocfs2_set_journal_params(osb);
 	}
 out:
+	unlock_kernel();
 	return ret;
 }
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 90dcb7b033e..2969773cfc2 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,6 +28,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/crc32.h>
+#include <linux/smp_lock.h>
 
 struct file_system_type reiserfs_fs_type;
 
@@ -1196,6 +1197,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
 #endif
 
+	lock_kernel();
 	rs = SB_DISK_SUPER_BLOCK(s);
 
 	if (!reiserfs_parse_options
@@ -1318,10 +1320,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 out_ok:
 	replace_mount_options(s, new_opts);
+	unlock_kernel();
 	return 0;
 
 out_err:
 	kfree(new_opts);
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/super.c b/fs/super.c
index 1905f4af01c..83b47416d00 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -540,7 +540,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	shrink_dcache_sb(sb);
 	sync_filesystem(sb);
 
-	lock_kernel();
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
 	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
@@ -566,7 +565,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 		}
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
-	unlock_kernel();
 	if (remount_rw)
 		vfs_dq_quota_on_remount(sb);
 	return 0;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 522c3fd7eb3..3589eab02a2 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,6 +36,7 @@
 #include <linux/mount.h>
 #include <linux/math64.h>
 #include <linux/writeback.h>
+#include <linux/smp_lock.h>
 #include "ubifs.h"
 
 /*
@@ -1770,17 +1771,22 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		return err;
 	}
 
+	lock_kernel();
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
 		if (c->ro_media) {
 			ubifs_msg("cannot re-mount due to prior errors");
+			unlock_kernel();
 			return -EROFS;
 		}
 		err = ubifs_remount_rw(c);
-		if (err)
+		if (err) {
+			unlock_kernel();
 			return err;
+		}
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
 		if (c->ro_media) {
 			ubifs_msg("cannot re-mount due to prior errors");
+			unlock_kernel();
 			return -EROFS;
 		}
 		ubifs_remount_ro(c);
@@ -1795,6 +1801,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	}
 
 	ubifs_assert(c->lst.taken_empty_lebs > 0);
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 04802cc39b1..6832135159b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -568,6 +568,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 	if (!udf_parse_options(options, &uopt, true))
 		return -EINVAL;
 
+	lock_kernel();
 	sbi->s_flags = uopt.flags;
 	sbi->s_uid   = uopt.uid;
 	sbi->s_gid   = uopt.gid;
@@ -581,13 +582,16 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 			*flags |= MS_RDONLY;
 	}
 
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+		unlock_kernel();
 		return 0;
+	}
 	if (*flags & MS_RDONLY)
 		udf_close_lvid(sb);
 	else
 		udf_open_lvid(sb);
 
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index c97210ee067..6560dda7b18 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -263,6 +263,7 @@ void ufs_panic (struct super_block * sb, const char * function,
 	struct ufs_super_block_first * usb1;
 	va_list args;
 	
+	lock_kernel();
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
 	
@@ -1182,7 +1183,8 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	struct ufs_super_block_third * usb3;
 	unsigned new_mount_opt, ufstype;
 	unsigned flags;
-	
+
+	lock_kernel();
 	lock_super(sb);
 	uspi = UFS_SB(sb)->s_uspi;
 	flags = UFS_SB(sb)->s_flags;
@@ -1198,6 +1200,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	ufs_set_opt (new_mount_opt, ONERROR_LOCK);
 	if (!ufs_parse_options (data, &new_mount_opt)) {
 		unlock_super(sb);
+		unlock_kernel();
 		return -EINVAL;
 	}
 	if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
@@ -1205,12 +1208,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
 		printk("ufstype can't be changed during remount\n");
 		unlock_super(sb);
+		unlock_kernel();
 		return -EINVAL;
 	}
 
 	if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
 		UFS_SB(sb)->s_mount_opt = new_mount_opt;
 		unlock_super(sb);
+		unlock_kernel();
 		return 0;
 	}
 	
@@ -1236,6 +1241,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		printk("ufs was compiled with read-only support, "
 		"can't be mounted as read-write\n");
 		unlock_super(sb);
+		unlock_kernel();
 		return -EINVAL;
 #else
 		if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 
@@ -1245,11 +1251,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		    ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
 			printk("this ufstype is read-only supported\n");
 			unlock_super(sb);
+			unlock_kernel();
 			return -EINVAL;
 		}
 		if (!ufs_read_cylinder_structures(sb)) {
 			printk("failed during remounting\n");
 			unlock_super(sb);
+			unlock_kernel();
 			return -EPERM;
 		}
 		sb->s_flags &= ~MS_RDONLY;
@@ -1257,6 +1265,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	}
 	UFS_SB(sb)->s_mount_opt = new_mount_opt;
 	unlock_super(sb);
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7267bfd376..3fb789f6df9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
 #include <linux/namei.h>
+#include <linux/smp_lock.h>
 
 #include <asm/atomic.h>
 
@@ -900,6 +901,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_sb_opts opts;
 
+	lock_kernel();
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
 
@@ -927,6 +929,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	kfree(opts.release_agent);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+	unlock_kernel();
 	return ret;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 28be225b23b115573e0ecc8ef9996f42a1652f74 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 12 Jun 2009 11:33:02 +0300
Subject: irq: slab alloc for default irq_affinity

Ingo had

[    0.000000] ------------[ cut here ]------------
[    0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x2b/0x71()
[    0.000000] Hardware name: System Product Name
[    0.000000] Modules linked in:
[    0.000000] Pid: 0, comm: swapper Tainted: G        W  2.6.30-tip-03087-g0bb2618-dirty #52506
[    0.000000] Call Trace:
[    0.000000]  [<81032588>] warn_slowpath_common+0x60/0x90
[    0.000000]  [<810325c5>] warn_slowpath_null+0xd/0x10
[    0.000000]  [<819d1bc0>] alloc_arch_preferred_bootmem+0x2b/0x71
[    0.000000]  [<819d1c31>] ___alloc_bootmem_nopanic+0x2b/0x9a
[    0.000000]  [<81050a0a>] ? lock_release+0xac/0xb2
[    0.000000]  [<819d1d4c>] ___alloc_bootmem+0xe/0x2d
[    0.000000]  [<819d1e9f>] __alloc_bootmem+0xa/0xc
[    0.000000]  [<819d7c63>] alloc_bootmem_cpumask_var+0x21/0x26
[    0.000000]  [<819d0cc8>] early_irq_init+0x15/0x10d
[    0.000000]  [<819bb75a>] start_kernel+0x167/0x326
[    0.000000]  [<819bb06b>] __init_begin+0x6b/0x70
[    0.000000] ---[ end trace 4eaa2a86a8e2da23 ]---
[    0.000000] NR_IRQS:2304 nr_irqs:424
[    0.000000] CPU 0 irqstacks, hard=821e6000 soft=821e7000

we need to update init_irq_default_affinity

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 kernel/irq/handle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 10457854123..065205bdd92 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -45,7 +45,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
 static void __init init_irq_default_affinity(void)
 {
-	alloc_bootmem_cpumask_var(&irq_default_affinity);
+	alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
 	cpumask_setall(irq_default_affinity);
 }
 #else
-- 
cgit v1.2.3-70-g09d2


From 9a71af2c3627b379b7c31917a7f6ee0d29bc559b Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 21:46:53 -0600
Subject: module_param: invbool should take a 'bool', not an 'int'

It takes an 'int' for historical reasons, and there are only two
users: simply switch it over to bool.

The other user (uvesafb.c) will get a (harmless-on-x86) warning until
the next patch is applied.

Cc: Brad Douglas <brad@neruo.com>
Cc: Michal Januszewski <spock@gentoo.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/video/aty/aty128fb.c | 2 +-
 include/linux/moduleparam.h  | 2 +-
 kernel/params.c              | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/drivers/video/aty/aty128fb.c b/drivers/video/aty/aty128fb.c
index 35e8eb02b9e..e4e4d433b00 100644
--- a/drivers/video/aty/aty128fb.c
+++ b/drivers/video/aty/aty128fb.c
@@ -354,7 +354,7 @@ static int default_crt_on __devinitdata = 0;
 static int default_lcd_on __devinitdata = 1;
 
 #ifdef CONFIG_MTRR
-static int mtrr = 1;
+static bool mtrr = true;
 #endif
 
 #ifdef CONFIG_PMAC_BACKLIGHT
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index a4f0b931846..9bbca8e8c19 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -192,7 +192,7 @@ extern int param_get_bool(char *buffer, struct kernel_param *kp);
 
 extern int param_set_invbool(const char *val, struct kernel_param *kp);
 extern int param_get_invbool(char *buffer, struct kernel_param *kp);
-#define param_check_invbool(name, p) __param_check(name, p, int)
+#define param_check_invbool(name, p) __param_check(name, p, bool)
 
 /* Comma-separated array: *nump is set to number they actually specified. */
 #define module_param_array_named(name, array, type, nump, perm)		\
diff --git a/kernel/params.c b/kernel/params.c
index de273ec85bd..023abbf5f89 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -272,13 +272,13 @@ int param_set_invbool(const char *val, struct kernel_param *kp)
 	dummy.arg = &boolval;
 	ret = param_set_bool(val, &dummy);
 	if (ret == 0)
-		*(int *)kp->arg = !boolval;
+		*(bool *)kp->arg = !boolval;
 	return ret;
 }
 
 int param_get_invbool(char *buffer, struct kernel_param *kp)
 {
-	return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y');
+	return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
 }
 
 /* We break the rule and mangle the string. */
-- 
cgit v1.2.3-70-g09d2


From 45fcc70c0b6ee0c508e1fdb5fef735c3546803f4 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 21:46:56 -0600
Subject: module_param: split perm field into flags and perm

Impact: cleanup

Rather than hack KPARAM_KMALLOCED into the perm field, separate it out.
Since the perm field was 32 bits and only needs 16, we don't add bloat.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/moduleparam.h | 8 ++++++--
 kernel/params.c             | 9 +++------
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 9bbca8e8c19..009a5f76876 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -36,9 +36,13 @@ typedef int (*param_set_fn)(const char *val, struct kernel_param *kp);
 /* Returns length written or -errno.  Buffer is 4k (ie. be short!) */
 typedef int (*param_get_fn)(char *buffer, struct kernel_param *kp);
 
+/* Flag bits for kernel_param.flags */
+#define KPARAM_KMALLOCED	1
+
 struct kernel_param {
 	const char *name;
-	unsigned int perm;
+	u16 perm;
+	u16 flags;
 	param_set_fn set;
 	param_get_fn get;
 	union {
@@ -88,7 +92,7 @@ struct kparam_array
 	static struct kernel_param __moduleparam_const __param_##name	\
 	__used								\
     __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \
-	= { __param_str_##name, perm, set, get, { arg } }
+	= { __param_str_##name, perm, 0, set, get, { arg } }
 
 #define module_param_call(name, set, get, arg, perm)			      \
 	__module_param_call(MODULE_PARAM_PREFIX, name, set, get, arg, perm)
diff --git a/kernel/params.c b/kernel/params.c
index 023abbf5f89..b4660dc13db 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,9 +24,6 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 
-/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
-#define KPARAM_KMALLOCED	0x80000000
-
 #if 0
 #define DEBUGP printk
 #else
@@ -220,13 +217,13 @@ int param_set_charp(const char *val, struct kernel_param *kp)
 		return -ENOSPC;
 	}
 
-	if (kp->perm & KPARAM_KMALLOCED)
+	if (kp->flags & KPARAM_KMALLOCED)
 		kfree(*(char **)kp->arg);
 
 	/* This is a hack.  We can't need to strdup in early boot, and we
 	 * don't need to; this mangled commandline is preserved. */
 	if (slab_is_available()) {
-		kp->perm |= KPARAM_KMALLOCED;
+		kp->flags |= KPARAM_KMALLOCED;
 		*(char **)kp->arg = kstrdup(val, GFP_KERNEL);
 		if (!kp->arg)
 			return -ENOMEM;
@@ -591,7 +588,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
 	unsigned int i;
 
 	for (i = 0; i < num; i++)
-		if (params[i].perm & KPARAM_KMALLOCED)
+		if (params[i].flags & KPARAM_KMALLOCED)
 			kfree(*(char **)params[i].arg);
 }
 
-- 
cgit v1.2.3-70-g09d2


From fddd520122953550ec2c8b60e7ca0d0f0d115d97 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 21:46:57 -0600
Subject: module_param: allow 'bool' module_params to be bool, not just int.

Impact: API cleanup

For historical reasons, 'bool' parameters must be an int, not a bool.
But there are around 600 users, so a conversion seems like useless churn.

So we use __same_type() to distinguish, and handle both cases.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/moduleparam.h | 32 +++++++++++++++++++++++---------
 kernel/params.c             | 33 ++++++++++++++++++++++++++-------
 2 files changed, 49 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 009a5f76876..6547c3cdbc4 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -38,6 +38,7 @@ typedef int (*param_get_fn)(char *buffer, struct kernel_param *kp);
 
 /* Flag bits for kernel_param.flags */
 #define KPARAM_KMALLOCED	1
+#define KPARAM_ISBOOL		2
 
 struct kernel_param {
 	const char *name;
@@ -83,7 +84,7 @@ struct kparam_array
    parameters.  perm sets the visibility in sysfs: 000 means it's
    not there, read bits mean it's readable, write bits mean it's
    writable. */
-#define __module_param_call(prefix, name, set, get, arg, perm)		\
+#define __module_param_call(prefix, name, set, get, arg, isbool, perm)	\
 	/* Default value instead of permissions? */			\
 	static int __param_perm_check_##name __attribute__((unused)) =	\
 	BUILD_BUG_ON_ZERO((perm) < 0 || (perm) > 0777 || ((perm) & 2))	\
@@ -92,10 +93,13 @@ struct kparam_array
 	static struct kernel_param __moduleparam_const __param_##name	\
 	__used								\
     __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \
-	= { __param_str_##name, perm, 0, set, get, { arg } }
+	= { __param_str_##name, perm, isbool ? KPARAM_ISBOOL : 0,	\
+	    set, get, { arg } }
 
 #define module_param_call(name, set, get, arg, perm)			      \
-	__module_param_call(MODULE_PARAM_PREFIX, name, set, get, arg, perm)
+	__module_param_call(MODULE_PARAM_PREFIX,			      \
+			    name, set, get, arg,			      \
+			    __same_type(*(arg), bool), perm)
 
 /* Helper functions: type is byte, short, ushort, int, uint, long,
    ulong, charp, bool or invbool, or XXX if you define param_get_XXX,
@@ -124,15 +128,16 @@ struct kparam_array
 #define core_param(name, var, type, perm)				\
 	param_check_##type(name, &(var));				\
 	__module_param_call("", name, param_set_##type, param_get_##type, \
-			    &var, perm)
+			    &var, __same_type(var, bool), perm)
 #endif /* !MODULE */
 
 /* Actually copy string: maxlen param is usually sizeof(string). */
 #define module_param_string(name, string, len, perm)			\
 	static const struct kparam_string __param_string_##name		\
 		= { len, string };					\
-	module_param_call(name, param_set_copystring, param_get_string,	\
-			  .str = &__param_string_##name, perm);		\
+	__module_param_call(MODULE_PARAM_PREFIX, name,			\
+			    param_set_copystring, param_get_string,	\
+			    .str = &__param_string_##name, 0, perm);	\
 	__MODULE_PARM_TYPE(name, "string")
 
 /* Called on module insert or kernel boot */
@@ -190,9 +195,16 @@ extern int param_set_charp(const char *val, struct kernel_param *kp);
 extern int param_get_charp(char *buffer, struct kernel_param *kp);
 #define param_check_charp(name, p) __param_check(name, p, char *)
 
+/* For historical reasons "bool" parameters can be (unsigned) "int". */
 extern int param_set_bool(const char *val, struct kernel_param *kp);
 extern int param_get_bool(char *buffer, struct kernel_param *kp);
-#define param_check_bool(name, p) __param_check(name, p, int)
+#define param_check_bool(name, p)					\
+	static inline void __check_##name(void)				\
+	{								\
+		BUILD_BUG_ON(!__same_type(*(p), bool) &&		\
+			     !__same_type(*(p), unsigned int) &&	\
+			     !__same_type(*(p), int));			\
+	}
 
 extern int param_set_invbool(const char *val, struct kernel_param *kp);
 extern int param_get_invbool(char *buffer, struct kernel_param *kp);
@@ -203,8 +215,10 @@ extern int param_get_invbool(char *buffer, struct kernel_param *kp);
 	static const struct kparam_array __param_arr_##name		\
 	= { ARRAY_SIZE(array), nump, param_set_##type, param_get_##type,\
 	    sizeof(array[0]), array };					\
-	module_param_call(name, param_array_set, param_array_get, 	\
-			  .arr = &__param_arr_##name, perm);		\
+	__module_param_call(MODULE_PARAM_PREFIX, name,			\
+			    param_array_set, param_array_get,		\
+			    .arr = &__param_arr_##name,			\
+			    __same_type(array[0], bool), perm);		\
 	__MODULE_PARM_TYPE(name, "array of " #type)
 
 #define module_param_array(name, type, nump, perm)		\
diff --git a/kernel/params.c b/kernel/params.c
index b4660dc13db..7f6912ced2b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -238,35 +238,54 @@ int param_get_charp(char *buffer, struct kernel_param *kp)
 	return sprintf(buffer, "%s", *((char **)kp->arg));
 }
 
+/* Actually could be a bool or an int, for historical reasons. */
 int param_set_bool(const char *val, struct kernel_param *kp)
 {
+	bool v;
+
 	/* No equals means "set"... */
 	if (!val) val = "1";
 
 	/* One of =[yYnN01] */
 	switch (val[0]) {
 	case 'y': case 'Y': case '1':
-		*(int *)kp->arg = 1;
-		return 0;
+		v = true;
+		break;
 	case 'n': case 'N': case '0':
-		*(int *)kp->arg = 0;
-		return 0;
+		v = false;
+		break;
+	default:
+		return -EINVAL;
 	}
-	return -EINVAL;
+
+	if (kp->flags & KPARAM_ISBOOL)
+		*(bool *)kp->arg = v;
+	else
+		*(int *)kp->arg = v;
+	return 0;
 }
 
 int param_get_bool(char *buffer, struct kernel_param *kp)
 {
+	bool val;
+	if (kp->flags & KPARAM_ISBOOL)
+		val = *(bool *)kp->arg;
+	else
+		val = *(int *)kp->arg;
+
 	/* Y and N chosen as being relatively non-coder friendly */
-	return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N');
+	return sprintf(buffer, "%c", val ? 'Y' : 'N');
 }
 
+/* This one must be bool. */
 int param_set_invbool(const char *val, struct kernel_param *kp)
 {
-	int boolval, ret;
+	int ret;
+	bool boolval;
 	struct kernel_param dummy;
 
 	dummy.arg = &boolval;
+	dummy.flags = KPARAM_ISBOOL;
 	ret = param_set_bool(val, &dummy);
 	if (ret == 0)
 		*(bool *)kp->arg = !boolval;
-- 
cgit v1.2.3-70-g09d2


From ad6561dffa17f17bb68d7207d422c26c381c4313 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 21:47:03 -0600
Subject: module: trim exception table on init free.

It's theoretically possible that there are exception table entries
which point into the (freed) init text of modules.  These could cause
future problems if other modules get loaded into that memory and cause
an exception as we'd see the wrong fixup.  The only case I know of is
kvm-intel.ko (when CONFIG_CC_OPTIMIZE_FOR_SIZE=n).

Amerigo fixed this long-standing FIXME in the x86 version, but this
patch is more general.

This implements trim_init_extable(); most archs are simple since they
use the standard lib/extable.c sort code.  Alpha and IA64 use relative
addresses in their fixups, so their trimming is a slight variation.

Sparc32 is unique; it doesn't seem to define ARCH_HAS_SORT_EXTABLE,
yet it defines its own sort_extable() which overrides the one in lib.
It doesn't sort, so we have to mark deleted entries instead of
actually trimming them.

Inspired-by: Amerigo Wang <amwang@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: linux-alpha@vger.kernel.org
Cc: sparclinux@vger.kernel.org
Cc: linux-ia64@vger.kernel.org
---
 arch/alpha/mm/extable.c             | 21 +++++++++++++++++++++
 arch/ia64/mm/extable.c              | 26 ++++++++++++++++++++++++++
 arch/sparc/include/asm/uaccess_32.h |  3 +++
 arch/sparc/mm/extable.c             | 29 +++++++++++++++++++++++++++++
 include/linux/module.h              |  1 +
 kernel/module.c                     |  1 +
 lib/extable.c                       | 21 ++++++++++++++++++++-
 7 files changed, 101 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/arch/alpha/mm/extable.c b/arch/alpha/mm/extable.c
index 62dc379d301..813c9b63c0e 100644
--- a/arch/alpha/mm/extable.c
+++ b/arch/alpha/mm/extable.c
@@ -48,6 +48,27 @@ void sort_extable(struct exception_table_entry *start,
 	     cmp_ex, swap_ex);
 }
 
+#ifdef CONFIG_MODULES
+/*
+ * Any entry referring to the module init will be at the beginning or
+ * the end.
+ */
+void trim_init_extable(struct module *m)
+{
+	/*trim the beginning*/
+	while (m->num_exentries &&
+	       within_module_init(ex_to_addr(&m->extable[0]), m)) {
+		m->extable++;
+		m->num_exentries--;
+	}
+	/*trim the end*/
+	while (m->num_exentries &&
+	       within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]),
+				  m))
+		m->num_exentries--;
+}
+#endif /* CONFIG_MODULES */
+
 const struct exception_table_entry *
 search_extable(const struct exception_table_entry *first,
 	       const struct exception_table_entry *last,
diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c
index 71c50dd8f87..e95d5ad9285 100644
--- a/arch/ia64/mm/extable.c
+++ b/arch/ia64/mm/extable.c
@@ -53,6 +53,32 @@ void sort_extable (struct exception_table_entry *start,
 	     cmp_ex, swap_ex);
 }
 
+static inline unsigned long ex_to_addr(const struct exception_table_entry *x)
+{
+	return (unsigned long)&x->insn + x->insn;
+}
+
+#ifdef CONFIG_MODULES
+/*
+ * Any entry referring to the module init will be at the beginning or
+ * the end.
+ */
+void trim_init_extable(struct module *m)
+{
+	/*trim the beginning*/
+	while (m->num_exentries &&
+	       within_module_init(ex_to_addr(&m->extable[0]), m)) {
+		m->extable++;
+		m->num_exentries--;
+	}
+	/*trim the end*/
+	while (m->num_exentries &&
+	       within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]),
+				  m))
+		m->num_exentries--;
+}
+#endif /* CONFIG_MODULES */
+
 const struct exception_table_entry *
 search_extable (const struct exception_table_entry *first,
 		const struct exception_table_entry *last,
diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h
index 47d5619d43f..8303ac48103 100644
--- a/arch/sparc/include/asm/uaccess_32.h
+++ b/arch/sparc/include/asm/uaccess_32.h
@@ -17,6 +17,9 @@
 
 #ifndef __ASSEMBLY__
 
+#define ARCH_HAS_SORT_EXTABLE
+#define ARCH_HAS_SEARCH_EXTABLE
+
 /* Sparc is not segmented, however we need to be able to fool access_ok()
  * when doing system calls from kernel mode legitimately.
  *
diff --git a/arch/sparc/mm/extable.c b/arch/sparc/mm/extable.c
index 16cc28935e3..a61c349448e 100644
--- a/arch/sparc/mm/extable.c
+++ b/arch/sparc/mm/extable.c
@@ -28,6 +28,10 @@ search_extable(const struct exception_table_entry *start,
 	 *	word 3: last insn address + 4 bytes
 	 *	word 4: fixup code address
 	 *
+	 * Deleted entries are encoded as:
+	 *	word 1: unused
+	 *	word 2: -1
+	 *
 	 * See asm/uaccess.h for more details.
 	 */
 
@@ -39,6 +43,10 @@ search_extable(const struct exception_table_entry *start,
 			continue;
 		}
 
+		/* A deleted entry; see trim_init_extable */
+		if (walk->fixup == -1)
+			continue;
+
 		if (walk->insn == value)
 			return walk;
 	}
@@ -57,6 +65,27 @@ search_extable(const struct exception_table_entry *start,
         return NULL;
 }
 
+#ifdef CONFIG_MODULES
+/* We could memmove them around; easier to mark the trimmed ones. */
+void trim_init_extable(struct module *m)
+{
+	unsigned int i;
+	bool range;
+
+	for (i = 0; i < m->num_exentries; i += range ? 2 : 1) {
+		range = m->extable[i].fixup == 0;
+
+		if (within_module_init(m->extable[i].insn, m)) {
+			m->extable[i].fixup = -1;
+			if (range)
+				m->extable[i+1].fixup = -1;
+		}
+		if (range)
+			i++;
+	}
+}
+#endif /* CONFIG_MODULES */
+
 /* Special extable search, which handles ranges.  Returns fixup */
 unsigned long search_extables_range(unsigned long addr, unsigned long *g2)
 {
diff --git a/include/linux/module.h b/include/linux/module.h
index a8f2c0aa4c3..a7bc6e7b43a 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -77,6 +77,7 @@ search_extable(const struct exception_table_entry *first,
 void sort_extable(struct exception_table_entry *start,
 		  struct exception_table_entry *finish);
 void sort_main_extable(void);
+void trim_init_extable(struct module *m);
 
 #ifdef MODULE
 #define MODULE_GENERIC_TABLE(gtype,name)			\
diff --git a/kernel/module.c b/kernel/module.c
index 35f7de00bf0..e4ab36ce767 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2455,6 +2455,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
 	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
 	module_put(mod);
+	trim_init_extable(mod);
 	module_free(mod, mod->module_init);
 	mod->module_init = NULL;
 	mod->init_size = 0;
diff --git a/lib/extable.c b/lib/extable.c
index 179c0874559..4cac81ec225 100644
--- a/lib/extable.c
+++ b/lib/extable.c
@@ -39,7 +39,26 @@ void sort_extable(struct exception_table_entry *start,
 	sort(start, finish - start, sizeof(struct exception_table_entry),
 	     cmp_ex, NULL);
 }
-#endif
+
+#ifdef CONFIG_MODULES
+/*
+ * If the exception table is sorted, any referring to the module init
+ * will be at the beginning or the end.
+ */
+void trim_init_extable(struct module *m)
+{
+	/*trim the beginning*/
+	while (m->num_exentries && within_module_init(m->extable[0].insn, m)) {
+		m->extable++;
+		m->num_exentries--;
+	}
+	/*trim the end*/
+	while (m->num_exentries &&
+		within_module_init(m->extable[m->num_exentries-1].insn, m))
+		m->num_exentries--;
+}
+#endif /* CONFIG_MODULES */
+#endif /* !ARCH_HAS_SORT_EXTABLE */
 
 #ifndef ARCH_HAS_SEARCH_EXTABLE
 /*
-- 
cgit v1.2.3-70-g09d2


From 081fad86178ec0f64f32f1bd04cf4aad22714fb9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 17:57:21 +0200
Subject: perf_counter: Remove PERF_TYPE_RAW special casing

The PERF_TYPE_RAW special case seems superfluous these days. Remove
it and add it to the switch() stmt like the others.

[ Impact: cleanup ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ef5d8a5b245..663bbe01505 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3570,12 +3570,8 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
 		goto done;
 
-	if (attr->type == PERF_TYPE_RAW) {
-		pmu = hw_perf_counter_init(counter);
-		goto done;
-	}
-
 	switch (attr->type) {
+	case PERF_TYPE_RAW:
 	case PERF_TYPE_HARDWARE:
 	case PERF_TYPE_HW_CACHE:
 		pmu = hw_perf_counter_init(counter);
-- 
cgit v1.2.3-70-g09d2


From 974802eaa1afdc87e00821df7020a2b3c6fee623 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 12 Jun 2009 12:46:55 +0200
Subject: perf_counter: Add forward/backward attribute ABI compatibility

Provide a means of extending the perf_counter_attr in a 'natural' way.

We allow growing the structure by appending fields at the end by specifying
the full structure size inside it.

When a new kernel sees a smaller (old) structure, it will 0 pad the tail.
When an old kernel sees a larger (new) structure, it will verify the tail
consists of 0s, otherwise fail.

If we fail due to a size-mismatch, we return -E2BIG and write the kernel's
native attribute size back into the provided structure.

Furthermore, add some attribute verification, so that we'll fail counter
creation when unknown bits are present (PERF_SAMPLE, PERF_FORMAT, or in
the __reserved fields).

(This ABI detail is introduced while keeping the existing syscall ABI.)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 19 ++++++++--
 include/linux/syscalls.h     |  2 +-
 kernel/perf_counter.c        | 89 ++++++++++++++++++++++++++++++++++++++++++--
 tools/perf/perf.h            |  5 ++-
 4 files changed, 105 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7c4f32f6ae1..1b3118a1023 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -120,6 +120,8 @@ enum perf_counter_sample_format {
 	PERF_SAMPLE_ID				= 1U << 6,
 	PERF_SAMPLE_CPU				= 1U << 7,
 	PERF_SAMPLE_PERIOD			= 1U << 8,
+
+	PERF_SAMPLE_MAX = 1U << 9,		/* non-ABI */
 };
 
 /*
@@ -131,17 +133,26 @@ enum perf_counter_read_format {
 	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
 	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
 	PERF_FORMAT_ID				= 1U << 2,
+
+	PERF_FORMAT_MAX = 1U << 3, 		/* non-ABI */
 };
 
+#define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
+
 /*
  * Hardware event to monitor via a performance monitoring counter:
  */
 struct perf_counter_attr {
+
 	/*
 	 * Major type: hardware/software/tracepoint/etc.
 	 */
 	__u32			type;
-	__u32			__reserved_1;
+
+	/*
+	 * Size of the attr structure, for fwd/bwd compat.
+	 */
+	__u32			size;
 
 	/*
 	 * Type specific configuration information.
@@ -168,12 +179,12 @@ struct perf_counter_attr {
 				comm	       :  1, /* include comm data     */
 				freq           :  1, /* use freq, not period  */
 
-				__reserved_2   : 53;
+				__reserved_1   : 53;
 
 	__u32			wakeup_events;	/* wakeup every n events */
-	__u32			__reserved_3;
+	__u32			__reserved_2;
 
-	__u64			__reserved_4;
+	__u64			__reserved_3;
 };
 
 /*
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c6c84ad8bd7..418d90f5eff 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -758,6 +758,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
 
 asmlinkage long sys_perf_counter_open(
-		const struct perf_counter_attr __user *attr_uptr,
+		struct perf_counter_attr __user *attr_uptr,
 		pid_t pid, int cpu, int group_fd, unsigned long flags);
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 663bbe01505..29b685f551a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3584,6 +3584,9 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	case PERF_TYPE_TRACEPOINT:
 		pmu = tp_perf_counter_init(counter);
 		break;
+
+	default:
+		break;
 	}
 done:
 	err = 0;
@@ -3610,6 +3613,85 @@ done:
 	return counter;
 }
 
+static int perf_copy_attr(struct perf_counter_attr __user *uattr,
+			  struct perf_counter_attr *attr)
+{
+	int ret;
+	u32 size;
+
+	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
+		return -EFAULT;
+
+	/*
+	 * zero the full structure, so that a short copy will be nice.
+	 */
+	memset(attr, 0, sizeof(*attr));
+
+	ret = get_user(size, &uattr->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		goto err_size;
+
+	if (!size)		/* abi compat */
+		size = PERF_ATTR_SIZE_VER0;
+
+	if (size < PERF_ATTR_SIZE_VER0)
+		goto err_size;
+
+	/*
+	 * If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0.
+	 */
+	if (size > sizeof(*attr)) {
+		unsigned long val;
+		unsigned long __user *addr;
+		unsigned long __user *end;
+
+		addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
+				sizeof(unsigned long));
+		end  = PTR_ALIGN((void __user *)uattr + size,
+				sizeof(unsigned long));
+
+		for (; addr < end; addr += sizeof(unsigned long)) {
+			ret = get_user(val, addr);
+			if (ret)
+				return ret;
+			if (val)
+				goto err_size;
+		}
+	}
+
+	ret = copy_from_user(attr, uattr, size);
+	if (ret)
+		return -EFAULT;
+
+	/*
+	 * If the type exists, the corresponding creation will verify
+	 * the attr->config.
+	 */
+	if (attr->type >= PERF_TYPE_MAX)
+		return -EINVAL;
+
+	if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
+		return -EINVAL;
+
+	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
+		return -EINVAL;
+
+	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
+		return -EINVAL;
+
+out:
+	return ret;
+
+err_size:
+	put_user(sizeof(*attr), &uattr->size);
+	ret = -E2BIG;
+	goto out;
+}
+
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
@@ -3619,7 +3701,7 @@ done:
  * @group_fd:		group leader counter fd
  */
 SYSCALL_DEFINE5(perf_counter_open,
-		const struct perf_counter_attr __user *, attr_uptr,
+		struct perf_counter_attr __user *, attr_uptr,
 		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
 	struct perf_counter *counter, *group_leader;
@@ -3635,8 +3717,9 @@ SYSCALL_DEFINE5(perf_counter_open,
 	if (flags)
 		return -EINVAL;
 
-	if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0)
-		return -EFAULT;
+	ret = perf_copy_attr(attr_uptr, &attr);
+	if (ret)
+		return ret;
 
 	if (!attr.exclude_kernel) {
 		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index af0a5046d74..87a1aca4a42 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -53,11 +53,12 @@ static inline unsigned long long rdclock(void)
 	_min1 < _min2 ? _min1 : _min2; })
 
 static inline int
-sys_perf_counter_open(struct perf_counter_attr *attr_uptr,
+sys_perf_counter_open(struct perf_counter_attr *attr,
 		      pid_t pid, int cpu, int group_fd,
 		      unsigned long flags)
 {
-	return syscall(__NR_perf_counter_open, attr_uptr, pid, cpu,
+	attr->size = sizeof(*attr);
+	return syscall(__NR_perf_counter_open, attr, pid, cpu,
 		       group_fd, flags);
 }
 
-- 
cgit v1.2.3-70-g09d2


From b43e352139f51216a8c56b0bd5fc3d4e05c65619 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:00 -0600
Subject: sched: export kick_process

lguest needs kick_process: wake_up_process() does nothing if a process
is running, which isn't sufficient (we need it in the kernel).

And lguest support is usually modular.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f04aa966450..8ec9d13140b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2192,6 +2192,7 @@ void kick_process(struct task_struct *p)
 		smp_send_reschedule(cpu);
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(kick_process);
 
 /*
  * Return a low guess at the load of a migration-source cpu weighted
-- 
cgit v1.2.3-70-g09d2


From 1dc492a0a4470852cb451db1e00d580ce9fd7a28 Mon Sep 17 00:00:00 2001
From: Manish Katiyar <mkatiyar@gmail.com>
Date: Sun, 22 Feb 2009 10:24:27 +0530
Subject: trivial: kernel/power/poweroff.c: whitespace fix

Fix coding style whitespace issues. Patch compile tested.
Before :-
total: 1 errors, 0 warnings, 46 lines checked
After
total: 0 errors, 0 warnings, 46 lines checked

Before :-
  text	   data	    bss	    dec	    hex	filename
    107	     48	      0	    155	     9b	kernel/power/poweroff.o
After
   text	   data	    bss	    dec	    hex	filename
    107	     48	      0	    155	     9b	kernel/power/poweroff.o

Signed-off-by: Manish Katiyar <mkatiyar@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/power/poweroff.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 97890831e1b..e8b33700627 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -34,7 +34,7 @@ static struct sysrq_key_op	sysrq_poweroff_op = {
 	.handler        = handle_poweroff,
 	.help_msg       = "powerOff",
 	.action_msg     = "Power Off",
- 	.enable_mask	= SYSRQ_ENABLE_BOOT,
+	.enable_mask	= SYSRQ_ENABLE_BOOT,
 };
 
 static int pm_sysrq_init(void)
-- 
cgit v1.2.3-70-g09d2


From 3ac49a1c9928b4a242b3cb1d83bc1d5c9b8fcb50 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Thu, 4 Jun 2009 16:20:28 +0200
Subject: trivial: fix ETIMEOUT -> ETIMEDOUT typos

fix ETIMEOUT -> ETIMEDOUT typos

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/arm/mach-sa1100/jornada720_ssp.c | 4 ++--
 drivers/net/qlge/qlge_main.c          | 2 +-
 drivers/net/usb/usbnet.c              | 2 +-
 drivers/staging/wlan-ng/hfa384x_usb.c | 2 +-
 kernel/rtmutex.c                      | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mach-sa1100/jornada720_ssp.c b/arch/arm/mach-sa1100/jornada720_ssp.c
index 28cf3696797..506a5e5a9ad 100644
--- a/arch/arm/mach-sa1100/jornada720_ssp.c
+++ b/arch/arm/mach-sa1100/jornada720_ssp.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(jornada_ssp_reverse);
  * timeout after <timeout> rounds. Needs mcu running before its called.
  *
  * returns : %mcu output on success
- *	   : %-ETIMEOUT on timeout
+ *	   : %-ETIMEDOUT on timeout
  */
 int jornada_ssp_byte(u8 byte)
 {
@@ -82,7 +82,7 @@ EXPORT_SYMBOL(jornada_ssp_byte);
  * jornada_ssp_inout - decide if input is command or trading byte
  *
  * returns : (jornada_ssp_byte(byte)) on success
- *         : %-ETIMEOUT on timeout failure
+ *         : %-ETIMEDOUT on timeout failure
  */
 int jornada_ssp_inout(u8 byte)
 {
diff --git a/drivers/net/qlge/qlge_main.c b/drivers/net/qlge/qlge_main.c
index c92ced24794..1fd5ecb2442 100644
--- a/drivers/net/qlge/qlge_main.c
+++ b/drivers/net/qlge/qlge_main.c
@@ -3174,7 +3174,7 @@ static int ql_adapter_reset(struct ql_adapter *qdev)
 
 	if (value & RST_FO_FR) {
 		QPRINTK(qdev, IFDOWN, ERR,
-			"ETIMEOUT!!! errored out of resetting the chip!\n");
+			"ETIMEDOUT!!! errored out of resetting the chip!\n");
 		status = -ETIMEDOUT;
 	}
 
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index f3a2fce6166..47f68cfa7e2 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -398,7 +398,7 @@ static void rx_complete (struct urb *urb)
 
 	/* stalls need manual reset. this is rare ... except that
 	 * when going through USB 2.0 TTs, unplug appears this way.
-	 * we avoid the highspeed version of the ETIMEOUT/EILSEQ
+	 * we avoid the highspeed version of the ETIMEDOUT/EILSEQ
 	 * storm, recovering as needed.
 	 */
 	case -EPIPE:
diff --git a/drivers/staging/wlan-ng/hfa384x_usb.c b/drivers/staging/wlan-ng/hfa384x_usb.c
index 888198c9a10..824e65bdc43 100644
--- a/drivers/staging/wlan-ng/hfa384x_usb.c
+++ b/drivers/staging/wlan-ng/hfa384x_usb.c
@@ -2424,7 +2424,7 @@ int hfa384x_drvr_ramdl_write(hfa384x_t *hw, u32 daddr, void *buf, u32 len)
 *	0		success
 *	>0		f/w reported error - f/w status code
 *	<0		driver reported error
-*	-ETIMEOUT	timout waiting for the cmd regs to become
+*	-ETIMEDOUT	timeout waiting for the cmd regs to become
 *			available, or waiting for the control reg
 *			to indicate the Aux port is enabled.
 *	-ENODATA	the buffer does NOT contain a valid PDA.
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 820c5af44f3..fcd107a78c5 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -902,7 +902,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
  * Returns:
  *  0 		on success
  * -EINTR 	when interrupted by a signal
- * -ETIMEOUT	when the timeout expired
+ * -ETIMEDOUT	when the timeout expired
  * -EDEADLK	when the lock would deadlock (when deadlock detection is on)
  */
 int
-- 
cgit v1.2.3-70-g09d2


From e39a71ef80877f4e30d808af9acceec80f4d2f7c Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 15 May 2009 00:53:26 +0200
Subject: PM: Rename device_power_down/up()

Rename the functions performing "_noirq" dev_pm_ops
operations from device_power_down() and device_power_up()
to device_suspend_noirq() and device_resume_noirq().

The new function names are chosen to show that the functions
are responsible for calling the _noirq() versions to finalize
the suspend/resume operation. The functions no longer perform
power down/up, so the current names are misleading.

Global function renames:
- device_power_down() -> device_suspend_noirq()
- device_power_up() -> device_resume_noirq()

Static function renames:
- suspend_device_noirq() -> __device_suspend_noirq()
- resume_device_noirq() -> __device_resume_noirq()

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Acked-by: Len Brown <lenb@kernel.org>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 arch/x86/kernel/apm_32.c  |  8 ++++----
 drivers/base/power/main.c | 26 +++++++++++++-------------
 drivers/xen/manage.c      | 10 +++++-----
 include/linux/pm.h        |  4 ++--
 kernel/kexec.c            |  8 ++++----
 kernel/power/disk.c       | 16 ++++++++--------
 kernel/power/main.c       |  4 ++--
 7 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 49e0939bac4..31ae547da15 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1235,7 +1235,7 @@ static int suspend(int vetoable)
 
 	device_suspend(PMSG_SUSPEND);
 
-	device_power_down(PMSG_SUSPEND);
+	device_suspend_noirq(PMSG_SUSPEND);
 
 	local_irq_disable();
 	sysdev_suspend(PMSG_SUSPEND);
@@ -1259,7 +1259,7 @@ static int suspend(int vetoable)
 	sysdev_resume();
 	local_irq_enable();
 
-	device_power_up(PMSG_RESUME);
+	device_resume_noirq(PMSG_RESUME);
 
 	device_resume(PMSG_RESUME);
 	queue_event(APM_NORMAL_RESUME, NULL);
@@ -1277,7 +1277,7 @@ static void standby(void)
 {
 	int err;
 
-	device_power_down(PMSG_SUSPEND);
+	device_suspend_noirq(PMSG_SUSPEND);
 
 	local_irq_disable();
 	sysdev_suspend(PMSG_SUSPEND);
@@ -1291,7 +1291,7 @@ static void standby(void)
 	sysdev_resume();
 	local_irq_enable();
 
-	device_power_up(PMSG_RESUME);
+	device_resume_noirq(PMSG_RESUME);
 }
 
 static apm_event_t get_event(void)
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 3e4bc699bc0..c5a35bc9d63 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -315,13 +315,13 @@ static void pm_dev_err(struct device *dev, pm_message_t state, char *info,
 /*------------------------- Resume routines -------------------------*/
 
 /**
- *	resume_device_noirq - Power on one device (early resume).
+ *	__device_resume_noirq - Power on one device (early resume).
  *	@dev:	Device.
  *	@state: PM transition of the system being carried out.
  *
  *	Must be called with interrupts disabled.
  */
-static int resume_device_noirq(struct device *dev, pm_message_t state)
+static int __device_resume_noirq(struct device *dev, pm_message_t state)
 {
 	int error = 0;
 
@@ -363,7 +363,7 @@ static void dpm_power_up(pm_message_t state)
 			int error;
 
 			dev->power.status = DPM_OFF;
-			error = resume_device_noirq(dev, state);
+			error = __device_resume_noirq(dev, state);
 			if (error)
 				pm_dev_err(dev, state, " early", error);
 		}
@@ -371,18 +371,18 @@ static void dpm_power_up(pm_message_t state)
 }
 
 /**
- *	device_power_up - Turn on all devices that need special attention.
+ *	device_resume_noirq - Turn on all devices that need special attention.
  *	@state: PM transition of the system being carried out.
  *
  *	Call the "early" resume handlers and enable device drivers to receive
  *	interrupts.
  */
-void device_power_up(pm_message_t state)
+void device_resume_noirq(pm_message_t state)
 {
 	dpm_power_up(state);
 	resume_device_irqs();
 }
-EXPORT_SYMBOL_GPL(device_power_up);
+EXPORT_SYMBOL_GPL(device_resume_noirq);
 
 /**
  *	resume_device - Restore state for one device.
@@ -577,13 +577,13 @@ static pm_message_t resume_event(pm_message_t sleep_state)
 }
 
 /**
- *	suspend_device_noirq - Shut down one device (late suspend).
+ *	__device_suspend_noirq - Shut down one device (late suspend).
  *	@dev:	Device.
  *	@state: PM transition of the system being carried out.
  *
  *	This is called with interrupts off and only a single CPU running.
  */
-static int suspend_device_noirq(struct device *dev, pm_message_t state)
+static int __device_suspend_noirq(struct device *dev, pm_message_t state)
 {
 	int error = 0;
 
@@ -602,7 +602,7 @@ static int suspend_device_noirq(struct device *dev, pm_message_t state)
 }
 
 /**
- *	device_power_down - Shut down special devices.
+ *	device_suspend_noirq - Shut down special devices.
  *	@state: PM transition of the system being carried out.
  *
  *	Prevent device drivers from receiving interrupts and call the "late"
@@ -610,7 +610,7 @@ static int suspend_device_noirq(struct device *dev, pm_message_t state)
  *
  *	Must be called under dpm_list_mtx.
  */
-int device_power_down(pm_message_t state)
+int device_suspend_noirq(pm_message_t state)
 {
 	struct device *dev;
 	int error = 0;
@@ -618,7 +618,7 @@ int device_power_down(pm_message_t state)
 	suspend_device_irqs();
 	mutex_lock(&dpm_list_mtx);
 	list_for_each_entry_reverse(dev, &dpm_list, power.entry) {
-		error = suspend_device_noirq(dev, state);
+		error = __device_suspend_noirq(dev, state);
 		if (error) {
 			pm_dev_err(dev, state, " late", error);
 			break;
@@ -627,10 +627,10 @@ int device_power_down(pm_message_t state)
 	}
 	mutex_unlock(&dpm_list_mtx);
 	if (error)
-		device_power_up(resume_event(state));
+		device_resume_noirq(resume_event(state));
 	return error;
 }
-EXPORT_SYMBOL_GPL(device_power_down);
+EXPORT_SYMBOL_GPL(device_suspend_noirq);
 
 /**
  *	suspend_device - Save state of one device.
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index fddc2025dec..d5b327ac403 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -43,7 +43,7 @@ static int xen_suspend(void *data)
 	if (err) {
 		printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n",
 			err);
-		device_power_up(PMSG_RESUME);
+		device_resume_noirq(PMSG_RESUME);
 		return err;
 	}
 
@@ -69,7 +69,7 @@ static int xen_suspend(void *data)
 	}
 
 	sysdev_resume();
-	device_power_up(PMSG_RESUME);
+	device_resume_noirq(PMSG_RESUME);
 
 	return 0;
 }
@@ -101,9 +101,9 @@ static void do_suspend(void)
 	printk(KERN_DEBUG "suspending xenstore...\n");
 	xs_suspend();
 
-	err = device_power_down(PMSG_SUSPEND);
+	err = device_suspend_noirq(PMSG_SUSPEND);
 	if (err) {
-		printk(KERN_ERR "device_power_down failed: %d\n", err);
+		printk(KERN_ERR "device_suspend_noirq failed: %d\n", err);
 		goto resume_devices;
 	}
 
@@ -119,7 +119,7 @@ static void do_suspend(void)
 	} else
 		xs_suspend_cancel();
 
-	device_power_up(PMSG_RESUME);
+	device_resume_noirq(PMSG_RESUME);
 
 resume_devices:
 	device_resume(PMSG_RESUME);
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 1d4e2d28982..2170252074f 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -382,12 +382,12 @@ struct dev_pm_info {
 #ifdef CONFIG_PM_SLEEP
 extern void device_pm_lock(void);
 extern int sysdev_resume(void);
-extern void device_power_up(pm_message_t state);
+extern void device_resume_noirq(pm_message_t state);
 extern void device_resume(pm_message_t state);
 
 extern void device_pm_unlock(void);
 extern int sysdev_suspend(pm_message_t state);
-extern int device_power_down(pm_message_t state);
+extern int device_suspend_noirq(pm_message_t state);
 extern int device_suspend(pm_message_t state);
 extern int device_prepare_suspend(pm_message_t state);
 
diff --git a/kernel/kexec.c b/kernel/kexec.c
index e4983770913..5a3da87adae 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1452,13 +1452,13 @@ int kernel_kexec(void)
 		if (error)
 			goto Resume_console;
 		/* At this point, device_suspend() has been called,
-		 * but *not* device_power_down(). We *must*
-		 * device_power_down() now.  Otherwise, drivers for
+		 * but *not* device_suspend_noirq(). We *must* call
+		 * device_suspend_noirq() now.  Otherwise, drivers for
 		 * some devices (e.g. interrupt controllers) become
 		 * desynchronized with the actual state of the
 		 * hardware at resume time, and evil weirdness ensues.
 		 */
-		error = device_power_down(PMSG_FREEZE);
+		error = device_suspend_noirq(PMSG_FREEZE);
 		if (error)
 			goto Resume_devices;
 		error = disable_nonboot_cpus();
@@ -1486,7 +1486,7 @@ int kernel_kexec(void)
 		local_irq_enable();
  Enable_cpus:
 		enable_nonboot_cpus();
-		device_power_up(PMSG_RESTORE);
+		device_resume_noirq(PMSG_RESTORE);
  Resume_devices:
 		device_resume(PMSG_RESTORE);
  Resume_console:
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 5cb080e7eeb..1c18bc894a2 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -216,12 +216,12 @@ static int create_image(int platform_mode)
 		return error;
 
 	/* At this point, device_suspend() has been called, but *not*
-	 * device_power_down(). We *must* call device_power_down() now.
+	 * device_suspend_noirq(). We *must* call device_suspend_noirq() now.
 	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
 	 * become desynchronized with the actual state of the hardware
 	 * at resume time, and evil weirdness ensues.
 	 */
-	error = device_power_down(PMSG_FREEZE);
+	error = device_suspend_noirq(PMSG_FREEZE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting hibernation\n");
@@ -262,7 +262,7 @@ static int create_image(int platform_mode)
 
  Power_up:
 	sysdev_resume();
-	/* NOTE:  device_power_up() is just a resume() for devices
+	/* NOTE:  device_resume_noirq() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
 
@@ -275,7 +275,7 @@ static int create_image(int platform_mode)
  Platform_finish:
 	platform_finish(platform_mode);
 
-	device_power_up(in_suspend ?
+	device_resume_noirq(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
 
 	return error;
@@ -339,7 +339,7 @@ static int resume_target_kernel(bool platform_mode)
 {
 	int error;
 
-	error = device_power_down(PMSG_QUIESCE);
+	error = device_suspend_noirq(PMSG_QUIESCE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting resume\n");
@@ -394,7 +394,7 @@ static int resume_target_kernel(bool platform_mode)
  Cleanup:
 	platform_restore_cleanup(platform_mode);
 
-	device_power_up(PMSG_RECOVER);
+	device_resume_noirq(PMSG_RECOVER);
 
 	return error;
 }
@@ -454,7 +454,7 @@ int hibernation_platform_enter(void)
 		goto Resume_devices;
 	}
 
-	error = device_power_down(PMSG_HIBERNATE);
+	error = device_suspend_noirq(PMSG_HIBERNATE);
 	if (error)
 		goto Resume_devices;
 
@@ -479,7 +479,7 @@ int hibernation_platform_enter(void)
  Platofrm_finish:
 	hibernation_ops->finish();
 
-	device_power_up(PMSG_RESTORE);
+	device_suspend_noirq(PMSG_RESTORE);
 
  Resume_devices:
 	entering_platform_hibernation = false;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 868028280d1..2f6638ee03c 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -295,7 +295,7 @@ static int suspend_enter(suspend_state_t state)
 			return error;
 	}
 
-	error = device_power_down(PMSG_SUSPEND);
+	error = device_suspend_noirq(PMSG_SUSPEND);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down\n");
 		goto Platfrom_finish;
@@ -335,7 +335,7 @@ static int suspend_enter(suspend_state_t state)
 		suspend_ops->wake();
 
  Power_up_devices:
-	device_power_up(PMSG_RESUME);
+	device_resume_noirq(PMSG_RESUME);
 
  Platfrom_finish:
 	if (suspend_ops->finish)
-- 
cgit v1.2.3-70-g09d2


From d161630297a20802d01c55847bfcba85d2118a9f Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Sun, 24 May 2009 22:05:42 +0200
Subject: PM core: rename suspend and resume functions

This patch (as1241) renames a bunch of functions in the PM core.
Rather than go through a boring list of name changes, suffice it to
say that in the end we have a bunch of pairs of functions:

	device_resume_noirq	dpm_resume_noirq
	device_resume		dpm_resume
	device_complete		dpm_complete
	device_suspend_noirq	dpm_suspend_noirq
	device_suspend		dpm_suspend
	device_prepare		dpm_prepare

in which device_X does the X operation on a single device and dpm_X
invokes device_X for all devices in the dpm_list.

In addition, the old dpm_power_up and device_resume_noirq have been
combined into a single function (dpm_resume_noirq).

Lastly, dpm_suspend_start and dpm_resume_end are the renamed versions
of the former top-level device_suspend and device_resume routines.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 arch/x86/kernel/apm_32.c  | 14 ++++-----
 drivers/base/power/main.c | 80 ++++++++++++++++++++---------------------------
 drivers/xen/manage.c      | 16 +++++-----
 include/linux/pm.h        | 11 +++----
 kernel/kexec.c            | 14 ++++-----
 kernel/power/disk.c       | 30 +++++++++---------
 kernel/power/main.c       |  8 ++---
 7 files changed, 80 insertions(+), 93 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 31ae547da15..79302e9a33a 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1233,9 +1233,9 @@ static int suspend(int vetoable)
 	int err;
 	struct apm_user	*as;
 
-	device_suspend(PMSG_SUSPEND);
+	dpm_suspend_start(PMSG_SUSPEND);
 
-	device_suspend_noirq(PMSG_SUSPEND);
+	dpm_suspend_noirq(PMSG_SUSPEND);
 
 	local_irq_disable();
 	sysdev_suspend(PMSG_SUSPEND);
@@ -1259,9 +1259,9 @@ static int suspend(int vetoable)
 	sysdev_resume();
 	local_irq_enable();
 
-	device_resume_noirq(PMSG_RESUME);
+	dpm_resume_noirq(PMSG_RESUME);
 
-	device_resume(PMSG_RESUME);
+	dpm_resume_end(PMSG_RESUME);
 	queue_event(APM_NORMAL_RESUME, NULL);
 	spin_lock(&user_list_lock);
 	for (as = user_list; as != NULL; as = as->next) {
@@ -1277,7 +1277,7 @@ static void standby(void)
 {
 	int err;
 
-	device_suspend_noirq(PMSG_SUSPEND);
+	dpm_suspend_noirq(PMSG_SUSPEND);
 
 	local_irq_disable();
 	sysdev_suspend(PMSG_SUSPEND);
@@ -1291,7 +1291,7 @@ static void standby(void)
 	sysdev_resume();
 	local_irq_enable();
 
-	device_resume_noirq(PMSG_RESUME);
+	dpm_resume_noirq(PMSG_RESUME);
 }
 
 static apm_event_t get_event(void)
@@ -1376,7 +1376,7 @@ static void check_events(void)
 			ignore_bounce = 1;
 			if ((event != APM_NORMAL_RESUME)
 			    || (ignore_normal_resume == 0)) {
-				device_resume(PMSG_RESUME);
+				dpm_resume_end(PMSG_RESUME);
 				queue_event(event, NULL);
 			}
 			ignore_normal_resume = 0;
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index c5a35bc9d63..1f3d82260db 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -315,13 +315,13 @@ static void pm_dev_err(struct device *dev, pm_message_t state, char *info,
 /*------------------------- Resume routines -------------------------*/
 
 /**
- *	__device_resume_noirq - Power on one device (early resume).
+ *	device_resume_noirq - Power on one device (early resume).
  *	@dev:	Device.
  *	@state: PM transition of the system being carried out.
  *
  *	Must be called with interrupts disabled.
  */
-static int __device_resume_noirq(struct device *dev, pm_message_t state)
+static int device_resume_noirq(struct device *dev, pm_message_t state)
 {
 	int error = 0;
 
@@ -344,16 +344,16 @@ static int __device_resume_noirq(struct device *dev, pm_message_t state)
 }
 
 /**
- *	dpm_power_up - Power on all regular (non-sysdev) devices.
+ *	dpm_resume_noirq - Power on all regular (non-sysdev) devices.
  *	@state: PM transition of the system being carried out.
  *
- *	Execute the appropriate "noirq resume" callback for all devices marked
- *	as DPM_OFF_IRQ.
+ *	Call the "noirq" resume handlers for all devices marked as
+ *	DPM_OFF_IRQ and enable device drivers to receive interrupts.
  *
  *	Must be called under dpm_list_mtx.  Device drivers should not receive
  *	interrupts while it's being executed.
  */
-static void dpm_power_up(pm_message_t state)
+void dpm_resume_noirq(pm_message_t state)
 {
 	struct device *dev;
 
@@ -363,33 +363,21 @@ static void dpm_power_up(pm_message_t state)
 			int error;
 
 			dev->power.status = DPM_OFF;
-			error = __device_resume_noirq(dev, state);
+			error = device_resume_noirq(dev, state);
 			if (error)
 				pm_dev_err(dev, state, " early", error);
 		}
 	mutex_unlock(&dpm_list_mtx);
-}
-
-/**
- *	device_resume_noirq - Turn on all devices that need special attention.
- *	@state: PM transition of the system being carried out.
- *
- *	Call the "early" resume handlers and enable device drivers to receive
- *	interrupts.
- */
-void device_resume_noirq(pm_message_t state)
-{
-	dpm_power_up(state);
 	resume_device_irqs();
 }
-EXPORT_SYMBOL_GPL(device_resume_noirq);
+EXPORT_SYMBOL_GPL(dpm_resume_noirq);
 
 /**
- *	resume_device - Restore state for one device.
+ *	device_resume - Restore state for one device.
  *	@dev:	Device.
  *	@state: PM transition of the system being carried out.
  */
-static int resume_device(struct device *dev, pm_message_t state)
+static int device_resume(struct device *dev, pm_message_t state)
 {
 	int error = 0;
 
@@ -462,7 +450,7 @@ static void dpm_resume(pm_message_t state)
 			dev->power.status = DPM_RESUMING;
 			mutex_unlock(&dpm_list_mtx);
 
-			error = resume_device(dev, state);
+			error = device_resume(dev, state);
 
 			mutex_lock(&dpm_list_mtx);
 			if (error)
@@ -480,11 +468,11 @@ static void dpm_resume(pm_message_t state)
 }
 
 /**
- *	complete_device - Complete a PM transition for given device
+ *	device_complete - Complete a PM transition for given device
  *	@dev:	Device.
  *	@state: PM transition of the system being carried out.
  */
-static void complete_device(struct device *dev, pm_message_t state)
+static void device_complete(struct device *dev, pm_message_t state)
 {
 	down(&dev->sem);
 
@@ -527,7 +515,7 @@ static void dpm_complete(pm_message_t state)
 			dev->power.status = DPM_ON;
 			mutex_unlock(&dpm_list_mtx);
 
-			complete_device(dev, state);
+			device_complete(dev, state);
 
 			mutex_lock(&dpm_list_mtx);
 		}
@@ -540,19 +528,19 @@ static void dpm_complete(pm_message_t state)
 }
 
 /**
- *	device_resume - Restore state of each device in system.
+ *	dpm_resume_end - Restore state of each device in system.
  *	@state: PM transition of the system being carried out.
  *
  *	Resume all the devices, unlock them all, and allow new
  *	devices to be registered once again.
  */
-void device_resume(pm_message_t state)
+void dpm_resume_end(pm_message_t state)
 {
 	might_sleep();
 	dpm_resume(state);
 	dpm_complete(state);
 }
-EXPORT_SYMBOL_GPL(device_resume);
+EXPORT_SYMBOL_GPL(dpm_resume_end);
 
 
 /*------------------------- Suspend routines -------------------------*/
@@ -577,13 +565,13 @@ static pm_message_t resume_event(pm_message_t sleep_state)
 }
 
 /**
- *	__device_suspend_noirq - Shut down one device (late suspend).
+ *	device_suspend_noirq - Shut down one device (late suspend).
  *	@dev:	Device.
  *	@state: PM transition of the system being carried out.
  *
  *	This is called with interrupts off and only a single CPU running.
  */
-static int __device_suspend_noirq(struct device *dev, pm_message_t state)
+static int device_suspend_noirq(struct device *dev, pm_message_t state)
 {
 	int error = 0;
 
@@ -602,15 +590,15 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state)
 }
 
 /**
- *	device_suspend_noirq - Shut down special devices.
+ *	dpm_suspend_noirq - Power down all regular (non-sysdev) devices.
  *	@state: PM transition of the system being carried out.
  *
- *	Prevent device drivers from receiving interrupts and call the "late"
+ *	Prevent device drivers from receiving interrupts and call the "noirq"
  *	suspend handlers.
  *
  *	Must be called under dpm_list_mtx.
  */
-int device_suspend_noirq(pm_message_t state)
+int dpm_suspend_noirq(pm_message_t state)
 {
 	struct device *dev;
 	int error = 0;
@@ -618,7 +606,7 @@ int device_suspend_noirq(pm_message_t state)
 	suspend_device_irqs();
 	mutex_lock(&dpm_list_mtx);
 	list_for_each_entry_reverse(dev, &dpm_list, power.entry) {
-		error = __device_suspend_noirq(dev, state);
+		error = device_suspend_noirq(dev, state);
 		if (error) {
 			pm_dev_err(dev, state, " late", error);
 			break;
@@ -627,17 +615,17 @@ int device_suspend_noirq(pm_message_t state)
 	}
 	mutex_unlock(&dpm_list_mtx);
 	if (error)
-		device_resume_noirq(resume_event(state));
+		dpm_resume_noirq(resume_event(state));
 	return error;
 }
-EXPORT_SYMBOL_GPL(device_suspend_noirq);
+EXPORT_SYMBOL_GPL(dpm_suspend_noirq);
 
 /**
- *	suspend_device - Save state of one device.
+ *	device_suspend - Save state of one device.
  *	@dev:	Device.
  *	@state: PM transition of the system being carried out.
  */
-static int suspend_device(struct device *dev, pm_message_t state)
+static int device_suspend(struct device *dev, pm_message_t state)
 {
 	int error = 0;
 
@@ -704,7 +692,7 @@ static int dpm_suspend(pm_message_t state)
 		get_device(dev);
 		mutex_unlock(&dpm_list_mtx);
 
-		error = suspend_device(dev, state);
+		error = device_suspend(dev, state);
 
 		mutex_lock(&dpm_list_mtx);
 		if (error) {
@@ -723,11 +711,11 @@ static int dpm_suspend(pm_message_t state)
 }
 
 /**
- *	prepare_device - Execute the ->prepare() callback(s) for given device.
+ *	device_prepare - Execute the ->prepare() callback(s) for given device.
  *	@dev:	Device.
  *	@state: PM transition of the system being carried out.
  */
-static int prepare_device(struct device *dev, pm_message_t state)
+static int device_prepare(struct device *dev, pm_message_t state)
 {
 	int error = 0;
 
@@ -781,7 +769,7 @@ static int dpm_prepare(pm_message_t state)
 		dev->power.status = DPM_PREPARING;
 		mutex_unlock(&dpm_list_mtx);
 
-		error = prepare_device(dev, state);
+		error = device_prepare(dev, state);
 
 		mutex_lock(&dpm_list_mtx);
 		if (error) {
@@ -807,12 +795,12 @@ static int dpm_prepare(pm_message_t state)
 }
 
 /**
- *	device_suspend - Save state and stop all devices in system.
+ *	dpm_suspend_start - Save state and stop all devices in system.
  *	@state: PM transition of the system being carried out.
  *
  *	Prepare and suspend all devices.
  */
-int device_suspend(pm_message_t state)
+int dpm_suspend_start(pm_message_t state)
 {
 	int error;
 
@@ -822,7 +810,7 @@ int device_suspend(pm_message_t state)
 		error = dpm_suspend(state);
 	return error;
 }
-EXPORT_SYMBOL_GPL(device_suspend);
+EXPORT_SYMBOL_GPL(dpm_suspend_start);
 
 void __suspend_report_result(const char *function, void *fn, int ret)
 {
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index d5b327ac403..10d03d7931c 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -43,7 +43,7 @@ static int xen_suspend(void *data)
 	if (err) {
 		printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n",
 			err);
-		device_resume_noirq(PMSG_RESUME);
+		dpm_resume_noirq(PMSG_RESUME);
 		return err;
 	}
 
@@ -69,7 +69,7 @@ static int xen_suspend(void *data)
 	}
 
 	sysdev_resume();
-	device_resume_noirq(PMSG_RESUME);
+	dpm_resume_noirq(PMSG_RESUME);
 
 	return 0;
 }
@@ -92,18 +92,18 @@ static void do_suspend(void)
 	}
 #endif
 
-	err = device_suspend(PMSG_SUSPEND);
+	err = dpm_suspend_start(PMSG_SUSPEND);
 	if (err) {
-		printk(KERN_ERR "xen suspend: device_suspend %d\n", err);
+		printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err);
 		goto out;
 	}
 
 	printk(KERN_DEBUG "suspending xenstore...\n");
 	xs_suspend();
 
-	err = device_suspend_noirq(PMSG_SUSPEND);
+	err = dpm_suspend_noirq(PMSG_SUSPEND);
 	if (err) {
-		printk(KERN_ERR "device_suspend_noirq failed: %d\n", err);
+		printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err);
 		goto resume_devices;
 	}
 
@@ -119,10 +119,10 @@ static void do_suspend(void)
 	} else
 		xs_suspend_cancel();
 
-	device_resume_noirq(PMSG_RESUME);
+	dpm_resume_noirq(PMSG_RESUME);
 
 resume_devices:
-	device_resume(PMSG_RESUME);
+	dpm_resume_end(PMSG_RESUME);
 
 	/* Make sure timer events get retriggered on all CPUs */
 	clock_was_set();
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 2170252074f..b3f74764a58 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -382,14 +382,13 @@ struct dev_pm_info {
 #ifdef CONFIG_PM_SLEEP
 extern void device_pm_lock(void);
 extern int sysdev_resume(void);
-extern void device_resume_noirq(pm_message_t state);
-extern void device_resume(pm_message_t state);
+extern void dpm_resume_noirq(pm_message_t state);
+extern void dpm_resume_end(pm_message_t state);
 
 extern void device_pm_unlock(void);
 extern int sysdev_suspend(pm_message_t state);
-extern int device_suspend_noirq(pm_message_t state);
-extern int device_suspend(pm_message_t state);
-extern int device_prepare_suspend(pm_message_t state);
+extern int dpm_suspend_noirq(pm_message_t state);
+extern int dpm_suspend_start(pm_message_t state);
 
 extern void __suspend_report_result(const char *function, void *fn, int ret);
 
@@ -403,7 +402,7 @@ extern void __suspend_report_result(const char *function, void *fn, int ret);
 #define device_pm_lock() do {} while (0)
 #define device_pm_unlock() do {} while (0)
 
-static inline int device_suspend(pm_message_t state)
+static inline int dpm_suspend_start(pm_message_t state)
 {
 	return 0;
 }
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5a3da87adae..ae1c35201cc 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1448,17 +1448,17 @@ int kernel_kexec(void)
 			goto Restore_console;
 		}
 		suspend_console();
-		error = device_suspend(PMSG_FREEZE);
+		error = dpm_suspend_start(PMSG_FREEZE);
 		if (error)
 			goto Resume_console;
-		/* At this point, device_suspend() has been called,
-		 * but *not* device_suspend_noirq(). We *must* call
-		 * device_suspend_noirq() now.  Otherwise, drivers for
+		/* At this point, dpm_suspend_start() has been called,
+		 * but *not* dpm_suspend_noirq(). We *must* call
+		 * dpm_suspend_noirq() now.  Otherwise, drivers for
 		 * some devices (e.g. interrupt controllers) become
 		 * desynchronized with the actual state of the
 		 * hardware at resume time, and evil weirdness ensues.
 		 */
-		error = device_suspend_noirq(PMSG_FREEZE);
+		error = dpm_suspend_noirq(PMSG_FREEZE);
 		if (error)
 			goto Resume_devices;
 		error = disable_nonboot_cpus();
@@ -1486,9 +1486,9 @@ int kernel_kexec(void)
 		local_irq_enable();
  Enable_cpus:
 		enable_nonboot_cpus();
-		device_resume_noirq(PMSG_RESTORE);
+		dpm_resume_noirq(PMSG_RESTORE);
  Resume_devices:
-		device_resume(PMSG_RESTORE);
+		dpm_resume_end(PMSG_RESTORE);
  Resume_console:
 		resume_console();
 		thaw_processes();
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 1c18bc894a2..a9beba68b6c 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -215,13 +215,13 @@ static int create_image(int platform_mode)
 	if (error)
 		return error;
 
-	/* At this point, device_suspend() has been called, but *not*
-	 * device_suspend_noirq(). We *must* call device_suspend_noirq() now.
+	/* At this point, dpm_suspend_start() has been called, but *not*
+	 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
 	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
 	 * become desynchronized with the actual state of the hardware
 	 * at resume time, and evil weirdness ensues.
 	 */
-	error = device_suspend_noirq(PMSG_FREEZE);
+	error = dpm_suspend_noirq(PMSG_FREEZE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting hibernation\n");
@@ -262,7 +262,7 @@ static int create_image(int platform_mode)
 
  Power_up:
 	sysdev_resume();
-	/* NOTE:  device_resume_noirq() is just a resume() for devices
+	/* NOTE:  dpm_resume_noirq() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
 
@@ -275,7 +275,7 @@ static int create_image(int platform_mode)
  Platform_finish:
 	platform_finish(platform_mode);
 
-	device_resume_noirq(in_suspend ?
+	dpm_resume_noirq(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
 
 	return error;
@@ -304,7 +304,7 @@ int hibernation_snapshot(int platform_mode)
 		goto Close;
 
 	suspend_console();
-	error = device_suspend(PMSG_FREEZE);
+	error = dpm_suspend_start(PMSG_FREEZE);
 	if (error)
 		goto Recover_platform;
 
@@ -315,7 +315,7 @@ int hibernation_snapshot(int platform_mode)
 	/* Control returns here after successful restore */
 
  Resume_devices:
-	device_resume(in_suspend ?
+	dpm_resume_end(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
 	resume_console();
  Close:
@@ -339,7 +339,7 @@ static int resume_target_kernel(bool platform_mode)
 {
 	int error;
 
-	error = device_suspend_noirq(PMSG_QUIESCE);
+	error = dpm_suspend_noirq(PMSG_QUIESCE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting resume\n");
@@ -394,7 +394,7 @@ static int resume_target_kernel(bool platform_mode)
  Cleanup:
 	platform_restore_cleanup(platform_mode);
 
-	device_resume_noirq(PMSG_RECOVER);
+	dpm_resume_noirq(PMSG_RECOVER);
 
 	return error;
 }
@@ -414,10 +414,10 @@ int hibernation_restore(int platform_mode)
 
 	pm_prepare_console();
 	suspend_console();
-	error = device_suspend(PMSG_QUIESCE);
+	error = dpm_suspend_start(PMSG_QUIESCE);
 	if (!error) {
 		error = resume_target_kernel(platform_mode);
-		device_resume(PMSG_RECOVER);
+		dpm_resume_end(PMSG_RECOVER);
 	}
 	resume_console();
 	pm_restore_console();
@@ -447,14 +447,14 @@ int hibernation_platform_enter(void)
 
 	entering_platform_hibernation = true;
 	suspend_console();
-	error = device_suspend(PMSG_HIBERNATE);
+	error = dpm_suspend_start(PMSG_HIBERNATE);
 	if (error) {
 		if (hibernation_ops->recover)
 			hibernation_ops->recover();
 		goto Resume_devices;
 	}
 
-	error = device_suspend_noirq(PMSG_HIBERNATE);
+	error = dpm_suspend_noirq(PMSG_HIBERNATE);
 	if (error)
 		goto Resume_devices;
 
@@ -479,11 +479,11 @@ int hibernation_platform_enter(void)
  Platofrm_finish:
 	hibernation_ops->finish();
 
-	device_suspend_noirq(PMSG_RESTORE);
+	dpm_resume_noirq(PMSG_RESTORE);
 
  Resume_devices:
 	entering_platform_hibernation = false;
-	device_resume(PMSG_RESTORE);
+	dpm_resume_end(PMSG_RESTORE);
 	resume_console();
 
  Close:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 2f6638ee03c..46386b9f8dd 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -295,7 +295,7 @@ static int suspend_enter(suspend_state_t state)
 			return error;
 	}
 
-	error = device_suspend_noirq(PMSG_SUSPEND);
+	error = dpm_suspend_noirq(PMSG_SUSPEND);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down\n");
 		goto Platfrom_finish;
@@ -335,7 +335,7 @@ static int suspend_enter(suspend_state_t state)
 		suspend_ops->wake();
 
  Power_up_devices:
-	device_resume_noirq(PMSG_RESUME);
+	dpm_resume_noirq(PMSG_RESUME);
 
  Platfrom_finish:
 	if (suspend_ops->finish)
@@ -363,7 +363,7 @@ int suspend_devices_and_enter(suspend_state_t state)
 	}
 	suspend_console();
 	suspend_test_start();
-	error = device_suspend(PMSG_SUSPEND);
+	error = dpm_suspend_start(PMSG_SUSPEND);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to suspend\n");
 		goto Recover_platform;
@@ -376,7 +376,7 @@ int suspend_devices_and_enter(suspend_state_t state)
 
  Resume_devices:
 	suspend_test_start();
-	device_resume(PMSG_RESUME);
+	dpm_resume_end(PMSG_RESUME);
 	suspend_test_finish("resume devices");
 	resume_console();
  Close:
-- 
cgit v1.2.3-70-g09d2


From c6f37f12197ac3bd2e5a35f2f0e195ae63d437de Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 24 May 2009 22:16:31 +0200
Subject: PM/Suspend: Do not shrink memory before suspend

Remove the shrinking of memory from the suspend-to-RAM code, where
it is not really necessary.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Nigel Cunningham <nigel@tuxonice.net>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
---
 kernel/power/main.c | 20 +-------------------
 mm/vmscan.c         |  4 ++--
 2 files changed, 3 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 46386b9f8dd..2a19f347bd8 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -188,9 +188,6 @@ static void suspend_test_finish(const char *label)
 
 #endif
 
-/* This is just an arbitrary number */
-#define FREE_PAGE_NUMBER (100)
-
 static struct platform_suspend_ops *suspend_ops;
 
 /**
@@ -226,7 +223,6 @@ int suspend_valid_only_mem(suspend_state_t state)
 static int suspend_prepare(void)
 {
 	int error;
-	unsigned int free_pages;
 
 	if (!suspend_ops || !suspend_ops->enter)
 		return -EPERM;
@@ -241,24 +237,10 @@ static int suspend_prepare(void)
 	if (error)
 		goto Finish;
 
-	if (suspend_freeze_processes()) {
-		error = -EAGAIN;
-		goto Thaw;
-	}
-
-	free_pages = global_page_state(NR_FREE_PAGES);
-	if (free_pages < FREE_PAGE_NUMBER) {
-		pr_debug("PM: free some memory\n");
-		shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
-		if (nr_free_pages() < FREE_PAGE_NUMBER) {
-			error = -ENOMEM;
-			printk(KERN_ERR "PM: No enough memory\n");
-		}
-	}
+	error = suspend_freeze_processes();
 	if (!error)
 		return 0;
 
- Thaw:
 	suspend_thaw_processes();
 	usermodehelper_enable();
  Finish:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d254306562c..95c08a8cc2b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2056,7 +2056,7 @@ unsigned long global_lru_pages(void)
 		+ global_page_state(NR_INACTIVE_FILE);
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_HIBERNATION
 /*
  * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
  * from LRU lists system-wide, for given pass and priority.
@@ -2196,7 +2196,7 @@ out:
 
 	return sc.nr_reclaimed;
 }
-#endif
+#endif /* CONFIG_HIBERNATION */
 
 /* It's optimal to keep kswapds on the same CPUs as their memory, but
    not required for correctness.  So if the last cpu in a node goes
-- 
cgit v1.2.3-70-g09d2


From fe419535d82724314bbf1244a0e740e4ea1bd3ae Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 11 Jun 2009 23:11:17 +0200
Subject: PM/Hibernate: Move memory shrinking to snapshot.c (rev. 2)

A future patch is going to modify the memory shrinking code so that
it will make memory allocations to free memory instead of using an
artificial memory shrinking mechanism for that.  For this purpose it
is convenient to move swsusp_shrink_memory() from
kernel/power/swsusp.c to kernel/power/snapshot.c, because the new
memory-shrinking code is going to use things that are local to
kernel/power/snapshot.c .

[rev. 2: Make some functions static and remove their headers from
 kernel/power/power.h]

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
---
 kernel/power/power.h    |  4 +--
 kernel/power/snapshot.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/power/swsusp.c   | 76 ----------------------------------------------
 3 files changed, 79 insertions(+), 81 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46b5ec7a3af..ec4dbdfb07b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
 
 extern int create_basic_memory_bitmaps(void);
 extern void free_basic_memory_bitmaps(void);
-extern unsigned int count_data_pages(void);
+extern int swsusp_shrink_memory(void);
 
 /**
  *	Auxiliary structure used for reading the snapshot image data and
@@ -149,7 +149,6 @@ extern int swsusp_swap_in_use(void);
 
 /* kernel/power/disk.c */
 extern int swsusp_check(void);
-extern int swsusp_shrink_memory(void);
 extern void swsusp_free(void);
 extern int swsusp_read(unsigned int *flags_p);
 extern int swsusp_write(unsigned int flags);
@@ -176,7 +175,6 @@ extern int pm_notifier_call_chain(unsigned long val);
 #endif
 
 #ifdef CONFIG_HIGHMEM
-unsigned int count_highmem_pages(void);
 int restore_highmem(void);
 #else
 static inline unsigned int count_highmem_pages(void) { return 0; }
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 33e2e4a819f..523a451b45d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -39,6 +39,14 @@ static int swsusp_page_is_free(struct page *);
 static void swsusp_set_page_forbidden(struct page *);
 static void swsusp_unset_page_forbidden(struct page *);
 
+/*
+ * Preferred image size in bytes (tunable via /sys/power/image_size).
+ * When it is set to N, swsusp will do its best to ensure the image
+ * size will not exceed N bytes, but if that is impossible, it will
+ * try to create the smallest image possible.
+ */
+unsigned long image_size = 500 * 1024 * 1024;
+
 /* List of PBEs needed for restoring the pages that were allocated before
  * the suspend and included in the suspend image, but have also been
  * allocated by the "resume" kernel, so their contents cannot be written
@@ -840,7 +848,7 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
  *	pages.
  */
 
-unsigned int count_highmem_pages(void)
+static unsigned int count_highmem_pages(void)
 {
 	struct zone *zone;
 	unsigned int n = 0;
@@ -902,7 +910,7 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
  *	pages.
  */
 
-unsigned int count_data_pages(void)
+static unsigned int count_data_pages(void)
 {
 	struct zone *zone;
 	unsigned long pfn, max_zone_pfn;
@@ -1058,6 +1066,74 @@ void swsusp_free(void)
 	buffer = NULL;
 }
 
+/**
+ *	swsusp_shrink_memory -  Try to free as much memory as needed
+ *
+ *	... but do not OOM-kill anyone
+ *
+ *	Notice: all userland should be stopped before it is called, or
+ *	livelock is possible.
+ */
+
+#define SHRINK_BITE	10000
+static inline unsigned long __shrink_memory(long tmp)
+{
+	if (tmp > SHRINK_BITE)
+		tmp = SHRINK_BITE;
+	return shrink_all_memory(tmp);
+}
+
+int swsusp_shrink_memory(void)
+{
+	long tmp;
+	struct zone *zone;
+	unsigned long pages = 0;
+	unsigned int i = 0;
+	char *p = "-\\|/";
+	struct timeval start, stop;
+
+	printk(KERN_INFO "PM: Shrinking memory...  ");
+	do_gettimeofday(&start);
+	do {
+		long size, highmem_size;
+
+		highmem_size = count_highmem_pages();
+		size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
+		tmp = size;
+		size += highmem_size;
+		for_each_populated_zone(zone) {
+			tmp += snapshot_additional_pages(zone);
+			if (is_highmem(zone)) {
+				highmem_size -=
+					zone_page_state(zone, NR_FREE_PAGES);
+			} else {
+				tmp -= zone_page_state(zone, NR_FREE_PAGES);
+				tmp += zone->lowmem_reserve[ZONE_NORMAL];
+			}
+		}
+
+		if (highmem_size < 0)
+			highmem_size = 0;
+
+		tmp += highmem_size;
+		if (tmp > 0) {
+			tmp = __shrink_memory(tmp);
+			if (!tmp)
+				return -ENOMEM;
+			pages += tmp;
+		} else if (size > image_size / PAGE_SIZE) {
+			tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
+			pages += tmp;
+		}
+		printk("\b%c", p[i++%4]);
+	} while (tmp > 0);
+	do_gettimeofday(&stop);
+	printk("\bdone (%lu pages freed)\n", pages);
+	swsusp_show_speed(&start, &stop, pages, "Freed");
+
+	return 0;
+}
+
 #ifdef CONFIG_HIGHMEM
 /**
   *	count_pages_for_highmem - compute the number of non-highmem pages
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 78c35047586..87b901cb392 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -55,14 +55,6 @@
 
 #include "power.h"
 
-/*
- * Preferred image size in bytes (tunable via /sys/power/image_size).
- * When it is set to N, swsusp will do its best to ensure the image
- * size will not exceed N bytes, but if that is impossible, it will
- * try to create the smallest image possible.
- */
-unsigned long image_size = 500 * 1024 * 1024;
-
 int in_suspend __nosavedata = 0;
 
 /**
@@ -195,74 +187,6 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
 			kps / 1000, (kps % 1000) / 10);
 }
 
-/**
- *	swsusp_shrink_memory -  Try to free as much memory as needed
- *
- *	... but do not OOM-kill anyone
- *
- *	Notice: all userland should be stopped before it is called, or
- *	livelock is possible.
- */
-
-#define SHRINK_BITE	10000
-static inline unsigned long __shrink_memory(long tmp)
-{
-	if (tmp > SHRINK_BITE)
-		tmp = SHRINK_BITE;
-	return shrink_all_memory(tmp);
-}
-
-int swsusp_shrink_memory(void)
-{
-	long tmp;
-	struct zone *zone;
-	unsigned long pages = 0;
-	unsigned int i = 0;
-	char *p = "-\\|/";
-	struct timeval start, stop;
-
-	printk(KERN_INFO "PM: Shrinking memory...  ");
-	do_gettimeofday(&start);
-	do {
-		long size, highmem_size;
-
-		highmem_size = count_highmem_pages();
-		size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
-		tmp = size;
-		size += highmem_size;
-		for_each_populated_zone(zone) {
-			tmp += snapshot_additional_pages(zone);
-			if (is_highmem(zone)) {
-				highmem_size -=
-					zone_page_state(zone, NR_FREE_PAGES);
-			} else {
-				tmp -= zone_page_state(zone, NR_FREE_PAGES);
-				tmp += zone->lowmem_reserve[ZONE_NORMAL];
-			}
-		}
-
-		if (highmem_size < 0)
-			highmem_size = 0;
-
-		tmp += highmem_size;
-		if (tmp > 0) {
-			tmp = __shrink_memory(tmp);
-			if (!tmp)
-				return -ENOMEM;
-			pages += tmp;
-		} else if (size > image_size / PAGE_SIZE) {
-			tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
-			pages += tmp;
-		}
-		printk("\b%c", p[i++%4]);
-	} while (tmp > 0);
-	do_gettimeofday(&stop);
-	printk("\bdone (%lu pages freed)\n", pages);
-	swsusp_show_speed(&start, &stop, pages, "Freed");
-
-	return 0;
-}
-
 /*
  * Platforms, like ACPI, may want us to save some memory used by them during
  * hibernation and to restore the contents of this memory during the subsequent
-- 
cgit v1.2.3-70-g09d2


From a9d7052363a6e06bb623ed1876c56c7ca5b2c6d8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 10 Jun 2009 01:27:12 +0200
Subject: PM: Separate suspend to RAM functionality from core

Move the suspend to RAM and standby code from kernel/power/main.c
to two separate files, kernel/power/suspend.c containing the basic
functions and kernel/power/suspend_test.c containing the automatic
suspend test facility based on the RTC clock alarm.

There are no changes in functionality related to these modifications.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
---
 kernel/power/Makefile       |   2 +
 kernel/power/main.c         | 503 --------------------------------------------
 kernel/power/power.h        |  17 +-
 kernel/power/suspend.c      | 300 ++++++++++++++++++++++++++
 kernel/power/suspend_test.c | 187 ++++++++++++++++
 5 files changed, 505 insertions(+), 504 deletions(-)
 create mode 100644 kernel/power/suspend.c
 create mode 100644 kernel/power/suspend_test.c

(limited to 'kernel')

diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 720ea4f781b..c4baf1b633c 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -6,6 +6,8 @@ endif
 obj-$(CONFIG_PM)		+= main.o
 obj-$(CONFIG_PM_SLEEP)		+= console.o
 obj-$(CONFIG_FREEZER)		+= process.o
+obj-$(CONFIG_SUSPEND)		+= suspend.o
+obj-$(CONFIG_PM_TEST_SUSPEND)	+= suspend_test.o
 obj-$(CONFIG_HIBERNATION)	+= swsusp.o disk.o snapshot.o swap.o user.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 2a19f347bd8..f710e36930c 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,20 +8,9 @@
  *
  */
 
-#include <linux/module.h>
-#include <linux/suspend.h>
 #include <linux/kobject.h>
 #include <linux/string.h>
-#include <linux/delay.h>
-#include <linux/errno.h>
-#include <linux/kmod.h>
-#include <linux/init.h>
-#include <linux/console.h>
-#include <linux/cpu.h>
 #include <linux/resume-trace.h>
-#include <linux/freezer.h>
-#include <linux/vmstat.h>
-#include <linux/syscalls.h>
 
 #include "power.h"
 
@@ -119,355 +108,6 @@ power_attr(pm_test);
 
 #endif /* CONFIG_PM_SLEEP */
 
-#ifdef CONFIG_SUSPEND
-
-static int suspend_test(int level)
-{
-#ifdef CONFIG_PM_DEBUG
-	if (pm_test_level == level) {
-		printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
-		mdelay(5000);
-		return 1;
-	}
-#endif /* !CONFIG_PM_DEBUG */
-	return 0;
-}
-
-#ifdef CONFIG_PM_TEST_SUSPEND
-
-/*
- * We test the system suspend code by setting an RTC wakealarm a short
- * time in the future, then suspending.  Suspending the devices won't
- * normally take long ... some systems only need a few milliseconds.
- *
- * The time it takes is system-specific though, so when we test this
- * during system bootup we allow a LOT of time.
- */
-#define TEST_SUSPEND_SECONDS	5
-
-static unsigned long suspend_test_start_time;
-
-static void suspend_test_start(void)
-{
-	/* FIXME Use better timebase than "jiffies", ideally a clocksource.
-	 * What we want is a hardware counter that will work correctly even
-	 * during the irqs-are-off stages of the suspend/resume cycle...
-	 */
-	suspend_test_start_time = jiffies;
-}
-
-static void suspend_test_finish(const char *label)
-{
-	long nj = jiffies - suspend_test_start_time;
-	unsigned msec;
-
-	msec = jiffies_to_msecs(abs(nj));
-	pr_info("PM: %s took %d.%03d seconds\n", label,
-			msec / 1000, msec % 1000);
-
-	/* Warning on suspend means the RTC alarm period needs to be
-	 * larger -- the system was sooo slooowwww to suspend that the
-	 * alarm (should have) fired before the system went to sleep!
-	 *
-	 * Warning on either suspend or resume also means the system
-	 * has some performance issues.  The stack dump of a WARN_ON
-	 * is more likely to get the right attention than a printk...
-	 */
-	WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
-}
-
-#else
-
-static void suspend_test_start(void)
-{
-}
-
-static void suspend_test_finish(const char *label)
-{
-}
-
-#endif
-
-static struct platform_suspend_ops *suspend_ops;
-
-/**
- *	suspend_set_ops - Set the global suspend method table.
- *	@ops:	Pointer to ops structure.
- */
-
-void suspend_set_ops(struct platform_suspend_ops *ops)
-{
-	mutex_lock(&pm_mutex);
-	suspend_ops = ops;
-	mutex_unlock(&pm_mutex);
-}
-
-/**
- * suspend_valid_only_mem - generic memory-only valid callback
- *
- * Platform drivers that implement mem suspend only and only need
- * to check for that in their .valid callback can use this instead
- * of rolling their own .valid callback.
- */
-int suspend_valid_only_mem(suspend_state_t state)
-{
-	return state == PM_SUSPEND_MEM;
-}
-
-/**
- *	suspend_prepare - Do prep work before entering low-power state.
- *
- *	This is common code that is called for each state that we're entering.
- *	Run suspend notifiers, allocate a console and stop all processes.
- */
-static int suspend_prepare(void)
-{
-	int error;
-
-	if (!suspend_ops || !suspend_ops->enter)
-		return -EPERM;
-
-	pm_prepare_console();
-
-	error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
-	if (error)
-		goto Finish;
-
-	error = usermodehelper_disable();
-	if (error)
-		goto Finish;
-
-	error = suspend_freeze_processes();
-	if (!error)
-		return 0;
-
-	suspend_thaw_processes();
-	usermodehelper_enable();
- Finish:
-	pm_notifier_call_chain(PM_POST_SUSPEND);
-	pm_restore_console();
-	return error;
-}
-
-/* default implementation */
-void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
-{
-	local_irq_disable();
-}
-
-/* default implementation */
-void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
-{
-	local_irq_enable();
-}
-
-/**
- *	suspend_enter - enter the desired system sleep state.
- *	@state:		state to enter
- *
- *	This function should be called after devices have been suspended.
- */
-static int suspend_enter(suspend_state_t state)
-{
-	int error;
-
-	if (suspend_ops->prepare) {
-		error = suspend_ops->prepare();
-		if (error)
-			return error;
-	}
-
-	error = dpm_suspend_noirq(PMSG_SUSPEND);
-	if (error) {
-		printk(KERN_ERR "PM: Some devices failed to power down\n");
-		goto Platfrom_finish;
-	}
-
-	if (suspend_ops->prepare_late) {
-		error = suspend_ops->prepare_late();
-		if (error)
-			goto Power_up_devices;
-	}
-
-	if (suspend_test(TEST_PLATFORM))
-		goto Platform_wake;
-
-	error = disable_nonboot_cpus();
-	if (error || suspend_test(TEST_CPUS))
-		goto Enable_cpus;
-
-	arch_suspend_disable_irqs();
-	BUG_ON(!irqs_disabled());
-
-	error = sysdev_suspend(PMSG_SUSPEND);
-	if (!error) {
-		if (!suspend_test(TEST_CORE))
-			error = suspend_ops->enter(state);
-		sysdev_resume();
-	}
-
-	arch_suspend_enable_irqs();
-	BUG_ON(irqs_disabled());
-
- Enable_cpus:
-	enable_nonboot_cpus();
-
- Platform_wake:
-	if (suspend_ops->wake)
-		suspend_ops->wake();
-
- Power_up_devices:
-	dpm_resume_noirq(PMSG_RESUME);
-
- Platfrom_finish:
-	if (suspend_ops->finish)
-		suspend_ops->finish();
-
-	return error;
-}
-
-/**
- *	suspend_devices_and_enter - suspend devices and enter the desired system
- *				    sleep state.
- *	@state:		  state to enter
- */
-int suspend_devices_and_enter(suspend_state_t state)
-{
-	int error;
-
-	if (!suspend_ops)
-		return -ENOSYS;
-
-	if (suspend_ops->begin) {
-		error = suspend_ops->begin(state);
-		if (error)
-			goto Close;
-	}
-	suspend_console();
-	suspend_test_start();
-	error = dpm_suspend_start(PMSG_SUSPEND);
-	if (error) {
-		printk(KERN_ERR "PM: Some devices failed to suspend\n");
-		goto Recover_platform;
-	}
-	suspend_test_finish("suspend devices");
-	if (suspend_test(TEST_DEVICES))
-		goto Recover_platform;
-
-	suspend_enter(state);
-
- Resume_devices:
-	suspend_test_start();
-	dpm_resume_end(PMSG_RESUME);
-	suspend_test_finish("resume devices");
-	resume_console();
- Close:
-	if (suspend_ops->end)
-		suspend_ops->end();
-	return error;
-
- Recover_platform:
-	if (suspend_ops->recover)
-		suspend_ops->recover();
-	goto Resume_devices;
-}
-
-/**
- *	suspend_finish - Do final work before exiting suspend sequence.
- *
- *	Call platform code to clean up, restart processes, and free the 
- *	console that we've allocated. This is not called for suspend-to-disk.
- */
-static void suspend_finish(void)
-{
-	suspend_thaw_processes();
-	usermodehelper_enable();
-	pm_notifier_call_chain(PM_POST_SUSPEND);
-	pm_restore_console();
-}
-
-
-
-
-static const char * const pm_states[PM_SUSPEND_MAX] = {
-	[PM_SUSPEND_STANDBY]	= "standby",
-	[PM_SUSPEND_MEM]	= "mem",
-};
-
-static inline int valid_state(suspend_state_t state)
-{
-	/* All states need lowlevel support and need to be valid
-	 * to the lowlevel implementation, no valid callback
-	 * implies that none are valid. */
-	if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state))
-		return 0;
-	return 1;
-}
-
-
-/**
- *	enter_state - Do common work of entering low-power state.
- *	@state:		pm_state structure for state we're entering.
- *
- *	Make sure we're the only ones trying to enter a sleep state. Fail
- *	if someone has beat us to it, since we don't want anything weird to
- *	happen when we wake up.
- *	Then, do the setup for suspend, enter the state, and cleaup (after
- *	we've woken up).
- */
-static int enter_state(suspend_state_t state)
-{
-	int error;
-
-	if (!valid_state(state))
-		return -ENODEV;
-
-	if (!mutex_trylock(&pm_mutex))
-		return -EBUSY;
-
-	printk(KERN_INFO "PM: Syncing filesystems ... ");
-	sys_sync();
-	printk("done.\n");
-
-	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
-	error = suspend_prepare();
-	if (error)
-		goto Unlock;
-
-	if (suspend_test(TEST_FREEZER))
-		goto Finish;
-
-	pr_debug("PM: Entering %s sleep\n", pm_states[state]);
-	error = suspend_devices_and_enter(state);
-
- Finish:
-	pr_debug("PM: Finishing wakeup.\n");
-	suspend_finish();
- Unlock:
-	mutex_unlock(&pm_mutex);
-	return error;
-}
-
-
-/**
- *	pm_suspend - Externally visible function for suspending system.
- *	@state:		Enumerated value of state to enter.
- *
- *	Determine whether or not value is within range, get state 
- *	structure, and enter (above).
- */
-
-int pm_suspend(suspend_state_t state)
-{
-	if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
-		return enter_state(state);
-	return -EINVAL;
-}
-
-EXPORT_SYMBOL(pm_suspend);
-
-#endif /* CONFIG_SUSPEND */
-
 struct kobject *power_kobj;
 
 /**
@@ -480,7 +120,6 @@ struct kobject *power_kobj;
  *	store() accepts one of those strings, translates it into the 
  *	proper enumerated value, and initiates a suspend transition.
  */
-
 static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
 			  char *buf)
 {
@@ -578,7 +217,6 @@ static struct attribute_group attr_group = {
 	.attrs = g,
 };
 
-
 static int __init pm_init(void)
 {
 	power_kobj = kobject_create_and_add("power", NULL);
@@ -588,144 +226,3 @@ static int __init pm_init(void)
 }
 
 core_initcall(pm_init);
-
-
-#ifdef CONFIG_PM_TEST_SUSPEND
-
-#include <linux/rtc.h>
-
-/*
- * To test system suspend, we need a hands-off mechanism to resume the
- * system.  RTCs wake alarms are a common self-contained mechanism.
- */
-
-static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
-{
-	static char err_readtime[] __initdata =
-		KERN_ERR "PM: can't read %s time, err %d\n";
-	static char err_wakealarm [] __initdata =
-		KERN_ERR "PM: can't set %s wakealarm, err %d\n";
-	static char err_suspend[] __initdata =
-		KERN_ERR "PM: suspend test failed, error %d\n";
-	static char info_test[] __initdata =
-		KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
-
-	unsigned long		now;
-	struct rtc_wkalrm	alm;
-	int			status;
-
-	/* this may fail if the RTC hasn't been initialized */
-	status = rtc_read_time(rtc, &alm.time);
-	if (status < 0) {
-		printk(err_readtime, dev_name(&rtc->dev), status);
-		return;
-	}
-	rtc_tm_to_time(&alm.time, &now);
-
-	memset(&alm, 0, sizeof alm);
-	rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
-	alm.enabled = true;
-
-	status = rtc_set_alarm(rtc, &alm);
-	if (status < 0) {
-		printk(err_wakealarm, dev_name(&rtc->dev), status);
-		return;
-	}
-
-	if (state == PM_SUSPEND_MEM) {
-		printk(info_test, pm_states[state]);
-		status = pm_suspend(state);
-		if (status == -ENODEV)
-			state = PM_SUSPEND_STANDBY;
-	}
-	if (state == PM_SUSPEND_STANDBY) {
-		printk(info_test, pm_states[state]);
-		status = pm_suspend(state);
-	}
-	if (status < 0)
-		printk(err_suspend, status);
-
-	/* Some platforms can't detect that the alarm triggered the
-	 * wakeup, or (accordingly) disable it after it afterwards.
-	 * It's supposed to give oneshot behavior; cope.
-	 */
-	alm.enabled = false;
-	rtc_set_alarm(rtc, &alm);
-}
-
-static int __init has_wakealarm(struct device *dev, void *name_ptr)
-{
-	struct rtc_device *candidate = to_rtc_device(dev);
-
-	if (!candidate->ops->set_alarm)
-		return 0;
-	if (!device_may_wakeup(candidate->dev.parent))
-		return 0;
-
-	*(const char **)name_ptr = dev_name(dev);
-	return 1;
-}
-
-/*
- * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
- * at startup time.  They're normally disabled, for faster boot and because
- * we can't know which states really work on this particular system.
- */
-static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
-
-static char warn_bad_state[] __initdata =
-	KERN_WARNING "PM: can't test '%s' suspend state\n";
-
-static int __init setup_test_suspend(char *value)
-{
-	unsigned i;
-
-	/* "=mem" ==> "mem" */
-	value++;
-	for (i = 0; i < PM_SUSPEND_MAX; i++) {
-		if (!pm_states[i])
-			continue;
-		if (strcmp(pm_states[i], value) != 0)
-			continue;
-		test_state = (__force suspend_state_t) i;
-		return 0;
-	}
-	printk(warn_bad_state, value);
-	return 0;
-}
-__setup("test_suspend", setup_test_suspend);
-
-static int __init test_suspend(void)
-{
-	static char		warn_no_rtc[] __initdata =
-		KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
-
-	char			*pony = NULL;
-	struct rtc_device	*rtc = NULL;
-
-	/* PM is initialized by now; is that state testable? */
-	if (test_state == PM_SUSPEND_ON)
-		goto done;
-	if (!valid_state(test_state)) {
-		printk(warn_bad_state, pm_states[test_state]);
-		goto done;
-	}
-
-	/* RTCs have initialized by now too ... can we use one? */
-	class_find_device(rtc_class, NULL, &pony, has_wakealarm);
-	if (pony)
-		rtc = rtc_class_open(pony);
-	if (!rtc) {
-		printk(warn_no_rtc);
-		goto done;
-	}
-
-	/* go for it */
-	test_wakealarm(rtc, test_state);
-	rtc_class_close(rtc);
-done:
-	return 0;
-}
-late_initcall(test_suspend);
-
-#endif /* CONFIG_PM_TEST_SUSPEND */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index ec4dbdfb07b..2bd98d9fc19 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -160,15 +160,30 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
 				unsigned int, char *);
 
 #ifdef CONFIG_SUSPEND
-/* kernel/power/main.c */
+/* kernel/power/suspend.c */
+extern const char *const pm_states[];
+
+extern bool valid_state(suspend_state_t state);
 extern int suspend_devices_and_enter(suspend_state_t state);
+extern int enter_state(suspend_state_t state);
 #else /* !CONFIG_SUSPEND */
 static inline int suspend_devices_and_enter(suspend_state_t state)
 {
 	return -ENOSYS;
 }
+static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
+static inline bool valid_state(suspend_state_t state) { return false; }
 #endif /* !CONFIG_SUSPEND */
 
+#ifdef CONFIG_PM_TEST_SUSPEND
+/* kernel/power/suspend_test.c */
+extern void suspend_test_start(void);
+extern void suspend_test_finish(const char *label);
+#else /* !CONFIG_PM_TEST_SUSPEND */
+static inline void suspend_test_start(void) {}
+static inline void suspend_test_finish(const char *label) {}
+#endif /* !CONFIG_PM_TEST_SUSPEND */
+
 #ifdef CONFIG_PM_SLEEP
 /* kernel/power/main.c */
 extern int pm_notifier_call_chain(unsigned long val);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
new file mode 100644
index 00000000000..6f10dfc2d3e
--- /dev/null
+++ b/kernel/power/suspend.c
@@ -0,0 +1,300 @@
+/*
+ * kernel/power/suspend.c - Suspend to RAM and standby functionality.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/cpu.h>
+#include <linux/syscalls.h>
+
+#include "power.h"
+
+const char *const pm_states[PM_SUSPEND_MAX] = {
+	[PM_SUSPEND_STANDBY]	= "standby",
+	[PM_SUSPEND_MEM]	= "mem",
+};
+
+static struct platform_suspend_ops *suspend_ops;
+
+/**
+ *	suspend_set_ops - Set the global suspend method table.
+ *	@ops:	Pointer to ops structure.
+ */
+void suspend_set_ops(struct platform_suspend_ops *ops)
+{
+	mutex_lock(&pm_mutex);
+	suspend_ops = ops;
+	mutex_unlock(&pm_mutex);
+}
+
+bool valid_state(suspend_state_t state)
+{
+	/*
+	 * All states need lowlevel support and need to be valid to the lowlevel
+	 * implementation, no valid callback implies that none are valid.
+	 */
+	return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+}
+
+/**
+ * suspend_valid_only_mem - generic memory-only valid callback
+ *
+ * Platform drivers that implement mem suspend only and only need
+ * to check for that in their .valid callback can use this instead
+ * of rolling their own .valid callback.
+ */
+int suspend_valid_only_mem(suspend_state_t state)
+{
+	return state == PM_SUSPEND_MEM;
+}
+
+static int suspend_test(int level)
+{
+#ifdef CONFIG_PM_DEBUG
+	if (pm_test_level == level) {
+		printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
+		mdelay(5000);
+		return 1;
+	}
+#endif /* CONFIG_PM_DEBUG */
+	return 0;
+}
+
+/**
+ *	suspend_prepare - Do prep work before entering low-power state.
+ *
+ *	This is common code that is called for each state that we're entering.
+ *	Run suspend notifiers, allocate a console and stop all processes.
+ */
+static int suspend_prepare(void)
+{
+	int error;
+
+	if (!suspend_ops || !suspend_ops->enter)
+		return -EPERM;
+
+	pm_prepare_console();
+
+	error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
+	if (error)
+		goto Finish;
+
+	error = usermodehelper_disable();
+	if (error)
+		goto Finish;
+
+	error = suspend_freeze_processes();
+	if (!error)
+		return 0;
+
+	suspend_thaw_processes();
+	usermodehelper_enable();
+ Finish:
+	pm_notifier_call_chain(PM_POST_SUSPEND);
+	pm_restore_console();
+	return error;
+}
+
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
+{
+	local_irq_disable();
+}
+
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
+{
+	local_irq_enable();
+}
+
+/**
+ *	suspend_enter - enter the desired system sleep state.
+ *	@state:		state to enter
+ *
+ *	This function should be called after devices have been suspended.
+ */
+static int suspend_enter(suspend_state_t state)
+{
+	int error;
+
+	if (suspend_ops->prepare) {
+		error = suspend_ops->prepare();
+		if (error)
+			return error;
+	}
+
+	error = dpm_suspend_noirq(PMSG_SUSPEND);
+	if (error) {
+		printk(KERN_ERR "PM: Some devices failed to power down\n");
+		goto Platfrom_finish;
+	}
+
+	if (suspend_ops->prepare_late) {
+		error = suspend_ops->prepare_late();
+		if (error)
+			goto Power_up_devices;
+	}
+
+	if (suspend_test(TEST_PLATFORM))
+		goto Platform_wake;
+
+	error = disable_nonboot_cpus();
+	if (error || suspend_test(TEST_CPUS))
+		goto Enable_cpus;
+
+	arch_suspend_disable_irqs();
+	BUG_ON(!irqs_disabled());
+
+	error = sysdev_suspend(PMSG_SUSPEND);
+	if (!error) {
+		if (!suspend_test(TEST_CORE))
+			error = suspend_ops->enter(state);
+		sysdev_resume();
+	}
+
+	arch_suspend_enable_irqs();
+	BUG_ON(irqs_disabled());
+
+ Enable_cpus:
+	enable_nonboot_cpus();
+
+ Platform_wake:
+	if (suspend_ops->wake)
+		suspend_ops->wake();
+
+ Power_up_devices:
+	dpm_resume_noirq(PMSG_RESUME);
+
+ Platfrom_finish:
+	if (suspend_ops->finish)
+		suspend_ops->finish();
+
+	return error;
+}
+
+/**
+ *	suspend_devices_and_enter - suspend devices and enter the desired system
+ *				    sleep state.
+ *	@state:		  state to enter
+ */
+int suspend_devices_and_enter(suspend_state_t state)
+{
+	int error;
+
+	if (!suspend_ops)
+		return -ENOSYS;
+
+	if (suspend_ops->begin) {
+		error = suspend_ops->begin(state);
+		if (error)
+			goto Close;
+	}
+	suspend_console();
+	suspend_test_start();
+	error = dpm_suspend_start(PMSG_SUSPEND);
+	if (error) {
+		printk(KERN_ERR "PM: Some devices failed to suspend\n");
+		goto Recover_platform;
+	}
+	suspend_test_finish("suspend devices");
+	if (suspend_test(TEST_DEVICES))
+		goto Recover_platform;
+
+	suspend_enter(state);
+
+ Resume_devices:
+	suspend_test_start();
+	dpm_resume_end(PMSG_RESUME);
+	suspend_test_finish("resume devices");
+	resume_console();
+ Close:
+	if (suspend_ops->end)
+		suspend_ops->end();
+	return error;
+
+ Recover_platform:
+	if (suspend_ops->recover)
+		suspend_ops->recover();
+	goto Resume_devices;
+}
+
+/**
+ *	suspend_finish - Do final work before exiting suspend sequence.
+ *
+ *	Call platform code to clean up, restart processes, and free the
+ *	console that we've allocated. This is not called for suspend-to-disk.
+ */
+static void suspend_finish(void)
+{
+	suspend_thaw_processes();
+	usermodehelper_enable();
+	pm_notifier_call_chain(PM_POST_SUSPEND);
+	pm_restore_console();
+}
+
+/**
+ *	enter_state - Do common work of entering low-power state.
+ *	@state:		sleep state we're entering.
+ *
+ *	Make sure we're the only ones trying to enter a sleep state. Fail
+ *	if someone has beat us to it, since we don't want anything weird to
+ *	happen when we wake up.
+ *	Then, do the setup for suspend, enter the state, and clean up (after
+ *	we've woken up).
+ */
+int enter_state(suspend_state_t state)
+{
+	int error;
+
+	if (!valid_state(state))
+		return -ENODEV;
+
+	if (!mutex_trylock(&pm_mutex))
+		return -EBUSY;
+
+	printk(KERN_INFO "PM: Syncing filesystems ... ");
+	sys_sync();
+	printk("done.\n");
+
+	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
+	error = suspend_prepare();
+	if (error)
+		goto Unlock;
+
+	if (suspend_test(TEST_FREEZER))
+		goto Finish;
+
+	pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+	error = suspend_devices_and_enter(state);
+
+ Finish:
+	pr_debug("PM: Finishing wakeup.\n");
+	suspend_finish();
+ Unlock:
+	mutex_unlock(&pm_mutex);
+	return error;
+}
+
+/**
+ *	pm_suspend - Externally visible function for suspending system.
+ *	@state:		Enumerated value of state to enter.
+ *
+ *	Determine whether or not value is within range, get state
+ *	structure, and enter (above).
+ */
+int pm_suspend(suspend_state_t state)
+{
+	if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
+		return enter_state(state);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
new file mode 100644
index 00000000000..17d8bb1acf9
--- /dev/null
+++ b/kernel/power/suspend_test.c
@@ -0,0 +1,187 @@
+/*
+ * kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
+ *
+ * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/init.h>
+#include <linux/rtc.h>
+
+#include "power.h"
+
+/*
+ * We test the system suspend code by setting an RTC wakealarm a short
+ * time in the future, then suspending.  Suspending the devices won't
+ * normally take long ... some systems only need a few milliseconds.
+ *
+ * The time it takes is system-specific though, so when we test this
+ * during system bootup we allow a LOT of time.
+ */
+#define TEST_SUSPEND_SECONDS	5
+
+static unsigned long suspend_test_start_time;
+
+void suspend_test_start(void)
+{
+	/* FIXME Use better timebase than "jiffies", ideally a clocksource.
+	 * What we want is a hardware counter that will work correctly even
+	 * during the irqs-are-off stages of the suspend/resume cycle...
+	 */
+	suspend_test_start_time = jiffies;
+}
+
+void suspend_test_finish(const char *label)
+{
+	long nj = jiffies - suspend_test_start_time;
+	unsigned msec;
+
+	msec = jiffies_to_msecs(abs(nj));
+	pr_info("PM: %s took %d.%03d seconds\n", label,
+			msec / 1000, msec % 1000);
+
+	/* Warning on suspend means the RTC alarm period needs to be
+	 * larger -- the system was sooo slooowwww to suspend that the
+	 * alarm (should have) fired before the system went to sleep!
+	 *
+	 * Warning on either suspend or resume also means the system
+	 * has some performance issues.  The stack dump of a WARN_ON
+	 * is more likely to get the right attention than a printk...
+	 */
+	WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
+}
+
+/*
+ * To test system suspend, we need a hands-off mechanism to resume the
+ * system.  RTC wake alarms are a common self-contained mechanism.
+ */
+
+static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
+{
+	static char err_readtime[] __initdata =
+		KERN_ERR "PM: can't read %s time, err %d\n";
+	static char err_wakealarm [] __initdata =
+		KERN_ERR "PM: can't set %s wakealarm, err %d\n";
+	static char err_suspend[] __initdata =
+		KERN_ERR "PM: suspend test failed, error %d\n";
+	static char info_test[] __initdata =
+		KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
+
+	unsigned long		now;
+	struct rtc_wkalrm	alm;
+	int			status;
+
+	/* this may fail if the RTC hasn't been initialized */
+	status = rtc_read_time(rtc, &alm.time);
+	if (status < 0) {
+		printk(err_readtime, dev_name(&rtc->dev), status);
+		return;
+	}
+	rtc_tm_to_time(&alm.time, &now);
+
+	memset(&alm, 0, sizeof alm);
+	rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
+	alm.enabled = true;
+
+	status = rtc_set_alarm(rtc, &alm);
+	if (status < 0) {
+		printk(err_wakealarm, dev_name(&rtc->dev), status);
+		return;
+	}
+
+	if (state == PM_SUSPEND_MEM) {
+		printk(info_test, pm_states[state]);
+		status = pm_suspend(state);
+		if (status == -ENODEV)
+			state = PM_SUSPEND_STANDBY;
+	}
+	if (state == PM_SUSPEND_STANDBY) {
+		printk(info_test, pm_states[state]);
+		status = pm_suspend(state);
+	}
+	if (status < 0)
+		printk(err_suspend, status);
+
+	/* Some platforms can't detect that the alarm triggered the
+	 * wakeup, or (accordingly) disable it afterwards.
+	 * It's supposed to give oneshot behavior; cope.
+	 */
+	alm.enabled = false;
+	rtc_set_alarm(rtc, &alm);
+}
+
+static int __init has_wakealarm(struct device *dev, void *name_ptr)
+{
+	struct rtc_device *candidate = to_rtc_device(dev);
+
+	if (!candidate->ops->set_alarm)
+		return 0;
+	if (!device_may_wakeup(candidate->dev.parent))
+		return 0;
+
+	*(const char **)name_ptr = dev_name(dev);
+	return 1;
+}
+
+/*
+ * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
+ * at startup time.  They're normally disabled, for faster boot and because
+ * we can't know which states really work on this particular system.
+ */
+static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+
+static char warn_bad_state[] __initdata =
+	KERN_WARNING "PM: can't test '%s' suspend state\n";
+
+static int __init setup_test_suspend(char *value)
+{
+	unsigned i;
+
+	/* "=mem" ==> "mem" */
+	value++;
+	for (i = 0; i < PM_SUSPEND_MAX; i++) {
+		if (!pm_states[i])
+			continue;
+		if (strcmp(pm_states[i], value) != 0)
+			continue;
+		test_state = (__force suspend_state_t) i;
+		return 0;
+	}
+	printk(warn_bad_state, value);
+	return 0;
+}
+__setup("test_suspend", setup_test_suspend);
+
+static int __init test_suspend(void)
+{
+	static char		warn_no_rtc[] __initdata =
+		KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
+
+	char			*pony = NULL;
+	struct rtc_device	*rtc = NULL;
+
+	/* PM is initialized by now; is that state testable? */
+	if (test_state == PM_SUSPEND_ON)
+		goto done;
+	if (!valid_state(test_state)) {
+		printk(warn_bad_state, pm_states[test_state]);
+		goto done;
+	}
+
+	/* RTCs have initialized by now too ... can we use one? */
+	class_find_device(rtc_class, NULL, &pony, has_wakealarm);
+	if (pony)
+		rtc = rtc_class_open(pony);
+	if (!rtc) {
+		printk(warn_no_rtc);
+		goto done;
+	}
+
+	/* go for it */
+	test_wakealarm(rtc, test_state);
+	rtc_class_close(rtc);
+done:
+	return 0;
+}
+late_initcall(test_suspend);
-- 
cgit v1.2.3-70-g09d2


From 8b759b84c8b3c27ccc8dd787294636297b3ebb40 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 10 Jun 2009 01:27:49 +0200
Subject: PM/Hibernate: Rename disk.c to hibernate.c

Change the name of kernel/power/disk.c to kernel/power/hibernate.c
in analogy with the file names introduced by the changes that
separated the suspend to RAM and standby functionality from the
common PM functions.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
---
 kernel/power/Makefile    |   2 +-
 kernel/power/disk.c      | 955 -----------------------------------------------
 kernel/power/hibernate.c | 955 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/power/power.h     |   4 +-
 4 files changed, 958 insertions(+), 958 deletions(-)
 delete mode 100644 kernel/power/disk.c
 create mode 100644 kernel/power/hibernate.c

(limited to 'kernel')

diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c4baf1b633c..eadb17fc8f5 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,6 +8,6 @@ obj-$(CONFIG_PM_SLEEP)		+= console.o
 obj-$(CONFIG_FREEZER)		+= process.o
 obj-$(CONFIG_SUSPEND)		+= suspend.o
 obj-$(CONFIG_PM_TEST_SUSPEND)	+= suspend_test.o
-obj-$(CONFIG_HIBERNATION)	+= swsusp.o disk.o snapshot.o swap.o user.o
+obj-$(CONFIG_HIBERNATION)	+= swsusp.o hibernate.o snapshot.o swap.o user.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
deleted file mode 100644
index a9beba68b6c..00000000000
--- a/kernel/power/disk.c
+++ /dev/null
@@ -1,955 +0,0 @@
-/*
- * kernel/power/disk.c - Suspend-to-disk support.
- *
- * Copyright (c) 2003 Patrick Mochel
- * Copyright (c) 2003 Open Source Development Lab
- * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
- *
- * This file is released under the GPLv2.
- *
- */
-
-#include <linux/suspend.h>
-#include <linux/syscalls.h>
-#include <linux/reboot.h>
-#include <linux/string.h>
-#include <linux/device.h>
-#include <linux/kmod.h>
-#include <linux/delay.h>
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/pm.h>
-#include <linux/console.h>
-#include <linux/cpu.h>
-#include <linux/freezer.h>
-#include <scsi/scsi_scan.h>
-#include <asm/suspend.h>
-
-#include "power.h"
-
-
-static int noresume = 0;
-static char resume_file[256] = CONFIG_PM_STD_PARTITION;
-dev_t swsusp_resume_device;
-sector_t swsusp_resume_block;
-
-enum {
-	HIBERNATION_INVALID,
-	HIBERNATION_PLATFORM,
-	HIBERNATION_TEST,
-	HIBERNATION_TESTPROC,
-	HIBERNATION_SHUTDOWN,
-	HIBERNATION_REBOOT,
-	/* keep last */
-	__HIBERNATION_AFTER_LAST
-};
-#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
-#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
-
-static int hibernation_mode = HIBERNATION_SHUTDOWN;
-
-static struct platform_hibernation_ops *hibernation_ops;
-
-/**
- * hibernation_set_ops - set the global hibernate operations
- * @ops: the hibernation operations to use in subsequent hibernation transitions
- */
-
-void hibernation_set_ops(struct platform_hibernation_ops *ops)
-{
-	if (ops && !(ops->begin && ops->end &&  ops->pre_snapshot
-	    && ops->prepare && ops->finish && ops->enter && ops->pre_restore
-	    && ops->restore_cleanup)) {
-		WARN_ON(1);
-		return;
-	}
-	mutex_lock(&pm_mutex);
-	hibernation_ops = ops;
-	if (ops)
-		hibernation_mode = HIBERNATION_PLATFORM;
-	else if (hibernation_mode == HIBERNATION_PLATFORM)
-		hibernation_mode = HIBERNATION_SHUTDOWN;
-
-	mutex_unlock(&pm_mutex);
-}
-
-static bool entering_platform_hibernation;
-
-bool system_entering_hibernation(void)
-{
-	return entering_platform_hibernation;
-}
-EXPORT_SYMBOL(system_entering_hibernation);
-
-#ifdef CONFIG_PM_DEBUG
-static void hibernation_debug_sleep(void)
-{
-	printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n");
-	mdelay(5000);
-}
-
-static int hibernation_testmode(int mode)
-{
-	if (hibernation_mode == mode) {
-		hibernation_debug_sleep();
-		return 1;
-	}
-	return 0;
-}
-
-static int hibernation_test(int level)
-{
-	if (pm_test_level == level) {
-		hibernation_debug_sleep();
-		return 1;
-	}
-	return 0;
-}
-#else /* !CONFIG_PM_DEBUG */
-static int hibernation_testmode(int mode) { return 0; }
-static int hibernation_test(int level) { return 0; }
-#endif /* !CONFIG_PM_DEBUG */
-
-/**
- *	platform_begin - tell the platform driver that we're starting
- *	hibernation
- */
-
-static int platform_begin(int platform_mode)
-{
-	return (platform_mode && hibernation_ops) ?
-		hibernation_ops->begin() : 0;
-}
-
-/**
- *	platform_end - tell the platform driver that we've entered the
- *	working state
- */
-
-static void platform_end(int platform_mode)
-{
-	if (platform_mode && hibernation_ops)
-		hibernation_ops->end();
-}
-
-/**
- *	platform_pre_snapshot - prepare the machine for hibernation using the
- *	platform driver if so configured and return an error code if it fails
- */
-
-static int platform_pre_snapshot(int platform_mode)
-{
-	return (platform_mode && hibernation_ops) ?
-		hibernation_ops->pre_snapshot() : 0;
-}
-
-/**
- *	platform_leave - prepare the machine for switching to the normal mode
- *	of operation using the platform driver (called with interrupts disabled)
- */
-
-static void platform_leave(int platform_mode)
-{
-	if (platform_mode && hibernation_ops)
-		hibernation_ops->leave();
-}
-
-/**
- *	platform_finish - switch the machine to the normal mode of operation
- *	using the platform driver (must be called after platform_prepare())
- */
-
-static void platform_finish(int platform_mode)
-{
-	if (platform_mode && hibernation_ops)
-		hibernation_ops->finish();
-}
-
-/**
- *	platform_pre_restore - prepare the platform for the restoration from a
- *	hibernation image.  If the restore fails after this function has been
- *	called, platform_restore_cleanup() must be called.
- */
-
-static int platform_pre_restore(int platform_mode)
-{
-	return (platform_mode && hibernation_ops) ?
-		hibernation_ops->pre_restore() : 0;
-}
-
-/**
- *	platform_restore_cleanup - switch the platform to the normal mode of
- *	operation after a failing restore.  If platform_pre_restore() has been
- *	called before the failing restore, this function must be called too,
- *	regardless of the result of platform_pre_restore().
- */
-
-static void platform_restore_cleanup(int platform_mode)
-{
-	if (platform_mode && hibernation_ops)
-		hibernation_ops->restore_cleanup();
-}
-
-/**
- *	platform_recover - recover the platform from a failure to suspend
- *	devices.
- */
-
-static void platform_recover(int platform_mode)
-{
-	if (platform_mode && hibernation_ops && hibernation_ops->recover)
-		hibernation_ops->recover();
-}
-
-/**
- *	create_image - freeze devices that need to be frozen with interrupts
- *	off, create the hibernation image and thaw those devices.  Control
- *	reappears in this routine after a restore.
- */
-
-static int create_image(int platform_mode)
-{
-	int error;
-
-	error = arch_prepare_suspend();
-	if (error)
-		return error;
-
-	/* At this point, dpm_suspend_start() has been called, but *not*
-	 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
-	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
-	 * become desynchronized with the actual state of the hardware
-	 * at resume time, and evil weirdness ensues.
-	 */
-	error = dpm_suspend_noirq(PMSG_FREEZE);
-	if (error) {
-		printk(KERN_ERR "PM: Some devices failed to power down, "
-			"aborting hibernation\n");
-		return error;
-	}
-
-	error = platform_pre_snapshot(platform_mode);
-	if (error || hibernation_test(TEST_PLATFORM))
-		goto Platform_finish;
-
-	error = disable_nonboot_cpus();
-	if (error || hibernation_test(TEST_CPUS)
-	    || hibernation_testmode(HIBERNATION_TEST))
-		goto Enable_cpus;
-
-	local_irq_disable();
-
-	error = sysdev_suspend(PMSG_FREEZE);
-	if (error) {
-		printk(KERN_ERR "PM: Some system devices failed to power down, "
-			"aborting hibernation\n");
-		goto Enable_irqs;
-	}
-
-	if (hibernation_test(TEST_CORE))
-		goto Power_up;
-
-	in_suspend = 1;
-	save_processor_state();
-	error = swsusp_arch_suspend();
-	if (error)
-		printk(KERN_ERR "PM: Error %d creating hibernation image\n",
-			error);
-	/* Restore control flow magically appears here */
-	restore_processor_state();
-	if (!in_suspend)
-		platform_leave(platform_mode);
-
- Power_up:
-	sysdev_resume();
-	/* NOTE:  dpm_resume_noirq() is just a resume() for devices
-	 * that suspended with irqs off ... no overall powerup.
-	 */
-
- Enable_irqs:
-	local_irq_enable();
-
- Enable_cpus:
-	enable_nonboot_cpus();
-
- Platform_finish:
-	platform_finish(platform_mode);
-
-	dpm_resume_noirq(in_suspend ?
-		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
-
-	return error;
-}
-
-/**
- *	hibernation_snapshot - quiesce devices and create the hibernation
- *	snapshot image.
- *	@platform_mode - if set, use the platform driver, if available, to
- *			 prepare the platform firmware for the power transition.
- *
- *	Must be called with pm_mutex held
- */
-
-int hibernation_snapshot(int platform_mode)
-{
-	int error;
-
-	error = platform_begin(platform_mode);
-	if (error)
-		return error;
-
-	/* Free memory before shutting down devices. */
-	error = swsusp_shrink_memory();
-	if (error)
-		goto Close;
-
-	suspend_console();
-	error = dpm_suspend_start(PMSG_FREEZE);
-	if (error)
-		goto Recover_platform;
-
-	if (hibernation_test(TEST_DEVICES))
-		goto Recover_platform;
-
-	error = create_image(platform_mode);
-	/* Control returns here after successful restore */
-
- Resume_devices:
-	dpm_resume_end(in_suspend ?
-		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
-	resume_console();
- Close:
-	platform_end(platform_mode);
-	return error;
-
- Recover_platform:
-	platform_recover(platform_mode);
-	goto Resume_devices;
-}
-
-/**
- *	resume_target_kernel - prepare devices that need to be suspended with
- *	interrupts off, restore the contents of highmem that have not been
- *	restored yet from the image and run the low level code that will restore
- *	the remaining contents of memory and switch to the just restored target
- *	kernel.
- */
-
-static int resume_target_kernel(bool platform_mode)
-{
-	int error;
-
-	error = dpm_suspend_noirq(PMSG_QUIESCE);
-	if (error) {
-		printk(KERN_ERR "PM: Some devices failed to power down, "
-			"aborting resume\n");
-		return error;
-	}
-
-	error = platform_pre_restore(platform_mode);
-	if (error)
-		goto Cleanup;
-
-	error = disable_nonboot_cpus();
-	if (error)
-		goto Enable_cpus;
-
-	local_irq_disable();
-
-	error = sysdev_suspend(PMSG_QUIESCE);
-	if (error)
-		goto Enable_irqs;
-
-	/* We'll ignore saved state, but this gets preempt count (etc) right */
-	save_processor_state();
-	error = restore_highmem();
-	if (!error) {
-		error = swsusp_arch_resume();
-		/*
-		 * The code below is only ever reached in case of a failure.
-		 * Otherwise execution continues at place where
-		 * swsusp_arch_suspend() was called
-		 */
-		BUG_ON(!error);
-		/* This call to restore_highmem() undos the previous one */
-		restore_highmem();
-	}
-	/*
-	 * The only reason why swsusp_arch_resume() can fail is memory being
-	 * very tight, so we have to free it as soon as we can to avoid
-	 * subsequent failures
-	 */
-	swsusp_free();
-	restore_processor_state();
-	touch_softlockup_watchdog();
-
-	sysdev_resume();
-
- Enable_irqs:
-	local_irq_enable();
-
- Enable_cpus:
-	enable_nonboot_cpus();
-
- Cleanup:
-	platform_restore_cleanup(platform_mode);
-
-	dpm_resume_noirq(PMSG_RECOVER);
-
-	return error;
-}
-
-/**
- *	hibernation_restore - quiesce devices and restore the hibernation
- *	snapshot image.  If successful, control returns in hibernation_snaphot()
- *	@platform_mode - if set, use the platform driver, if available, to
- *			 prepare the platform firmware for the transition.
- *
- *	Must be called with pm_mutex held
- */
-
-int hibernation_restore(int platform_mode)
-{
-	int error;
-
-	pm_prepare_console();
-	suspend_console();
-	error = dpm_suspend_start(PMSG_QUIESCE);
-	if (!error) {
-		error = resume_target_kernel(platform_mode);
-		dpm_resume_end(PMSG_RECOVER);
-	}
-	resume_console();
-	pm_restore_console();
-	return error;
-}
-
-/**
- *	hibernation_platform_enter - enter the hibernation state using the
- *	platform driver (if available)
- */
-
-int hibernation_platform_enter(void)
-{
-	int error;
-
-	if (!hibernation_ops)
-		return -ENOSYS;
-
-	/*
-	 * We have cancelled the power transition by running
-	 * hibernation_ops->finish() before saving the image, so we should let
-	 * the firmware know that we're going to enter the sleep state after all
-	 */
-	error = hibernation_ops->begin();
-	if (error)
-		goto Close;
-
-	entering_platform_hibernation = true;
-	suspend_console();
-	error = dpm_suspend_start(PMSG_HIBERNATE);
-	if (error) {
-		if (hibernation_ops->recover)
-			hibernation_ops->recover();
-		goto Resume_devices;
-	}
-
-	error = dpm_suspend_noirq(PMSG_HIBERNATE);
-	if (error)
-		goto Resume_devices;
-
-	error = hibernation_ops->prepare();
-	if (error)
-		goto Platofrm_finish;
-
-	error = disable_nonboot_cpus();
-	if (error)
-		goto Platofrm_finish;
-
-	local_irq_disable();
-	sysdev_suspend(PMSG_HIBERNATE);
-	hibernation_ops->enter();
-	/* We should never get here */
-	while (1);
-
-	/*
-	 * We don't need to reenable the nonboot CPUs or resume consoles, since
-	 * the system is going to be halted anyway.
-	 */
- Platofrm_finish:
-	hibernation_ops->finish();
-
-	dpm_suspend_noirq(PMSG_RESTORE);
-
- Resume_devices:
-	entering_platform_hibernation = false;
-	dpm_resume_end(PMSG_RESTORE);
-	resume_console();
-
- Close:
-	hibernation_ops->end();
-
-	return error;
-}
-
-/**
- *	power_down - Shut the machine down for hibernation.
- *
- *	Use the platform driver, if configured so; otherwise try
- *	to power off or reboot.
- */
-
-static void power_down(void)
-{
-	switch (hibernation_mode) {
-	case HIBERNATION_TEST:
-	case HIBERNATION_TESTPROC:
-		break;
-	case HIBERNATION_REBOOT:
-		kernel_restart(NULL);
-		break;
-	case HIBERNATION_PLATFORM:
-		hibernation_platform_enter();
-	case HIBERNATION_SHUTDOWN:
-		kernel_power_off();
-		break;
-	}
-	kernel_halt();
-	/*
-	 * Valid image is on the disk, if we continue we risk serious data
-	 * corruption after resume.
-	 */
-	printk(KERN_CRIT "PM: Please power down manually\n");
-	while(1);
-}
-
-static int prepare_processes(void)
-{
-	int error = 0;
-
-	if (freeze_processes()) {
-		error = -EBUSY;
-		thaw_processes();
-	}
-	return error;
-}
-
-/**
- *	hibernate - The granpappy of the built-in hibernation management
- */
-
-int hibernate(void)
-{
-	int error;
-
-	mutex_lock(&pm_mutex);
-	/* The snapshot device should not be opened while we're running */
-	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
-		error = -EBUSY;
-		goto Unlock;
-	}
-
-	pm_prepare_console();
-	error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
-	if (error)
-		goto Exit;
-
-	error = usermodehelper_disable();
-	if (error)
-		goto Exit;
-
-	/* Allocate memory management structures */
-	error = create_basic_memory_bitmaps();
-	if (error)
-		goto Exit;
-
-	printk(KERN_INFO "PM: Syncing filesystems ... ");
-	sys_sync();
-	printk("done.\n");
-
-	error = prepare_processes();
-	if (error)
-		goto Finish;
-
-	if (hibernation_test(TEST_FREEZER))
-		goto Thaw;
-
-	if (hibernation_testmode(HIBERNATION_TESTPROC))
-		goto Thaw;
-
-	error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
-	if (in_suspend && !error) {
-		unsigned int flags = 0;
-
-		if (hibernation_mode == HIBERNATION_PLATFORM)
-			flags |= SF_PLATFORM_MODE;
-		pr_debug("PM: writing image.\n");
-		error = swsusp_write(flags);
-		swsusp_free();
-		if (!error)
-			power_down();
-	} else {
-		pr_debug("PM: Image restored successfully.\n");
-		swsusp_free();
-	}
- Thaw:
-	thaw_processes();
- Finish:
-	free_basic_memory_bitmaps();
-	usermodehelper_enable();
- Exit:
-	pm_notifier_call_chain(PM_POST_HIBERNATION);
-	pm_restore_console();
-	atomic_inc(&snapshot_device_available);
- Unlock:
-	mutex_unlock(&pm_mutex);
-	return error;
-}
-
-
-/**
- *	software_resume - Resume from a saved image.
- *
- *	Called as a late_initcall (so all devices are discovered and
- *	initialized), we call swsusp to see if we have a saved image or not.
- *	If so, we quiesce devices, the restore the saved image. We will
- *	return above (in hibernate() ) if everything goes well.
- *	Otherwise, we fail gracefully and return to the normally
- *	scheduled program.
- *
- */
-
-static int software_resume(void)
-{
-	int error;
-	unsigned int flags;
-
-	/*
-	 * If the user said "noresume".. bail out early.
-	 */
-	if (noresume)
-		return 0;
-
-	/*
-	 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
-	 * is configured into the kernel. Since the regular hibernate
-	 * trigger path is via sysfs which takes a buffer mutex before
-	 * calling hibernate functions (which take pm_mutex) this can
-	 * cause lockdep to complain about a possible ABBA deadlock
-	 * which cannot happen since we're in the boot code here and
-	 * sysfs can't be invoked yet. Therefore, we use a subclass
-	 * here to avoid lockdep complaining.
-	 */
-	mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
-
-	if (swsusp_resume_device)
-		goto Check_image;
-
-	if (!strlen(resume_file)) {
-		error = -ENOENT;
-		goto Unlock;
-	}
-
-	pr_debug("PM: Checking image partition %s\n", resume_file);
-
-	/* Check if the device is there */
-	swsusp_resume_device = name_to_dev_t(resume_file);
-	if (!swsusp_resume_device) {
-		/*
-		 * Some device discovery might still be in progress; we need
-		 * to wait for this to finish.
-		 */
-		wait_for_device_probe();
-		/*
-		 * We can't depend on SCSI devices being available after loading
-		 * one of their modules until scsi_complete_async_scans() is
-		 * called and the resume device usually is a SCSI one.
-		 */
-		scsi_complete_async_scans();
-
-		swsusp_resume_device = name_to_dev_t(resume_file);
-		if (!swsusp_resume_device) {
-			error = -ENODEV;
-			goto Unlock;
-		}
-	}
-
- Check_image:
-	pr_debug("PM: Resume from partition %d:%d\n",
-		MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
-
-	pr_debug("PM: Checking hibernation image.\n");
-	error = swsusp_check();
-	if (error)
-		goto Unlock;
-
-	/* The snapshot device should not be opened while we're running */
-	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
-		error = -EBUSY;
-		goto Unlock;
-	}
-
-	pm_prepare_console();
-	error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
-	if (error)
-		goto Finish;
-
-	error = usermodehelper_disable();
-	if (error)
-		goto Finish;
-
-	error = create_basic_memory_bitmaps();
-	if (error)
-		goto Finish;
-
-	pr_debug("PM: Preparing processes for restore.\n");
-	error = prepare_processes();
-	if (error) {
-		swsusp_close(FMODE_READ);
-		goto Done;
-	}
-
-	pr_debug("PM: Reading hibernation image.\n");
-
-	error = swsusp_read(&flags);
-	if (!error)
-		hibernation_restore(flags & SF_PLATFORM_MODE);
-
-	printk(KERN_ERR "PM: Restore failed, recovering.\n");
-	swsusp_free();
-	thaw_processes();
- Done:
-	free_basic_memory_bitmaps();
-	usermodehelper_enable();
- Finish:
-	pm_notifier_call_chain(PM_POST_RESTORE);
-	pm_restore_console();
-	atomic_inc(&snapshot_device_available);
-	/* For success case, the suspend path will release the lock */
- Unlock:
-	mutex_unlock(&pm_mutex);
-	pr_debug("PM: Resume from disk failed.\n");
-	return error;
-}
-
-late_initcall(software_resume);
-
-
-static const char * const hibernation_modes[] = {
-	[HIBERNATION_PLATFORM]	= "platform",
-	[HIBERNATION_SHUTDOWN]	= "shutdown",
-	[HIBERNATION_REBOOT]	= "reboot",
-	[HIBERNATION_TEST]	= "test",
-	[HIBERNATION_TESTPROC]	= "testproc",
-};
-
-/**
- *	disk - Control hibernation mode
- *
- *	Suspend-to-disk can be handled in several ways. We have a few options
- *	for putting the system to sleep - using the platform driver (e.g. ACPI
- *	or other hibernation_ops), powering off the system or rebooting the
- *	system (for testing) as well as the two test modes.
- *
- *	The system can support 'platform', and that is known a priori (and
- *	encoded by the presence of hibernation_ops). However, the user may
- *	choose 'shutdown' or 'reboot' as alternatives, as well as one fo the
- *	test modes, 'test' or 'testproc'.
- *
- *	show() will display what the mode is currently set to.
- *	store() will accept one of
- *
- *	'platform'
- *	'shutdown'
- *	'reboot'
- *	'test'
- *	'testproc'
- *
- *	It will only change to 'platform' if the system
- *	supports it (as determined by having hibernation_ops).
- */
-
-static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
-			 char *buf)
-{
-	int i;
-	char *start = buf;
-
-	for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
-		if (!hibernation_modes[i])
-			continue;
-		switch (i) {
-		case HIBERNATION_SHUTDOWN:
-		case HIBERNATION_REBOOT:
-		case HIBERNATION_TEST:
-		case HIBERNATION_TESTPROC:
-			break;
-		case HIBERNATION_PLATFORM:
-			if (hibernation_ops)
-				break;
-			/* not a valid mode, continue with loop */
-			continue;
-		}
-		if (i == hibernation_mode)
-			buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
-		else
-			buf += sprintf(buf, "%s ", hibernation_modes[i]);
-	}
-	buf += sprintf(buf, "\n");
-	return buf-start;
-}
-
-
-static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
-			  const char *buf, size_t n)
-{
-	int error = 0;
-	int i;
-	int len;
-	char *p;
-	int mode = HIBERNATION_INVALID;
-
-	p = memchr(buf, '\n', n);
-	len = p ? p - buf : n;
-
-	mutex_lock(&pm_mutex);
-	for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
-		if (len == strlen(hibernation_modes[i])
-		    && !strncmp(buf, hibernation_modes[i], len)) {
-			mode = i;
-			break;
-		}
-	}
-	if (mode != HIBERNATION_INVALID) {
-		switch (mode) {
-		case HIBERNATION_SHUTDOWN:
-		case HIBERNATION_REBOOT:
-		case HIBERNATION_TEST:
-		case HIBERNATION_TESTPROC:
-			hibernation_mode = mode;
-			break;
-		case HIBERNATION_PLATFORM:
-			if (hibernation_ops)
-				hibernation_mode = mode;
-			else
-				error = -EINVAL;
-		}
-	} else
-		error = -EINVAL;
-
-	if (!error)
-		pr_debug("PM: Hibernation mode set to '%s'\n",
-			 hibernation_modes[mode]);
-	mutex_unlock(&pm_mutex);
-	return error ? error : n;
-}
-
-power_attr(disk);
-
-static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
-			   char *buf)
-{
-	return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
-		       MINOR(swsusp_resume_device));
-}
-
-static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
-			    const char *buf, size_t n)
-{
-	unsigned int maj, min;
-	dev_t res;
-	int ret = -EINVAL;
-
-	if (sscanf(buf, "%u:%u", &maj, &min) != 2)
-		goto out;
-
-	res = MKDEV(maj,min);
-	if (maj != MAJOR(res) || min != MINOR(res))
-		goto out;
-
-	mutex_lock(&pm_mutex);
-	swsusp_resume_device = res;
-	mutex_unlock(&pm_mutex);
-	printk(KERN_INFO "PM: Starting manual resume from disk\n");
-	noresume = 0;
-	software_resume();
-	ret = n;
- out:
-	return ret;
-}
-
-power_attr(resume);
-
-static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
-			       char *buf)
-{
-	return sprintf(buf, "%lu\n", image_size);
-}
-
-static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr,
-				const char *buf, size_t n)
-{
-	unsigned long size;
-
-	if (sscanf(buf, "%lu", &size) == 1) {
-		image_size = size;
-		return n;
-	}
-
-	return -EINVAL;
-}
-
-power_attr(image_size);
-
-static struct attribute * g[] = {
-	&disk_attr.attr,
-	&resume_attr.attr,
-	&image_size_attr.attr,
-	NULL,
-};
-
-
-static struct attribute_group attr_group = {
-	.attrs = g,
-};
-
-
-static int __init pm_disk_init(void)
-{
-	return sysfs_create_group(power_kobj, &attr_group);
-}
-
-core_initcall(pm_disk_init);
-
-
-static int __init resume_setup(char *str)
-{
-	if (noresume)
-		return 1;
-
-	strncpy( resume_file, str, 255 );
-	return 1;
-}
-
-static int __init resume_offset_setup(char *str)
-{
-	unsigned long long offset;
-
-	if (noresume)
-		return 1;
-
-	if (sscanf(str, "%llu", &offset) == 1)
-		swsusp_resume_block = offset;
-
-	return 1;
-}
-
-static int __init noresume_setup(char *str)
-{
-	noresume = 1;
-	return 1;
-}
-
-__setup("noresume", noresume_setup);
-__setup("resume_offset=", resume_offset_setup);
-__setup("resume=", resume_setup);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
new file mode 100644
index 00000000000..81d2e746489
--- /dev/null
+++ b/kernel/power/hibernate.c
@@ -0,0 +1,955 @@
+/*
+ * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/kmod.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pm.h>
+#include <linux/console.h>
+#include <linux/cpu.h>
+#include <linux/freezer.h>
+#include <scsi/scsi_scan.h>
+#include <asm/suspend.h>
+
+#include "power.h"
+
+
+static int noresume = 0;
+static char resume_file[256] = CONFIG_PM_STD_PARTITION;
+dev_t swsusp_resume_device;
+sector_t swsusp_resume_block;
+
+enum {
+	HIBERNATION_INVALID,
+	HIBERNATION_PLATFORM,
+	HIBERNATION_TEST,
+	HIBERNATION_TESTPROC,
+	HIBERNATION_SHUTDOWN,
+	HIBERNATION_REBOOT,
+	/* keep last */
+	__HIBERNATION_AFTER_LAST
+};
+#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
+#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
+
+static int hibernation_mode = HIBERNATION_SHUTDOWN;
+
+static struct platform_hibernation_ops *hibernation_ops;
+
+/**
+ * hibernation_set_ops - set the global hibernate operations
+ * @ops: the hibernation operations to use in subsequent hibernation transitions
+ */
+
+void hibernation_set_ops(struct platform_hibernation_ops *ops)
+{
+	if (ops && !(ops->begin && ops->end &&  ops->pre_snapshot
+	    && ops->prepare && ops->finish && ops->enter && ops->pre_restore
+	    && ops->restore_cleanup)) {
+		WARN_ON(1);
+		return;
+	}
+	mutex_lock(&pm_mutex);
+	hibernation_ops = ops;
+	if (ops)
+		hibernation_mode = HIBERNATION_PLATFORM;
+	else if (hibernation_mode == HIBERNATION_PLATFORM)
+		hibernation_mode = HIBERNATION_SHUTDOWN;
+
+	mutex_unlock(&pm_mutex);
+}
+
+static bool entering_platform_hibernation;
+
+bool system_entering_hibernation(void)
+{
+	return entering_platform_hibernation;
+}
+EXPORT_SYMBOL(system_entering_hibernation);
+
+#ifdef CONFIG_PM_DEBUG
+static void hibernation_debug_sleep(void)
+{
+	printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n");
+	mdelay(5000);
+}
+
+static int hibernation_testmode(int mode)
+{
+	if (hibernation_mode == mode) {
+		hibernation_debug_sleep();
+		return 1;
+	}
+	return 0;
+}
+
+static int hibernation_test(int level)
+{
+	if (pm_test_level == level) {
+		hibernation_debug_sleep();
+		return 1;
+	}
+	return 0;
+}
+#else /* !CONFIG_PM_DEBUG */
+static int hibernation_testmode(int mode) { return 0; }
+static int hibernation_test(int level) { return 0; }
+#endif /* !CONFIG_PM_DEBUG */
+
+/**
+ *	platform_begin - tell the platform driver that we're starting
+ *	hibernation
+ */
+
+static int platform_begin(int platform_mode)
+{
+	return (platform_mode && hibernation_ops) ?
+		hibernation_ops->begin() : 0;
+}
+
+/**
+ *	platform_end - tell the platform driver that we've entered the
+ *	working state
+ */
+
+static void platform_end(int platform_mode)
+{
+	if (platform_mode && hibernation_ops)
+		hibernation_ops->end();
+}
+
+/**
+ *	platform_pre_snapshot - prepare the machine for hibernation using the
+ *	platform driver if so configured and return an error code if it fails
+ */
+
+static int platform_pre_snapshot(int platform_mode)
+{
+	return (platform_mode && hibernation_ops) ?
+		hibernation_ops->pre_snapshot() : 0;
+}
+
+/**
+ *	platform_leave - prepare the machine for switching to the normal mode
+ *	of operation using the platform driver (called with interrupts disabled)
+ */
+
+static void platform_leave(int platform_mode)
+{
+	if (platform_mode && hibernation_ops)
+		hibernation_ops->leave();
+}
+
+/**
+ *	platform_finish - switch the machine to the normal mode of operation
+ *	using the platform driver (must be called after platform_prepare())
+ */
+
+static void platform_finish(int platform_mode)
+{
+	if (platform_mode && hibernation_ops)
+		hibernation_ops->finish();
+}
+
+/**
+ *	platform_pre_restore - prepare the platform for the restoration from a
+ *	hibernation image.  If the restore fails after this function has been
+ *	called, platform_restore_cleanup() must be called.
+ */
+
+static int platform_pre_restore(int platform_mode)
+{
+	return (platform_mode && hibernation_ops) ?
+		hibernation_ops->pre_restore() : 0;
+}
+
+/**
+ *	platform_restore_cleanup - switch the platform to the normal mode of
+ *	operation after a failing restore.  If platform_pre_restore() has been
+ *	called before the failing restore, this function must be called too,
+ *	regardless of the result of platform_pre_restore().
+ */
+
+static void platform_restore_cleanup(int platform_mode)
+{
+	if (platform_mode && hibernation_ops)
+		hibernation_ops->restore_cleanup();
+}
+
+/**
+ *	platform_recover - recover the platform from a failure to suspend
+ *	devices.
+ */
+
+static void platform_recover(int platform_mode)
+{
+	if (platform_mode && hibernation_ops && hibernation_ops->recover)
+		hibernation_ops->recover();
+}
+
+/**
+ *	create_image - freeze devices that need to be frozen with interrupts
+ *	off, create the hibernation image and thaw those devices.  Control
+ *	reappears in this routine after a restore.
+ */
+
+static int create_image(int platform_mode)
+{
+	int error;
+
+	error = arch_prepare_suspend();
+	if (error)
+		return error;
+
+	/* At this point, dpm_suspend_start() has been called, but *not*
+	 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
+	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
+	 * become desynchronized with the actual state of the hardware
+	 * at resume time, and evil weirdness ensues.
+	 */
+	error = dpm_suspend_noirq(PMSG_FREEZE);
+	if (error) {
+		printk(KERN_ERR "PM: Some devices failed to power down, "
+			"aborting hibernation\n");
+		return error;
+	}
+
+	error = platform_pre_snapshot(platform_mode);
+	if (error || hibernation_test(TEST_PLATFORM))
+		goto Platform_finish;
+
+	error = disable_nonboot_cpus();
+	if (error || hibernation_test(TEST_CPUS)
+	    || hibernation_testmode(HIBERNATION_TEST))
+		goto Enable_cpus;
+
+	local_irq_disable();
+
+	error = sysdev_suspend(PMSG_FREEZE);
+	if (error) {
+		printk(KERN_ERR "PM: Some system devices failed to power down, "
+			"aborting hibernation\n");
+		goto Enable_irqs;
+	}
+
+	if (hibernation_test(TEST_CORE))
+		goto Power_up;
+
+	in_suspend = 1;
+	save_processor_state();
+	error = swsusp_arch_suspend();
+	if (error)
+		printk(KERN_ERR "PM: Error %d creating hibernation image\n",
+			error);
+	/* Restore control flow magically appears here */
+	restore_processor_state();
+	if (!in_suspend)
+		platform_leave(platform_mode);
+
+ Power_up:
+	sysdev_resume();
+	/* NOTE:  dpm_resume_noirq() is just a resume() for devices
+	 * that suspended with irqs off ... no overall powerup.
+	 */
+
+ Enable_irqs:
+	local_irq_enable();
+
+ Enable_cpus:
+	enable_nonboot_cpus();
+
+ Platform_finish:
+	platform_finish(platform_mode);
+
+	dpm_resume_noirq(in_suspend ?
+		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+
+	return error;
+}
+
+/**
+ *	hibernation_snapshot - quiesce devices and create the hibernation
+ *	snapshot image.
+ *	@platform_mode - if set, use the platform driver, if available, to
+ *			 prepare the platform firmware for the power transition.
+ *
+ *	Must be called with pm_mutex held
+ */
+
+int hibernation_snapshot(int platform_mode)
+{
+	int error;
+
+	error = platform_begin(platform_mode);
+	if (error)
+		return error;
+
+	/* Free memory before shutting down devices. */
+	error = swsusp_shrink_memory();
+	if (error)
+		goto Close;
+
+	suspend_console();
+	error = dpm_suspend_start(PMSG_FREEZE);
+	if (error)
+		goto Recover_platform;
+
+	if (hibernation_test(TEST_DEVICES))
+		goto Recover_platform;
+
+	error = create_image(platform_mode);
+	/* Control returns here after successful restore */
+
+ Resume_devices:
+	dpm_resume_end(in_suspend ?
+		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+	resume_console();
+ Close:
+	platform_end(platform_mode);
+	return error;
+
+ Recover_platform:
+	platform_recover(platform_mode);
+	goto Resume_devices;
+}
+
+/**
+ *	resume_target_kernel - prepare devices that need to be suspended with
+ *	interrupts off, restore the contents of highmem that have not been
+ *	restored yet from the image and run the low level code that will restore
+ *	the remaining contents of memory and switch to the just restored target
+ *	kernel.
+ */
+
+static int resume_target_kernel(bool platform_mode)
+{
+	int error;
+
+	error = dpm_suspend_noirq(PMSG_QUIESCE);
+	if (error) {
+		printk(KERN_ERR "PM: Some devices failed to power down, "
+			"aborting resume\n");
+		return error;
+	}
+
+	error = platform_pre_restore(platform_mode);
+	if (error)
+		goto Cleanup;
+
+	error = disable_nonboot_cpus();
+	if (error)
+		goto Enable_cpus;
+
+	local_irq_disable();
+
+	error = sysdev_suspend(PMSG_QUIESCE);
+	if (error)
+		goto Enable_irqs;
+
+	/* We'll ignore saved state, but this gets preempt count (etc) right */
+	save_processor_state();
+	error = restore_highmem();
+	if (!error) {
+		error = swsusp_arch_resume();
+		/*
+		 * The code below is only ever reached in case of a failure.
+		 * Otherwise execution continues at place where
+		 * swsusp_arch_suspend() was called
+		 */
+		BUG_ON(!error);
+		/* This call to restore_highmem() undos the previous one */
+		restore_highmem();
+	}
+	/*
+	 * The only reason why swsusp_arch_resume() can fail is memory being
+	 * very tight, so we have to free it as soon as we can to avoid
+	 * subsequent failures
+	 */
+	swsusp_free();
+	restore_processor_state();
+	touch_softlockup_watchdog();
+
+	sysdev_resume();
+
+ Enable_irqs:
+	local_irq_enable();
+
+ Enable_cpus:
+	enable_nonboot_cpus();
+
+ Cleanup:
+	platform_restore_cleanup(platform_mode);
+
+	dpm_resume_noirq(PMSG_RECOVER);
+
+	return error;
+}
+
+/**
+ *	hibernation_restore - quiesce devices and restore the hibernation
+ *	snapshot image.  If successful, control returns in hibernation_snapshot()
+ *	@platform_mode - if set, use the platform driver, if available, to
+ *			 prepare the platform firmware for the transition.
+ *
+ *	Must be called with pm_mutex held
+ */
+
+int hibernation_restore(int platform_mode)
+{
+	int error;
+
+	pm_prepare_console();
+	suspend_console();
+	error = dpm_suspend_start(PMSG_QUIESCE);
+	if (!error) {
+		error = resume_target_kernel(platform_mode);
+		dpm_resume_end(PMSG_RECOVER);
+	}
+	resume_console();
+	pm_restore_console();
+	return error;
+}
+
+/**
+ *	hibernation_platform_enter - enter the hibernation state using the
+ *	platform driver (if available)
+ */
+
+int hibernation_platform_enter(void)
+{
+	int error;
+
+	if (!hibernation_ops)
+		return -ENOSYS;
+
+	/*
+	 * We have cancelled the power transition by running
+	 * hibernation_ops->finish() before saving the image, so we should let
+	 * the firmware know that we're going to enter the sleep state after all
+	 */
+	error = hibernation_ops->begin();
+	if (error)
+		goto Close;
+
+	entering_platform_hibernation = true;
+	suspend_console();
+	error = dpm_suspend_start(PMSG_HIBERNATE);
+	if (error) {
+		if (hibernation_ops->recover)
+			hibernation_ops->recover();
+		goto Resume_devices;
+	}
+
+	error = dpm_suspend_noirq(PMSG_HIBERNATE);
+	if (error)
+		goto Resume_devices;
+
+	error = hibernation_ops->prepare();
+	if (error)
+		goto Platofrm_finish;
+
+	error = disable_nonboot_cpus();
+	if (error)
+		goto Platofrm_finish;
+
+	local_irq_disable();
+	sysdev_suspend(PMSG_HIBERNATE);
+	hibernation_ops->enter();
+	/* We should never get here */
+	while (1);
+
+	/*
+	 * We don't need to reenable the nonboot CPUs or resume consoles, since
+	 * the system is going to be halted anyway.
+	 */
+ Platofrm_finish:
+	hibernation_ops->finish();
+
+	dpm_suspend_noirq(PMSG_RESTORE);
+
+ Resume_devices:
+	entering_platform_hibernation = false;
+	dpm_resume_end(PMSG_RESTORE);
+	resume_console();
+
+ Close:
+	hibernation_ops->end();
+
+	return error;
+}
+
+/**
+ *	power_down - Shut the machine down for hibernation.
+ *
+ *	Use the platform driver, if configured so; otherwise try
+ *	to power off or reboot.
+ */
+
+static void power_down(void)
+{
+	switch (hibernation_mode) {
+	case HIBERNATION_TEST:
+	case HIBERNATION_TESTPROC:
+		break;
+	case HIBERNATION_REBOOT:
+		kernel_restart(NULL);
+		break;
+	case HIBERNATION_PLATFORM:
+		hibernation_platform_enter();
+	case HIBERNATION_SHUTDOWN:
+		kernel_power_off();
+		break;
+	}
+	kernel_halt();
+	/*
+	 * Valid image is on the disk, if we continue we risk serious data
+	 * corruption after resume.
+	 */
+	printk(KERN_CRIT "PM: Please power down manually\n");
+	while(1);
+}
+
+static int prepare_processes(void)
+{
+	int error = 0;
+
+	if (freeze_processes()) {
+		error = -EBUSY;
+		thaw_processes();
+	}
+	return error;
+}
+
+/**
+ *	hibernate - The granpappy of the built-in hibernation management
+ */
+
+int hibernate(void)
+{
+	int error;
+
+	mutex_lock(&pm_mutex);
+	/* The snapshot device should not be opened while we're running */
+	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+		error = -EBUSY;
+		goto Unlock;
+	}
+
+	pm_prepare_console();
+	error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+	if (error)
+		goto Exit;
+
+	error = usermodehelper_disable();
+	if (error)
+		goto Exit;
+
+	/* Allocate memory management structures */
+	error = create_basic_memory_bitmaps();
+	if (error)
+		goto Exit;
+
+	printk(KERN_INFO "PM: Syncing filesystems ... ");
+	sys_sync();
+	printk("done.\n");
+
+	error = prepare_processes();
+	if (error)
+		goto Finish;
+
+	if (hibernation_test(TEST_FREEZER))
+		goto Thaw;
+
+	if (hibernation_testmode(HIBERNATION_TESTPROC))
+		goto Thaw;
+
+	error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
+	if (in_suspend && !error) {
+		unsigned int flags = 0;
+
+		if (hibernation_mode == HIBERNATION_PLATFORM)
+			flags |= SF_PLATFORM_MODE;
+		pr_debug("PM: writing image.\n");
+		error = swsusp_write(flags);
+		swsusp_free();
+		if (!error)
+			power_down();
+	} else {
+		pr_debug("PM: Image restored successfully.\n");
+		swsusp_free();
+	}
+ Thaw:
+	thaw_processes();
+ Finish:
+	free_basic_memory_bitmaps();
+	usermodehelper_enable();
+ Exit:
+	pm_notifier_call_chain(PM_POST_HIBERNATION);
+	pm_restore_console();
+	atomic_inc(&snapshot_device_available);
+ Unlock:
+	mutex_unlock(&pm_mutex);
+	return error;
+}
+
+
+/**
+ *	software_resume - Resume from a saved image.
+ *
+ *	Called as a late_initcall (so all devices are discovered and
+ *	initialized), we call swsusp to see if we have a saved image or not.
+ *	If so, we quiesce devices, then restore the saved image. We will
+ *	return above (in hibernate() ) if everything goes well.
+ *	Otherwise, we fail gracefully and return to the normally
+ *	scheduled program.
+ *
+ */
+
+static int software_resume(void)
+{
+	int error;
+	unsigned int flags;
+
+	/*
+	 * If the user said "noresume".. bail out early.
+	 */
+	if (noresume)
+		return 0;
+
+	/*
+	 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
+	 * is configured into the kernel. Since the regular hibernate
+	 * trigger path is via sysfs which takes a buffer mutex before
+	 * calling hibernate functions (which take pm_mutex) this can
+	 * cause lockdep to complain about a possible ABBA deadlock
+	 * which cannot happen since we're in the boot code here and
+	 * sysfs can't be invoked yet. Therefore, we use a subclass
+	 * here to avoid lockdep complaining.
+	 */
+	mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
+
+	if (swsusp_resume_device)
+		goto Check_image;
+
+	if (!strlen(resume_file)) {
+		error = -ENOENT;
+		goto Unlock;
+	}
+
+	pr_debug("PM: Checking image partition %s\n", resume_file);
+
+	/* Check if the device is there */
+	swsusp_resume_device = name_to_dev_t(resume_file);
+	if (!swsusp_resume_device) {
+		/*
+		 * Some device discovery might still be in progress; we need
+		 * to wait for this to finish.
+		 */
+		wait_for_device_probe();
+		/*
+		 * We can't depend on SCSI devices being available after loading
+		 * one of their modules until scsi_complete_async_scans() is
+		 * called and the resume device usually is a SCSI one.
+		 */
+		scsi_complete_async_scans();
+
+		swsusp_resume_device = name_to_dev_t(resume_file);
+		if (!swsusp_resume_device) {
+			error = -ENODEV;
+			goto Unlock;
+		}
+	}
+
+ Check_image:
+	pr_debug("PM: Resume from partition %d:%d\n",
+		MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
+
+	pr_debug("PM: Checking hibernation image.\n");
+	error = swsusp_check();
+	if (error)
+		goto Unlock;
+
+	/* The snapshot device should not be opened while we're running */
+	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+		error = -EBUSY;
+		goto Unlock;
+	}
+
+	pm_prepare_console();
+	error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+	if (error)
+		goto Finish;
+
+	error = usermodehelper_disable();
+	if (error)
+		goto Finish;
+
+	error = create_basic_memory_bitmaps();
+	if (error)
+		goto Finish;
+
+	pr_debug("PM: Preparing processes for restore.\n");
+	error = prepare_processes();
+	if (error) {
+		swsusp_close(FMODE_READ);
+		goto Done;
+	}
+
+	pr_debug("PM: Reading hibernation image.\n");
+
+	error = swsusp_read(&flags);
+	if (!error)
+		hibernation_restore(flags & SF_PLATFORM_MODE);
+
+	printk(KERN_ERR "PM: Restore failed, recovering.\n");
+	swsusp_free();
+	thaw_processes();
+ Done:
+	free_basic_memory_bitmaps();
+	usermodehelper_enable();
+ Finish:
+	pm_notifier_call_chain(PM_POST_RESTORE);
+	pm_restore_console();
+	atomic_inc(&snapshot_device_available);
+	/* For success case, the suspend path will release the lock */
+ Unlock:
+	mutex_unlock(&pm_mutex);
+	pr_debug("PM: Resume from disk failed.\n");
+	return error;
+}
+
+late_initcall(software_resume);
+
+
+static const char * const hibernation_modes[] = {
+	[HIBERNATION_PLATFORM]	= "platform",
+	[HIBERNATION_SHUTDOWN]	= "shutdown",
+	[HIBERNATION_REBOOT]	= "reboot",
+	[HIBERNATION_TEST]	= "test",
+	[HIBERNATION_TESTPROC]	= "testproc",
+};
+
+/**
+ *	disk - Control hibernation mode
+ *
+ *	Suspend-to-disk can be handled in several ways. We have a few options
+ *	for putting the system to sleep - using the platform driver (e.g. ACPI
+ *	or other hibernation_ops), powering off the system or rebooting the
+ *	system (for testing) as well as the two test modes.
+ *
+ *	The system can support 'platform', and that is known a priori (and
+ *	encoded by the presence of hibernation_ops). However, the user may
+ *	choose 'shutdown' or 'reboot' as alternatives, as well as one of the
+ *	test modes, 'test' or 'testproc'.
+ *
+ *	show() will display what the mode is currently set to.
+ *	store() will accept one of
+ *
+ *	'platform'
+ *	'shutdown'
+ *	'reboot'
+ *	'test'
+ *	'testproc'
+ *
+ *	It will only change to 'platform' if the system
+ *	supports it (as determined by having hibernation_ops).
+ */
+
+static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
+			 char *buf)
+{
+	int i;
+	char *start = buf;
+
+	for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+		if (!hibernation_modes[i])
+			continue;
+		switch (i) {
+		case HIBERNATION_SHUTDOWN:
+		case HIBERNATION_REBOOT:
+		case HIBERNATION_TEST:
+		case HIBERNATION_TESTPROC:
+			break;
+		case HIBERNATION_PLATFORM:
+			if (hibernation_ops)
+				break;
+			/* not a valid mode, continue with loop */
+			continue;
+		}
+		if (i == hibernation_mode)
+			buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
+		else
+			buf += sprintf(buf, "%s ", hibernation_modes[i]);
+	}
+	buf += sprintf(buf, "\n");
+	return buf-start;
+}
+
+
+static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
+			  const char *buf, size_t n)
+{
+	int error = 0;
+	int i;
+	int len;
+	char *p;
+	int mode = HIBERNATION_INVALID;
+
+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;
+
+	mutex_lock(&pm_mutex);
+	for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+		if (len == strlen(hibernation_modes[i])
+		    && !strncmp(buf, hibernation_modes[i], len)) {
+			mode = i;
+			break;
+		}
+	}
+	if (mode != HIBERNATION_INVALID) {
+		switch (mode) {
+		case HIBERNATION_SHUTDOWN:
+		case HIBERNATION_REBOOT:
+		case HIBERNATION_TEST:
+		case HIBERNATION_TESTPROC:
+			hibernation_mode = mode;
+			break;
+		case HIBERNATION_PLATFORM:
+			if (hibernation_ops)
+				hibernation_mode = mode;
+			else
+				error = -EINVAL;
+		}
+	} else
+		error = -EINVAL;
+
+	if (!error)
+		pr_debug("PM: Hibernation mode set to '%s'\n",
+			 hibernation_modes[mode]);
+	mutex_unlock(&pm_mutex);
+	return error ? error : n;
+}
+
+power_attr(disk);
+
+static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
+			   char *buf)
+{
+	return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
+		       MINOR(swsusp_resume_device));
+}
+
+static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
+			    const char *buf, size_t n)
+{
+	unsigned int maj, min;
+	dev_t res;
+	int ret = -EINVAL;
+
+	if (sscanf(buf, "%u:%u", &maj, &min) != 2)
+		goto out;
+
+	res = MKDEV(maj,min);
+	if (maj != MAJOR(res) || min != MINOR(res))
+		goto out;
+
+	mutex_lock(&pm_mutex);
+	swsusp_resume_device = res;
+	mutex_unlock(&pm_mutex);
+	printk(KERN_INFO "PM: Starting manual resume from disk\n");
+	noresume = 0;
+	software_resume();
+	ret = n;
+ out:
+	return ret;
+}
+
+power_attr(resume);
+
+static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
+			       char *buf)
+{
+	return sprintf(buf, "%lu\n", image_size);
+}
+
+static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr,
+				const char *buf, size_t n)
+{
+	unsigned long size;
+
+	if (sscanf(buf, "%lu", &size) == 1) {
+		image_size = size;
+		return n;
+	}
+
+	return -EINVAL;
+}
+
+power_attr(image_size);
+
+static struct attribute * g[] = {
+	&disk_attr.attr,
+	&resume_attr.attr,
+	&image_size_attr.attr,
+	NULL,
+};
+
+
+static struct attribute_group attr_group = {
+	.attrs = g,
+};
+
+
+static int __init pm_disk_init(void)
+{
+	return sysfs_create_group(power_kobj, &attr_group);
+}
+
+core_initcall(pm_disk_init);
+
+
+static int __init resume_setup(char *str)
+{
+	if (noresume)
+		return 1;
+
+	strncpy( resume_file, str, 255 );
+	return 1;
+}
+
+static int __init resume_offset_setup(char *str)
+{
+	unsigned long long offset;
+
+	if (noresume)
+		return 1;
+
+	if (sscanf(str, "%llu", &offset) == 1)
+		swsusp_resume_block = offset;
+
+	return 1;
+}
+
+static int __init noresume_setup(char *str)
+{
+	noresume = 1;
+	return 1;
+}
+
+__setup("noresume", noresume_setup);
+__setup("resume_offset=", resume_offset_setup);
+__setup("resume=", resume_setup);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 2bd98d9fc19..26d5a26f82e 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -45,7 +45,7 @@ static inline char *check_image_kernel(struct swsusp_info *info)
  */
 #define SPARE_PAGES	((1024 * 1024) >> PAGE_SHIFT)
 
-/* kernel/power/disk.c */
+/* kernel/power/hibernate.c */
 extern int hibernation_snapshot(int platform_mode);
 extern int hibernation_restore(int platform_mode);
 extern int hibernation_platform_enter(void);
@@ -147,7 +147,7 @@ extern int swsusp_swap_in_use(void);
  */
 #define SF_PLATFORM_MODE	1
 
-/* kernel/power/disk.c */
+/* kernel/power/hibernate.c */
 extern int swsusp_check(void);
 extern void swsusp_free(void);
 extern int swsusp_read(unsigned int *flags_p);
-- 
cgit v1.2.3-70-g09d2


From fce2b111fae9151a53dabb36513b398d03337a19 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cornelia.huck@de.ibm.com>
Date: Wed, 10 Jun 2009 01:28:19 +0200
Subject: PM/Hibernate: Move NVS routines into a separate file (v2).

The *_nvs_* routines in swsusp.c make use of the io*map()
functions, which are only provided for HAS_IOMEM, thus
breaking compilation if HAS_IOMEM is not set. Fix this
by moving the *_nvs_* routines into hibernate_nvs.c, which
is only compiled if HAS_IOMEM is set.

[rjw: Change the name of the new file to hibernate_nvs.c, add the
 license line to the header comment.]

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 include/linux/suspend.h      |  18 +++---
 kernel/power/Kconfig         |   4 ++
 kernel/power/Makefile        |   1 +
 kernel/power/hibernate_nvs.c | 135 +++++++++++++++++++++++++++++++++++++++++++
 kernel/power/swsusp.c        | 122 --------------------------------------
 5 files changed, 151 insertions(+), 129 deletions(-)
 create mode 100644 kernel/power/hibernate_nvs.c

(limited to 'kernel')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 795032edfc4..cd15df6c63c 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -245,11 +245,6 @@ extern unsigned long get_safe_page(gfp_t gfp_mask);
 
 extern void hibernation_set_ops(struct platform_hibernation_ops *ops);
 extern int hibernate(void);
-extern int hibernate_nvs_register(unsigned long start, unsigned long size);
-extern int hibernate_nvs_alloc(void);
-extern void hibernate_nvs_free(void);
-extern void hibernate_nvs_save(void);
-extern void hibernate_nvs_restore(void);
 extern bool system_entering_hibernation(void);
 #else /* CONFIG_HIBERNATION */
 static inline int swsusp_page_is_forbidden(struct page *p) { return 0; }
@@ -258,6 +253,16 @@ static inline void swsusp_unset_page_free(struct page *p) {}
 
 static inline void hibernation_set_ops(struct platform_hibernation_ops *ops) {}
 static inline int hibernate(void) { return -ENOSYS; }
+static inline bool system_entering_hibernation(void) { return false; }
+#endif /* CONFIG_HIBERNATION */
+
+#ifdef CONFIG_HIBERNATION_NVS
+extern int hibernate_nvs_register(unsigned long start, unsigned long size);
+extern int hibernate_nvs_alloc(void);
+extern void hibernate_nvs_free(void);
+extern void hibernate_nvs_save(void);
+extern void hibernate_nvs_restore(void);
+#else /* CONFIG_HIBERNATION_NVS */
 static inline int hibernate_nvs_register(unsigned long a, unsigned long b)
 {
 	return 0;
@@ -266,8 +271,7 @@ static inline int hibernate_nvs_alloc(void) { return 0; }
 static inline void hibernate_nvs_free(void) {}
 static inline void hibernate_nvs_save(void) {}
 static inline void hibernate_nvs_restore(void) {}
-static inline bool system_entering_hibernation(void) { return false; }
-#endif /* CONFIG_HIBERNATION */
+#endif /* CONFIG_HIBERNATION_NVS */
 
 #ifdef CONFIG_PM_SLEEP
 void save_processor_state(void);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 23bd4daeb96..72067cbdb37 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -116,9 +116,13 @@ config SUSPEND_FREEZER
 
 	  Turning OFF this setting is NOT recommended! If in doubt, say Y.
 
+config HIBERNATION_NVS
+	bool
+
 config HIBERNATION
 	bool "Hibernation (aka 'suspend to disk')"
 	depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
+	select HIBERNATION_NVS if HAS_IOMEM
 	---help---
 	  Enable the suspend to disk (STD) functionality, which is usually
 	  called "hibernation" in user interfaces.  STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index eadb17fc8f5..c3b81c30e5d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -9,5 +9,6 @@ obj-$(CONFIG_FREEZER)		+= process.o
 obj-$(CONFIG_SUSPEND)		+= suspend.o
 obj-$(CONFIG_PM_TEST_SUSPEND)	+= suspend_test.o
 obj-$(CONFIG_HIBERNATION)	+= swsusp.o hibernate.o snapshot.o swap.o user.o
+obj-$(CONFIG_HIBERNATION_NVS)	+= hibernate_nvs.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
new file mode 100644
index 00000000000..39ac698ef83
--- /dev/null
+++ b/kernel/power/hibernate_nvs.c
@@ -0,0 +1,135 @@
+/*
+ * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
+ *
+ * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+
+/*
+ * Platforms, like ACPI, may want us to save some memory used by them during
+ * hibernation and to restore the contents of this memory during the subsequent
+ * resume.  The code below implements a mechanism allowing us to do that.
+ */
+
+struct nvs_page {
+	unsigned long phys_start;
+	unsigned int size;
+	void *kaddr;
+	void *data;
+	struct list_head node;
+};
+
+static LIST_HEAD(nvs_list);
+
+/**
+ *	hibernate_nvs_register - register platform NVS memory region to save
+ *	@start - physical address of the region
+ *	@size - size of the region
+ *
+ *	The NVS region need not be page-aligned (both ends) and we arrange
+ *	things so that the data from page-aligned addresses in this region will
+ *	be copied into separate RAM pages.
+ */
+int hibernate_nvs_register(unsigned long start, unsigned long size)
+{
+	struct nvs_page *entry, *next;
+
+	while (size > 0) {
+		unsigned int nr_bytes;
+
+		entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
+		if (!entry)
+			goto Error;
+
+		list_add_tail(&entry->node, &nvs_list);
+		entry->phys_start = start;
+		nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
+		entry->size = (size < nr_bytes) ? size : nr_bytes;
+
+		start += entry->size;
+		size -= entry->size;
+	}
+	return 0;
+
+ Error:
+	list_for_each_entry_safe(entry, next, &nvs_list, node) {
+		list_del(&entry->node);
+		kfree(entry);
+	}
+	return -ENOMEM;
+}
+
+/**
+ *	hibernate_nvs_free - free data pages allocated for saving NVS regions
+ */
+void hibernate_nvs_free(void)
+{
+	struct nvs_page *entry;
+
+	list_for_each_entry(entry, &nvs_list, node)
+		if (entry->data) {
+			free_page((unsigned long)entry->data);
+			entry->data = NULL;
+			if (entry->kaddr) {
+				iounmap(entry->kaddr);
+				entry->kaddr = NULL;
+			}
+		}
+}
+
+/**
+ *	hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
+ */
+int hibernate_nvs_alloc(void)
+{
+	struct nvs_page *entry;
+
+	list_for_each_entry(entry, &nvs_list, node) {
+		entry->data = (void *)__get_free_page(GFP_KERNEL);
+		if (!entry->data) {
+			hibernate_nvs_free();
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+/**
+ *	hibernate_nvs_save - save NVS memory regions
+ */
+void hibernate_nvs_save(void)
+{
+	struct nvs_page *entry;
+
+	printk(KERN_INFO "PM: Saving platform NVS memory\n");
+
+	list_for_each_entry(entry, &nvs_list, node)
+		if (entry->data) {
+			entry->kaddr = ioremap(entry->phys_start, entry->size);
+			memcpy(entry->data, entry->kaddr, entry->size);
+		}
+}
+
+/**
+ *	hibernate_nvs_restore - restore NVS memory regions
+ *
+ *	This function is going to be called with interrupts disabled, so it
+ *	cannot iounmap the virtual addresses used to access the NVS region.
+ */
+void hibernate_nvs_restore(void)
+{
+	struct nvs_page *entry;
+
+	printk(KERN_INFO "PM: Restoring platform NVS memory\n");
+
+	list_for_each_entry(entry, &nvs_list, node)
+		if (entry->data)
+			memcpy(entry->kaddr, entry->data, entry->size);
+}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 87b901cb392..6a07f4dbf2f 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -186,125 +186,3 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
 			centisecs / 100, centisecs % 100,
 			kps / 1000, (kps % 1000) / 10);
 }
-
-/*
- * Platforms, like ACPI, may want us to save some memory used by them during
- * hibernation and to restore the contents of this memory during the subsequent
- * resume.  The code below implements a mechanism allowing us to do that.
- */
-
-struct nvs_page {
-	unsigned long phys_start;
-	unsigned int size;
-	void *kaddr;
-	void *data;
-	struct list_head node;
-};
-
-static LIST_HEAD(nvs_list);
-
-/**
- *	hibernate_nvs_register - register platform NVS memory region to save
- *	@start - physical address of the region
- *	@size - size of the region
- *
- *	The NVS region need not be page-aligned (both ends) and we arrange
- *	things so that the data from page-aligned addresses in this region will
- *	be copied into separate RAM pages.
- */
-int hibernate_nvs_register(unsigned long start, unsigned long size)
-{
-	struct nvs_page *entry, *next;
-
-	while (size > 0) {
-		unsigned int nr_bytes;
-
-		entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
-		if (!entry)
-			goto Error;
-
-		list_add_tail(&entry->node, &nvs_list);
-		entry->phys_start = start;
-		nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
-		entry->size = (size < nr_bytes) ? size : nr_bytes;
-
-		start += entry->size;
-		size -= entry->size;
-	}
-	return 0;
-
- Error:
-	list_for_each_entry_safe(entry, next, &nvs_list, node) {
-		list_del(&entry->node);
-		kfree(entry);
-	}
-	return -ENOMEM;
-}
-
-/**
- *	hibernate_nvs_free - free data pages allocated for saving NVS regions
- */
-void hibernate_nvs_free(void)
-{
-	struct nvs_page *entry;
-
-	list_for_each_entry(entry, &nvs_list, node)
-		if (entry->data) {
-			free_page((unsigned long)entry->data);
-			entry->data = NULL;
-			if (entry->kaddr) {
-				iounmap(entry->kaddr);
-				entry->kaddr = NULL;
-			}
-		}
-}
-
-/**
- *	hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
- */
-int hibernate_nvs_alloc(void)
-{
-	struct nvs_page *entry;
-
-	list_for_each_entry(entry, &nvs_list, node) {
-		entry->data = (void *)__get_free_page(GFP_KERNEL);
-		if (!entry->data) {
-			hibernate_nvs_free();
-			return -ENOMEM;
-		}
-	}
-	return 0;
-}
-
-/**
- *	hibernate_nvs_save - save NVS memory regions
- */
-void hibernate_nvs_save(void)
-{
-	struct nvs_page *entry;
-
-	printk(KERN_INFO "PM: Saving platform NVS memory\n");
-
-	list_for_each_entry(entry, &nvs_list, node)
-		if (entry->data) {
-			entry->kaddr = ioremap(entry->phys_start, entry->size);
-			memcpy(entry->data, entry->kaddr, entry->size);
-		}
-}
-
-/**
- *	hibernate_nvs_restore - restore NVS memory regions
- *
- *	This function is going to be called with interrupts disabled, so it
- *	cannot iounmap the virtual addresses used to access the NVS region.
- */
-void hibernate_nvs_restore(void)
-{
-	struct nvs_page *entry;
-
-	printk(KERN_INFO "PM: Restoring platform NVS memory\n");
-
-	list_for_each_entry(entry, &nvs_list, node)
-		if (entry->data)
-			memcpy(entry->kaddr, entry->data, entry->size);
-}
-- 
cgit v1.2.3-70-g09d2


From cd6d95d8449b7c9f415f26041e9ae173d387b6bd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 12 Jun 2009 11:29:27 +0200
Subject: clocksource: prevent selection of low resolution clocksource also for
 nohz=on

commit 3f68535adad (clocksource: sanity check sysfs clocksource
changes) prevents selection of non high resolution capable
clocksources when high resolution mode is active, but did not take
into account that the same rules apply for highres=off nohz=on.

Check the tick device mode instead of hrtimer_hres_active() to verify
whether the system needs to be protected from a switch to jiffies or
other non highres capable clock sources.

Reported-by: Luming Yu <luming.yu@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h    |  2 +-
 include/linux/tick.h       |  3 +++
 kernel/hrtimer.c           |  4 ++--
 kernel/time/clocksource.c  | 18 ++++++++++--------
 kernel/time/tick-oneshot.c | 17 +++++++++++++++++
 5 files changed, 33 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 58021b0c396..0d2f7c8a33d 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -305,7 +305,7 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 
 extern ktime_t ktime_get(void);
 extern ktime_t ktime_get_real(void);
-extern int hrtimer_hres_active(void);
+
 
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 469b82d88b3..0482229c07d 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -97,10 +97,12 @@ extern void tick_clock_notify(void);
 extern int tick_check_oneshot_change(int allow_nohz);
 extern struct tick_sched *tick_get_tick_sched(int cpu);
 extern void tick_check_idle(int cpu);
+extern int tick_oneshot_mode_active(void);
 # else
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
 static inline void tick_check_idle(int cpu) { }
+static inline int tick_oneshot_mode_active(void) { return 0; }
 # endif
 
 #else /* CONFIG_GENERIC_CLOCKEVENTS */
@@ -109,6 +111,7 @@ static inline void tick_cancel_sched_timer(int cpu) { }
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
 static inline void tick_check_idle(int cpu) { }
+static inline int tick_oneshot_mode_active(void) { return 0; }
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
 # ifdef CONFIG_NO_HZ
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1a70c18cdff..cb8a15c1958 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -476,7 +476,7 @@ static inline int hrtimer_is_hres_enabled(void)
 /*
  * Is the high resolution mode active ?
  */
-int hrtimer_hres_active(void)
+static inline int hrtimer_hres_active(void)
 {
 	return __get_cpu_var(hrtimer_bases).hres_active;
 }
@@ -704,7 +704,7 @@ static int hrtimer_switch_to_hres(void)
 
 #else
 
-int hrtimer_hres_active(void) { return 0; }
+static inline int hrtimer_hres_active(void) { return 0; }
 static inline int hrtimer_is_hres_enabled(void) { return 0; }
 static inline int hrtimer_switch_to_hres(void) { return 0; }
 static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 18b9f5da4ee..592bf584d1d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -30,7 +30,6 @@
 #include <linux/module.h>
 #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
 #include <linux/tick.h>
-#include <linux/hrtimer.h>
 
 void timecounter_init(struct timecounter *tc,
 		      const struct cyclecounter *cc,
@@ -511,13 +510,13 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
 	}
 
 	/*
-	 * Check to make sure we don't switch to a non-HRT usable
-	 * clocksource if HRT is enabled and running
+	 * Check to make sure we don't switch to a non-highres capable
+	 * clocksource if the tick code is in oneshot mode (highres or nohz)
 	 */
-	if (hrtimer_hres_active() &&
+	if (tick_oneshot_mode_active() &&
 	    !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
 		printk(KERN_WARNING "%s clocksource is not HRT compatible. "
-			"Cannot switch while in HRT mode\n", ovr->name);
+			"Cannot switch while in HRT/NOHZ mode\n", ovr->name);
 		ovr = NULL;
 		override_name[0] = 0;
 	}
@@ -550,9 +549,12 @@ sysfs_show_available_clocksources(struct sys_device *dev,
 
 	spin_lock_irq(&clocksource_lock);
 	list_for_each_entry(src, &clocksource_list, list) {
-		/* Don't show non-HRES clocksource if HRES is enabled */
-		if (!hrtimer_hres_active() ||
-				(src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
+		/*
+		 * Don't show non-HRES clocksource if the tick code is
+		 * in one shot mode (highres=on or nohz=on)
+		 */
+		if (!tick_oneshot_mode_active() ||
+		    (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
 			count += snprintf(buf + count,
 				  max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
 				  "%s ", src->name);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e8de678e76..a96c0e2b89c 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -128,6 +128,23 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
 	return 0;
 }
 
+/**
+ * tick_oneshot_mode_active - check whether the system is in oneshot mode
+ *
+ * Returns 1 when either nohz or highres is enabled, otherwise 0.
+ */
+int tick_oneshot_mode_active(void)
+{
+	unsigned long flags;
+	int ret;
+
+	local_irq_save(flags);
+	ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT;
+	local_irq_restore(flags);
+
+	return ret;
+}
+
 #ifdef CONFIG_HIGH_RES_TIMERS
 /**
  * tick_init_highres - switch to high resolution mode
-- 
cgit v1.2.3-70-g09d2


From 5fd29d6ccbc98884569d6f3105aeca70858b3e0f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 16 Jun 2009 10:57:02 -0700
Subject: printk: clean up handling of log-levels and newlines

It used to be that we would only look at the log-level in a printk()
after explicit newlines, which can cause annoying problems when the
previous printk() did not end with a '\n'. In that case, the log-level
marker would be just printed out in the middle of the line, and be
seen as just noise rather than change the logging level.

This changes things to always look at the log-level in the first
bytes of the printout. If a log level marker is found, it is always
used as the log-level. Additionally, if no newline existed, one is
added (unless the log-level is the explicit KERN_CONT marker, to
explicitly show that it's a continuation of a previous line).

Acked-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h |  2 +-
 kernel/printk.c        | 31 ++++++++++++++++++++++---------
 2 files changed, 23 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 883cd44ff76..066bb1eddfe 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -102,7 +102,7 @@ extern const char linux_proc_banner[];
  * line that had no enclosing \n). Only to be used by core/arch code
  * during early bootup (a continued line is not SMP-safe otherwise).
  */
-#define	KERN_CONT	""
+#define	KERN_CONT	"<c>"
 
 extern int console_printk[];
 
diff --git a/kernel/printk.c b/kernel/printk.c
index 5052b5497c6..a87770ce73a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -687,20 +687,33 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 				  sizeof(printk_buf) - printed_len, fmt, args);
 
 
+	p = printk_buf;
+
+	/* Do we have a loglevel in the string? */
+	if (p[0] == '<') {
+		unsigned char c = p[1];
+		if (c && p[2] == '>') {
+			switch (c) {
+			case '0' ... '7': /* loglevel */
+				current_log_level = c - '0';
+				if (!new_text_line) {
+					emit_log_char('\n');
+					new_text_line = 1;
+				}
+			/* Fallthrough - skip the loglevel */
+			case 'c': /* KERN_CONT */
+				p += 3;
+				break;
+			}
+		}
+	}
+
 	/*
 	 * Copy the output into log_buf.  If the caller didn't provide
 	 * appropriate log level tags, we insert them here
 	 */
-	for (p = printk_buf; *p; p++) {
+	for ( ; *p; p++) {
 		if (new_text_line) {
-			/* If a token, set current_log_level and skip over */
-			if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
-			    p[2] == '>') {
-				current_log_level = p[1] - '0';
-				p += 3;
-				printed_len -= 3;
-			}
-
 			/* Always output the token */
 			emit_log_char('<');
 			emit_log_char(current_log_level + '0');
-- 
cgit v1.2.3-70-g09d2


From e28d713704117bca0820c732210df6075b09f13b Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 16 Jun 2009 11:02:28 -0700
Subject: printk: Add KERN_DEFAULT printk log-level

This adds a KERN_DEFAULT loglevel marker, for when you cannot decide
which loglevel you want, and just want to keep an existing printk
with the default loglevel.

The difference between having KERN_DEFAULT and having no log-level
marker at all is two-fold:

 - having the log-level marker will now force a new-line if the
   previous printout had not added one (perhaps because it forgot,
   but perhaps because it expected a continuation)

 - having a log-level marker is required if you are printing out a
   message whose own text could otherwise be mistaken for a
   log-level.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 2 ++
 kernel/printk.c        | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 066bb1eddfe..1b2e1747df1 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -97,6 +97,8 @@ extern const char linux_proc_banner[];
 #define	KERN_INFO	"<6>"	/* informational			*/
 #define	KERN_DEBUG	"<7>"	/* debug-level messages			*/
 
+/* Use the default kernel loglevel */
+#define KERN_DEFAULT	"<d>"
 /*
  * Annotation for a "continued" line of log printout (only done after a
  * line that had no enclosing \n). Only to be used by core/arch code
diff --git a/kernel/printk.c b/kernel/printk.c
index a87770ce73a..b4d97b54c1e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -696,6 +696,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 			switch (c) {
 			case '0' ... '7': /* loglevel */
 				current_log_level = c - '0';
+			/* Fallthrough - make sure we're on a new line */
+			case 'd': /* KERN_DEFAULT */
 				if (!new_text_line) {
 					emit_log_char('\n');
 					new_text_line = 1;
-- 
cgit v1.2.3-70-g09d2


From b231125af7811a2f68c455d3bda95ac170ee4fa6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 16 Jun 2009 11:07:14 -0700
Subject: printk: add KERN_DEFAULT loglevel to print_modules()

Several WARN_ON() messages omit the '\n' at the end of the string, which
is a simple (and understandable) error.  The next line printed after
that warning line is usually the current module list, and that printk
does not have a log-level marker - resulting in one long mixed-up line.

Adding this loglevel marker will now avoid this unreadable mess.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index e4ab36ce767..215aaab09e9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2899,7 +2899,7 @@ void print_modules(void)
 	struct module *mod;
 	char buf[8];
 
-	printk("Modules linked in:");
+	printk(KERN_DEFAULT "Modules linked in:");
 	/* Most callers should already have preempt disabled, but make sure */
 	preempt_disable();
 	list_for_each_entry_rcu(mod, &modules, list)
-- 
cgit v1.2.3-70-g09d2