aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cpuset.c22
-rw-r--r--kernel/delayacct.c6
-rw-r--r--kernel/exit.c17
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/irq/chip.c3
-rw-r--r--kernel/ksysfs.c12
-rw-r--r--kernel/module.c18
-rw-r--r--kernel/params.c2
-rw-r--r--kernel/pid.c4
-rw-r--r--kernel/power/Kconfig11
-rw-r--r--kernel/power/disk.c77
-rw-r--r--kernel/power/main.c29
-rw-r--r--kernel/power/power.h49
-rw-r--r--kernel/power/process.c6
-rw-r--r--kernel/power/snapshot.c309
-rw-r--r--kernel/power/swap.c60
-rw-r--r--kernel/power/swsusp.c139
-rw-r--r--kernel/power/user.c39
-rw-r--r--kernel/sched.c8
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/taskstats.c4
22 files changed, 563 insertions, 265 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f382b0f775e..d240349cbf0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2351,6 +2351,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
* z's node is in our tasks mems_allowed, yes. If it's not a
* __GFP_HARDWALL request and this zone's nodes is in the nearest
* mem_exclusive cpuset ancestor to this tasks cpuset, yes.
+ * If the task has been OOM killed and has access to memory reserves
+ * as specified by the TIF_MEMDIE flag, yes.
* Otherwise, no.
*
* If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
@@ -2368,7 +2370,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
* calls get to this routine, we should just shut up and say 'yes'.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
- * and do not allow allocations outside the current tasks cpuset.
+ * and do not allow allocations outside the current tasks cpuset
+ * unless the task has been OOM killed as is marked TIF_MEMDIE.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing mem_exclusive ancestor cpuset.
*
@@ -2392,6 +2395,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
* affect that:
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
+ * TIF_MEMDIE - any node ok
* GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*
@@ -2413,6 +2417,12 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ return 1;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return 0;
@@ -2438,7 +2448,9 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
*
* If we're in interrupt, yes, we can always allocate.
* If __GFP_THISNODE is set, yes, we can always allocate. If zone
- * z's node is in our tasks mems_allowed, yes. Otherwise, no.
+ * z's node is in our tasks mems_allowed, yes. If the task has been
+ * OOM killed and has access to memory reserves as specified by the
+ * TIF_MEMDIE flag, yes. Otherwise, no.
*
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
@@ -2462,6 +2474,12 @@ int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
node = zone_to_nid(z);
if (node_isset(node, current->mems_allowed))
return 1;
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ return 1;
return 0;
}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 766d5912b26..c0148ae992c 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -31,11 +31,7 @@ __setup("nodelayacct", delayacct_setup_disable);
void delayacct_init(void)
{
- delayacct_cache = kmem_cache_create("delayacct_cache",
- sizeof(struct task_delay_info),
- 0,
- SLAB_PANIC,
- NULL, NULL);
+ delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
delayacct_tsk_init(&init_task);
}
diff --git a/kernel/exit.c b/kernel/exit.c
index b55ed4cc910..92369240d91 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1033,6 +1033,8 @@ asmlinkage void sys_exit_group(int error_code)
static int eligible_child(pid_t pid, int options, struct task_struct *p)
{
+ int err;
+
if (pid > 0) {
if (p->pid != pid)
return 0;
@@ -1066,8 +1068,9 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p)
if (delay_group_leader(p))
return 2;
- if (security_task_wait(p))
- return 0;
+ err = security_task_wait(p);
+ if (err)
+ return err;
return 1;
}
@@ -1449,6 +1452,7 @@ static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
DECLARE_WAITQUEUE(wait, current);
struct task_struct *tsk;
int flag, retval;
+ int allowed, denied;
add_wait_queue(&current->signal->wait_chldexit,&wait);
repeat:
@@ -1457,6 +1461,7 @@ repeat:
* match our criteria, even if we are not able to reap it yet.
*/
flag = 0;
+ allowed = denied = 0;
current->state = TASK_INTERRUPTIBLE;
read_lock(&tasklist_lock);
tsk = current;
@@ -1472,6 +1477,12 @@ repeat:
if (!ret)
continue;
+ if (unlikely(ret < 0)) {
+ denied = ret;
+ continue;
+ }
+ allowed = 1;
+
switch (p->state) {
case TASK_TRACED:
/*
@@ -1570,6 +1581,8 @@ check_continued:
goto repeat;
}
retval = -ECHILD;
+ if (unlikely(denied) && !allowed)
+ retval = denied;
end:
current->state = TASK_RUNNING;
remove_wait_queue(&current->signal->wait_chldexit,&wait);
diff --git a/kernel/fork.c b/kernel/fork.c
index 6af959c034d..b7d169def94 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -286,6 +286,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
if (retval)
goto out;
}
+ /* a new mm has just been created */
+ arch_dup_mmap(oldmm, mm);
retval = 0;
out:
up_write(&mm->mmap_sem);
@@ -1423,8 +1425,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long fl
{
struct sighand_struct *sighand = data;
- if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
- SLAB_CTOR_CONSTRUCTOR)
+ if (flags & SLAB_CTOR_CONSTRUCTOR)
spin_lock_init(&sighand->siglock);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 0133f4f9e9f..615ce97c6cf 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -11,6 +11,7 @@
*/
#include <linux/irq.h>
+#include <linux/msi.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
@@ -185,6 +186,8 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
desc = irq_desc + irq;
spin_lock_irqsave(&desc->lock, flags);
desc->msi_desc = entry;
+ if (entry)
+ entry->irq = irq;
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e0ffe4ab091..559deca5ed1 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -24,18 +24,18 @@ static struct subsys_attribute _name##_attr = \
#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
/* current uevent sequence number */
-static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
+static ssize_t uevent_seqnum_show(struct kset *kset, char *page)
{
return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum);
}
KERNEL_ATTR_RO(uevent_seqnum);
/* uevent helper program, used during early boo */
-static ssize_t uevent_helper_show(struct subsystem *subsys, char *page)
+static ssize_t uevent_helper_show(struct kset *kset, char *page)
{
return sprintf(page, "%s\n", uevent_helper);
}
-static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count)
+static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count)
{
if (count+1 > UEVENT_HELPER_PATH_LEN)
return -ENOENT;
@@ -49,13 +49,13 @@ KERNEL_ATTR_RW(uevent_helper);
#endif
#ifdef CONFIG_KEXEC
-static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page)
+static ssize_t kexec_loaded_show(struct kset *kset, char *page)
{
return sprintf(page, "%d\n", !!kexec_image);
}
KERNEL_ATTR_RO(kexec_loaded);
-static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page)
+static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page)
{
return sprintf(page, "%d\n", !!kexec_crash_image);
}
@@ -85,7 +85,7 @@ static int __init ksysfs_init(void)
{
int error = subsystem_register(&kernel_subsys);
if (!error)
- error = sysfs_create_group(&kernel_subsys.kset.kobj,
+ error = sysfs_create_group(&kernel_subsys.kobj,
&kernel_attr_group);
return error;
diff --git a/kernel/module.c b/kernel/module.c
index 9da5af668a2..1eb8ca565ba 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -45,6 +45,8 @@
#include <asm/cacheflush.h>
#include <linux/license.h>
+extern int module_sysfs_initialized;
+
#if 0
#define DEBUGP printk
#else
@@ -346,10 +348,10 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
unsigned int i;
void *ptr;
- if (align > SMP_CACHE_BYTES) {
- printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n",
- name, align, SMP_CACHE_BYTES);
- align = SMP_CACHE_BYTES;
+ if (align > PAGE_SIZE) {
+ printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+ name, align, PAGE_SIZE);
+ align = PAGE_SIZE;
}
ptr = __per_cpu_start;
@@ -430,7 +432,7 @@ static int percpu_modinit(void)
pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
GFP_KERNEL);
/* Static in-kernel percpu data (used). */
- pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES);
+ pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
/* Free room. */
pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
if (pcpu_size[1] < 0) {
@@ -1117,8 +1119,8 @@ int mod_sysfs_init(struct module *mod)
{
int err;
- if (!module_subsys.kset.subsys) {
- printk(KERN_ERR "%s: module_subsys not initialized\n",
+ if (!module_sysfs_initialized) {
+ printk(KERN_ERR "%s: module sysfs not initialized\n",
mod->name);
err = -EINVAL;
goto out;
@@ -2385,7 +2387,7 @@ void module_add_driver(struct module *mod, struct device_driver *drv)
struct kobject *mkobj;
/* Lookup built-in module entry in /sys/modules */
- mkobj = kset_find_obj(&module_subsys.kset, drv->mod_name);
+ mkobj = kset_find_obj(&module_subsys, drv->mod_name);
if (mkobj) {
mk = container_of(mkobj, struct module_kobject, kobj);
/* remember our module structure */
diff --git a/kernel/params.c b/kernel/params.c
index 1fc4ac746cd..312172320b4 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -691,6 +691,7 @@ static struct kset_uevent_ops module_uevent_ops = {
};
decl_subsys(module, &module_ktype, &module_uevent_ops);
+int module_sysfs_initialized;
static struct kobj_type module_ktype = {
.sysfs_ops = &module_sysfs_ops,
@@ -709,6 +710,7 @@ static int __init param_sysfs_init(void)
__FILE__, __LINE__, ret);
return ret;
}
+ module_sysfs_initialized = 1;
param_sysfs_builtin();
diff --git a/kernel/pid.c b/kernel/pid.c
index 78f2aee90f5..9c80bc23d6b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -412,7 +412,5 @@ void __init pidmap_init(void)
set_bit(0, init_pid_ns.pidmap[0].page);
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
- pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
- __alignof__(struct pid),
- SLAB_PANIC, NULL, NULL);
+ pid_cachep = KMEM_CACHE(pid, SLAB_PANIC);
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5001c652028..495b7d4dd33 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -78,17 +78,22 @@ config PM_SYSFS_DEPRECATED
are likely to be bus or driver specific.
config SOFTWARE_SUSPEND
- bool "Software Suspend"
+ bool "Software Suspend (Hibernation)"
depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
---help---
- Enable the suspend to disk (STD) functionality.
+ Enable the suspend to disk (STD) functionality, which is usually
+ called "hibernation" in user interfaces. STD checkpoints the
+ system and powers it off; and restores that checkpoint on reboot.
You can suspend your machine with 'echo disk > /sys/power/state'.
Alternatively, you can use the additional userland tools available
from <http://suspend.sf.net>.
In principle it does not require ACPI or APM, although for example
- ACPI will be used if available.
+ ACPI will be used for the final steps when it is available. One
+ of the reasons to use software suspend is that the firmware hooks
+ for suspend states like suspend-to-RAM (STR) often don't work very
+ well with Linux.
It creates an image which is saved in your active swap. Upon the next
boot, pass the 'resume=/dev/swappartition' argument to the kernel to
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 02e4fb69111..06331374d86 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -130,15 +130,25 @@ int pm_suspend_disk(void)
{
int error;
+ /* The snapshot device should not be opened while we're running */
+ if (!atomic_add_unless(&snapshot_device_available, -1, 0))
+ return -EBUSY;
+
+ /* Allocate memory management structures */
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Exit;
+
error = prepare_processes();
if (error)
- return error;
+ goto Finish;
if (pm_disk_mode == PM_DISK_TESTPROC) {
printk("swsusp debug: Waiting for 5 seconds.\n");
mdelay(5000);
goto Thaw;
}
+
/* Free memory before shutting down devices. */
error = swsusp_shrink_memory();
if (error)
@@ -196,6 +206,10 @@ int pm_suspend_disk(void)
resume_console();
Thaw:
unprepare_processes();
+ Finish:
+ free_basic_memory_bitmaps();
+ Exit:
+ atomic_inc(&snapshot_device_available);
return error;
}
@@ -239,13 +253,21 @@ static int software_resume(void)
}
pr_debug("PM: Checking swsusp image.\n");
-
error = swsusp_check();
if (error)
- goto Done;
+ goto Unlock;
- pr_debug("PM: Preparing processes for restore.\n");
+ /* The snapshot device should not be opened while we're running */
+ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ error = -EBUSY;
+ goto Unlock;
+ }
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Finish;
+
+ pr_debug("PM: Preparing processes for restore.\n");
error = prepare_processes();
if (error) {
swsusp_close();
@@ -280,7 +302,11 @@ static int software_resume(void)
printk(KERN_ERR "PM: Restore failed, recovering.\n");
unprepare_processes();
Done:
+ free_basic_memory_bitmaps();
+ Finish:
+ atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
+ Unlock:
mutex_unlock(&pm_mutex);
pr_debug("PM: Resume from disk failed.\n");
return 0;
@@ -322,13 +348,40 @@ static const char * const pm_disk_modes[] = {
* supports it (as determined from pm_ops->pm_disk_mode).
*/
-static ssize_t disk_show(struct subsystem * subsys, char * buf)
+static ssize_t disk_show(struct kset *kset, char *buf)
{
- return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]);
+ int i;
+ char *start = buf;
+
+ for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) {
+ if (!pm_disk_modes[i])
+ continue;
+ switch (i) {
+ case PM_DISK_SHUTDOWN:
+ case PM_DISK_REBOOT:
+ case PM_DISK_TEST:
+ case PM_DISK_TESTPROC:
+ break;
+ default:
+ if (pm_ops && pm_ops->enter &&
+ (i == pm_ops->pm_disk_mode))
+ break;
+ /* not a valid mode, continue with loop */
+ continue;
+ }
+ if (i == pm_disk_mode)
+ buf += sprintf(buf, "[%s]", pm_disk_modes[i]);
+ else
+ buf += sprintf(buf, "%s", pm_disk_modes[i]);
+ if (i+1 != PM_DISK_MAX)
+ buf += sprintf(buf, " ");
+ }
+ buf += sprintf(buf, "\n");
+ return buf-start;
}
-static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
+static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
{
int error = 0;
int i;
@@ -373,13 +426,13 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
power_attr(disk);
-static ssize_t resume_show(struct subsystem * subsys, char *buf)
+static ssize_t resume_show(struct kset *kset, char *buf)
{
return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
MINOR(swsusp_resume_device));
}
-static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
+static ssize_t resume_store(struct kset *kset, const char *buf, size_t n)
{
unsigned int maj, min;
dev_t res;
@@ -405,12 +458,12 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
power_attr(resume);
-static ssize_t image_size_show(struct subsystem * subsys, char *buf)
+static ssize_t image_size_show(struct kset *kset, char *buf)
{
return sprintf(buf, "%lu\n", image_size);
}
-static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n)
+static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n)
{
unsigned long size;
@@ -439,7 +492,7 @@ static struct attribute_group attr_group = {
static int __init pm_disk_init(void)
{
- return sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
+ return sysfs_create_group(&power_subsys.kobj, &attr_group);
}
core_initcall(pm_disk_init);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 72419a3b1be..f6dda685e7e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -184,17 +184,21 @@ static void suspend_finish(suspend_state_t state)
static const char * const pm_states[PM_SUSPEND_MAX] = {
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
-#ifdef CONFIG_SOFTWARE_SUSPEND
[PM_SUSPEND_DISK] = "disk",
-#endif
};
static inline int valid_state(suspend_state_t state)
{
/* Suspend-to-disk does not really need low-level support.
- * It can work with reboot if needed. */
+ * It can work with shutdown/reboot if needed. If it isn't
+ * configured, then it cannot be supported.
+ */
if (state == PM_SUSPEND_DISK)
+#ifdef CONFIG_SOFTWARE_SUSPEND
return 1;
+#else
+ return 0;
+#endif
/* all other states need lowlevel support and need to be
* valid to the lowlevel implementation, no valid callback
@@ -244,15 +248,6 @@ static int enter_state(suspend_state_t state)
return error;
}
-/*
- * This is main interface to the outside world. It needs to be
- * called from process context.
- */
-int software_suspend(void)
-{
- return enter_state(PM_SUSPEND_DISK);
-}
-
/**
* pm_suspend - Externally visible function for suspending system.
@@ -285,7 +280,7 @@ decl_subsys(power,NULL,NULL);
* proper enumerated value, and initiates a suspend transition.
*/
-static ssize_t state_show(struct subsystem * subsys, char * buf)
+static ssize_t state_show(struct kset *kset, char *buf)
{
int i;
char * s = buf;
@@ -298,7 +293,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
return (s - buf);
}
-static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
+static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
{
suspend_state_t state = PM_SUSPEND_STANDBY;
const char * const *s;
@@ -325,13 +320,13 @@ power_attr(state);
#ifdef CONFIG_PM_TRACE
int pm_trace_enabled;
-static ssize_t pm_trace_show(struct subsystem * subsys, char * buf)
+static ssize_t pm_trace_show(struct kset *kset, char *buf)
{
return sprintf(buf, "%d\n", pm_trace_enabled);
}
static ssize_t
-pm_trace_store(struct subsystem * subsys, const char * buf, size_t n)
+pm_trace_store(struct kset *kset, const char *buf, size_t n)
{
int val;
@@ -365,7 +360,7 @@ static int __init pm_init(void)
{
int error = subsystem_register(&power_subsys);
if (!error)
- error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
+ error = sysfs_create_group(&power_subsys.kobj,&attr_group);
return error;
}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index eb461b816bf..34b43542785 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -14,8 +14,18 @@ struct swsusp_info {
#ifdef CONFIG_SOFTWARE_SUSPEND
-extern int pm_suspend_disk(void);
+/*
+ * Keep some memory free so that I/O operations can succeed without paging
+ * [Might this be more than 4 MB?]
+ */
+#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT)
+/*
+ * Keep 1 MB of memory free so that device drivers can allocate some pages in
+ * their .suspend() routines without breaking the suspend to disk.
+ */
+#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
+extern int pm_suspend_disk(void);
#else
static inline int pm_suspend_disk(void)
{
@@ -23,6 +33,8 @@ static inline int pm_suspend_disk(void)
}
#endif
+extern int pfn_is_nosave(unsigned long);
+
extern struct mutex pm_mutex;
#define power_attr(_name) \
@@ -35,10 +47,7 @@ static struct subsys_attribute _name##_attr = { \
.store = _name##_store, \
}
-extern struct subsystem power_subsys;
-
-/* References to section boundaries */
-extern const void __nosave_begin, __nosave_end;
+extern struct kset power_subsys;
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
@@ -49,6 +58,8 @@ extern sector_t swsusp_resume_block;
extern asmlinkage int swsusp_arch_suspend(void);
extern asmlinkage int swsusp_arch_resume(void);
+extern int create_basic_memory_bitmaps(void);
+extern void free_basic_memory_bitmaps(void);
extern unsigned int count_data_pages(void);
/**
@@ -139,30 +150,12 @@ struct resume_swap_area {
#define PMOPS_ENTER 2
#define PMOPS_FINISH 3
-/**
- * The bitmap is used for tracing allocated swap pages
- *
- * The entire bitmap consists of a number of bitmap_page
- * structures linked with the help of the .next member.
- * Thus each page can be allocated individually, so we only
- * need to make 0-order memory allocations to create
- * the bitmap.
- */
-
-#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *))
-#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long))
-#define BITS_PER_CHUNK (sizeof(long) * 8)
-#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
-
-struct bitmap_page {
- unsigned long chunks[BITMAP_PAGE_CHUNKS];
- struct bitmap_page *next;
-};
+/* If unset, the snapshot device cannot be open. */
+extern atomic_t snapshot_device_available;
-extern void free_bitmap(struct bitmap_page *bitmap);
-extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
-extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap);
-extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
+extern sector_t alloc_swapdev_block(int swap);
+extern void free_all_swap_pages(int swap);
+extern int swsusp_swap_in_use(void);
extern int swsusp_check(void);
extern int swsusp_shrink_memory(void);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 6d566bf7085..0eb5c420e8e 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -47,8 +47,10 @@ void refrigerator(void)
recalc_sigpending(); /* We sent fake signal, clean it up */
spin_unlock_irq(&current->sighand->siglock);
- while (frozen(current)) {
- current->state = TASK_UNINTERRUPTIBLE;
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!frozen(current))
+ break;
schedule();
}
pr_debug("%s left refrigerator\n", current->comm);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index fc53ad06812..128da11f01c 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -21,6 +21,7 @@
#include <linux/kernel.h>
#include <linux/pm.h>
#include <linux/device.h>
+#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/syscalls.h>
#include <linux/console.h>
@@ -34,6 +35,10 @@
#include "power.h"
+static int swsusp_page_is_free(struct page *);
+static void swsusp_set_page_forbidden(struct page *);
+static void swsusp_unset_page_forbidden(struct page *);
+
/* List of PBEs needed for restoring the pages that were allocated before
* the suspend and included in the suspend image, but have also been
* allocated by the "resume" kernel, so their contents cannot be written
@@ -67,15 +72,15 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
res = (void *)get_zeroed_page(gfp_mask);
if (safe_needed)
- while (res && PageNosaveFree(virt_to_page(res))) {
+ while (res && swsusp_page_is_free(virt_to_page(res))) {
/* The page is unsafe, mark it for swsusp_free() */
- SetPageNosave(virt_to_page(res));
+ swsusp_set_page_forbidden(virt_to_page(res));
allocated_unsafe_pages++;
res = (void *)get_zeroed_page(gfp_mask);
}
if (res) {
- SetPageNosave(virt_to_page(res));
- SetPageNosaveFree(virt_to_page(res));
+ swsusp_set_page_forbidden(virt_to_page(res));
+ swsusp_set_page_free(virt_to_page(res));
}
return res;
}
@@ -91,8 +96,8 @@ static struct page *alloc_image_page(gfp_t gfp_mask)
page = alloc_page(gfp_mask);
if (page) {
- SetPageNosave(page);
- SetPageNosaveFree(page);
+ swsusp_set_page_forbidden(page);
+ swsusp_set_page_free(page);
}
return page;
}
@@ -110,9 +115,9 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
page = virt_to_page(addr);
- ClearPageNosave(page);
+ swsusp_unset_page_forbidden(page);
if (clear_nosave_free)
- ClearPageNosaveFree(page);
+ swsusp_unset_page_free(page);
__free_page(page);
}
@@ -224,11 +229,6 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
* of type unsigned long each). It also contains the pfns that
* correspond to the start and end of the represented memory area and
* the number of bit chunks in the block.
- *
- * NOTE: Memory bitmaps are used for two types of operations only:
- * "set a bit" and "find the next bit set". Moreover, the searching
- * is always carried out after all of the "set a bit" operations
- * on given bitmap.
*/
#define BM_END_OF_MAP (~0UL)
@@ -443,15 +443,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
}
/**
- * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds
+ * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds
* to given pfn. The cur_zone_bm member of @bm and the cur_block member
* of @bm->cur_zone_bm are updated.
- *
- * If the bit cannot be set, the function returns -EINVAL .
*/
-static int
-memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
+static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+ void **addr, unsigned int *bit_nr)
{
struct zone_bitmap *zone_bm;
struct bm_block *bb;
@@ -463,8 +461,8 @@ memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
/* We don't assume that the zones are sorted by pfns */
while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
zone_bm = zone_bm->next;
- if (unlikely(!zone_bm))
- return -EINVAL;
+
+ BUG_ON(!zone_bm);
}
bm->cur.zone_bm = zone_bm;
}
@@ -475,13 +473,40 @@ memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
while (pfn >= bb->end_pfn) {
bb = bb->next;
- if (unlikely(!bb))
- return -EINVAL;
+
+ BUG_ON(!bb);
}
zone_bm->cur_block = bb;
pfn -= bb->start_pfn;
- set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK);
- return 0;
+ *bit_nr = pfn % BM_BITS_PER_CHUNK;
+ *addr = bb->data + pfn / BM_BITS_PER_CHUNK;
+}
+
+static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+
+ memory_bm_find_bit(bm, pfn, &addr, &bit);
+ set_bit(bit, addr);
+}
+
+static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+
+ memory_bm_find_bit(bm, pfn, &addr, &bit);
+ clear_bit(bit, addr);
+}
+
+static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+
+ memory_bm_find_bit(bm, pfn, &addr, &bit);
+ return test_bit(bit, addr);
}
/* Two auxiliary functions for memory_bm_next_pfn */
@@ -564,6 +589,199 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
}
/**
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during the suspend.
+ */
+
+struct nosave_region {
+ struct list_head list;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+};
+
+static LIST_HEAD(nosave_regions);
+
+/**
+ * register_nosave_region - register a range of page frames the contents
+ * of which should not be saved during the suspend (to be used in the early
+ * initialization code)
+ */
+
+void __init
+register_nosave_region(unsigned long start_pfn, unsigned long end_pfn)
+{
+ struct nosave_region *region;
+
+ if (start_pfn >= end_pfn)
+ return;
+
+ if (!list_empty(&nosave_regions)) {
+ /* Try to extend the previous region (they should be sorted) */
+ region = list_entry(nosave_regions.prev,
+ struct nosave_region, list);
+ if (region->end_pfn == start_pfn) {
+ region->end_pfn = end_pfn;
+ goto Report;
+ }
+ }
+ /* This allocation cannot fail */
+ region = alloc_bootmem_low(sizeof(struct nosave_region));
+ region->start_pfn = start_pfn;
+ region->end_pfn = end_pfn;
+ list_add_tail(&region->list, &nosave_regions);
+ Report:
+ printk("swsusp: Registered nosave memory region: %016lx - %016lx\n",
+ start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
+}
+
+/*
+ * Set bits in this map correspond to the page frames the contents of which
+ * should not be saved during the su