diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpuset.c | 22 | ||||
-rw-r--r-- | kernel/delayacct.c | 6 | ||||
-rw-r--r-- | kernel/exit.c | 17 | ||||
-rw-r--r-- | kernel/fork.c | 5 | ||||
-rw-r--r-- | kernel/irq/chip.c | 3 | ||||
-rw-r--r-- | kernel/ksysfs.c | 12 | ||||
-rw-r--r-- | kernel/module.c | 18 | ||||
-rw-r--r-- | kernel/params.c | 2 | ||||
-rw-r--r-- | kernel/pid.c | 4 | ||||
-rw-r--r-- | kernel/power/Kconfig | 11 | ||||
-rw-r--r-- | kernel/power/disk.c | 77 | ||||
-rw-r--r-- | kernel/power/main.c | 29 | ||||
-rw-r--r-- | kernel/power/power.h | 49 | ||||
-rw-r--r-- | kernel/power/process.c | 6 | ||||
-rw-r--r-- | kernel/power/snapshot.c | 309 | ||||
-rw-r--r-- | kernel/power/swap.c | 60 | ||||
-rw-r--r-- | kernel/power/swsusp.c | 139 | ||||
-rw-r--r-- | kernel/power/user.c | 39 | ||||
-rw-r--r-- | kernel/sched.c | 8 | ||||
-rw-r--r-- | kernel/signal.c | 6 | ||||
-rw-r--r-- | kernel/sys.c | 2 | ||||
-rw-r--r-- | kernel/taskstats.c | 4 |
22 files changed, 563 insertions, 265 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f382b0f775e..d240349cbf0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2351,6 +2351,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * z's node is in our tasks mems_allowed, yes. If it's not a * __GFP_HARDWALL request and this zone's nodes is in the nearest * mem_exclusive cpuset ancestor to this tasks cpuset, yes. + * If the task has been OOM killed and has access to memory reserves + * as specified by the TIF_MEMDIE flag, yes. * Otherwise, no. * * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() @@ -2368,7 +2370,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * calls get to this routine, we should just shut up and say 'yes'. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, - * and do not allow allocations outside the current tasks cpuset. + * and do not allow allocations outside the current tasks cpuset + * unless the task has been OOM killed as is marked TIF_MEMDIE. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing mem_exclusive ancestor cpuset. * @@ -2392,6 +2395,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * affect that: * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok + * TIF_MEMDIE - any node ok * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. * @@ -2413,6 +2417,12 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); if (node_isset(node, current->mems_allowed)) return 1; + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return 1; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ return 0; @@ -2438,7 +2448,9 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) * * If we're in interrupt, yes, we can always allocate. * If __GFP_THISNODE is set, yes, we can always allocate. If zone - * z's node is in our tasks mems_allowed, yes. Otherwise, no. + * z's node is in our tasks mems_allowed, yes. If the task has been + * OOM killed and has access to memory reserves as specified by the + * TIF_MEMDIE flag, yes. Otherwise, no. * * The __GFP_THISNODE placement logic is really handled elsewhere, * by forcibly using a zonelist starting at a specified node, and by @@ -2462,6 +2474,12 @@ int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) node = zone_to_nid(z); if (node_isset(node, current->mems_allowed)) return 1; + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return 1; return 0; } diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 766d5912b26..c0148ae992c 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -31,11 +31,7 @@ __setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { - delayacct_cache = kmem_cache_create("delayacct_cache", - sizeof(struct task_delay_info), - 0, - SLAB_PANIC, - NULL, NULL); + delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC); delayacct_tsk_init(&init_task); } diff --git a/kernel/exit.c b/kernel/exit.c index b55ed4cc910..92369240d91 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1033,6 +1033,8 @@ asmlinkage void sys_exit_group(int error_code) static int eligible_child(pid_t pid, int options, struct task_struct *p) { + int err; + if (pid > 0) { if (p->pid != pid) return 0; @@ -1066,8 +1068,9 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p) if (delay_group_leader(p)) return 2; - if (security_task_wait(p)) - return 0; + err = security_task_wait(p); + if (err) + return err; return 1; } @@ -1449,6 +1452,7 @@ static long do_wait(pid_t pid, int options, struct siginfo __user *infop, DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; int flag, retval; + int allowed, denied; add_wait_queue(¤t->signal->wait_chldexit,&wait); repeat: @@ -1457,6 +1461,7 @@ repeat: * match our criteria, even if we are not able to reap it yet. */ flag = 0; + allowed = denied = 0; current->state = TASK_INTERRUPTIBLE; read_lock(&tasklist_lock); tsk = current; @@ -1472,6 +1477,12 @@ repeat: if (!ret) continue; + if (unlikely(ret < 0)) { + denied = ret; + continue; + } + allowed = 1; + switch (p->state) { case TASK_TRACED: /* @@ -1570,6 +1581,8 @@ check_continued: goto repeat; } retval = -ECHILD; + if (unlikely(denied) && !allowed) + retval = denied; end: current->state = TASK_RUNNING; remove_wait_queue(¤t->signal->wait_chldexit,&wait); diff --git a/kernel/fork.c b/kernel/fork.c index 6af959c034d..b7d169def94 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -286,6 +286,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; } + /* a new mm has just been created */ + arch_dup_mmap(oldmm, mm); retval = 0; out: up_write(&mm->mmap_sem); @@ -1423,8 +1425,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long fl { struct sighand_struct *sighand = data; - if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) + if (flags & SLAB_CTOR_CONSTRUCTOR) spin_lock_init(&sighand->siglock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0133f4f9e9f..615ce97c6cf 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -11,6 +11,7 @@ */ #include <linux/irq.h> +#include <linux/msi.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> @@ -185,6 +186,8 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; + if (entry) + entry->irq = irq; spin_unlock_irqrestore(&desc->lock, flags); return 0; } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e0ffe4ab091..559deca5ed1 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -24,18 +24,18 @@ static struct subsys_attribute _name##_attr = \ #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) /* current uevent sequence number */ -static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) +static ssize_t uevent_seqnum_show(struct kset *kset, char *page) { return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); } KERNEL_ATTR_RO(uevent_seqnum); /* uevent helper program, used during early boo */ -static ssize_t uevent_helper_show(struct subsystem *subsys, char *page) +static ssize_t uevent_helper_show(struct kset *kset, char *page) { return sprintf(page, "%s\n", uevent_helper); } -static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count) +static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count) { if (count+1 > UEVENT_HELPER_PATH_LEN) return -ENOENT; @@ -49,13 +49,13 @@ KERNEL_ATTR_RW(uevent_helper); #endif #ifdef CONFIG_KEXEC -static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page) +static ssize_t kexec_loaded_show(struct kset *kset, char *page) { return sprintf(page, "%d\n", !!kexec_image); } KERNEL_ATTR_RO(kexec_loaded); -static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page) +static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) { return sprintf(page, "%d\n", !!kexec_crash_image); } @@ -85,7 +85,7 @@ static int __init ksysfs_init(void) { int error = subsystem_register(&kernel_subsys); if (!error) - error = sysfs_create_group(&kernel_subsys.kset.kobj, + error = sysfs_create_group(&kernel_subsys.kobj, &kernel_attr_group); return error; diff --git a/kernel/module.c b/kernel/module.c index 9da5af668a2..1eb8ca565ba 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -45,6 +45,8 @@ #include <asm/cacheflush.h> #include <linux/license.h> +extern int module_sysfs_initialized; + #if 0 #define DEBUGP printk #else @@ -346,10 +348,10 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, unsigned int i; void *ptr; - if (align > SMP_CACHE_BYTES) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n", - name, align, SMP_CACHE_BYTES); - align = SMP_CACHE_BYTES; + if (align > PAGE_SIZE) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + name, align, PAGE_SIZE); + align = PAGE_SIZE; } ptr = __per_cpu_start; @@ -430,7 +432,7 @@ static int percpu_modinit(void) pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, GFP_KERNEL); /* Static in-kernel percpu data (used). */ - pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES); + pcpu_size[0] = -(__per_cpu_end-__per_cpu_start); /* Free room. */ pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; if (pcpu_size[1] < 0) { @@ -1117,8 +1119,8 @@ int mod_sysfs_init(struct module *mod) { int err; - if (!module_subsys.kset.subsys) { - printk(KERN_ERR "%s: module_subsys not initialized\n", + if (!module_sysfs_initialized) { + printk(KERN_ERR "%s: module sysfs not initialized\n", mod->name); err = -EINVAL; goto out; @@ -2385,7 +2387,7 @@ void module_add_driver(struct module *mod, struct device_driver *drv) struct kobject *mkobj; /* Lookup built-in module entry in /sys/modules */ - mkobj = kset_find_obj(&module_subsys.kset, drv->mod_name); + mkobj = kset_find_obj(&module_subsys, drv->mod_name); if (mkobj) { mk = container_of(mkobj, struct module_kobject, kobj); /* remember our module structure */ diff --git a/kernel/params.c b/kernel/params.c index 1fc4ac746cd..312172320b4 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -691,6 +691,7 @@ static struct kset_uevent_ops module_uevent_ops = { }; decl_subsys(module, &module_ktype, &module_uevent_ops); +int module_sysfs_initialized; static struct kobj_type module_ktype = { .sysfs_ops = &module_sysfs_ops, @@ -709,6 +710,7 @@ static int __init param_sysfs_init(void) __FILE__, __LINE__, ret); return ret; } + module_sysfs_initialized = 1; param_sysfs_builtin(); diff --git a/kernel/pid.c b/kernel/pid.c index 78f2aee90f5..9c80bc23d6b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -412,7 +412,5 @@ void __init pidmap_init(void) set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); - pid_cachep = kmem_cache_create("pid", sizeof(struct pid), - __alignof__(struct pid), - SLAB_PANIC, NULL, NULL); + pid_cachep = KMEM_CACHE(pid, SLAB_PANIC); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5001c652028..495b7d4dd33 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -78,17 +78,22 @@ config PM_SYSFS_DEPRECATED are likely to be bus or driver specific. config SOFTWARE_SUSPEND - bool "Software Suspend" + bool "Software Suspend (Hibernation)" depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) ---help--- - Enable the suspend to disk (STD) functionality. + Enable the suspend to disk (STD) functionality, which is usually + called "hibernation" in user interfaces. STD checkpoints the + system and powers it off; and restores that checkpoint on reboot. You can suspend your machine with 'echo disk > /sys/power/state'. Alternatively, you can use the additional userland tools available from <http://suspend.sf.net>. In principle it does not require ACPI or APM, although for example - ACPI will be used if available. + ACPI will be used for the final steps when it is available. One + of the reasons to use software suspend is that the firmware hooks + for suspend states like suspend-to-RAM (STR) often don't work very + well with Linux. It creates an image which is saved in your active swap. Upon the next boot, pass the 'resume=/dev/swappartition' argument to the kernel to diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 02e4fb69111..06331374d86 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -130,15 +130,25 @@ int pm_suspend_disk(void) { int error; + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) + return -EBUSY; + + /* Allocate memory management structures */ + error = create_basic_memory_bitmaps(); + if (error) + goto Exit; + error = prepare_processes(); if (error) - return error; + goto Finish; if (pm_disk_mode == PM_DISK_TESTPROC) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); goto Thaw; } + /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); if (error) @@ -196,6 +206,10 @@ int pm_suspend_disk(void) resume_console(); Thaw: unprepare_processes(); + Finish: + free_basic_memory_bitmaps(); + Exit: + atomic_inc(&snapshot_device_available); return error; } @@ -239,13 +253,21 @@ static int software_resume(void) } pr_debug("PM: Checking swsusp image.\n"); - error = swsusp_check(); if (error) - goto Done; + goto Unlock; - pr_debug("PM: Preparing processes for restore.\n"); + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } + error = create_basic_memory_bitmaps(); + if (error) + goto Finish; + + pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); if (error) { swsusp_close(); @@ -280,7 +302,11 @@ static int software_resume(void) printk(KERN_ERR "PM: Restore failed, recovering.\n"); unprepare_processes(); Done: + free_basic_memory_bitmaps(); + Finish: + atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ + Unlock: mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); return 0; @@ -322,13 +348,40 @@ static const char * const pm_disk_modes[] = { * supports it (as determined from pm_ops->pm_disk_mode). */ -static ssize_t disk_show(struct subsystem * subsys, char * buf) +static ssize_t disk_show(struct kset *kset, char *buf) { - return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]); + int i; + char *start = buf; + + for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { + if (!pm_disk_modes[i]) + continue; + switch (i) { + case PM_DISK_SHUTDOWN: + case PM_DISK_REBOOT: + case PM_DISK_TEST: + case PM_DISK_TESTPROC: + break; + default: + if (pm_ops && pm_ops->enter && + (i == pm_ops->pm_disk_mode)) + break; + /* not a valid mode, continue with loop */ + continue; + } + if (i == pm_disk_mode) + buf += sprintf(buf, "[%s]", pm_disk_modes[i]); + else + buf += sprintf(buf, "%s", pm_disk_modes[i]); + if (i+1 != PM_DISK_MAX) + buf += sprintf(buf, " "); + } + buf += sprintf(buf, "\n"); + return buf-start; } -static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) +static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) { int error = 0; int i; @@ -373,13 +426,13 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) power_attr(disk); -static ssize_t resume_show(struct subsystem * subsys, char *buf) +static ssize_t resume_show(struct kset *kset, char *buf) { return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); } -static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) +static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) { unsigned int maj, min; dev_t res; @@ -405,12 +458,12 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) power_attr(resume); -static ssize_t image_size_show(struct subsystem * subsys, char *buf) +static ssize_t image_size_show(struct kset *kset, char *buf) { return sprintf(buf, "%lu\n", image_size); } -static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n) +static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n) { unsigned long size; @@ -439,7 +492,7 @@ static struct attribute_group attr_group = { static int __init pm_disk_init(void) { - return sysfs_create_group(&power_subsys.kset.kobj,&attr_group); + return sysfs_create_group(&power_subsys.kobj, &attr_group); } core_initcall(pm_disk_init); diff --git a/kernel/power/main.c b/kernel/power/main.c index 72419a3b1be..f6dda685e7e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -184,17 +184,21 @@ static void suspend_finish(suspend_state_t state) static const char * const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", -#ifdef CONFIG_SOFTWARE_SUSPEND [PM_SUSPEND_DISK] = "disk", -#endif }; static inline int valid_state(suspend_state_t state) { /* Suspend-to-disk does not really need low-level support. - * It can work with reboot if needed. */ + * It can work with shutdown/reboot if needed. If it isn't + * configured, then it cannot be supported. + */ if (state == PM_SUSPEND_DISK) +#ifdef CONFIG_SOFTWARE_SUSPEND return 1; +#else + return 0; +#endif /* all other states need lowlevel support and need to be * valid to the lowlevel implementation, no valid callback @@ -244,15 +248,6 @@ static int enter_state(suspend_state_t state) return error; } -/* - * This is main interface to the outside world. It needs to be - * called from process context. - */ -int software_suspend(void) -{ - return enter_state(PM_SUSPEND_DISK); -} - /** * pm_suspend - Externally visible function for suspending system. @@ -285,7 +280,7 @@ decl_subsys(power,NULL,NULL); * proper enumerated value, and initiates a suspend transition. */ -static ssize_t state_show(struct subsystem * subsys, char * buf) +static ssize_t state_show(struct kset *kset, char *buf) { int i; char * s = buf; @@ -298,7 +293,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) return (s - buf); } -static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) +static ssize_t state_store(struct kset *kset, const char *buf, size_t n) { suspend_state_t state = PM_SUSPEND_STANDBY; const char * const *s; @@ -325,13 +320,13 @@ power_attr(state); #ifdef CONFIG_PM_TRACE int pm_trace_enabled; -static ssize_t pm_trace_show(struct subsystem * subsys, char * buf) +static ssize_t pm_trace_show(struct kset *kset, char *buf) { return sprintf(buf, "%d\n", pm_trace_enabled); } static ssize_t -pm_trace_store(struct subsystem * subsys, const char * buf, size_t n) +pm_trace_store(struct kset *kset, const char *buf, size_t n) { int val; @@ -365,7 +360,7 @@ static int __init pm_init(void) { int error = subsystem_register(&power_subsys); if (!error) - error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group); + error = sysfs_create_group(&power_subsys.kobj,&attr_group); return error; } diff --git a/kernel/power/power.h b/kernel/power/power.h index eb461b816bf..34b43542785 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -14,8 +14,18 @@ struct swsusp_info { #ifdef CONFIG_SOFTWARE_SUSPEND -extern int pm_suspend_disk(void); +/* + * Keep some memory free so that I/O operations can succeed without paging + * [Might this be more than 4 MB?] + */ +#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT) +/* + * Keep 1 MB of memory free so that device drivers can allocate some pages in + * their .suspend() routines without breaking the suspend to disk. + */ +#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) +extern int pm_suspend_disk(void); #else static inline int pm_suspend_disk(void) { @@ -23,6 +33,8 @@ static inline int pm_suspend_disk(void) } #endif +extern int pfn_is_nosave(unsigned long); + extern struct mutex pm_mutex; #define power_attr(_name) \ @@ -35,10 +47,7 @@ static struct subsys_attribute _name##_attr = { \ .store = _name##_store, \ } -extern struct subsystem power_subsys; - -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; +extern struct kset power_subsys; /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; @@ -49,6 +58,8 @@ extern sector_t swsusp_resume_block; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); +extern int create_basic_memory_bitmaps(void); +extern void free_basic_memory_bitmaps(void); extern unsigned int count_data_pages(void); /** @@ -139,30 +150,12 @@ struct resume_swap_area { #define PMOPS_ENTER 2 #define PMOPS_FINISH 3 -/** - * The bitmap is used for tracing allocated swap pages - * - * The entire bitmap consists of a number of bitmap_page - * structures linked with the help of the .next member. - * Thus each page can be allocated individually, so we only - * need to make 0-order memory allocations to create - * the bitmap. - */ - -#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *)) -#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long)) -#define BITS_PER_CHUNK (sizeof(long) * 8) -#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK) - -struct bitmap_page { - unsigned long chunks[BITMAP_PAGE_CHUNKS]; - struct bitmap_page *next; -}; +/* If unset, the snapshot device cannot be open. */ +extern atomic_t snapshot_device_available; -extern void free_bitmap(struct bitmap_page *bitmap); -extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); -extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap); -extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); +extern sector_t alloc_swapdev_block(int swap); +extern void free_all_swap_pages(int swap); +extern int swsusp_swap_in_use(void); extern int swsusp_check(void); extern int swsusp_shrink_memory(void); diff --git a/kernel/power/process.c b/kernel/power/process.c index 6d566bf7085..0eb5c420e8e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -47,8 +47,10 @@ void refrigerator(void) recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(¤t->sighand->siglock); - while (frozen(current)) { - current->state = TASK_UNINTERRUPTIBLE; + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!frozen(current)) + break; schedule(); } pr_debug("%s left refrigerator\n", current->comm); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index fc53ad06812..128da11f01c 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -21,6 +21,7 @@ #include <linux/kernel.h> #include <linux/pm.h> #include <linux/device.h> +#include <linux/init.h> #include <linux/bootmem.h> #include <linux/syscalls.h> #include <linux/console.h> @@ -34,6 +35,10 @@ #include "power.h" +static int swsusp_page_is_free(struct page *); +static void swsusp_set_page_forbidden(struct page *); +static void swsusp_unset_page_forbidden(struct page *); + /* List of PBEs needed for restoring the pages that were allocated before * the suspend and included in the suspend image, but have also been * allocated by the "resume" kernel, so their contents cannot be written @@ -67,15 +72,15 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) - while (res && PageNosaveFree(virt_to_page(res))) { + while (res && swsusp_page_is_free(virt_to_page(res))) { /* The page is unsafe, mark it for swsusp_free() */ - SetPageNosave(virt_to_page(res)); + swsusp_set_page_forbidden(virt_to_page(res)); allocated_unsafe_pages++; res = (void *)get_zeroed_page(gfp_mask); } if (res) { - SetPageNosave(virt_to_page(res)); - SetPageNosaveFree(virt_to_page(res)); + swsusp_set_page_forbidden(virt_to_page(res)); + swsusp_set_page_free(virt_to_page(res)); } return res; } @@ -91,8 +96,8 @@ static struct page *alloc_image_page(gfp_t gfp_mask) page = alloc_page(gfp_mask); if (page) { - SetPageNosave(page); - SetPageNosaveFree(page); + swsusp_set_page_forbidden(page); + swsusp_set_page_free(page); } return page; } @@ -110,9 +115,9 @@ static inline void free_image_page(void *addr, int clear_nosave_free) page = virt_to_page(addr); - ClearPageNosave(page); + swsusp_unset_page_forbidden(page); if (clear_nosave_free) - ClearPageNosaveFree(page); + swsusp_unset_page_free(page); __free_page(page); } @@ -224,11 +229,6 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * of type unsigned long each). It also contains the pfns that * correspond to the start and end of the represented memory area and * the number of bit chunks in the block. - * - * NOTE: Memory bitmaps are used for two types of operations only: - * "set a bit" and "find the next bit set". Moreover, the searching - * is always carried out after all of the "set a bit" operations - * on given bitmap. */ #define BM_END_OF_MAP (~0UL) @@ -443,15 +443,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) } /** - * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds + * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds * to given pfn. The cur_zone_bm member of @bm and the cur_block member * of @bm->cur_zone_bm are updated. - * - * If the bit cannot be set, the function returns -EINVAL . */ -static int -memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) +static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + void **addr, unsigned int *bit_nr) { struct zone_bitmap *zone_bm; struct bm_block *bb; @@ -463,8 +461,8 @@ memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) /* We don't assume that the zones are sorted by pfns */ while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { zone_bm = zone_bm->next; - if (unlikely(!zone_bm)) - return -EINVAL; + + BUG_ON(!zone_bm); } bm->cur.zone_bm = zone_bm; } @@ -475,13 +473,40 @@ memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) while (pfn >= bb->end_pfn) { bb = bb->next; - if (unlikely(!bb)) - return -EINVAL; + + BUG_ON(!bb); } zone_bm->cur_block = bb; pfn -= bb->start_pfn; - set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK); - return 0; + *bit_nr = pfn % BM_BITS_PER_CHUNK; + *addr = bb->data + pfn / BM_BITS_PER_CHUNK; +} + +static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + memory_bm_find_bit(bm, pfn, &addr, &bit); + set_bit(bit, addr); +} + +static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + memory_bm_find_bit(bm, pfn, &addr, &bit); + clear_bit(bit, addr); +} + +static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + memory_bm_find_bit(bm, pfn, &addr, &bit); + return test_bit(bit, addr); } /* Two auxiliary functions for memory_bm_next_pfn */ @@ -564,6 +589,199 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) } /** + * This structure represents a range of page frames the contents of which + * should not be saved during the suspend. + */ + +struct nosave_region { + struct list_head list; + unsigned long start_pfn; + unsigned long end_pfn; +}; + +static LIST_HEAD(nosave_regions); + +/** + * register_nosave_region - register a range of page frames the contents + * of which should not be saved during the suspend (to be used in the early + * initialization code) + */ + +void __init +register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) +{ + struct nosave_region *region; + + if (start_pfn >= end_pfn) + return; + + if (!list_empty(&nosave_regions)) { + /* Try to extend the previous region (they should be sorted) */ + region = list_entry(nosave_regions.prev, + struct nosave_region, list); + if (region->end_pfn == start_pfn) { + region->end_pfn = end_pfn; + goto Report; + } + } + /* This allocation cannot fail */ + region = alloc_bootmem_low(sizeof(struct nosave_region)); + region->start_pfn = start_pfn; + region->end_pfn = end_pfn; + list_add_tail(®ion->list, &nosave_regions); + Report: + printk("swsusp: Registered nosave memory region: %016lx - %016lx\n", + start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); +} + +/* + * Set bits in this map correspond to the page frames the contents of which + * should not be saved during the su |