Diffstat (limited to 'kernel/power')
| -rw-r--r-- | kernel/power/Kconfig | 360 |
| -rw-r--r-- | kernel/power/Makefile | 17 |
| -rw-r--r-- | kernel/power/autosleep.c | 128 |
| -rw-r--r-- | kernel/power/block_io.c | 103 |
| -rw-r--r-- | kernel/power/console.c | 149 |
| -rw-r--r-- | kernel/power/disk.c | 406 |
| -rw-r--r-- | kernel/power/hibernate.c | 1168 |
| -rw-r--r-- | kernel/power/main.c | 717 |
| -rw-r--r-- | kernel/power/pm.c | 209 |
| -rw-r--r-- | kernel/power/power.h | 293 |
| -rw-r--r-- | kernel/power/poweroff.c | 17 |
| -rw-r--r-- | kernel/power/process.c | 302 |
| -rw-r--r-- | kernel/power/qos.c | 601 |
| -rw-r--r-- | kernel/power/snapshot.c | 2162 |
| -rw-r--r-- | kernel/power/suspend.c | 443 |
| -rw-r--r-- | kernel/power/suspend_test.c | 185 |
| -rw-r--r-- | kernel/power/swap.c | 1542 |
| -rw-r--r-- | kernel/power/swsusp.c | 287 |
| -rw-r--r-- | kernel/power/user.c | 410 |
| -rw-r--r-- | kernel/power/wakelock.c | 268 |
20 files changed, 7312 insertions, 2455 deletions
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 825068ca347..9a83d780fac 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,112 +1,77 @@ -config PM - bool "Power Management support" - depends on !IA64_HP_SIM - ---help--- - "Power Management" means that parts of your computer are shut - off or put into a power conserving "sleep" mode if they are not - being used. There are two competing standards for doing this: APM - and ACPI. If you want to use either one, say Y here and then also - to the requisite support below. - - Power Management is most important for battery powered laptop - computers; if you have a laptop, check out the Linux Laptop home - page on the WWW at <http://www.linux-on-laptops.com/> or - Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/> - and the Battery Powered Linux mini-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. - - Note that, even if you say N here, Linux on the x86 architecture - will issue the hlt instruction if nothing is to be done, thereby - sending the processor to sleep and saving power. - -config PM_LEGACY - bool "Legacy Power Management API" - depends on PM +config SUSPEND + bool "Suspend to RAM and standby" + depends on ARCH_SUSPEND_POSSIBLE default y ---help--- - Support for pm_register() and friends. + Allow the system to enter sleep states in which main memory is + powered and thus its contents are preserved, such as the + suspend-to-RAM state (e.g. the ACPI S3 state). - If unsure, say Y. +config SUSPEND_FREEZER + bool "Enable freezer for suspend to RAM/standby" \ + if ARCH_WANTS_FREEZER_CONTROL || BROKEN + depends on SUSPEND + default y + help + This allows you to turn off the freezer for suspend. If this is + done, no tasks are frozen for suspend to RAM/standby. -config PM_DEBUG - bool "Power Management Debug Support" - depends on PM - ---help--- - This option enables verbose debugging support in the Power Management - code. This is helpful when debugging and reporting various PM bugs, - like suspend support. + Turning OFF this setting is NOT recommended! If in doubt, say Y. -config DISABLE_CONSOLE_SUSPEND - bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" - depends on PM && PM_DEBUG - default n - ---help--- - This option turns off the console suspend mechanism that prevents - debug messages from reaching the console during the suspend/resume - operations. This may be helpful when debugging device drivers' - suspend/resume routines, but may itself lead to problems, for example - if netconsole is used. +config HIBERNATE_CALLBACKS + bool -config PM_TRACE - bool "Suspend/resume event tracing" - depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL - default n +config HIBERNATION + bool "Hibernation (aka 'suspend to disk')" + depends on SWAP && ARCH_HIBERNATION_POSSIBLE + select HIBERNATE_CALLBACKS + select LZO_COMPRESS + select LZO_DECOMPRESS + select CRC32 ---help--- - This enables some cheesy code to save the last PM event point in the - RTC across reboots, so that you can debug a machine that just hangs - during suspend (or more commonly, during resume). - - To use this debugging feature you should attempt to suspend the machine, - then reboot it, then run + Enable the suspend to disk (STD) functionality, which is usually + called "hibernation" in user interfaces. STD checkpoints the + system and powers it off; and restores that checkpoint on reboot. 
- dmesg -s 1000000 | grep 'hash matches' + You can suspend your machine with 'echo disk > /sys/power/state' + after placing resume=/dev/swappartition on the kernel command line + in your bootloader's configuration file. - CAUTION: this option will cause your machine's real-time clock to be - set to an invalid time after a resume. + Alternatively, you can use the additional userland tools available + from <http://suspend.sf.net>. -config PM_SYSFS_DEPRECATED - bool "Driver model /sys/devices/.../power/state files (DEPRECATED)" - depends on PM && SYSFS - default n - help - The driver model started out with a sysfs file intended to provide - a userspace hook for device power management. This feature has never - worked very well, except for limited testing purposes, and so it will - be removed. It's not clear that a generic mechanism could really - handle the wide variability of device power states; any replacements - are likely to be bus or driver specific. - -config SOFTWARE_SUSPEND - bool "Software Suspend" - depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) - ---help--- - Enable the possibility of suspending the machine. - It doesn't need ACPI or APM. - You may suspend your machine by 'swsusp' or 'shutdown -z <time>' - (patch for sysvinit needed). + In principle it does not require ACPI or APM, although for example + ACPI will be used for the final steps when it is available. One + of the reasons to use software suspend is that the firmware hooks + for suspend states like suspend-to-RAM (STR) often don't work very + well with Linux. - It creates an image which is saved in your active swap. Upon next + It creates an image which is saved in your active swap. Upon the next boot, pass the 'resume=/dev/swappartition' argument to the kernel to have it detect the saved image, restore memory state from it, and continue to run as before. If you do not want the previous state to - be reloaded, then use the 'noresume' kernel argument. However, note - that your partitions will be fsck'd and you must re-mkswap your swap - partitions. It does not work with swap files. + be reloaded, then use the 'noresume' kernel command line argument. + Note, however, that fsck will be run on your filesystems and you will + need to run mkswap against the swap partition used for the suspend. + + It also works with swap files to a limited extent (for details see + <file:Documentation/power/swsusp-and-swap-files.txt>). - Right now you may boot without resuming and then later resume but - in meantime you cannot use those swap partitions/files which were - involved in suspending. Also in this case there is a risk that buffers - on disk won't match with saved ones. + Right now you may boot without resuming and resume later but in the + meantime you cannot use the swap partition(s)/file(s) involved in + suspending. Also in this case you must not use the filesystems + that were mounted before the suspend. In particular, you MUST NOT + MOUNT any journaled filesystems mounted before the suspend or they + will get corrupted in a nasty way. For more information take a look at <file:Documentation/power/swsusp.txt>. - (For now, swsusp is incompatible with PAE aka HIGHMEM_64G on i386. - we need identity mapping for resume to work, and that is trivial - to get with 4MB pages, but less than trivial on PAE). 
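An aside on the new HIBERNATION help text above: a minimal user-space sketch of the 'echo disk > /sys/power/state' transition it describes. This is an illustration, not part of the patch; it assumes root privileges and a kernel built with CONFIG_HIBERNATION.

	/* Trigger hibernation by writing "disk" to /sys/power/state. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/power/state", "w");

		if (!f) {
			perror("/sys/power/state");
			return 1;
		}
		/* The write completes only after the system has resumed. */
		if (fputs("disk", f) == EOF || fclose(f) != 0) {
			perror("disk");
			return 1;
		}
		return 0;
	}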
+config ARCH_SAVE_PAGE_KEYS + bool config PM_STD_PARTITION string "Default resume partition" - depends on SOFTWARE_SUSPEND + depends on HIBERNATION default "" ---help--- The default resume partition is the partition that the suspend- @@ -126,7 +91,220 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. -config SUSPEND_SMP - bool - depends on HOTPLUG_CPU && X86 && PM +config PM_SLEEP + def_bool y + depends on SUSPEND || HIBERNATE_CALLBACKS + +config PM_SLEEP_SMP + def_bool y + depends on SMP + depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE + depends on PM_SLEEP + select HOTPLUG_CPU + +config PM_AUTOSLEEP + bool "Opportunistic sleep" + depends on PM_SLEEP + default n + ---help--- + Allow the kernel to trigger a system transition into a global sleep + state automatically whenever there are no active wakeup sources. + +config PM_WAKELOCKS + bool "User space wakeup sources interface" + depends on PM_SLEEP + default n + ---help--- + Allow user space to create, activate and deactivate wakeup source + objects with the help of a sysfs-based interface. + +config PM_WAKELOCKS_LIMIT + int "Maximum number of user space wakeup sources (0 = no limit)" + range 0 100000 + default 100 + depends on PM_WAKELOCKS + +config PM_WAKELOCKS_GC + bool "Garbage collector for user space wakeup sources" + depends on PM_WAKELOCKS default y + +config PM_RUNTIME + bool "Run-time PM core functionality" + depends on !IA64_HP_SIM + ---help--- + Enable functionality allowing I/O devices to be put into energy-saving + (low power) states at run time (or autosuspended) after a specified + period of inactivity and woken up in response to a hardware-generated + wake-up event or a driver's request. + + Hardware support is generally required for this functionality to work + and the bus type drivers of the buses the devices are on are + responsible for the actual handling of the autosuspend requests and + wake-up events. + +config PM + def_bool y + depends on PM_SLEEP || PM_RUNTIME + +config PM_DEBUG + bool "Power Management Debug Support" + depends on PM + ---help--- + This option enables various debugging support in the Power Management + code. This is helpful when debugging and reporting PM bugs, like + suspend support. + +config PM_ADVANCED_DEBUG + bool "Extra PM attributes in sysfs for low-level debugging/testing" + depends on PM_DEBUG + ---help--- + Add extra sysfs attributes allowing one to access some Power Management + fields of device objects from user space. If you are not a kernel + developer interested in debugging/testing Power Management, say "no". + +config PM_TEST_SUSPEND + bool "Test suspend/resume and wakealarm during bootup" + depends on SUSPEND && PM_DEBUG && RTC_CLASS=y + ---help--- + This option will let you suspend your machine during bootup, and + make it wake up a few seconds later using an RTC wakeup alarm. + Enable this with a kernel parameter like "test_suspend=mem". + + You probably want to have your system's RTC driver statically + linked, ensuring that it's available when this test runs. + +config PM_SLEEP_DEBUG + def_bool y + depends on PM_DEBUG && PM_SLEEP + +config DPM_WATCHDOG + bool "Device suspend/resume watchdog" + depends on PM_DEBUG && PSTORE + ---help--- + Sets up a watchdog timer to capture drivers that are + locked up attempting to suspend/resume a device. + A detected lockup causes system panic with message + captured in pstore device for inspection in subsequent + boot session. 
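A note on the PM_WAKELOCKS entry above: the sysfs-based interface it mentions consists of the /sys/power/wake_lock and /sys/power/wake_unlock files. A minimal sketch of how user space might hold a wakeup source across a critical section (an illustration, not part of the patch; the source name is arbitrary):

	#include <stdio.h>

	/* Write a string to a sysfs file; returns 0 on success. */
	static int write_sysfs(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fputs(val, f);
		return fclose(f);
	}

	int main(void)
	{
		/* Create and activate a wakeup source named "example";
		 * an optional timeout in nanoseconds may follow the name. */
		if (write_sysfs("/sys/power/wake_lock", "example"))
			return 1;
		/* ... work that must not race with opportunistic sleep ... */
		return write_sysfs("/sys/power/wake_unlock", "example") ? 1 : 0;
	}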
+ +config DPM_WATCHDOG_TIMEOUT + int "Watchdog timeout in seconds" + range 1 120 + default 12 + depends on DPM_WATCHDOG + +config PM_TRACE + bool + help + This enables code to save the last PM event point across + reboot. The architecture needs to support this; x86, for + example, does so by saving things in the RTC (see below). + + The architecture-specific code must provide the extern + functions from <linux/resume-trace.h> as well as the + <asm/resume-trace.h> header with a TRACE_RESUME() macro. + + The way the information is presented is architecture-dependent; + x86 will print the information during a + late_initcall. + +config PM_TRACE_RTC + bool "Suspend/resume event tracing" + depends on PM_SLEEP_DEBUG + depends on X86 + select PM_TRACE + ---help--- + This enables some cheesy code to save the last PM event point in the + RTC across reboots, so that you can debug a machine that just hangs + during suspend (or more commonly, during resume). + + To use this debugging feature you should attempt to suspend the + machine, reboot it and then run + + dmesg -s 1000000 | grep 'hash matches' + + CAUTION: this option will cause your machine's real-time clock to be + set to an invalid time after a resume. + +config APM_EMULATION + tristate "Advanced Power Management Emulation" + depends on PM && SYS_SUPPORTS_APM_EMULATION + help + APM is a BIOS specification for saving power using several different + techniques. This is mostly useful for battery powered laptops with + APM compliant BIOSes. If you say Y here, the system time will be + reset after a RESUME operation, the /proc/apm device will provide + battery status information, and user-space programs will receive + notification of APM "events" (e.g. battery status change). + + In order to use APM, you will need supporting software. For location + and more information, read <file:Documentation/power/apm-acpi.txt> + and the Battery Powered Linux mini-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. + + This driver does not spin down disk drives (see the hdparm(8) + manpage ("man 8 hdparm") for that), and it doesn't turn off + VESA-compliant "green" monitors. + + Generally, if you don't have a battery in your machine, there isn't + much point in using this driver and you should say N. If you get + random kernel OOPSes or reboots that don't seem to be related to + anything, try disabling/enabling this option (or disabling/enabling + APM in your BIOS). + +config ARCH_HAS_OPP + bool + +config PM_OPP + bool + ---help--- + SOCs have a standard set of tuples consisting of frequency and + voltage pairs that the device will support per voltage domain. This + is called Operating Performance Point or OPP. The actual definitions + of OPP vary over silicon within the same family of devices. + + The OPP layer organizes the data internally using device pointers + representing individual voltage domains and provides SOC + implementations a ready-to-use framework to manage OPPs. + For more information, read <file:Documentation/power/opp.txt> + +config PM_CLK + def_bool y + depends on PM && HAVE_CLK + +config PM_GENERIC_DOMAINS + bool + depends on PM + +config WQ_POWER_EFFICIENT_DEFAULT + bool "Enable workqueue power-efficient mode by default" + depends on PM + default n + help + Per-cpu workqueues are generally preferred because they show + better performance thanks to cache locality; unfortunately, + per-cpu workqueues tend to be more power hungry than unbound + workqueues.
+ + Enabling the workqueue.power_efficient kernel parameter makes the + per-cpu workqueues which were observed to contribute + significantly to power consumption unbound, leading to measurably + lower power usage at the cost of a small performance overhead. + + This config option determines whether workqueue.power_efficient + is enabled by default. + + If in doubt, say N. + +config PM_GENERIC_DOMAINS_SLEEP + def_bool y + depends on PM_SLEEP && PM_GENERIC_DOMAINS + +config PM_GENERIC_DOMAINS_RUNTIME + def_bool y + depends on PM_RUNTIME && PM_GENERIC_DOMAINS + +config CPU_PM + bool + depends on SUSPEND || CPU_IDLE diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 38725f526af..29472bff11e 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,10 +1,15 @@ -ifeq ($(CONFIG_PM_DEBUG),y) -EXTRA_CFLAGS += -DDEBUG -endif +ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG -obj-y := main.o process.o console.o -obj-$(CONFIG_PM_LEGACY) += pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o +obj-y += qos.o +obj-$(CONFIG_PM) += main.o +obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o +obj-$(CONFIG_FREEZER) += process.o +obj-$(CONFIG_SUSPEND) += suspend.o +obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ + block_io.o +obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o +obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c new file mode 100644 index 00000000000..9012ecf7b81 --- /dev/null +++ b/kernel/power/autosleep.c @@ -0,0 +1,128 @@ +/* + * kernel/power/autosleep.c + * + * Opportunistic sleep support. + * + * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> + */ + +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/pm_wakeup.h> + +#include "power.h" + +static suspend_state_t autosleep_state; +static struct workqueue_struct *autosleep_wq; +/* + * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source + * is active, otherwise a deadlock with try_to_suspend() is possible. + * Alternatively mutex_lock_interruptible() can be used. This will then fail + * if an autosleep cycle tries to freeze processes. + */ +static DEFINE_MUTEX(autosleep_lock); +static struct wakeup_source *autosleep_ws; + +static void try_to_suspend(struct work_struct *work) +{ + unsigned int initial_count, final_count; + + if (!pm_get_wakeup_count(&initial_count, true)) + goto out; + + mutex_lock(&autosleep_lock); + + if (!pm_save_wakeup_count(initial_count) || + system_state != SYSTEM_RUNNING) { + mutex_unlock(&autosleep_lock); + goto out; + } + + if (autosleep_state == PM_SUSPEND_ON) { + mutex_unlock(&autosleep_lock); + return; + } + if (autosleep_state >= PM_SUSPEND_MAX) + hibernate(); + else + pm_suspend(autosleep_state); + + mutex_unlock(&autosleep_lock); + + if (!pm_get_wakeup_count(&final_count, false)) + goto out; + + /* + * If the wakeup occurred for an unknown reason, wait to prevent the + * system from trying to suspend and waking up in a tight loop.
+ */ + if (final_count == initial_count) + schedule_timeout_uninterruptible(HZ / 2); + + out: + queue_up_suspend_work(); +} + +static DECLARE_WORK(suspend_work, try_to_suspend); + +void queue_up_suspend_work(void) +{ + if (autosleep_state > PM_SUSPEND_ON) + queue_work(autosleep_wq, &suspend_work); +} + +suspend_state_t pm_autosleep_state(void) +{ + return autosleep_state; +} + +int pm_autosleep_lock(void) +{ + return mutex_lock_interruptible(&autosleep_lock); +} + +void pm_autosleep_unlock(void) +{ + mutex_unlock(&autosleep_lock); +} + +int pm_autosleep_set_state(suspend_state_t state) +{ + +#ifndef CONFIG_HIBERNATION + if (state >= PM_SUSPEND_MAX) + return -EINVAL; +#endif + + __pm_stay_awake(autosleep_ws); + + mutex_lock(&autosleep_lock); + + autosleep_state = state; + + __pm_relax(autosleep_ws); + + if (state > PM_SUSPEND_ON) { + pm_wakep_autosleep_enabled(true); + queue_up_suspend_work(); + } else { + pm_wakep_autosleep_enabled(false); + } + + mutex_unlock(&autosleep_lock); + return 0; +} + +int __init pm_autosleep_init(void) +{ + autosleep_ws = wakeup_source_register("autosleep"); + if (!autosleep_ws) + return -ENOMEM; + + autosleep_wq = alloc_ordered_workqueue("autosleep", 0); + if (autosleep_wq) + return 0; + + wakeup_source_unregister(autosleep_ws); + return -ENOMEM; +} diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c new file mode 100644 index 00000000000..9a58bc25881 --- /dev/null +++ b/kernel/power/block_io.c @@ -0,0 +1,103 @@ +/* + * This file provides functions for block I/O operations on swap/file. + * + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * + * This file is released under the GPLv2. + */ + +#include <linux/bio.h> +#include <linux/kernel.h> +#include <linux/pagemap.h> +#include <linux/swap.h> + +#include "power.h" + +/** + * submit - submit BIO request. + * @rw: READ or WRITE. + * @bdev: block device to submit the I/O to. + * @sector: physical offset of the page on @bdev. + * @page: page we're reading or writing. + * @bio_chain: list of pending bios (for async reading) + * + * Straight from the textbook - allocate and initialize the bio. + * If we're reading, make sure the page is marked as dirty. + * Then submit it and, if @bio_chain == NULL, wait.
+ */ +static int submit(int rw, struct block_device *bdev, sector_t sector, + struct page *page, struct bio **bio_chain) +{ + const int bio_rw = rw | REQ_SYNC; + struct bio *bio; + + bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_end_io = end_swap_bio_read; + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", + (unsigned long long)sector); + bio_put(bio); + return -EFAULT; + } + + lock_page(page); + bio_get(bio); + + if (bio_chain == NULL) { + submit_bio(bio_rw, bio); + wait_on_page_locked(page); + if (rw == READ) + bio_set_pages_dirty(bio); + bio_put(bio); + } else { + if (rw == READ) + get_page(page); /* These pages are freed later */ + bio->bi_private = *bio_chain; + *bio_chain = bio; + submit_bio(bio_rw, bio); + } + return 0; +} + +int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), + virt_to_page(addr), bio_chain); +} + +int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), + virt_to_page(addr), bio_chain); +} + +int hib_wait_on_bio_chain(struct bio **bio_chain) +{ + struct bio *bio; + struct bio *next_bio; + int ret = 0; + + if (bio_chain == NULL) + return 0; + + bio = *bio_chain; + if (bio == NULL) + return 0; + while (bio) { + struct page *page; + + next_bio = bio->bi_private; + page = bio->bi_io_vec[0].bv_page; + wait_on_page_locked(page); + if (!PageUptodate(page) || PageError(page)) + ret = -EIO; + put_page(page); + bio_put(bio); + bio = next_bio; + } + *bio_chain = NULL; + return ret; +} diff --git a/kernel/power/console.c b/kernel/power/console.c index 623786d4415..aba9c545a0e 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -1,50 +1,151 @@ /* - * drivers/power/process.c - Functions for saving/restoring console. + * Functions for saving/restoring console. * * Originally from swsusp. */ +#include <linux/console.h> #include <linux/vt_kern.h> #include <linux/kbd_kern.h> -#include <linux/console.h> +#include <linux/vt.h> +#include <linux/module.h> +#include <linux/slab.h> #include "power.h" -#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; -int pm_prepare_console(void) +static DEFINE_MUTEX(vt_switch_mutex); + +struct pm_vt_switch { + struct list_head head; + struct device *dev; + bool required; +}; + +static LIST_HEAD(pm_vt_switch_list); + + +/** + * pm_vt_switch_required - indicate VT switch at suspend requirements + * @dev: device + * @required: if true, caller needs VT switch at suspend/resume time + * + * The different console drivers may or may not require VT switches across + * suspend/resume, depending on how they handle restoring video state and + * what may be running. + * + * Drivers can indicate support for switchless suspend/resume, which can + * save time and flicker, by using this routine and passing 'false' as + * the argument. If any loaded driver needs VT switching, or the + * no_console_suspend argument has been passed on the command line, VT + * switches will occur. 
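+ * + * For example, a (hypothetical) display driver that fully restores its own + * video state on resume could call pm_vt_switch_required(dev, false) from + * its probe routine and pm_vt_switch_unregister(dev) on removal, avoiding + * the switch and the associated flicker (illustrative usage, not part of + * the original patch).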
+ */ +void pm_vt_switch_required(struct device *dev, bool required) { - acquire_console_sem(); + struct pm_vt_switch *entry, *tmp; - orig_fgconsole = fg_console; + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + /* already registered, update requirement */ + tmp->required = required; + goto out; + } + } - if (vc_allocate(SUSPEND_CONSOLE)) { - /* we can't have a free VC for now. Too bad, - * we don't want to mess the screen for now. */ - release_console_sem(); - return 1; + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out; + + entry->required = required; + entry->dev = dev; + + list_add(&entry->head, &pm_vt_switch_list); +out: + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_required); + +/** + * pm_vt_switch_unregister - stop tracking a device's VT switching needs + * @dev: device + * + * Remove @dev from the vt switch list. + */ +void pm_vt_switch_unregister(struct device *dev) +{ + struct pm_vt_switch *tmp; + + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + list_del(&tmp->head); + kfree(tmp); + break; + } } + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_unregister); + +/* + * There are three cases when a VT switch on suspend/resume are required: + * 1) no driver has indicated a requirement one way or another, so preserve + * the old behavior + * 2) console suspend is disabled, we want to see debug messages across + * suspend/resume + * 3) any registered driver indicates it needs a VT switch + * + * If none of these conditions is present, meaning we have at least one driver + * that doesn't need the switch, and none that do, we can avoid it to make + * resume look a little prettier (and suspend too, but that's usually hidden, + * e.g. when closing the lid on a laptop). + */ +static bool pm_vt_switch(void) +{ + struct pm_vt_switch *entry; + bool ret = true; - set_console(SUSPEND_CONSOLE); - release_console_sem(); + mutex_lock(&vt_switch_mutex); + if (list_empty(&pm_vt_switch_list)) + goto out; - if (vt_waitactive(SUSPEND_CONSOLE)) { - pr_debug("Suspend: Can't switch VCs."); - return 1; + if (!console_suspend_enabled) + goto out; + + list_for_each_entry(entry, &pm_vt_switch_list, head) { + if (entry->required) + goto out; } - orig_kmsg = kmsg_redirect; - kmsg_redirect = SUSPEND_CONSOLE; + + ret = false; +out: + mutex_unlock(&vt_switch_mutex); + return ret; +} + +int pm_prepare_console(void) +{ + if (!pm_vt_switch()) + return 0; + + orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); + if (orig_fgconsole < 0) + return 1; + + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); return 0; } void pm_restore_console(void) { - acquire_console_sem(); - set_console(orig_fgconsole); - release_console_sem(); - kmsg_redirect = orig_kmsg; - return; + if (!pm_vt_switch()) + return; + + if (orig_fgconsole >= 0) { + vt_move_to_console(orig_fgconsole, 0); + vt_kmsg_redirect(orig_kmsg); + } } -#endif diff --git a/kernel/power/disk.c b/kernel/power/disk.c deleted file mode 100644 index d7223494279..00000000000 --- a/kernel/power/disk.c +++ /dev/null @@ -1,406 +0,0 @@ -/* - * kernel/power/disk.c - Suspend-to-disk support. - * - * Copyright (c) 2003 Patrick Mochel - * Copyright (c) 2003 Open Source Development Lab - * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> - * - * This file is released under the GPLv2. 
- * - */ - -#include <linux/suspend.h> -#include <linux/syscalls.h> -#include <linux/reboot.h> -#include <linux/string.h> -#include <linux/device.h> -#include <linux/delay.h> -#include <linux/fs.h> -#include <linux/mount.h> -#include <linux/pm.h> -#include <linux/cpu.h> - -#include "power.h" - - -static int noresume = 0; -char resume_file[256] = CONFIG_PM_STD_PARTITION; -dev_t swsusp_resume_device; - -/** - * power_down - Shut machine down for hibernate. - * @mode: Suspend-to-disk mode - * - * Use the platform driver, if configured so, and return gracefully if it - * fails. - * Otherwise, try to power off and reboot. If they fail, halt the machine, - * there ain't no turning back. - */ - -static void power_down(suspend_disk_method_t mode) -{ - int error = 0; - - switch(mode) { - case PM_DISK_PLATFORM: - kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - error = pm_ops->enter(PM_SUSPEND_DISK); - break; - case PM_DISK_SHUTDOWN: - kernel_power_off(); - break; - case PM_DISK_REBOOT: - kernel_restart(NULL); - break; - } - kernel_halt(); - /* Valid image is on the disk, if we continue we risk serious data corruption - after resume. */ - printk(KERN_CRIT "Please power me down manually\n"); - while(1); -} - -static inline void platform_finish(void) -{ - if (pm_disk_mode == PM_DISK_PLATFORM) { - if (pm_ops && pm_ops->finish) - pm_ops->finish(PM_SUSPEND_DISK); - } -} - -static int prepare_processes(void) -{ - int error; - - pm_prepare_console(); - - error = disable_nonboot_cpus(); - if (error) - goto enable_cpus; - - if (freeze_processes()) { - error = -EBUSY; - goto thaw; - } - - /* Free memory before shutting down devices. */ - if (!(error = swsusp_shrink_memory())) - return 0; -thaw: - thaw_processes(); -enable_cpus: - enable_nonboot_cpus(); - pm_restore_console(); - return error; -} - -static void unprepare_processes(void) -{ - platform_finish(); - thaw_processes(); - enable_nonboot_cpus(); - pm_restore_console(); -} - -/** - * pm_suspend_disk - The granpappy of hibernation power management. - * - * If we're going through the firmware, then get it over with quickly. - * - * If not, then call swsusp to do its thing, then figure out how - * to power down the system. - */ - -int pm_suspend_disk(void) -{ - int error; - - error = prepare_processes(); - if (error) - return error; - - error = device_suspend(PMSG_FREEZE); - if (error) { - printk("Some devices failed to suspend\n"); - unprepare_processes(); - return error; - } - - pr_debug("PM: snapshotting memory.\n"); - in_suspend = 1; - if ((error = swsusp_suspend())) - goto Done; - - if (in_suspend) { - device_resume(); - pr_debug("PM: writing image.\n"); - error = swsusp_write(); - if (!error) - power_down(pm_disk_mode); - else { - swsusp_free(); - unprepare_processes(); - return error; - } - } else - pr_debug("PM: Image restored successfully.\n"); - - swsusp_free(); - Done: - device_resume(); - unprepare_processes(); - return error; -} - - -/** - * software_resume - Resume from a saved image. - * - * Called as a late_initcall (so all devices are discovered and - * initialized), we call swsusp to see if we have a saved image or not. - * If so, we quiesce devices, the restore the saved image. We will - * return above (in pm_suspend_disk() ) if everything goes well. - * Otherwise, we fail gracefully and return to the normally - * scheduled program. 
- * - */ - -static int software_resume(void) -{ - int error; - - down(&pm_sem); - if (!swsusp_resume_device) { - if (!strlen(resume_file)) { - up(&pm_sem); - return -ENOENT; - } - swsusp_resume_device = name_to_dev_t(resume_file); - pr_debug("swsusp: Resume From Partition %s\n", resume_file); - } else { - pr_debug("swsusp: Resume From Partition %d:%d\n", - MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); - } - - if (noresume) { - /** - * FIXME: If noresume is specified, we need to find the partition - * and reset it back to normal swap space. - */ - up(&pm_sem); - return 0; - } - - pr_debug("PM: Checking swsusp image.\n"); - - if ((error = swsusp_check())) - goto Done; - - pr_debug("PM: Preparing processes for restore.\n"); - - if ((error = prepare_processes())) { - swsusp_close(); - goto Done; - } - - pr_debug("PM: Reading swsusp image.\n"); - - if ((error = swsusp_read())) { - swsusp_free(); - goto Thaw; - } - - pr_debug("PM: Preparing devices for restore.\n"); - - if ((error = device_suspend(PMSG_PRETHAW))) { - printk("Some devices failed to suspend\n"); - swsusp_free(); - goto Thaw; - } - - mb(); - - pr_debug("PM: Restoring saved image.\n"); - swsusp_resume(); - pr_debug("PM: Restore failed, recovering.\n"); - device_resume(); - Thaw: - unprepare_processes(); - Done: - /* For success case, the suspend path will release the lock */ - up(&pm_sem); - pr_debug("PM: Resume from disk failed.\n"); - return 0; -} - -late_initcall(software_resume); - - -static const char * const pm_disk_modes[] = { - [PM_DISK_FIRMWARE] = "firmware", - [PM_DISK_PLATFORM] = "platform", - [PM_DISK_SHUTDOWN] = "shutdown", - [PM_DISK_REBOOT] = "reboot", -}; - -/** - * disk - Control suspend-to-disk mode - * - * Suspend-to-disk can be handled in several ways. The greatest - * distinction is who writes memory to disk - the firmware or the OS. - * If the firmware does it, we assume that it also handles suspending - * the system. - * If the OS does it, then we have three options for putting the system - * to sleep - using the platform driver (e.g. ACPI or other PM registers), - * powering off the system or rebooting the system (for testing). - * - * The system will support either 'firmware' or 'platform', and that is - * known a priori (and encoded in pm_ops). But, the user may choose - * 'shutdown' or 'reboot' as alternatives. - * - * show() will display what the mode is currently set to. - * store() will accept one of - * - * 'firmware' - * 'platform' - * 'shutdown' - * 'reboot' - * - * It will only change to 'firmware' or 'platform' if the system - * supports it (as determined from pm_ops->pm_disk_mode). - */ - -static ssize_t disk_show(struct subsystem * subsys, char * buf) -{ - return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]); -} - - -static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) -{ - int error = 0; - int i; - int len; - char *p; - suspend_disk_method_t mode = 0; - - p = memchr(buf, '\n', n); - len = p ? p - buf : n; - - down(&pm_sem); - for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { - if (!strncmp(buf, pm_disk_modes[i], len)) { - mode = i; - break; - } - } - if (mode) { - if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT) - pm_disk_mode = mode; - else { - if (pm_ops && pm_ops->enter && - (mode == pm_ops->pm_disk_mode)) - pm_disk_mode = mode; - else - error = -EINVAL; - } - } else - error = -EINVAL; - - pr_debug("PM: suspend-to-disk mode set to '%s'\n", - pm_disk_modes[mode]); - up(&pm_sem); - return error ?
error : n; -} - -power_attr(disk); - -static ssize_t resume_show(struct subsystem * subsys, char *buf) -{ - return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), - MINOR(swsusp_resume_device)); -} - -static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) -{ - unsigned int maj, min; - dev_t res; - int ret = -EINVAL; - - if (sscanf(buf, "%u:%u", &maj, &min) != 2) - goto out; - - res = MKDEV(maj,min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto out; - - down(&pm_sem); - swsusp_resume_device = res; - up(&pm_sem); - printk("Attempting manual resume\n"); - noresume = 0; - software_resume(); - ret = n; -out: - return ret; -} - -power_attr(resume); - -static ssize_t image_size_show(struct subsystem * subsys, char *buf) -{ - return sprintf(buf, "%lu\n", image_size); -} - -static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n) -{ - unsigned long size; - - if (sscanf(buf, "%lu", &size) == 1) { - image_size = size; - return n; - } - - return -EINVAL; -} - -power_attr(image_size); - -static struct attribute * g[] = { - &disk_attr.attr, - &resume_attr.attr, - &image_size_attr.attr, - NULL, -}; - - -static struct attribute_group attr_group = { - .attrs = g, -}; - - -static int __init pm_disk_init(void) -{ - return sysfs_create_group(&power_subsys.kset.kobj,&attr_group); -} - -core_initcall(pm_disk_init); - - -static int __init resume_setup(char *str) -{ - if (noresume) - return 1; - - strncpy( resume_file, str, 255 ); - return 1; -} - -static int __init noresume_setup(char *str) -{ - noresume = 1; - return 1; -} - -__setup("noresume", noresume_setup); -__setup("resume=", resume_setup); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c new file mode 100644 index 00000000000..fcc2611d3f1 --- /dev/null +++ b/kernel/power/hibernate.c @@ -0,0 +1,1168 @@ +/* + * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> + * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. + * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> + * + * This file is released under the GPLv2. 
+ */ + +#include <linux/export.h> +#include <linux/suspend.h> +#include <linux/syscalls.h> +#include <linux/reboot.h> +#include <linux/string.h> +#include <linux/device.h> +#include <linux/async.h> +#include <linux/delay.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/pm.h> +#include <linux/console.h> +#include <linux/cpu.h> +#include <linux/freezer.h> +#include <linux/gfp.h> +#include <linux/syscore_ops.h> +#include <linux/ctype.h> +#include <linux/genhd.h> +#include <trace/events/power.h> + +#include "power.h" + + +static int nocompress; +static int noresume; +static int nohibernate; +static int resume_wait; +static unsigned int resume_delay; +static char resume_file[256] = CONFIG_PM_STD_PARTITION; +dev_t swsusp_resume_device; +sector_t swsusp_resume_block; +__visible int in_suspend __nosavedata; + +enum { + HIBERNATION_INVALID, + HIBERNATION_PLATFORM, + HIBERNATION_SHUTDOWN, + HIBERNATION_REBOOT, +#ifdef CONFIG_SUSPEND + HIBERNATION_SUSPEND, +#endif + /* keep last */ + __HIBERNATION_AFTER_LAST +}; +#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) +#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) + +static int hibernation_mode = HIBERNATION_SHUTDOWN; + +bool freezer_test_done; + +static const struct platform_hibernation_ops *hibernation_ops; + +bool hibernation_available(void) +{ + return (nohibernate == 0); +} + +/** + * hibernation_set_ops - Set the global hibernate operations. + * @ops: Hibernation operations to use in subsequent hibernation transitions. + */ +void hibernation_set_ops(const struct platform_hibernation_ops *ops) +{ + if (ops && !(ops->begin && ops->end && ops->pre_snapshot + && ops->prepare && ops->finish && ops->enter && ops->pre_restore + && ops->restore_cleanup && ops->leave)) { + WARN_ON(1); + return; + } + lock_system_sleep(); + hibernation_ops = ops; + if (ops) + hibernation_mode = HIBERNATION_PLATFORM; + else if (hibernation_mode == HIBERNATION_PLATFORM) + hibernation_mode = HIBERNATION_SHUTDOWN; + + unlock_system_sleep(); +} +EXPORT_SYMBOL_GPL(hibernation_set_ops); + +static bool entering_platform_hibernation; + +bool system_entering_hibernation(void) +{ + return entering_platform_hibernation; +} +EXPORT_SYMBOL(system_entering_hibernation); + +#ifdef CONFIG_PM_DEBUG +static void hibernation_debug_sleep(void) +{ + printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n"); + mdelay(5000); +} + +static int hibernation_test(int level) +{ + if (pm_test_level == level) { + hibernation_debug_sleep(); + return 1; + } + return 0; +} +#else /* !CONFIG_PM_DEBUG */ +static int hibernation_test(int level) { return 0; } +#endif /* !CONFIG_PM_DEBUG */ + +/** + * platform_begin - Call platform to start hibernation. + * @platform_mode: Whether or not to use the platform driver. + */ +static int platform_begin(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->begin() : 0; +} + +/** + * platform_end - Call platform to finish transition to the working state. + * @platform_mode: Whether or not to use the platform driver. + */ +static void platform_end(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->end(); +} + +/** + * platform_pre_snapshot - Call platform to prepare the machine for hibernation. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to prepare the system for creating a hibernate image, + * if so configured, and return an error code if that fails. 
+ */ + +static int platform_pre_snapshot(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_snapshot() : 0; +} + +/** + * platform_leave - Call platform to prepare a transition to the working state. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to prepare the machine for switching to the + * normal mode of operation. + * + * This routine is called on one CPU with interrupts disabled. + */ +static void platform_leave(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->leave(); +} + +/** + * platform_finish - Call platform to switch the system to the working state. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to switch the machine to the normal mode of + * operation. + * + * This routine must be called after platform_prepare(). + */ +static void platform_finish(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->finish(); +} + +/** + * platform_pre_restore - Prepare for hibernate image restoration. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to prepare the system for resume from a hibernation + * image. + * + * If the restore fails after this function has been called, + * platform_restore_cleanup() must be called. + */ +static int platform_pre_restore(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_restore() : 0; +} + +/** + * platform_restore_cleanup - Switch to the working state after failing restore. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to switch the system to the normal mode of operation + * after a failing restore. + * + * If platform_pre_restore() has been called before the failing restore, this + * function must be called too, regardless of the result of + * platform_pre_restore(). + */ +static void platform_restore_cleanup(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->restore_cleanup(); +} + +/** + * platform_recover - Recover from a failure to suspend devices. + * @platform_mode: Whether or not to use the platform driver. + */ +static void platform_recover(int platform_mode) +{ + if (platform_mode && hibernation_ops && hibernation_ops->recover) + hibernation_ops->recover(); +} + +/** + * swsusp_show_speed - Print time elapsed between two events during hibernation. + * @start: Starting event. + * @stop: Final event. + * @nr_pages: Number of memory pages processed between @start and @stop. + * @msg: Additional diagnostic message to print. + */ +void swsusp_show_speed(struct timeval *start, struct timeval *stop, + unsigned nr_pages, char *msg) +{ + u64 elapsed_centisecs64; + unsigned int centisecs; + unsigned int k; + unsigned int kps; + + elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); + /* + * If "(s64)elapsed_centisecs64 < 0", a bogus (very long) elapsed time + * will be printed, which is obvious enough to show that something + * went wrong. + */ + do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); + centisecs = elapsed_centisecs64; + if (centisecs == 0) + centisecs = 1; /* avoid div-by-zero */ + k = nr_pages * (PAGE_SIZE / 1024); + kps = (k * 100) / centisecs; + printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", + msg, k, + centisecs / 100, centisecs % 100, + kps / 1000, (kps % 1000) / 10); +} + +/** + * create_image - Create a hibernation image.
+ * @platform_mode: Whether or not to use the platform driver. + * + * Execute device drivers' "late" and "noirq" freeze callbacks, create a + * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. + * + * Control reappears in this routine after the subsequent restore. + */ +static int create_image(int platform_mode) +{ + int error; + + error = dpm_suspend_end(PMSG_FREEZE); + if (error) { + printk(KERN_ERR "PM: Some devices failed to power down, " + "aborting hibernation\n"); + return error; + } + + error = platform_pre_snapshot(platform_mode); + if (error || hibernation_test(TEST_PLATFORM)) + goto Platform_finish; + + error = disable_nonboot_cpus(); + if (error || hibernation_test(TEST_CPUS)) + goto Enable_cpus; + + local_irq_disable(); + + error = syscore_suspend(); + if (error) { + printk(KERN_ERR "PM: Some system devices failed to power down, " + "aborting hibernation\n"); + goto Enable_irqs; + } + + if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) + goto Power_up; + + in_suspend = 1; + save_processor_state(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); + error = swsusp_arch_suspend(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); + if (error) + printk(KERN_ERR "PM: Error %d creating hibernation image\n", + error); + /* Restore control flow magically appears here */ + restore_processor_state(); + if (!in_suspend) + events_check_enabled = false; + + platform_leave(platform_mode); + + Power_up: + syscore_resume(); + + Enable_irqs: + local_irq_enable(); + + Enable_cpus: + enable_nonboot_cpus(); + + Platform_finish: + platform_finish(platform_mode); + + dpm_resume_start(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + + return error; +} + +/** + * hibernation_snapshot - Quiesce devices and create a hibernation image. + * @platform_mode: If set, use platform driver to prepare for the transition. + * + * This routine must be called with pm_mutex held. + */ +int hibernation_snapshot(int platform_mode) +{ + pm_message_t msg; + int error; + + error = platform_begin(platform_mode); + if (error) + goto Close; + + /* Preallocate image memory before shutting down devices. */ + error = hibernate_preallocate_memory(); + if (error) + goto Close; + + error = freeze_kernel_threads(); + if (error) + goto Cleanup; + + if (hibernation_test(TEST_FREEZER)) { + + /* + * Indicate to the caller that we are returning due to a + * successful freezer test. + */ + freezer_test_done = true; + goto Thaw; + } + + error = dpm_prepare(PMSG_FREEZE); + if (error) { + dpm_complete(PMSG_RECOVER); + goto Thaw; + } + + suspend_console(); + ftrace_stop(); + pm_restrict_gfp_mask(); + + error = dpm_suspend(PMSG_FREEZE); + + if (error || hibernation_test(TEST_DEVICES)) + platform_recover(platform_mode); + else + error = create_image(platform_mode); + + /* + * In the case that we call create_image() above, the control + * returns here (1) after the image has been created or the + * image creation has failed and (2) after a successful restore. + */ + + /* We may need to release the preallocated image pages here. */ + if (error || !in_suspend) + swsusp_free(); + + msg = in_suspend ? (error ? 
PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE; + dpm_resume(msg); + + if (error || !in_suspend) + pm_restore_gfp_mask(); + + ftrace_start(); + resume_console(); + dpm_complete(msg); + + Close: + platform_end(platform_mode); + return error; + + Thaw: + thaw_kernel_threads(); + Cleanup: + swsusp_free(); + goto Close; +} + +/** + * resume_target_kernel - Restore system state from a hibernation image. + * @platform_mode: Whether or not to use the platform driver. + * + * Execute device drivers' "noirq" and "late" freeze callbacks, restore the + * contents of highmem that have not been restored yet from the image and run + * the low-level code that will restore the remaining contents of memory and + * switch to the just restored target kernel. + */ +static int resume_target_kernel(bool platform_mode) +{ + int error; + + error = dpm_suspend_end(PMSG_QUIESCE); + if (error) { + printk(KERN_ERR "PM: Some devices failed to power down, " + "aborting resume\n"); + return error; + } + + error = platform_pre_restore(platform_mode); + if (error) + goto Cleanup; + + error = disable_nonboot_cpus(); + if (error) + goto Enable_cpus; + + local_irq_disable(); + + error = syscore_suspend(); + if (error) + goto Enable_irqs; + + save_processor_state(); + error = restore_highmem(); + if (!error) { + error = swsusp_arch_resume(); + /* + * The code below is only ever reached in case of a failure. + * Otherwise, execution continues at the place where + * swsusp_arch_suspend() was called. + */ + BUG_ON(!error); + /* + * This call to restore_highmem() reverts the changes made by + * the previous one. + */ + restore_highmem(); + } + /* + * The only reason why swsusp_arch_resume() can fail is memory being + * very tight, so we have to free it as soon as we can to avoid + * subsequent failures. + */ + swsusp_free(); + restore_processor_state(); + touch_softlockup_watchdog(); + + syscore_resume(); + + Enable_irqs: + local_irq_enable(); + + Enable_cpus: + enable_nonboot_cpus(); + + Cleanup: + platform_restore_cleanup(platform_mode); + + dpm_resume_start(PMSG_RECOVER); + + return error; +} + +/** + * hibernation_restore - Quiesce devices and restore from a hibernation image. + * @platform_mode: If set, use platform driver to prepare for the transition. + * + * This routine must be called with pm_mutex held. If it is successful, control + * reappears in the restored target kernel in hibernation_snapshot(). + */ +int hibernation_restore(int platform_mode) +{ + int error; + + pm_prepare_console(); + suspend_console(); + ftrace_stop(); + pm_restrict_gfp_mask(); + error = dpm_suspend_start(PMSG_QUIESCE); + if (!error) { + error = resume_target_kernel(platform_mode); + dpm_resume_end(PMSG_RECOVER); + } + pm_restore_gfp_mask(); + ftrace_start(); + resume_console(); + pm_restore_console(); + return error; +} + +/** + * hibernation_platform_enter - Power off the system using the platform driver. 
+ */ +int hibernation_platform_enter(void) +{ + int error; + + if (!hibernation_ops) + return -ENOSYS; + + /* + * We have cancelled the power transition by running + * hibernation_ops->finish() before saving the image, so we should let + * the firmware know that we're going to enter the sleep state after all. + */ + error = hibernation_ops->begin(); + if (error) + goto Close; + + entering_platform_hibernation = true; + suspend_console(); + ftrace_stop(); + error = dpm_suspend_start(PMSG_HIBERNATE); + if (error) { + if (hibernation_ops->recover) + hibernation_ops->recover(); + goto Resume_devices; + } + + error = dpm_suspend_end(PMSG_HIBERNATE); + if (error) + goto Resume_devices; + + error = hibernation_ops->prepare(); + if (error) + goto Platform_finish; + + error = disable_nonboot_cpus(); + if (error) + goto Platform_finish; + + local_irq_disable(); + syscore_suspend(); + if (pm_wakeup_pending()) { + error = -EAGAIN; + goto Power_up; + } + + hibernation_ops->enter(); + /* We should never get here */ + while (1); + + Power_up: + syscore_resume(); + local_irq_enable(); + enable_nonboot_cpus(); + + Platform_finish: + hibernation_ops->finish(); + + dpm_resume_start(PMSG_RESTORE); + + Resume_devices: + entering_platform_hibernation = false; + dpm_resume_end(PMSG_RESTORE); + ftrace_start(); + resume_console(); + + Close: + hibernation_ops->end(); + + return error; +} + +/** + * power_down - Shut the machine down for hibernation. + * + * Use the platform driver, if configured, to put the system into the sleep + * state corresponding to hibernation, or try to power it off or reboot, + * depending on the value of hibernation_mode. + */ +static void power_down(void) +{ +#ifdef CONFIG_SUSPEND + int error; +#endif + + switch (hibernation_mode) { + case HIBERNATION_REBOOT: + kernel_restart(NULL); + break; + case HIBERNATION_PLATFORM: + hibernation_platform_enter(); + /* Fall through: power off if the platform transition fails. */ + case HIBERNATION_SHUTDOWN: + if (pm_power_off) + kernel_power_off(); + break; +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: + error = suspend_devices_and_enter(PM_SUSPEND_MEM); + if (error) { + if (hibernation_ops) + hibernation_mode = HIBERNATION_PLATFORM; + else + hibernation_mode = HIBERNATION_SHUTDOWN; + power_down(); + } + /* + * Restore swap signature. + */ + error = swsusp_unmark(); + if (error) + printk(KERN_ERR "PM: Swap will be unusable! " + "Try swapon -a.\n"); + return; +#endif + } + kernel_halt(); + /* + * Valid image is on the disk; if we continue we risk serious data + * corruption after resume. + */ + printk(KERN_CRIT "PM: Please power down manually\n"); + while (1) + cpu_relax(); +} + +/** + * hibernate - Carry out system hibernation, including saving the image. + */ +int hibernate(void) +{ + int error; + + if (!hibernation_available()) { + pr_debug("PM: Hibernation not available.\n"); + return -EPERM; + } + + lock_system_sleep(); + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } + + pm_prepare_console(); + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + if (error) + goto Exit; + + printk(KERN_INFO "PM: Syncing filesystems ... ");
+ sys_sync(); + printk("done.\n"); + + error = freeze_processes(); + if (error) + goto Exit; + + lock_device_hotplug(); + /* Allocate memory management structures */ + error = create_basic_memory_bitmaps(); + if (error) + goto Thaw; + + error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); + if (error || freezer_test_done) + goto Free_bitmaps; + + if (in_suspend) { + unsigned int flags = 0; + + if (hibernation_mode == HIBERNATION_PLATFORM) + flags |= SF_PLATFORM_MODE; + if (nocompress) + flags |= SF_NOCOMPRESS_MODE; + else + flags |= SF_CRC32_MODE; + + pr_debug("PM: writing image.\n"); + error = swsusp_write(flags); + swsusp_free(); + if (!error) + power_down(); + in_suspend = 0; + pm_restore_gfp_mask(); + } else { + pr_debug("PM: Image restored successfully.\n"); + } + + Free_bitmaps: + free_basic_memory_bitmaps(); + Thaw: + unlock_device_hotplug(); + thaw_processes(); + + /* Don't bother checking whether freezer_test_done is true */ + freezer_test_done = false; + Exit: + pm_notifier_call_chain(PM_POST_HIBERNATION); + pm_restore_console(); + atomic_inc(&snapshot_device_available); + Unlock: + unlock_system_sleep(); + return error; +} + + +/** + * software_resume - Resume from a saved hibernation image. + * + * This routine is called as a late initcall, when all devices have been + * discovered and initialized already. + * + * The image reading code is called to see if there is a hibernation image + * available for reading. If that is the case, devices are quiesced and the + * contents of memory are restored from the saved image. + * + * If this is successful, control reappears in the restored target kernel in + * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine + * attempts to recover gracefully and make the kernel return to the normal mode + * of operation. + */ +static int software_resume(void) +{ + int error; + unsigned int flags; + + /* + * If the user said "noresume", bail out early. + */ + if (noresume || !hibernation_available()) + return 0; + + /* + * name_to_dev_t() below takes a sysfs buffer mutex when sysfs + * is configured into the kernel. Since the regular hibernate + * trigger path is via sysfs which takes a buffer mutex before + * calling hibernate functions (which take pm_mutex) this can + * cause lockdep to complain about a possible ABBA deadlock + * which cannot happen since we're in the boot code here and + * sysfs can't be invoked yet. Therefore, we use a subclass + * here to avoid lockdep complaining. + */ + mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); + + if (swsusp_resume_device) + goto Check_image; + + if (!strlen(resume_file)) { + error = -ENOENT; + goto Unlock; + } + + pr_debug("PM: Checking hibernation image partition %s\n", resume_file); + + if (resume_delay) { + printk(KERN_INFO "Waiting %dsec before reading resume device...\n", + resume_delay); + ssleep(resume_delay); + } + + /* Check if the device is there */ + swsusp_resume_device = name_to_dev_t(resume_file); + + /* + * name_to_dev_t() is ineffective for verifying the partition if + * resume_file is in integer format (e.g. major:minor). + */ + if (isdigit(resume_file[0]) && resume_wait) { + int partno; + while (!get_gendisk(swsusp_resume_device, &partno)) + msleep(10); + } + + if (!swsusp_resume_device) { + /* + * Some device discovery might still be in progress; we need + * to wait for this to finish.
+ */ + wait_for_device_probe(); + + if (resume_wait) { + while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) + msleep(10); + async_synchronize_full(); + } + + swsusp_resume_device = name_to_dev_t(resume_file); + if (!swsusp_resume_device) { + error = -ENODEV; + goto Unlock; + } + } + + Check_image: + pr_debug("PM: Hibernation image partition %d:%d present\n", + MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); + + pr_debug("PM: Looking for hibernation image.\n"); + error = swsusp_check(); + if (error) + goto Unlock; + + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + swsusp_close(FMODE_READ); + goto Unlock; + } + + pm_prepare_console(); + error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + if (error) + goto Close_Finish; + + pr_debug("PM: Preparing processes for restore.\n"); + error = freeze_processes(); + if (error) + goto Close_Finish; + + pr_debug("PM: Loading hibernation image.\n"); + + lock_device_hotplug(); + error = create_basic_memory_bitmaps(); + if (error) + goto Thaw; + + error = swsusp_read(&flags); + swsusp_close(FMODE_READ); + if (!error) + hibernation_restore(flags & SF_PLATFORM_MODE); + + printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); + swsusp_free(); + free_basic_memory_bitmaps(); + Thaw: + unlock_device_hotplug(); + thaw_processes(); + Finish: + pm_notifier_call_chain(PM_POST_RESTORE); + pm_restore_console(); + atomic_inc(&snapshot_device_available); + /* For success case, the suspend path will release the lock */ + Unlock: + mutex_unlock(&pm_mutex); + pr_debug("PM: Hibernation image not present or could not be loaded.\n"); + return error; + Close_Finish: + swsusp_close(FMODE_READ); + goto Finish; +} + +late_initcall_sync(software_resume); + + +static const char * const hibernation_modes[] = { + [HIBERNATION_PLATFORM] = "platform", + [HIBERNATION_SHUTDOWN] = "shutdown", + [HIBERNATION_REBOOT] = "reboot", +#ifdef CONFIG_SUSPEND + [HIBERNATION_SUSPEND] = "suspend", +#endif +}; + +/* + * /sys/power/disk - Control hibernation mode. + * + * Hibernation can be handled in several ways. There are a few different ways + * to put the system into the sleep state: using the platform driver (e.g. ACPI + * or other hibernation_ops), powering it off or rebooting it (for testing + * mostly). + * + * The sysfs file /sys/power/disk provides an interface for selecting the + * hibernation mode to use. Reading from this file causes the available modes + * to be printed. There are 3 modes that can be supported: + * + * 'platform' + * 'shutdown' + * 'reboot' + * + * If a platform hibernation driver is in use, 'platform' will be supported + * and will be used by default. Otherwise, 'shutdown' will be used by default. + * The selected option (i.e. the one corresponding to the current value of + * hibernation_mode) is enclosed by a square bracket. + * + * To select a given hibernation mode it is necessary to write the mode's + * string representation (as returned by reading from /sys/power/disk) back + * into /sys/power/disk. 
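+ * + * An illustrative session (not part of the original patch), assuming a + * platform hibernation driver is registered: + * + * # cat /sys/power/disk + * [platform] shutdown reboot + * # echo shutdown > /sys/power/disk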
+ */ + +static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int i; + char *start = buf; + + if (!hibernation_available()) + return sprintf(buf, "[disabled]\n"); + + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (!hibernation_modes[i]) + continue; + switch (i) { + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif + break; + case HIBERNATION_PLATFORM: + if (hibernation_ops) + break; + /* not a valid mode, continue with loop */ + continue; + } + if (i == hibernation_mode) + buf += sprintf(buf, "[%s] ", hibernation_modes[i]); + else + buf += sprintf(buf, "%s ", hibernation_modes[i]); + } + buf += sprintf(buf, "\n"); + return buf-start; +} + +static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = 0; + int i; + int len; + char *p; + int mode = HIBERNATION_INVALID; + + if (!hibernation_available()) + return -EPERM; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + lock_system_sleep(); + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (len == strlen(hibernation_modes[i]) + && !strncmp(buf, hibernation_modes[i], len)) { + mode = i; + break; + } + } + if (mode != HIBERNATION_INVALID) { + switch (mode) { + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif + hibernation_mode = mode; + break; + case HIBERNATION_PLATFORM: + if (hibernation_ops) + hibernation_mode = mode; + else + error = -EINVAL; + } + } else + error = -EINVAL; + + if (!error) + pr_debug("PM: Hibernation mode set to '%s'\n", + hibernation_modes[mode]); + unlock_system_sleep(); + return error ? error : n; +} + +power_attr(disk); + +static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), + MINOR(swsusp_resume_device)); +} + +static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + dev_t res; + int len = n; + char *name; + + if (len && buf[len-1] == '\n') + len--; + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; + + res = name_to_dev_t(name); + kfree(name); + if (!res) + return -EINVAL; + + lock_system_sleep(); + swsusp_resume_device = res; + unlock_system_sleep(); + printk(KERN_INFO "PM: Starting manual resume from disk\n"); + noresume = 0; + software_resume(); + return n; +} + +power_attr(resume); + +static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", image_size); +} + +static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long size; + + if (sscanf(buf, "%lu", &size) == 1) { + image_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(image_size); + +static ssize_t reserved_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", reserved_size); +} + +static ssize_t reserved_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long size; + + if (sscanf(buf, "%lu", &size) == 1) { + reserved_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(reserved_size); + +static struct attribute * g[] = { + &disk_attr.attr, + &resume_attr.attr, + &image_size_attr.attr, + &reserved_size_attr.attr, + NULL, +}; + + +static struct 
attribute_group attr_group = { + .attrs = g, +}; + + +static int __init pm_disk_init(void) +{ + return sysfs_create_group(power_kobj, &attr_group); +} + +core_initcall(pm_disk_init); + + +static int __init resume_setup(char *str) +{ + if (noresume) + return 1; + + strncpy( resume_file, str, 255 ); + return 1; +} + +static int __init resume_offset_setup(char *str) +{ + unsigned long long offset; + + if (noresume) + return 1; + + if (sscanf(str, "%llu", &offset) == 1) + swsusp_resume_block = offset; + + return 1; +} + +static int __init hibernate_setup(char *str) +{ + if (!strncmp(str, "noresume", 8)) + noresume = 1; + else if (!strncmp(str, "nocompress", 10)) + nocompress = 1; + else if (!strncmp(str, "no", 2)) { + noresume = 1; + nohibernate = 1; + } + return 1; +} + +static int __init noresume_setup(char *str) +{ + noresume = 1; + return 1; +} + +static int __init resumewait_setup(char *str) +{ + resume_wait = 1; + return 1; +} + +static int __init resumedelay_setup(char *str) +{ + int rc = kstrtouint(str, 0, &resume_delay); + + if (rc) + return rc; + return 1; +} + +static int __init nohibernate_setup(char *str) +{ + noresume = 1; + nohibernate = 1; + return 1; +} + +static int __init kaslr_nohibernate_setup(char *str) +{ + return nohibernate_setup(str); +} + +__setup("noresume", noresume_setup); +__setup("resume_offset=", resume_offset_setup); +__setup("resume=", resume_setup); +__setup("hibernate=", hibernate_setup); +__setup("resumewait", resumewait_setup); +__setup("resumedelay=", resumedelay_setup); +__setup("nohibernate", nohibernate_setup); +__setup("kaslr", kaslr_nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 873228c71da..8e90f330f13 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -3,300 +3,536 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * + * * This file is released under the GPLv2 * */ -#include <linux/suspend.h> +#include <linux/export.h> #include <linux/kobject.h> #include <linux/string.h> -#include <linux/delay.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/pm.h> -#include <linux/console.h> -#include <linux/cpu.h> #include <linux/resume-trace.h> +#include <linux/workqueue.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include "power.h" -/*This is just an arbitrary number */ -#define FREE_PAGE_NUMBER (100) +DEFINE_MUTEX(pm_mutex); -DECLARE_MUTEX(pm_sem); +#ifdef CONFIG_PM_SLEEP -struct pm_ops *pm_ops; -suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; +/* Routines for PM-transition notifications */ -/** - * pm_set_ops - Set the global power method table. - * @ops: Pointer to ops structure. - */ +static BLOCKING_NOTIFIER_HEAD(pm_chain_head); -void pm_set_ops(struct pm_ops * ops) +int register_pm_notifier(struct notifier_block *nb) { - down(&pm_sem); - pm_ops = ops; - up(&pm_sem); + return blocking_notifier_chain_register(&pm_chain_head, nb); } +EXPORT_SYMBOL_GPL(register_pm_notifier); +int unregister_pm_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&pm_chain_head, nb); +} +EXPORT_SYMBOL_GPL(unregister_pm_notifier); -/** - * suspend_prepare - Do prep work before entering low-power state. - * @state: State we're entering. - * - * This is common code that is called for each state that we're - * entering. Allocate a console, stop all processes, then make sure - * the platform can enter the requested state. 
- */ +int pm_notifier_call_chain(unsigned long val) +{ + int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); + + return notifier_to_errno(ret); +} + +/* If set, devices may be suspended and resumed asynchronously. */ +int pm_async_enabled = 1; -static int suspend_prepare(suspend_state_t state) +static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) { - int error; - unsigned int free_pages; + return sprintf(buf, "%d\n", pm_async_enabled); +} - if (!pm_ops || !pm_ops->enter) - return -EPERM; +static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; - pm_prepare_console(); + if (kstrtoul(buf, 10, &val)) + return -EINVAL; - error = disable_nonboot_cpus(); - if (error) - goto Enable_cpu; + if (val > 1) + return -EINVAL; - if (freeze_processes()) { - error = -EAGAIN; - goto Thaw; - } + pm_async_enabled = val; + return n; +} + +power_attr(pm_async); - if ((free_pages = nr_free_pages()) < FREE_PAGE_NUMBER) { - pr_debug("PM: free some memory\n"); - shrink_all_memory(FREE_PAGE_NUMBER - free_pages); - if (nr_free_pages() < FREE_PAGE_NUMBER) { - error = -ENOMEM; - printk(KERN_ERR "PM: No enough memory\n"); - goto Thaw; +#ifdef CONFIG_PM_DEBUG +int pm_test_level = TEST_NONE; + +static const char * const pm_tests[__TEST_AFTER_LAST] = { + [TEST_NONE] = "none", + [TEST_CORE] = "core", + [TEST_CPUS] = "processors", + [TEST_PLATFORM] = "platform", + [TEST_DEVICES] = "devices", + [TEST_FREEZER] = "freezer", +}; + +static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + char *s = buf; + int level; + + for (level = TEST_FIRST; level <= TEST_MAX; level++) + if (pm_tests[level]) { + if (level == pm_test_level) + s += sprintf(s, "[%s] ", pm_tests[level]); + else + s += sprintf(s, "%s ", pm_tests[level]); } - } - if (pm_ops->prepare) { - if ((error = pm_ops->prepare(state))) - goto Thaw; + if (s != buf) + /* convert the last space to a newline */ + *(s-1) = '\n'; + + return (s - buf); +} + +static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + const char * const *s; + int level; + char *p; + int len; + int error = -EINVAL; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + lock_system_sleep(); + + level = TEST_FIRST; + for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { + pm_test_level = level; + error = 0; + break; + } + + unlock_system_sleep(); + + return error ? 
error : n; +} + +power_attr(pm_test); +#endif /* CONFIG_PM_DEBUG */ + +#ifdef CONFIG_DEBUG_FS +static char *suspend_step_name(enum suspend_stat_step step) +{ + switch (step) { + case SUSPEND_FREEZE: + return "freeze"; + case SUSPEND_PREPARE: + return "prepare"; + case SUSPEND_SUSPEND: + return "suspend"; + case SUSPEND_SUSPEND_NOIRQ: + return "suspend_noirq"; + case SUSPEND_RESUME_NOIRQ: + return "resume_noirq"; + case SUSPEND_RESUME: + return "resume"; + default: + return ""; } +} - suspend_console(); - if ((error = device_suspend(PMSG_SUSPEND))) { - printk(KERN_ERR "Some devices failed to suspend\n"); - goto Finish; +static int suspend_stats_show(struct seq_file *s, void *unused) +{ + int i, index, last_dev, last_errno, last_step; + + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; + last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; + last_errno %= REC_FAILED_NUM; + last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; + last_step %= REC_FAILED_NUM; + seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" + "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", + "success", suspend_stats.success, + "fail", suspend_stats.fail, + "failed_freeze", suspend_stats.failed_freeze, + "failed_prepare", suspend_stats.failed_prepare, + "failed_suspend", suspend_stats.failed_suspend, + "failed_suspend_late", + suspend_stats.failed_suspend_late, + "failed_suspend_noirq", + suspend_stats.failed_suspend_noirq, + "failed_resume", suspend_stats.failed_resume, + "failed_resume_early", + suspend_stats.failed_resume_early, + "failed_resume_noirq", + suspend_stats.failed_resume_noirq); + seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", + suspend_stats.failed_devs[last_dev]); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_dev + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-s\n", + suspend_stats.failed_devs[index]); + } + seq_printf(s, " last_failed_errno:\t%-d\n", + suspend_stats.errno[last_errno]); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_errno + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-d\n", + suspend_stats.errno[index]); } + seq_printf(s, " last_failed_step:\t%-s\n", + suspend_step_name( + suspend_stats.failed_steps[last_step])); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_step + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-s\n", + suspend_step_name( + suspend_stats.failed_steps[index])); + } + return 0; - Finish: - if (pm_ops->finish) - pm_ops->finish(state); - Thaw: - thaw_processes(); - Enable_cpu: - enable_nonboot_cpus(); - pm_restore_console(); - return error; } - -int suspend_enter(suspend_state_t state) +static int suspend_stats_open(struct inode *inode, struct file *file) { - int error = 0; - unsigned long flags; + return single_open(file, suspend_stats_show, NULL); +} - local_irq_save(flags); +static const struct file_operations suspend_stats_operations = { + .open = suspend_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; - if ((error = device_power_down(PMSG_SUSPEND))) { - printk(KERN_ERR "Some devices failed to power down\n"); - goto Done; - } - error = pm_ops->enter(state); - device_power_up(); - Done: - local_irq_restore(flags); - return error; +static int __init pm_debugfs_init(void) +{ + debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, + NULL, NULL, &suspend_stats_operations); + return 0; } +late_initcall(pm_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ -/** - * 
suspend_finish - Do final work before exiting suspend sequence.
- * @state:		State we're coming out of.
+#endif /* CONFIG_PM_SLEEP */
+
+#ifdef CONFIG_PM_SLEEP_DEBUG
+/*
+ * pm_print_times: print time taken by devices to suspend and resume.
  *
- * Call platform code to clean up, restart processes, and free the
- * console that we've allocated. This is not called for suspend-to-disk.
+ * show() returns whether printing of suspend and resume times is enabled.
+ * store() accepts 0 or 1.  0 disables printing and 1 enables it.
  */
+bool pm_print_times_enabled;
 
-static void suspend_finish(suspend_state_t state)
+static ssize_t pm_print_times_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
 {
-	device_resume();
-	resume_console();
-	thaw_processes();
-	enable_nonboot_cpus();
-	if (pm_ops && pm_ops->finish)
-		pm_ops->finish(state);
-	pm_restore_console();
+	return sprintf(buf, "%d\n", pm_print_times_enabled);
 }
 
+static ssize_t pm_print_times_store(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    const char *buf, size_t n)
+{
+	unsigned long val;
+
+	if (kstrtoul(buf, 10, &val))
+		return -EINVAL;
+
+	if (val > 1)
+		return -EINVAL;
 
-static const char * const pm_states[PM_SUSPEND_MAX] = {
-	[PM_SUSPEND_STANDBY]	= "standby",
-	[PM_SUSPEND_MEM]	= "mem",
-#ifdef CONFIG_SOFTWARE_SUSPEND
-	[PM_SUSPEND_DISK]	= "disk",
-#endif
-};
+	pm_print_times_enabled = !!val;
+	return n;
+}
 
-static inline int valid_state(suspend_state_t state)
-{
-	/* Suspend-to-disk does not really need low-level support.
-	 * It can work with reboot if needed. */
-	if (state == PM_SUSPEND_DISK)
-		return 1;
+power_attr(pm_print_times);
 
-	if (pm_ops && pm_ops->valid && !pm_ops->valid(state))
-		return 0;
-	return 1;
+static inline void pm_print_times_init(void)
+{
+	pm_print_times_enabled = !!initcall_debug;
 }
+#else /* !CONFIG_PM_SLEEP_DEBUG */
+static inline void pm_print_times_init(void) {}
+#endif /* CONFIG_PM_SLEEP_DEBUG */
 
+struct kobject *power_kobj;
 
 /**
- * enter_state - Do common work of entering low-power state.
- * @state:		pm_state structure for state we're entering.
+ * state - control system sleep states.
+ *
+ * show() returns available sleep state labels, which may be "mem", "standby",
+ * "freeze" and "disk" (hibernation).  See Documentation/power/states.txt for a
+ * description of what they mean.
  *
- * Make sure we're the only ones trying to enter a sleep state. Fail
- * if someone has beat us to it, since we don't want anything weird to
- * happen when we wake up.
- * Then, do the setup for suspend, enter the state, and cleaup (after
- * we've woken up).
+ * store() accepts one of those strings, translates it into the proper
+ * enumerated value, and initiates a suspend transition.
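+ *
+ * A minimal user-space sketch (illustrative; error handling omitted) that
+ * initiates suspend-to-RAM:
+ *
+ *	int fd = open("/sys/power/state", O_WRONLY);
+ *	if (write(fd, "mem", 3) < 0)
+ *		perror("suspend failed or was aborted");
+ *	close(fd);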
*/ - -static int enter_state(suspend_state_t state) +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) { - int error; + char *s = buf; +#ifdef CONFIG_SUSPEND + suspend_state_t i; - if (!valid_state(state)) - return -ENODEV; - if (down_trylock(&pm_sem)) - return -EBUSY; + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (pm_states[i].state) + s += sprintf(s,"%s ", pm_states[i].label); - if (state == PM_SUSPEND_DISK) { - error = pm_suspend_disk(); - goto Unlock; - } +#endif + if (hibernation_available()) + s += sprintf(s, "disk "); + if (s != buf) + /* convert the last space to a newline */ + *(s-1) = '\n'; + return (s - buf); +} + +static suspend_state_t decode_state(const char *buf, size_t n) +{ +#ifdef CONFIG_SUSPEND + suspend_state_t state = PM_SUSPEND_MIN; + struct pm_sleep_state *s; +#endif + char *p; + int len; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); - if ((error = suspend_prepare(state))) - goto Unlock; + /* Check hibernation first. */ + if (len == 4 && !strncmp(buf, "disk", len)) + return PM_SUSPEND_MAX; - pr_debug("PM: Entering %s sleep\n", pm_states[state]); - error = suspend_enter(state); +#ifdef CONFIG_SUSPEND + for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) + if (s->state && len == strlen(s->label) + && !strncmp(buf, s->label, len)) + return s->state; +#endif - pr_debug("PM: Finishing wakeup.\n"); - suspend_finish(state); - Unlock: - up(&pm_sem); - return error; + return PM_SUSPEND_ON; } -/* - * This is main interface to the outside world. It needs to be - * called from process context. - */ -int software_suspend(void) +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) { - return enter_state(PM_SUSPEND_DISK); + suspend_state_t state; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + + state = decode_state(buf, n); + if (state < PM_SUSPEND_MAX) + error = pm_suspend(state); + else if (state == PM_SUSPEND_MAX) + error = hibernate(); + else + error = -EINVAL; + + out: + pm_autosleep_unlock(); + return error ? error : n; } +power_attr(state); -/** - * pm_suspend - Externally visible function for suspending system. - * @state: Enumarted value of state to enter. +#ifdef CONFIG_PM_SLEEP +/* + * The 'wakeup_count' attribute, along with the functions defined in + * drivers/base/power/wakeup.c, provides a means by which wakeup events can be + * handled in a non-racy way. * - * Determine whether or not value is within range, get state - * structure, and enter (above). + * If a wakeup event occurs when the system is in a sleep state, it simply is + * woken up. In turn, if an event that would wake the system up from a sleep + * state occurs when it is undergoing a transition to that sleep state, the + * transition should be aborted. Moreover, if such an event occurs when the + * system is in the working state, an attempt to start a transition to the + * given sleep state should fail during certain period after the detection of + * the event. Using the 'state' attribute alone is not sufficient to satisfy + * these requirements, because a wakeup event may occur exactly when 'state' + * is being written to and may be delivered to user space right before it is + * frozen, so the event will remain only partially processed until the system is + * woken up by another event. 
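+ *
+ * In code form, the safe sequence spelled out in the rest of this comment
+ * looks roughly like this (a user-space sketch; error handling omitted):
+ *
+ *	char count[32];
+ *	int fd = open("/sys/power/wakeup_count", O_RDWR);
+ *	int n = read(fd, count, sizeof(count));	/* may block briefly */
+ *	/* ... finish user-space suspend preparations ... */
+ *	if (write(fd, count, n) < 0) {
+ *		/* a wakeup event occurred: do not write to 'state', retry */
+ *	} else {
+ *		/* safe to write "mem" to /sys/power/state now */
+ *	}
+ *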
In particular, it won't cause the transition to + * a sleep state to be aborted. + * + * This difficulty may be overcome if user space uses 'wakeup_count' before + * writing to 'state'. It first should read from 'wakeup_count' and store + * the read value. Then, after carrying out its own preparations for the system + * transition to a sleep state, it should write the stored value to + * 'wakeup_count'. If that fails, at least one wakeup event has occurred since + * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it + * is allowed to write to 'state', but the transition will be aborted if there + * are any wakeup events detected after 'wakeup_count' was written to. */ -int pm_suspend(suspend_state_t state) +static ssize_t wakeup_count_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { - if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) - return enter_state(state); - return -EINVAL; + unsigned int val; + + return pm_get_wakeup_count(&val, true) ? + sprintf(buf, "%u\n", val) : -EINTR; } +static ssize_t wakeup_count_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned int val; + int error; + error = pm_autosleep_lock(); + if (error) + return error; -decl_subsys(power,NULL,NULL); + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + error = -EINVAL; + if (sscanf(buf, "%u", &val) == 1) { + if (pm_save_wakeup_count(val)) + error = n; + else + pm_print_active_wakeup_sources(); + } -/** - * state - control system power state. - * - * show() returns what states are supported, which is hard-coded to - * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and - * 'disk' (Suspend-to-Disk). - * - * store() accepts one of those strings, translates it into the - * proper enumerated value, and initiates a suspend transition. - */ + out: + pm_autosleep_unlock(); + return error; +} + +power_attr(wakeup_count); -static ssize_t state_show(struct subsystem * subsys, char * buf) +#ifdef CONFIG_PM_AUTOSLEEP +static ssize_t autosleep_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { - int i; - char * s = buf; + suspend_state_t state = pm_autosleep_state(); - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i] && valid_state(i)) - s += sprintf(s,"%s ", pm_states[i]); - } - s += sprintf(s,"\n"); - return (s - buf); + if (state == PM_SUSPEND_ON) + return sprintf(buf, "off\n"); + +#ifdef CONFIG_SUSPEND + if (state < PM_SUSPEND_MAX) + return sprintf(buf, "%s\n", pm_states[state].state ? + pm_states[state].label : "error"); +#endif +#ifdef CONFIG_HIBERNATION + return sprintf(buf, "disk\n"); +#else + return sprintf(buf, "error"); +#endif } -static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) +static ssize_t autosleep_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) { - suspend_state_t state = PM_SUSPEND_STANDBY; - const char * const *s; - char *p; + suspend_state_t state = decode_state(buf, n); int error; - int len; - p = memchr(buf, '\n', n); - len = p ? p - buf : n; + if (state == PM_SUSPEND_ON + && strcmp(buf, "off") && strcmp(buf, "off\n")) + return -EINVAL; - for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && !strncmp(buf, *s, len)) - break; - } - if (state < PM_SUSPEND_MAX && *s) - error = enter_state(state); - else - error = -EINVAL; + error = pm_autosleep_set_state(state); return error ? 
error : n; } -power_attr(state); +power_attr(autosleep); +#endif /* CONFIG_PM_AUTOSLEEP */ + +#ifdef CONFIG_PM_WAKELOCKS +static ssize_t wake_lock_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_show_wakelocks(buf, true); +} + +static ssize_t wake_lock_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = pm_wake_lock(buf); + return error ? error : n; +} + +power_attr(wake_lock); + +static ssize_t wake_unlock_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_show_wakelocks(buf, false); +} + +static ssize_t wake_unlock_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = pm_wake_unlock(buf); + return error ? error : n; +} + +power_attr(wake_unlock); + +#endif /* CONFIG_PM_WAKELOCKS */ +#endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_TRACE int pm_trace_enabled; -static ssize_t pm_trace_show(struct subsystem * subsys, char * buf) +static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) { return sprintf(buf, "%d\n", pm_trace_enabled); } static ssize_t -pm_trace_store(struct subsystem * subsys, const char * buf, size_t n) +pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) { int val; if (sscanf(buf, "%d", &val) == 1) { pm_trace_enabled = !!val; + if (pm_trace_enabled) { + pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n" + "PM: Correct system time has to be restored manually after resume.\n"); + } return n; } return -EINVAL; @@ -304,29 +540,110 @@ pm_trace_store(struct subsystem * subsys, const char * buf, size_t n) power_attr(pm_trace); +static ssize_t pm_trace_dev_match_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return show_trace_dev_match(buf, PAGE_SIZE); +} + +static ssize_t +pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + return -EINVAL; +} + +power_attr(pm_trace_dev_match); + +#endif /* CONFIG_PM_TRACE */ + +#ifdef CONFIG_FREEZER +static ssize_t pm_freeze_timeout_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", freeze_timeout_msecs); +} + +static ssize_t pm_freeze_timeout_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + freeze_timeout_msecs = val; + return n; +} + +power_attr(pm_freeze_timeout); + +#endif /* CONFIG_FREEZER*/ + static struct attribute * g[] = { &state_attr.attr, +#ifdef CONFIG_PM_TRACE &pm_trace_attr.attr, + &pm_trace_dev_match_attr.attr, +#endif +#ifdef CONFIG_PM_SLEEP + &pm_async_attr.attr, + &wakeup_count_attr.attr, +#ifdef CONFIG_PM_AUTOSLEEP + &autosleep_attr.attr, +#endif +#ifdef CONFIG_PM_WAKELOCKS + &wake_lock_attr.attr, + &wake_unlock_attr.attr, +#endif +#ifdef CONFIG_PM_DEBUG + &pm_test_attr.attr, +#endif +#ifdef CONFIG_PM_SLEEP_DEBUG + &pm_print_times_attr.attr, +#endif +#endif +#ifdef CONFIG_FREEZER + &pm_freeze_timeout_attr.attr, +#endif NULL, }; -#else -static struct attribute * g[] = { - &state_attr.attr, - NULL, -}; -#endif /* CONFIG_PM_TRACE */ static struct attribute_group attr_group = { .attrs = g, }; +#ifdef CONFIG_PM_RUNTIME +struct workqueue_struct *pm_wq; +EXPORT_SYMBOL_GPL(pm_wq); + +static int __init pm_start_workqueue(void) +{ + pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); + + return pm_wq ? 
0 : -ENOMEM; +} +#else +static inline int pm_start_workqueue(void) { return 0; } +#endif static int __init pm_init(void) { - int error = subsystem_register(&power_subsys); - if (!error) - error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group); - return error; + int error = pm_start_workqueue(); + if (error) + return error; + hibernate_image_size_init(); + hibernate_reserved_size_init(); + power_kobj = kobject_create_and_add("power", NULL); + if (!power_kobj) + return -ENOMEM; + error = sysfs_create_group(power_kobj, &attr_group); + if (error) + return error; + pm_print_times_init(); + return pm_autosleep_init(); } core_initcall(pm_init); diff --git a/kernel/power/pm.c b/kernel/power/pm.c deleted file mode 100644 index c50d15266c1..00000000000 --- a/kernel/power/pm.c +++ /dev/null @@ -1,209 +0,0 @@ -/* - * pm.c - Power management interface - * - * Copyright (C) 2000 Andrew Henroid - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#include <linux/init.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/pm.h> -#include <linux/pm_legacy.h> -#include <linux/interrupt.h> -#include <linux/mutex.h> - -int pm_active; - -/* - * Locking notes: - * pm_devs_lock can be a semaphore providing pm ops are not called - * from an interrupt handler (already a bad idea so no change here). Each - * change must be protected so that an unlink of an entry doesn't clash - * with a pm send - which is permitted to sleep in the current architecture - * - * Module unloads clashing with pm events now work out safely, the module - * unload path will block until the event has been sent. It may well block - * until a resume but that will be fine. - */ - -static DEFINE_MUTEX(pm_devs_lock); -static LIST_HEAD(pm_devs); - -/** - * pm_register - register a device with power management - * @type: device type - * @id: device ID - * @callback: callback function - * - * Add a device to the list of devices that wish to be notified about - * power management events. A &pm_dev structure is returned on success, - * on failure the return is %NULL. - * - * The callback function will be called in process context and - * it may sleep. - */ - -struct pm_dev *pm_register(pm_dev_t type, - unsigned long id, - pm_callback callback) -{ - struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); - if (dev) { - dev->type = type; - dev->id = id; - dev->callback = callback; - - mutex_lock(&pm_devs_lock); - list_add(&dev->entry, &pm_devs); - mutex_unlock(&pm_devs_lock); - } - return dev; -} - -/** - * pm_send - send request to a single device - * @dev: device to send to - * @rqst: power management request - * @data: data for the callback - * - * Issue a power management request to a given device. The - * %PM_SUSPEND and %PM_RESUME events are handled specially. 
The - * data field must hold the intended next state. No call is made - * if the state matches. - * - * BUGS: what stops two power management requests occurring in parallel - * and conflicting. - * - * WARNING: Calling pm_send directly is not generally recommended, in - * particular there is no locking against the pm_dev going away. The - * caller must maintain all needed locking or have 'inside knowledge' - * on the safety. Also remember that this function is not locked against - * pm_unregister. This means that you must handle SMP races on callback - * execution and unload yourself. - */ - -static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data) -{ - int status = 0; - unsigned long prev_state, next_state; - - if (in_interrupt()) - BUG(); - - switch (rqst) { - case PM_SUSPEND: - case PM_RESUME: - prev_state = dev->state; - next_state = (unsigned long) data; - if (prev_state != next_state) { - if (dev->callback) - status = (*dev->callback)(dev, rqst, data); - if (!status) { - dev->state = next_state; - dev->prev_state = prev_state; - } - } - else { - dev->prev_state = prev_state; - } - break; - default: - if (dev->callback) - status = (*dev->callback)(dev, rqst, data); - break; - } - return status; -} - -/* - * Undo incomplete request - */ -static void pm_undo_all(struct pm_dev *last) -{ - struct list_head *entry = last->entry.prev; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - if (dev->state != dev->prev_state) { - /* previous state was zero (running) resume or - * previous state was non-zero (suspended) suspend - */ - pm_request_t undo = (dev->prev_state - ? PM_SUSPEND:PM_RESUME); - pm_send(dev, undo, (void*) dev->prev_state); - } - entry = entry->prev; - } -} - -/** - * pm_send_all - send request to all managed devices - * @rqst: power management request - * @data: data for the callback - * - * Issue a power management request to a all devices. The - * %PM_SUSPEND events are handled specially. Any device is - * permitted to fail a suspend by returning a non zero (error) - * value from its callback function. If any device vetoes a - * suspend request then all other devices that have suspended - * during the processing of this request are restored to their - * previous state. - * - * WARNING: This function takes the pm_devs_lock. The lock is not dropped until - * the callbacks have completed. This prevents races against pm locking - * functions, races against module unload pm_unregister code. It does - * mean however that you must not issue pm_ functions within the callback - * or you will deadlock and users will hate you. - * - * Zero is returned on success. If a suspend fails then the status - * from the device that vetoes the suspend is returned. - * - * BUGS: what stops two power management requests occurring in parallel - * and conflicting. 
- */ - -int pm_send_all(pm_request_t rqst, void *data) -{ - struct list_head *entry; - - mutex_lock(&pm_devs_lock); - entry = pm_devs.next; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - if (dev->callback) { - int status = pm_send(dev, rqst, data); - if (status) { - /* return devices to previous state on - * failed suspend request - */ - if (rqst == PM_SUSPEND) - pm_undo_all(dev); - mutex_unlock(&pm_devs_lock); - return status; - } - } - entry = entry->next; - } - mutex_unlock(&pm_devs_lock); - return 0; -} - -EXPORT_SYMBOL(pm_register); -EXPORT_SYMBOL(pm_send_all); -EXPORT_SYMBOL(pm_active); - - diff --git a/kernel/power/power.h b/kernel/power/power.h index bfe999f7b27..c60f13b5270 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -1,5 +1,8 @@ #include <linux/suspend.h> +#include <linux/suspend_ioctls.h> #include <linux/utsname.h> +#include <linux/freezer.h> +#include <linux/compiler.h> struct swsusp_info { struct new_utsname uts; @@ -9,22 +12,63 @@ struct swsusp_info { unsigned long image_pages; unsigned long pages; unsigned long size; -} __attribute__((aligned(PAGE_SIZE))); +} __aligned(PAGE_SIZE); +#ifdef CONFIG_HIBERNATION +/* kernel/power/snapshot.c */ +extern void __init hibernate_reserved_size_init(void); +extern void __init hibernate_image_size_init(void); +#ifdef CONFIG_ARCH_HIBERNATION_HEADER +/* Maximum size of architecture specific data in a hibernation header */ +#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) -#ifdef CONFIG_SOFTWARE_SUSPEND -extern int pm_suspend_disk(void); +extern int arch_hibernation_header_save(void *addr, unsigned int max_size); +extern int arch_hibernation_header_restore(void *addr); -#else -static inline int pm_suspend_disk(void) +static inline int init_header_complete(struct swsusp_info *info) { - return -EPERM; + return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE); } -#endif -extern struct semaphore pm_sem; + +static inline char *check_image_kernel(struct swsusp_info *info) +{ + return arch_hibernation_header_restore(info) ? + "architecture specific data" : NULL; +} +#endif /* CONFIG_ARCH_HIBERNATION_HEADER */ + +/* + * Keep some memory free so that I/O operations can succeed without paging + * [Might this be more than 4 MB?] + */ +#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT) + +/* + * Keep 1 MB of memory free so that device drivers can allocate some pages in + * their .suspend() routines without breaking the suspend to disk. 
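+ *
+ * With 4 KiB pages (PAGE_SHIFT == 12) this works out to 256 pages, and
+ * PAGES_FOR_IO above works out to 1024 pages (4 MB).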
+ */ +#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) + +asmlinkage int swsusp_save(void); + +/* kernel/power/hibernate.c */ +extern bool freezer_test_done; + +extern int hibernation_snapshot(int platform_mode); +extern int hibernation_restore(int platform_mode); +extern int hibernation_platform_enter(void); + +#else /* !CONFIG_HIBERNATION */ + +static inline void hibernate_reserved_size_init(void) {} +static inline void hibernate_image_size_init(void) {} +#endif /* !CONFIG_HIBERNATION */ + +extern int pfn_is_nosave(unsigned long); + #define power_attr(_name) \ -static struct subsys_attribute _name##_attr = { \ +static struct kobj_attribute _name##_attr = { \ .attr = { \ .name = __stringify(_name), \ .mode = 0644, \ @@ -33,20 +77,20 @@ static struct subsys_attribute _name##_attr = { \ .store = _name##_store, \ } -extern struct subsystem power_subsys; - -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; +/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ +extern unsigned long reserved_size; extern int in_suspend; extern dev_t swsusp_resume_device; +extern sector_t swsusp_resume_block; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); -extern unsigned int count_data_pages(void); +extern int create_basic_memory_bitmaps(void); +extern void free_basic_memory_bitmaps(void); +extern int hibernate_preallocate_memory(void); /** * Auxiliary structure used for reading the snapshot image data and @@ -69,24 +113,12 @@ extern unsigned int count_data_pages(void); */ struct snapshot_handle { - loff_t offset; /* number of the last byte ready for reading - * or writing in the sequence - */ unsigned int cur; /* number of the block of PAGE_SIZE bytes the * next operation will refer to (ie. 
current)
					 */
-	unsigned int	cur_offset;	/* offset with respect to the current
-					 * block (for the next operation)
-					 */
-	unsigned int	prev;		/* number of the block of PAGE_SIZE bytes that
-					 * was the current one previously
-					 */
 	void		*buffer;	/* address of the block to read from
 					 * or write to
 					 */
-	unsigned int	buf_offset;	/* location to read from or write to,
-					 * given as a displacement from 'buffer'
-					 */
 	int		sync_read;	/* Set to one to notify the caller of
 					 * snapshot_write_next() that it may
 					 * need to call wait_on_bio_chain()
@@ -97,59 +129,174 @@ struct snapshot_handle {
 	 * snapshot_read_next()/snapshot_write_next() is allowed to
 	 * read/write data after the function returns
 	 */
-#define data_of(handle)	((handle).buffer + (handle).buf_offset)
+#define data_of(handle)	((handle).buffer)
 
 extern unsigned int snapshot_additional_pages(struct zone *zone);
-extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
-extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
+extern unsigned long snapshot_get_image_size(void);
+extern int snapshot_read_next(struct snapshot_handle *handle);
+extern int snapshot_write_next(struct snapshot_handle *handle);
+extern void snapshot_write_finalize(struct snapshot_handle *handle);
 extern int snapshot_image_loaded(struct snapshot_handle *handle);
-extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
-
-#define SNAPSHOT_IOC_MAGIC	'3'
-#define SNAPSHOT_FREEZE			_IO(SNAPSHOT_IOC_MAGIC, 1)
-#define SNAPSHOT_UNFREEZE		_IO(SNAPSHOT_IOC_MAGIC, 2)
-#define SNAPSHOT_ATOMIC_SNAPSHOT	_IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
-#define SNAPSHOT_ATOMIC_RESTORE		_IO(SNAPSHOT_IOC_MAGIC, 4)
-#define SNAPSHOT_FREE			_IO(SNAPSHOT_IOC_MAGIC, 5)
-#define SNAPSHOT_SET_IMAGE_SIZE		_IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
-#define SNAPSHOT_AVAIL_SWAP		_IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
-#define SNAPSHOT_GET_SWAP_PAGE		_IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
-#define SNAPSHOT_FREE_SWAP_PAGES	_IO(SNAPSHOT_IOC_MAGIC, 9)
-#define SNAPSHOT_SET_SWAP_FILE		_IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
-#define SNAPSHOT_S2RAM			_IO(SNAPSHOT_IOC_MAGIC, 11)
-#define SNAPSHOT_IOC_MAXNR	11
-/**
- *	The bitmap is used for tracing allocated swap pages
- *
- *	The entire bitmap consists of a number of bitmap_page
- *	structures linked with the help of the .next member.
- *	Thus each page can be allocated individually, so we only
- *	need to make 0-order memory allocations to create
- *	the bitmap.
+/* If unset, the snapshot device cannot be opened. */
+extern atomic_t snapshot_device_available;
+
+extern sector_t alloc_swapdev_block(int swap);
+extern void free_all_swap_pages(int swap);
+extern int swsusp_swap_in_use(void);
+
+/*
+ * Flags that can be passed from the hibernating kernel to the "boot" kernel in
+ * the image header.
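+ *
+ * For example, an image written through the platform driver with
+ * compression enabled carries (SF_PLATFORM_MODE | SF_CRC32_MODE), matching
+ * the flag selection made in hibernate().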
*/ +#define SF_PLATFORM_MODE 1 +#define SF_NOCOMPRESS_MODE 2 +#define SF_CRC32_MODE 4 + +/* kernel/power/hibernate.c */ +extern int swsusp_check(void); +extern void swsusp_free(void); +extern int swsusp_read(unsigned int *flags_p); +extern int swsusp_write(unsigned int flags); +extern void swsusp_close(fmode_t); +#ifdef CONFIG_SUSPEND +extern int swsusp_unmark(void); +#endif + +/* kernel/power/block_io.c */ +extern struct block_device *hib_resume_bdev; + +extern int hib_bio_read_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_bio_write_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_wait_on_bio_chain(struct bio **bio_chain); -#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *)) -#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long)) -#define BITS_PER_CHUNK (sizeof(long) * 8) -#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK) +struct timeval; +/* kernel/power/swsusp.c */ +extern void swsusp_show_speed(struct timeval *, struct timeval *, + unsigned int, char *); -struct bitmap_page { - unsigned long chunks[BITMAP_PAGE_CHUNKS]; - struct bitmap_page *next; +#ifdef CONFIG_SUSPEND +struct pm_sleep_state { + const char *label; + suspend_state_t state; }; -extern void free_bitmap(struct bitmap_page *bitmap); -extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); -extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); -extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); +/* kernel/power/suspend.c */ +extern struct pm_sleep_state pm_states[]; -extern int swsusp_check(void); -extern int swsusp_shrink_memory(void); -extern void swsusp_free(void); -extern int swsusp_suspend(void); -extern int swsusp_resume(void); -extern int swsusp_read(void); -extern int swsusp_write(void); -extern void swsusp_close(void); -extern int suspend_enter(suspend_state_t state); +extern int suspend_devices_and_enter(suspend_state_t state); +#else /* !CONFIG_SUSPEND */ +static inline int suspend_devices_and_enter(suspend_state_t state) +{ + return -ENOSYS; +} +#endif /* !CONFIG_SUSPEND */ + +#ifdef CONFIG_PM_TEST_SUSPEND +/* kernel/power/suspend_test.c */ +extern void suspend_test_start(void); +extern void suspend_test_finish(const char *label); +#else /* !CONFIG_PM_TEST_SUSPEND */ +static inline void suspend_test_start(void) {} +static inline void suspend_test_finish(const char *label) {} +#endif /* !CONFIG_PM_TEST_SUSPEND */ + +#ifdef CONFIG_PM_SLEEP +/* kernel/power/main.c */ +extern int pm_notifier_call_chain(unsigned long val); +#endif + +#ifdef CONFIG_HIGHMEM +int restore_highmem(void); +#else +static inline unsigned int count_highmem_pages(void) { return 0; } +static inline int restore_highmem(void) { return 0; } +#endif + +/* + * Suspend test levels + */ +enum { + /* keep first */ + TEST_NONE, + TEST_CORE, + TEST_CPUS, + TEST_PLATFORM, + TEST_DEVICES, + TEST_FREEZER, + /* keep last */ + __TEST_AFTER_LAST +}; + +#define TEST_FIRST TEST_NONE +#define TEST_MAX (__TEST_AFTER_LAST - 1) + +extern int pm_test_level; + +#ifdef CONFIG_SUSPEND_FREEZER +static inline int suspend_freeze_processes(void) +{ + int error; + + error = freeze_processes(); + /* + * freeze_processes() automatically thaws every task if freezing + * fails. So we need not do anything extra upon error. + */ + if (error) + return error; + + error = freeze_kernel_threads(); + /* + * freeze_kernel_threads() thaws only kernel threads upon freezing + * failure. So we have to thaw the userspace tasks ourselves. 
+ */ + if (error) + thaw_processes(); + + return error; +} + +static inline void suspend_thaw_processes(void) +{ + thaw_processes(); +} +#else +static inline int suspend_freeze_processes(void) +{ + return 0; +} + +static inline void suspend_thaw_processes(void) +{ +} +#endif + +#ifdef CONFIG_PM_AUTOSLEEP + +/* kernel/power/autosleep.c */ +extern int pm_autosleep_init(void); +extern int pm_autosleep_lock(void); +extern void pm_autosleep_unlock(void); +extern suspend_state_t pm_autosleep_state(void); +extern int pm_autosleep_set_state(suspend_state_t state); + +#else /* !CONFIG_PM_AUTOSLEEP */ + +static inline int pm_autosleep_init(void) { return 0; } +static inline int pm_autosleep_lock(void) { return 0; } +static inline void pm_autosleep_unlock(void) {} +static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; } + +#endif /* !CONFIG_PM_AUTOSLEEP */ + +#ifdef CONFIG_PM_WAKELOCKS + +/* kernel/power/wakelock.c */ +extern ssize_t pm_show_wakelocks(char *buf, bool show_active); +extern int pm_wake_lock(const char *buf); +extern int pm_wake_unlock(const char *buf); + +#endif /* !CONFIG_PM_WAKELOCKS */ diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 7a4144ba3af..7ef6866b521 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -10,33 +10,34 @@ #include <linux/pm.h> #include <linux/workqueue.h> #include <linux/reboot.h> +#include <linux/cpumask.h> /* * When the user hits Sys-Rq o to power down the machine this is the * callback we use. */ -static void do_poweroff(void *dummy) +static void do_poweroff(struct work_struct *dummy) { kernel_power_off(); } -static DECLARE_WORK(poweroff_work, do_poweroff, NULL); +static DECLARE_WORK(poweroff_work, do_poweroff); -static void handle_poweroff(int key, struct pt_regs *pt_regs, - struct tty_struct *tty) +static void handle_poweroff(int key) { - schedule_work(&poweroff_work); + /* run sysrq poweroff on boot cpu */ + schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); } static struct sysrq_key_op sysrq_poweroff_op = { .handler = handle_poweroff, - .help_msg = "powerOff", + .help_msg = "poweroff(o)", .action_msg = "Power Off", - .enable_mask = SYSRQ_ENABLE_BOOT, + .enable_mask = SYSRQ_ENABLE_BOOT, }; -static int pm_sysrq_init(void) +static int __init pm_sysrq_init(void) { register_sysrq_key('o', &sysrq_poweroff_op); return 0; diff --git a/kernel/power/process.c b/kernel/power/process.c index 72e72d2c61e..4ee194eb524 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -8,167 +8,221 @@ #undef DEBUG -#include <linux/smp_lock.h> #include <linux/interrupt.h> +#include <linux/oom.h> #include <linux/suspend.h> #include <linux/module.h> #include <linux/syscalls.h> +#include <linux/freezer.h> +#include <linux/delay.h> +#include <linux/workqueue.h> +#include <linux/kmod.h> +#include <trace/events/power.h> /* * Timeout for stopping processes */ -#define TIMEOUT (20 * HZ) +unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; - -static inline int freezeable(struct task_struct * p) +static int try_to_freeze_tasks(bool user_only) { - if ((p == current) || - (p->flags & PF_NOFREEZE) || - (p->exit_state == EXIT_ZOMBIE) || - (p->exit_state == EXIT_DEAD) || - (p->state == TASK_STOPPED)) - return 0; - return 1; -} + struct task_struct *g, *p; + unsigned long end_time; + unsigned int todo; + bool wq_busy = false; + struct timeval start, end; + u64 elapsed_msecs64; + unsigned int elapsed_msecs; + bool wakeup = false; + int sleep_usecs = USEC_PER_MSEC; -/* Refrigerator is place 
where frozen processes are stored :-). */ -void refrigerator(void) -{ - /* Hmm, should we be allowed to suspend when there are realtime - processes around? */ - long save; - save = current->state; - pr_debug("%s entered refrigerator\n", current->comm); - printk("="); - - frozen_process(current); - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - - while (frozen(current)) { - current->state = TASK_UNINTERRUPTIBLE; - schedule(); - } - pr_debug("%s left refrigerator\n", current->comm); - current->state = save; -} + do_gettimeofday(&start); -static inline void freeze_process(struct task_struct *p) -{ - unsigned long flags; + end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); - if (!freezing(p)) { - freeze(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } -} + if (!user_only) + freeze_workqueues_begin(); -static void cancel_freezing(struct task_struct *p) -{ - unsigned long flags; - - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - do_not_freeze(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - recalc_sigpending_tsk(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } -} - -/* 0 = success, else # of processes that we failed to stop */ -int freeze_processes(void) -{ - int todo, nr_user, user_frozen; - unsigned long start_time; - struct task_struct *g, *p; - - printk( "Stopping tasks: " ); - start_time = jiffies; - user_frozen = 0; - do { - nr_user = todo = 0; + while (true) { + todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (!freezeable(p)) - continue; - if (frozen(p)) - continue; - if (p->state == TASK_TRACED && frozen(p->parent)) { - cancel_freezing(p); + if (p == current || !freeze_task(p)) continue; - } - if (p->mm && !(p->flags & PF_BORROWED_MM)) { - /* The task is a user-space one. - * Freeze it unless there's a vfork completion - * pending - */ - if (!p->vfork_done) - freeze_process(p); - nr_user++; - } else { - /* Freeze only if the user space is frozen */ - if (user_frozen) - freeze_process(p); + + if (!freezer_should_skip(p)) todo++; - } } while_each_thread(g, p); read_unlock(&tasklist_lock); - todo += nr_user; - if (!user_frozen && !nr_user) { - sys_sync(); - start_time = jiffies; + + if (!user_only) { + wq_busy = freeze_workqueues_busy(); + todo += wq_busy; } - user_frozen = !nr_user; - yield(); /* Yield is okay here */ - if (todo && time_after(jiffies, start_time + TIMEOUT)) + + if (!todo || time_after(jiffies, end_time)) + break; + + if (pm_wakeup_pending()) { + wakeup = true; break; - } while(todo); + } + + /* + * We need to retry, but first give the freezing tasks some + * time to enter the refrigerator. Start with an initial + * 1 ms sleep followed by exponential backoff until 8 ms. + */ + usleep_range(sleep_usecs / 2, sleep_usecs); + if (sleep_usecs < 8 * USEC_PER_MSEC) + sleep_usecs *= 2; + } + + do_gettimeofday(&end); + elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); + do_div(elapsed_msecs64, NSEC_PER_MSEC); + elapsed_msecs = elapsed_msecs64; - /* This does not unfreeze processes that are already frozen - * (we have slightly ugly calling convention in that respect, - * and caller must call thaw_processes() if something fails), - * but it cleans up leftover PF_FREEZE requests. 
- */ if (todo) { - printk( "\n" ); - printk(KERN_ERR " stopping tasks timed out " - "after %d seconds (%d tasks remaining):\n", - TIMEOUT / HZ, todo); - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (freezeable(p) && !frozen(p)) - printk(KERN_ERR " %s\n", p->comm); - cancel_freezing(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - return todo; + printk("\n"); + printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " + "(%d tasks refusing to freeze, wq_busy=%d):\n", + wakeup ? "aborted" : "failed", + elapsed_msecs / 1000, elapsed_msecs % 1000, + todo - wq_busy, wq_busy); + + if (!wakeup) { + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p != current && !freezer_should_skip(p) + && freezing(p) && !frozen(p)) + sched_show_task(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + } + } else { + printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, + elapsed_msecs % 1000); + } + + return todo ? -EBUSY : 0; +} + +/** + * freeze_processes - Signal user space processes to enter the refrigerator. + * The current thread will not be frozen. The same process that calls + * freeze_processes must later call thaw_processes. + * + * On success, returns 0. On failure, -errno and system is fully thawed. + */ +int freeze_processes(void) +{ + int error; + + error = __usermodehelper_disable(UMH_FREEZING); + if (error) + return error; + + /* Make sure this task doesn't get frozen */ + current->flags |= PF_SUSPEND_TASK; + + if (!pm_freezing) + atomic_inc(&system_freezing_cnt); + + printk("Freezing user space processes ... "); + pm_freezing = true; + error = try_to_freeze_tasks(true); + if (!error) { + printk("done."); + __usermodehelper_set_disable_depth(UMH_DISABLED); + oom_killer_disable(); } + printk("\n"); + BUG_ON(in_atomic()); - printk( "|\n" ); + if (error) + thaw_processes(); + return error; +} + +/** + * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. + * + * On success, returns 0. On failure, -errno and only the kernel threads are + * thawed, so as to give a chance to the caller to do additional cleanups + * (if any) before thawing the userspace tasks. So, it is the responsibility + * of the caller to thaw the userspace tasks, when the time is right. + */ +int freeze_kernel_threads(void) +{ + int error; + + printk("Freezing remaining freezable tasks ... "); + pm_nosig_freezing = true; + error = try_to_freeze_tasks(false); + if (!error) + printk("done."); + + printk("\n"); BUG_ON(in_atomic()); - return 0; + + if (error) + thaw_kernel_threads(); + return error; } void thaw_processes(void) { struct task_struct *g, *p; + struct task_struct *curr = current; + + trace_suspend_resume(TPS("thaw_processes"), 0, true); + if (pm_freezing) + atomic_dec(&system_freezing_cnt); + pm_freezing = false; + pm_nosig_freezing = false; + + oom_killer_enable(); + + printk("Restarting tasks ... "); + + __usermodehelper_set_disable_depth(UMH_FREEZING); + thaw_workqueues(); - printk( "Restarting tasks..." 
);
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
-		if (!freezeable(p))
-			continue;
-		if (!thaw_process(p))
-			printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
+		/* No other threads should have PF_SUSPEND_TASK set */
+		WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
+		__thaw_task(p);
 	} while_each_thread(g, p);
-	read_unlock(&tasklist_lock);
+
+	WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
+	curr->flags &= ~PF_SUSPEND_TASK;
+
+	usermodehelper_enable();
+	schedule();
-	printk( " done\n" );
+	printk("done.\n");
+	trace_suspend_resume(TPS("thaw_processes"), 0, false);
 }
 
-EXPORT_SYMBOL(refrigerator);
+void thaw_kernel_threads(void)
+{
+	struct task_struct *g, *p;
+
+	pm_nosig_freezing = false;
+	printk("Restarting kernel threads ... ");
+
+	thaw_workqueues();
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
+			__thaw_task(p);
+	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
+
+	schedule();
+	printk("done.\n");
+}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
new file mode 100644
index 00000000000..884b7705886
--- /dev/null
+++ b/kernel/power/qos.c
@@ -0,0 +1,601 @@
+/*
+ * This module exposes the interface to kernel space for specifying
+ * QoS dependencies.  It provides infrastructure for registration of:
+ *
+ * Dependents on a QoS value : register requests
+ * Watchers of QoS value : get notified when target QoS value changes
+ *
+ * This QoS design is best effort based.  Dependents register their QoS needs.
+ * Watchers register to keep track of the current QoS needs of the system.
+ *
+ * There are 3 basic classes of QoS parameter: latency, timeout, throughput;
+ * each has defined units:
+ * latency: usec
+ * timeout: usec <-- currently not used.
+ * throughput: kbs (kilo byte / sec)
+ *
+ * There are lists of pm_qos_objects, each one wrapping requests and notifiers.
+ *
+ * User mode requests on a QoS parameter register themselves to the
+ * subsystem by opening the device node /dev/... and writing their request to
+ * the node.  As long as the process holds a file handle open to the node, the
+ * client continues to be accounted for.  Upon file release the usermode
+ * request is removed and a new qos target is computed.  This way the request
+ * that the application holds is cleaned up when it closes the file pointer or
+ * exits, and the pm_qos_object gets an opportunity to clean up.
+ *
+ * Mark Gross <mgross@linux.intel.com>
+ */
+
+/*#define DEBUG*/
+
+#include <linux/pm_qos.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/string.h>
+#include <linux/platform_device.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+#include <linux/uaccess.h>
+#include <linux/export.h>
+#include <trace/events/power.h>
+
+/*
+ * locking rule: all changes to constraints or notifiers lists
+ * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
+ * held, taken with _irqsave.
One lock to rule them all + */ +struct pm_qos_object { + struct pm_qos_constraints *constraints; + struct miscdevice pm_qos_power_miscdev; + char *name; +}; + +static DEFINE_SPINLOCK(pm_qos_lock); + +static struct pm_qos_object null_pm_qos; + +static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); +static struct pm_qos_constraints cpu_dma_constraints = { + .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), + .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .type = PM_QOS_MIN, + .notifiers = &cpu_dma_lat_notifier, +}; +static struct pm_qos_object cpu_dma_pm_qos = { + .constraints = &cpu_dma_constraints, + .name = "cpu_dma_latency", +}; + +static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); +static struct pm_qos_constraints network_lat_constraints = { + .list = PLIST_HEAD_INIT(network_lat_constraints.list), + .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .type = PM_QOS_MIN, + .notifiers = &network_lat_notifier, +}; +static struct pm_qos_object network_lat_pm_qos = { + .constraints = &network_lat_constraints, + .name = "network_latency", +}; + + +static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); +static struct pm_qos_constraints network_tput_constraints = { + .list = PLIST_HEAD_INIT(network_tput_constraints.list), + .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .type = PM_QOS_MAX, + .notifiers = &network_throughput_notifier, +}; +static struct pm_qos_object network_throughput_pm_qos = { + .constraints = &network_tput_constraints, + .name = "network_throughput", +}; + + +static struct pm_qos_object *pm_qos_array[] = { + &null_pm_qos, + &cpu_dma_pm_qos, + &network_lat_pm_qos, + &network_throughput_pm_qos +}; + +static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos); +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos); +static int pm_qos_power_open(struct inode *inode, struct file *filp); +static int pm_qos_power_release(struct inode *inode, struct file *filp); + +static const struct file_operations pm_qos_power_fops = { + .write = pm_qos_power_write, + .read = pm_qos_power_read, + .open = pm_qos_power_open, + .release = pm_qos_power_release, + .llseek = noop_llseek, +}; + +/* unlocked internal variant */ +static inline int pm_qos_get_value(struct pm_qos_constraints *c) +{ + if (plist_head_empty(&c->list)) + return c->no_constraint_value; + + switch (c->type) { + case PM_QOS_MIN: + return plist_first(&c->list)->prio; + + case PM_QOS_MAX: + return plist_last(&c->list)->prio; + + default: + /* runtime check for not using enum */ + BUG(); + return PM_QOS_DEFAULT_VALUE; + } +} + +s32 pm_qos_read_value(struct pm_qos_constraints *c) +{ + return c->target_value; +} + +static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) +{ + c->target_value = value; +} + +/** + * pm_qos_update_target - manages the constraints list and calls the notifiers + * if needed + * @c: constraints data struct + * @node: request to add to the list, to update or to remove + * @action: action to take on the constraints list + * @value: value of the request to add or update + * + * This function returns 1 if the aggregated constraint 
value has changed, 0 + * otherwise. + */ +int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, + enum pm_qos_req_action action, int value) +{ + unsigned long flags; + int prev_value, curr_value, new_value; + int ret; + + spin_lock_irqsave(&pm_qos_lock, flags); + prev_value = pm_qos_get_value(c); + if (value == PM_QOS_DEFAULT_VALUE) + new_value = c->default_value; + else + new_value = value; + + switch (action) { + case PM_QOS_REMOVE_REQ: + plist_del(node, &c->list); + break; + case PM_QOS_UPDATE_REQ: + /* + * to change the list, we atomically remove, reinit + * with new value and add, then see if the extremal + * changed + */ + plist_del(node, &c->list); + case PM_QOS_ADD_REQ: + plist_node_init(node, new_value); + plist_add(node, &c->list); + break; + default: + /* no action */ + ; + } + + curr_value = pm_qos_get_value(c); + pm_qos_set_value(c, curr_value); + + spin_unlock_irqrestore(&pm_qos_lock, flags); + + trace_pm_qos_update_target(action, prev_value, curr_value); + if (prev_value != curr_value) { + ret = 1; + if (c->notifiers) + blocking_notifier_call_chain(c->notifiers, + (unsigned long)curr_value, + NULL); + } else { + ret = 0; + } + return ret; +} + +/** + * pm_qos_flags_remove_req - Remove device PM QoS flags request. + * @pqf: Device PM QoS flags set to remove the request from. + * @req: Request to remove from the set. + */ +static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, + struct pm_qos_flags_request *req) +{ + s32 val = 0; + + list_del(&req->node); + list_for_each_entry(req, &pqf->list, node) + val |= req->flags; + + pqf->effective_flags = val; +} + +/** + * pm_qos_update_flags - Update a set of PM QoS flags. + * @pqf: Set of flags to update. + * @req: Request to add to the set, to modify, or to remove from the set. + * @action: Action to take on the set. + * @val: Value of the request to add or modify. + * + * Update the given set of PM QoS flags and call notifiers if the aggregate + * value has changed. Returns 1 if the aggregate constraint value has changed, + * 0 otherwise. + */ +bool pm_qos_update_flags(struct pm_qos_flags *pqf, + struct pm_qos_flags_request *req, + enum pm_qos_req_action action, s32 val) +{ + unsigned long irqflags; + s32 prev_value, curr_value; + + spin_lock_irqsave(&pm_qos_lock, irqflags); + + prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; + + switch (action) { + case PM_QOS_REMOVE_REQ: + pm_qos_flags_remove_req(pqf, req); + break; + case PM_QOS_UPDATE_REQ: + pm_qos_flags_remove_req(pqf, req); + case PM_QOS_ADD_REQ: + req->flags = val; + INIT_LIST_HEAD(&req->node); + list_add_tail(&req->node, &pqf->list); + pqf->effective_flags |= val; + break; + default: + /* no action */ + ; + } + + curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; + + spin_unlock_irqrestore(&pm_qos_lock, irqflags); + + trace_pm_qos_update_flags(action, prev_value, curr_value); + return prev_value != curr_value; +} + +/** + * pm_qos_request - returns current system wide qos expectation + * @pm_qos_class: identification of which qos value is requested + * + * This function returns the current target value. 
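+ *
+ * Minimal usage sketch (the request handle and the 100 usec bound are
+ * made-up examples, not values mandated by this interface):
+ *
+ *	static struct pm_qos_request my_req;
+ *
+ *	pm_qos_add_request(&my_req, PM_QOS_CPU_DMA_LATENCY, 100);
+ *	...
+ *	s32 target = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);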
+ */
+int pm_qos_request(int pm_qos_class)
+{
+	return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints);
+}
+EXPORT_SYMBOL_GPL(pm_qos_request);
+
+int pm_qos_request_active(struct pm_qos_request *req)
+{
+	return req->pm_qos_class != 0;
+}
+EXPORT_SYMBOL_GPL(pm_qos_request_active);
+
+static void __pm_qos_update_request(struct pm_qos_request *req,
+			   s32 new_value)
+{
+	trace_pm_qos_update_request(req->pm_qos_class, new_value);
+
+	if (new_value != req->node.prio)
+		pm_qos_update_target(
+			pm_qos_array[req->pm_qos_class]->constraints,
+			&req->node, PM_QOS_UPDATE_REQ, new_value);
+}
+
+/**
+ * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
+ * @work: work struct for the delayed work (timeout)
+ *
+ * This cancels the timeout request by falling back to the default at timeout.
+ */
+static void pm_qos_work_fn(struct work_struct *work)
+{
+	struct pm_qos_request *req = container_of(to_delayed_work(work),
+						  struct pm_qos_request,
+						  work);
+
+	__pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
+}
+
+/**
+ * pm_qos_add_request - inserts new qos request into the list
+ * @req: pointer to a preallocated handle
+ * @pm_qos_class: identifies which list of qos requests to use
+ * @value: defines the qos request
+ *
+ * This function inserts a new entry in the pm_qos_class list of requested qos
+ * performance characteristics. It recomputes the aggregate QoS expectations
+ * for the pm_qos_class of parameters and initializes the pm_qos_request
+ * handle. The caller needs to save this handle for later use in updates and
+ * removal.
+ */
+
+void pm_qos_add_request(struct pm_qos_request *req,
+			int pm_qos_class, s32 value)
+{
+	if (!req) /* guard against callers passing in null */
+		return;
+
+	if (pm_qos_request_active(req)) {
+		WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
+		return;
+	}
+	req->pm_qos_class = pm_qos_class;
+	INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
+	trace_pm_qos_add_request(pm_qos_class, value);
+	pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
+			     &req->node, PM_QOS_ADD_REQ, value);
+}
+EXPORT_SYMBOL_GPL(pm_qos_add_request);
+
+/**
+ * pm_qos_update_request - modifies an existing qos request
+ * @req: handle to list element holding a pm_qos request to use
+ * @value: defines the qos request
+ *
+ * Updates an existing qos request for the pm_qos_class of parameters along
+ * with updating the target pm_qos_class value.
+ *
+ * Attempts are made to make this code callable on hot code paths.
+ */
+void pm_qos_update_request(struct pm_qos_request *req,
+			   s32 new_value)
+{
+	if (!req) /* guard against callers passing in null */
+		return;
+
+	if (!pm_qos_request_active(req)) {
+		WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
+		return;
+	}
+
+	cancel_delayed_work_sync(&req->work);
+	__pm_qos_update_request(req, new_value);
+}
+EXPORT_SYMBOL_GPL(pm_qos_update_request);
+
+/**
+ * pm_qos_update_request_timeout - modifies an existing qos request temporarily.
+ * @req: handle to list element holding a pm_qos request to use
+ * @new_value: defines the temporary qos request
+ * @timeout_us: the effective duration of this qos request in usecs.
+ *
+ * After timeout_us, this qos request is cancelled automatically.
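+ *
+ * Hypothetical example: bound CPU DMA latency to 50 usec for at most
+ * 100 msec around a burst of device activity, letting the request fall
+ * back to PM_QOS_DEFAULT_VALUE automatically afterwards:
+ *
+ *	pm_qos_update_request_timeout(&my_req, 50, 100 * USEC_PER_MSEC);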
+ */
+void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
+				   unsigned long timeout_us)
+{
+	if (!req)
+		return;
+	if (WARN(!pm_qos_request_active(req),
+		 "%s called for unknown object.", __func__))
+		return;
+
+	cancel_delayed_work_sync(&req->work);
+
+	trace_pm_qos_update_request_timeout(req->pm_qos_class,
+					    new_value, timeout_us);
+	if (new_value != req->node.prio)
+		pm_qos_update_target(
+			pm_qos_array[req->pm_qos_class]->constraints,
+			&req->node, PM_QOS_UPDATE_REQ, new_value);
+
+	schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
+}
+
+/**
+ * pm_qos_remove_request - removes an existing qos request
+ * @req: handle to request list element
+ *
+ * Will remove pm qos request from the list of constraints and
+ * recompute the current target value for the pm_qos_class. Call this
+ * on slow code paths.
+ */
+void pm_qos_remove_request(struct pm_qos_request *req)
+{
+	if (!req) /* guard against callers passing in null */
+		return;
+	/* silent return to keep pcm code cleaner */
+
+	if (!pm_qos_request_active(req)) {
+		WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
+		return;
+	}
+
+	cancel_delayed_work_sync(&req->work);
+
+	trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);
+	pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
+			     &req->node, PM_QOS_REMOVE_REQ,
+			     PM_QOS_DEFAULT_VALUE);
+	memset(req, 0, sizeof(*req));
+}
+EXPORT_SYMBOL_GPL(pm_qos_remove_request);
+
+/**
+ * pm_qos_add_notifier - sets notification entry for changes to target value
+ * @pm_qos_class: identifies which qos target changes should be notified.
+ * @notifier: notifier block managed by caller.
+ *
+ * Will register the notifier into a notification chain that gets called
+ * upon changes to the pm_qos_class target value.
+ */
+int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
+{
+	int retval;
+
+	retval = blocking_notifier_chain_register(
+			pm_qos_array[pm_qos_class]->constraints->notifiers,
+			notifier);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
+
+/**
+ * pm_qos_remove_notifier - deletes notification entry from chain.
+ * @pm_qos_class: identifies which qos target changes are notified.
+ * @notifier: notifier block to be removed.
+ *
+ * Will remove the notifier from the notification chain that gets called
+ * upon changes to the pm_qos_class target value.
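+ *
+ * Sketch of a matched register/unregister pair (callback and block
+ * names are hypothetical):
+ *
+ *	static int my_lat_notify(struct notifier_block *nb,
+ *				 unsigned long value, void *data)
+ *	{
+ *		return NOTIFY_OK;
+ *	}
+ *	static struct notifier_block my_nb = { .notifier_call = my_lat_notify };
+ *
+ *	pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, &my_nb);
+ *	...
+ *	pm_qos_remove_notifier(PM_QOS_CPU_DMA_LATENCY, &my_nb);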
+ */ +int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) +{ + int retval; + + retval = blocking_notifier_chain_unregister( + pm_qos_array[pm_qos_class]->constraints->notifiers, + notifier); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); + +/* User space interface to PM QoS classes via misc devices */ +static int register_pm_qos_misc(struct pm_qos_object *qos) +{ + qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; + qos->pm_qos_power_miscdev.name = qos->name; + qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; + + return misc_register(&qos->pm_qos_power_miscdev); +} + +static int find_pm_qos_object_by_minor(int minor) +{ + int pm_qos_class; + + for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY; + pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { + if (minor == + pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) + return pm_qos_class; + } + return -1; +} + +static int pm_qos_power_open(struct inode *inode, struct file *filp) +{ + long pm_qos_class; + + pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); + if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) { + struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; + + return 0; + } + return -EPERM; +} + +static int pm_qos_power_release(struct inode *inode, struct file *filp) +{ + struct pm_qos_request *req; + + req = filp->private_data; + pm_qos_remove_request(req); + kfree(req); + + return 0; +} + + +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value; + unsigned long flags; + struct pm_qos_request *req = filp->private_data; + + if (!req) + return -EINVAL; + if (!pm_qos_request_active(req)) + return -EINVAL; + + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + +static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value; + struct pm_qos_request *req; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else { + int ret; + + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; + } + + req = filp->private_data; + pm_qos_update_request(req, value); + + return count; +} + + +static int __init pm_qos_power_init(void) +{ + int ret = 0; + int i; + + BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); + + for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { + ret = register_pm_qos_misc(pm_qos_array[i]); + if (ret < 0) { + printk(KERN_ERR "pm_qos_param: %s setup failed\n", + pm_qos_array[i]->name); + return ret; + } + } + + return ret; +} + +late_initcall(pm_qos_power_init); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1b84313cbab..1ea328aafdc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1,30 +1,33 @@ /* * linux/kernel/power/snapshot.c * - * This file provide system snapshot/restore functionality. + * This file provides system snapshot/restore functionality for swsusp. * - * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * - * This file is released under the GPLv2, and is based on swsusp.c. 
+ * This file is released under the GPLv2. * */ - #include <linux/version.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/suspend.h> -#include <linux/smp_lock.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/kernel.h> #include <linux/pm.h> #include <linux/device.h> +#include <linux/init.h> #include <linux/bootmem.h> #include <linux/syscalls.h> #include <linux/console.h> #include <linux/highmem.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/compiler.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -34,137 +37,53 @@ #include "power.h" -/* List of PBEs used for creating and restoring the suspend image */ -struct pbe *restore_pblist; +static int swsusp_page_is_free(struct page *); +static void swsusp_set_page_forbidden(struct page *); +static void swsusp_unset_page_forbidden(struct page *); -static unsigned int nr_copy_pages; -static unsigned int nr_meta_pages; -static void *buffer; +/* + * Number of bytes to reserve for memory allocations made by device drivers + * from their ->freeze() and ->freeze_noirq() callbacks so that they don't + * cause image creation to fail (tunable via /sys/power/reserved_size). + */ +unsigned long reserved_size; -#ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void) +void __init hibernate_reserved_size_init(void) { - struct zone *zone; - unsigned long zone_pfn; - unsigned int n = 0; - - for_each_zone (zone) - if (is_highmem(zone)) { - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) { - struct page *page; - unsigned long pfn = zone_pfn + zone->zone_start_pfn; - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - if (PageReserved(page)) - continue; - if (PageNosaveFree(page)) - continue; - n++; - } - } - return n; + reserved_size = SPARE_PAGES * PAGE_SIZE; } -struct highmem_page { - char *data; - struct page *page; - struct highmem_page *next; -}; - -static struct highmem_page *highmem_copy; +/* + * Preferred image size in bytes (tunable via /sys/power/image_size). + * When it is set to N, swsusp will do its best to ensure the image + * size will not exceed N bytes, but if that is impossible, it will + * try to create the smallest image possible. + */ +unsigned long image_size; -static int save_highmem_zone(struct zone *zone) +void __init hibernate_image_size_init(void) { - unsigned long zone_pfn; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - struct page *page; - struct highmem_page *save; - void *kaddr; - unsigned long pfn = zone_pfn + zone->zone_start_pfn; - - if (!(pfn%10000)) - printk("."); - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - /* - * This condition results from rvmalloc() sans vmalloc_32() - * and architectural memory reservations. This should be - * corrected eventually when the cases giving rise to this - * are better understood. 
- */ - if (PageReserved(page)) - continue; - BUG_ON(PageNosave(page)); - if (PageNosaveFree(page)) - continue; - save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); - if (!save) - return -ENOMEM; - save->next = highmem_copy; - save->page = page; - save->data = (void *) get_zeroed_page(GFP_ATOMIC); - if (!save->data) { - kfree(save); - return -ENOMEM; - } - kaddr = kmap_atomic(page, KM_USER0); - memcpy(save->data, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - highmem_copy = save; - } - return 0; + image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; } -int save_highmem(void) -{ - struct zone *zone; - int res = 0; - - pr_debug("swsusp: Saving Highmem"); - drain_local_pages(); - for_each_zone (zone) { - if (is_highmem(zone)) - res = save_highmem_zone(zone); - if (res) - return res; - } - printk("\n"); - return 0; -} +/* List of PBEs needed for restoring the pages that were allocated before + * the suspend and included in the suspend image, but have also been + * allocated by the "resume" kernel, so their contents cannot be written + * directly to their "original" page frames. + */ +struct pbe *restore_pblist; -int restore_highmem(void) -{ - printk("swsusp: Restoring Highmem\n"); - while (highmem_copy) { - struct highmem_page *save = highmem_copy; - void *kaddr; - highmem_copy = save->next; - - kaddr = kmap_atomic(save->page, KM_USER0); - memcpy(kaddr, save->data, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - free_page((long) save->data); - kfree(save); - } - return 0; -} -#else -static inline unsigned int count_highmem_pages(void) {return 0;} -static inline int save_highmem(void) {return 0;} -static inline int restore_highmem(void) {return 0;} -#endif +/* Pointer to an auxiliary buffer (1 page) */ +static void *buffer; /** * @safe_needed - on resume, for storing the PBE list and the image, * we can only use memory pages that do not conflict with the pages - * used before suspend. + * used before suspend. The unsafe pages have PageNosaveFree set + * and we count them using unsafe_pages. * - * The unsafe pages are marked with the PG_nosave_free flag - * and we count them using unsafe_pages + * Each allocated image page is marked as PageNosave and PageNosaveFree + * so that swsusp_free() can release it. 
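+ *
+ * E.g. memory needed while loading the image goes through the "safe"
+ * path (a sketch; GFP_ATOMIC is just the typical restore-time mask):
+ *
+ *	void *p = (void *)get_safe_page(GFP_ATOMIC);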
*/ #define PG_ANY 0 @@ -174,41 +93,60 @@ static inline int restore_highmem(void) {return 0;} static unsigned int allocated_unsafe_pages; -static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) +static void *get_image_page(gfp_t gfp_mask, int safe_needed) { void *res; res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) - while (res && PageNosaveFree(virt_to_page(res))) { + while (res && swsusp_page_is_free(virt_to_page(res))) { /* The page is unsafe, mark it for swsusp_free() */ - SetPageNosave(virt_to_page(res)); + swsusp_set_page_forbidden(virt_to_page(res)); allocated_unsafe_pages++; res = (void *)get_zeroed_page(gfp_mask); } if (res) { - SetPageNosave(virt_to_page(res)); - SetPageNosaveFree(virt_to_page(res)); + swsusp_set_page_forbidden(virt_to_page(res)); + swsusp_set_page_free(virt_to_page(res)); } return res; } unsigned long get_safe_page(gfp_t gfp_mask) { - return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); + return (unsigned long)get_image_page(gfp_mask, PG_SAFE); +} + +static struct page *alloc_image_page(gfp_t gfp_mask) +{ + struct page *page; + + page = alloc_page(gfp_mask); + if (page) { + swsusp_set_page_forbidden(page); + swsusp_set_page_free(page); + } + return page; } /** * free_image_page - free page represented by @addr, allocated with - * alloc_image_page (page flags set by it must be cleared) + * get_image_page (page flags set by it must be cleared) */ static inline void free_image_page(void *addr, int clear_nosave_free) { - ClearPageNosave(virt_to_page(addr)); + struct page *page; + + BUG_ON(!virt_addr_valid(addr)); + + page = virt_to_page(addr); + + swsusp_unset_page_forbidden(page); if (clear_nosave_free) - ClearPageNosaveFree(virt_to_page(addr)); - free_page((unsigned long)addr); + swsusp_unset_page_free(page); + + __free_page(page); } /* struct linked_page is used to build chains of pages */ @@ -218,7 +156,7 @@ static inline void free_image_page(void *addr, int clear_nosave_free) struct linked_page { struct linked_page *next; char data[LINKED_PAGE_DATA_SIZE]; -} __attribute__((packed)); +} __packed; static inline void free_list_of_pages(struct linked_page *list, int clear_page_nosave) @@ -269,7 +207,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { struct linked_page *lp; - lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); + lp = get_image_page(ca->gfp_mask, ca->safe_needed); if (!lp) return NULL; @@ -282,12 +220,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -static void chain_free(struct chain_allocator *ca, int clear_page_nosave) -{ - free_list_of_pages(ca->chain, clear_page_nosave); - memset(ca, 0, sizeof(struct chain_allocator)); -} - /** * Data types related to memory bitmaps. * @@ -295,8 +227,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * objects. The main list's elements are of type struct zone_bitmap * and each of them corresonds to one zone. For each zone bitmap * object there is a list of objects of type struct bm_block that - * represent each blocks of bit chunks in which information is - * stored. + * represent each blocks of bitmap in which information is stored. * * struct memory_bitmap contains a pointer to the main list of zone * bitmap objects, a struct bm_position used for browsing the bitmap, @@ -314,50 +245,36 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * pfns that correspond to the start and end of the represented zone. 
* * struct bm_block contains a pointer to the memory page in which - * information is stored (in the form of a block of bit chunks - * of type unsigned long each). It also contains the pfns that - * correspond to the start and end of the represented memory area and - * the number of bit chunks in the block. - * - * NOTE: Memory bitmaps are used for two types of operations only: - * "set a bit" and "find the next bit set". Moreover, the searching - * is always carried out after all of the "set a bit" operations - * on given bitmap. + * information is stored (in the form of a block of bitmap) + * It also contains the pfns that correspond to the start and end of + * the represented memory area. */ #define BM_END_OF_MAP (~0UL) -#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long)) -#define BM_BITS_PER_CHUNK (sizeof(long) << 3) -#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) +#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) struct bm_block { - struct bm_block *next; /* next element of the list */ + struct list_head hook; /* hook into a list of bitmap blocks */ unsigned long start_pfn; /* pfn represented by the first bit */ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ - unsigned int size; /* number of bit chunks */ - unsigned long *data; /* chunks of bits representing pages */ + unsigned long *data; /* bitmap representing pages */ }; -struct zone_bitmap { - struct zone_bitmap *next; /* next element of the list */ - unsigned long start_pfn; /* minimal pfn in this zone */ - unsigned long end_pfn; /* maximal pfn in this zone plus 1 */ - struct bm_block *bm_blocks; /* list of bitmap blocks */ - struct bm_block *cur_block; /* recently used bitmap block */ -}; +static inline unsigned long bm_block_bits(struct bm_block *bb) +{ + return bb->end_pfn - bb->start_pfn; +} /* strcut bm_position is used for browsing memory bitmaps */ struct bm_position { - struct zone_bitmap *zone_bm; struct bm_block *block; - int chunk; int bit; }; struct memory_bitmap { - struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ + struct list_head blocks; /* list of bitmap blocks */ struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects @@ -367,245 +284,281 @@ struct memory_bitmap { /* Functions that operate on memory bitmaps */ -static inline void memory_bm_reset_chunk(struct memory_bitmap *bm) -{ - bm->cur.chunk = 0; - bm->cur.bit = -1; -} - static void memory_bm_position_reset(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; - - zone_bm = bm->zone_bm_list; - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; - memory_bm_reset_chunk(bm); + bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); + bm->cur.bit = 0; } static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); /** * create_bm_block_list - create a list of block bitmap objects + * @pages - number of pages to track + * @list - list to put the allocated blocks into + * @ca - chain allocator to be used for allocating memory */ - -static inline struct bm_block * -create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) +static int create_bm_block_list(unsigned long pages, + struct list_head *list, + struct chain_allocator *ca) { - struct bm_block *bblist = NULL; + unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); while (nr_blocks-- > 0) { struct bm_block *bb; bb = chain_alloc(ca, sizeof(struct bm_block)); if (!bb) - return NULL; - - bb->next = bblist; - bblist = bb; + return -ENOMEM; + 
list_add(&bb->hook, list); } - return bblist; + + return 0; } +struct mem_extent { + struct list_head hook; + unsigned long start; + unsigned long end; +}; + /** - * create_zone_bm_list - create a list of zone bitmap objects + * free_mem_extents - free a list of memory extents + * @list - list of extents to empty */ +static void free_mem_extents(struct list_head *list) +{ + struct mem_extent *ext, *aux; -static inline struct zone_bitmap * -create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) + list_for_each_entry_safe(ext, aux, list, hook) { + list_del(&ext->hook); + kfree(ext); + } +} + +/** + * create_mem_extents - create a list of memory extents representing + * contiguous ranges of PFNs + * @list - list to put the extents into + * @gfp_mask - mask to use for memory allocations + */ +static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) { - struct zone_bitmap *zbmlist = NULL; + struct zone *zone; - while (nr_zones-- > 0) { - struct zone_bitmap *zbm; + INIT_LIST_HEAD(list); - zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); - if (!zbm) - return NULL; + for_each_populated_zone(zone) { + unsigned long zone_start, zone_end; + struct mem_extent *ext, *cur, *aux; + + zone_start = zone->zone_start_pfn; + zone_end = zone_end_pfn(zone); - zbm->next = zbmlist; - zbmlist = zbm; + list_for_each_entry(ext, list, hook) + if (zone_start <= ext->end) + break; + + if (&ext->hook == list || zone_end < ext->start) { + /* New extent is necessary */ + struct mem_extent *new_ext; + + new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask); + if (!new_ext) { + free_mem_extents(list); + return -ENOMEM; + } + new_ext->start = zone_start; + new_ext->end = zone_end; + list_add_tail(&new_ext->hook, &ext->hook); + continue; + } + + /* Merge this zone's range of PFNs with the existing one */ + if (zone_start < ext->start) + ext->start = zone_start; + if (zone_end > ext->end) + ext->end = zone_end; + + /* More merging may be possible */ + cur = ext; + list_for_each_entry_safe_continue(cur, aux, list, hook) { + if (zone_end < cur->start) + break; + if (zone_end < cur->end) + ext->end = cur->end; + list_del(&cur->hook); + kfree(cur); + } } - return zbmlist; + + return 0; } /** * memory_bm_create - allocate memory for a memory bitmap */ - static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) { struct chain_allocator ca; - struct zone *zone; - struct zone_bitmap *zone_bm; - struct bm_block *bb; - unsigned int nr; + struct list_head mem_extents; + struct mem_extent *ext; + int error; chain_init(&ca, gfp_mask, safe_needed); + INIT_LIST_HEAD(&bm->blocks); - /* Compute the number of zones */ - nr = 0; - for_each_zone (zone) - if (populated_zone(zone) && !is_highmem(zone)) - nr++; - - /* Allocate the list of zones bitmap objects */ - zone_bm = create_zone_bm_list(nr, &ca); - bm->zone_bm_list = zone_bm; - if (!zone_bm) { - chain_free(&ca, PG_UNSAFE_CLEAR); - return -ENOMEM; - } - - /* Initialize the zone bitmap objects */ - for_each_zone (zone) { - unsigned long pfn; + error = create_mem_extents(&mem_extents, gfp_mask); + if (error) + return error; - if (!populated_zone(zone) || is_highmem(zone)) - continue; + list_for_each_entry(ext, &mem_extents, hook) { + struct bm_block *bb; + unsigned long pfn = ext->start; + unsigned long pages = ext->end - ext->start; - zone_bm->start_pfn = zone->zone_start_pfn; - zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages; - /* Allocate the list of bitmap block objects */ - nr = DIV_ROUND_UP(zone->spanned_pages, 
BM_BITS_PER_BLOCK); - bb = create_bm_block_list(nr, &ca); - zone_bm->bm_blocks = bb; - zone_bm->cur_block = bb; - if (!bb) - goto Free; + bb = list_entry(bm->blocks.prev, struct bm_block, hook); - nr = zone->spanned_pages; - pfn = zone->zone_start_pfn; - /* Initialize the bitmap block objects */ - while (bb) { - unsigned long *ptr; + error = create_bm_block_list(pages, bm->blocks.prev, &ca); + if (error) + goto Error; - ptr = alloc_image_page(gfp_mask, safe_needed); - bb->data = ptr; - if (!ptr) - goto Free; + list_for_each_entry_continue(bb, &bm->blocks, hook) { + bb->data = get_image_page(gfp_mask, safe_needed); + if (!bb->data) { + error = -ENOMEM; + goto Error; + } bb->start_pfn = pfn; - if (nr >= BM_BITS_PER_BLOCK) { + if (pages >= BM_BITS_PER_BLOCK) { pfn += BM_BITS_PER_BLOCK; - bb->size = BM_CHUNKS_PER_BLOCK; - nr -= BM_BITS_PER_BLOCK; + pages -= BM_BITS_PER_BLOCK; } else { /* This is executed only once in the loop */ - pfn += nr; - bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK); + pfn += pages; } bb->end_pfn = pfn; - bb = bb->next; } - zone_bm = zone_bm->next; } + bm->p_list = ca.chain; memory_bm_position_reset(bm); - return 0; + Exit: + free_mem_extents(&mem_extents); + return error; -Free: + Error: bm->p_list = ca.chain; memory_bm_free(bm, PG_UNSAFE_CLEAR); - return -ENOMEM; + goto Exit; } /** * memory_bm_free - free memory occupied by the memory bitmap @bm */ - static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { - struct zone_bitmap *zone_bm; + struct bm_block *bb; - /* Free the list of bit blocks for each zone_bitmap object */ - zone_bm = bm->zone_bm_list; - while (zone_bm) { - struct bm_block *bb; + list_for_each_entry(bb, &bm->blocks, hook) + if (bb->data) + free_image_page(bb->data, clear_nosave_free); - bb = zone_bm->bm_blocks; - while (bb) { - if (bb->data) - free_image_page(bb->data, clear_nosave_free); - bb = bb->next; - } - zone_bm = zone_bm->next; - } free_list_of_pages(bm->p_list, clear_nosave_free); - bm->zone_bm_list = NULL; + + INIT_LIST_HEAD(&bm->blocks); } /** - * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds + * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds * to given pfn. The cur_zone_bm member of @bm and the cur_block member * of @bm->cur_zone_bm are updated. - * - * If the bit cannot be set, the function returns -EINVAL . */ - -static int -memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) +static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + void **addr, unsigned int *bit_nr) { - struct zone_bitmap *zone_bm; struct bm_block *bb; - /* Check if the pfn is from the current zone */ - zone_bm = bm->cur.zone_bm; - if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = bm->zone_bm_list; - /* We don't assume that the zones are sorted by pfns */ - while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = zone_bm->next; - if (unlikely(!zone_bm)) - return -EINVAL; - } - bm->cur.zone_bm = zone_bm; - } - /* Check if the pfn corresponds to the current bitmap block */ - bb = zone_bm->cur_block; + /* + * Check if the pfn corresponds to the current bitmap block and find + * the block where it fits if this is not the case. 
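+	 *
+	 * Worked example (assuming 4 KiB pages, i.e. 32768 bits per
+	 * block): with blocks covering pfns [0..32767] and [32768..65535]
+	 * and cur.block pointing at the first one, looking up pfn 40000
+	 * walks forward one block and resolves to bit 40000 - 32768 = 7232
+	 * of that block's data page.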
+ */ + bb = bm->cur.block; if (pfn < bb->start_pfn) - bb = zone_bm->bm_blocks; + list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn) + break; - while (pfn >= bb->end_pfn) { - bb = bb->next; - if (unlikely(!bb)) - return -EINVAL; - } - zone_bm->cur_block = bb; + if (pfn >= bb->end_pfn) + list_for_each_entry_continue(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn && pfn < bb->end_pfn) + break; + + if (&bb->hook == &bm->blocks) + return -EFAULT; + + /* The block has been found */ + bm->cur.block = bb; pfn -= bb->start_pfn; - set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK); + bm->cur.bit = pfn + 1; + *bit_nr = pfn; + *addr = bb->data; return 0; } -/* Two auxiliary functions for memory_bm_next_pfn */ +static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; -/* Find the first set bit in the given chunk, if there is one */ + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); + set_bit(bit, addr); +} -static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p) +static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) { - bit++; - while (bit < BM_BITS_PER_CHUNK) { - if (test_bit(bit, chunk_p)) - return bit; + void *addr; + unsigned int bit; + int error; - bit++; - } - return -1; + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + if (!error) + set_bit(bit, addr); + return error; } -/* Find a chunk containing some bits set in given block of bits */ +static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); + clear_bit(bit, addr); +} -static inline int next_chunk_in_block(int n, struct bm_block *bb) +static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { - n++; - while (n < bb->size) { - if (bb->data[n]) - return n; + void *addr; + unsigned int bit; + int error; - n++; - } - return -1; + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); + return test_bit(bit, addr); +} + +static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + return !memory_bm_find_bit(bm, pfn, &addr, &bit); } /** @@ -619,42 +572,240 @@ static inline int next_chunk_in_block(int n, struct bm_block *bb) static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; struct bm_block *bb; - int chunk; int bit; + bb = bm->cur.block; do { - bb = bm->cur.block; - do { - chunk = bm->cur.chunk; - bit = bm->cur.bit; - do { - bit = next_bit_in_chunk(bit, bb->data + chunk); - if (bit >= 0) - goto Return_pfn; - - chunk = next_chunk_in_block(chunk, bb); - bit = -1; - } while (chunk >= 0); - bb = bb->next; - bm->cur.block = bb; - memory_bm_reset_chunk(bm); - } while (bb); - zone_bm = bm->cur.zone_bm->next; - if (zone_bm) { - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; - memory_bm_reset_chunk(bm); - } - } while (zone_bm); + bit = bm->cur.bit; + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + + bb = list_entry(bb->hook.next, struct bm_block, hook); + bm->cur.block = bb; + bm->cur.bit = 0; + } while (&bb->hook != &bm->blocks); + memory_bm_position_reset(bm); return BM_END_OF_MAP; -Return_pfn: - bm->cur.chunk = chunk; - bm->cur.bit = bit; - return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; + Return_pfn: + bm->cur.bit = 
bit + 1;
+	return bb->start_pfn + bit;
+}
+
+/**
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during the suspend.
+ */
+
+struct nosave_region {
+	struct list_head list;
+	unsigned long start_pfn;
+	unsigned long end_pfn;
+};
+
+static LIST_HEAD(nosave_regions);
+
+/**
+ * register_nosave_region - register a range of page frames the contents
+ * of which should not be saved during the suspend (to be used in the early
+ * initialization code)
+ */
+
+void __init
+__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
+			 int use_kmalloc)
+{
+	struct nosave_region *region;
+
+	if (start_pfn >= end_pfn)
+		return;
+
+	if (!list_empty(&nosave_regions)) {
+		/* Try to extend the previous region (they should be sorted) */
+		region = list_entry(nosave_regions.prev,
+					struct nosave_region, list);
+		if (region->end_pfn == start_pfn) {
+			region->end_pfn = end_pfn;
+			goto Report;
+		}
+	}
+	if (use_kmalloc) {
+		/* during init, this shouldn't fail */
+		region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
+		BUG_ON(!region);
+	} else
+		/* This allocation cannot fail */
+		region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
+	region->start_pfn = start_pfn;
+	region->end_pfn = end_pfn;
+	list_add_tail(&region->list, &nosave_regions);
+ Report:
+	printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
+		(unsigned long long) start_pfn << PAGE_SHIFT,
+		((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
+}
+
+/*
+ * Set bits in this map correspond to the page frames the contents of which
+ * should not be saved during the suspend.
+ */
+static struct memory_bitmap *forbidden_pages_map;
+
+/* Set bits in this map correspond to free page frames. */
+static struct memory_bitmap *free_pages_map;
+
+/*
+ * Each page frame allocated for creating the image is marked by setting the
+ * corresponding bits in forbidden_pages_map and free_pages_map simultaneously
+ */
+
+void swsusp_set_page_free(struct page *page)
+{
+	if (free_pages_map)
+		memory_bm_set_bit(free_pages_map, page_to_pfn(page));
+}
+
+static int swsusp_page_is_free(struct page *page)
+{
+	return free_pages_map ?
+		memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0;
+}
+
+void swsusp_unset_page_free(struct page *page)
+{
+	if (free_pages_map)
+		memory_bm_clear_bit(free_pages_map, page_to_pfn(page));
+}
+
+static void swsusp_set_page_forbidden(struct page *page)
+{
+	if (forbidden_pages_map)
+		memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page));
+}
+
+int swsusp_page_is_forbidden(struct page *page)
+{
+	return forbidden_pages_map ?
+		memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0;
+}
+
+static void swsusp_unset_page_forbidden(struct page *page)
+{
+	if (forbidden_pages_map)
+		memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page));
+}
+
+/**
+ * mark_nosave_pages - set bits corresponding to the page frames the
+ * contents of which should not be saved in a given bitmap.
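+ *
+ * The regions consumed here are the ones announced earlier, e.g. by
+ * early arch setup code calling register_nosave_region() (the pfn
+ * range below is purely illustrative):
+ *
+ *	register_nosave_region(0x000a0, 0x00100);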
+ */ + +static void mark_nosave_pages(struct memory_bitmap *bm) +{ + struct nosave_region *region; + + if (list_empty(&nosave_regions)) + return; + + list_for_each_entry(region, &nosave_regions, list) { + unsigned long pfn; + + pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", + (unsigned long long) region->start_pfn << PAGE_SHIFT, + ((unsigned long long) region->end_pfn << PAGE_SHIFT) + - 1); + + for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) + if (pfn_valid(pfn)) { + /* + * It is safe to ignore the result of + * mem_bm_set_bit_check() here, since we won't + * touch the PFNs for which the error is + * returned anyway. + */ + mem_bm_set_bit_check(bm, pfn); + } + } +} + +/** + * create_basic_memory_bitmaps - create bitmaps needed for marking page + * frames that should not be saved and free page frames. The pointers + * forbidden_pages_map and free_pages_map are only modified if everything + * goes well, because we don't want the bits to be used before both bitmaps + * are set up. + */ + +int create_basic_memory_bitmaps(void) +{ + struct memory_bitmap *bm1, *bm2; + int error = 0; + + if (forbidden_pages_map && free_pages_map) + return 0; + else + BUG_ON(forbidden_pages_map || free_pages_map); + + bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); + if (!bm1) + return -ENOMEM; + + error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY); + if (error) + goto Free_first_object; + + bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); + if (!bm2) + goto Free_first_bitmap; + + error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY); + if (error) + goto Free_second_object; + + forbidden_pages_map = bm1; + free_pages_map = bm2; + mark_nosave_pages(forbidden_pages_map); + + pr_debug("PM: Basic memory bitmaps created\n"); + + return 0; + + Free_second_object: + kfree(bm2); + Free_first_bitmap: + memory_bm_free(bm1, PG_UNSAFE_CLEAR); + Free_first_object: + kfree(bm1); + return -ENOMEM; +} + +/** + * free_basic_memory_bitmaps - free memory bitmaps allocated by + * create_basic_memory_bitmaps(). The auxiliary pointers are necessary + * so that the bitmaps themselves are not referred to while they are being + * freed. + */ + +void free_basic_memory_bitmaps(void) +{ + struct memory_bitmap *bm1, *bm2; + + if (WARN_ON(!(forbidden_pages_map && free_pages_map))) + return; + + bm1 = forbidden_pages_map; + bm2 = free_pages_map; + forbidden_pages_map = NULL; + free_pages_map = NULL; + memory_bm_free(bm1, PG_UNSAFE_CLEAR); + kfree(bm1); + memory_bm_free(bm2, PG_UNSAFE_CLEAR); + kfree(bm2); + + pr_debug("PM: Basic memory bitmaps freed\n"); } /** @@ -668,31 +819,99 @@ unsigned int snapshot_additional_pages(struct zone *zone) unsigned int res; res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); - return res; + res += DIV_ROUND_UP(res * sizeof(struct bm_block), + LINKED_PAGE_DATA_SIZE); + return 2 * res; } +#ifdef CONFIG_HIGHMEM /** - * pfn_is_nosave - check if given pfn is in the 'nosave' section + * count_free_highmem_pages - compute the total number of free highmem + * pages, system-wide. 
*/ -static inline int pfn_is_nosave(unsigned long pfn) +static unsigned int count_free_highmem_pages(void) { - unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); + struct zone *zone; + unsigned int cnt = 0; + + for_each_populated_zone(zone) + if (is_highmem(zone)) + cnt += zone_page_state(zone, NR_FREE_PAGES); + + return cnt; } /** - * saveable - Determine whether a page should be cloned or not. - * @pfn: The page + * saveable_highmem_page - Determine whether a highmem page should be + * included in the suspend image. * - * We save a page if it isn't Nosave, and is not in the range of pages - * statically defined as 'unsaveable', and it - * isn't a part of a free chunk of pages. + * We should save the page if it isn't Nosave or NosaveFree, or Reserved, + * and it isn't a part of a free chunk of pages. */ +static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) +{ + struct page *page; -static struct page *saveable_page(unsigned long pfn) + if (!pfn_valid(pfn)) + return NULL; + + page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; + + BUG_ON(!PageHighMem(page)); + + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || + PageReserved(page)) + return NULL; + + if (page_is_guard(page)) + return NULL; + + return page; +} + +/** + * count_highmem_pages - compute the total number of saveable highmem + * pages. + */ + +static unsigned int count_highmem_pages(void) +{ + struct zone *zone; + unsigned int n = 0; + + for_each_populated_zone(zone) { + unsigned long pfn, max_zone_pfn; + + if (!is_highmem(zone)) + continue; + + mark_free_pages(zone); + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (saveable_highmem_page(zone, pfn)) + n++; + } + return n; +} +#else +static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} +#endif /* CONFIG_HIGHMEM */ + +/** + * saveable_page - Determine whether a non-highmem page should be included + * in the suspend image. + * + * We should save the page if it isn't Nosave, and is not in the range + * of pages statically defined as 'unsaveable', and it isn't a part of + * a free chunk of pages. + */ +static struct page *saveable_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -700,77 +919,172 @@ static struct page *saveable_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; + + BUG_ON(PageHighMem(page)); - if (PageNosave(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; - if (PageReserved(page) && pfn_is_nosave(pfn)) + + if (PageReserved(page) + && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; - if (PageNosaveFree(page)) + + if (page_is_guard(page)) return NULL; return page; } -unsigned int count_data_pages(void) +/** + * count_data_pages - compute the total number of saveable non-highmem + * pages. 
+ */ + +static unsigned int count_data_pages(void) { struct zone *zone; unsigned long pfn, max_zone_pfn; unsigned int n = 0; - for_each_zone (zone) { + for_each_populated_zone(zone) { if (is_highmem(zone)) continue; + mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - n += !!saveable_page(pfn); + if (saveable_page(zone, pfn)) + n++; } return n; } -static inline void copy_data_page(long *dst, long *src) +/* This is needed, because copy_page and memcpy are not usable for copying + * task structs. + */ +static inline void do_copy_page(long *dst, long *src) { int n; - /* copy_page and memcpy are not usable for copying task structs. */ for (n = PAGE_SIZE / sizeof(long); n; n--) *dst++ = *src++; } + +/** + * safe_copy_page - check if the page we are going to copy is marked as + * present in the kernel page tables (this always is the case if + * CONFIG_DEBUG_PAGEALLOC is not set and in that case + * kernel_page_present() always returns 'true'). + */ +static void safe_copy_page(void *dst, struct page *s_page) +{ + if (kernel_page_present(s_page)) { + do_copy_page(dst, page_address(s_page)); + } else { + kernel_map_pages(s_page, 1, 1); + do_copy_page(dst, page_address(s_page)); + kernel_map_pages(s_page, 1, 0); + } +} + + +#ifdef CONFIG_HIGHMEM +static inline struct page * +page_is_saveable(struct zone *zone, unsigned long pfn) +{ + return is_highmem(zone) ? + saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); +} + +static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +{ + struct page *s_page, *d_page; + void *src, *dst; + + s_page = pfn_to_page(src_pfn); + d_page = pfn_to_page(dst_pfn); + if (PageHighMem(s_page)) { + src = kmap_atomic(s_page); + dst = kmap_atomic(d_page); + do_copy_page(dst, src); + kunmap_atomic(dst); + kunmap_atomic(src); + } else { + if (PageHighMem(d_page)) { + /* Page pointed to by src may contain some kernel + * data modified by kmap_atomic() + */ + safe_copy_page(buffer, s_page); + dst = kmap_atomic(d_page); + copy_page(dst, buffer); + kunmap_atomic(dst); + } else { + safe_copy_page(page_address(d_page), s_page); + } + } +} +#else +#define page_is_saveable(zone, pfn) saveable_page(zone, pfn) + +static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +{ + safe_copy_page(page_address(pfn_to_page(dst_pfn)), + pfn_to_page(src_pfn)); +} +#endif /* CONFIG_HIGHMEM */ + static void copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) { struct zone *zone; unsigned long pfn; - for_each_zone (zone) { + for_each_populated_zone(zone) { unsigned long max_zone_pfn; - if (is_highmem(zone)) - continue; - mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (saveable_page(pfn)) + if (page_is_saveable(zone, pfn)) memory_bm_set_bit(orig_bm, pfn); } memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); - do { + for(;;) { pfn = memory_bm_next_pfn(orig_bm); - if (likely(pfn != BM_END_OF_MAP)) { - struct page *page; - void *src; - - page = pfn_to_page(pfn); - src = page_address(page); - page = pfn_to_page(memory_bm_next_pfn(copy_bm)); - copy_data_page(page_address(page), src); - } - } while (pfn != BM_END_OF_MAP); + if (unlikely(pfn == BM_END_OF_MAP)) + break; + copy_data_page(memory_bm_next_pfn(copy_bm), pfn); + } } +/* Total number of image 
pages */ +static unsigned int nr_copy_pages; +/* Number of pages needed for saving the original pfns of the image pages */ +static unsigned int nr_meta_pages; +/* + * Numbers of normal and highmem page frames allocated for hibernation image + * before suspending devices. + */ +unsigned int alloc_normal, alloc_highmem; +/* + * Memory bitmap used for marking saveable pages (during hibernation) or + * hibernation image pages (during restore) + */ +static struct memory_bitmap orig_bm; +/* + * Memory bitmap used during hibernation for marking allocated page frames that + * will contain copies of saveable pages. During restore it is initially used + * for marking hibernation image pages, but then the set bits from it are + * duplicated in @orig_bm and it is released. On highmem systems it is next + * used for marking "safe" highmem pages, but it has to be reinitialized for + * this purpose. + */ +static struct memory_bitmap copy_bm; + /** * swsusp_free - free pages allocated for the suspend. * @@ -783,16 +1097,17 @@ void swsusp_free(void) struct zone *zone; unsigned long pfn, max_zone_pfn; - for_each_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for_each_populated_zone(zone) { + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); - if (PageNosave(page) && PageNosaveFree(page)) { - ClearPageNosave(page); - ClearPageNosaveFree(page); - free_page((long) page_address(page)); + if (swsusp_page_is_forbidden(page) && + swsusp_page_is_free(page)) { + swsusp_unset_page_forbidden(page); + swsusp_unset_page_free(page); + __free_page(page); } } } @@ -800,92 +1115,502 @@ void swsusp_free(void) nr_meta_pages = 0; restore_pblist = NULL; buffer = NULL; + alloc_normal = 0; + alloc_highmem = 0; } +/* Helper functions used for the shrinking of memory. */ + +#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) /** - * enough_free_mem - Make sure we enough free memory to snapshot. + * preallocate_image_pages - Allocate a number of pages for hibernation image + * @nr_pages: Number of page frames to allocate. + * @mask: GFP flags to use for the allocation. * - * Returns TRUE or FALSE after checking the number of available - * free pages. 
+ * Return value: Number of page frames actually allocated */ +static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) +{ + unsigned long nr_alloc = 0; -static int enough_free_mem(unsigned int nr_pages) + while (nr_pages > 0) { + struct page *page; + + page = alloc_image_page(mask); + if (!page) + break; + memory_bm_set_bit(©_bm, page_to_pfn(page)); + if (PageHighMem(page)) + alloc_highmem++; + else + alloc_normal++; + nr_pages--; + nr_alloc++; + } + + return nr_alloc; +} + +static unsigned long preallocate_image_memory(unsigned long nr_pages, + unsigned long avail_normal) { - struct zone *zone; - unsigned int free = 0, meta = 0; + unsigned long alloc; + + if (avail_normal <= alloc_normal) + return 0; + + alloc = avail_normal - alloc_normal; + if (nr_pages < alloc) + alloc = nr_pages; + + return preallocate_image_pages(alloc, GFP_IMAGE); +} - for_each_zone (zone) - if (!is_highmem(zone)) { - free += zone->free_pages; - meta += snapshot_additional_pages(zone); +#ifdef CONFIG_HIGHMEM +static unsigned long preallocate_image_highmem(unsigned long nr_pages) +{ + return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM); +} + +/** + * __fraction - Compute (an approximation of) x * (multiplier / base) + */ +static unsigned long __fraction(u64 x, u64 multiplier, u64 base) +{ + x *= multiplier; + do_div(x, base); + return (unsigned long)x; +} + +static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + unsigned long alloc = __fraction(nr_pages, highmem, total); + + return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM); +} +#else /* CONFIG_HIGHMEM */ +static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) +{ + return 0; +} + +static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + return 0; +} +#endif /* CONFIG_HIGHMEM */ + +/** + * free_unnecessary_pages - Release preallocated pages not needed for the image + */ +static void free_unnecessary_pages(void) +{ + unsigned long save, to_free_normal, to_free_highmem; + + save = count_data_pages(); + if (alloc_normal >= save) { + to_free_normal = alloc_normal - save; + save = 0; + } else { + to_free_normal = 0; + save -= alloc_normal; + } + save += count_highmem_pages(); + if (alloc_highmem >= save) { + to_free_highmem = alloc_highmem - save; + } else { + to_free_highmem = 0; + save -= alloc_highmem; + if (to_free_normal > save) + to_free_normal -= save; + else + to_free_normal = 0; + } + + memory_bm_position_reset(©_bm); + + while (to_free_normal > 0 || to_free_highmem > 0) { + unsigned long pfn = memory_bm_next_pfn(©_bm); + struct page *page = pfn_to_page(pfn); + + if (PageHighMem(page)) { + if (!to_free_highmem) + continue; + to_free_highmem--; + alloc_highmem--; + } else { + if (!to_free_normal) + continue; + to_free_normal--; + alloc_normal--; } + memory_bm_clear_bit(©_bm, pfn); + swsusp_unset_page_forbidden(page); + swsusp_unset_page_free(page); + __free_page(page); + } +} - pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", - nr_pages, PAGES_FOR_IO, meta, free); +/** + * minimum_image_size - Estimate the minimum acceptable size of an image + * @saveable: Number of saveable pages in the system. + * + * We want to avoid attempting to free too much memory too hard, so estimate the + * minimum acceptable size of a hibernation image to use as the lower limit for + * preallocating memory. 
+ *
+ * We assume that the minimum image size should be proportional to
+ *
+ * [number of saveable pages] - [number of pages that can be freed in theory]
+ *
+ * where the second term is the sum of (1) reclaimable slab pages, (2) active
+ * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
+ * minus mapped file pages.
+ */
+static unsigned long minimum_image_size(unsigned long saveable)
+{
+	unsigned long size;
+
+	size = global_page_state(NR_SLAB_RECLAIMABLE)
+		+ global_page_state(NR_ACTIVE_ANON)
+		+ global_page_state(NR_INACTIVE_ANON)
+		+ global_page_state(NR_ACTIVE_FILE)
+		+ global_page_state(NR_INACTIVE_FILE)
+		- global_page_state(NR_FILE_MAPPED);
 
-	return free > nr_pages + PAGES_FOR_IO + meta;
+	return saveable <= size ? 0 : saveable - size;
 }
 
-static int
-swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
-		unsigned int nr_pages)
+/**
+ * hibernate_preallocate_memory - Preallocate memory for hibernation image
+ *
+ * To create a hibernation image it is necessary to make a copy of every page
+ * frame in use. We also need a number of page frames to be free during
+ * hibernation for allocations made while saving the image and for device
+ * drivers, in case they need to allocate memory from their hibernation
+ * callbacks (these two numbers are given by PAGES_FOR_IO, which is a rough
+ * estimate, and by reserved_size divided by PAGE_SIZE, which is tunable
+ * through /sys/power/reserved_size, respectively). To make this happen, we
+ * compute the total number of available page frames and allocate at least
+ *
+ * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
+ *  + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
+ *
+ * of them, which corresponds to the maximum size of a hibernation image.
+ *
+ * If image_size is set below the number following from the above formula,
+ * the preallocation of memory is continued until the total number of saveable
+ * pages in the system is below the requested image size or the minimum
+ * acceptable image size returned by minimum_image_size(), whichever is greater.
+ */
+int hibernate_preallocate_memory(void)
 {
+	struct zone *zone;
+	unsigned long saveable, size, max_size, count, highmem, pages = 0;
+	unsigned long alloc, save_highmem, pages_highmem, avail_normal;
+	struct timeval start, stop;
 	int error;
 
-	error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+	printk(KERN_INFO "PM: Preallocating image memory... ");
+	do_gettimeofday(&start);
+
+	error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
 	if (error)
-		goto Free;
+		goto err_out;
 
-	error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+	error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
 	if (error)
-		goto Free;
+		goto err_out;
 
-	while (nr_pages-- > 0) {
-		struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD);
-		if (!page)
-			goto Free;
+	alloc_normal = 0;
+	alloc_highmem = 0;
+
+	/* Count the number of saveable data pages. */
+	save_highmem = count_highmem_pages();
+	saveable = count_data_pages();
 
-		SetPageNosave(page);
-		SetPageNosaveFree(page);
-		memory_bm_set_bit(copy_bm, page_to_pfn(page));
+	/*
+	 * Compute the total number of page frames we can use (count) and the
+	 * number of pages needed for image metadata (size).
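+	 *
+	 * Worked example with made-up numbers: for count = 262144 frames
+	 * (1 GiB of 4 KiB pages), size = 600 metadata pages, PAGES_FOR_IO =
+	 * 1024 and a 1 MiB reserved_size, the computation below gives
+	 * max_size = (262144 - (600 + 1024)) / 2 - 2 * 256 = 129748 frames.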
+ */ + count = saveable; + saveable += save_highmem; + highmem = save_highmem; + size = 0; + for_each_populated_zone(zone) { + size += snapshot_additional_pages(zone); + if (is_highmem(zone)) + highmem += zone_page_state(zone, NR_FREE_PAGES); + else + count += zone_page_state(zone, NR_FREE_PAGES); } + avail_normal = count; + count += highmem; + count -= totalreserve_pages; + + /* Add number of pages required for page keys (s390 only). */ + size += page_key_additional_pages(saveable); + + /* Compute the maximum number of saveable pages to leave in memory. */ + max_size = (count - (size + PAGES_FOR_IO)) / 2 + - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); + /* Compute the desired number of image pages specified by image_size. */ + size = DIV_ROUND_UP(image_size, PAGE_SIZE); + if (size > max_size) + size = max_size; + /* + * If the desired number of image pages is at least as large as the + * current number of saveable pages in memory, allocate page frames for + * the image and we're done. + */ + if (size >= saveable) { + pages = preallocate_image_highmem(save_highmem); + pages += preallocate_image_memory(saveable - pages, avail_normal); + goto out; + } + + /* Estimate the minimum size of the image. */ + pages = minimum_image_size(saveable); + /* + * To avoid excessive pressure on the normal zone, leave room in it to + * accommodate an image of the minimum size (unless it's already too + * small, in which case don't preallocate pages from it at all). + */ + if (avail_normal > pages) + avail_normal -= pages; + else + avail_normal = 0; + if (size < pages) + size = min_t(unsigned long, pages, max_size); + + /* + * Let the memory management subsystem know that we're going to need a + * large number of page frames to allocate and make it free some memory. + * NOTE: If this is not done, performance will be hurt badly in some + * test cases. + */ + shrink_all_memory(saveable - size); + + /* + * The number of saveable pages in memory was too high, so apply some + * pressure to decrease it. First, make room for the largest possible + * image and fail if that doesn't work. Next, try to decrease the size + * of the image as much as indicated by 'size' using allocations from + * highmem and non-highmem zones separately. + */ + pages_highmem = preallocate_image_highmem(highmem / 2); + alloc = count - max_size; + if (alloc > pages_highmem) + alloc -= pages_highmem; + else + alloc = 0; + pages = preallocate_image_memory(alloc, avail_normal); + if (pages < alloc) { + /* We have exhausted non-highmem pages, try highmem. */ + alloc -= pages; + pages += pages_highmem; + pages_highmem = preallocate_image_highmem(alloc); + if (pages_highmem < alloc) + goto err_out; + pages += pages_highmem; + /* + * size is the desired number of saveable pages to leave in + * memory, so try to preallocate (all memory - size) pages. + */ + alloc = (count - pages) - size; + pages += preallocate_image_highmem(alloc); + } else { + /* + * There are approximately max_size saveable pages at this point + * and we want to reduce this number down to size. + */ + alloc = max_size - size; + size = preallocate_highmem_fraction(alloc, highmem, count); + pages_highmem += size; + alloc -= size; + size = preallocate_image_memory(alloc, avail_normal); + pages_highmem += preallocate_image_highmem(alloc - size); + pages += pages_highmem + size; + } + + /* + * We only need as many page frames for the image as there are saveable + * pages in memory, but we have allocated more. Release the excessive + * ones now. 
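+	 *
+	 * E.g. if 60000 page frames were preallocated but only 50000
+	 * saveable pages remain after the shrinking above, the 10000
+	 * surplus frames are returned to the page allocator here
+	 * (figures purely illustrative).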
+ */
+ free_unnecessary_pages();
+
+ out:
+ do_gettimeofday(&stop);
+ printk(KERN_CONT "done (allocated %lu pages)\n", pages);
+ swsusp_show_speed(&start, &stop, pages, "Allocated");
+ return 0;
-Free:
+ err_out:
+ printk(KERN_CONT "\n");
 swsusp_free();
 return -ENOMEM;
 }
-/* Memory bitmap used for marking saveable pages */
-static struct memory_bitmap orig_bm;
-/* Memory bitmap used for marking allocated pages that will contain the copies
- * of saveable pages
+#ifdef CONFIG_HIGHMEM
+/**
+ * count_pages_for_highmem - compute the number of non-highmem pages
+ * that will be necessary for creating copies of highmem pages.
+ */
+
+static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
+{
+ unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
+
+ if (free_highmem >= nr_highmem)
+ nr_highmem = 0;
+ else
+ nr_highmem -= free_highmem;
+
+ return nr_highmem;
+}
+#else
+static unsigned int
+count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * enough_free_mem - Make sure we have enough free memory for the
+ * snapshot image.
 */
-static struct memory_bitmap copy_bm;
-asmlinkage int swsusp_save(void)
+static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
+{
+ struct zone *zone;
+ unsigned int free = alloc_normal;
+
+ for_each_populated_zone(zone)
+ if (!is_highmem(zone))
+ free += zone_page_state(zone, NR_FREE_PAGES);
+
+ nr_pages += count_pages_for_highmem(nr_highmem);
+ pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
+ nr_pages, PAGES_FOR_IO, free);
+
+ return free > nr_pages + PAGES_FOR_IO;
+}
+
+#ifdef CONFIG_HIGHMEM
+/**
+ * get_highmem_buffer - if there are some highmem pages in the suspend
+ * image, we may need the buffer to copy them and/or load their data.
+ */
+
+static inline int get_highmem_buffer(int safe_needed)
+{
+ buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+ return buffer ? 0 : -ENOMEM;
+}
+
+/**
+ * alloc_highmem_pages - allocate some highmem pages for the image.
+ * Try to allocate as many pages as needed, but if the number of free
+ * highmem pages is less than that, allocate them all.
+ */
+
+static inline unsigned int
+alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+{
+ unsigned int to_alloc = count_free_highmem_pages();
+
+ if (to_alloc > nr_highmem)
+ to_alloc = nr_highmem;
+
+ nr_highmem -= to_alloc;
+ while (to_alloc-- > 0) {
+ struct page *page;
+
+ page = alloc_image_page(__GFP_HIGHMEM);
+ memory_bm_set_bit(bm, page_to_pfn(page));
+ }
+ return nr_highmem;
+}
+#else
+static inline int get_highmem_buffer(int safe_needed) { return 0; }
+
+static inline unsigned int
+alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * swsusp_alloc - allocate memory for the suspend image
+ *
+ * We first try to allocate as many highmem pages as there are
+ * saveable highmem pages in the system. If that fails, we allocate
+ * non-highmem pages for the copies of the remaining highmem ones.
+ *
+ * In this approach it is likely that the copies of highmem pages will
+ * also be located in the high memory, because of the way in which
+ * copy_data_pages() works.
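+ *
+ * For illustration (hypothetical numbers): with nr_highmem = 100 saveable
+ * highmem pages of which alloc_highmem = 60 were already preallocated,
+ * the code below tries to grab the remaining 40 from highmem; whatever
+ * cannot be satisfied there is added to nr_pages and allocated from
+ * normal memory instead.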
+ */
+
+static int
+swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
+ unsigned int nr_pages, unsigned int nr_highmem)
+{
+ if (nr_highmem > 0) {
+ if (get_highmem_buffer(PG_ANY))
+ goto err_out;
+ if (nr_highmem > alloc_highmem) {
+ nr_highmem -= alloc_highmem;
+ nr_pages += alloc_highmem_pages(copy_bm, nr_highmem);
+ }
+ }
+ if (nr_pages > alloc_normal) {
+ nr_pages -= alloc_normal;
+ while (nr_pages-- > 0) {
+ struct page *page;
+
+ page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+ if (!page)
+ goto err_out;
+ memory_bm_set_bit(copy_bm, page_to_pfn(page));
+ }
+ }
+
+ return 0;
+
+ err_out:
+ swsusp_free();
+ return -ENOMEM;
+}
+
+asmlinkage __visible int swsusp_save(void)
 {
- unsigned int nr_pages;
+ unsigned int nr_pages, nr_highmem;
- pr_debug("swsusp: critical section: \n");
+ printk(KERN_INFO "PM: Creating hibernation image:\n");
- drain_local_pages();
+ drain_local_pages(NULL);
 nr_pages = count_data_pages();
- printk("swsusp: Need to copy %u pages\n", nr_pages);
+ nr_highmem = count_highmem_pages();
+ printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
- if (!enough_free_mem(nr_pages)) {
- printk(KERN_ERR "swsusp: Not enough free memory\n");
+ if (!enough_free_mem(nr_pages, nr_highmem)) {
+ printk(KERN_ERR "PM: Not enough free memory\n");
 return -ENOMEM;
 }
- if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages))
+ if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
+ printk(KERN_ERR "PM: Memory allocation failed\n");
 return -ENOMEM;
+ }
 /* During allocating of suspend pagedir, new cold pages may appear.
 * Kill them.
 */
- drain_local_pages();
+ drain_local_pages(NULL);
 copy_data_pages(&copy_bm, &orig_bm);
 /*
@@ -894,24 +1619,54 @@ asmlinkage int swsusp_save(void)
 * touch swap space! Except we must write out our image of course.
*/ + nr_pages += nr_highmem; nr_copy_pages = nr_pages; - nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; + nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); + + printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n", + nr_pages); - printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); return 0; } -static void init_header(struct swsusp_info *info) +#ifndef CONFIG_ARCH_HIBERNATION_HEADER +static int init_header_complete(struct swsusp_info *info) { - memset(info, 0, sizeof(struct swsusp_info)); + memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname)); info->version_code = LINUX_VERSION_CODE; - info->num_physpages = num_physpages; - memcpy(&info->uts, &system_utsname, sizeof(system_utsname)); - info->cpus = num_online_cpus(); + return 0; +} + +static char *check_image_kernel(struct swsusp_info *info) +{ + if (info->version_code != LINUX_VERSION_CODE) + return "kernel version"; + if (strcmp(info->uts.sysname,init_utsname()->sysname)) + return "system type"; + if (strcmp(info->uts.release,init_utsname()->release)) + return "kernel release"; + if (strcmp(info->uts.version,init_utsname()->version)) + return "version"; + if (strcmp(info->uts.machine,init_utsname()->machine)) + return "machine"; + return NULL; +} +#endif /* CONFIG_ARCH_HIBERNATION_HEADER */ + +unsigned long snapshot_get_image_size(void) +{ + return nr_copy_pages + nr_meta_pages + 1; +} + +static int init_header(struct swsusp_info *info) +{ + memset(info, 0, sizeof(struct swsusp_info)); + info->num_physpages = get_num_physpages(); info->image_pages = nr_copy_pages; - info->pages = nr_copy_pages + nr_meta_pages + 1; + info->pages = snapshot_get_image_size(); info->size = info->pages; info->size <<= PAGE_SHIFT; + return init_header_complete(info); } /** @@ -928,6 +1683,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; + /* Save page key for data page (s390 only). */ + page_key_read(buf + j); } } @@ -938,14 +1695,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) * snapshot_handle structure. The structure gets updated and a pointer * to it should be passed to this function every next time. * - * The @count parameter should contain the number of bytes the caller - * wants to read from the snapshot. It must not be zero. - * * On success the function returns a positive number. Then, the caller * is allowed to read up to the returned number of bytes from the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the read would - * cross a page boundary otherwise. + * location computed by the data_of() macro. * * The function returns 0 to indicate the end of data stream condition, * and a negative number is returned on error. In such cases the @@ -953,44 +1705,50 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) * any more. 
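+ *
+ * A typical reader (cf. save_image() in swap.c) therefore loops along
+ * these lines (sketch only, error handling omitted; write_out() stands
+ * for whatever consumes the returned buffer):
+ *
+ *	while ((ret = snapshot_read_next(&handle)) > 0)
+ *		write_out(data_of(handle), ret);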
 */
-int snapshot_read_next(struct snapshot_handle *handle, size_t count)
+int snapshot_read_next(struct snapshot_handle *handle)
 {
 if (handle->cur > nr_meta_pages + nr_copy_pages)
 return 0;
 if (!buffer) {
 /* This makes the buffer be freed by swsusp_free() */
- buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
+ buffer = get_image_page(GFP_ATOMIC, PG_ANY);
 if (!buffer)
 return -ENOMEM;
 }
- if (!handle->offset) {
- init_header((struct swsusp_info *)buffer);
+ if (!handle->cur) {
+ int error;
+
+ error = init_header((struct swsusp_info *)buffer);
+ if (error)
+ return error;
 handle->buffer = buffer;
 memory_bm_position_reset(&orig_bm);
 memory_bm_position_reset(&copy_bm);
- }
- if (handle->prev < handle->cur) {
- if (handle->cur <= nr_meta_pages) {
- memset(buffer, 0, PAGE_SIZE);
- pack_pfns(buffer, &orig_bm);
- } else {
- unsigned long pfn = memory_bm_next_pfn(&copy_bm);
+ } else if (handle->cur <= nr_meta_pages) {
+ clear_page(buffer);
+ pack_pfns(buffer, &orig_bm);
+ } else {
+ struct page *page;
- handle->buffer = page_address(pfn_to_page(pfn));
+ page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
+ if (PageHighMem(page)) {
+ /* Highmem pages are copied to the buffer,
+ * because we can't return with a kmapped
+ * highmem page (we may not be called again).
+ */
+ void *kaddr;
+
+ kaddr = kmap_atomic(page);
+ copy_page(buffer, kaddr);
+ kunmap_atomic(kaddr);
+ handle->buffer = buffer;
+ } else {
+ handle->buffer = page_address(page);
 }
- handle->prev = handle->cur;
- }
- handle->buf_offset = handle->cur_offset;
- if (handle->cur_offset + count >= PAGE_SIZE) {
- count = PAGE_SIZE - handle->cur_offset;
- handle->cur_offset = 0;
- handle->cur++;
- } else {
- handle->cur_offset += count;
 }
- handle->offset += count;
- return count;
+ handle->cur++;
+ return PAGE_SIZE;
 }
 /**
@@ -1005,11 +1763,11 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
 unsigned long pfn, max_zone_pfn;
 /* Clear page flags */
- for_each_zone (zone) {
- max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for_each_populated_zone(zone) {
+ max_zone_pfn = zone_end_pfn(zone);
 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 if (pfn_valid(pfn))
- ClearPageNosaveFree(pfn_to_page(pfn));
+ swsusp_unset_page_free(pfn_to_page(pfn));
 }
 /* Mark pages that correspond to the "original" pfns as "unsafe" */
@@ -1018,7 +1776,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
 pfn = memory_bm_next_pfn(bm);
 if (likely(pfn != BM_END_OF_MAP)) {
 if (likely(pfn_valid(pfn)))
- SetPageNosaveFree(pfn_to_page(pfn));
+ swsusp_set_page_free(pfn_to_page(pfn));
 else
 return -EFAULT;
 }
@@ -1042,24 +1800,15 @@ duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
 }
 }
-static inline int check_header(struct swsusp_info *info)
+static int check_header(struct swsusp_info *info)
 {
- char *reason = NULL;
+ char *reason;
- if (info->version_code != LINUX_VERSION_CODE)
- reason = "kernel version";
- if (info->num_physpages != num_physpages)
+ reason = check_image_kernel(info);
+ if (!reason && info->num_physpages != get_num_physpages())
 reason = "memory size";
- if (strcmp(info->uts.sysname,system_utsname.sysname))
- reason = "system type";
- if (strcmp(info->uts.release,system_utsname.release))
- reason = "kernel release";
- if (strcmp(info->uts.version,system_utsname.version))
- reason = "version";
- if (strcmp(info->uts.machine,system_utsname.machine))
- reason = "machine";
 if (reason) {
- printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
+ printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
 return -EPERM;
 }
 return 0;
@@ -1087,9 +1836,7 @@ load_header(struct swsusp_info *info)
 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
 * the corresponding bit in the memory bitmap @bm
 */
-
-static inline void
-unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
+static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 {
 int j;
@@ -1097,10 +1844,230 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 if (unlikely(buf[j] == BM_END_OF_MAP))
 break;
- memory_bm_set_bit(bm, buf[j]);
+ /* Extract and buffer page key for data page (s390 only). */
+ page_key_memorize(buf + j);
+
+ if (memory_bm_pfn_present(bm, buf[j]))
+ memory_bm_set_bit(bm, buf[j]);
+ else
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/* List of "safe" pages that may be used to store data loaded from the suspend
+ * image
+ */
+static struct linked_page *safe_pages_list;
+
+#ifdef CONFIG_HIGHMEM
+/* struct highmem_pbe is used for creating the list of highmem pages that
+ * should be restored atomically during the resume from disk, because the page
+ * frames they have occupied before the suspend are in use.
+ */
+struct highmem_pbe {
+ struct page *copy_page; /* data is here now */
+ struct page *orig_page; /* data was here before the suspend */
+ struct highmem_pbe *next;
+};
+
+/* List of highmem PBEs needed for restoring the highmem pages that were
+ * allocated before the suspend and included in the suspend image, but have
+ * also been allocated by the "resume" kernel, so their contents cannot be
+ * written directly to their "original" page frames.
+ */
+static struct highmem_pbe *highmem_pblist;
+
+/**
+ * count_highmem_image_pages - compute the number of highmem pages in the
+ * suspend image. The bits in the memory bitmap @bm that correspond to the
+ * image pages are assumed to be set.
+ */
+
+static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
+{
+ unsigned long pfn;
+ unsigned int cnt = 0;
+
+ memory_bm_position_reset(bm);
+ pfn = memory_bm_next_pfn(bm);
+ while (pfn != BM_END_OF_MAP) {
+ if (PageHighMem(pfn_to_page(pfn)))
+ cnt++;
+
+ pfn = memory_bm_next_pfn(bm);
+ }
+ return cnt;
+}
+
+/**
+ * prepare_highmem_image - try to allocate as many highmem pages as
+ * there are highmem image pages (@nr_highmem_p points to the variable
+ * containing the number of highmem image pages). The pages that are
+ * "safe" (ie. will not be overwritten when the suspend image is
+ * restored) have the corresponding bits set in @bm (it must be
+ * uninitialized).
+ *
+ * NOTE: This function should not be called if there are no highmem
+ * image pages.
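+ *
+ * For example (hypothetical numbers): if the image contains 500 highmem
+ * pages but only 300 free highmem pages are available, *nr_highmem_p is
+ * reduced to 300 and the remaining 200 copies are later placed in normal
+ * memory by get_highmem_page_buffer().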
+ */
+
+static unsigned int safe_highmem_pages;
+
+static struct memory_bitmap *safe_highmem_bm;
+
+static int
+prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+{
+ unsigned int to_alloc;
+
+ if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
+ return -ENOMEM;
+
+ if (get_highmem_buffer(PG_SAFE))
+ return -ENOMEM;
+
+ to_alloc = count_free_highmem_pages();
+ if (to_alloc > *nr_highmem_p)
+ to_alloc = *nr_highmem_p;
+ else
+ *nr_highmem_p = to_alloc;
+
+ safe_highmem_pages = 0;
+ while (to_alloc-- > 0) {
+ struct page *page;
+
+ page = alloc_page(__GFP_HIGHMEM);
+ if (!swsusp_page_is_free(page)) {
+ /* The page is "safe", set its bit in the bitmap */
+ memory_bm_set_bit(bm, page_to_pfn(page));
+ safe_highmem_pages++;
+ }
+ /* Mark the page as allocated */
+ swsusp_set_page_forbidden(page);
+ swsusp_set_page_free(page);
+ }
+ memory_bm_position_reset(bm);
+ safe_highmem_bm = bm;
+ return 0;
+}
+
+/**
+ * get_highmem_page_buffer - for given highmem image page find the buffer
+ * that suspend_write_next() should set for its caller to write to.
+ *
+ * If the page is to be saved to its "original" page frame or a copy of
+ * the page is to be made in the highmem, @buffer is returned. Otherwise,
+ * the copy of the page is to be made in normal memory, so the address of
+ * the copy is returned.
+ *
+ * If @buffer is returned, the caller of suspend_write_next() will write
+ * the page's contents to @buffer, so they will have to be copied to the
+ * right location on the next call to suspend_write_next() and it is done
+ * with the help of copy_last_highmem_page(). For this purpose, if
+ * @buffer is returned, @last_highmem_page is set to the page to which
+ * the data will have to be copied from @buffer.
+ */
+
+static struct page *last_highmem_page;
+
+static void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+ struct highmem_pbe *pbe;
+ void *kaddr;
+
+ if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
+ /* We have allocated the "original" page frame and we can
+ * use it directly to store the loaded page.
+ */
+ last_highmem_page = page;
+ return buffer;
+ }
+ /* The "original" page frame has not been allocated and we have to
+ * use a "safe" page frame to store the loaded page.
+ */
+ pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
+ if (!pbe) {
+ swsusp_free();
+ return ERR_PTR(-ENOMEM);
+ }
+ pbe->orig_page = page;
+ if (safe_highmem_pages > 0) {
+ struct page *tmp;
+
+ /* Copy of the page will be stored in high memory */
+ kaddr = buffer;
+ tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
+ safe_highmem_pages--;
+ last_highmem_page = tmp;
+ pbe->copy_page = tmp;
+ } else {
+ /* Copy of the page will be stored in normal memory */
+ kaddr = safe_pages_list;
+ safe_pages_list = safe_pages_list->next;
+ pbe->copy_page = virt_to_page(kaddr);
+ }
+ pbe->next = highmem_pblist;
+ highmem_pblist = pbe;
+ return kaddr;
+}
+
+/**
+ * copy_last_highmem_page - copy the contents of a highmem image from
+ * @buffer, where the caller of snapshot_write_next() has placed them,
+ * to the right location represented by @last_highmem_page.
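+ *
+ * In other words, the calling sequence is: snapshot_write_next() hands
+ * out @buffer for a highmem page, the caller fills it, and the next
+ * call (or snapshot_write_finalize()) invokes this function to kmap the
+ * target page and copy @buffer into it.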
+ */
+
+static void copy_last_highmem_page(void)
+{
+ if (last_highmem_page) {
+ void *dst;
+
+ dst = kmap_atomic(last_highmem_page);
+ copy_page(dst, buffer);
+ kunmap_atomic(dst);
+ last_highmem_page = NULL;
 }
 }
+static inline int last_highmem_page_copied(void)
+{
+ return !last_highmem_page;
+}
+
+static inline void free_highmem_data(void)
+{
+ if (safe_highmem_bm)
+ memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
+
+ if (buffer)
+ free_image_page(buffer, PG_UNSAFE_CLEAR);
+}
+#else
+static inline int get_safe_write_buffer(void) { return 0; }
+
+static unsigned int
+count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
+
+static inline int
+prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+{
+ return 0;
+}
+
+static inline void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+static inline void copy_last_highmem_page(void) {}
+static inline int last_highmem_page_copied(void) { return 1; }
+static inline void free_highmem_data(void) {}
+#endif /* CONFIG_HIGHMEM */
+
 /**
 * prepare_image - use the memory bitmap @bm to mark the pages that will
 * be overwritten in the process of restoring the system memory state
@@ -1110,20 +2077,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 * The idea is to allocate a new memory bitmap first and then allocate
 * as many pages as needed for the image data, but not to assign these
 * pages to specific tasks initially. Instead, we just mark them as
- * allocated and create a list of "safe" pages that will be used later.
+ * allocated and create a list of "safe" pages that will be used
+ * later. On systems with high memory a list of "safe" highmem pages is
+ * also created.
 */
#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
-static struct linked_page *safe_pages_list;
-
 static int
 prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 {
- unsigned int nr_pages;
+ unsigned int nr_pages, nr_highmem;
 struct linked_page *sp_list, *lp;
 int error;
+ /* If there is no highmem, the buffer will not be necessary */
+ free_image_page(buffer, PG_UNSAFE_CLEAR);
+ buffer = NULL;
+
+ nr_highmem = count_highmem_image_pages(bm);
 error = mark_unsafe_pages(bm);
 if (error)
 goto Free;
@@ -1134,6 +2106,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 duplicate_memory_bitmap(new_bm, bm);
 memory_bm_free(bm, PG_UNSAFE_KEEP);
+ if (nr_highmem > 0) {
+ error = prepare_highmem_image(bm, &nr_highmem);
+ if (error)
+ goto Free;
+ }
 /* Reserve some safe pages for potential later use.
* * NOTE: This way we make sure there will be enough safe pages for the @@ -1142,10 +2119,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) */ sp_list = NULL; /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ - nr_pages = nr_copy_pages - allocated_unsafe_pages; + nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); while (nr_pages > 0) { - lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); + lp = get_image_page(GFP_ATOMIC, PG_SAFE); if (!lp) { error = -ENOMEM; goto Free; @@ -1156,21 +2133,21 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) } /* Preallocate memory for the image */ safe_pages_list = NULL; - nr_pages = nr_copy_pages - allocated_unsafe_pages; + nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; while (nr_pages > 0) { lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); if (!lp) { error = -ENOMEM; goto Free; } - if (!PageNosaveFree(virt_to_page(lp))) { + if (!swsusp_page_is_free(virt_to_page(lp))) { /* The page is "safe", add it to the list */ lp->next = safe_pages_list; safe_pages_list = lp; } /* Mark the page as allocated */ - SetPageNosave(virt_to_page(lp)); - SetPageNosaveFree(virt_to_page(lp)); + swsusp_set_page_forbidden(virt_to_page(lp)); + swsusp_set_page_free(virt_to_page(lp)); nr_pages--; } /* Free the reserved safe pages so that chain_alloc() can use them */ @@ -1181,7 +2158,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) } return 0; -Free: + Free: swsusp_free(); return error; } @@ -1194,9 +2171,17 @@ Free: static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; - struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); + struct page *page; + unsigned long pfn = memory_bm_next_pfn(bm); + + if (pfn == BM_END_OF_MAP) + return ERR_PTR(-EFAULT); + + page = pfn_to_page(pfn); + if (PageHighMem(page)) + return get_highmem_page_buffer(page, ca); - if (PageNosave(page) && PageNosaveFree(page)) + if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) /* We have allocated the "original" page frame and we can * use it directly to store the loaded page. */ @@ -1208,14 +2193,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } - pbe->orig_address = (unsigned long)page_address(page); - pbe->address = (unsigned long)safe_pages_list; + pbe->orig_address = page_address(page); + pbe->address = safe_pages_list; safe_pages_list = safe_pages_list->next; pbe->next = restore_pblist; restore_pblist = pbe; - return (void *)pbe->address; + return pbe->address; } /** @@ -1225,14 +2210,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) * snapshot_handle structure. The structure gets updated and a pointer * to it should be passed to this function every next time. * - * The @count parameter should contain the number of bytes the caller - * wants to write to the image. It must not be zero. - * * On success the function returns a positive number. Then, the caller * is allowed to write up to the returned number of bytes to the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the write would - * cross a page boundary otherwise. + * location computed by the data_of() macro. 
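+ *
+ * A minimal loader (cf. load_image() in swap.c) is therefore a loop of
+ * the form (sketch only, error handling omitted; read_in() stands for
+ * whatever fills the returned buffer):
+ *
+ *	while ((n = snapshot_write_next(&handle)) > 0)
+ *		read_in(data_of(handle), n);
+ *	snapshot_write_finalize(&handle);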
 *
 * The function returns 0 to indicate the "end of file" condition,
 * and a negative number is returned on error. In such cases the
@@ -1240,76 +2220,142 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 * any more.
 */
-int snapshot_write_next(struct snapshot_handle *handle, size_t count)
+int snapshot_write_next(struct snapshot_handle *handle)
 {
 static struct chain_allocator ca;
 int error = 0;
 /* Check if we have already loaded the entire image */
- if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
+ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
 return 0;
- if (!buffer) {
- /* This makes the buffer be freed by swsusp_free() */
- buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
+ handle->sync_read = 1;
+
+ if (!handle->cur) {
+ if (!buffer)
+ /* This makes the buffer be freed by swsusp_free() */
+ buffer = get_image_page(GFP_ATOMIC, PG_ANY);
+
 if (!buffer)
 return -ENOMEM;
- }
- if (!handle->offset)
- handle->buffer = buffer;
- handle->sync_read = 1;
- if (handle->prev < handle->cur) {
- if (handle->prev == 0) {
- error = load_header(buffer);
- if (error)
- return error;
- error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
+ handle->buffer = buffer;
+ } else if (handle->cur == 1) {
+ error = load_header(buffer);
+ if (error)
+ return error;
+
+ error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
+ if (error)
+ return error;
+
+ /* Allocate buffer for page keys. */
+ error = page_key_alloc(nr_copy_pages);
+ if (error)
+ return error;
+
+ } else if (handle->cur <= nr_meta_pages + 1) {
+ error = unpack_orig_pfns(buffer, &copy_bm);
+ if (error)
+ return error;
+
+ if (handle->cur == nr_meta_pages + 1) {
+ error = prepare_image(&orig_bm, &copy_bm);
 if (error)
 return error;
- } else if (handle->prev <= nr_meta_pages) {
- unpack_orig_pfns(buffer, &copy_bm);
- if (handle->prev == nr_meta_pages) {
- error = prepare_image(&orig_bm, &copy_bm);
- if (error)
- return error;
-
- chain_init(&ca, GFP_ATOMIC, PG_SAFE);
- memory_bm_position_reset(&orig_bm);
- restore_pblist = NULL;
- handle->buffer = get_buffer(&orig_bm, &ca);
- handle->sync_read = 0;
- if (!handle->buffer)
- return -ENOMEM;
- }
- } else {
+ chain_init(&ca, GFP_ATOMIC, PG_SAFE);
+ memory_bm_position_reset(&orig_bm);
+ restore_pblist = NULL;
 handle->buffer = get_buffer(&orig_bm, &ca);
 handle->sync_read = 0;
+ if (IS_ERR(handle->buffer))
+ return PTR_ERR(handle->buffer);
 }
- handle->prev = handle->cur;
- }
- handle->buf_offset = handle->cur_offset;
- if (handle->cur_offset + count >= PAGE_SIZE) {
- count = PAGE_SIZE - handle->cur_offset;
- handle->cur_offset = 0;
- handle->cur++;
 } else {
- handle->cur_offset += count;
+ copy_last_highmem_page();
+ /* Restore page key for data page (s390 only). */
+ page_key_write(handle->buffer);
+ handle->buffer = get_buffer(&orig_bm, &ca);
+ if (IS_ERR(handle->buffer))
+ return PTR_ERR(handle->buffer);
+ if (handle->buffer != buffer)
+ handle->sync_read = 0;
+ }
+ handle->cur++;
+ return PAGE_SIZE;
+}
+
+/**
+ * snapshot_write_finalize - must be called after the last call to
+ * snapshot_write_next() in case the last page in the image happens
+ * to be a highmem page and its contents should be stored in the
+ * highmem. Additionally, it releases the memory that will not be
+ * used any more.
+ */
+
+void snapshot_write_finalize(struct snapshot_handle *handle)
+{
+ copy_last_highmem_page();
+ /* Restore page key for data page (s390 only).
 */
+ page_key_write(handle->buffer);
+ page_key_free();
+ /* Free only if we have loaded the image entirely */
+ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
+ memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+ free_highmem_data();
 }
- handle->offset += count;
- return count;
 }
 int snapshot_image_loaded(struct snapshot_handle *handle)
 {
- return !(!nr_copy_pages ||
+ return !(!nr_copy_pages || !last_highmem_page_copied() ||
 handle->cur <= nr_meta_pages + nr_copy_pages);
 }
-void snapshot_free_unused_memory(struct snapshot_handle *handle)
+#ifdef CONFIG_HIGHMEM
+/* Assumes that @buf is ready and points to a "safe" page */
+static inline void
+swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
 {
- /* Free only if we have loaded the image entirely */
- if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
- memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+ void *kaddr1, *kaddr2;
+
+ kaddr1 = kmap_atomic(p1);
+ kaddr2 = kmap_atomic(p2);
+ copy_page(buf, kaddr1);
+ copy_page(kaddr1, kaddr2);
+ copy_page(kaddr2, buf);
+ kunmap_atomic(kaddr2);
+ kunmap_atomic(kaddr1);
+}
+
+/**
+ * restore_highmem - for each highmem page that was allocated before
+ * the suspend and included in the suspend image, and also has been
+ * allocated by the "resume" kernel, swap its current (ie. "before
+ * resume") contents with the previous (ie. "before suspend") one.
+ *
+ * If the resume eventually fails, we can call this function once
+ * again and restore the "before resume" highmem state.
+ */
+
+int restore_highmem(void)
+{
+ struct highmem_pbe *pbe = highmem_pblist;
+ void *buf;
+
+ if (!pbe)
+ return 0;
+
+ buf = get_image_page(GFP_ATOMIC, PG_SAFE);
+ if (!buf)
+ return -ENOMEM;
+
+ while (pbe) {
+ swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
+ pbe = pbe->next;
+ }
+ free_image_page(buf, PG_UNSAFE_CLEAR);
+ return 0;
 }
+#endif /* CONFIG_HIGHMEM */
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
new file mode 100644
index 00000000000..ed35a4790af
--- /dev/null
+++ b/kernel/power/suspend.c
@@ -0,0 +1,443 @@
+/*
+ * kernel/power/suspend.c - Suspend to RAM and standby functionality.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */ + +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/console.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/syscalls.h> +#include <linux/gfp.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/suspend.h> +#include <linux/syscore_ops.h> +#include <linux/ftrace.h> +#include <trace/events/power.h> +#include <linux/compiler.h> + +#include "power.h" + +struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { + [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, + [PM_SUSPEND_STANDBY] = { .label = "standby", }, + [PM_SUSPEND_MEM] = { .label = "mem", }, +}; + +static const struct platform_suspend_ops *suspend_ops; +static const struct platform_freeze_ops *freeze_ops; + +static bool need_suspend_ops(suspend_state_t state) +{ + return state > PM_SUSPEND_FREEZE; +} + +static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); +static bool suspend_freeze_wake; + +void freeze_set_ops(const struct platform_freeze_ops *ops) +{ + lock_system_sleep(); + freeze_ops = ops; + unlock_system_sleep(); +} + +static void freeze_begin(void) +{ + suspend_freeze_wake = false; +} + +static void freeze_enter(void) +{ + cpuidle_use_deepest_state(true); + cpuidle_resume(); + wait_event(suspend_freeze_wait_head, suspend_freeze_wake); + cpuidle_pause(); + cpuidle_use_deepest_state(false); +} + +void freeze_wake(void) +{ + suspend_freeze_wake = true; + wake_up(&suspend_freeze_wait_head); +} +EXPORT_SYMBOL_GPL(freeze_wake); + +static bool valid_state(suspend_state_t state) +{ + /* + * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level + * support and need to be valid to the low level + * implementation, no valid callback implies that none are valid. + */ + return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); +} + +/* + * If this is set, the "mem" label always corresponds to the deepest sleep state + * available, the "standby" label corresponds to the second deepest sleep state + * available (if any), and the "freeze" label corresponds to the remaining + * available sleep state (if there is one). + */ +static bool relative_states; + +static int __init sleep_states_setup(char *str) +{ + relative_states = !strncmp(str, "1", 1); + if (relative_states) { + pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; + pm_states[PM_SUSPEND_FREEZE].state = 0; + } + return 1; +} + +__setup("relative_sleep_states=", sleep_states_setup); + +/** + * suspend_set_ops - Set the global suspend method table. + * @ops: Suspend operations to use. + */ +void suspend_set_ops(const struct platform_suspend_ops *ops) +{ + suspend_state_t i; + int j = PM_SUSPEND_MAX - 1; + + lock_system_sleep(); + + suspend_ops = ops; + for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) + if (valid_state(i)) + pm_states[j--].state = i; + else if (!relative_states) + pm_states[j--].state = 0; + + pm_states[j--].state = PM_SUSPEND_FREEZE; + while (j >= PM_SUSPEND_MIN) + pm_states[j--].state = 0; + + unlock_system_sleep(); +} +EXPORT_SYMBOL_GPL(suspend_set_ops); + +/** + * suspend_valid_only_mem - Generic memory-only valid callback. + * + * Platform drivers that implement mem suspend only and only need to check for + * that in their .valid() callback can use this instead of rolling their own + * .valid() callback. 
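+ *
+ * For example, a hypothetical platform driver might do:
+ *
+ *	static const struct platform_suspend_ops foo_pm_ops = {
+ *		.valid = suspend_valid_only_mem,
+ *		.enter = foo_pm_enter,
+ *	};
+ *
+ * and then register it with suspend_set_ops(&foo_pm_ops).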
+ */
+int suspend_valid_only_mem(suspend_state_t state)
+{
+ return state == PM_SUSPEND_MEM;
+}
+EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
+
+static int suspend_test(int level)
+{
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level == level) {
+ printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
+ mdelay(5000);
+ return 1;
+ }
+#endif /* CONFIG_PM_DEBUG */
+ return 0;
+}
+
+/**
+ * suspend_prepare - Prepare for entering system sleep state.
+ *
+ * Common code run for every system sleep state that can be entered (except for
+ * hibernation). Run suspend notifiers, allocate the "suspend" console and
+ * freeze processes.
+ */
+static int suspend_prepare(suspend_state_t state)
+{
+ int error;
+
+ if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
+ return -EPERM;
+
+ pm_prepare_console();
+
+ error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
+ if (error)
+ goto Finish;
+
+ trace_suspend_resume(TPS("freeze_processes"), 0, true);
+ error = suspend_freeze_processes();
+ trace_suspend_resume(TPS("freeze_processes"), 0, false);
+ if (!error)
+ return 0;
+
+ suspend_stats.failed_freeze++;
+ dpm_save_failed_step(SUSPEND_FREEZE);
+ Finish:
+ pm_notifier_call_chain(PM_POST_SUSPEND);
+ pm_restore_console();
+ return error;
+}
+
+/* default implementation */
+void __weak arch_suspend_disable_irqs(void)
+{
+ local_irq_disable();
+}
+
+/* default implementation */
+void __weak arch_suspend_enable_irqs(void)
+{
+ local_irq_enable();
+}
+
+/**
+ * suspend_enter - Make the system enter the given sleep state.
+ * @state: System sleep state to enter.
+ * @wakeup: Returns information that the sleep state should not be re-entered.
+ *
+ * This function should be called after devices have been suspended.
+ */
+static int suspend_enter(suspend_state_t state, bool *wakeup)
+{
+ int error;
+
+ if (need_suspend_ops(state) && suspend_ops->prepare) {
+ error = suspend_ops->prepare();
+ if (error)
+ goto Platform_finish;
+ }
+
+ error = dpm_suspend_end(PMSG_SUSPEND);
+ if (error) {
+ printk(KERN_ERR "PM: Some devices failed to power down\n");
+ goto Platform_finish;
+ }
+
+ if (need_suspend_ops(state) && suspend_ops->prepare_late) {
+ error = suspend_ops->prepare_late();
+ if (error)
+ goto Platform_wake;
+ }
+
+ if (suspend_test(TEST_PLATFORM))
+ goto Platform_wake;
+
+ /*
+ * PM_SUSPEND_FREEZE equals
+ * frozen processes + suspended devices + idle processors.
+ * Thus we should invoke freeze_enter() soon after
+ * all the devices are suspended.
+ */ + if (state == PM_SUSPEND_FREEZE) { + trace_suspend_resume(TPS("machine_suspend"), state, true); + freeze_enter(); + trace_suspend_resume(TPS("machine_suspend"), state, false); + goto Platform_wake; + } + + ftrace_stop(); + error = disable_nonboot_cpus(); + if (error || suspend_test(TEST_CPUS)) + goto Enable_cpus; + + arch_suspend_disable_irqs(); + BUG_ON(!irqs_disabled()); + + error = syscore_suspend(); + if (!error) { + *wakeup = pm_wakeup_pending(); + if (!(suspend_test(TEST_CORE) || *wakeup)) { + trace_suspend_resume(TPS("machine_suspend"), + state, true); + error = suspend_ops->enter(state); + trace_suspend_resume(TPS("machine_suspend"), + state, false); + events_check_enabled = false; + } + syscore_resume(); + } + + arch_suspend_enable_irqs(); + BUG_ON(irqs_disabled()); + + Enable_cpus: + enable_nonboot_cpus(); + ftrace_start(); + + Platform_wake: + if (need_suspend_ops(state) && suspend_ops->wake) + suspend_ops->wake(); + + dpm_resume_start(PMSG_RESUME); + + Platform_finish: + if (need_suspend_ops(state) && suspend_ops->finish) + suspend_ops->finish(); + + return error; +} + +/** + * suspend_devices_and_enter - Suspend devices and enter system sleep state. + * @state: System sleep state to enter. + */ +int suspend_devices_and_enter(suspend_state_t state) +{ + int error; + bool wakeup = false; + + if (need_suspend_ops(state) && !suspend_ops) + return -ENOSYS; + + if (need_suspend_ops(state) && suspend_ops->begin) { + error = suspend_ops->begin(state); + if (error) + goto Close; + } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) { + error = freeze_ops->begin(); + if (error) + goto Close; + } + suspend_console(); + suspend_test_start(); + error = dpm_suspend_start(PMSG_SUSPEND); + if (error) { + pr_err("PM: Some devices failed to suspend, or early wake event detected\n"); + goto Recover_platform; + } + suspend_test_finish("suspend devices"); + if (suspend_test(TEST_DEVICES)) + goto Recover_platform; + + do { + error = suspend_enter(state, &wakeup); + } while (!error && !wakeup && need_suspend_ops(state) + && suspend_ops->suspend_again && suspend_ops->suspend_again()); + + Resume_devices: + suspend_test_start(); + dpm_resume_end(PMSG_RESUME); + suspend_test_finish("resume devices"); + resume_console(); + Close: + if (need_suspend_ops(state) && suspend_ops->end) + suspend_ops->end(); + else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) + freeze_ops->end(); + + return error; + + Recover_platform: + if (need_suspend_ops(state) && suspend_ops->recover) + suspend_ops->recover(); + goto Resume_devices; +} + +/** + * suspend_finish - Clean up before finishing the suspend sequence. + * + * Call platform code to clean up, restart processes, and free the console that + * we've allocated. This routine is not called for hibernation. + */ +static void suspend_finish(void) +{ + suspend_thaw_processes(); + pm_notifier_call_chain(PM_POST_SUSPEND); + pm_restore_console(); +} + +/** + * enter_state - Do common work needed to enter system sleep state. + * @state: System sleep state to enter. + * + * Make sure that no one else is trying to put the system into a sleep state. + * Fail if that's not the case. Otherwise, prepare for system suspend, make the + * system enter the given sleep state and clean up after wakeup. 
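+ *
+ * This is, for instance, where a write of "mem" to /sys/power/state ends
+ * up: state_store() resolves the label to PM_SUSPEND_MEM and calls
+ * pm_suspend(), which in turn calls enter_state().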
+ */
+static int enter_state(suspend_state_t state)
+{
+ int error;
+
+ trace_suspend_resume(TPS("suspend_enter"), state, true);
+ if (state == PM_SUSPEND_FREEZE) {
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
+ pr_warning("PM: Unsupported test mode for freeze state, "
+ "please choose none/freezer/devices/platform.\n");
+ return -EAGAIN;
+ }
+#endif
+ } else if (!valid_state(state)) {
+ return -EINVAL;
+ }
+ if (!mutex_trylock(&pm_mutex))
+ return -EBUSY;
+
+ if (state == PM_SUSPEND_FREEZE)
+ freeze_begin();
+
+ trace_suspend_resume(TPS("sync_filesystems"), 0, true);
+ printk(KERN_INFO "PM: Syncing filesystems ... ");
+ sys_sync();
+ printk("done.\n");
+ trace_suspend_resume(TPS("sync_filesystems"), 0, false);
+
+ pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
+ error = suspend_prepare(state);
+ if (error)
+ goto Unlock;
+
+ if (suspend_test(TEST_FREEZER))
+ goto Finish;
+
+ trace_suspend_resume(TPS("suspend_enter"), state, false);
+ pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
+ pm_restrict_gfp_mask();
+ error = suspend_devices_and_enter(state);
+ pm_restore_gfp_mask();
+
+ Finish:
+ pr_debug("PM: Finishing wakeup.\n");
+ suspend_finish();
+ Unlock:
+ mutex_unlock(&pm_mutex);
+ return error;
+}
+
+/**
+ * pm_suspend - Externally visible function for suspending the system.
+ * @state: System sleep state to enter.
+ *
+ * Check if the value of @state represents one of the supported states,
+ * execute enter_state() and update system suspend statistics.
+ */
+int pm_suspend(suspend_state_t state)
+{
+ int error;
+
+ if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
+ return -EINVAL;
+
+ error = enter_state(state);
+ if (error) {
+ suspend_stats.fail++;
+ dpm_save_failed_errno(error);
+ } else {
+ suspend_stats.success++;
+ }
+ return error;
+}
+EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
new file mode 100644
index 00000000000..269b097e78e
--- /dev/null
+++ b/kernel/power/suspend_test.c
@@ -0,0 +1,185 @@
+/*
+ * kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
+ *
+ * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/init.h>
+#include <linux/rtc.h>
+
+#include "power.h"
+
+/*
+ * We test the system suspend code by setting an RTC wakealarm a short
+ * time in the future, then suspending. Suspending the devices won't
+ * normally take long ... some systems only need a few milliseconds.
+ *
+ * The time it takes is system-specific though, so when we test this
+ * during system bootup we allow a LOT of time.
+ */
+#define TEST_SUSPEND_SECONDS 10
+
+static unsigned long suspend_test_start_time;
+
+void suspend_test_start(void)
+{
+ /* FIXME Use better timebase than "jiffies", ideally a clocksource.
+ * What we want is a hardware counter that will work correctly even
+ * during the irqs-are-off stages of the suspend/resume cycle...
+ */
+ suspend_test_start_time = jiffies;
+}
+
+void suspend_test_finish(const char *label)
+{
+ long nj = jiffies - suspend_test_start_time;
+ unsigned msec;
+
+ msec = jiffies_to_msecs(abs(nj));
+ pr_info("PM: %s took %d.%03d seconds\n", label,
+ msec / 1000, msec % 1000);
+
+ /* Warning on suspend means the RTC alarm period needs to be
+ * larger -- the system was sooo slooowwww to suspend that the
+ * alarm (should have) fired before the system went to sleep!
+ * + * Warning on either suspend or resume also means the system + * has some performance issues. The stack dump of a WARN_ON + * is more likely to get the right attention than a printk... + */ + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), + "Component: %s, time: %u\n", label, msec); +} + +/* + * To test system suspend, we need a hands-off mechanism to resume the + * system. RTCs wake alarms are a common self-contained mechanism. + */ + +static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) +{ + static char err_readtime[] __initdata = + KERN_ERR "PM: can't read %s time, err %d\n"; + static char err_wakealarm [] __initdata = + KERN_ERR "PM: can't set %s wakealarm, err %d\n"; + static char err_suspend[] __initdata = + KERN_ERR "PM: suspend test failed, error %d\n"; + static char info_test[] __initdata = + KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; + + unsigned long now; + struct rtc_wkalrm alm; + int status; + + /* this may fail if the RTC hasn't been initialized */ + status = rtc_read_time(rtc, &alm.time); + if (status < 0) { + printk(err_readtime, dev_name(&rtc->dev), status); + return; + } + rtc_tm_to_time(&alm.time, &now); + + memset(&alm, 0, sizeof alm); + rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); + alm.enabled = true; + + status = rtc_set_alarm(rtc, &alm); + if (status < 0) { + printk(err_wakealarm, dev_name(&rtc->dev), status); + return; + } + + if (state == PM_SUSPEND_MEM) { + printk(info_test, pm_states[state].label); + status = pm_suspend(state); + if (status == -ENODEV) + state = PM_SUSPEND_STANDBY; + } + if (state == PM_SUSPEND_STANDBY) { + printk(info_test, pm_states[state].label); + status = pm_suspend(state); + } + if (status < 0) + printk(err_suspend, status); + + /* Some platforms can't detect that the alarm triggered the + * wakeup, or (accordingly) disable it after it afterwards. + * It's supposed to give oneshot behavior; cope. + */ + alm.enabled = false; + rtc_set_alarm(rtc, &alm); +} + +static int __init has_wakealarm(struct device *dev, const void *data) +{ + struct rtc_device *candidate = to_rtc_device(dev); + + if (!candidate->ops->set_alarm) + return 0; + if (!device_may_wakeup(candidate->dev.parent)) + return 0; + + return 1; +} + +/* + * Kernel options like "test_suspend=mem" force suspend/resume sanity tests + * at startup time. They're normally disabled, for faster boot and because + * we can't know which states really work on this particular system. + */ +static suspend_state_t test_state __initdata = PM_SUSPEND_ON; + +static char warn_bad_state[] __initdata = + KERN_WARNING "PM: can't test '%s' suspend state\n"; + +static int __init setup_test_suspend(char *value) +{ + suspend_state_t i; + + /* "=mem" ==> "mem" */ + value++; + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (!strcmp(pm_states[i].label, value)) { + test_state = pm_states[i].state; + return 0; + } + + printk(warn_bad_state, value); + return 0; +} +__setup("test_suspend", setup_test_suspend); + +static int __init test_suspend(void) +{ + static char warn_no_rtc[] __initdata = + KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; + + struct rtc_device *rtc = NULL; + struct device *dev; + + /* PM is initialized by now; is that state testable? */ + if (test_state == PM_SUSPEND_ON) + goto done; + if (!pm_states[test_state].state) { + printk(warn_bad_state, pm_states[test_state].label); + goto done; + } + + /* RTCs have initialized by now too ... can we use one? 
*/ + dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); + if (dev) + rtc = rtc_class_open(dev_name(dev)); + if (!rtc) { + printk(warn_no_rtc); + goto done; + } + + /* go for it */ + test_wakealarm(rtc, test_state); + rtc_class_close(rtc); +done: + return 0; +} +late_initcall(test_suspend); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 9b2ee5344de..aaa3261dea5 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -4,64 +4,236 @@ * This file provides functions for reading the suspend image from * and writing it to a swap partition. * - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> * * This file is released under the GPLv2. * */ #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/file.h> -#include <linux/utsname.h> -#include <linux/version.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/genhd.h> #include <linux/device.h> -#include <linux/buffer_head.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pm.h> +#include <linux/slab.h> +#include <linux/lzo.h> +#include <linux/vmalloc.h> +#include <linux/cpumask.h> +#include <linux/atomic.h> +#include <linux/kthread.h> +#include <linux/crc32.h> #include "power.h" -extern char resume_file[]; +#define HIBERNATE_SIG "S1SUSPEND" -#define SWSUSP_SIG "S1SUSPEND" +/* + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page + * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. + * These structures are stored on the swap and linked together with the + * help of the .next_swap member. + * + * The swap map is created during suspend. The swap map pages are + * allocated and populated one at a time, so we only need one memory + * page to set up the entire structure. + * + * During resume we pick up all swap_map_page structures into a list. + */ + +#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) -static struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; - swp_entry_t image; +/* + * Number of free pages that are not high. + */ +static inline unsigned long low_free_pages(void) +{ + return nr_free_pages() - nr_free_highpages(); +} + +/* + * Number of pages required to be kept free while writing the image. Always + * half of all available low pages before the writing starts. 
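+ *
+ * For example (illustrative numbers): with 100000 free low pages when
+ * writing starts, swap_write_page() waits for the outstanding bio chain
+ * whenever low_free_pages() drops to 50000, then recomputes this limit.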
+ */ +static inline unsigned long reqd_free_pages(void) +{ + return low_free_pages() / 2; +} + +struct swap_map_page { + sector_t entries[MAP_PAGE_ENTRIES]; + sector_t next_swap; +}; + +struct swap_map_page_list { + struct swap_map_page *map; + struct swap_map_page_list *next; +}; + +/** + * The swap_map_handle structure is used for handling swap in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + struct swap_map_page_list *maps; + sector_t cur_swap; + sector_t first_sector; + unsigned int k; + unsigned long reqd_free_pages; + u32 crc32; +}; + +struct swsusp_header { + char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - + sizeof(u32)]; + u32 crc32; + sector_t image; + unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; -} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; +} __packed; + +static struct swsusp_header *swsusp_header; + +/** + * The following functions are used for tracing the allocated + * swap pages, so that they can be freed in case of an error. + */ + +struct swsusp_extent { + struct rb_node node; + unsigned long start; + unsigned long end; +}; + +static struct rb_root swsusp_extents = RB_ROOT; + +static int swsusp_extents_insert(unsigned long swap_offset) +{ + struct rb_node **new = &(swsusp_extents.rb_node); + struct rb_node *parent = NULL; + struct swsusp_extent *ext; + + /* Figure out where to put the new node */ + while (*new) { + ext = rb_entry(*new, struct swsusp_extent, node); + parent = *new; + if (swap_offset < ext->start) { + /* Try to merge */ + if (swap_offset == ext->start - 1) { + ext->start--; + return 0; + } + new = &((*new)->rb_left); + } else if (swap_offset > ext->end) { + /* Try to merge */ + if (swap_offset == ext->end + 1) { + ext->end++; + return 0; + } + new = &((*new)->rb_right); + } else { + /* It already is in the tree */ + return -EINVAL; + } + } + /* Add the new node and rebalance the tree. */ + ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); + if (!ext) + return -ENOMEM; + + ext->start = swap_offset; + ext->end = swap_offset; + rb_link_node(&ext->node, parent, new); + rb_insert_color(&ext->node, &swsusp_extents); + return 0; +} + +/** + * alloc_swapdev_block - allocate a swap page and register that it has + * been allocated, so that it can be freed in case of an error. + */ + +sector_t alloc_swapdev_block(int swap) +{ + unsigned long offset; + + offset = swp_offset(get_swap_page_of_type(swap)); + if (offset) { + if (swsusp_extents_insert(offset)) + swap_free(swp_entry(swap, offset)); + else + return swapdev_block(swap, offset); + } + return 0; +} + +/** + * free_all_swap_pages - free swap pages allocated for saving image data. + * It also frees the extents used to register which swap entries had been + * allocated. + */ + +void free_all_swap_pages(int swap) +{ + struct rb_node *node; + + while ((node = swsusp_extents.rb_node)) { + struct swsusp_extent *ext; + unsigned long offset; + + ext = container_of(node, struct swsusp_extent, node); + rb_erase(node, &swsusp_extents); + for (offset = ext->start; offset <= ext->end; offset++) + swap_free(swp_entry(swap, offset)); + + kfree(ext); + } +} + +int swsusp_swap_in_use(void) +{ + return (swsusp_extents.rb_node != NULL); +} /* - * Saving part... 
+ * General things */ static unsigned short root_swap = 0xffff; +struct block_device *hib_resume_bdev; -static int mark_swapfiles(swp_entry_t start) +/* + * Saving part + */ + +static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - rw_swap_page_sync(READ, swp_entry(root_swap, 0), - virt_to_page((unsigned long)&swsusp_header), NULL); - if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || - !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { - memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); - memcpy(swsusp_header.sig,SWSUSP_SIG, 10); - swsusp_header.image = start; - error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), - virt_to_page((unsigned long)&swsusp_header), - NULL); + hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || + !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { + memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); + memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); + swsusp_header->image = handle->first_sector; + swsusp_header->flags = flags; + if (flags & SF_CRC32_MODE) + swsusp_header->crc32 = handle->crc32; + error = hib_bio_write_page(swsusp_resume_block, + swsusp_header, NULL); } else { - pr_debug("swsusp: Partition is not swap space.\n"); + printk(KERN_ERR "PM: Swap header not found!\n"); error = -ENODEV; } return error; @@ -70,16 +242,27 @@ static int mark_swapfiles(swp_entry_t start) /** * swsusp_swap_check - check if the resume device is a swap device * and get its index (if so) + * + * This is called before saving image */ - -static int swsusp_swap_check(void) /* This is called before saving image */ +static int swsusp_swap_check(void) { - int res = swap_type_of(swsusp_resume_device); + int res; + + res = swap_type_of(swsusp_resume_device, swsusp_resume_block, + &hib_resume_bdev); + if (res < 0) + return res; + + root_swap = res; + res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); + if (res) + return res; + + res = set_blocksize(hib_resume_bdev, PAGE_SIZE); + if (res < 0) + blkdev_put(hib_resume_bdev, FMODE_WRITE); - if (res >= 0) { - root_swap = res; - return 0; - } return res; } @@ -90,146 +273,76 @@ static int swsusp_swap_check(void) /* This is called before saving image */ * @bio_chain: Link the next write BIO here */ -static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) +static int write_page(void *buf, sector_t offset, struct bio **bio_chain) { - swp_entry_t entry; - int error = -ENOSPC; + void *src; + int ret; - if (offset) { - struct page *page = virt_to_page(buf); + if (!offset) + return -ENOSPC; - if (bio_chain) { - /* - * Whether or not we successfully allocated a copy page, - * we take a ref on the page here. It gets undone in - * wait_on_bio_chain(). 
- */ - struct page *page_copy; - page_copy = alloc_page(GFP_ATOMIC); - if (page_copy == NULL) { + if (bio_chain) { + src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | + __GFP_NORETRY); + if (src) { + copy_page(src, buf); + } else { + ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ + if (ret) + return ret; + src = (void *)__get_free_page(__GFP_WAIT | + __GFP_NOWARN | + __GFP_NORETRY); + if (src) { + copy_page(src, buf); + } else { WARN_ON_ONCE(1); bio_chain = NULL; /* Go synchronous */ - get_page(page); - } else { - memcpy(page_address(page_copy), - page_address(page), PAGE_SIZE); - page = page_copy; + src = buf; } } - entry = swp_entry(root_swap, offset); - error = rw_swap_page_sync(WRITE, entry, page, bio_chain); + } else { + src = buf; } - return error; + return hib_bio_write_page(offset, src, bio_chain); } -/* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_SIZE swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. - * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. - * - * During resume we also only need to use one swap_map_page structure - * at a time. - */ - -#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) - -struct swap_map_page { - unsigned long entries[MAP_PAGE_ENTRIES]; - unsigned long next_swap; -}; - -/** - * The swap_map_handle structure is used for handling swap in - * a file-alike way - */ - -struct swap_map_handle { - struct swap_map_page *cur; - unsigned long cur_swap; - struct bitmap_page *bitmap; - unsigned int k; -}; - static void release_swap_writer(struct swap_map_handle *handle) { if (handle->cur) free_page((unsigned long)handle->cur); handle->cur = NULL; - if (handle->bitmap) - free_bitmap(handle->bitmap); - handle->bitmap = NULL; -} - -static void show_speed(struct timeval *start, struct timeval *stop, - unsigned nr_pages, char *msg) -{ - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; - - elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); - do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); - centisecs = elapsed_centisecs64; - if (centisecs == 0) - centisecs = 1; /* avoid div-by-zero */ - k = nr_pages * (PAGE_SIZE / 1024); - kps = (k * 100) / centisecs; - printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, - centisecs / 100, centisecs % 100, - kps / 1000, (kps % 1000) / 10); } static int get_swap_writer(struct swap_map_handle *handle) { + int ret; + + ret = swsusp_swap_check(); + if (ret) { + if (ret != -ENOSPC) + printk(KERN_ERR "PM: Cannot find swap device, try " + "swapon -a.\n"); + return ret; + } handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); - if (!handle->cur) - return -ENOMEM; - handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0)); - if (!handle->bitmap) { - release_swap_writer(handle); - return -ENOMEM; + if (!handle->cur) { + ret = -ENOMEM; + goto err_close; } - handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); + handle->cur_swap = alloc_swapdev_block(root_swap); if (!handle->cur_swap) { - release_swap_writer(handle); - return -ENOSPC; + ret = -ENOSPC; + goto err_rel; } handle->k = 0; + handle->reqd_free_pages = reqd_free_pages(); + handle->first_sector = handle->cur_swap; return 0; -} - -static int 
wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; - - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; +err_rel: + release_swap_writer(handle); +err_close: + swsusp_close(FMODE_WRITE); return ret; } @@ -237,31 +350,39 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, struct bio **bio_chain) { int error = 0; - unsigned long offset; + sector_t offset; if (!handle->cur) return -EINVAL; - offset = alloc_swap_page(root_swap, handle->bitmap); + offset = alloc_swapdev_block(root_swap); error = write_page(buf, offset, bio_chain); if (error) return error; handle->cur->entries[handle->k++] = offset; if (handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); - if (error) - goto out; - offset = alloc_swap_page(root_swap, handle->bitmap); + offset = alloc_swapdev_block(root_swap); if (!offset) return -ENOSPC; handle->cur->next_swap = offset; - error = write_page(handle->cur, handle->cur_swap, NULL); + error = write_page(handle->cur, handle->cur_swap, bio_chain); if (error) goto out; - memset(handle->cur, 0, PAGE_SIZE); + clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; + + if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { + error = hib_wait_on_bio_chain(bio_chain); + if (error) + goto out; + /* + * Recalculate the number of required free pages, to + * make sure we never take more than half. + */ + handle->reqd_free_pages = reqd_free_pages(); + } } -out: + out: return error; } @@ -273,6 +394,44 @@ static int flush_swap_writer(struct swap_map_handle *handle) return -EINVAL; } +static int swap_writer_finish(struct swap_map_handle *handle, + unsigned int flags, int error) +{ + if (!error) { + flush_swap_writer(handle); + printk(KERN_INFO "PM: S"); + error = mark_swapfiles(handle, flags); + printk("|\n"); + } + + if (error) + free_all_swap_pages(root_swap); + release_swap_writer(handle); + swsusp_close(FMODE_WRITE); + + return error; +} + +/* We need to remember how much compressed data we need to read. */ +#define LZO_HEADER sizeof(size_t) + +/* Number of pages/bytes we'll compress at one time. */ +#define LZO_UNC_PAGES 32 +#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE) + +/* Number of pages/bytes we need for compressed data (worst case). */ +#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ + LZO_HEADER, PAGE_SIZE) +#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) + +/* Maximum number of threads for compression/decompression. */ +#define LZO_THREADS 3 + +/* Minimum/maximum number of pages for read buffering. */ +#define LZO_MIN_RD_PAGES 1024 +#define LZO_MAX_RD_PAGES 8192 + + /** * save_image - save the suspend image data */ @@ -283,40 +442,344 @@ static int save_image(struct swap_map_handle *handle, { unsigned int m; int ret; - int error = 0; int nr_pages; int err2; struct bio *bio; struct timeval start; struct timeval stop; - printk("Saving image data pages (%u pages) ... 
", nr_to_write); - m = nr_to_write / 100; + printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", + nr_to_write); + m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; bio = NULL; do_gettimeofday(&start); - do { - ret = snapshot_read_next(snapshot, PAGE_SIZE); - if (ret > 0) { - error = swap_write_page(handle, data_of(*snapshot), - &bio); - if (error) + while (1) { + ret = snapshot_read_next(snapshot); + if (ret <= 0) + break; + ret = swap_write_page(handle, data_of(*snapshot), &bio); + if (ret) + break; + if (!(nr_pages % m)) + printk(KERN_INFO "PM: Image saving progress: %3d%%\n", + nr_pages / m * 10); + nr_pages++; + } + err2 = hib_wait_on_bio_chain(&bio); + do_gettimeofday(&stop); + if (!ret) + ret = err2; + if (!ret) + printk(KERN_INFO "PM: Image saving done.\n"); + swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + return ret; +} + +/** + * Structure used for CRC32. + */ +struct crc_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + unsigned run_threads; /* nr current threads */ + wait_queue_head_t go; /* start crc update */ + wait_queue_head_t done; /* crc update done */ + u32 *crc32; /* points to handle's crc32 */ + size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */ + unsigned char *unc[LZO_THREADS]; /* uncompressed data */ +}; + +/** + * CRC32 update function that runs in its own thread. + */ +static int crc32_threadfn(void *data) +{ + struct crc_data *d = data; + unsigned i; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + for (i = 0; i < d->run_threads; i++) + *d->crc32 = crc32_le(*d->crc32, + d->unc[i], *d->unc_len[i]); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} +/** + * Structure used for LZO data compression. + */ +struct cmp_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + int ret; /* return code */ + wait_queue_head_t go; /* start compression */ + wait_queue_head_t done; /* compression done */ + size_t unc_len; /* uncompressed length */ + size_t cmp_len; /* compressed length */ + unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ + unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ +}; + +/** + * Compression function that runs in its own thread. + */ +static int lzo_compress_threadfn(void *data) +{ + struct cmp_data *d = data; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + d->ret = -1; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + d->ret = lzo1x_1_compress(d->unc, d->unc_len, + d->cmp + LZO_HEADER, &d->cmp_len, + d->wrk); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} + +/** + * save_image_lzo - Save the suspend image data compressed with LZO. + * @handle: Swap map handle to use for saving the image. + * @snapshot: Image to read data from. + * @nr_to_write: Number of pages to save. 
+ */ +static int save_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_write) +{ + unsigned int m; + int ret = 0; + int nr_pages; + int err2; + struct bio *bio; + struct timeval start; + struct timeval stop; + size_t off; + unsigned thr, run_threads, nr_threads; + unsigned char *page = NULL; + struct cmp_data *data = NULL; + struct crc_data *crc = NULL; + + /* + * We'll limit the number of threads for compression to limit memory + * footprint. + */ + nr_threads = num_online_cpus() - 1; + nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + + page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + ret = -ENOMEM; + goto out_clean; + } + + data = vmalloc(sizeof(*data) * nr_threads); + if (!data) { + printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + ret = -ENOMEM; + goto out_clean; + } + for (thr = 0; thr < nr_threads; thr++) + memset(&data[thr], 0, offsetof(struct cmp_data, go)); + + crc = kmalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) { + printk(KERN_ERR "PM: Failed to allocate crc\n"); + ret = -ENOMEM; + goto out_clean; + } + memset(crc, 0, offsetof(struct crc_data, go)); + + /* + * Start the compression threads. + */ + for (thr = 0; thr < nr_threads; thr++) { + init_waitqueue_head(&data[thr].go); + init_waitqueue_head(&data[thr].done); + + data[thr].thr = kthread_run(lzo_compress_threadfn, + &data[thr], + "image_compress/%u", thr); + if (IS_ERR(data[thr].thr)) { + data[thr].thr = NULL; + printk(KERN_ERR + "PM: Cannot start compression threads\n"); + ret = -ENOMEM; + goto out_clean; + } + } + + /* + * Start the CRC32 thread. + */ + init_waitqueue_head(&crc->go); + init_waitqueue_head(&crc->done); + + handle->crc32 = 0; + crc->crc32 = &handle->crc32; + for (thr = 0; thr < nr_threads; thr++) { + crc->unc[thr] = data[thr].unc; + crc->unc_len[thr] = &data[thr].unc_len; + } + + crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); + if (IS_ERR(crc->thr)) { + crc->thr = NULL; + printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + ret = -ENOMEM; + goto out_clean; + } + + /* + * Adjust the number of required free pages after all allocations have + * been done. We don't want to run out of pages when writing. 
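+	 *
+	 * The value cached by get_swap_writer() predates the thread and
+	 * buffer allocations above, so it is recomputed here to keep the
+	 * "never take more than half of the free pages" invariant accurate.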
+ */ + handle->reqd_free_pages = reqd_free_pages(); + + printk(KERN_INFO + "PM: Using %u thread(s) for compression.\n" + "PM: Compressing and saving image data (%u pages)...\n", + nr_threads, nr_to_write); + m = nr_to_write / 10; + if (!m) + m = 1; + nr_pages = 0; + bio = NULL; + do_gettimeofday(&start); + for (;;) { + for (thr = 0; thr < nr_threads; thr++) { + for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { + ret = snapshot_read_next(snapshot); + if (ret < 0) + goto out_finish; + + if (!ret) + break; + + memcpy(data[thr].unc + off, + data_of(*snapshot), PAGE_SIZE); + + if (!(nr_pages % m)) + printk(KERN_INFO + "PM: Image saving progress: " + "%3d%%\n", + nr_pages / m * 10); + nr_pages++; + } + if (!off) break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; + + data[thr].unc_len = off; + + atomic_set(&data[thr].ready, 1); + wake_up(&data[thr].go); } - } while (ret > 0); - err2 = wait_on_bio_chain(&bio); + + if (!thr) + break; + + crc->run_threads = thr; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); + + for (run_threads = thr, thr = 0; thr < run_threads; thr++) { + wait_event(data[thr].done, + atomic_read(&data[thr].stop)); + atomic_set(&data[thr].stop, 0); + + ret = data[thr].ret; + + if (ret < 0) { + printk(KERN_ERR "PM: LZO compression failed\n"); + goto out_finish; + } + + if (unlikely(!data[thr].cmp_len || + data[thr].cmp_len > + lzo1x_worst_compress(data[thr].unc_len))) { + printk(KERN_ERR + "PM: Invalid LZO compressed length\n"); + ret = -1; + goto out_finish; + } + + *(size_t *)data[thr].cmp = data[thr].cmp_len; + + /* + * Given we are writing one page at a time to disk, we + * copy that much from the buffer, although the last + * bit will likely be smaller than full page. This is + * OK - we saved the length of the compressed data, so + * any garbage at the end will be discarded when we + * read it. + */ + for (off = 0; + off < LZO_HEADER + data[thr].cmp_len; + off += PAGE_SIZE) { + memcpy(page, data[thr].cmp + off, PAGE_SIZE); + + ret = swap_write_page(handle, page, &bio); + if (ret) + goto out_finish; + } + } + + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); + } + +out_finish: + err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) - printk("\b\b\b\bdone\n"); - show_speed(&start, &stop, nr_to_write, "Wrote"); - return error; + if (!ret) + ret = err2; + if (!ret) + printk(KERN_INFO "PM: Image saving done.\n"); + swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); +out_clean: + if (crc) { + if (crc->thr) + kthread_stop(crc->thr); + kfree(crc); + } + if (data) { + for (thr = 0; thr < nr_threads; thr++) + if (data[thr].thr) + kthread_stop(data[thr].thr); + vfree(data); + } + if (page) free_page((unsigned long)page); + + return ret; } /** @@ -326,16 +789,20 @@ static int save_image(struct swap_map_handle *handle, * space avaiable from the resume partition. */ -static int enough_swap(unsigned int nr_pages) +static int enough_swap(unsigned int nr_pages, unsigned int flags) { unsigned int free_swap = count_swap_pages(root_swap, 1); + unsigned int required; + + pr_debug("PM: Free swap pages: %u\n", free_swap); - pr_debug("swsusp: free swap pages: %u\n", free_swap); - return free_swap > nr_pages + PAGES_FOR_IO; + required = PAGES_FOR_IO + nr_pages; + return free_swap > required; } /** * swsusp_write - Write entire image and metadata. + * @flags: flags to pass to the "boot" kernel in the image header * * It is important _NOT_ to umount filesystems at this point. 
We want * them synced (in case something goes wrong) but we DO not want to mark @@ -343,104 +810,45 @@ static int enough_swap(unsigned int nr_pages) * correctly, we'll mark system clean, anyway.) */ -int swsusp_write(void) +int swsusp_write(unsigned int flags) { struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; + unsigned long pages; int error; - if ((error = swsusp_swap_check())) { - printk(KERN_ERR "swsusp: Cannot find swap device, try " - "swapon -a.\n"); + pages = snapshot_get_image_size(); + error = get_swap_writer(&handle); + if (error) { + printk(KERN_ERR "PM: Cannot get swap writer\n"); return error; } - memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_read_next(&snapshot, PAGE_SIZE); - if (error < PAGE_SIZE) - return error < 0 ? error : -EFAULT; - header = (struct swsusp_info *)data_of(snapshot); - if (!enough_swap(header->pages)) { - printk(KERN_ERR "swsusp: Not enough free swap\n"); - return -ENOSPC; - } - error = get_swap_writer(&handle); - if (!error) { - unsigned long start = handle.cur_swap; - error = swap_write_page(&handle, header, NULL); - if (!error) - error = save_image(&handle, &snapshot, - header->pages - 1); - if (!error) { - flush_swap_writer(&handle); - printk("S"); - error = mark_swapfiles(swp_entry(root_swap, start)); - printk("|\n"); + if (flags & SF_NOCOMPRESS_MODE) { + if (!enough_swap(pages, flags)) { + printk(KERN_ERR "PM: Not enough free swap\n"); + error = -ENOSPC; + goto out_finish; } } - if (error) - free_all_swap_pages(root_swap, handle.bitmap); - release_swap_writer(&handle); - return error; -} - -static struct block_device *resume_bdev; - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. - */ -static int submit(int rw, pgoff_t page_off, struct page *page, - struct bio **bio_chain) -{ - struct bio *bio; - - bio = bio_alloc(GFP_ATOMIC, 1); - if (!bio) - return -ENOMEM; - bio->bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = resume_bdev; - bio->bi_end_io = end_swap_bio_read; + memset(&snapshot, 0, sizeof(struct snapshot_handle)); + error = snapshot_read_next(&snapshot); + if (error < PAGE_SIZE) { + if (error >= 0) + error = -EFAULT; - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk("swsusp: ERROR: adding page to bio at %ld\n", page_off); - bio_put(bio); - return -EFAULT; + goto out_finish; } - - lock_page(page); - bio_get(bio); - - if (bio_chain == NULL) { - submit_bio(rw | (1 << BIO_RW_SYNC), bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - get_page(page); - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(rw | (1 << BIO_RW_SYNC), bio); + header = (struct swsusp_info *)data_of(snapshot); + error = swap_write_page(&handle, header, NULL); + if (!error) { + error = (flags & SF_NOCOMPRESS_MODE) ? 
+ save_image(&handle, &snapshot, pages - 1) : + save_image_lzo(&handle, &snapshot, pages - 1); } - return 0; -} - -static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(READ, page_off, virt_to_page(addr), bio_chain); -} - -static int bio_write_page(pgoff_t page_off, void *addr) -{ - return submit(WRITE, page_off, virt_to_page(addr), NULL); +out_finish: + error = swap_writer_finish(&handle, flags, error); + return error; } /** @@ -450,56 +858,101 @@ static int bio_write_page(pgoff_t page_off, void *addr) static void release_swap_reader(struct swap_map_handle *handle) { - if (handle->cur) - free_page((unsigned long)handle->cur); + struct swap_map_page_list *tmp; + + while (handle->maps) { + if (handle->maps->map) + free_page((unsigned long)handle->maps->map); + tmp = handle->maps; + handle->maps = handle->maps->next; + kfree(tmp); + } handle->cur = NULL; } static int get_swap_reader(struct swap_map_handle *handle, - swp_entry_t start) + unsigned int *flags_p) { int error; + struct swap_map_page_list *tmp, *last; + sector_t offset; + + *flags_p = swsusp_header->flags; - if (!swp_offset(start)) + if (!swsusp_header->image) /* how can this happen? */ return -EINVAL; - handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); - if (!handle->cur) - return -ENOMEM; - error = bio_read_page(swp_offset(start), handle->cur, NULL); - if (error) { - release_swap_reader(handle); - return error; + + handle->cur = NULL; + last = handle->maps = NULL; + offset = swsusp_header->image; + while (offset) { + tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); + if (!tmp) { + release_swap_reader(handle); + return -ENOMEM; + } + memset(tmp, 0, sizeof(*tmp)); + if (!handle->maps) + handle->maps = tmp; + if (last) + last->next = tmp; + last = tmp; + + tmp->map = (struct swap_map_page *) + __get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!tmp->map) { + release_swap_reader(handle); + return -ENOMEM; + } + + error = hib_bio_read_page(offset, tmp->map, NULL); + if (error) { + release_swap_reader(handle); + return error; + } + offset = tmp->map->next_swap; } handle->k = 0; + handle->cur = handle->maps->map; return 0; } static int swap_read_page(struct swap_map_handle *handle, void *buf, struct bio **bio_chain) { - unsigned long offset; + sector_t offset; int error; + struct swap_map_page_list *tmp; if (!handle->cur) return -EINVAL; offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = bio_read_page(offset, buf, bio_chain); + error = hib_bio_read_page(offset, buf, bio_chain); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); handle->k = 0; - offset = handle->cur->next_swap; - if (!offset) + free_page((unsigned long)handle->maps->map); + tmp = handle->maps; + handle->maps = handle->maps->next; + kfree(tmp); + if (!handle->maps) release_swap_reader(handle); - else if (!error) - error = bio_read_page(offset, handle->cur, NULL); + else + handle->cur = handle->maps->map; } return error; } +static int swap_reader_finish(struct swap_map_handle *handle) +{ + release_swap_reader(handle); + + return 0; +} + /** * load_image - load the image using the swap map handle * @handle and the snapshot handle @snapshot @@ -511,79 +964,455 @@ static int load_image(struct swap_map_handle *handle, unsigned int nr_to_read) { unsigned int m; - int error = 0; + int ret = 0; struct timeval start; struct timeval stop; struct bio *bio; int err2; unsigned nr_pages; - printk("Loading image data pages (%u pages) ... 
", nr_to_read); - m = nr_to_read / 100; + printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", + nr_to_read); + m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; bio = NULL; do_gettimeofday(&start); for ( ; ; ) { - error = snapshot_write_next(snapshot, PAGE_SIZE); - if (error <= 0) + ret = snapshot_write_next(snapshot); + if (ret <= 0) break; - error = swap_read_page(handle, data_of(*snapshot), &bio); - if (error) + ret = swap_read_page(handle, data_of(*snapshot), &bio); + if (ret) break; if (snapshot->sync_read) - error = wait_on_bio_chain(&bio); - if (error) + ret = hib_wait_on_bio_chain(&bio); + if (ret) break; if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO "PM: Image loading progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } - err2 = wait_on_bio_chain(&bio); + err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) { - printk("\b\b\b\bdone\n"); - snapshot_free_unused_memory(snapshot); + if (!ret) + ret = err2; + if (!ret) { + printk(KERN_INFO "PM: Image loading done.\n"); + snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) - error = -ENODATA; + ret = -ENODATA; } - show_speed(&start, &stop, nr_to_read, "Read"); - return error; + swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + return ret; } -int swsusp_read(void) +/** + * Structure used for LZO data decompression. + */ +struct dec_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + int ret; /* return code */ + wait_queue_head_t go; /* start decompression */ + wait_queue_head_t done; /* decompression done */ + size_t unc_len; /* uncompressed length */ + size_t cmp_len; /* compressed length */ + unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ +}; + +/** + * Deompression function that runs in its own thread. + */ +static int lzo_decompress_threadfn(void *data) +{ + struct dec_data *d = data; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + d->ret = -1; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + d->unc_len = LZO_UNC_SIZE; + d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, + d->unc, &d->unc_len); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} + +/** + * load_image_lzo - Load compressed image data and decompress them with LZO. + * @handle: Swap map handle to use for loading data. + * @snapshot: Image to copy uncompressed data into. + * @nr_to_read: Number of pages to load. + */ +static int load_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_read) +{ + unsigned int m; + int ret = 0; + int eof = 0; + struct bio *bio; + struct timeval start; + struct timeval stop; + unsigned nr_pages; + size_t off; + unsigned i, thr, run_threads, nr_threads; + unsigned ring = 0, pg = 0, ring_size = 0, + have = 0, want, need, asked = 0; + unsigned long read_pages = 0; + unsigned char **page = NULL; + struct dec_data *data = NULL; + struct crc_data *crc = NULL; + + /* + * We'll limit the number of threads for decompression to limit memory + * footprint. 
+ */ + nr_threads = num_online_cpus() - 1; + nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + + page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + ret = -ENOMEM; + goto out_clean; + } + + data = vmalloc(sizeof(*data) * nr_threads); + if (!data) { + printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + ret = -ENOMEM; + goto out_clean; + } + for (thr = 0; thr < nr_threads; thr++) + memset(&data[thr], 0, offsetof(struct dec_data, go)); + + crc = kmalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) { + printk(KERN_ERR "PM: Failed to allocate crc\n"); + ret = -ENOMEM; + goto out_clean; + } + memset(crc, 0, offsetof(struct crc_data, go)); + + /* + * Start the decompression threads. + */ + for (thr = 0; thr < nr_threads; thr++) { + init_waitqueue_head(&data[thr].go); + init_waitqueue_head(&data[thr].done); + + data[thr].thr = kthread_run(lzo_decompress_threadfn, + &data[thr], + "image_decompress/%u", thr); + if (IS_ERR(data[thr].thr)) { + data[thr].thr = NULL; + printk(KERN_ERR + "PM: Cannot start decompression threads\n"); + ret = -ENOMEM; + goto out_clean; + } + } + + /* + * Start the CRC32 thread. + */ + init_waitqueue_head(&crc->go); + init_waitqueue_head(&crc->done); + + handle->crc32 = 0; + crc->crc32 = &handle->crc32; + for (thr = 0; thr < nr_threads; thr++) { + crc->unc[thr] = data[thr].unc; + crc->unc_len[thr] = &data[thr].unc_len; + } + + crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); + if (IS_ERR(crc->thr)) { + crc->thr = NULL; + printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + ret = -ENOMEM; + goto out_clean; + } + + /* + * Set the number of pages for read buffering. + * This is complete guesswork, because we'll only know the real + * picture once prepare_image() is called, which is much later on + * during the image load phase. We'll assume the worst case and + * say that none of the image pages are from high memory. + */ + if (low_free_pages() > snapshot_get_image_size()) + read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; + read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); + + for (i = 0; i < read_pages; i++) { + page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? + __GFP_WAIT | __GFP_HIGH : + __GFP_WAIT | __GFP_NOWARN | + __GFP_NORETRY); + + if (!page[i]) { + if (i < LZO_CMP_PAGES) { + ring_size = i; + printk(KERN_ERR + "PM: Failed to allocate LZO pages\n"); + ret = -ENOMEM; + goto out_clean; + } else { + break; + } + } + } + want = ring_size = i; + + printk(KERN_INFO + "PM: Using %u thread(s) for decompression.\n" + "PM: Loading and decompressing image data (%u pages)...\n", + nr_threads, nr_to_read); + m = nr_to_read / 10; + if (!m) + m = 1; + nr_pages = 0; + bio = NULL; + do_gettimeofday(&start); + + ret = snapshot_write_next(snapshot); + if (ret <= 0) + goto out_finish; + + for(;;) { + for (i = 0; !eof && i < want; i++) { + ret = swap_read_page(handle, page[ring], &bio); + if (ret) { + /* + * On real read error, finish. On end of data, + * set EOF flag and just exit the read loop. + */ + if (handle->cur && + handle->cur->entries[handle->k]) { + goto out_finish; + } else { + eof = 1; + break; + } + } + if (++ring >= ring_size) + ring = 0; + } + asked += i; + want -= i; + + /* + * We are out of data, wait for some more. 
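+		 *
+		 * 'have' counts compressed pages already landed in the
+		 * ring, 'asked' counts reads still in flight;
+		 * hib_wait_on_bio_chain() completes the outstanding reads
+		 * and makes them available.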
+ */ + if (!have) { + if (!asked) + break; + + ret = hib_wait_on_bio_chain(&bio); + if (ret) + goto out_finish; + have += asked; + asked = 0; + if (eof) + eof = 2; + } + + if (crc->run_threads) { + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); + crc->run_threads = 0; + } + + for (thr = 0; have && thr < nr_threads; thr++) { + data[thr].cmp_len = *(size_t *)page[pg]; + if (unlikely(!data[thr].cmp_len || + data[thr].cmp_len > + lzo1x_worst_compress(LZO_UNC_SIZE))) { + printk(KERN_ERR + "PM: Invalid LZO compressed length\n"); + ret = -1; + goto out_finish; + } + + need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER, + PAGE_SIZE); + if (need > have) { + if (eof > 1) { + ret = -1; + goto out_finish; + } + break; + } + + for (off = 0; + off < LZO_HEADER + data[thr].cmp_len; + off += PAGE_SIZE) { + memcpy(data[thr].cmp + off, + page[pg], PAGE_SIZE); + have--; + want++; + if (++pg >= ring_size) + pg = 0; + } + + atomic_set(&data[thr].ready, 1); + wake_up(&data[thr].go); + } + + /* + * Wait for more data while we are decompressing. + */ + if (have < LZO_CMP_PAGES && asked) { + ret = hib_wait_on_bio_chain(&bio); + if (ret) + goto out_finish; + have += asked; + asked = 0; + if (eof) + eof = 2; + } + + for (run_threads = thr, thr = 0; thr < run_threads; thr++) { + wait_event(data[thr].done, + atomic_read(&data[thr].stop)); + atomic_set(&data[thr].stop, 0); + + ret = data[thr].ret; + + if (ret < 0) { + printk(KERN_ERR + "PM: LZO decompression failed\n"); + goto out_finish; + } + + if (unlikely(!data[thr].unc_len || + data[thr].unc_len > LZO_UNC_SIZE || + data[thr].unc_len & (PAGE_SIZE - 1))) { + printk(KERN_ERR + "PM: Invalid LZO uncompressed length\n"); + ret = -1; + goto out_finish; + } + + for (off = 0; + off < data[thr].unc_len; off += PAGE_SIZE) { + memcpy(data_of(*snapshot), + data[thr].unc + off, PAGE_SIZE); + + if (!(nr_pages % m)) + printk(KERN_INFO + "PM: Image loading progress: " + "%3d%%\n", + nr_pages / m * 10); + nr_pages++; + + ret = snapshot_write_next(snapshot); + if (ret <= 0) { + crc->run_threads = thr + 1; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); + goto out_finish; + } + } + } + + crc->run_threads = thr; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); + } + +out_finish: + if (crc->run_threads) { + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); + } + do_gettimeofday(&stop); + if (!ret) { + printk(KERN_INFO "PM: Image loading done.\n"); + snapshot_write_finalize(snapshot); + if (!snapshot_image_loaded(snapshot)) + ret = -ENODATA; + if (!ret) { + if (swsusp_header->flags & SF_CRC32_MODE) { + if(handle->crc32 != swsusp_header->crc32) { + printk(KERN_ERR + "PM: Invalid image CRC32!\n"); + ret = -ENODATA; + } + } + } + } + swsusp_show_speed(&start, &stop, nr_to_read, "Read"); +out_clean: + for (i = 0; i < ring_size; i++) + free_page((unsigned long)page[i]); + if (crc) { + if (crc->thr) + kthread_stop(crc->thr); + kfree(crc); + } + if (data) { + for (thr = 0; thr < nr_threads; thr++) + if (data[thr].thr) + kthread_stop(data[thr].thr); + vfree(data); + } + if (page) vfree(page); + + return ret; +} + +/** + * swsusp_read - read the hibernation image. 
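+ *
+ * Reads the image header first and then loads the data pages either
+ * directly or through the LZO decompressor, depending on the
+ * SF_NOCOMPRESS_MODE flag recorded in the on-disk header.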
+ * @flags_p: flags passed by the "frozen" kernel in the image header should + * be written into this memory location + */ + +int swsusp_read(unsigned int *flags_p) { int error; struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; - if (IS_ERR(resume_bdev)) { - pr_debug("swsusp: block device not initialised\n"); - return PTR_ERR(resume_bdev); - } - memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_write_next(&snapshot, PAGE_SIZE); + error = snapshot_write_next(&snapshot); if (error < PAGE_SIZE) return error < 0 ? error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); - error = get_swap_reader(&handle, swsusp_header.image); + error = get_swap_reader(&handle, flags_p); + if (error) + goto end; if (!error) error = swap_read_page(&handle, header, NULL); + if (!error) { + error = (*flags_p & SF_NOCOMPRESS_MODE) ? + load_image(&handle, &snapshot, header->pages - 1) : + load_image_lzo(&handle, &snapshot, header->pages - 1); + } + swap_reader_finish(&handle); +end: if (!error) - error = load_image(&handle, &snapshot, header->pages - 1); - release_swap_reader(&handle); - - blkdev_put(resume_bdev); - - if (!error) - pr_debug("swsusp: Reading resume file was successful\n"); + pr_debug("PM: Image successfully loaded\n"); else - pr_debug("swsusp: Error %d resuming\n", error); + pr_debug("PM: Error %d resuming\n", error); return error; } @@ -595,29 +1424,36 @@ int swsusp_check(void) { int error; - resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); - if (!IS_ERR(resume_bdev)) { - set_blocksize(resume_bdev, PAGE_SIZE); - memset(&swsusp_header, 0, sizeof(swsusp_header)); - if ((error = bio_read_page(0, &swsusp_header, NULL))) - return error; - if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { - memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + FMODE_READ, NULL); + if (!IS_ERR(hib_resume_bdev)) { + set_blocksize(hib_resume_bdev, PAGE_SIZE); + clear_page(swsusp_header); + error = hib_bio_read_page(swsusp_resume_block, + swsusp_header, NULL); + if (error) + goto put; + + if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { + memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = bio_write_page(0, &swsusp_header); + error = hib_bio_write_page(swsusp_resume_block, + swsusp_header, NULL); } else { - return -EINVAL; + error = -EINVAL; } + +put: if (error) - blkdev_put(resume_bdev); + blkdev_put(hib_resume_bdev, FMODE_READ); else - pr_debug("swsusp: Signature found, resuming\n"); + pr_debug("PM: Image signature found, resuming\n"); } else { - error = PTR_ERR(resume_bdev); + error = PTR_ERR(hib_resume_bdev); } if (error) - pr_debug("swsusp: Error %d check for resume file\n", error); + pr_debug("PM: Image not found (code %d)\n", error); return error; } @@ -626,12 +1462,50 @@ int swsusp_check(void) * swsusp_close - close swap device. 
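+ * @mode: mode the resume device was opened with (FMODE_READ or
+ *	FMODE_WRITE), passed through to blkdev_put().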
*/ -void swsusp_close(void) +void swsusp_close(fmode_t mode) { - if (IS_ERR(resume_bdev)) { - pr_debug("swsusp: block device not initialised\n"); + if (IS_ERR(hib_resume_bdev)) { + pr_debug("PM: Image device not initialised\n"); return; } - blkdev_put(resume_bdev); + blkdev_put(hib_resume_bdev, mode); +} + +/** + * swsusp_unmark - Unmark swsusp signature in the resume device + */ + +#ifdef CONFIG_SUSPEND +int swsusp_unmark(void) +{ + int error; + + hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { + memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); + error = hib_bio_write_page(swsusp_resume_block, + swsusp_header, NULL); + } else { + printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); + error = -ENODEV; + } + + /* + * We just returned from suspend, we don't need the image any more. + */ + free_all_swap_pages(root_swap); + + return error; } +#endif + +static int swsusp_header_init(void) +{ + swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); + if (!swsusp_header) + panic("Could not allocate memory for swsusp_header\n"); + return 0; +} + +core_initcall(swsusp_header_init); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c deleted file mode 100644 index 0b66659dc51..00000000000 --- a/kernel/power/swsusp.c +++ /dev/null @@ -1,287 +0,0 @@ -/* - * linux/kernel/power/swsusp.c - * - * This file provides code to write suspend image to swap and read it back. - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> - * - * This file is released under the GPLv2. - * - * I'd like to thank the following people for their work: - * - * Pavel Machek <pavel@ucw.cz>: - * Modifications, defectiveness pointing, being with me at the very beginning, - * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. - * - * Steve Doddi <dirk@loth.demon.co.uk>: - * Support the possibility of hardware state restoring. - * - * Raph <grey.havens@earthling.net>: - * Support for preserving states of network devices and virtual console - * (including X and svgatextmode) - * - * Kurt Garloff <garloff@suse.de>: - * Straightened the critical function in order to prevent compilers from - * playing tricks with local variables. - * - * Andreas Mohr <a.mohr@mailto.de> - * - * Alex Badea <vampire@go.ro>: - * Fixed runaway init - * - * Rafael J. Wysocki <rjw@sisk.pl> - * Reworked the freeing of memory and the handling of swap - * - * More state savers are welcome. Especially for the scsi layer... - * - * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt - */ - -#include <linux/mm.h> -#include <linux/suspend.h> -#include <linux/spinlock.h> -#include <linux/kernel.h> -#include <linux/major.h> -#include <linux/swap.h> -#include <linux/pm.h> -#include <linux/swapops.h> -#include <linux/bootmem.h> -#include <linux/syscalls.h> -#include <linux/highmem.h> - -#include "power.h" - -/* - * Preferred image size in bytes (tunable via /sys/power/image_size). - * When it is set to N, swsusp will do its best to ensure the image - * size will not exceed N bytes, but if that is impossible, it will - * try to create the smallest image possible. 
- */ -unsigned long image_size = 500 * 1024 * 1024; - -int in_suspend __nosavedata = 0; - -#ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void); -int save_highmem(void); -int restore_highmem(void); -#else -static inline int save_highmem(void) { return 0; } -static inline int restore_highmem(void) { return 0; } -static inline unsigned int count_highmem_pages(void) { return 0; } -#endif - -/** - * The following functions are used for tracing the allocated - * swap pages, so that they can be freed in case of an error. - * - * The functions operate on a linked bitmap structure defined - * in power.h - */ - -void free_bitmap(struct bitmap_page *bitmap) -{ - struct bitmap_page *bp; - - while (bitmap) { - bp = bitmap->next; - free_page((unsigned long)bitmap); - bitmap = bp; - } -} - -struct bitmap_page *alloc_bitmap(unsigned int nr_bits) -{ - struct bitmap_page *bitmap, *bp; - unsigned int n; - - if (!nr_bits) - return NULL; - - bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); - bp = bitmap; - for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) { - bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); - bp = bp->next; - if (!bp) { - free_bitmap(bitmap); - return NULL; - } - } - return bitmap; -} - -static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit) -{ - unsigned int n; - - n = BITMAP_PAGE_BITS; - while (bitmap && n <= bit) { - n += BITMAP_PAGE_BITS; - bitmap = bitmap->next; - } - if (!bitmap) - return -EINVAL; - n -= BITMAP_PAGE_BITS; - bit -= n; - n = 0; - while (bit >= BITS_PER_CHUNK) { - bit -= BITS_PER_CHUNK; - n++; - } - bitmap->chunks[n] |= (1UL << bit); - return 0; -} - -unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) -{ - unsigned long offset; - - offset = swp_offset(get_swap_page_of_type(swap)); - if (offset) { - if (bitmap_set(bitmap, offset)) { - swap_free(swp_entry(swap, offset)); - offset = 0; - } - } - return offset; -} - -void free_all_swap_pages(int swap, struct bitmap_page *bitmap) -{ - unsigned int bit, n; - unsigned long test; - - bit = 0; - while (bitmap) { - for (n = 0; n < BITMAP_PAGE_CHUNKS; n++) - for (test = 1UL; test; test <<= 1) { - if (bitmap->chunks[n] & test) - swap_free(swp_entry(swap, bit)); - bit++; - } - bitmap = bitmap->next; - } -} - -/** - * swsusp_shrink_memory - Try to free as much memory as needed - * - * ... but do not OOM-kill anyone - * - * Notice: all userland should be stopped before it is called, or - * livelock is possible. - */ - -#define SHRINK_BITE 10000 -static inline unsigned long __shrink_memory(long tmp) -{ - if (tmp > SHRINK_BITE) - tmp = SHRINK_BITE; - return shrink_all_memory(tmp); -} - -int swsusp_shrink_memory(void) -{ - long size, tmp; - struct zone *zone; - unsigned long pages = 0; - unsigned int i = 0; - char *p = "-\\|/"; - - printk("Shrinking memory... 
"); - do { - size = 2 * count_highmem_pages(); - size += size / 50 + count_data_pages() + PAGES_FOR_IO; - tmp = size; - for_each_zone (zone) - if (!is_highmem(zone) && populated_zone(zone)) { - tmp -= zone->free_pages; - tmp += zone->lowmem_reserve[ZONE_NORMAL]; - tmp += snapshot_additional_pages(zone); - } - if (tmp > 0) { - tmp = __shrink_memory(tmp); - if (!tmp) - return -ENOMEM; - pages += tmp; - } else if (size > image_size / PAGE_SIZE) { - tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); - pages += tmp; - } - printk("\b%c", p[i++%4]); - } while (tmp > 0); - printk("\bdone (%lu pages freed)\n", pages); - - return 0; -} - -int swsusp_suspend(void) -{ - int error; - - if ((error = arch_prepare_suspend())) - return error; - local_irq_disable(); - /* At this point, device_suspend() has been called, but *not* - * device_power_down(). We *must* device_power_down() now. - * Otherwise, drivers for some devices (e.g. interrupt controllers) - * become desynchronized with the actual state of the hardware - * at resume time, and evil weirdness ensues. - */ - if ((error = device_power_down(PMSG_FREEZE))) { - printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); - goto Enable_irqs; - } - - if ((error = save_highmem())) { - printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); - goto Restore_highmem; - } - - save_processor_state(); - if ((error = swsusp_arch_suspend())) - printk(KERN_ERR "Error %d suspending\n", error); - /* Restore control flow magically appears here */ - restore_processor_state(); -Restore_highmem: - restore_highmem(); - /* NOTE: device_power_up() is just a resume() for devices - * that suspended with irqs off ... no overall powerup. - */ - device_power_up(); -Enable_irqs: - local_irq_enable(); - return error; -} - -int swsusp_resume(void) -{ - int error; - - local_irq_disable(); - /* NOTE: device_power_down() is just a suspend() with irqs off; - * it has no special "power things down" semantics - */ - if (device_power_down(PMSG_PRETHAW)) - printk(KERN_ERR "Some devices failed to power down, very bad\n"); - /* We'll ignore saved state, but this gets preempt count (etc) right */ - save_processor_state(); - error = swsusp_arch_resume(); - /* Code below is only ever reached in case of failure. 
Otherwise - * execution continues at place where swsusp_arch_suspend was called - */ - BUG_ON(!error); - /* The only reason why swsusp_arch_resume() can fail is memory being - * very tight, so we have to free it as soon as we can to avoid - * subsequent failures - */ - swsusp_free(); - restore_processor_state(); - restore_highmem(); - touch_softlockup_watchdog(); - device_power_up(); - local_irq_enable(); - return error; -} diff --git a/kernel/power/user.c b/kernel/power/user.c index 72825c853cd..526e8911460 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -11,6 +11,7 @@ #include <linux/suspend.h> #include <linux/syscalls.h> +#include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> #include <linux/miscdevice.h> @@ -19,68 +20,115 @@ #include <linux/swapops.h> #include <linux/pm.h> #include <linux/fs.h> +#include <linux/compat.h> +#include <linux/console.h> #include <linux/cpu.h> +#include <linux/freezer.h> #include <asm/uaccess.h> #include "power.h" + #define SNAPSHOT_MINOR 231 static struct snapshot_data { struct snapshot_handle handle; int swap; - struct bitmap_page *bitmap; int mode; - char frozen; - char ready; + bool frozen; + bool ready; + bool platform_support; + bool free_bitmaps; } snapshot_state; -static atomic_t device_available = ATOMIC_INIT(1); +atomic_t snapshot_device_available = ATOMIC_INIT(1); static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; + int error; - if (!atomic_add_unless(&device_available, -1, 0)) - return -EBUSY; + if (!hibernation_available()) + return -EPERM; - if ((filp->f_flags & O_ACCMODE) == O_RDWR) - return -ENOSYS; + lock_system_sleep(); + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } + + if ((filp->f_flags & O_ACCMODE) == O_RDWR) { + atomic_inc(&snapshot_device_available); + error = -ENOSYS; + goto Unlock; + } nonseekable_open(inode, filp); data = &snapshot_state; filp->private_data = data; memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { - data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; + /* Hibernating. The image device should be accessible. */ + data->swap = swsusp_resume_device ? + swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; + data->free_bitmaps = false; + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + if (error) + pm_notifier_call_chain(PM_POST_HIBERNATION); } else { + /* + * Resuming. We may need to wait for the image device to + * appear. 
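+		 *
+		 * wait_for_device_probe() blocks until pending device
+		 * probes have completed, so a resume device sitting behind
+		 * a slow-to-probe bus is not missed.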
+ */ + wait_for_device_probe(); + data->swap = -1; data->mode = O_WRONLY; + error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + if (!error) { + error = create_basic_memory_bitmaps(); + data->free_bitmaps = !error; + } + if (error) + pm_notifier_call_chain(PM_POST_RESTORE); } - data->bitmap = NULL; - data->frozen = 0; - data->ready = 0; + if (error) + atomic_inc(&snapshot_device_available); - return 0; + data->frozen = false; + data->ready = false; + data->platform_support = false; + + Unlock: + unlock_system_sleep(); + + return error; } static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; + lock_system_sleep(); + swsusp_free(); data = filp->private_data; - free_all_swap_pages(data->swap, data->bitmap); - free_bitmap(data->bitmap); + free_all_swap_pages(data->swap); if (data->frozen) { - down(&pm_sem); + pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); thaw_processes(); - enable_nonboot_cpus(); - up(&pm_sem); + } else if (data->free_bitmaps) { + free_basic_memory_bitmaps(); } - atomic_inc(&device_available); + pm_notifier_call_chain(data->mode == O_RDONLY ? + PM_POST_HIBERNATION : PM_POST_RESTORE); + atomic_inc(&snapshot_device_available); + + unlock_system_sleep(); + return 0; } @@ -89,15 +137,31 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, { struct snapshot_data *data; ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; + + lock_system_sleep(); data = filp->private_data; - res = snapshot_read_next(&data->handle, count); - if (res > 0) { - if (copy_to_user(buf, data_of(data->handle), res)) - res = -EFAULT; - else - *offp = data->handle.offset; + if (!data->ready) { + res = -ENODATA; + goto Unlock; } + if (!pg_offp) { /* on page boundary? */ + res = snapshot_read_next(&data->handle); + if (res <= 0) + goto Unlock; + } else { + res = PAGE_SIZE - pg_offp; + } + + res = simple_read_from_buffer(buf, count, &pg_offp, + data_of(data->handle), res); + if (res > 0) + *offp += res; + + Unlock: + unlock_system_sleep(); + return res; } @@ -106,24 +170,37 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, { struct snapshot_data *data; ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; + + lock_system_sleep(); data = filp->private_data; - res = snapshot_write_next(&data->handle, count); - if (res > 0) { - if (copy_from_user(data_of(data->handle), buf, res)) - res = -EFAULT; - else - *offp = data->handle.offset; + + if (!pg_offp) { + res = snapshot_write_next(&data->handle); + if (res <= 0) + goto unlock; + } else { + res = PAGE_SIZE - pg_offp; } + + res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, + buf, count); + if (res > 0) + *offp += res; +unlock: + unlock_system_sleep(); + return res; } -static int snapshot_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static long snapshot_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) { int error = 0; struct snapshot_data *data; - loff_t offset, avail; + loff_t size; + sector_t offset; if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) return -ENOTTY; @@ -132,6 +209,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!mutex_trylock(&pm_mutex)) + return -EBUSY; + + lock_device_hotplug(); data = filp->private_data; switch (cmd) { @@ -139,101 +220,98 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, case SNAPSHOT_FREEZE: if (data->frozen) break; - down(&pm_sem); - error = disable_nonboot_cpus(); - if (!error) { - 
error = freeze_processes(); - if (error) { - thaw_processes(); - error = -EBUSY; - } - } - enable_nonboot_cpus(); - up(&pm_sem); - if (!error) - data->frozen = 1; + + printk("Syncing filesystems ... "); + sys_sync(); + printk("done.\n"); + + error = freeze_processes(); + if (error) + break; + + error = create_basic_memory_bitmaps(); + if (error) + thaw_processes(); + else + data->frozen = true; + break; case SNAPSHOT_UNFREEZE: - if (!data->frozen) + if (!data->frozen || data->ready) break; - down(&pm_sem); + pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); + data->free_bitmaps = false; thaw_processes(); - enable_nonboot_cpus(); - up(&pm_sem); - data->frozen = 0; + data->frozen = false; break; - case SNAPSHOT_ATOMIC_SNAPSHOT: + case SNAPSHOT_CREATE_IMAGE: if (data->mode != O_RDONLY || !data->frozen || data->ready) { error = -EPERM; break; } - down(&pm_sem); - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); + pm_restore_gfp_mask(); + error = hibernation_snapshot(data->platform_support); if (!error) { - error = device_suspend(PMSG_FREEZE); - if (!error) { - in_suspend = 1; - error = swsusp_suspend(); - device_resume(); - } + error = put_user(in_suspend, (int __user *)arg); + data->ready = !freezer_test_done && !error; + freezer_test_done = false; } - up(&pm_sem); - if (!error) - error = put_user(in_suspend, (unsigned int __user *)arg); - if (!error) - data->ready = 1; break; case SNAPSHOT_ATOMIC_RESTORE: + snapshot_write_finalize(&data->handle); if (data->mode != O_WRONLY || !data->frozen || !snapshot_image_loaded(&data->handle)) { error = -EPERM; break; } - snapshot_free_unused_memory(&data->handle); - down(&pm_sem); - pm_prepare_console(); - error = device_suspend(PMSG_PRETHAW); - if (!error) { - error = swsusp_resume(); - device_resume(); - } - pm_restore_console(); - up(&pm_sem); + error = hibernation_restore(data->platform_support); break; case SNAPSHOT_FREE: swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); - data->ready = 0; + data->ready = false; + /* + * It is necessary to thaw kernel threads here, because + * SNAPSHOT_CREATE_IMAGE may be invoked directly after + * SNAPSHOT_FREE. In that case, if kernel threads were not + * thawed, the preallocation of memory carried out by + * hibernation_snapshot() might run into problems (i.e. it + * might fail or even deadlock). 
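+		 *
+		 * Note that only kernel threads are thawed here; user
+		 * space stays frozen until SNAPSHOT_UNFREEZE or until the
+		 * device is closed.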
+ */ + thaw_kernel_threads(); break; - case SNAPSHOT_SET_IMAGE_SIZE: + case SNAPSHOT_PREF_IMAGE_SIZE: image_size = arg; break; - case SNAPSHOT_AVAIL_SWAP: - avail = count_swap_pages(data->swap, 1); - avail <<= PAGE_SHIFT; - error = put_user(avail, (loff_t __user *)arg); + case SNAPSHOT_GET_IMAGE_SIZE: + if (!data->ready) { + error = -ENODATA; + break; + } + size = snapshot_get_image_size(); + size <<= PAGE_SHIFT; + error = put_user(size, (loff_t __user *)arg); + break; + + case SNAPSHOT_AVAIL_SWAP_SIZE: + size = count_swap_pages(data->swap, 1); + size <<= PAGE_SHIFT; + error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_GET_SWAP_PAGE: + case SNAPSHOT_ALLOC_SWAP_PAGE: if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { error = -ENODEV; break; } - if (!data->bitmap) { - data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0)); - if (!data->bitmap) { - error = -ENOMEM; - break; - } - } - offset = alloc_swap_page(data->swap, data->bitmap); + offset = alloc_swapdev_block(data->swap); if (offset) { offset <<= PAGE_SHIFT; error = put_user(offset, (loff_t __user *)arg); @@ -247,81 +325,143 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -ENODEV; break; } - free_all_swap_pages(data->swap, data->bitmap); - free_bitmap(data->bitmap); - data->bitmap = NULL; + free_all_swap_pages(data->swap); + break; + + case SNAPSHOT_S2RAM: + if (!data->frozen) { + error = -EPERM; + break; + } + /* + * Tasks are frozen and the notifiers have been called with + * PM_HIBERNATION_PREPARE + */ + error = suspend_devices_and_enter(PM_SUSPEND_MEM); + data->ready = false; + break; + + case SNAPSHOT_PLATFORM_SUPPORT: + data->platform_support = !!arg; break; - case SNAPSHOT_SET_SWAP_FILE: - if (!data->bitmap) { + case SNAPSHOT_POWER_OFF: + if (data->platform_support) + error = hibernation_platform_enter(); + break; + + case SNAPSHOT_SET_SWAP_AREA: + if (swsusp_swap_in_use()) { + error = -EPERM; + } else { + struct resume_swap_area swap_area; + dev_t swdev; + + error = copy_from_user(&swap_area, (void __user *)arg, + sizeof(struct resume_swap_area)); + if (error) { + error = -EFAULT; + break; + } + /* * User space encodes device types as two-byte values, * so we need to recode them */ - if (old_decode_dev(arg)) { - data->swap = swap_type_of(old_decode_dev(arg)); + swdev = new_decode_dev(swap_area.dev); + if (swdev) { + offset = swap_area.offset; + data->swap = swap_type_of(swdev, offset, NULL); if (data->swap < 0) error = -ENODEV; } else { data->swap = -1; error = -EINVAL; } - } else { - error = -EPERM; } break; - case SNAPSHOT_S2RAM: - if (!data->frozen) { - error = -EPERM; - break; - } + default: + error = -ENOTTY; - if (down_trylock(&pm_sem)) { - error = -EBUSY; - break; - } + } - if (pm_ops->prepare) { - error = pm_ops->prepare(PM_SUSPEND_MEM); - if (error) - goto OutS3; - } + unlock_device_hotplug(); + mutex_unlock(&pm_mutex); - /* Put devices to sleep */ - error = device_suspend(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "Failed to suspend some devices.\n"); - } else { - /* Enter S3, system is already frozen */ - suspend_enter(PM_SUSPEND_MEM); + return error; +} - /* Wake up devices */ - device_resume(); - } +#ifdef CONFIG_COMPAT - if (pm_ops->finish) - pm_ops->finish(PM_SUSPEND_MEM); +struct compat_resume_swap_area { + compat_loff_t offset; + u32 dev; +} __packed; -OutS3: - up(&pm_sem); - break; +static long +snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t)); - default: - error = 
-ENOTTY; + switch (cmd) { + case SNAPSHOT_GET_IMAGE_SIZE: + case SNAPSHOT_AVAIL_SWAP_SIZE: + case SNAPSHOT_ALLOC_SWAP_PAGE: { + compat_loff_t __user *uoffset = compat_ptr(arg); + loff_t offset; + mm_segment_t old_fs; + int err; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = snapshot_ioctl(file, cmd, (unsigned long) &offset); + set_fs(old_fs); + if (!err && put_user(offset, uoffset)) + err = -EFAULT; + return err; + } + case SNAPSHOT_CREATE_IMAGE: + return snapshot_ioctl(file, cmd, + (unsigned long) compat_ptr(arg)); + + case SNAPSHOT_SET_SWAP_AREA: { + struct compat_resume_swap_area __user *u_swap_area = + compat_ptr(arg); + struct resume_swap_area swap_area; + mm_segment_t old_fs; + int err; + + err = get_user(swap_area.offset, &u_swap_area->offset); + err |= get_user(swap_area.dev, &u_swap_area->dev); + if (err) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA, + (unsigned long) &swap_area); + set_fs(old_fs); + return err; } - return error; + default: + return snapshot_ioctl(file, cmd, arg); + } } -static struct file_operations snapshot_fops = { +#endif /* CONFIG_COMPAT */ + +static const struct file_operations snapshot_fops = { .open = snapshot_open, .release = snapshot_release, .read = snapshot_read, .write = snapshot_write, .llseek = no_llseek, - .ioctl = snapshot_ioctl, + .unlocked_ioctl = snapshot_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = snapshot_compat_ioctl, +#endif }; static struct miscdevice snapshot_device = { diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c new file mode 100644 index 00000000000..019069c84ff --- /dev/null +++ b/kernel/power/wakelock.c @@ -0,0 +1,268 @@ +/* + * kernel/power/wakelock.c + * + * User space wakeup sources support. + * + * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> + * + * This code is based on the analogous interface allowing user space to + * manipulate wakelocks on Android. 
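+ *
+ * The interface consists of the /sys/power/wake_lock and
+ * /sys/power/wake_unlock attributes; pm_wake_lock() and
+ * pm_wake_unlock() below implement their store operations.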
+ */ + +#include <linux/capability.h> +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/slab.h> + +#include "power.h" + +static DEFINE_MUTEX(wakelocks_lock); + +struct wakelock { + char *name; + struct rb_node node; + struct wakeup_source ws; +#ifdef CONFIG_PM_WAKELOCKS_GC + struct list_head lru; +#endif +}; + +static struct rb_root wakelocks_tree = RB_ROOT; + +ssize_t pm_show_wakelocks(char *buf, bool show_active) +{ + struct rb_node *node; + struct wakelock *wl; + char *str = buf; + char *end = buf + PAGE_SIZE; + + mutex_lock(&wakelocks_lock); + + for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { + wl = rb_entry(node, struct wakelock, node); + if (wl->ws.active == show_active) + str += scnprintf(str, end - str, "%s ", wl->name); + } + if (str > buf) + str--; + + str += scnprintf(str, end - str, "\n"); + + mutex_unlock(&wakelocks_lock); + return (str - buf); +} + +#if CONFIG_PM_WAKELOCKS_LIMIT > 0 +static unsigned int number_of_wakelocks; + +static inline bool wakelocks_limit_exceeded(void) +{ + return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT; +} + +static inline void increment_wakelocks_number(void) +{ + number_of_wakelocks++; +} + +static inline void decrement_wakelocks_number(void) +{ + number_of_wakelocks--; +} +#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */ +static inline bool wakelocks_limit_exceeded(void) { return false; } +static inline void increment_wakelocks_number(void) {} +static inline void decrement_wakelocks_number(void) {} +#endif /* CONFIG_PM_WAKELOCKS_LIMIT */ + +#ifdef CONFIG_PM_WAKELOCKS_GC +#define WL_GC_COUNT_MAX 100 +#define WL_GC_TIME_SEC 300 + +static LIST_HEAD(wakelocks_lru_list); +static unsigned int wakelocks_gc_count; + +static inline void wakelocks_lru_add(struct wakelock *wl) +{ + list_add(&wl->lru, &wakelocks_lru_list); +} + +static inline void wakelocks_lru_most_recent(struct wakelock *wl) +{ + list_move(&wl->lru, &wakelocks_lru_list); +} + +static void wakelocks_gc(void) +{ + struct wakelock *wl, *aux; + ktime_t now; + + if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) + return; + + now = ktime_get(); + list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { + u64 idle_time_ns; + bool active; + + spin_lock_irq(&wl->ws.lock); + idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); + active = wl->ws.active; + spin_unlock_irq(&wl->ws.lock); + + if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) + break; + + if (!active) { + wakeup_source_remove(&wl->ws); + rb_erase(&wl->node, &wakelocks_tree); + list_del(&wl->lru); + kfree(wl->name); + kfree(wl); + decrement_wakelocks_number(); + } + } + wakelocks_gc_count = 0; +} +#else /* !CONFIG_PM_WAKELOCKS_GC */ +static inline void wakelocks_lru_add(struct wakelock *wl) {} +static inline void wakelocks_lru_most_recent(struct wakelock *wl) {} +static inline void wakelocks_gc(void) {} +#endif /* !CONFIG_PM_WAKELOCKS_GC */ + +static struct wakelock *wakelock_lookup_add(const char *name, size_t len, + bool add_if_not_found) +{ + struct rb_node **node = &wakelocks_tree.rb_node; + struct rb_node *parent = *node; + struct wakelock *wl; + + while (*node) { + int diff; + + parent = *node; + wl = rb_entry(*node, struct wakelock, node); + diff = strncmp(name, wl->name, len); + if (diff == 0) { + if (wl->name[len]) + diff = -1; + else + return wl; + } + if (diff < 0) + node = &(*node)->rb_left; + else + node = &(*node)->rb_right; + } + if (!add_if_not_found) + 
return ERR_PTR(-EINVAL); + + if (wakelocks_limit_exceeded()) + return ERR_PTR(-ENOSPC); + + /* Not found, we have to add a new one. */ + wl = kzalloc(sizeof(*wl), GFP_KERNEL); + if (!wl) + return ERR_PTR(-ENOMEM); + + wl->name = kstrndup(name, len, GFP_KERNEL); + if (!wl->name) { + kfree(wl); + return ERR_PTR(-ENOMEM); + } + wl->ws.name = wl->name; + wakeup_source_add(&wl->ws); + rb_link_node(&wl->node, parent, node); + rb_insert_color(&wl->node, &wakelocks_tree); + wakelocks_lru_add(wl); + increment_wakelocks_number(); + return wl; +} + +int pm_wake_lock(const char *buf) +{ + const char *str = buf; + struct wakelock *wl; + u64 timeout_ns = 0; + size_t len; + int ret = 0; + + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + + while (*str && !isspace(*str)) + str++; + + len = str - buf; + if (!len) + return -EINVAL; + + if (*str && *str != '\n') { + /* Find out if there's a valid timeout string appended. */ + ret = kstrtou64(skip_spaces(str), 10, &timeout_ns); + if (ret) + return -EINVAL; + } + + mutex_lock(&wakelocks_lock); + + wl = wakelock_lookup_add(buf, len, true); + if (IS_ERR(wl)) { + ret = PTR_ERR(wl); + goto out; + } + if (timeout_ns) { + u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; + + do_div(timeout_ms, NSEC_PER_MSEC); + __pm_wakeup_event(&wl->ws, timeout_ms); + } else { + __pm_stay_awake(&wl->ws); + } + + wakelocks_lru_most_recent(wl); + + out: + mutex_unlock(&wakelocks_lock); + return ret; +} + +int pm_wake_unlock(const char *buf) +{ + struct wakelock *wl; + size_t len; + int ret = 0; + + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + + len = strlen(buf); + if (!len) + return -EINVAL; + + if (buf[len-1] == '\n') + len--; + + if (!len) + return -EINVAL; + + mutex_lock(&wakelocks_lock); + + wl = wakelock_lookup_add(buf, len, false); + if (IS_ERR(wl)) { + ret = PTR_ERR(wl); + goto out; + } + __pm_relax(&wl->ws); + + wakelocks_lru_most_recent(wl); + wakelocks_gc(); + + out: + mutex_unlock(&wakelocks_lock); + return ret; +} |
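For reference, the user-space side of the wakelock interface added above is a plain sysfs write. The sketch below is illustrative only: it assumes CONFIG_PM_WAKELOCKS is enabled (so /sys/power/wake_lock and /sys/power/wake_unlock exist), a caller with CAP_BLOCK_SUSPEND, and an arbitrary lock name "example"; the optional second field is a timeout in nanoseconds, as parsed by pm_wake_lock() above.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int sysfs_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0) {
		perror(path);
		return -1;
	}
	n = write(fd, val, strlen(val));
	if (n < 0)
		perror(path);
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* Take a wakelock named "example" with a 5 s timeout; the
	 * second field is the timeout in nanoseconds and may be
	 * omitted for a lock with no timeout. */
	if (sysfs_write("/sys/power/wake_lock", "example 5000000000"))
		return 1;

	/* ... work that must not race with autosleep ... */

	/* Drop the lock early instead of waiting for the timeout. */
	return sysfs_write("/sys/power/wake_unlock", "example") ? 1 : 0;
}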