diff options
Diffstat (limited to 'kernel/power')
| -rw-r--r-- | kernel/power/Kconfig | 307 | ||||
| -rw-r--r-- | kernel/power/Makefile | 18 | ||||
| -rw-r--r-- | kernel/power/autosleep.c | 128 | ||||
| -rw-r--r-- | kernel/power/block_io.c | 103 | ||||
| -rw-r--r-- | kernel/power/console.c | 155 | ||||
| -rw-r--r-- | kernel/power/disk.c | 859 | ||||
| -rw-r--r-- | kernel/power/hibernate.c | 1168 | ||||
| -rw-r--r-- | kernel/power/main.c | 712 | ||||
| -rw-r--r-- | kernel/power/pm.c | 205 | ||||
| -rw-r--r-- | kernel/power/power.h | 129 | ||||
| -rw-r--r-- | kernel/power/poweroff.c | 12 | ||||
| -rw-r--r-- | kernel/power/process.c | 358 | ||||
| -rw-r--r-- | kernel/power/qos.c | 601 | ||||
| -rw-r--r-- | kernel/power/snapshot.c | 1145 | ||||
| -rw-r--r-- | kernel/power/suspend.c | 443 | ||||
| -rw-r--r-- | kernel/power/suspend_test.c | 185 | ||||
| -rw-r--r-- | kernel/power/swap.c | 1377 | ||||
| -rw-r--r-- | kernel/power/swsusp.c | 264 | ||||
| -rw-r--r-- | kernel/power/user.c | 324 | ||||
| -rw-r--r-- | kernel/power/wakelock.c | 268 |
20 files changed, 5977 insertions, 2784 deletions
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 6233f3b4ae6..9a83d780fac 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,103 +1,6 @@ -config PM - bool "Power Management support" - depends on !IA64_HP_SIM - ---help--- - "Power Management" means that parts of your computer are shut - off or put into a power conserving "sleep" mode if they are not - being used. There are two competing standards for doing this: APM - and ACPI. If you want to use either one, say Y here and then also - to the requisite support below. - - Power Management is most important for battery powered laptop - computers; if you have a laptop, check out the Linux Laptop home - page on the WWW at <http://www.linux-on-laptops.com/> or - Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/> - and the Battery Powered Linux mini-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. - - Note that, even if you say N here, Linux on the x86 architecture - will issue the hlt instruction if nothing is to be done, thereby - sending the processor to sleep and saving power. - -config PM_LEGACY - bool "Legacy Power Management API (DEPRECATED)" - depends on PM - default n - ---help--- - Support for pm_register() and friends. This old API is obsoleted - by the driver model. - - If unsure, say N. - -config PM_DEBUG - bool "Power Management Debug Support" - depends on PM - ---help--- - This option enables various debugging support in the Power Management - code. This is helpful when debugging and reporting PM bugs, like - suspend support. - -config PM_VERBOSE - bool "Verbose Power Management debugging" - depends on PM_DEBUG - default n - ---help--- - This option enables verbose messages from the Power Management code. - -config CAN_PM_TRACE - def_bool y - depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL - -config PM_TRACE - bool - help - This enables code to save the last PM event point across - reboot. The architecture needs to support this, x86 for - example does by saving things in the RTC, see below. - - The architecture specific code must provide the extern - functions from <linux/resume-trace.h> as well as the - <asm/resume-trace.h> header with a TRACE_RESUME() macro. - - The way the information is presented is architecture- - dependent, x86 will print the information during a - late_initcall. - -config PM_TRACE_RTC - bool "Suspend/resume event tracing" - depends on CAN_PM_TRACE - depends on X86 - select PM_TRACE - default n - ---help--- - This enables some cheesy code to save the last PM event point in the - RTC across reboots, so that you can debug a machine that just hangs - during suspend (or more commonly, during resume). - - To use this debugging feature you should attempt to suspend the - machine, reboot it and then run - - dmesg -s 1000000 | grep 'hash matches' - - CAUTION: this option will cause your machine's real-time clock to be - set to an invalid time after a resume. - -config PM_SLEEP_SMP - bool - depends on SMP - depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE - depends on PM_SLEEP - select HOTPLUG_CPU - default y - -config PM_SLEEP - bool - depends on SUSPEND || HIBERNATION - default y - config SUSPEND bool "Suspend to RAM and standby" - depends on PM && ARCH_SUSPEND_POSSIBLE + depends on ARCH_SUSPEND_POSSIBLE default y ---help--- Allow the system to enter sleep states in which main memory is @@ -115,9 +18,16 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. +config HIBERNATE_CALLBACKS + bool + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" - depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE + depends on SWAP && ARCH_HIBERNATION_POSSIBLE + select HIBERNATE_CALLBACKS + select LZO_COMPRESS + select LZO_DECOMPRESS + select CRC32 ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the @@ -156,6 +66,9 @@ config HIBERNATION For more information take a look at <file:Documentation/power/swsusp.txt>. +config ARCH_SAVE_PAGE_KEYS + bool + config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION @@ -178,6 +91,142 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. +config PM_SLEEP + def_bool y + depends on SUSPEND || HIBERNATE_CALLBACKS + +config PM_SLEEP_SMP + def_bool y + depends on SMP + depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE + depends on PM_SLEEP + select HOTPLUG_CPU + +config PM_AUTOSLEEP + bool "Opportunistic sleep" + depends on PM_SLEEP + default n + ---help--- + Allow the kernel to trigger a system transition into a global sleep + state automatically whenever there are no active wakeup sources. + +config PM_WAKELOCKS + bool "User space wakeup sources interface" + depends on PM_SLEEP + default n + ---help--- + Allow user space to create, activate and deactivate wakeup source + objects with the help of a sysfs-based interface. + +config PM_WAKELOCKS_LIMIT + int "Maximum number of user space wakeup sources (0 = no limit)" + range 0 100000 + default 100 + depends on PM_WAKELOCKS + +config PM_WAKELOCKS_GC + bool "Garbage collector for user space wakeup sources" + depends on PM_WAKELOCKS + default y + +config PM_RUNTIME + bool "Run-time PM core functionality" + depends on !IA64_HP_SIM + ---help--- + Enable functionality allowing I/O devices to be put into energy-saving + (low power) states at run time (or autosuspended) after a specified + period of inactivity and woken up in response to a hardware-generated + wake-up event or a driver's request. + + Hardware support is generally required for this functionality to work + and the bus type drivers of the buses the devices are on are + responsible for the actual handling of the autosuspend requests and + wake-up events. + +config PM + def_bool y + depends on PM_SLEEP || PM_RUNTIME + +config PM_DEBUG + bool "Power Management Debug Support" + depends on PM + ---help--- + This option enables various debugging support in the Power Management + code. This is helpful when debugging and reporting PM bugs, like + suspend support. + +config PM_ADVANCED_DEBUG + bool "Extra PM attributes in sysfs for low-level debugging/testing" + depends on PM_DEBUG + ---help--- + Add extra sysfs attributes allowing one to access some Power Management + fields of device objects from user space. If you are not a kernel + developer interested in debugging/testing Power Management, say "no". + +config PM_TEST_SUSPEND + bool "Test suspend/resume and wakealarm during bootup" + depends on SUSPEND && PM_DEBUG && RTC_CLASS=y + ---help--- + This option will let you suspend your machine during bootup, and + make it wake up a few seconds later using an RTC wakeup alarm. + Enable this with a kernel parameter like "test_suspend=mem". + + You probably want to have your system's RTC driver statically + linked, ensuring that it's available when this test runs. + +config PM_SLEEP_DEBUG + def_bool y + depends on PM_DEBUG && PM_SLEEP + +config DPM_WATCHDOG + bool "Device suspend/resume watchdog" + depends on PM_DEBUG && PSTORE + ---help--- + Sets up a watchdog timer to capture drivers that are + locked up attempting to suspend/resume a device. + A detected lockup causes system panic with message + captured in pstore device for inspection in subsequent + boot session. + +config DPM_WATCHDOG_TIMEOUT + int "Watchdog timeout in seconds" + range 1 120 + default 12 + depends on DPM_WATCHDOG + +config PM_TRACE + bool + help + This enables code to save the last PM event point across + reboot. The architecture needs to support this, x86 for + example does by saving things in the RTC, see below. + + The architecture specific code must provide the extern + functions from <linux/resume-trace.h> as well as the + <asm/resume-trace.h> header with a TRACE_RESUME() macro. + + The way the information is presented is architecture- + dependent, x86 will print the information during a + late_initcall. + +config PM_TRACE_RTC + bool "Suspend/resume event tracing" + depends on PM_SLEEP_DEBUG + depends on X86 + select PM_TRACE + ---help--- + This enables some cheesy code to save the last PM event point in the + RTC across reboots, so that you can debug a machine that just hangs + during suspend (or more commonly, during resume). + + To use this debugging feature you should attempt to suspend the + machine, reboot it and then run + + dmesg -s 1000000 | grep 'hash matches' + + CAUTION: this option will cause your machine's real-time clock to be + set to an invalid time after a resume. + config APM_EMULATION tristate "Advanced Power Management Emulation" depends on PM && SYS_SUPPORTS_APM_EMULATION @@ -190,8 +239,8 @@ config APM_EMULATION notification of APM "events" (e.g. battery status change). In order to use APM, you will need supporting software. For location - and more information, read <file:Documentation/power/pm.txt> and the - Battery Powered Linux mini-HOWTO, available from + and more information, read <file:Documentation/power/apm-acpi.txt> + and the Battery Powered Linux mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>. This driver does not spin down disk drives (see the hdparm(8) @@ -203,3 +252,59 @@ config APM_EMULATION random kernel OOPSes or reboots that don't seem to be related to anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). + +config ARCH_HAS_OPP + bool + +config PM_OPP + bool + ---help--- + SOCs have a standard set of tuples consisting of frequency and + voltage pairs that the device will support per voltage domain. This + is called Operating Performance Point or OPP. The actual definitions + of OPP varies over silicon within the same family of devices. + + OPP layer organizes the data internally using device pointers + representing individual voltage domains and provides SOC + implementations a ready to use framework to manage OPPs. + For more information, read <file:Documentation/power/opp.txt> + +config PM_CLK + def_bool y + depends on PM && HAVE_CLK + +config PM_GENERIC_DOMAINS + bool + depends on PM + +config WQ_POWER_EFFICIENT_DEFAULT + bool "Enable workqueue power-efficient mode by default" + depends on PM + default n + help + Per-cpu workqueues are generally preferred because they show + better performance thanks to cache locality; unfortunately, + per-cpu workqueues tend to be more power hungry than unbound + workqueues. + + Enabling workqueue.power_efficient kernel parameter makes the + per-cpu workqueues which were observed to contribute + significantly to power consumption unbound, leading to measurably + lower power usage at the cost of small performance overhead. + + This config option determines whether workqueue.power_efficient + is enabled by default. + + If in doubt, say N. + +config PM_GENERIC_DOMAINS_SLEEP + def_bool y + depends on PM_SLEEP && PM_GENERIC_DOMAINS + +config PM_GENERIC_DOMAINS_RUNTIME + def_bool y + depends on PM_RUNTIME && PM_GENERIC_DOMAINS + +config CPU_PM + bool + depends on SUSPEND || CPU_IDLE diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f7dfff28ecd..29472bff11e 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,11 +1,15 @@ -ifeq ($(CONFIG_PM_DEBUG),y) -EXTRA_CFLAGS += -DDEBUG -endif +ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG -obj-y := main.o -obj-$(CONFIG_PM_LEGACY) += pm.o -obj-$(CONFIG_PM_SLEEP) += process.o console.o -obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o +obj-y += qos.o +obj-$(CONFIG_PM) += main.o +obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o +obj-$(CONFIG_FREEZER) += process.o +obj-$(CONFIG_SUSPEND) += suspend.o +obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ + block_io.o +obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o +obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c new file mode 100644 index 00000000000..9012ecf7b81 --- /dev/null +++ b/kernel/power/autosleep.c @@ -0,0 +1,128 @@ +/* + * kernel/power/autosleep.c + * + * Opportunistic sleep support. + * + * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> + */ + +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/pm_wakeup.h> + +#include "power.h" + +static suspend_state_t autosleep_state; +static struct workqueue_struct *autosleep_wq; +/* + * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source + * is active, otherwise a deadlock with try_to_suspend() is possible. + * Alternatively mutex_lock_interruptible() can be used. This will then fail + * if an auto_sleep cycle tries to freeze processes. + */ +static DEFINE_MUTEX(autosleep_lock); +static struct wakeup_source *autosleep_ws; + +static void try_to_suspend(struct work_struct *work) +{ + unsigned int initial_count, final_count; + + if (!pm_get_wakeup_count(&initial_count, true)) + goto out; + + mutex_lock(&autosleep_lock); + + if (!pm_save_wakeup_count(initial_count) || + system_state != SYSTEM_RUNNING) { + mutex_unlock(&autosleep_lock); + goto out; + } + + if (autosleep_state == PM_SUSPEND_ON) { + mutex_unlock(&autosleep_lock); + return; + } + if (autosleep_state >= PM_SUSPEND_MAX) + hibernate(); + else + pm_suspend(autosleep_state); + + mutex_unlock(&autosleep_lock); + + if (!pm_get_wakeup_count(&final_count, false)) + goto out; + + /* + * If the wakeup occured for an unknown reason, wait to prevent the + * system from trying to suspend and waking up in a tight loop. + */ + if (final_count == initial_count) + schedule_timeout_uninterruptible(HZ / 2); + + out: + queue_up_suspend_work(); +} + +static DECLARE_WORK(suspend_work, try_to_suspend); + +void queue_up_suspend_work(void) +{ + if (autosleep_state > PM_SUSPEND_ON) + queue_work(autosleep_wq, &suspend_work); +} + +suspend_state_t pm_autosleep_state(void) +{ + return autosleep_state; +} + +int pm_autosleep_lock(void) +{ + return mutex_lock_interruptible(&autosleep_lock); +} + +void pm_autosleep_unlock(void) +{ + mutex_unlock(&autosleep_lock); +} + +int pm_autosleep_set_state(suspend_state_t state) +{ + +#ifndef CONFIG_HIBERNATION + if (state >= PM_SUSPEND_MAX) + return -EINVAL; +#endif + + __pm_stay_awake(autosleep_ws); + + mutex_lock(&autosleep_lock); + + autosleep_state = state; + + __pm_relax(autosleep_ws); + + if (state > PM_SUSPEND_ON) { + pm_wakep_autosleep_enabled(true); + queue_up_suspend_work(); + } else { + pm_wakep_autosleep_enabled(false); + } + + mutex_unlock(&autosleep_lock); + return 0; +} + +int __init pm_autosleep_init(void) +{ + autosleep_ws = wakeup_source_register("autosleep"); + if (!autosleep_ws) + return -ENOMEM; + + autosleep_wq = alloc_ordered_workqueue("autosleep", 0); + if (autosleep_wq) + return 0; + + wakeup_source_unregister(autosleep_ws); + return -ENOMEM; +} diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c new file mode 100644 index 00000000000..9a58bc25881 --- /dev/null +++ b/kernel/power/block_io.c @@ -0,0 +1,103 @@ +/* + * This file provides functions for block I/O operations on swap/file. + * + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * + * This file is released under the GPLv2. + */ + +#include <linux/bio.h> +#include <linux/kernel.h> +#include <linux/pagemap.h> +#include <linux/swap.h> + +#include "power.h" + +/** + * submit - submit BIO request. + * @rw: READ or WRITE. + * @off physical offset of page. + * @page: page we're reading or writing. + * @bio_chain: list of pending biod (for async reading) + * + * Straight from the textbook - allocate and initialize the bio. + * If we're reading, make sure the page is marked as dirty. + * Then submit it and, if @bio_chain == NULL, wait. + */ +static int submit(int rw, struct block_device *bdev, sector_t sector, + struct page *page, struct bio **bio_chain) +{ + const int bio_rw = rw | REQ_SYNC; + struct bio *bio; + + bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_end_io = end_swap_bio_read; + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", + (unsigned long long)sector); + bio_put(bio); + return -EFAULT; + } + + lock_page(page); + bio_get(bio); + + if (bio_chain == NULL) { + submit_bio(bio_rw, bio); + wait_on_page_locked(page); + if (rw == READ) + bio_set_pages_dirty(bio); + bio_put(bio); + } else { + if (rw == READ) + get_page(page); /* These pages are freed later */ + bio->bi_private = *bio_chain; + *bio_chain = bio; + submit_bio(bio_rw, bio); + } + return 0; +} + +int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), + virt_to_page(addr), bio_chain); +} + +int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), + virt_to_page(addr), bio_chain); +} + +int hib_wait_on_bio_chain(struct bio **bio_chain) +{ + struct bio *bio; + struct bio *next_bio; + int ret = 0; + + if (bio_chain == NULL) + return 0; + + bio = *bio_chain; + if (bio == NULL) + return 0; + while (bio) { + struct page *page; + + next_bio = bio->bi_private; + page = bio->bi_io_vec[0].bv_page; + wait_on_page_locked(page); + if (!PageUptodate(page) || PageError(page)) + ret = -EIO; + put_page(page); + bio_put(bio); + bio = next_bio; + } + *bio_chain = NULL; + return ret; +} diff --git a/kernel/power/console.c b/kernel/power/console.c index 89bcf4973ee..aba9c545a0e 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -1,58 +1,151 @@ /* - * drivers/power/process.c - Functions for saving/restoring console. + * Functions for saving/restoring console. * * Originally from swsusp. */ +#include <linux/console.h> #include <linux/vt_kern.h> #include <linux/kbd_kern.h> -#include <linux/console.h> +#include <linux/vt.h> +#include <linux/module.h> +#include <linux/slab.h> #include "power.h" -#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; -int pm_prepare_console(void) +static DEFINE_MUTEX(vt_switch_mutex); + +struct pm_vt_switch { + struct list_head head; + struct device *dev; + bool required; +}; + +static LIST_HEAD(pm_vt_switch_list); + + +/** + * pm_vt_switch_required - indicate VT switch at suspend requirements + * @dev: device + * @required: if true, caller needs VT switch at suspend/resume time + * + * The different console drivers may or may not require VT switches across + * suspend/resume, depending on how they handle restoring video state and + * what may be running. + * + * Drivers can indicate support for switchless suspend/resume, which can + * save time and flicker, by using this routine and passing 'false' as + * the argument. If any loaded driver needs VT switching, or the + * no_console_suspend argument has been passed on the command line, VT + * switches will occur. + */ +void pm_vt_switch_required(struct device *dev, bool required) { - acquire_console_sem(); + struct pm_vt_switch *entry, *tmp; - orig_fgconsole = fg_console; + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + /* already registered, update requirement */ + tmp->required = required; + goto out; + } + } - if (vc_allocate(SUSPEND_CONSOLE)) { - /* we can't have a free VC for now. Too bad, - * we don't want to mess the screen for now. */ - release_console_sem(); - return 1; + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out; + + entry->required = required; + entry->dev = dev; + + list_add(&entry->head, &pm_vt_switch_list); +out: + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_required); + +/** + * pm_vt_switch_unregister - stop tracking a device's VT switching needs + * @dev: device + * + * Remove @dev from the vt switch list. + */ +void pm_vt_switch_unregister(struct device *dev) +{ + struct pm_vt_switch *tmp; + + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + list_del(&tmp->head); + kfree(tmp); + break; + } } + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_unregister); - if (set_console(SUSPEND_CONSOLE)) { - /* - * We're unable to switch to the SUSPEND_CONSOLE. - * Let the calling function know so it can decide - * what to do. - */ - release_console_sem(); - return 1; +/* + * There are three cases when a VT switch on suspend/resume are required: + * 1) no driver has indicated a requirement one way or another, so preserve + * the old behavior + * 2) console suspend is disabled, we want to see debug messages across + * suspend/resume + * 3) any registered driver indicates it needs a VT switch + * + * If none of these conditions is present, meaning we have at least one driver + * that doesn't need the switch, and none that do, we can avoid it to make + * resume look a little prettier (and suspend too, but that's usually hidden, + * e.g. when closing the lid on a laptop). + */ +static bool pm_vt_switch(void) +{ + struct pm_vt_switch *entry; + bool ret = true; + + mutex_lock(&vt_switch_mutex); + if (list_empty(&pm_vt_switch_list)) + goto out; + + if (!console_suspend_enabled) + goto out; + + list_for_each_entry(entry, &pm_vt_switch_list, head) { + if (entry->required) + goto out; } - release_console_sem(); - if (vt_waitactive(SUSPEND_CONSOLE)) { - pr_debug("Suspend: Can't switch VCs."); + ret = false; +out: + mutex_unlock(&vt_switch_mutex); + return ret; +} + +int pm_prepare_console(void) +{ + if (!pm_vt_switch()) + return 0; + + orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); + if (orig_fgconsole < 0) return 1; - } - orig_kmsg = kmsg_redirect; - kmsg_redirect = SUSPEND_CONSOLE; + + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); return 0; } void pm_restore_console(void) { - acquire_console_sem(); - set_console(orig_fgconsole); - release_console_sem(); - kmsg_redirect = orig_kmsg; - return; + if (!pm_vt_switch()) + return; + + if (orig_fgconsole >= 0) { + vt_move_to_console(orig_fgconsole, 0); + vt_kmsg_redirect(orig_kmsg); + } } -#endif diff --git a/kernel/power/disk.c b/kernel/power/disk.c deleted file mode 100644 index 14a656cdc65..00000000000 --- a/kernel/power/disk.c +++ /dev/null @@ -1,859 +0,0 @@ -/* - * kernel/power/disk.c - Suspend-to-disk support. - * - * Copyright (c) 2003 Patrick Mochel - * Copyright (c) 2003 Open Source Development Lab - * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> - * - * This file is released under the GPLv2. - * - */ - -#include <linux/suspend.h> -#include <linux/syscalls.h> -#include <linux/reboot.h> -#include <linux/string.h> -#include <linux/device.h> -#include <linux/delay.h> -#include <linux/fs.h> -#include <linux/mount.h> -#include <linux/pm.h> -#include <linux/console.h> -#include <linux/cpu.h> -#include <linux/freezer.h> - -#include "power.h" - - -static int noresume = 0; -static char resume_file[256] = CONFIG_PM_STD_PARTITION; -dev_t swsusp_resume_device; -sector_t swsusp_resume_block; - -enum { - HIBERNATION_INVALID, - HIBERNATION_PLATFORM, - HIBERNATION_TEST, - HIBERNATION_TESTPROC, - HIBERNATION_SHUTDOWN, - HIBERNATION_REBOOT, - /* keep last */ - __HIBERNATION_AFTER_LAST -}; -#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) -#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) - -static int hibernation_mode = HIBERNATION_SHUTDOWN; - -static struct platform_hibernation_ops *hibernation_ops; - -/** - * hibernation_set_ops - set the global hibernate operations - * @ops: the hibernation operations to use in subsequent hibernation transitions - */ - -void hibernation_set_ops(struct platform_hibernation_ops *ops) -{ - if (ops && !(ops->begin && ops->end && ops->pre_snapshot - && ops->prepare && ops->finish && ops->enter && ops->pre_restore - && ops->restore_cleanup)) { - WARN_ON(1); - return; - } - mutex_lock(&pm_mutex); - hibernation_ops = ops; - if (ops) - hibernation_mode = HIBERNATION_PLATFORM; - else if (hibernation_mode == HIBERNATION_PLATFORM) - hibernation_mode = HIBERNATION_SHUTDOWN; - - mutex_unlock(&pm_mutex); -} - -#ifdef CONFIG_PM_DEBUG -static void hibernation_debug_sleep(void) -{ - printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n"); - mdelay(5000); -} - -static int hibernation_testmode(int mode) -{ - if (hibernation_mode == mode) { - hibernation_debug_sleep(); - return 1; - } - return 0; -} - -static int hibernation_test(int level) -{ - if (pm_test_level == level) { - hibernation_debug_sleep(); - return 1; - } - return 0; -} -#else /* !CONFIG_PM_DEBUG */ -static int hibernation_testmode(int mode) { return 0; } -static int hibernation_test(int level) { return 0; } -#endif /* !CONFIG_PM_DEBUG */ - -/** - * platform_begin - tell the platform driver that we're starting - * hibernation - */ - -static int platform_begin(int platform_mode) -{ - return (platform_mode && hibernation_ops) ? - hibernation_ops->begin() : 0; -} - -/** - * platform_end - tell the platform driver that we've entered the - * working state - */ - -static void platform_end(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->end(); -} - -/** - * platform_pre_snapshot - prepare the machine for hibernation using the - * platform driver if so configured and return an error code if it fails - */ - -static int platform_pre_snapshot(int platform_mode) -{ - return (platform_mode && hibernation_ops) ? - hibernation_ops->pre_snapshot() : 0; -} - -/** - * platform_leave - prepare the machine for switching to the normal mode - * of operation using the platform driver (called with interrupts disabled) - */ - -static void platform_leave(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->leave(); -} - -/** - * platform_finish - switch the machine to the normal mode of operation - * using the platform driver (must be called after platform_prepare()) - */ - -static void platform_finish(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->finish(); -} - -/** - * platform_pre_restore - prepare the platform for the restoration from a - * hibernation image. If the restore fails after this function has been - * called, platform_restore_cleanup() must be called. - */ - -static int platform_pre_restore(int platform_mode) -{ - return (platform_mode && hibernation_ops) ? - hibernation_ops->pre_restore() : 0; -} - -/** - * platform_restore_cleanup - switch the platform to the normal mode of - * operation after a failing restore. If platform_pre_restore() has been - * called before the failing restore, this function must be called too, - * regardless of the result of platform_pre_restore(). - */ - -static void platform_restore_cleanup(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->restore_cleanup(); -} - -/** - * create_image - freeze devices that need to be frozen with interrupts - * off, create the hibernation image and thaw those devices. Control - * reappears in this routine after a restore. - */ - -static int create_image(int platform_mode) -{ - int error; - - error = arch_prepare_suspend(); - if (error) - return error; - - local_irq_disable(); - /* At this point, device_suspend() has been called, but *not* - * device_power_down(). We *must* call device_power_down() now. - * Otherwise, drivers for some devices (e.g. interrupt controllers) - * become desynchronized with the actual state of the hardware - * at resume time, and evil weirdness ensues. - */ - error = device_power_down(PMSG_FREEZE); - if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting hibernation\n"); - goto Enable_irqs; - } - - if (hibernation_test(TEST_CORE)) - goto Power_up; - - in_suspend = 1; - save_processor_state(); - error = swsusp_arch_suspend(); - if (error) - printk(KERN_ERR "PM: Error %d creating hibernation image\n", - error); - /* Restore control flow magically appears here */ - restore_processor_state(); - if (!in_suspend) - platform_leave(platform_mode); - Power_up: - /* NOTE: device_power_up() is just a resume() for devices - * that suspended with irqs off ... no overall powerup. - */ - device_power_up(); - Enable_irqs: - local_irq_enable(); - return error; -} - -/** - * hibernation_snapshot - quiesce devices and create the hibernation - * snapshot image. - * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the power transition. - * - * Must be called with pm_mutex held - */ - -int hibernation_snapshot(int platform_mode) -{ - int error; - - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); - if (error) - return error; - - error = platform_begin(platform_mode); - if (error) - goto Close; - - suspend_console(); - error = device_suspend(PMSG_FREEZE); - if (error) - goto Resume_console; - - if (hibernation_test(TEST_DEVICES)) - goto Resume_devices; - - error = platform_pre_snapshot(platform_mode); - if (error || hibernation_test(TEST_PLATFORM)) - goto Finish; - - error = disable_nonboot_cpus(); - if (!error) { - if (hibernation_test(TEST_CPUS)) - goto Enable_cpus; - - if (hibernation_testmode(HIBERNATION_TEST)) - goto Enable_cpus; - - error = create_image(platform_mode); - /* Control returns here after successful restore */ - } - Enable_cpus: - enable_nonboot_cpus(); - Finish: - platform_finish(platform_mode); - Resume_devices: - device_resume(); - Resume_console: - resume_console(); - Close: - platform_end(platform_mode); - return error; -} - -/** - * resume_target_kernel - prepare devices that need to be suspended with - * interrupts off, restore the contents of highmem that have not been - * restored yet from the image and run the low level code that will restore - * the remaining contents of memory and switch to the just restored target - * kernel. - */ - -static int resume_target_kernel(void) -{ - int error; - - local_irq_disable(); - error = device_power_down(PMSG_PRETHAW); - if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting resume\n"); - goto Enable_irqs; - } - /* We'll ignore saved state, but this gets preempt count (etc) right */ - save_processor_state(); - error = restore_highmem(); - if (!error) { - error = swsusp_arch_resume(); - /* - * The code below is only ever reached in case of a failure. - * Otherwise execution continues at place where - * swsusp_arch_suspend() was called - */ - BUG_ON(!error); - /* This call to restore_highmem() undos the previous one */ - restore_highmem(); - } - /* - * The only reason why swsusp_arch_resume() can fail is memory being - * very tight, so we have to free it as soon as we can to avoid - * subsequent failures - */ - swsusp_free(); - restore_processor_state(); - touch_softlockup_watchdog(); - device_power_up(); - Enable_irqs: - local_irq_enable(); - return error; -} - -/** - * hibernation_restore - quiesce devices and restore the hibernation - * snapshot image. If successful, control returns in hibernation_snaphot() - * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the transition. - * - * Must be called with pm_mutex held - */ - -int hibernation_restore(int platform_mode) -{ - int error; - - pm_prepare_console(); - suspend_console(); - error = device_suspend(PMSG_PRETHAW); - if (error) - goto Finish; - - error = platform_pre_restore(platform_mode); - if (!error) { - error = disable_nonboot_cpus(); - if (!error) - error = resume_target_kernel(); - enable_nonboot_cpus(); - } - platform_restore_cleanup(platform_mode); - device_resume(); - Finish: - resume_console(); - pm_restore_console(); - return error; -} - -/** - * hibernation_platform_enter - enter the hibernation state using the - * platform driver (if available) - */ - -int hibernation_platform_enter(void) -{ - int error; - - if (!hibernation_ops) - return -ENOSYS; - - /* - * We have cancelled the power transition by running - * hibernation_ops->finish() before saving the image, so we should let - * the firmware know that we're going to enter the sleep state after all - */ - error = hibernation_ops->begin(); - if (error) - goto Close; - - suspend_console(); - error = device_suspend(PMSG_HIBERNATE); - if (error) - goto Resume_console; - - error = hibernation_ops->prepare(); - if (error) - goto Resume_devices; - - error = disable_nonboot_cpus(); - if (error) - goto Finish; - - local_irq_disable(); - error = device_power_down(PMSG_HIBERNATE); - if (!error) { - hibernation_ops->enter(); - /* We should never get here */ - while (1); - } - local_irq_enable(); - - /* - * We don't need to reenable the nonboot CPUs or resume consoles, since - * the system is going to be halted anyway. - */ - Finish: - hibernation_ops->finish(); - Resume_devices: - device_resume(); - Resume_console: - resume_console(); - Close: - hibernation_ops->end(); - return error; -} - -/** - * power_down - Shut the machine down for hibernation. - * - * Use the platform driver, if configured so; otherwise try - * to power off or reboot. - */ - -static void power_down(void) -{ - switch (hibernation_mode) { - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: - break; - case HIBERNATION_REBOOT: - kernel_restart(NULL); - break; - case HIBERNATION_PLATFORM: - hibernation_platform_enter(); - case HIBERNATION_SHUTDOWN: - kernel_power_off(); - break; - } - kernel_halt(); - /* - * Valid image is on the disk, if we continue we risk serious data - * corruption after resume. - */ - printk(KERN_CRIT "PM: Please power down manually\n"); - while(1); -} - -static int prepare_processes(void) -{ - int error = 0; - - if (freeze_processes()) { - error = -EBUSY; - thaw_processes(); - } - return error; -} - -/** - * hibernate - The granpappy of the built-in hibernation management - */ - -int hibernate(void) -{ - int error; - - mutex_lock(&pm_mutex); - /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { - error = -EBUSY; - goto Unlock; - } - - pm_prepare_console(); - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (error) - goto Exit; - - /* Allocate memory management structures */ - error = create_basic_memory_bitmaps(); - if (error) - goto Exit; - - printk(KERN_INFO "PM: Syncing filesystems ... "); - sys_sync(); - printk("done.\n"); - - error = prepare_processes(); - if (error) - goto Finish; - - if (hibernation_test(TEST_FREEZER)) - goto Thaw; - - if (hibernation_testmode(HIBERNATION_TESTPROC)) - goto Thaw; - - error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); - if (in_suspend && !error) { - unsigned int flags = 0; - - if (hibernation_mode == HIBERNATION_PLATFORM) - flags |= SF_PLATFORM_MODE; - pr_debug("PM: writing image.\n"); - error = swsusp_write(flags); - swsusp_free(); - if (!error) - power_down(); - } else { - pr_debug("PM: Image restored successfully.\n"); - swsusp_free(); - } - Thaw: - thaw_processes(); - Finish: - free_basic_memory_bitmaps(); - Exit: - pm_notifier_call_chain(PM_POST_HIBERNATION); - pm_restore_console(); - atomic_inc(&snapshot_device_available); - Unlock: - mutex_unlock(&pm_mutex); - return error; -} - - -/** - * software_resume - Resume from a saved image. - * - * Called as a late_initcall (so all devices are discovered and - * initialized), we call swsusp to see if we have a saved image or not. - * If so, we quiesce devices, the restore the saved image. We will - * return above (in hibernate() ) if everything goes well. - * Otherwise, we fail gracefully and return to the normally - * scheduled program. - * - */ - -static int software_resume(void) -{ - int error; - unsigned int flags; - - /* - * name_to_dev_t() below takes a sysfs buffer mutex when sysfs - * is configured into the kernel. Since the regular hibernate - * trigger path is via sysfs which takes a buffer mutex before - * calling hibernate functions (which take pm_mutex) this can - * cause lockdep to complain about a possible ABBA deadlock - * which cannot happen since we're in the boot code here and - * sysfs can't be invoked yet. Therefore, we use a subclass - * here to avoid lockdep complaining. - */ - mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); - if (!swsusp_resume_device) { - if (!strlen(resume_file)) { - mutex_unlock(&pm_mutex); - return -ENOENT; - } - swsusp_resume_device = name_to_dev_t(resume_file); - pr_debug("PM: Resume from partition %s\n", resume_file); - } else { - pr_debug("PM: Resume from partition %d:%d\n", - MAJOR(swsusp_resume_device), - MINOR(swsusp_resume_device)); - } - - if (noresume) { - /** - * FIXME: If noresume is specified, we need to find the - * partition and reset it back to normal swap space. - */ - mutex_unlock(&pm_mutex); - return 0; - } - - pr_debug("PM: Checking hibernation image.\n"); - error = swsusp_check(); - if (error) - goto Unlock; - - /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { - error = -EBUSY; - goto Unlock; - } - - pm_prepare_console(); - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (error) - goto Finish; - - error = create_basic_memory_bitmaps(); - if (error) - goto Finish; - - pr_debug("PM: Preparing processes for restore.\n"); - error = prepare_processes(); - if (error) { - swsusp_close(); - goto Done; - } - - pr_debug("PM: Reading hibernation image.\n"); - - error = swsusp_read(&flags); - if (!error) - hibernation_restore(flags & SF_PLATFORM_MODE); - - printk(KERN_ERR "PM: Restore failed, recovering.\n"); - swsusp_free(); - thaw_processes(); - Done: - free_basic_memory_bitmaps(); - Finish: - pm_notifier_call_chain(PM_POST_RESTORE); - pm_restore_console(); - atomic_inc(&snapshot_device_available); - /* For success case, the suspend path will release the lock */ - Unlock: - mutex_unlock(&pm_mutex); - pr_debug("PM: Resume from disk failed.\n"); - return error; -} - -late_initcall(software_resume); - - -static const char * const hibernation_modes[] = { - [HIBERNATION_PLATFORM] = "platform", - [HIBERNATION_SHUTDOWN] = "shutdown", - [HIBERNATION_REBOOT] = "reboot", - [HIBERNATION_TEST] = "test", - [HIBERNATION_TESTPROC] = "testproc", -}; - -/** - * disk - Control hibernation mode - * - * Suspend-to-disk can be handled in several ways. We have a few options - * for putting the system to sleep - using the platform driver (e.g. ACPI - * or other hibernation_ops), powering off the system or rebooting the - * system (for testing) as well as the two test modes. - * - * The system can support 'platform', and that is known a priori (and - * encoded by the presence of hibernation_ops). However, the user may - * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the - * test modes, 'test' or 'testproc'. - * - * show() will display what the mode is currently set to. - * store() will accept one of - * - * 'platform' - * 'shutdown' - * 'reboot' - * 'test' - * 'testproc' - * - * It will only change to 'platform' if the system - * supports it (as determined by having hibernation_ops). - */ - -static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - int i; - char *start = buf; - - for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { - if (!hibernation_modes[i]) - continue; - switch (i) { - case HIBERNATION_SHUTDOWN: - case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: - break; - case HIBERNATION_PLATFORM: - if (hibernation_ops) - break; - /* not a valid mode, continue with loop */ - continue; - } - if (i == hibernation_mode) - buf += sprintf(buf, "[%s] ", hibernation_modes[i]); - else - buf += sprintf(buf, "%s ", hibernation_modes[i]); - } - buf += sprintf(buf, "\n"); - return buf-start; -} - - -static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - int error = 0; - int i; - int len; - char *p; - int mode = HIBERNATION_INVALID; - - p = memchr(buf, '\n', n); - len = p ? p - buf : n; - - mutex_lock(&pm_mutex); - for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { - if (len == strlen(hibernation_modes[i]) - && !strncmp(buf, hibernation_modes[i], len)) { - mode = i; - break; - } - } - if (mode != HIBERNATION_INVALID) { - switch (mode) { - case HIBERNATION_SHUTDOWN: - case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: - hibernation_mode = mode; - break; - case HIBERNATION_PLATFORM: - if (hibernation_ops) - hibernation_mode = mode; - else - error = -EINVAL; - } - } else - error = -EINVAL; - - if (!error) - pr_debug("PM: Hibernation mode set to '%s'\n", - hibernation_modes[mode]); - mutex_unlock(&pm_mutex); - return error ? error : n; -} - -power_attr(disk); - -static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), - MINOR(swsusp_resume_device)); -} - -static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - unsigned int maj, min; - dev_t res; - int ret = -EINVAL; - - if (sscanf(buf, "%u:%u", &maj, &min) != 2) - goto out; - - res = MKDEV(maj,min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto out; - - mutex_lock(&pm_mutex); - swsusp_resume_device = res; - mutex_unlock(&pm_mutex); - printk(KERN_INFO "PM: Starting manual resume from disk\n"); - noresume = 0; - software_resume(); - ret = n; - out: - return ret; -} - -power_attr(resume); - -static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%lu\n", image_size); -} - -static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - unsigned long size; - - if (sscanf(buf, "%lu", &size) == 1) { - image_size = size; - return n; - } - - return -EINVAL; -} - -power_attr(image_size); - -static struct attribute * g[] = { - &disk_attr.attr, - &resume_attr.attr, - &image_size_attr.attr, - NULL, -}; - - -static struct attribute_group attr_group = { - .attrs = g, -}; - - -static int __init pm_disk_init(void) -{ - return sysfs_create_group(power_kobj, &attr_group); -} - -core_initcall(pm_disk_init); - - -static int __init resume_setup(char *str) -{ - if (noresume) - return 1; - - strncpy( resume_file, str, 255 ); - return 1; -} - -static int __init resume_offset_setup(char *str) -{ - unsigned long long offset; - - if (noresume) - return 1; - - if (sscanf(str, "%llu", &offset) == 1) - swsusp_resume_block = offset; - - return 1; -} - -static int __init noresume_setup(char *str) -{ - noresume = 1; - return 1; -} - -__setup("noresume", noresume_setup); -__setup("resume_offset=", resume_offset_setup); -__setup("resume=", resume_setup); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c new file mode 100644 index 00000000000..fcc2611d3f1 --- /dev/null +++ b/kernel/power/hibernate.c @@ -0,0 +1,1168 @@ +/* + * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> + * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. + * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> + * + * This file is released under the GPLv2. + */ + +#include <linux/export.h> +#include <linux/suspend.h> +#include <linux/syscalls.h> +#include <linux/reboot.h> +#include <linux/string.h> +#include <linux/device.h> +#include <linux/async.h> +#include <linux/delay.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/pm.h> +#include <linux/console.h> +#include <linux/cpu.h> +#include <linux/freezer.h> +#include <linux/gfp.h> +#include <linux/syscore_ops.h> +#include <linux/ctype.h> +#include <linux/genhd.h> +#include <trace/events/power.h> + +#include "power.h" + + +static int nocompress; +static int noresume; +static int nohibernate; +static int resume_wait; +static unsigned int resume_delay; +static char resume_file[256] = CONFIG_PM_STD_PARTITION; +dev_t swsusp_resume_device; +sector_t swsusp_resume_block; +__visible int in_suspend __nosavedata; + +enum { + HIBERNATION_INVALID, + HIBERNATION_PLATFORM, + HIBERNATION_SHUTDOWN, + HIBERNATION_REBOOT, +#ifdef CONFIG_SUSPEND + HIBERNATION_SUSPEND, +#endif + /* keep last */ + __HIBERNATION_AFTER_LAST +}; +#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) +#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) + +static int hibernation_mode = HIBERNATION_SHUTDOWN; + +bool freezer_test_done; + +static const struct platform_hibernation_ops *hibernation_ops; + +bool hibernation_available(void) +{ + return (nohibernate == 0); +} + +/** + * hibernation_set_ops - Set the global hibernate operations. + * @ops: Hibernation operations to use in subsequent hibernation transitions. + */ +void hibernation_set_ops(const struct platform_hibernation_ops *ops) +{ + if (ops && !(ops->begin && ops->end && ops->pre_snapshot + && ops->prepare && ops->finish && ops->enter && ops->pre_restore + && ops->restore_cleanup && ops->leave)) { + WARN_ON(1); + return; + } + lock_system_sleep(); + hibernation_ops = ops; + if (ops) + hibernation_mode = HIBERNATION_PLATFORM; + else if (hibernation_mode == HIBERNATION_PLATFORM) + hibernation_mode = HIBERNATION_SHUTDOWN; + + unlock_system_sleep(); +} +EXPORT_SYMBOL_GPL(hibernation_set_ops); + +static bool entering_platform_hibernation; + +bool system_entering_hibernation(void) +{ + return entering_platform_hibernation; +} +EXPORT_SYMBOL(system_entering_hibernation); + +#ifdef CONFIG_PM_DEBUG +static void hibernation_debug_sleep(void) +{ + printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n"); + mdelay(5000); +} + +static int hibernation_test(int level) +{ + if (pm_test_level == level) { + hibernation_debug_sleep(); + return 1; + } + return 0; +} +#else /* !CONFIG_PM_DEBUG */ +static int hibernation_test(int level) { return 0; } +#endif /* !CONFIG_PM_DEBUG */ + +/** + * platform_begin - Call platform to start hibernation. + * @platform_mode: Whether or not to use the platform driver. + */ +static int platform_begin(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->begin() : 0; +} + +/** + * platform_end - Call platform to finish transition to the working state. + * @platform_mode: Whether or not to use the platform driver. + */ +static void platform_end(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->end(); +} + +/** + * platform_pre_snapshot - Call platform to prepare the machine for hibernation. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to prepare the system for creating a hibernate image, + * if so configured, and return an error code if that fails. + */ + +static int platform_pre_snapshot(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_snapshot() : 0; +} + +/** + * platform_leave - Call platform to prepare a transition to the working state. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver prepare to prepare the machine for switching to the + * normal mode of operation. + * + * This routine is called on one CPU with interrupts disabled. + */ +static void platform_leave(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->leave(); +} + +/** + * platform_finish - Call platform to switch the system to the working state. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to switch the machine to the normal mode of + * operation. + * + * This routine must be called after platform_prepare(). + */ +static void platform_finish(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->finish(); +} + +/** + * platform_pre_restore - Prepare for hibernate image restoration. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to prepare the system for resume from a hibernation + * image. + * + * If the restore fails after this function has been called, + * platform_restore_cleanup() must be called. + */ +static int platform_pre_restore(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_restore() : 0; +} + +/** + * platform_restore_cleanup - Switch to the working state after failing restore. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to switch the system to the normal mode of operation + * after a failing restore. + * + * If platform_pre_restore() has been called before the failing restore, this + * function must be called too, regardless of the result of + * platform_pre_restore(). + */ +static void platform_restore_cleanup(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->restore_cleanup(); +} + +/** + * platform_recover - Recover from a failure to suspend devices. + * @platform_mode: Whether or not to use the platform driver. + */ +static void platform_recover(int platform_mode) +{ + if (platform_mode && hibernation_ops && hibernation_ops->recover) + hibernation_ops->recover(); +} + +/** + * swsusp_show_speed - Print time elapsed between two events during hibernation. + * @start: Starting event. + * @stop: Final event. + * @nr_pages: Number of memory pages processed between @start and @stop. + * @msg: Additional diagnostic message to print. + */ +void swsusp_show_speed(struct timeval *start, struct timeval *stop, + unsigned nr_pages, char *msg) +{ + u64 elapsed_centisecs64; + unsigned int centisecs; + unsigned int k; + unsigned int kps; + + elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); + /* + * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time, + * it is obvious enough for what went wrong. + */ + do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); + centisecs = elapsed_centisecs64; + if (centisecs == 0) + centisecs = 1; /* avoid div-by-zero */ + k = nr_pages * (PAGE_SIZE / 1024); + kps = (k * 100) / centisecs; + printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", + msg, k, + centisecs / 100, centisecs % 100, + kps / 1000, (kps % 1000) / 10); +} + +/** + * create_image - Create a hibernation image. + * @platform_mode: Whether or not to use the platform driver. + * + * Execute device drivers' "late" and "noirq" freeze callbacks, create a + * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. + * + * Control reappears in this routine after the subsequent restore. + */ +static int create_image(int platform_mode) +{ + int error; + + error = dpm_suspend_end(PMSG_FREEZE); + if (error) { + printk(KERN_ERR "PM: Some devices failed to power down, " + "aborting hibernation\n"); + return error; + } + + error = platform_pre_snapshot(platform_mode); + if (error || hibernation_test(TEST_PLATFORM)) + goto Platform_finish; + + error = disable_nonboot_cpus(); + if (error || hibernation_test(TEST_CPUS)) + goto Enable_cpus; + + local_irq_disable(); + + error = syscore_suspend(); + if (error) { + printk(KERN_ERR "PM: Some system devices failed to power down, " + "aborting hibernation\n"); + goto Enable_irqs; + } + + if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) + goto Power_up; + + in_suspend = 1; + save_processor_state(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); + error = swsusp_arch_suspend(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); + if (error) + printk(KERN_ERR "PM: Error %d creating hibernation image\n", + error); + /* Restore control flow magically appears here */ + restore_processor_state(); + if (!in_suspend) + events_check_enabled = false; + + platform_leave(platform_mode); + + Power_up: + syscore_resume(); + + Enable_irqs: + local_irq_enable(); + + Enable_cpus: + enable_nonboot_cpus(); + + Platform_finish: + platform_finish(platform_mode); + + dpm_resume_start(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + + return error; +} + +/** + * hibernation_snapshot - Quiesce devices and create a hibernation image. + * @platform_mode: If set, use platform driver to prepare for the transition. + * + * This routine must be called with pm_mutex held. + */ +int hibernation_snapshot(int platform_mode) +{ + pm_message_t msg; + int error; + + error = platform_begin(platform_mode); + if (error) + goto Close; + + /* Preallocate image memory before shutting down devices. */ + error = hibernate_preallocate_memory(); + if (error) + goto Close; + + error = freeze_kernel_threads(); + if (error) + goto Cleanup; + + if (hibernation_test(TEST_FREEZER)) { + + /* + * Indicate to the caller that we are returning due to a + * successful freezer test. + */ + freezer_test_done = true; + goto Thaw; + } + + error = dpm_prepare(PMSG_FREEZE); + if (error) { + dpm_complete(PMSG_RECOVER); + goto Thaw; + } + + suspend_console(); + ftrace_stop(); + pm_restrict_gfp_mask(); + + error = dpm_suspend(PMSG_FREEZE); + + if (error || hibernation_test(TEST_DEVICES)) + platform_recover(platform_mode); + else + error = create_image(platform_mode); + + /* + * In the case that we call create_image() above, the control + * returns here (1) after the image has been created or the + * image creation has failed and (2) after a successful restore. + */ + + /* We may need to release the preallocated image pages here. */ + if (error || !in_suspend) + swsusp_free(); + + msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE; + dpm_resume(msg); + + if (error || !in_suspend) + pm_restore_gfp_mask(); + + ftrace_start(); + resume_console(); + dpm_complete(msg); + + Close: + platform_end(platform_mode); + return error; + + Thaw: + thaw_kernel_threads(); + Cleanup: + swsusp_free(); + goto Close; +} + +/** + * resume_target_kernel - Restore system state from a hibernation image. + * @platform_mode: Whether or not to use the platform driver. + * + * Execute device drivers' "noirq" and "late" freeze callbacks, restore the + * contents of highmem that have not been restored yet from the image and run + * the low-level code that will restore the remaining contents of memory and + * switch to the just restored target kernel. + */ +static int resume_target_kernel(bool platform_mode) +{ + int error; + + error = dpm_suspend_end(PMSG_QUIESCE); + if (error) { + printk(KERN_ERR "PM: Some devices failed to power down, " + "aborting resume\n"); + return error; + } + + error = platform_pre_restore(platform_mode); + if (error) + goto Cleanup; + + error = disable_nonboot_cpus(); + if (error) + goto Enable_cpus; + + local_irq_disable(); + + error = syscore_suspend(); + if (error) + goto Enable_irqs; + + save_processor_state(); + error = restore_highmem(); + if (!error) { + error = swsusp_arch_resume(); + /* + * The code below is only ever reached in case of a failure. + * Otherwise, execution continues at the place where + * swsusp_arch_suspend() was called. + */ + BUG_ON(!error); + /* + * This call to restore_highmem() reverts the changes made by + * the previous one. + */ + restore_highmem(); + } + /* + * The only reason why swsusp_arch_resume() can fail is memory being + * very tight, so we have to free it as soon as we can to avoid + * subsequent failures. + */ + swsusp_free(); + restore_processor_state(); + touch_softlockup_watchdog(); + + syscore_resume(); + + Enable_irqs: + local_irq_enable(); + + Enable_cpus: + enable_nonboot_cpus(); + + Cleanup: + platform_restore_cleanup(platform_mode); + + dpm_resume_start(PMSG_RECOVER); + + return error; +} + +/** + * hibernation_restore - Quiesce devices and restore from a hibernation image. + * @platform_mode: If set, use platform driver to prepare for the transition. + * + * This routine must be called with pm_mutex held. If it is successful, control + * reappears in the restored target kernel in hibernation_snapshot(). + */ +int hibernation_restore(int platform_mode) +{ + int error; + + pm_prepare_console(); + suspend_console(); + ftrace_stop(); + pm_restrict_gfp_mask(); + error = dpm_suspend_start(PMSG_QUIESCE); + if (!error) { + error = resume_target_kernel(platform_mode); + dpm_resume_end(PMSG_RECOVER); + } + pm_restore_gfp_mask(); + ftrace_start(); + resume_console(); + pm_restore_console(); + return error; +} + +/** + * hibernation_platform_enter - Power off the system using the platform driver. + */ +int hibernation_platform_enter(void) +{ + int error; + + if (!hibernation_ops) + return -ENOSYS; + + /* + * We have cancelled the power transition by running + * hibernation_ops->finish() before saving the image, so we should let + * the firmware know that we're going to enter the sleep state after all + */ + error = hibernation_ops->begin(); + if (error) + goto Close; + + entering_platform_hibernation = true; + suspend_console(); + ftrace_stop(); + error = dpm_suspend_start(PMSG_HIBERNATE); + if (error) { + if (hibernation_ops->recover) + hibernation_ops->recover(); + goto Resume_devices; + } + + error = dpm_suspend_end(PMSG_HIBERNATE); + if (error) + goto Resume_devices; + + error = hibernation_ops->prepare(); + if (error) + goto Platform_finish; + + error = disable_nonboot_cpus(); + if (error) + goto Platform_finish; + + local_irq_disable(); + syscore_suspend(); + if (pm_wakeup_pending()) { + error = -EAGAIN; + goto Power_up; + } + + hibernation_ops->enter(); + /* We should never get here */ + while (1); + + Power_up: + syscore_resume(); + local_irq_enable(); + enable_nonboot_cpus(); + + Platform_finish: + hibernation_ops->finish(); + + dpm_resume_start(PMSG_RESTORE); + + Resume_devices: + entering_platform_hibernation = false; + dpm_resume_end(PMSG_RESTORE); + ftrace_start(); + resume_console(); + + Close: + hibernation_ops->end(); + + return error; +} + +/** + * power_down - Shut the machine down for hibernation. + * + * Use the platform driver, if configured, to put the system into the sleep + * state corresponding to hibernation, or try to power it off or reboot, + * depending on the value of hibernation_mode. + */ +static void power_down(void) +{ +#ifdef CONFIG_SUSPEND + int error; +#endif + + switch (hibernation_mode) { + case HIBERNATION_REBOOT: + kernel_restart(NULL); + break; + case HIBERNATION_PLATFORM: + hibernation_platform_enter(); + case HIBERNATION_SHUTDOWN: + if (pm_power_off) + kernel_power_off(); + break; +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: + error = suspend_devices_and_enter(PM_SUSPEND_MEM); + if (error) { + if (hibernation_ops) + hibernation_mode = HIBERNATION_PLATFORM; + else + hibernation_mode = HIBERNATION_SHUTDOWN; + power_down(); + } + /* + * Restore swap signature. + */ + error = swsusp_unmark(); + if (error) + printk(KERN_ERR "PM: Swap will be unusable! " + "Try swapon -a.\n"); + return; +#endif + } + kernel_halt(); + /* + * Valid image is on the disk, if we continue we risk serious data + * corruption after resume. + */ + printk(KERN_CRIT "PM: Please power down manually\n"); + while (1) + cpu_relax(); +} + +/** + * hibernate - Carry out system hibernation, including saving the image. + */ +int hibernate(void) +{ + int error; + + if (!hibernation_available()) { + pr_debug("PM: Hibernation not available.\n"); + return -EPERM; + } + + lock_system_sleep(); + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } + + pm_prepare_console(); + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + if (error) + goto Exit; + + printk(KERN_INFO "PM: Syncing filesystems ... "); + sys_sync(); + printk("done.\n"); + + error = freeze_processes(); + if (error) + goto Exit; + + lock_device_hotplug(); + /* Allocate memory management structures */ + error = create_basic_memory_bitmaps(); + if (error) + goto Thaw; + + error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); + if (error || freezer_test_done) + goto Free_bitmaps; + + if (in_suspend) { + unsigned int flags = 0; + + if (hibernation_mode == HIBERNATION_PLATFORM) + flags |= SF_PLATFORM_MODE; + if (nocompress) + flags |= SF_NOCOMPRESS_MODE; + else + flags |= SF_CRC32_MODE; + + pr_debug("PM: writing image.\n"); + error = swsusp_write(flags); + swsusp_free(); + if (!error) + power_down(); + in_suspend = 0; + pm_restore_gfp_mask(); + } else { + pr_debug("PM: Image restored successfully.\n"); + } + + Free_bitmaps: + free_basic_memory_bitmaps(); + Thaw: + unlock_device_hotplug(); + thaw_processes(); + + /* Don't bother checking whether freezer_test_done is true */ + freezer_test_done = false; + Exit: + pm_notifier_call_chain(PM_POST_HIBERNATION); + pm_restore_console(); + atomic_inc(&snapshot_device_available); + Unlock: + unlock_system_sleep(); + return error; +} + + +/** + * software_resume - Resume from a saved hibernation image. + * + * This routine is called as a late initcall, when all devices have been + * discovered and initialized already. + * + * The image reading code is called to see if there is a hibernation image + * available for reading. If that is the case, devices are quiesced and the + * contents of memory is restored from the saved image. + * + * If this is successful, control reappears in the restored target kernel in + * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine + * attempts to recover gracefully and make the kernel return to the normal mode + * of operation. + */ +static int software_resume(void) +{ + int error; + unsigned int flags; + + /* + * If the user said "noresume".. bail out early. + */ + if (noresume || !hibernation_available()) + return 0; + + /* + * name_to_dev_t() below takes a sysfs buffer mutex when sysfs + * is configured into the kernel. Since the regular hibernate + * trigger path is via sysfs which takes a buffer mutex before + * calling hibernate functions (which take pm_mutex) this can + * cause lockdep to complain about a possible ABBA deadlock + * which cannot happen since we're in the boot code here and + * sysfs can't be invoked yet. Therefore, we use a subclass + * here to avoid lockdep complaining. + */ + mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); + + if (swsusp_resume_device) + goto Check_image; + + if (!strlen(resume_file)) { + error = -ENOENT; + goto Unlock; + } + + pr_debug("PM: Checking hibernation image partition %s\n", resume_file); + + if (resume_delay) { + printk(KERN_INFO "Waiting %dsec before reading resume device...\n", + resume_delay); + ssleep(resume_delay); + } + + /* Check if the device is there */ + swsusp_resume_device = name_to_dev_t(resume_file); + + /* + * name_to_dev_t is ineffective to verify parition if resume_file is in + * integer format. (e.g. major:minor) + */ + if (isdigit(resume_file[0]) && resume_wait) { + int partno; + while (!get_gendisk(swsusp_resume_device, &partno)) + msleep(10); + } + + if (!swsusp_resume_device) { + /* + * Some device discovery might still be in progress; we need + * to wait for this to finish. + */ + wait_for_device_probe(); + + if (resume_wait) { + while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) + msleep(10); + async_synchronize_full(); + } + + swsusp_resume_device = name_to_dev_t(resume_file); + if (!swsusp_resume_device) { + error = -ENODEV; + goto Unlock; + } + } + + Check_image: + pr_debug("PM: Hibernation image partition %d:%d present\n", + MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); + + pr_debug("PM: Looking for hibernation image.\n"); + error = swsusp_check(); + if (error) + goto Unlock; + + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + swsusp_close(FMODE_READ); + goto Unlock; + } + + pm_prepare_console(); + error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + if (error) + goto Close_Finish; + + pr_debug("PM: Preparing processes for restore.\n"); + error = freeze_processes(); + if (error) + goto Close_Finish; + + pr_debug("PM: Loading hibernation image.\n"); + + lock_device_hotplug(); + error = create_basic_memory_bitmaps(); + if (error) + goto Thaw; + + error = swsusp_read(&flags); + swsusp_close(FMODE_READ); + if (!error) + hibernation_restore(flags & SF_PLATFORM_MODE); + + printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); + swsusp_free(); + free_basic_memory_bitmaps(); + Thaw: + unlock_device_hotplug(); + thaw_processes(); + Finish: + pm_notifier_call_chain(PM_POST_RESTORE); + pm_restore_console(); + atomic_inc(&snapshot_device_available); + /* For success case, the suspend path will release the lock */ + Unlock: + mutex_unlock(&pm_mutex); + pr_debug("PM: Hibernation image not present or could not be loaded.\n"); + return error; + Close_Finish: + swsusp_close(FMODE_READ); + goto Finish; +} + +late_initcall_sync(software_resume); + + +static const char * const hibernation_modes[] = { + [HIBERNATION_PLATFORM] = "platform", + [HIBERNATION_SHUTDOWN] = "shutdown", + [HIBERNATION_REBOOT] = "reboot", +#ifdef CONFIG_SUSPEND + [HIBERNATION_SUSPEND] = "suspend", +#endif +}; + +/* + * /sys/power/disk - Control hibernation mode. + * + * Hibernation can be handled in several ways. There are a few different ways + * to put the system into the sleep state: using the platform driver (e.g. ACPI + * or other hibernation_ops), powering it off or rebooting it (for testing + * mostly). + * + * The sysfs file /sys/power/disk provides an interface for selecting the + * hibernation mode to use. Reading from this file causes the available modes + * to be printed. There are 3 modes that can be supported: + * + * 'platform' + * 'shutdown' + * 'reboot' + * + * If a platform hibernation driver is in use, 'platform' will be supported + * and will be used by default. Otherwise, 'shutdown' will be used by default. + * The selected option (i.e. the one corresponding to the current value of + * hibernation_mode) is enclosed by a square bracket. + * + * To select a given hibernation mode it is necessary to write the mode's + * string representation (as returned by reading from /sys/power/disk) back + * into /sys/power/disk. + */ + +static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int i; + char *start = buf; + + if (!hibernation_available()) + return sprintf(buf, "[disabled]\n"); + + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (!hibernation_modes[i]) + continue; + switch (i) { + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif + break; + case HIBERNATION_PLATFORM: + if (hibernation_ops) + break; + /* not a valid mode, continue with loop */ + continue; + } + if (i == hibernation_mode) + buf += sprintf(buf, "[%s] ", hibernation_modes[i]); + else + buf += sprintf(buf, "%s ", hibernation_modes[i]); + } + buf += sprintf(buf, "\n"); + return buf-start; +} + +static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = 0; + int i; + int len; + char *p; + int mode = HIBERNATION_INVALID; + + if (!hibernation_available()) + return -EPERM; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + lock_system_sleep(); + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (len == strlen(hibernation_modes[i]) + && !strncmp(buf, hibernation_modes[i], len)) { + mode = i; + break; + } + } + if (mode != HIBERNATION_INVALID) { + switch (mode) { + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif + hibernation_mode = mode; + break; + case HIBERNATION_PLATFORM: + if (hibernation_ops) + hibernation_mode = mode; + else + error = -EINVAL; + } + } else + error = -EINVAL; + + if (!error) + pr_debug("PM: Hibernation mode set to '%s'\n", + hibernation_modes[mode]); + unlock_system_sleep(); + return error ? error : n; +} + +power_attr(disk); + +static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), + MINOR(swsusp_resume_device)); +} + +static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + dev_t res; + int len = n; + char *name; + + if (len && buf[len-1] == '\n') + len--; + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; + + res = name_to_dev_t(name); + kfree(name); + if (!res) + return -EINVAL; + + lock_system_sleep(); + swsusp_resume_device = res; + unlock_system_sleep(); + printk(KERN_INFO "PM: Starting manual resume from disk\n"); + noresume = 0; + software_resume(); + return n; +} + +power_attr(resume); + +static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", image_size); +} + +static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long size; + + if (sscanf(buf, "%lu", &size) == 1) { + image_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(image_size); + +static ssize_t reserved_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", reserved_size); +} + +static ssize_t reserved_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long size; + + if (sscanf(buf, "%lu", &size) == 1) { + reserved_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(reserved_size); + +static struct attribute * g[] = { + &disk_attr.attr, + &resume_attr.attr, + &image_size_attr.attr, + &reserved_size_attr.attr, + NULL, +}; + + +static struct attribute_group attr_group = { + .attrs = g, +}; + + +static int __init pm_disk_init(void) +{ + return sysfs_create_group(power_kobj, &attr_group); +} + +core_initcall(pm_disk_init); + + +static int __init resume_setup(char *str) +{ + if (noresume) + return 1; + + strncpy( resume_file, str, 255 ); + return 1; +} + +static int __init resume_offset_setup(char *str) +{ + unsigned long long offset; + + if (noresume) + return 1; + + if (sscanf(str, "%llu", &offset) == 1) + swsusp_resume_block = offset; + + return 1; +} + +static int __init hibernate_setup(char *str) +{ + if (!strncmp(str, "noresume", 8)) + noresume = 1; + else if (!strncmp(str, "nocompress", 10)) + nocompress = 1; + else if (!strncmp(str, "no", 2)) { + noresume = 1; + nohibernate = 1; + } + return 1; +} + +static int __init noresume_setup(char *str) +{ + noresume = 1; + return 1; +} + +static int __init resumewait_setup(char *str) +{ + resume_wait = 1; + return 1; +} + +static int __init resumedelay_setup(char *str) +{ + int rc = kstrtouint(str, 0, &resume_delay); + + if (rc) + return rc; + return 1; +} + +static int __init nohibernate_setup(char *str) +{ + noresume = 1; + nohibernate = 1; + return 1; +} + +static int __init kaslr_nohibernate_setup(char *str) +{ + return nohibernate_setup(str); +} + +__setup("noresume", noresume_setup); +__setup("resume_offset=", resume_offset_setup); +__setup("resume=", resume_setup); +__setup("hibernate=", hibernate_setup); +__setup("resumewait", resumewait_setup); +__setup("resumedelay=", resumedelay_setup); +__setup("nohibernate", nohibernate_setup); +__setup("kaslr", kaslr_nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 6a6d5eb3524..8e90f330f13 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -3,32 +3,23 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * + * * This file is released under the GPLv2 * */ -#include <linux/module.h> -#include <linux/suspend.h> +#include <linux/export.h> #include <linux/kobject.h> #include <linux/string.h> -#include <linux/delay.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/console.h> -#include <linux/cpu.h> #include <linux/resume-trace.h> -#include <linux/freezer.h> -#include <linux/vmstat.h> -#include <linux/syscalls.h> +#include <linux/workqueue.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include "power.h" DEFINE_MUTEX(pm_mutex); -unsigned int pm_flags; -EXPORT_SYMBOL(pm_flags); - #ifdef CONFIG_PM_SLEEP /* Routines for PM-transition notifications */ @@ -49,23 +40,40 @@ EXPORT_SYMBOL_GPL(unregister_pm_notifier); int pm_notifier_call_chain(unsigned long val) { - return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) - == NOTIFY_BAD) ? -EINVAL : 0; + int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); + + return notifier_to_errno(ret); } -#ifdef CONFIG_PM_DEBUG -int pm_test_level = TEST_NONE; +/* If set, devices may be suspended and resumed asynchronously. */ +int pm_async_enabled = 1; -static int suspend_test(int level) +static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) { - if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); - mdelay(5000); - return 1; - } - return 0; + return sprintf(buf, "%d\n", pm_async_enabled); +} + +static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_async_enabled = val; + return n; } +power_attr(pm_async); + +#ifdef CONFIG_PM_DEBUG +int pm_test_level = TEST_NONE; + static const char * const pm_tests[__TEST_AFTER_LAST] = { [TEST_NONE] = "none", [TEST_CORE] = "core", @@ -108,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, p = memchr(buf, '\n', n); len = p ? p - buf : n; - mutex_lock(&pm_mutex); + lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) @@ -118,352 +126,391 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, break; } - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error ? error : n; } power_attr(pm_test); -#else /* !CONFIG_PM_DEBUG */ -static inline int suspend_test(int level) { return 0; } -#endif /* !CONFIG_PM_DEBUG */ +#endif /* CONFIG_PM_DEBUG */ -#endif /* CONFIG_PM_SLEEP */ +#ifdef CONFIG_DEBUG_FS +static char *suspend_step_name(enum suspend_stat_step step) +{ + switch (step) { + case SUSPEND_FREEZE: + return "freeze"; + case SUSPEND_PREPARE: + return "prepare"; + case SUSPEND_SUSPEND: + return "suspend"; + case SUSPEND_SUSPEND_NOIRQ: + return "suspend_noirq"; + case SUSPEND_RESUME_NOIRQ: + return "resume_noirq"; + case SUSPEND_RESUME: + return "resume"; + default: + return ""; + } +} -#ifdef CONFIG_SUSPEND +static int suspend_stats_show(struct seq_file *s, void *unused) +{ + int i, index, last_dev, last_errno, last_step; + + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; + last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; + last_errno %= REC_FAILED_NUM; + last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; + last_step %= REC_FAILED_NUM; + seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" + "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", + "success", suspend_stats.success, + "fail", suspend_stats.fail, + "failed_freeze", suspend_stats.failed_freeze, + "failed_prepare", suspend_stats.failed_prepare, + "failed_suspend", suspend_stats.failed_suspend, + "failed_suspend_late", + suspend_stats.failed_suspend_late, + "failed_suspend_noirq", + suspend_stats.failed_suspend_noirq, + "failed_resume", suspend_stats.failed_resume, + "failed_resume_early", + suspend_stats.failed_resume_early, + "failed_resume_noirq", + suspend_stats.failed_resume_noirq); + seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", + suspend_stats.failed_devs[last_dev]); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_dev + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-s\n", + suspend_stats.failed_devs[index]); + } + seq_printf(s, " last_failed_errno:\t%-d\n", + suspend_stats.errno[last_errno]); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_errno + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-d\n", + suspend_stats.errno[index]); + } + seq_printf(s, " last_failed_step:\t%-s\n", + suspend_step_name( + suspend_stats.failed_steps[last_step])); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_step + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-s\n", + suspend_step_name( + suspend_stats.failed_steps[index])); + } -/* This is just an arbitrary number */ -#define FREE_PAGE_NUMBER (100) + return 0; +} -static struct platform_suspend_ops *suspend_ops; +static int suspend_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, suspend_stats_show, NULL); +} -/** - * suspend_set_ops - Set the global suspend method table. - * @ops: Pointer to ops structure. - */ +static const struct file_operations suspend_stats_operations = { + .open = suspend_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; -void suspend_set_ops(struct platform_suspend_ops *ops) +static int __init pm_debugfs_init(void) { - mutex_lock(&pm_mutex); - suspend_ops = ops; - mutex_unlock(&pm_mutex); + debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, + NULL, NULL, &suspend_stats_operations); + return 0; } -/** - * suspend_valid_only_mem - generic memory-only valid callback +late_initcall(pm_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ + +#endif /* CONFIG_PM_SLEEP */ + +#ifdef CONFIG_PM_SLEEP_DEBUG +/* + * pm_print_times: print time taken by devices to suspend and resume. * - * Platform drivers that implement mem suspend only and only need - * to check for that in their .valid callback can use this instead - * of rolling their own .valid callback. + * show() returns whether printing of suspend and resume times is enabled. + * store() accepts 0 or 1. 0 disables printing and 1 enables it. */ -int suspend_valid_only_mem(suspend_state_t state) +bool pm_print_times_enabled; + +static ssize_t pm_print_times_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { - return state == PM_SUSPEND_MEM; + return sprintf(buf, "%d\n", pm_print_times_enabled); } -/** - * suspend_prepare - Do prep work before entering low-power state. - * - * This is common code that is called for each state that we're entering. - * Run suspend notifiers, allocate a console and stop all processes. - */ -static int suspend_prepare(void) +static ssize_t pm_print_times_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) { - int error; - unsigned int free_pages; - - if (!suspend_ops || !suspend_ops->enter) - return -EPERM; + unsigned long val; - pm_prepare_console(); + if (kstrtoul(buf, 10, &val)) + return -EINVAL; - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (error) - goto Finish; + if (val > 1) + return -EINVAL; - if (suspend_freeze_processes()) { - error = -EAGAIN; - goto Thaw; - } - - free_pages = global_page_state(NR_FREE_PAGES); - if (free_pages < FREE_PAGE_NUMBER) { - pr_debug("PM: free some memory\n"); - shrink_all_memory(FREE_PAGE_NUMBER - free_pages); - if (nr_free_pages() < FREE_PAGE_NUMBER) { - error = -ENOMEM; - printk(KERN_ERR "PM: No enough memory\n"); - } - } - if (!error) - return 0; - - Thaw: - suspend_thaw_processes(); - Finish: - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); - return error; + pm_print_times_enabled = !!val; + return n; } -/* default implementation */ -void __attribute__ ((weak)) arch_suspend_disable_irqs(void) -{ - local_irq_disable(); -} +power_attr(pm_print_times); -/* default implementation */ -void __attribute__ ((weak)) arch_suspend_enable_irqs(void) +static inline void pm_print_times_init(void) { - local_irq_enable(); + pm_print_times_enabled = !!initcall_debug; } +#else /* !CONFIG_PP_SLEEP_DEBUG */ +static inline void pm_print_times_init(void) {} +#endif /* CONFIG_PM_SLEEP_DEBUG */ + +struct kobject *power_kobj; /** - * suspend_enter - enter the desired system sleep state. - * @state: state to enter + * state - control system sleep states. + * + * show() returns available sleep state labels, which may be "mem", "standby", + * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a + * description of what they mean. * - * This function should be called after devices have been suspended. + * store() accepts one of those strings, translates it into the proper + * enumerated value, and initiates a suspend transition. */ -static int suspend_enter(suspend_state_t state) +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) { - int error = 0; - - arch_suspend_disable_irqs(); - BUG_ON(!irqs_disabled()); - - if ((error = device_power_down(PMSG_SUSPEND))) { - printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Done; - } + char *s = buf; +#ifdef CONFIG_SUSPEND + suspend_state_t i; - if (!suspend_test(TEST_CORE)) - error = suspend_ops->enter(state); + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (pm_states[i].state) + s += sprintf(s,"%s ", pm_states[i].label); - device_power_up(); - Done: - arch_suspend_enable_irqs(); - BUG_ON(irqs_disabled()); - return error; +#endif + if (hibernation_available()) + s += sprintf(s, "disk "); + if (s != buf) + /* convert the last space to a newline */ + *(s-1) = '\n'; + return (s - buf); } -/** - * suspend_devices_and_enter - suspend devices and enter the desired system - * sleep state. - * @state: state to enter - */ -int suspend_devices_and_enter(suspend_state_t state) +static suspend_state_t decode_state(const char *buf, size_t n) { - int error; - - if (!suspend_ops) - return -ENOSYS; +#ifdef CONFIG_SUSPEND + suspend_state_t state = PM_SUSPEND_MIN; + struct pm_sleep_state *s; +#endif + char *p; + int len; - if (suspend_ops->begin) { - error = suspend_ops->begin(state); - if (error) - goto Close; - } - suspend_console(); - error = device_suspend(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Resume_console; - } + p = memchr(buf, '\n', n); + len = p ? p - buf : n; - if (suspend_test(TEST_DEVICES)) - goto Resume_devices; + /* Check hibernation first. */ + if (len == 4 && !strncmp(buf, "disk", len)) + return PM_SUSPEND_MAX; - if (suspend_ops->prepare) { - error = suspend_ops->prepare(); - if (error) - goto Resume_devices; - } +#ifdef CONFIG_SUSPEND + for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) + if (s->state && len == strlen(s->label) + && !strncmp(buf, s->label, len)) + return s->state; +#endif - if (suspend_test(TEST_PLATFORM)) - goto Finish; - - error = disable_nonboot_cpus(); - if (!error && !suspend_test(TEST_CPUS)) - suspend_enter(state); - - enable_nonboot_cpus(); - Finish: - if (suspend_ops->finish) - suspend_ops->finish(); - Resume_devices: - device_resume(); - Resume_console: - resume_console(); - Close: - if (suspend_ops->end) - suspend_ops->end(); - return error; + return PM_SUSPEND_ON; } -/** - * suspend_finish - Do final work before exiting suspend sequence. - * - * Call platform code to clean up, restart processes, and free the - * console that we've allocated. This is not called for suspend-to-disk. - */ -static void suspend_finish(void) +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) { - suspend_thaw_processes(); - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); -} - + suspend_state_t state; + int error; + error = pm_autosleep_lock(); + if (error) + return error; + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } -static const char * const pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_STANDBY] = "standby", - [PM_SUSPEND_MEM] = "mem", -}; + state = decode_state(buf, n); + if (state < PM_SUSPEND_MAX) + error = pm_suspend(state); + else if (state == PM_SUSPEND_MAX) + error = hibernate(); + else + error = -EINVAL; -static inline int valid_state(suspend_state_t state) -{ - /* All states need lowlevel support and need to be valid - * to the lowlevel implementation, no valid callback - * implies that none are valid. */ - if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state)) - return 0; - return 1; + out: + pm_autosleep_unlock(); + return error ? error : n; } +power_attr(state); -/** - * enter_state - Do common work of entering low-power state. - * @state: pm_state structure for state we're entering. +#ifdef CONFIG_PM_SLEEP +/* + * The 'wakeup_count' attribute, along with the functions defined in + * drivers/base/power/wakeup.c, provides a means by which wakeup events can be + * handled in a non-racy way. + * + * If a wakeup event occurs when the system is in a sleep state, it simply is + * woken up. In turn, if an event that would wake the system up from a sleep + * state occurs when it is undergoing a transition to that sleep state, the + * transition should be aborted. Moreover, if such an event occurs when the + * system is in the working state, an attempt to start a transition to the + * given sleep state should fail during certain period after the detection of + * the event. Using the 'state' attribute alone is not sufficient to satisfy + * these requirements, because a wakeup event may occur exactly when 'state' + * is being written to and may be delivered to user space right before it is + * frozen, so the event will remain only partially processed until the system is + * woken up by another event. In particular, it won't cause the transition to + * a sleep state to be aborted. * - * Make sure we're the only ones trying to enter a sleep state. Fail - * if someone has beat us to it, since we don't want anything weird to - * happen when we wake up. - * Then, do the setup for suspend, enter the state, and cleaup (after - * we've woken up). + * This difficulty may be overcome if user space uses 'wakeup_count' before + * writing to 'state'. It first should read from 'wakeup_count' and store + * the read value. Then, after carrying out its own preparations for the system + * transition to a sleep state, it should write the stored value to + * 'wakeup_count'. If that fails, at least one wakeup event has occurred since + * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it + * is allowed to write to 'state', but the transition will be aborted if there + * are any wakeup events detected after 'wakeup_count' was written to. */ -static int enter_state(suspend_state_t state) -{ - int error; - if (!valid_state(state)) - return -ENODEV; +static ssize_t wakeup_count_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + unsigned int val; - if (!mutex_trylock(&pm_mutex)) - return -EBUSY; + return pm_get_wakeup_count(&val, true) ? + sprintf(buf, "%u\n", val) : -EINTR; +} - printk(KERN_INFO "PM: Syncing filesystems ... "); - sys_sync(); - printk("done.\n"); +static ssize_t wakeup_count_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned int val; + int error; - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); - error = suspend_prepare(); + error = pm_autosleep_lock(); if (error) - goto Unlock; + return error; - if (suspend_test(TEST_FREEZER)) - goto Finish; + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } - pr_debug("PM: Entering %s sleep\n", pm_states[state]); - error = suspend_devices_and_enter(state); + error = -EINVAL; + if (sscanf(buf, "%u", &val) == 1) { + if (pm_save_wakeup_count(val)) + error = n; + else + pm_print_active_wakeup_sources(); + } - Finish: - pr_debug("PM: Finishing wakeup.\n"); - suspend_finish(); - Unlock: - mutex_unlock(&pm_mutex); + out: + pm_autosleep_unlock(); return error; } +power_attr(wakeup_count); -/** - * pm_suspend - Externally visible function for suspending system. - * @state: Enumerated value of state to enter. - * - * Determine whether or not value is within range, get state - * structure, and enter (above). - */ - -int pm_suspend(suspend_state_t state) +#ifdef CONFIG_PM_AUTOSLEEP +static ssize_t autosleep_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { - if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) - return enter_state(state); - return -EINVAL; -} + suspend_state_t state = pm_autosleep_state(); -EXPORT_SYMBOL(pm_suspend); + if (state == PM_SUSPEND_ON) + return sprintf(buf, "off\n"); -#endif /* CONFIG_SUSPEND */ - -struct kobject *power_kobj; - -/** - * state - control system power state. - * - * show() returns what states are supported, which is hard-coded to - * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and - * 'disk' (Suspend-to-Disk). - * - * store() accepts one of those strings, translates it into the - * proper enumerated value, and initiates a suspend transition. - */ - -static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - char *s = buf; #ifdef CONFIG_SUSPEND - int i; - - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i] && valid_state(i)) - s += sprintf(s,"%s ", pm_states[i]); - } + if (state < PM_SUSPEND_MAX) + return sprintf(buf, "%s\n", pm_states[state].state ? + pm_states[state].label : "error"); #endif #ifdef CONFIG_HIBERNATION - s += sprintf(s, "%s\n", "disk"); + return sprintf(buf, "disk\n"); #else - if (s != buf) - /* convert the last space to a newline */ - *(s-1) = '\n'; + return sprintf(buf, "error"); #endif - return (s - buf); } -static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) +static ssize_t autosleep_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) { -#ifdef CONFIG_SUSPEND - suspend_state_t state = PM_SUSPEND_STANDBY; - const char * const *s; -#endif - char *p; - int len; - int error = -EINVAL; + suspend_state_t state = decode_state(buf, n); + int error; - p = memchr(buf, '\n', n); - len = p ? p - buf : n; + if (state == PM_SUSPEND_ON + && strcmp(buf, "off") && strcmp(buf, "off\n")) + return -EINVAL; - /* First, check if we are requested to hibernate */ - if (len == 4 && !strncmp(buf, "disk", len)) { - error = hibernate(); - goto Exit; - } + error = pm_autosleep_set_state(state); + return error ? error : n; +} -#ifdef CONFIG_SUSPEND - for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) - break; - } - if (state < PM_SUSPEND_MAX && *s) - error = enter_state(state); -#endif +power_attr(autosleep); +#endif /* CONFIG_PM_AUTOSLEEP */ - Exit: +#ifdef CONFIG_PM_WAKELOCKS +static ssize_t wake_lock_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_show_wakelocks(buf, true); +} + +static ssize_t wake_lock_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = pm_wake_lock(buf); return error ? error : n; } -power_attr(state); +power_attr(wake_lock); + +static ssize_t wake_unlock_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_show_wakelocks(buf, false); +} + +static ssize_t wake_unlock_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = pm_wake_unlock(buf); + return error ? error : n; +} + +power_attr(wake_unlock); + +#endif /* CONFIG_PM_WAKELOCKS */ +#endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_TRACE int pm_trace_enabled; @@ -482,22 +529,85 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, if (sscanf(buf, "%d", &val) == 1) { pm_trace_enabled = !!val; + if (pm_trace_enabled) { + pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n" + "PM: Correct system time has to be restored manually after resume.\n"); + } return n; } return -EINVAL; } power_attr(pm_trace); + +static ssize_t pm_trace_dev_match_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return show_trace_dev_match(buf, PAGE_SIZE); +} + +static ssize_t +pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + return -EINVAL; +} + +power_attr(pm_trace_dev_match); + #endif /* CONFIG_PM_TRACE */ +#ifdef CONFIG_FREEZER +static ssize_t pm_freeze_timeout_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", freeze_timeout_msecs); +} + +static ssize_t pm_freeze_timeout_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + freeze_timeout_msecs = val; + return n; +} + +power_attr(pm_freeze_timeout); + +#endif /* CONFIG_FREEZER*/ + static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE &pm_trace_attr.attr, + &pm_trace_dev_match_attr.attr, +#endif +#ifdef CONFIG_PM_SLEEP + &pm_async_attr.attr, + &wakeup_count_attr.attr, +#ifdef CONFIG_PM_AUTOSLEEP + &autosleep_attr.attr, #endif -#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) +#ifdef CONFIG_PM_WAKELOCKS + &wake_lock_attr.attr, + &wake_unlock_attr.attr, +#endif +#ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif +#ifdef CONFIG_PM_SLEEP_DEBUG + &pm_print_times_attr.attr, +#endif +#endif +#ifdef CONFIG_FREEZER + &pm_freeze_timeout_attr.attr, +#endif NULL, }; @@ -505,13 +615,35 @@ static struct attribute_group attr_group = { .attrs = g, }; +#ifdef CONFIG_PM_RUNTIME +struct workqueue_struct *pm_wq; +EXPORT_SYMBOL_GPL(pm_wq); + +static int __init pm_start_workqueue(void) +{ + pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); + + return pm_wq ? 0 : -ENOMEM; +} +#else +static inline int pm_start_workqueue(void) { return 0; } +#endif static int __init pm_init(void) { + int error = pm_start_workqueue(); + if (error) + return error; + hibernate_image_size_init(); + hibernate_reserved_size_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; - return sysfs_create_group(power_kobj, &attr_group); + error = sysfs_create_group(power_kobj, &attr_group); + if (error) + return error; + pm_print_times_init(); + return pm_autosleep_init(); } core_initcall(pm_init); diff --git a/kernel/power/pm.c b/kernel/power/pm.c deleted file mode 100644 index 60c73fa670d..00000000000 --- a/kernel/power/pm.c +++ /dev/null @@ -1,205 +0,0 @@ -/* - * pm.c - Power management interface - * - * Copyright (C) 2000 Andrew Henroid - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#include <linux/init.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/pm.h> -#include <linux/pm_legacy.h> -#include <linux/interrupt.h> -#include <linux/mutex.h> - -/* - * Locking notes: - * pm_devs_lock can be a semaphore providing pm ops are not called - * from an interrupt handler (already a bad idea so no change here). Each - * change must be protected so that an unlink of an entry doesn't clash - * with a pm send - which is permitted to sleep in the current architecture - * - * Module unloads clashing with pm events now work out safely, the module - * unload path will block until the event has been sent. It may well block - * until a resume but that will be fine. - */ - -static DEFINE_MUTEX(pm_devs_lock); -static LIST_HEAD(pm_devs); - -/** - * pm_register - register a device with power management - * @type: device type - * @id: device ID - * @callback: callback function - * - * Add a device to the list of devices that wish to be notified about - * power management events. A &pm_dev structure is returned on success, - * on failure the return is %NULL. - * - * The callback function will be called in process context and - * it may sleep. - */ - -struct pm_dev *pm_register(pm_dev_t type, - unsigned long id, - pm_callback callback) -{ - struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); - if (dev) { - dev->type = type; - dev->id = id; - dev->callback = callback; - - mutex_lock(&pm_devs_lock); - list_add(&dev->entry, &pm_devs); - mutex_unlock(&pm_devs_lock); - } - return dev; -} - -/** - * pm_send - send request to a single device - * @dev: device to send to - * @rqst: power management request - * @data: data for the callback - * - * Issue a power management request to a given device. The - * %PM_SUSPEND and %PM_RESUME events are handled specially. The - * data field must hold the intended next state. No call is made - * if the state matches. - * - * BUGS: what stops two power management requests occurring in parallel - * and conflicting. - * - * WARNING: Calling pm_send directly is not generally recommended, in - * particular there is no locking against the pm_dev going away. The - * caller must maintain all needed locking or have 'inside knowledge' - * on the safety. Also remember that this function is not locked against - * pm_unregister. This means that you must handle SMP races on callback - * execution and unload yourself. - */ - -static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data) -{ - int status = 0; - unsigned long prev_state, next_state; - - if (in_interrupt()) - BUG(); - - switch (rqst) { - case PM_SUSPEND: - case PM_RESUME: - prev_state = dev->state; - next_state = (unsigned long) data; - if (prev_state != next_state) { - if (dev->callback) - status = (*dev->callback)(dev, rqst, data); - if (!status) { - dev->state = next_state; - dev->prev_state = prev_state; - } - } - else { - dev->prev_state = prev_state; - } - break; - default: - if (dev->callback) - status = (*dev->callback)(dev, rqst, data); - break; - } - return status; -} - -/* - * Undo incomplete request - */ -static void pm_undo_all(struct pm_dev *last) -{ - struct list_head *entry = last->entry.prev; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - if (dev->state != dev->prev_state) { - /* previous state was zero (running) resume or - * previous state was non-zero (suspended) suspend - */ - pm_request_t undo = (dev->prev_state - ? PM_SUSPEND:PM_RESUME); - pm_send(dev, undo, (void*) dev->prev_state); - } - entry = entry->prev; - } -} - -/** - * pm_send_all - send request to all managed devices - * @rqst: power management request - * @data: data for the callback - * - * Issue a power management request to a all devices. The - * %PM_SUSPEND events are handled specially. Any device is - * permitted to fail a suspend by returning a non zero (error) - * value from its callback function. If any device vetoes a - * suspend request then all other devices that have suspended - * during the processing of this request are restored to their - * previous state. - * - * WARNING: This function takes the pm_devs_lock. The lock is not dropped until - * the callbacks have completed. This prevents races against pm locking - * functions, races against module unload pm_unregister code. It does - * mean however that you must not issue pm_ functions within the callback - * or you will deadlock and users will hate you. - * - * Zero is returned on success. If a suspend fails then the status - * from the device that vetoes the suspend is returned. - * - * BUGS: what stops two power management requests occurring in parallel - * and conflicting. - */ - -int pm_send_all(pm_request_t rqst, void *data) -{ - struct list_head *entry; - - mutex_lock(&pm_devs_lock); - entry = pm_devs.next; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - if (dev->callback) { - int status = pm_send(dev, rqst, data); - if (status) { - /* return devices to previous state on - * failed suspend request - */ - if (rqst == PM_SUSPEND) - pm_undo_all(dev); - mutex_unlock(&pm_devs_lock); - return status; - } - } - entry = entry->next; - } - mutex_unlock(&pm_devs_lock); - return 0; -} - -EXPORT_SYMBOL(pm_register); -EXPORT_SYMBOL(pm_send_all); - diff --git a/kernel/power/power.h b/kernel/power/power.h index 700f44ec840..c60f13b5270 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -2,6 +2,7 @@ #include <linux/suspend_ioctls.h> #include <linux/utsname.h> #include <linux/freezer.h> +#include <linux/compiler.h> struct swsusp_info { struct new_utsname uts; @@ -11,9 +12,13 @@ struct swsusp_info { unsigned long image_pages; unsigned long pages; unsigned long size; -} __attribute__((aligned(PAGE_SIZE))); +} __aligned(PAGE_SIZE); #ifdef CONFIG_HIBERNATION +/* kernel/power/snapshot.c */ +extern void __init hibernate_reserved_size_init(void); +extern void __init hibernate_image_size_init(void); + #ifdef CONFIG_ARCH_HIBERNATION_HEADER /* Maximum size of architecture specific data in a hibernation header */ #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) @@ -45,15 +50,22 @@ static inline char *check_image_kernel(struct swsusp_info *info) */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) -/* kernel/power/disk.c */ +asmlinkage int swsusp_save(void); + +/* kernel/power/hibernate.c */ +extern bool freezer_test_done; + extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); -#endif -extern int pfn_is_nosave(unsigned long); +#else /* !CONFIG_HIBERNATION */ + +static inline void hibernate_reserved_size_init(void) {} +static inline void hibernate_image_size_init(void) {} +#endif /* !CONFIG_HIBERNATION */ -extern struct mutex pm_mutex; +extern int pfn_is_nosave(unsigned long); #define power_attr(_name) \ static struct kobj_attribute _name##_attr = { \ @@ -67,6 +79,8 @@ static struct kobj_attribute _name##_attr = { \ /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; +/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ +extern unsigned long reserved_size; extern int in_suspend; extern dev_t swsusp_resume_device; extern sector_t swsusp_resume_block; @@ -76,7 +90,7 @@ extern asmlinkage int swsusp_arch_resume(void); extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); -extern unsigned int count_data_pages(void); +extern int hibernate_preallocate_memory(void); /** * Auxiliary structure used for reading the snapshot image data and @@ -99,24 +113,12 @@ extern unsigned int count_data_pages(void); */ struct snapshot_handle { - loff_t offset; /* number of the last byte ready for reading - * or writing in the sequence - */ unsigned int cur; /* number of the block of PAGE_SIZE bytes the * next operation will refer to (ie. current) */ - unsigned int cur_offset; /* offset with respect to the current - * block (for the next operation) - */ - unsigned int prev; /* number of the block of PAGE_SIZE bytes that - * was the current one previously - */ void *buffer; /* address of the block to read from * or write to */ - unsigned int buf_offset; /* location to read from or write to, - * given as a displacement from 'buffer' - */ int sync_read; /* Set to one to notify the caller of * snapshot_write_next() that it may * need to call wait_on_bio_chain() @@ -127,12 +129,12 @@ struct snapshot_handle { * snapshot_read_next()/snapshot_write_next() is allowed to * read/write data after the function returns */ -#define data_of(handle) ((handle).buffer + (handle).buf_offset) +#define data_of(handle) ((handle).buffer) extern unsigned int snapshot_additional_pages(struct zone *zone); extern unsigned long snapshot_get_image_size(void); -extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); -extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); +extern int snapshot_read_next(struct snapshot_handle *handle); +extern int snapshot_write_next(struct snapshot_handle *handle); extern void snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); @@ -148,14 +150,27 @@ extern int swsusp_swap_in_use(void); * the image header. */ #define SF_PLATFORM_MODE 1 +#define SF_NOCOMPRESS_MODE 2 +#define SF_CRC32_MODE 4 -/* kernel/power/disk.c */ +/* kernel/power/hibernate.c */ extern int swsusp_check(void); -extern int swsusp_shrink_memory(void); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); -extern void swsusp_close(void); +extern void swsusp_close(fmode_t); +#ifdef CONFIG_SUSPEND +extern int swsusp_unmark(void); +#endif + +/* kernel/power/block_io.c */ +extern struct block_device *hib_resume_bdev; + +extern int hib_bio_read_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_bio_write_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_wait_on_bio_chain(struct bio **bio_chain); struct timeval; /* kernel/power/swsusp.c */ @@ -163,7 +178,14 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); #ifdef CONFIG_SUSPEND -/* kernel/power/main.c */ +struct pm_sleep_state { + const char *label; + suspend_state_t state; +}; + +/* kernel/power/suspend.c */ +extern struct pm_sleep_state pm_states[]; + extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ static inline int suspend_devices_and_enter(suspend_state_t state) @@ -172,13 +194,21 @@ static inline int suspend_devices_and_enter(suspend_state_t state) } #endif /* !CONFIG_SUSPEND */ +#ifdef CONFIG_PM_TEST_SUSPEND +/* kernel/power/suspend_test.c */ +extern void suspend_test_start(void); +extern void suspend_test_finish(const char *label); +#else /* !CONFIG_PM_TEST_SUSPEND */ +static inline void suspend_test_start(void) {} +static inline void suspend_test_finish(const char *label) {} +#endif /* !CONFIG_PM_TEST_SUSPEND */ + #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ extern int pm_notifier_call_chain(unsigned long val); #endif #ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void); int restore_highmem(void); #else static inline unsigned int count_highmem_pages(void) { return 0; } @@ -208,7 +238,25 @@ extern int pm_test_level; #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) { - return freeze_processes(); + int error; + + error = freeze_processes(); + /* + * freeze_processes() automatically thaws every task if freezing + * fails. So we need not do anything extra upon error. + */ + if (error) + return error; + + error = freeze_kernel_threads(); + /* + * freeze_kernel_threads() thaws only kernel threads upon freezing + * failure. So we have to thaw the userspace tasks ourselves. + */ + if (error) + thaw_processes(); + + return error; } static inline void suspend_thaw_processes(void) @@ -225,3 +273,30 @@ static inline void suspend_thaw_processes(void) { } #endif + +#ifdef CONFIG_PM_AUTOSLEEP + +/* kernel/power/autosleep.c */ +extern int pm_autosleep_init(void); +extern int pm_autosleep_lock(void); +extern void pm_autosleep_unlock(void); +extern suspend_state_t pm_autosleep_state(void); +extern int pm_autosleep_set_state(suspend_state_t state); + +#else /* !CONFIG_PM_AUTOSLEEP */ + +static inline int pm_autosleep_init(void) { return 0; } +static inline int pm_autosleep_lock(void) { return 0; } +static inline void pm_autosleep_unlock(void) {} +static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; } + +#endif /* !CONFIG_PM_AUTOSLEEP */ + +#ifdef CONFIG_PM_WAKELOCKS + +/* kernel/power/wakelock.c */ +extern ssize_t pm_show_wakelocks(char *buf, bool show_active); +extern int pm_wake_lock(const char *buf); +extern int pm_wake_unlock(const char *buf); + +#endif /* !CONFIG_PM_WAKELOCKS */ diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 678ec736076..7ef6866b521 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -10,6 +10,7 @@ #include <linux/pm.h> #include <linux/workqueue.h> #include <linux/reboot.h> +#include <linux/cpumask.h> /* * When the user hits Sys-Rq o to power down the machine this is the @@ -23,19 +24,20 @@ static void do_poweroff(struct work_struct *dummy) static DECLARE_WORK(poweroff_work, do_poweroff); -static void handle_poweroff(int key, struct tty_struct *tty) +static void handle_poweroff(int key) { - schedule_work(&poweroff_work); + /* run sysrq poweroff on boot cpu */ + schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); } static struct sysrq_key_op sysrq_poweroff_op = { .handler = handle_poweroff, - .help_msg = "powerOff", + .help_msg = "poweroff(o)", .action_msg = "Power Off", - .enable_mask = SYSRQ_ENABLE_BOOT, + .enable_mask = SYSRQ_ENABLE_BOOT, }; -static int pm_sysrq_init(void) +static int __init pm_sysrq_init(void) { register_sysrq_key('o', &sysrq_poweroff_op); return 0; diff --git a/kernel/power/process.c b/kernel/power/process.c index f1d0b345c9b..4ee194eb524 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -9,272 +9,220 @@ #undef DEBUG #include <linux/interrupt.h> +#include <linux/oom.h> #include <linux/suspend.h> #include <linux/module.h> #include <linux/syscalls.h> #include <linux/freezer.h> +#include <linux/delay.h> +#include <linux/workqueue.h> +#include <linux/kmod.h> +#include <trace/events/power.h> /* * Timeout for stopping processes */ -#define TIMEOUT (20 * HZ) +unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; -#define FREEZER_KERNEL_THREADS 0 -#define FREEZER_USER_SPACE 1 - -static inline int freezeable(struct task_struct * p) -{ - if ((p == current) || - (p->flags & PF_NOFREEZE) || - (p->exit_state != 0)) - return 0; - return 1; -} - -/* - * freezing is complete, mark current process as frozen - */ -static inline void frozen_process(void) -{ - if (!unlikely(current->flags & PF_NOFREEZE)) { - current->flags |= PF_FROZEN; - wmb(); - } - clear_freeze_flag(current); -} - -/* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) -{ - /* Hmm, should we be allowed to suspend when there are realtime - processes around? */ - long save; - - task_lock(current); - if (freezing(current)) { - frozen_process(); - task_unlock(current); - } else { - task_unlock(current); - return; - } - save = current->state; - pr_debug("%s entered refrigerator\n", current->comm); - - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!frozen(current)) - break; - schedule(); - } - pr_debug("%s left refrigerator\n", current->comm); - __set_current_state(save); -} - -static void fake_signal_wake_up(struct task_struct *p) -{ - unsigned long flags; - - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); -} - -static int has_mm(struct task_struct *p) -{ - return (p->mm && !(p->flags & PF_BORROWED_MM)); -} - -/** - * freeze_task - send a freeze request to given task - * @p: task to send the request to - * @with_mm_only: if set, the request will only be sent if the task has its - * own mm - * Return value: 0, if @with_mm_only is set and the task has no mm of its - * own or the task is frozen, 1, otherwise - * - * The freeze request is sent by seting the tasks's TIF_FREEZE flag and - * either sending a fake signal to it or waking it up, depending on whether - * or not it has its own mm (ie. it is a user land task). If @with_mm_only - * is set and the task has no mm of its own (ie. it is a kernel thread), - * its TIF_FREEZE flag should not be set. - * - * The task_lock() is necessary to prevent races with exit_mm() or - * use_mm()/unuse_mm() from occuring. - */ -static int freeze_task(struct task_struct *p, int with_mm_only) -{ - int ret = 1; - - task_lock(p); - if (freezing(p)) { - if (has_mm(p)) { - if (!signal_pending(p)) - fake_signal_wake_up(p); - } else { - if (with_mm_only) - ret = 0; - else - wake_up_state(p, TASK_INTERRUPTIBLE); - } - } else { - rmb(); - if (frozen(p)) { - ret = 0; - } else { - if (has_mm(p)) { - set_freeze_flag(p); - fake_signal_wake_up(p); - } else { - if (with_mm_only) { - ret = 0; - } else { - set_freeze_flag(p); - wake_up_state(p, TASK_INTERRUPTIBLE); - } - } - } - } - task_unlock(p); - return ret; -} - -static void cancel_freezing(struct task_struct *p) -{ - unsigned long flags; - - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - clear_freeze_flag(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - recalc_sigpending_and_wake(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } -} - -static int try_to_freeze_tasks(int freeze_user_space) +static int try_to_freeze_tasks(bool user_only) { struct task_struct *g, *p; unsigned long end_time; unsigned int todo; + bool wq_busy = false; struct timeval start, end; - s64 elapsed_csecs64; - unsigned int elapsed_csecs; + u64 elapsed_msecs64; + unsigned int elapsed_msecs; + bool wakeup = false; + int sleep_usecs = USEC_PER_MSEC; do_gettimeofday(&start); - end_time = jiffies + TIMEOUT; - do { + end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); + + if (!user_only) + freeze_workqueues_begin(); + + while (true) { todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (frozen(p) || !freezeable(p)) - continue; - - if (!freeze_task(p, freeze_user_space)) + if (p == current || !freeze_task(p)) continue; - /* - * Now that we've done set_freeze_flag, don't - * perturb a task in TASK_STOPPED or TASK_TRACED. - * It is "frozen enough". If the task does wake - * up, it will immediately call try_to_freeze. - */ - if (!task_is_stopped_or_traced(p) && - !freezer_should_skip(p)) + if (!freezer_should_skip(p)) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); - yield(); /* Yield is okay here */ - if (time_after(jiffies, end_time)) + + if (!user_only) { + wq_busy = freeze_workqueues_busy(); + todo += wq_busy; + } + + if (!todo || time_after(jiffies, end_time)) break; - } while (todo); + + if (pm_wakeup_pending()) { + wakeup = true; + break; + } + + /* + * We need to retry, but first give the freezing tasks some + * time to enter the refrigerator. Start with an initial + * 1 ms sleep followed by exponential backoff until 8 ms. + */ + usleep_range(sleep_usecs / 2, sleep_usecs); + if (sleep_usecs < 8 * USEC_PER_MSEC) + sleep_usecs *= 2; + } do_gettimeofday(&end); - elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_csecs64, NSEC_PER_SEC / 100); - elapsed_csecs = elapsed_csecs64; + elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); + do_div(elapsed_msecs64, NSEC_PER_MSEC); + elapsed_msecs = elapsed_msecs64; if (todo) { - /* This does not unfreeze processes that are already frozen - * (we have slightly ugly calling convention in that respect, - * and caller must call thaw_processes() if something fails), - * but it cleans up leftover PF_FREEZE requests. - */ printk("\n"); - printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " - "(%d tasks refusing to freeze):\n", - elapsed_csecs / 100, elapsed_csecs % 100, todo); - show_state(); - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - if (freezing(p) && !freezer_should_skip(p)) - printk(KERN_ERR " %s\n", p->comm); - cancel_freezing(p); - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " + "(%d tasks refusing to freeze, wq_busy=%d):\n", + wakeup ? "aborted" : "failed", + elapsed_msecs / 1000, elapsed_msecs % 1000, + todo - wq_busy, wq_busy); + + if (!wakeup) { + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p != current && !freezer_should_skip(p) + && freezing(p) && !frozen(p)) + sched_show_task(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + } } else { - printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, - elapsed_csecs % 100); + printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, + elapsed_msecs % 1000); } return todo ? -EBUSY : 0; } /** - * freeze_processes - tell processes to enter the refrigerator + * freeze_processes - Signal user space processes to enter the refrigerator. + * The current thread will not be frozen. The same process that calls + * freeze_processes must later call thaw_processes. + * + * On success, returns 0. On failure, -errno and system is fully thawed. */ int freeze_processes(void) { int error; + error = __usermodehelper_disable(UMH_FREEZING); + if (error) + return error; + + /* Make sure this task doesn't get frozen */ + current->flags |= PF_SUSPEND_TASK; + + if (!pm_freezing) + atomic_inc(&system_freezing_cnt); + printk("Freezing user space processes ... "); - error = try_to_freeze_tasks(FREEZER_USER_SPACE); + pm_freezing = true; + error = try_to_freeze_tasks(true); + if (!error) { + printk("done."); + __usermodehelper_set_disable_depth(UMH_DISABLED); + oom_killer_disable(); + } + printk("\n"); + BUG_ON(in_atomic()); + if (error) - goto Exit; - printk("done.\n"); + thaw_processes(); + return error; +} + +/** + * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. + * + * On success, returns 0. On failure, -errno and only the kernel threads are + * thawed, so as to give a chance to the caller to do additional cleanups + * (if any) before thawing the userspace tasks. So, it is the responsibility + * of the caller to thaw the userspace tasks, when the time is right. + */ +int freeze_kernel_threads(void) +{ + int error; printk("Freezing remaining freezable tasks ... "); - error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); - if (error) - goto Exit; - printk("done."); - Exit: - BUG_ON(in_atomic()); + pm_nosig_freezing = true; + error = try_to_freeze_tasks(false); + if (!error) + printk("done."); + printk("\n"); + BUG_ON(in_atomic()); + + if (error) + thaw_kernel_threads(); return error; } -static void thaw_tasks(int thaw_user_space) +void thaw_processes(void) { struct task_struct *g, *p; + struct task_struct *curr = current; - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (!freezeable(p)) - continue; + trace_suspend_resume(TPS("thaw_processes"), 0, true); + if (pm_freezing) + atomic_dec(&system_freezing_cnt); + pm_freezing = false; + pm_nosig_freezing = false; + + oom_killer_enable(); + + printk("Restarting tasks ... "); - if (!p->mm == thaw_user_space) - continue; + __usermodehelper_set_disable_depth(UMH_FREEZING); + thaw_workqueues(); - thaw_process(p); + read_lock(&tasklist_lock); + do_each_thread(g, p) { + /* No other threads should have PF_SUSPEND_TASK set */ + WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); + __thaw_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); + + WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); + curr->flags &= ~PF_SUSPEND_TASK; + + usermodehelper_enable(); + + schedule(); + printk("done.\n"); + trace_suspend_resume(TPS("thaw_processes"), 0, false); } -void thaw_processes(void) +void thaw_kernel_threads(void) { - printk("Restarting tasks ... "); - thaw_tasks(FREEZER_KERNEL_THREADS); - thaw_tasks(FREEZER_USER_SPACE); + struct task_struct *g, *p; + + pm_nosig_freezing = false; + printk("Restarting kernel threads ... "); + + thaw_workqueues(); + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) + __thaw_task(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + schedule(); printk("done.\n"); } - -EXPORT_SYMBOL(refrigerator); diff --git a/kernel/power/qos.c b/kernel/power/qos.c new file mode 100644 index 00000000000..884b7705886 --- /dev/null +++ b/kernel/power/qos.c @@ -0,0 +1,601 @@ +/* + * This module exposes the interface to kernel space for specifying + * QoS dependencies. It provides infrastructure for registration of: + * + * Dependents on a QoS value : register requests + * Watchers of QoS value : get notified when target QoS value changes + * + * This QoS design is best effort based. Dependents register their QoS needs. + * Watchers register to keep track of the current QoS needs of the system. + * + * There are 3 basic classes of QoS parameter: latency, timeout, throughput + * each have defined units: + * latency: usec + * timeout: usec <-- currently not used. + * throughput: kbs (kilo byte / sec) + * + * There are lists of pm_qos_objects each one wrapping requests, notifiers + * + * User mode requests on a QOS parameter register themselves to the + * subsystem by opening the device node /dev/... and writing there request to + * the node. As long as the process holds a file handle open to the node the + * client continues to be accounted for. Upon file release the usermode + * request is removed and a new qos target is computed. This way when the + * request that the application has is cleaned up when closes the file + * pointer or exits the pm_qos_object will get an opportunity to clean up. + * + * Mark Gross <mgross@linux.intel.com> + */ + +/*#define DEBUG*/ + +#include <linux/pm_qos.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/miscdevice.h> +#include <linux/string.h> +#include <linux/platform_device.h> +#include <linux/init.h> +#include <linux/kernel.h> + +#include <linux/uaccess.h> +#include <linux/export.h> +#include <trace/events/power.h> + +/* + * locking rule: all changes to constraints or notifiers lists + * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock + * held, taken with _irqsave. One lock to rule them all + */ +struct pm_qos_object { + struct pm_qos_constraints *constraints; + struct miscdevice pm_qos_power_miscdev; + char *name; +}; + +static DEFINE_SPINLOCK(pm_qos_lock); + +static struct pm_qos_object null_pm_qos; + +static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); +static struct pm_qos_constraints cpu_dma_constraints = { + .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), + .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .type = PM_QOS_MIN, + .notifiers = &cpu_dma_lat_notifier, +}; +static struct pm_qos_object cpu_dma_pm_qos = { + .constraints = &cpu_dma_constraints, + .name = "cpu_dma_latency", +}; + +static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); +static struct pm_qos_constraints network_lat_constraints = { + .list = PLIST_HEAD_INIT(network_lat_constraints.list), + .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .type = PM_QOS_MIN, + .notifiers = &network_lat_notifier, +}; +static struct pm_qos_object network_lat_pm_qos = { + .constraints = &network_lat_constraints, + .name = "network_latency", +}; + + +static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); +static struct pm_qos_constraints network_tput_constraints = { + .list = PLIST_HEAD_INIT(network_tput_constraints.list), + .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .type = PM_QOS_MAX, + .notifiers = &network_throughput_notifier, +}; +static struct pm_qos_object network_throughput_pm_qos = { + .constraints = &network_tput_constraints, + .name = "network_throughput", +}; + + +static struct pm_qos_object *pm_qos_array[] = { + &null_pm_qos, + &cpu_dma_pm_qos, + &network_lat_pm_qos, + &network_throughput_pm_qos +}; + +static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos); +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos); +static int pm_qos_power_open(struct inode *inode, struct file *filp); +static int pm_qos_power_release(struct inode *inode, struct file *filp); + +static const struct file_operations pm_qos_power_fops = { + .write = pm_qos_power_write, + .read = pm_qos_power_read, + .open = pm_qos_power_open, + .release = pm_qos_power_release, + .llseek = noop_llseek, +}; + +/* unlocked internal variant */ +static inline int pm_qos_get_value(struct pm_qos_constraints *c) +{ + if (plist_head_empty(&c->list)) + return c->no_constraint_value; + + switch (c->type) { + case PM_QOS_MIN: + return plist_first(&c->list)->prio; + + case PM_QOS_MAX: + return plist_last(&c->list)->prio; + + default: + /* runtime check for not using enum */ + BUG(); + return PM_QOS_DEFAULT_VALUE; + } +} + +s32 pm_qos_read_value(struct pm_qos_constraints *c) +{ + return c->target_value; +} + +static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) +{ + c->target_value = value; +} + +/** + * pm_qos_update_target - manages the constraints list and calls the notifiers + * if needed + * @c: constraints data struct + * @node: request to add to the list, to update or to remove + * @action: action to take on the constraints list + * @value: value of the request to add or update + * + * This function returns 1 if the aggregated constraint value has changed, 0 + * otherwise. + */ +int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, + enum pm_qos_req_action action, int value) +{ + unsigned long flags; + int prev_value, curr_value, new_value; + int ret; + + spin_lock_irqsave(&pm_qos_lock, flags); + prev_value = pm_qos_get_value(c); + if (value == PM_QOS_DEFAULT_VALUE) + new_value = c->default_value; + else + new_value = value; + + switch (action) { + case PM_QOS_REMOVE_REQ: + plist_del(node, &c->list); + break; + case PM_QOS_UPDATE_REQ: + /* + * to change the list, we atomically remove, reinit + * with new value and add, then see if the extremal + * changed + */ + plist_del(node, &c->list); + case PM_QOS_ADD_REQ: + plist_node_init(node, new_value); + plist_add(node, &c->list); + break; + default: + /* no action */ + ; + } + + curr_value = pm_qos_get_value(c); + pm_qos_set_value(c, curr_value); + + spin_unlock_irqrestore(&pm_qos_lock, flags); + + trace_pm_qos_update_target(action, prev_value, curr_value); + if (prev_value != curr_value) { + ret = 1; + if (c->notifiers) + blocking_notifier_call_chain(c->notifiers, + (unsigned long)curr_value, + NULL); + } else { + ret = 0; + } + return ret; +} + +/** + * pm_qos_flags_remove_req - Remove device PM QoS flags request. + * @pqf: Device PM QoS flags set to remove the request from. + * @req: Request to remove from the set. + */ +static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, + struct pm_qos_flags_request *req) +{ + s32 val = 0; + + list_del(&req->node); + list_for_each_entry(req, &pqf->list, node) + val |= req->flags; + + pqf->effective_flags = val; +} + +/** + * pm_qos_update_flags - Update a set of PM QoS flags. + * @pqf: Set of flags to update. + * @req: Request to add to the set, to modify, or to remove from the set. + * @action: Action to take on the set. + * @val: Value of the request to add or modify. + * + * Update the given set of PM QoS flags and call notifiers if the aggregate + * value has changed. Returns 1 if the aggregate constraint value has changed, + * 0 otherwise. + */ +bool pm_qos_update_flags(struct pm_qos_flags *pqf, + struct pm_qos_flags_request *req, + enum pm_qos_req_action action, s32 val) +{ + unsigned long irqflags; + s32 prev_value, curr_value; + + spin_lock_irqsave(&pm_qos_lock, irqflags); + + prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; + + switch (action) { + case PM_QOS_REMOVE_REQ: + pm_qos_flags_remove_req(pqf, req); + break; + case PM_QOS_UPDATE_REQ: + pm_qos_flags_remove_req(pqf, req); + case PM_QOS_ADD_REQ: + req->flags = val; + INIT_LIST_HEAD(&req->node); + list_add_tail(&req->node, &pqf->list); + pqf->effective_flags |= val; + break; + default: + /* no action */ + ; + } + + curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; + + spin_unlock_irqrestore(&pm_qos_lock, irqflags); + + trace_pm_qos_update_flags(action, prev_value, curr_value); + return prev_value != curr_value; +} + +/** + * pm_qos_request - returns current system wide qos expectation + * @pm_qos_class: identification of which qos value is requested + * + * This function returns the current target value. + */ +int pm_qos_request(int pm_qos_class) +{ + return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints); +} +EXPORT_SYMBOL_GPL(pm_qos_request); + +int pm_qos_request_active(struct pm_qos_request *req) +{ + return req->pm_qos_class != 0; +} +EXPORT_SYMBOL_GPL(pm_qos_request_active); + +static void __pm_qos_update_request(struct pm_qos_request *req, + s32 new_value) +{ + trace_pm_qos_update_request(req->pm_qos_class, new_value); + + if (new_value != req->node.prio) + pm_qos_update_target( + pm_qos_array[req->pm_qos_class]->constraints, + &req->node, PM_QOS_UPDATE_REQ, new_value); +} + +/** + * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout + * @work: work struct for the delayed work (timeout) + * + * This cancels the timeout request by falling back to the default at timeout. + */ +static void pm_qos_work_fn(struct work_struct *work) +{ + struct pm_qos_request *req = container_of(to_delayed_work(work), + struct pm_qos_request, + work); + + __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); +} + +/** + * pm_qos_add_request - inserts new qos request into the list + * @req: pointer to a preallocated handle + * @pm_qos_class: identifies which list of qos request to use + * @value: defines the qos request + * + * This function inserts a new entry in the pm_qos_class list of requested qos + * performance characteristics. It recomputes the aggregate QoS expectations + * for the pm_qos_class of parameters and initializes the pm_qos_request + * handle. Caller needs to save this handle for later use in updates and + * removal. + */ + +void pm_qos_add_request(struct pm_qos_request *req, + int pm_qos_class, s32 value) +{ + if (!req) /*guard against callers passing in null */ + return; + + if (pm_qos_request_active(req)) { + WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); + return; + } + req->pm_qos_class = pm_qos_class; + INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); + trace_pm_qos_add_request(pm_qos_class, value); + pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, + &req->node, PM_QOS_ADD_REQ, value); +} +EXPORT_SYMBOL_GPL(pm_qos_add_request); + +/** + * pm_qos_update_request - modifies an existing qos request + * @req : handle to list element holding a pm_qos request to use + * @value: defines the qos request + * + * Updates an existing qos request for the pm_qos_class of parameters along + * with updating the target pm_qos_class value. + * + * Attempts are made to make this code callable on hot code paths. + */ +void pm_qos_update_request(struct pm_qos_request *req, + s32 new_value) +{ + if (!req) /*guard against callers passing in null */ + return; + + if (!pm_qos_request_active(req)) { + WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); + return; + } + + cancel_delayed_work_sync(&req->work); + __pm_qos_update_request(req, new_value); +} +EXPORT_SYMBOL_GPL(pm_qos_update_request); + +/** + * pm_qos_update_request_timeout - modifies an existing qos request temporarily. + * @req : handle to list element holding a pm_qos request to use + * @new_value: defines the temporal qos request + * @timeout_us: the effective duration of this qos request in usecs. + * + * After timeout_us, this qos request is cancelled automatically. + */ +void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, + unsigned long timeout_us) +{ + if (!req) + return; + if (WARN(!pm_qos_request_active(req), + "%s called for unknown object.", __func__)) + return; + + cancel_delayed_work_sync(&req->work); + + trace_pm_qos_update_request_timeout(req->pm_qos_class, + new_value, timeout_us); + if (new_value != req->node.prio) + pm_qos_update_target( + pm_qos_array[req->pm_qos_class]->constraints, + &req->node, PM_QOS_UPDATE_REQ, new_value); + + schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us)); +} + +/** + * pm_qos_remove_request - modifies an existing qos request + * @req: handle to request list element + * + * Will remove pm qos request from the list of constraints and + * recompute the current target value for the pm_qos_class. Call this + * on slow code paths. + */ +void pm_qos_remove_request(struct pm_qos_request *req) +{ + if (!req) /*guard against callers passing in null */ + return; + /* silent return to keep pcm code cleaner */ + + if (!pm_qos_request_active(req)) { + WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + return; + } + + cancel_delayed_work_sync(&req->work); + + trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE); + pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, + &req->node, PM_QOS_REMOVE_REQ, + PM_QOS_DEFAULT_VALUE); + memset(req, 0, sizeof(*req)); +} +EXPORT_SYMBOL_GPL(pm_qos_remove_request); + +/** + * pm_qos_add_notifier - sets notification entry for changes to target value + * @pm_qos_class: identifies which qos target changes should be notified. + * @notifier: notifier block managed by caller. + * + * will register the notifier into a notification chain that gets called + * upon changes to the pm_qos_class target value. + */ +int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) +{ + int retval; + + retval = blocking_notifier_chain_register( + pm_qos_array[pm_qos_class]->constraints->notifiers, + notifier); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_qos_add_notifier); + +/** + * pm_qos_remove_notifier - deletes notification entry from chain. + * @pm_qos_class: identifies which qos target changes are notified. + * @notifier: notifier block to be removed. + * + * will remove the notifier from the notification chain that gets called + * upon changes to the pm_qos_class target value. + */ +int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) +{ + int retval; + + retval = blocking_notifier_chain_unregister( + pm_qos_array[pm_qos_class]->constraints->notifiers, + notifier); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); + +/* User space interface to PM QoS classes via misc devices */ +static int register_pm_qos_misc(struct pm_qos_object *qos) +{ + qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; + qos->pm_qos_power_miscdev.name = qos->name; + qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; + + return misc_register(&qos->pm_qos_power_miscdev); +} + +static int find_pm_qos_object_by_minor(int minor) +{ + int pm_qos_class; + + for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY; + pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { + if (minor == + pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) + return pm_qos_class; + } + return -1; +} + +static int pm_qos_power_open(struct inode *inode, struct file *filp) +{ + long pm_qos_class; + + pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); + if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) { + struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; + + return 0; + } + return -EPERM; +} + +static int pm_qos_power_release(struct inode *inode, struct file *filp) +{ + struct pm_qos_request *req; + + req = filp->private_data; + pm_qos_remove_request(req); + kfree(req); + + return 0; +} + + +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value; + unsigned long flags; + struct pm_qos_request *req = filp->private_data; + + if (!req) + return -EINVAL; + if (!pm_qos_request_active(req)) + return -EINVAL; + + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + +static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value; + struct pm_qos_request *req; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else { + int ret; + + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; + } + + req = filp->private_data; + pm_qos_update_request(req, value); + + return count; +} + + +static int __init pm_qos_power_init(void) +{ + int ret = 0; + int i; + + BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); + + for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { + ret = register_pm_qos_misc(pm_qos_array[i]); + if (ret < 0) { + printk(KERN_ERR "pm_qos_param: %s setup failed\n", + pm_qos_array[i]->name); + return ret; + } + } + + return ret; +} + +late_initcall(pm_qos_power_init); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5f91a07c4ea..1ea328aafdc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -3,7 +3,7 @@ * * This file provides system snapshot/restore functionality for swsusp. * - * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * * This file is released under the GPLv2. @@ -25,6 +25,9 @@ #include <linux/syscalls.h> #include <linux/console.h> #include <linux/highmem.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/compiler.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -38,6 +41,31 @@ static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); +/* + * Number of bytes to reserve for memory allocations made by device drivers + * from their ->freeze() and ->freeze_noirq() callbacks so that they don't + * cause image creation to fail (tunable via /sys/power/reserved_size). + */ +unsigned long reserved_size; + +void __init hibernate_reserved_size_init(void) +{ + reserved_size = SPARE_PAGES * PAGE_SIZE; +} + +/* + * Preferred image size in bytes (tunable via /sys/power/image_size). + * When it is set to N, swsusp will do its best to ensure the image + * size will not exceed N bytes, but if that is impossible, it will + * try to create the smallest image possible. + */ +unsigned long image_size; + +void __init hibernate_image_size_init(void) +{ + image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; +} + /* List of PBEs needed for restoring the pages that were allocated before * the suspend and included in the suspend image, but have also been * allocated by the "resume" kernel, so their contents cannot be written @@ -128,7 +156,7 @@ static inline void free_image_page(void *addr, int clear_nosave_free) struct linked_page { struct linked_page *next; char data[LINKED_PAGE_DATA_SIZE]; -} __attribute__((packed)); +} __packed; static inline void free_list_of_pages(struct linked_page *list, int clear_page_nosave) @@ -192,12 +220,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -static void chain_free(struct chain_allocator *ca, int clear_page_nosave) -{ - free_list_of_pages(ca->chain, clear_page_nosave); - memset(ca, 0, sizeof(struct chain_allocator)); -} - /** * Data types related to memory bitmaps. * @@ -205,8 +227,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * objects. The main list's elements are of type struct zone_bitmap * and each of them corresonds to one zone. For each zone bitmap * object there is a list of objects of type struct bm_block that - * represent each blocks of bit chunks in which information is - * stored. + * represent each blocks of bitmap in which information is stored. * * struct memory_bitmap contains a pointer to the main list of zone * bitmap objects, a struct bm_position used for browsing the bitmap, @@ -224,45 +245,36 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * pfns that correspond to the start and end of the represented zone. * * struct bm_block contains a pointer to the memory page in which - * information is stored (in the form of a block of bit chunks - * of type unsigned long each). It also contains the pfns that - * correspond to the start and end of the represented memory area and - * the number of bit chunks in the block. + * information is stored (in the form of a block of bitmap) + * It also contains the pfns that correspond to the start and end of + * the represented memory area. */ #define BM_END_OF_MAP (~0UL) -#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long)) -#define BM_BITS_PER_CHUNK (sizeof(long) << 3) -#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) +#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) struct bm_block { - struct bm_block *next; /* next element of the list */ + struct list_head hook; /* hook into a list of bitmap blocks */ unsigned long start_pfn; /* pfn represented by the first bit */ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ - unsigned int size; /* number of bit chunks */ - unsigned long *data; /* chunks of bits representing pages */ + unsigned long *data; /* bitmap representing pages */ }; -struct zone_bitmap { - struct zone_bitmap *next; /* next element of the list */ - unsigned long start_pfn; /* minimal pfn in this zone */ - unsigned long end_pfn; /* maximal pfn in this zone plus 1 */ - struct bm_block *bm_blocks; /* list of bitmap blocks */ - struct bm_block *cur_block; /* recently used bitmap block */ -}; +static inline unsigned long bm_block_bits(struct bm_block *bb) +{ + return bb->end_pfn - bb->start_pfn; +} /* strcut bm_position is used for browsing memory bitmaps */ struct bm_position { - struct zone_bitmap *zone_bm; struct bm_block *block; - int chunk; int bit; }; struct memory_bitmap { - struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ + struct list_head blocks; /* list of bitmap blocks */ struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects @@ -272,173 +284,191 @@ struct memory_bitmap { /* Functions that operate on memory bitmaps */ -static inline void memory_bm_reset_chunk(struct memory_bitmap *bm) -{ - bm->cur.chunk = 0; - bm->cur.bit = -1; -} - static void memory_bm_position_reset(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; - - zone_bm = bm->zone_bm_list; - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; - memory_bm_reset_chunk(bm); + bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); + bm->cur.bit = 0; } static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); /** * create_bm_block_list - create a list of block bitmap objects + * @pages - number of pages to track + * @list - list to put the allocated blocks into + * @ca - chain allocator to be used for allocating memory */ - -static inline struct bm_block * -create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) +static int create_bm_block_list(unsigned long pages, + struct list_head *list, + struct chain_allocator *ca) { - struct bm_block *bblist = NULL; + unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); while (nr_blocks-- > 0) { struct bm_block *bb; bb = chain_alloc(ca, sizeof(struct bm_block)); if (!bb) - return NULL; - - bb->next = bblist; - bblist = bb; + return -ENOMEM; + list_add(&bb->hook, list); } - return bblist; + + return 0; } +struct mem_extent { + struct list_head hook; + unsigned long start; + unsigned long end; +}; + /** - * create_zone_bm_list - create a list of zone bitmap objects + * free_mem_extents - free a list of memory extents + * @list - list of extents to empty */ +static void free_mem_extents(struct list_head *list) +{ + struct mem_extent *ext, *aux; -static inline struct zone_bitmap * -create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) + list_for_each_entry_safe(ext, aux, list, hook) { + list_del(&ext->hook); + kfree(ext); + } +} + +/** + * create_mem_extents - create a list of memory extents representing + * contiguous ranges of PFNs + * @list - list to put the extents into + * @gfp_mask - mask to use for memory allocations + */ +static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) { - struct zone_bitmap *zbmlist = NULL; + struct zone *zone; - while (nr_zones-- > 0) { - struct zone_bitmap *zbm; + INIT_LIST_HEAD(list); - zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); - if (!zbm) - return NULL; + for_each_populated_zone(zone) { + unsigned long zone_start, zone_end; + struct mem_extent *ext, *cur, *aux; + + zone_start = zone->zone_start_pfn; + zone_end = zone_end_pfn(zone); + + list_for_each_entry(ext, list, hook) + if (zone_start <= ext->end) + break; - zbm->next = zbmlist; - zbmlist = zbm; + if (&ext->hook == list || zone_end < ext->start) { + /* New extent is necessary */ + struct mem_extent *new_ext; + + new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask); + if (!new_ext) { + free_mem_extents(list); + return -ENOMEM; + } + new_ext->start = zone_start; + new_ext->end = zone_end; + list_add_tail(&new_ext->hook, &ext->hook); + continue; + } + + /* Merge this zone's range of PFNs with the existing one */ + if (zone_start < ext->start) + ext->start = zone_start; + if (zone_end > ext->end) + ext->end = zone_end; + + /* More merging may be possible */ + cur = ext; + list_for_each_entry_safe_continue(cur, aux, list, hook) { + if (zone_end < cur->start) + break; + if (zone_end < cur->end) + ext->end = cur->end; + list_del(&cur->hook); + kfree(cur); + } } - return zbmlist; + + return 0; } /** * memory_bm_create - allocate memory for a memory bitmap */ - static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) { struct chain_allocator ca; - struct zone *zone; - struct zone_bitmap *zone_bm; - struct bm_block *bb; - unsigned int nr; + struct list_head mem_extents; + struct mem_extent *ext; + int error; chain_init(&ca, gfp_mask, safe_needed); + INIT_LIST_HEAD(&bm->blocks); - /* Compute the number of zones */ - nr = 0; - for_each_zone(zone) - if (populated_zone(zone)) - nr++; - - /* Allocate the list of zones bitmap objects */ - zone_bm = create_zone_bm_list(nr, &ca); - bm->zone_bm_list = zone_bm; - if (!zone_bm) { - chain_free(&ca, PG_UNSAFE_CLEAR); - return -ENOMEM; - } - - /* Initialize the zone bitmap objects */ - for_each_zone(zone) { - unsigned long pfn; + error = create_mem_extents(&mem_extents, gfp_mask); + if (error) + return error; - if (!populated_zone(zone)) - continue; + list_for_each_entry(ext, &mem_extents, hook) { + struct bm_block *bb; + unsigned long pfn = ext->start; + unsigned long pages = ext->end - ext->start; - zone_bm->start_pfn = zone->zone_start_pfn; - zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages; - /* Allocate the list of bitmap block objects */ - nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - bb = create_bm_block_list(nr, &ca); - zone_bm->bm_blocks = bb; - zone_bm->cur_block = bb; - if (!bb) - goto Free; + bb = list_entry(bm->blocks.prev, struct bm_block, hook); - nr = zone->spanned_pages; - pfn = zone->zone_start_pfn; - /* Initialize the bitmap block objects */ - while (bb) { - unsigned long *ptr; + error = create_bm_block_list(pages, bm->blocks.prev, &ca); + if (error) + goto Error; - ptr = get_image_page(gfp_mask, safe_needed); - bb->data = ptr; - if (!ptr) - goto Free; + list_for_each_entry_continue(bb, &bm->blocks, hook) { + bb->data = get_image_page(gfp_mask, safe_needed); + if (!bb->data) { + error = -ENOMEM; + goto Error; + } bb->start_pfn = pfn; - if (nr >= BM_BITS_PER_BLOCK) { + if (pages >= BM_BITS_PER_BLOCK) { pfn += BM_BITS_PER_BLOCK; - bb->size = BM_CHUNKS_PER_BLOCK; - nr -= BM_BITS_PER_BLOCK; + pages -= BM_BITS_PER_BLOCK; } else { /* This is executed only once in the loop */ - pfn += nr; - bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK); + pfn += pages; } bb->end_pfn = pfn; - bb = bb->next; } - zone_bm = zone_bm->next; } + bm->p_list = ca.chain; memory_bm_position_reset(bm); - return 0; + Exit: + free_mem_extents(&mem_extents); + return error; - Free: + Error: bm->p_list = ca.chain; memory_bm_free(bm, PG_UNSAFE_CLEAR); - return -ENOMEM; + goto Exit; } /** * memory_bm_free - free memory occupied by the memory bitmap @bm */ - static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { - struct zone_bitmap *zone_bm; + struct bm_block *bb; - /* Free the list of bit blocks for each zone_bitmap object */ - zone_bm = bm->zone_bm_list; - while (zone_bm) { - struct bm_block *bb; + list_for_each_entry(bb, &bm->blocks, hook) + if (bb->data) + free_image_page(bb->data, clear_nosave_free); - bb = zone_bm->bm_blocks; - while (bb) { - if (bb->data) - free_image_page(bb->data, clear_nosave_free); - bb = bb->next; - } - zone_bm = zone_bm->next; - } free_list_of_pages(bm->p_list, clear_nosave_free); - bm->zone_bm_list = NULL; + + INIT_LIST_HEAD(&bm->blocks); } /** @@ -446,40 +476,35 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * to given pfn. The cur_zone_bm member of @bm and the cur_block member * of @bm->cur_zone_bm are updated. */ - static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) { - struct zone_bitmap *zone_bm; struct bm_block *bb; - /* Check if the pfn is from the current zone */ - zone_bm = bm->cur.zone_bm; - if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = bm->zone_bm_list; - /* We don't assume that the zones are sorted by pfns */ - while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = zone_bm->next; - - if (!zone_bm) - return -EFAULT; - } - bm->cur.zone_bm = zone_bm; - } - /* Check if the pfn corresponds to the current bitmap block */ - bb = zone_bm->cur_block; + /* + * Check if the pfn corresponds to the current bitmap block and find + * the block where it fits if this is not the case. + */ + bb = bm->cur.block; if (pfn < bb->start_pfn) - bb = zone_bm->bm_blocks; + list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn) + break; - while (pfn >= bb->end_pfn) { - bb = bb->next; + if (pfn >= bb->end_pfn) + list_for_each_entry_continue(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn && pfn < bb->end_pfn) + break; - BUG_ON(!bb); - } - zone_bm->cur_block = bb; + if (&bb->hook == &bm->blocks) + return -EFAULT; + + /* The block has been found */ + bm->cur.block = bb; pfn -= bb->start_pfn; - *bit_nr = pfn % BM_BITS_PER_CHUNK; - *addr = bb->data + pfn / BM_BITS_PER_CHUNK; + bm->cur.bit = pfn + 1; + *bit_nr = pfn; + *addr = bb->data; return 0; } @@ -528,34 +553,12 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) return test_bit(bit, addr); } -/* Two auxiliary functions for memory_bm_next_pfn */ - -/* Find the first set bit in the given chunk, if there is one */ - -static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p) -{ - bit++; - while (bit < BM_BITS_PER_CHUNK) { - if (test_bit(bit, chunk_p)) - return bit; - - bit++; - } - return -1; -} - -/* Find a chunk containing some bits set in given block of bits */ - -static inline int next_chunk_in_block(int n, struct bm_block *bb) +static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) { - n++; - while (n < bb->size) { - if (bb->data[n]) - return n; + void *addr; + unsigned int bit; - n++; - } - return -1; + return !memory_bm_find_bit(bm, pfn, &addr, &bit); } /** @@ -569,42 +572,27 @@ static inline int next_chunk_in_block(int n, struct bm_block *bb) static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; struct bm_block *bb; - int chunk; int bit; + bb = bm->cur.block; do { - bb = bm->cur.block; - do { - chunk = bm->cur.chunk; - bit = bm->cur.bit; - do { - bit = next_bit_in_chunk(bit, bb->data + chunk); - if (bit >= 0) - goto Return_pfn; - - chunk = next_chunk_in_block(chunk, bb); - bit = -1; - } while (chunk >= 0); - bb = bb->next; - bm->cur.block = bb; - memory_bm_reset_chunk(bm); - } while (bb); - zone_bm = bm->cur.zone_bm->next; - if (zone_bm) { - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; - memory_bm_reset_chunk(bm); - } - } while (zone_bm); + bit = bm->cur.bit; + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + + bb = list_entry(bb->hook.next, struct bm_block, hook); + bm->cur.block = bb; + bm->cur.bit = 0; + } while (&bb->hook != &bm->blocks); + memory_bm_position_reset(bm); return BM_END_OF_MAP; Return_pfn: - bm->cur.chunk = chunk; - bm->cur.bit = bit; - return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; + bm->cur.bit = bit + 1; + return bb->start_pfn + bit; } /** @@ -650,13 +638,14 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, BUG_ON(!region); } else /* This allocation cannot fail */ - region = alloc_bootmem_low(sizeof(struct nosave_region)); + region = memblock_virt_alloc(sizeof(struct nosave_region), 0); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); Report: - printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", - start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); + printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); } /* @@ -724,9 +713,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm) list_for_each_entry(region, &nosave_regions, list) { unsigned long pfn; - pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", - region->start_pfn << PAGE_SHIFT, - region->end_pfn << PAGE_SHIFT); + pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", + (unsigned long long) region->start_pfn << PAGE_SHIFT, + ((unsigned long long) region->end_pfn << PAGE_SHIFT) + - 1); for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) if (pfn_valid(pfn)) { @@ -754,7 +744,10 @@ int create_basic_memory_bitmaps(void) struct memory_bitmap *bm1, *bm2; int error = 0; - BUG_ON(forbidden_pages_map || free_pages_map); + if (forbidden_pages_map && free_pages_map) + return 0; + else + BUG_ON(forbidden_pages_map || free_pages_map); bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); if (!bm1) @@ -800,7 +793,8 @@ void free_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; - BUG_ON(!(forbidden_pages_map && free_pages_map)); + if (WARN_ON(!(forbidden_pages_map && free_pages_map))) + return; bm1 = forbidden_pages_map; bm2 = free_pages_map; @@ -825,7 +819,8 @@ unsigned int snapshot_additional_pages(struct zone *zone) unsigned int res; res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); + res += DIV_ROUND_UP(res * sizeof(struct bm_block), + LINKED_PAGE_DATA_SIZE); return 2 * res; } @@ -840,8 +835,8 @@ static unsigned int count_free_highmem_pages(void) struct zone *zone; unsigned int cnt = 0; - for_each_zone(zone) - if (populated_zone(zone) && is_highmem(zone)) + for_each_populated_zone(zone) + if (is_highmem(zone)) cnt += zone_page_state(zone, NR_FREE_PAGES); return cnt; @@ -854,8 +849,7 @@ static unsigned int count_free_highmem_pages(void) * We should save the page if it isn't Nosave or NosaveFree, or Reserved, * and it isn't a part of a free chunk of pages. */ - -static struct page *saveable_highmem_page(unsigned long pfn) +static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -863,6 +857,8 @@ static struct page *saveable_highmem_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(!PageHighMem(page)); @@ -870,6 +866,9 @@ static struct page *saveable_highmem_page(unsigned long pfn) PageReserved(page)) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } @@ -878,27 +877,30 @@ static struct page *saveable_highmem_page(unsigned long pfn) * pages. */ -unsigned int count_highmem_pages(void) +static unsigned int count_highmem_pages(void) { struct zone *zone; unsigned int n = 0; - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long pfn, max_zone_pfn; if (!is_highmem(zone)) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (saveable_highmem_page(pfn)) + if (saveable_highmem_page(zone, pfn)) n++; } return n; } #else -static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } +static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} #endif /* CONFIG_HIGHMEM */ /** @@ -909,8 +911,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } * of pages statically defined as 'unsaveable', and it isn't a part of * a free chunk of pages. */ - -static struct page *saveable_page(unsigned long pfn) +static struct page *saveable_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -918,6 +919,8 @@ static struct page *saveable_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(PageHighMem(page)); @@ -928,6 +931,9 @@ static struct page *saveable_page(unsigned long pfn) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } @@ -936,20 +942,20 @@ static struct page *saveable_page(unsigned long pfn) * pages. */ -unsigned int count_data_pages(void) +static unsigned int count_data_pages(void) { struct zone *zone; unsigned long pfn, max_zone_pfn; unsigned int n = 0; - for_each_zone(zone) { + for_each_populated_zone(zone) { if (is_highmem(zone)) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if(saveable_page(pfn)) + if (saveable_page(zone, pfn)) n++; } return n; @@ -990,7 +996,7 @@ static inline struct page * page_is_saveable(struct zone *zone, unsigned long pfn) { return is_highmem(zone) ? - saveable_highmem_page(pfn) : saveable_page(pfn); + saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); } static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) @@ -1001,27 +1007,27 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) s_page = pfn_to_page(src_pfn); d_page = pfn_to_page(dst_pfn); if (PageHighMem(s_page)) { - src = kmap_atomic(s_page, KM_USER0); - dst = kmap_atomic(d_page, KM_USER1); + src = kmap_atomic(s_page); + dst = kmap_atomic(d_page); do_copy_page(dst, src); - kunmap_atomic(src, KM_USER0); - kunmap_atomic(dst, KM_USER1); + kunmap_atomic(dst); + kunmap_atomic(src); } else { if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); - dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); - kunmap_atomic(dst, KM_USER0); + dst = kmap_atomic(d_page); + copy_page(dst, buffer); + kunmap_atomic(dst); } else { safe_copy_page(page_address(d_page), s_page); } } } #else -#define page_is_saveable(zone, pfn) saveable_page(pfn) +#define page_is_saveable(zone, pfn) saveable_page(zone, pfn) static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { @@ -1036,11 +1042,11 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) struct zone *zone; unsigned long pfn; - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long max_zone_pfn; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (page_is_saveable(zone, pfn)) memory_bm_set_bit(orig_bm, pfn); @@ -1059,6 +1065,25 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) static unsigned int nr_copy_pages; /* Number of pages needed for saving the original pfns of the image pages */ static unsigned int nr_meta_pages; +/* + * Numbers of normal and highmem page frames allocated for hibernation image + * before suspending devices. + */ +unsigned int alloc_normal, alloc_highmem; +/* + * Memory bitmap used for marking saveable pages (during hibernation) or + * hibernation image pages (during restore) + */ +static struct memory_bitmap orig_bm; +/* + * Memory bitmap used during hibernation for marking allocated page frames that + * will contain copies of saveable pages. During restore it is initially used + * for marking hibernation image pages, but then the set bits from it are + * duplicated in @orig_bm and it is released. On highmem systems it is next + * used for marking "safe" highmem pages, but it has to be reinitialized for + * this purpose. + */ +static struct memory_bitmap copy_bm; /** * swsusp_free - free pages allocated for the suspend. @@ -1072,8 +1097,8 @@ void swsusp_free(void) struct zone *zone; unsigned long pfn, max_zone_pfn; - for_each_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for_each_populated_zone(zone) { + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); @@ -1090,6 +1115,347 @@ void swsusp_free(void) nr_meta_pages = 0; restore_pblist = NULL; buffer = NULL; + alloc_normal = 0; + alloc_highmem = 0; +} + +/* Helper functions used for the shrinking of memory. */ + +#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) + +/** + * preallocate_image_pages - Allocate a number of pages for hibernation image + * @nr_pages: Number of page frames to allocate. + * @mask: GFP flags to use for the allocation. + * + * Return value: Number of page frames actually allocated + */ +static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) +{ + unsigned long nr_alloc = 0; + + while (nr_pages > 0) { + struct page *page; + + page = alloc_image_page(mask); + if (!page) + break; + memory_bm_set_bit(©_bm, page_to_pfn(page)); + if (PageHighMem(page)) + alloc_highmem++; + else + alloc_normal++; + nr_pages--; + nr_alloc++; + } + + return nr_alloc; +} + +static unsigned long preallocate_image_memory(unsigned long nr_pages, + unsigned long avail_normal) +{ + unsigned long alloc; + + if (avail_normal <= alloc_normal) + return 0; + + alloc = avail_normal - alloc_normal; + if (nr_pages < alloc) + alloc = nr_pages; + + return preallocate_image_pages(alloc, GFP_IMAGE); +} + +#ifdef CONFIG_HIGHMEM +static unsigned long preallocate_image_highmem(unsigned long nr_pages) +{ + return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM); +} + +/** + * __fraction - Compute (an approximation of) x * (multiplier / base) + */ +static unsigned long __fraction(u64 x, u64 multiplier, u64 base) +{ + x *= multiplier; + do_div(x, base); + return (unsigned long)x; +} + +static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + unsigned long alloc = __fraction(nr_pages, highmem, total); + + return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM); +} +#else /* CONFIG_HIGHMEM */ +static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) +{ + return 0; +} + +static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + return 0; +} +#endif /* CONFIG_HIGHMEM */ + +/** + * free_unnecessary_pages - Release preallocated pages not needed for the image + */ +static void free_unnecessary_pages(void) +{ + unsigned long save, to_free_normal, to_free_highmem; + + save = count_data_pages(); + if (alloc_normal >= save) { + to_free_normal = alloc_normal - save; + save = 0; + } else { + to_free_normal = 0; + save -= alloc_normal; + } + save += count_highmem_pages(); + if (alloc_highmem >= save) { + to_free_highmem = alloc_highmem - save; + } else { + to_free_highmem = 0; + save -= alloc_highmem; + if (to_free_normal > save) + to_free_normal -= save; + else + to_free_normal = 0; + } + + memory_bm_position_reset(©_bm); + + while (to_free_normal > 0 || to_free_highmem > 0) { + unsigned long pfn = memory_bm_next_pfn(©_bm); + struct page *page = pfn_to_page(pfn); + + if (PageHighMem(page)) { + if (!to_free_highmem) + continue; + to_free_highmem--; + alloc_highmem--; + } else { + if (!to_free_normal) + continue; + to_free_normal--; + alloc_normal--; + } + memory_bm_clear_bit(©_bm, pfn); + swsusp_unset_page_forbidden(page); + swsusp_unset_page_free(page); + __free_page(page); + } +} + +/** + * minimum_image_size - Estimate the minimum acceptable size of an image + * @saveable: Number of saveable pages in the system. + * + * We want to avoid attempting to free too much memory too hard, so estimate the + * minimum acceptable size of a hibernation image to use as the lower limit for + * preallocating memory. + * + * We assume that the minimum image size should be proportional to + * + * [number of saveable pages] - [number of pages that can be freed in theory] + * + * where the second term is the sum of (1) reclaimable slab pages, (2) active + * and (3) inactive anonymous pages, (4) active and (5) inactive file pages, + * minus mapped file pages. + */ +static unsigned long minimum_image_size(unsigned long saveable) +{ + unsigned long size; + + size = global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_INACTIVE_ANON) + + global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_FILE) + - global_page_state(NR_FILE_MAPPED); + + return saveable <= size ? 0 : saveable - size; +} + +/** + * hibernate_preallocate_memory - Preallocate memory for hibernation image + * + * To create a hibernation image it is necessary to make a copy of every page + * frame in use. We also need a number of page frames to be free during + * hibernation for allocations made while saving the image and for device + * drivers, in case they need to allocate memory from their hibernation + * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough + * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through + * /sys/power/reserved_size, respectively). To make this happen, we compute the + * total number of available page frames and allocate at least + * + * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) + * + * of them, which corresponds to the maximum size of a hibernation image. + * + * If image_size is set below the number following from the above formula, + * the preallocation of memory is continued until the total number of saveable + * pages in the system is below the requested image size or the minimum + * acceptable image size returned by minimum_image_size(), whichever is greater. + */ +int hibernate_preallocate_memory(void) +{ + struct zone *zone; + unsigned long saveable, size, max_size, count, highmem, pages = 0; + unsigned long alloc, save_highmem, pages_highmem, avail_normal; + struct timeval start, stop; + int error; + + printk(KERN_INFO "PM: Preallocating image memory... "); + do_gettimeofday(&start); + + error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); + if (error) + goto err_out; + + error = memory_bm_create(©_bm, GFP_IMAGE, PG_ANY); + if (error) + goto err_out; + + alloc_normal = 0; + alloc_highmem = 0; + + /* Count the number of saveable data pages. */ + save_highmem = count_highmem_pages(); + saveable = count_data_pages(); + + /* + * Compute the total number of page frames we can use (count) and the + * number of pages needed for image metadata (size). + */ + count = saveable; + saveable += save_highmem; + highmem = save_highmem; + size = 0; + for_each_populated_zone(zone) { + size += snapshot_additional_pages(zone); + if (is_highmem(zone)) + highmem += zone_page_state(zone, NR_FREE_PAGES); + else + count += zone_page_state(zone, NR_FREE_PAGES); + } + avail_normal = count; + count += highmem; + count -= totalreserve_pages; + + /* Add number of pages required for page keys (s390 only). */ + size += page_key_additional_pages(saveable); + + /* Compute the maximum number of saveable pages to leave in memory. */ + max_size = (count - (size + PAGES_FOR_IO)) / 2 + - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); + /* Compute the desired number of image pages specified by image_size. */ + size = DIV_ROUND_UP(image_size, PAGE_SIZE); + if (size > max_size) + size = max_size; + /* + * If the desired number of image pages is at least as large as the + * current number of saveable pages in memory, allocate page frames for + * the image and we're done. + */ + if (size >= saveable) { + pages = preallocate_image_highmem(save_highmem); + pages += preallocate_image_memory(saveable - pages, avail_normal); + goto out; + } + + /* Estimate the minimum size of the image. */ + pages = minimum_image_size(saveable); + /* + * To avoid excessive pressure on the normal zone, leave room in it to + * accommodate an image of the minimum size (unless it's already too + * small, in which case don't preallocate pages from it at all). + */ + if (avail_normal > pages) + avail_normal -= pages; + else + avail_normal = 0; + if (size < pages) + size = min_t(unsigned long, pages, max_size); + + /* + * Let the memory management subsystem know that we're going to need a + * large number of page frames to allocate and make it free some memory. + * NOTE: If this is not done, performance will be hurt badly in some + * test cases. + */ + shrink_all_memory(saveable - size); + + /* + * The number of saveable pages in memory was too high, so apply some + * pressure to decrease it. First, make room for the largest possible + * image and fail if that doesn't work. Next, try to decrease the size + * of the image as much as indicated by 'size' using allocations from + * highmem and non-highmem zones separately. + */ + pages_highmem = preallocate_image_highmem(highmem / 2); + alloc = count - max_size; + if (alloc > pages_highmem) + alloc -= pages_highmem; + else + alloc = 0; + pages = preallocate_image_memory(alloc, avail_normal); + if (pages < alloc) { + /* We have exhausted non-highmem pages, try highmem. */ + alloc -= pages; + pages += pages_highmem; + pages_highmem = preallocate_image_highmem(alloc); + if (pages_highmem < alloc) + goto err_out; + pages += pages_highmem; + /* + * size is the desired number of saveable pages to leave in + * memory, so try to preallocate (all memory - size) pages. + */ + alloc = (count - pages) - size; + pages += preallocate_image_highmem(alloc); + } else { + /* + * There are approximately max_size saveable pages at this point + * and we want to reduce this number down to size. + */ + alloc = max_size - size; + size = preallocate_highmem_fraction(alloc, highmem, count); + pages_highmem += size; + alloc -= size; + size = preallocate_image_memory(alloc, avail_normal); + pages_highmem += preallocate_image_highmem(alloc - size); + pages += pages_highmem + size; + } + + /* + * We only need as many page frames for the image as there are saveable + * pages in memory, but we have allocated more. Release the excessive + * ones now. + */ + free_unnecessary_pages(); + + out: + do_gettimeofday(&stop); + printk(KERN_CONT "done (allocated %lu pages)\n", pages); + swsusp_show_speed(&start, &stop, pages, "Allocated"); + + return 0; + + err_out: + printk(KERN_CONT "\n"); + swsusp_free(); + return -ENOMEM; } #ifdef CONFIG_HIGHMEM @@ -1100,7 +1466,7 @@ void swsusp_free(void) static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { - unsigned int free_highmem = count_free_highmem_pages(); + unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; if (free_highmem >= nr_highmem) nr_highmem = 0; @@ -1122,19 +1488,17 @@ count_pages_for_highmem(unsigned int nr_highmem) { return 0; } static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) { struct zone *zone; - unsigned int free = 0, meta = 0; + unsigned int free = alloc_normal; - for_each_zone(zone) { - meta += snapshot_additional_pages(zone); + for_each_populated_zone(zone) if (!is_highmem(zone)) free += zone_page_state(zone, NR_FREE_PAGES); - } nr_pages += count_pages_for_highmem(nr_highmem); - pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n", - nr_pages, PAGES_FOR_IO, meta, free); + pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n", + nr_pages, PAGES_FOR_IO, free); - return free > nr_pages + PAGES_FOR_IO + meta; + return free > nr_pages + PAGES_FOR_IO; } #ifdef CONFIG_HIGHMEM @@ -1156,7 +1520,7 @@ static inline int get_highmem_buffer(int safe_needed) */ static inline unsigned int -alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) +alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) { unsigned int to_alloc = count_free_highmem_pages(); @@ -1176,7 +1540,7 @@ alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) static inline int get_highmem_buffer(int safe_needed) { return 0; } static inline unsigned int -alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } +alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } #endif /* CONFIG_HIGHMEM */ /** @@ -1195,56 +1559,38 @@ static int swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, unsigned int nr_pages, unsigned int nr_highmem) { - int error; - - error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); - if (error) - goto Free; - - error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); - if (error) - goto Free; - if (nr_highmem > 0) { - error = get_highmem_buffer(PG_ANY); - if (error) - goto Free; - - nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); + if (get_highmem_buffer(PG_ANY)) + goto err_out; + if (nr_highmem > alloc_highmem) { + nr_highmem -= alloc_highmem; + nr_pages += alloc_highmem_pages(copy_bm, nr_highmem); + } } - while (nr_pages-- > 0) { - struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); - - if (!page) - goto Free; + if (nr_pages > alloc_normal) { + nr_pages -= alloc_normal; + while (nr_pages-- > 0) { + struct page *page; - memory_bm_set_bit(copy_bm, page_to_pfn(page)); + page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + if (!page) + goto err_out; + memory_bm_set_bit(copy_bm, page_to_pfn(page)); + } } + return 0; - Free: + err_out: swsusp_free(); return -ENOMEM; } -/* Memory bitmap used for marking saveable pages (during suspend) or the - * suspend image pages (during resume) - */ -static struct memory_bitmap orig_bm; -/* Memory bitmap used on suspend for marking allocated pages that will contain - * the copies of saveable pages. During resume it is initially used for - * marking the suspend image pages, but then its set bits are duplicated in - * @orig_bm and it is released. Next, on systems with high memory, it may be - * used for marking "safe" highmem pages, but it has to be reinitialized for - * this purpose. - */ -static struct memory_bitmap copy_bm; - -asmlinkage int swsusp_save(void) +asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - printk(KERN_INFO "PM: Creating hibernation image: \n"); + printk(KERN_INFO "PM: Creating hibernation image:\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); @@ -1315,7 +1661,7 @@ unsigned long snapshot_get_image_size(void) static int init_header(struct swsusp_info *info) { memset(info, 0, sizeof(struct swsusp_info)); - info->num_physpages = num_physpages; + info->num_physpages = get_num_physpages(); info->image_pages = nr_copy_pages; info->pages = snapshot_get_image_size(); info->size = info->pages; @@ -1337,6 +1683,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; + /* Save page key for data page (s390 only). */ + page_key_read(buf + j); } } @@ -1347,14 +1695,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) * snapshot_handle structure. The structure gets updated and a pointer * to it should be passed to this function every next time. * - * The @count parameter should contain the number of bytes the caller - * wants to read from the snapshot. It must not be zero. - * * On success the function returns a positive number. Then, the caller * is allowed to read up to the returned number of bytes from the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the read would - * cross a page boundary otherwise. + * location computed by the data_of() macro. * * The function returns 0 to indicate the end of data stream condition, * and a negative number is returned on error. In such cases the @@ -1362,7 +1705,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) * any more. */ -int snapshot_read_next(struct snapshot_handle *handle, size_t count) +int snapshot_read_next(struct snapshot_handle *handle) { if (handle->cur > nr_meta_pages + nr_copy_pages) return 0; @@ -1373,7 +1716,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) if (!buffer) return -ENOMEM; } - if (!handle->offset) { + if (!handle->cur) { int error; error = init_header((struct swsusp_info *)buffer); @@ -1382,42 +1725,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) handle->buffer = buffer; memory_bm_position_reset(&orig_bm); memory_bm_position_reset(©_bm); - } - if (handle->prev < handle->cur) { - if (handle->cur <= nr_meta_pages) { - memset(buffer, 0, PAGE_SIZE); - pack_pfns(buffer, &orig_bm); - } else { - struct page *page; + } else if (handle->cur <= nr_meta_pages) { + clear_page(buffer); + pack_pfns(buffer, &orig_bm); + } else { + struct page *page; - page = pfn_to_page(memory_bm_next_pfn(©_bm)); - if (PageHighMem(page)) { - /* Highmem pages are copied to the buffer, - * because we can't return with a kmapped - * highmem page (we may not be called again). - */ - void *kaddr; + page = pfn_to_page(memory_bm_next_pfn(©_bm)); + if (PageHighMem(page)) { + /* Highmem pages are copied to the buffer, + * because we can't return with a kmapped + * highmem page (we may not be called again). + */ + void *kaddr; - kaddr = kmap_atomic(page, KM_USER0); - memcpy(buffer, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - handle->buffer = buffer; - } else { - handle->buffer = page_address(page); - } + kaddr = kmap_atomic(page); + copy_page(buffer, kaddr); + kunmap_atomic(kaddr); + handle->buffer = buffer; + } else { + handle->buffer = page_address(page); } - handle->prev = handle->cur; } - handle->buf_offset = handle->cur_offset; - if (handle->cur_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->cur_offset; - handle->cur_offset = 0; - handle->cur++; - } else { - handle->cur_offset += count; - } - handle->offset += count; - return count; + handle->cur++; + return PAGE_SIZE; } /** @@ -1432,8 +1763,8 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) unsigned long pfn, max_zone_pfn; /* Clear page flags */ - for_each_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for_each_populated_zone(zone) { + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) swsusp_unset_page_free(pfn_to_page(pfn)); @@ -1474,7 +1805,7 @@ static int check_header(struct swsusp_info *info) char *reason; reason = check_image_kernel(info); - if (!reason && info->num_physpages != num_physpages) + if (!reason && info->num_physpages != get_num_physpages()) reason = "memory size"; if (reason) { printk(KERN_ERR "PM: Image mismatch: %s\n", reason); @@ -1505,9 +1836,7 @@ load_header(struct swsusp_info *info) * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set * the corresponding bit in the memory bitmap @bm */ - -static inline void -unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) +static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) { int j; @@ -1515,8 +1844,16 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; - memory_bm_set_bit(bm, buf[j]); + /* Extract and buffer page key for data page (s390 only). */ + page_key_memorize(buf + j); + + if (memory_bm_pfn_present(bm, buf[j])) + memory_bm_set_bit(bm, buf[j]); + else + return -EFAULT; } + + return 0; } /* List of "safe" pages that may be used to store data loaded from the suspend @@ -1654,7 +1991,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } pbe->orig_page = page; if (safe_highmem_pages > 0) { @@ -1688,9 +2025,9 @@ static void copy_last_highmem_page(void) if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); - kunmap_atomic(dst, KM_USER0); + dst = kmap_atomic(last_highmem_page); + copy_page(dst, buffer); + kunmap_atomic(dst); last_highmem_page = NULL; } } @@ -1723,7 +2060,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) static inline void * get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) { - return NULL; + return ERR_PTR(-EINVAL); } static inline void copy_last_highmem_page(void) {} @@ -1834,8 +2171,13 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; - struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); + struct page *page; + unsigned long pfn = memory_bm_next_pfn(bm); + if (pfn == BM_END_OF_MAP) + return ERR_PTR(-EFAULT); + + page = pfn_to_page(pfn); if (PageHighMem(page)) return get_highmem_page_buffer(page, ca); @@ -1851,7 +2193,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } pbe->orig_address = page_address(page); pbe->address = safe_pages_list; @@ -1868,14 +2210,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) * snapshot_handle structure. The structure gets updated and a pointer * to it should be passed to this function every next time. * - * The @count parameter should contain the number of bytes the caller - * wants to write to the image. It must not be zero. - * * On success the function returns a positive number. Then, the caller * is allowed to write up to the returned number of bytes to the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the write would - * cross a page boundary otherwise. + * location computed by the data_of() macro. * * The function returns 0 to indicate the "end of file" condition, * and a negative number is returned on error. In such cases the @@ -1883,16 +2220,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) * any more. */ -int snapshot_write_next(struct snapshot_handle *handle, size_t count) +int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; int error = 0; /* Check if we have already loaded the entire image */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) return 0; - if (handle->offset == 0) { + handle->sync_read = 1; + + if (!handle->cur) { if (!buffer) /* This makes the buffer be freed by swsusp_free() */ buffer = get_image_page(GFP_ATOMIC, PG_ANY); @@ -1901,51 +2240,50 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return -ENOMEM; handle->buffer = buffer; - } - handle->sync_read = 1; - if (handle->prev < handle->cur) { - if (handle->prev == 0) { - error = load_header(buffer); - if (error) - return error; + } else if (handle->cur == 1) { + error = load_header(buffer); + if (error) + return error; + + error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); + if (error) + return error; - error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); + /* Allocate buffer for page keys. */ + error = page_key_alloc(nr_copy_pages); + if (error) + return error; + + } else if (handle->cur <= nr_meta_pages + 1) { + error = unpack_orig_pfns(buffer, ©_bm); + if (error) + return error; + + if (handle->cur == nr_meta_pages + 1) { + error = prepare_image(&orig_bm, ©_bm); if (error) return error; - } else if (handle->prev <= nr_meta_pages) { - unpack_orig_pfns(buffer, ©_bm); - if (handle->prev == nr_meta_pages) { - error = prepare_image(&orig_bm, ©_bm); - if (error) - return error; - - chain_init(&ca, GFP_ATOMIC, PG_SAFE); - memory_bm_position_reset(&orig_bm); - restore_pblist = NULL; - handle->buffer = get_buffer(&orig_bm, &ca); - handle->sync_read = 0; - if (!handle->buffer) - return -ENOMEM; - } - } else { - copy_last_highmem_page(); + chain_init(&ca, GFP_ATOMIC, PG_SAFE); + memory_bm_position_reset(&orig_bm); + restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); - if (handle->buffer != buffer) - handle->sync_read = 0; + handle->sync_read = 0; + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); } - handle->prev = handle->cur; - } - handle->buf_offset = handle->cur_offset; - if (handle->cur_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->cur_offset; - handle->cur_offset = 0; - handle->cur++; } else { - handle->cur_offset += count; + copy_last_highmem_page(); + /* Restore page key for data page (s390 only). */ + page_key_write(handle->buffer); + handle->buffer = get_buffer(&orig_bm, &ca); + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); + if (handle->buffer != buffer) + handle->sync_read = 0; } - handle->offset += count; - return count; + handle->cur++; + return PAGE_SIZE; } /** @@ -1959,8 +2297,11 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) void snapshot_write_finalize(struct snapshot_handle *handle) { copy_last_highmem_page(); + /* Restore page key for data page (s390 only). */ + page_key_write(handle->buffer); + page_key_free(); /* Free only if we have loaded the image entirely */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); free_highmem_data(); } @@ -1979,13 +2320,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1, KM_USER0); - kaddr2 = kmap_atomic(p2, KM_USER1); - memcpy(buf, kaddr1, PAGE_SIZE); - memcpy(kaddr1, kaddr2, PAGE_SIZE); - memcpy(kaddr2, buf, PAGE_SIZE); - kunmap_atomic(kaddr1, KM_USER0); - kunmap_atomic(kaddr2, KM_USER1); + kaddr1 = kmap_atomic(p1); + kaddr2 = kmap_atomic(p2); + copy_page(buf, kaddr1); + copy_page(kaddr1, kaddr2); + copy_page(kaddr2, buf); + kunmap_atomic(kaddr2); + kunmap_atomic(kaddr1); } /** diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c new file mode 100644 index 00000000000..ed35a4790af --- /dev/null +++ b/kernel/power/suspend.c @@ -0,0 +1,443 @@ +/* + * kernel/power/suspend.c - Suspend to RAM and standby functionality. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/console.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/syscalls.h> +#include <linux/gfp.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/suspend.h> +#include <linux/syscore_ops.h> +#include <linux/ftrace.h> +#include <trace/events/power.h> +#include <linux/compiler.h> + +#include "power.h" + +struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { + [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, + [PM_SUSPEND_STANDBY] = { .label = "standby", }, + [PM_SUSPEND_MEM] = { .label = "mem", }, +}; + +static const struct platform_suspend_ops *suspend_ops; +static const struct platform_freeze_ops *freeze_ops; + +static bool need_suspend_ops(suspend_state_t state) +{ + return state > PM_SUSPEND_FREEZE; +} + +static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); +static bool suspend_freeze_wake; + +void freeze_set_ops(const struct platform_freeze_ops *ops) +{ + lock_system_sleep(); + freeze_ops = ops; + unlock_system_sleep(); +} + +static void freeze_begin(void) +{ + suspend_freeze_wake = false; +} + +static void freeze_enter(void) +{ + cpuidle_use_deepest_state(true); + cpuidle_resume(); + wait_event(suspend_freeze_wait_head, suspend_freeze_wake); + cpuidle_pause(); + cpuidle_use_deepest_state(false); +} + +void freeze_wake(void) +{ + suspend_freeze_wake = true; + wake_up(&suspend_freeze_wait_head); +} +EXPORT_SYMBOL_GPL(freeze_wake); + +static bool valid_state(suspend_state_t state) +{ + /* + * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level + * support and need to be valid to the low level + * implementation, no valid callback implies that none are valid. + */ + return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); +} + +/* + * If this is set, the "mem" label always corresponds to the deepest sleep state + * available, the "standby" label corresponds to the second deepest sleep state + * available (if any), and the "freeze" label corresponds to the remaining + * available sleep state (if there is one). + */ +static bool relative_states; + +static int __init sleep_states_setup(char *str) +{ + relative_states = !strncmp(str, "1", 1); + if (relative_states) { + pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; + pm_states[PM_SUSPEND_FREEZE].state = 0; + } + return 1; +} + +__setup("relative_sleep_states=", sleep_states_setup); + +/** + * suspend_set_ops - Set the global suspend method table. + * @ops: Suspend operations to use. + */ +void suspend_set_ops(const struct platform_suspend_ops *ops) +{ + suspend_state_t i; + int j = PM_SUSPEND_MAX - 1; + + lock_system_sleep(); + + suspend_ops = ops; + for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) + if (valid_state(i)) + pm_states[j--].state = i; + else if (!relative_states) + pm_states[j--].state = 0; + + pm_states[j--].state = PM_SUSPEND_FREEZE; + while (j >= PM_SUSPEND_MIN) + pm_states[j--].state = 0; + + unlock_system_sleep(); +} +EXPORT_SYMBOL_GPL(suspend_set_ops); + +/** + * suspend_valid_only_mem - Generic memory-only valid callback. + * + * Platform drivers that implement mem suspend only and only need to check for + * that in their .valid() callback can use this instead of rolling their own + * .valid() callback. + */ +int suspend_valid_only_mem(suspend_state_t state) +{ + return state == PM_SUSPEND_MEM; +} +EXPORT_SYMBOL_GPL(suspend_valid_only_mem); + +static int suspend_test(int level) +{ +#ifdef CONFIG_PM_DEBUG + if (pm_test_level == level) { + printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); + mdelay(5000); + return 1; + } +#endif /* !CONFIG_PM_DEBUG */ + return 0; +} + +/** + * suspend_prepare - Prepare for entering system sleep state. + * + * Common code run for every system sleep state that can be entered (except for + * hibernation). Run suspend notifiers, allocate the "suspend" console and + * freeze processes. + */ +static int suspend_prepare(suspend_state_t state) +{ + int error; + + if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) + return -EPERM; + + pm_prepare_console(); + + error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); + if (error) + goto Finish; + + trace_suspend_resume(TPS("freeze_processes"), 0, true); + error = suspend_freeze_processes(); + trace_suspend_resume(TPS("freeze_processes"), 0, false); + if (!error) + return 0; + + suspend_stats.failed_freeze++; + dpm_save_failed_step(SUSPEND_FREEZE); + Finish: + pm_notifier_call_chain(PM_POST_SUSPEND); + pm_restore_console(); + return error; +} + +/* default implementation */ +void __weak arch_suspend_disable_irqs(void) +{ + local_irq_disable(); +} + +/* default implementation */ +void __weak arch_suspend_enable_irqs(void) +{ + local_irq_enable(); +} + +/** + * suspend_enter - Make the system enter the given sleep state. + * @state: System sleep state to enter. + * @wakeup: Returns information that the sleep state should not be re-entered. + * + * This function should be called after devices have been suspended. + */ +static int suspend_enter(suspend_state_t state, bool *wakeup) +{ + int error; + + if (need_suspend_ops(state) && suspend_ops->prepare) { + error = suspend_ops->prepare(); + if (error) + goto Platform_finish; + } + + error = dpm_suspend_end(PMSG_SUSPEND); + if (error) { + printk(KERN_ERR "PM: Some devices failed to power down\n"); + goto Platform_finish; + } + + if (need_suspend_ops(state) && suspend_ops->prepare_late) { + error = suspend_ops->prepare_late(); + if (error) + goto Platform_wake; + } + + if (suspend_test(TEST_PLATFORM)) + goto Platform_wake; + + /* + * PM_SUSPEND_FREEZE equals + * frozen processes + suspended devices + idle processors. + * Thus we should invoke freeze_enter() soon after + * all the devices are suspended. + */ + if (state == PM_SUSPEND_FREEZE) { + trace_suspend_resume(TPS("machine_suspend"), state, true); + freeze_enter(); + trace_suspend_resume(TPS("machine_suspend"), state, false); + goto Platform_wake; + } + + ftrace_stop(); + error = disable_nonboot_cpus(); + if (error || suspend_test(TEST_CPUS)) + goto Enable_cpus; + + arch_suspend_disable_irqs(); + BUG_ON(!irqs_disabled()); + + error = syscore_suspend(); + if (!error) { + *wakeup = pm_wakeup_pending(); + if (!(suspend_test(TEST_CORE) || *wakeup)) { + trace_suspend_resume(TPS("machine_suspend"), + state, true); + error = suspend_ops->enter(state); + trace_suspend_resume(TPS("machine_suspend"), + state, false); + events_check_enabled = false; + } + syscore_resume(); + } + + arch_suspend_enable_irqs(); + BUG_ON(irqs_disabled()); + + Enable_cpus: + enable_nonboot_cpus(); + ftrace_start(); + + Platform_wake: + if (need_suspend_ops(state) && suspend_ops->wake) + suspend_ops->wake(); + + dpm_resume_start(PMSG_RESUME); + + Platform_finish: + if (need_suspend_ops(state) && suspend_ops->finish) + suspend_ops->finish(); + + return error; +} + +/** + * suspend_devices_and_enter - Suspend devices and enter system sleep state. + * @state: System sleep state to enter. + */ +int suspend_devices_and_enter(suspend_state_t state) +{ + int error; + bool wakeup = false; + + if (need_suspend_ops(state) && !suspend_ops) + return -ENOSYS; + + if (need_suspend_ops(state) && suspend_ops->begin) { + error = suspend_ops->begin(state); + if (error) + goto Close; + } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) { + error = freeze_ops->begin(); + if (error) + goto Close; + } + suspend_console(); + suspend_test_start(); + error = dpm_suspend_start(PMSG_SUSPEND); + if (error) { + pr_err("PM: Some devices failed to suspend, or early wake event detected\n"); + goto Recover_platform; + } + suspend_test_finish("suspend devices"); + if (suspend_test(TEST_DEVICES)) + goto Recover_platform; + + do { + error = suspend_enter(state, &wakeup); + } while (!error && !wakeup && need_suspend_ops(state) + && suspend_ops->suspend_again && suspend_ops->suspend_again()); + + Resume_devices: + suspend_test_start(); + dpm_resume_end(PMSG_RESUME); + suspend_test_finish("resume devices"); + resume_console(); + Close: + if (need_suspend_ops(state) && suspend_ops->end) + suspend_ops->end(); + else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) + freeze_ops->end(); + + return error; + + Recover_platform: + if (need_suspend_ops(state) && suspend_ops->recover) + suspend_ops->recover(); + goto Resume_devices; +} + +/** + * suspend_finish - Clean up before finishing the suspend sequence. + * + * Call platform code to clean up, restart processes, and free the console that + * we've allocated. This routine is not called for hibernation. + */ +static void suspend_finish(void) +{ + suspend_thaw_processes(); + pm_notifier_call_chain(PM_POST_SUSPEND); + pm_restore_console(); +} + +/** + * enter_state - Do common work needed to enter system sleep state. + * @state: System sleep state to enter. + * + * Make sure that no one else is trying to put the system into a sleep state. + * Fail if that's not the case. Otherwise, prepare for system suspend, make the + * system enter the given sleep state and clean up after wakeup. + */ +static int enter_state(suspend_state_t state) +{ + int error; + + trace_suspend_resume(TPS("suspend_enter"), state, true); + if (state == PM_SUSPEND_FREEZE) { +#ifdef CONFIG_PM_DEBUG + if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { + pr_warning("PM: Unsupported test mode for freeze state," + "please choose none/freezer/devices/platform.\n"); + return -EAGAIN; + } +#endif + } else if (!valid_state(state)) { + return -EINVAL; + } + if (!mutex_trylock(&pm_mutex)) + return -EBUSY; + + if (state == PM_SUSPEND_FREEZE) + freeze_begin(); + + trace_suspend_resume(TPS("sync_filesystems"), 0, true); + printk(KERN_INFO "PM: Syncing filesystems ... "); + sys_sync(); + printk("done.\n"); + trace_suspend_resume(TPS("sync_filesystems"), 0, false); + + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); + error = suspend_prepare(state); + if (error) + goto Unlock; + + if (suspend_test(TEST_FREEZER)) + goto Finish; + + trace_suspend_resume(TPS("suspend_enter"), state, false); + pr_debug("PM: Entering %s sleep\n", pm_states[state].label); + pm_restrict_gfp_mask(); + error = suspend_devices_and_enter(state); + pm_restore_gfp_mask(); + + Finish: + pr_debug("PM: Finishing wakeup.\n"); + suspend_finish(); + Unlock: + mutex_unlock(&pm_mutex); + return error; +} + +/** + * pm_suspend - Externally visible function for suspending the system. + * @state: System sleep state to enter. + * + * Check if the value of @state represents one of the supported states, + * execute enter_state() and update system suspend statistics. + */ +int pm_suspend(suspend_state_t state) +{ + int error; + + if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) + return -EINVAL; + + error = enter_state(state); + if (error) { + suspend_stats.fail++; + dpm_save_failed_errno(error); + } else { + suspend_stats.success++; + } + return error; +} +EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c new file mode 100644 index 00000000000..269b097e78e --- /dev/null +++ b/kernel/power/suspend_test.c @@ -0,0 +1,185 @@ +/* + * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. + * + * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz> + * + * This file is released under the GPLv2. + */ + +#include <linux/init.h> +#include <linux/rtc.h> + +#include "power.h" + +/* + * We test the system suspend code by setting an RTC wakealarm a short + * time in the future, then suspending. Suspending the devices won't + * normally take long ... some systems only need a few milliseconds. + * + * The time it takes is system-specific though, so when we test this + * during system bootup we allow a LOT of time. + */ +#define TEST_SUSPEND_SECONDS 10 + +static unsigned long suspend_test_start_time; + +void suspend_test_start(void) +{ + /* FIXME Use better timebase than "jiffies", ideally a clocksource. + * What we want is a hardware counter that will work correctly even + * during the irqs-are-off stages of the suspend/resume cycle... + */ + suspend_test_start_time = jiffies; +} + +void suspend_test_finish(const char *label) +{ + long nj = jiffies - suspend_test_start_time; + unsigned msec; + + msec = jiffies_to_msecs(abs(nj)); + pr_info("PM: %s took %d.%03d seconds\n", label, + msec / 1000, msec % 1000); + + /* Warning on suspend means the RTC alarm period needs to be + * larger -- the system was sooo slooowwww to suspend that the + * alarm (should have) fired before the system went to sleep! + * + * Warning on either suspend or resume also means the system + * has some performance issues. The stack dump of a WARN_ON + * is more likely to get the right attention than a printk... + */ + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), + "Component: %s, time: %u\n", label, msec); +} + +/* + * To test system suspend, we need a hands-off mechanism to resume the + * system. RTCs wake alarms are a common self-contained mechanism. + */ + +static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) +{ + static char err_readtime[] __initdata = + KERN_ERR "PM: can't read %s time, err %d\n"; + static char err_wakealarm [] __initdata = + KERN_ERR "PM: can't set %s wakealarm, err %d\n"; + static char err_suspend[] __initdata = + KERN_ERR "PM: suspend test failed, error %d\n"; + static char info_test[] __initdata = + KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; + + unsigned long now; + struct rtc_wkalrm alm; + int status; + + /* this may fail if the RTC hasn't been initialized */ + status = rtc_read_time(rtc, &alm.time); + if (status < 0) { + printk(err_readtime, dev_name(&rtc->dev), status); + return; + } + rtc_tm_to_time(&alm.time, &now); + + memset(&alm, 0, sizeof alm); + rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); + alm.enabled = true; + + status = rtc_set_alarm(rtc, &alm); + if (status < 0) { + printk(err_wakealarm, dev_name(&rtc->dev), status); + return; + } + + if (state == PM_SUSPEND_MEM) { + printk(info_test, pm_states[state].label); + status = pm_suspend(state); + if (status == -ENODEV) + state = PM_SUSPEND_STANDBY; + } + if (state == PM_SUSPEND_STANDBY) { + printk(info_test, pm_states[state].label); + status = pm_suspend(state); + } + if (status < 0) + printk(err_suspend, status); + + /* Some platforms can't detect that the alarm triggered the + * wakeup, or (accordingly) disable it after it afterwards. + * It's supposed to give oneshot behavior; cope. + */ + alm.enabled = false; + rtc_set_alarm(rtc, &alm); +} + +static int __init has_wakealarm(struct device *dev, const void *data) +{ + struct rtc_device *candidate = to_rtc_device(dev); + + if (!candidate->ops->set_alarm) + return 0; + if (!device_may_wakeup(candidate->dev.parent)) + return 0; + + return 1; +} + +/* + * Kernel options like "test_suspend=mem" force suspend/resume sanity tests + * at startup time. They're normally disabled, for faster boot and because + * we can't know which states really work on this particular system. + */ +static suspend_state_t test_state __initdata = PM_SUSPEND_ON; + +static char warn_bad_state[] __initdata = + KERN_WARNING "PM: can't test '%s' suspend state\n"; + +static int __init setup_test_suspend(char *value) +{ + suspend_state_t i; + + /* "=mem" ==> "mem" */ + value++; + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (!strcmp(pm_states[i].label, value)) { + test_state = pm_states[i].state; + return 0; + } + + printk(warn_bad_state, value); + return 0; +} +__setup("test_suspend", setup_test_suspend); + +static int __init test_suspend(void) +{ + static char warn_no_rtc[] __initdata = + KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; + + struct rtc_device *rtc = NULL; + struct device *dev; + + /* PM is initialized by now; is that state testable? */ + if (test_state == PM_SUSPEND_ON) + goto done; + if (!pm_states[test_state].state) { + printk(warn_bad_state, pm_states[test_state].label); + goto done; + } + + /* RTCs have initialized by now too ... can we use one? */ + dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); + if (dev) + rtc = rtc_class_open(dev_name(dev)); + if (!rtc) { + printk(warn_no_rtc); + goto done; + } + + /* go for it */ + test_wakealarm(rtc, test_state); + rtc_class_close(rtc); +done: + return 0; +} +late_initcall(test_suspend); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a0abf9a463f..aaa3261dea5 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -4,8 +4,9 @@ * This file provides functions for reading the suspend image from * and writing it to a swap partition. * - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> * * This file is released under the GPLv2. * @@ -13,143 +14,223 @@ #include <linux/module.h> #include <linux/file.h> -#include <linux/utsname.h> -#include <linux/version.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/genhd.h> #include <linux/device.h> -#include <linux/buffer_head.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pm.h> +#include <linux/slab.h> +#include <linux/lzo.h> +#include <linux/vmalloc.h> +#include <linux/cpumask.h> +#include <linux/atomic.h> +#include <linux/kthread.h> +#include <linux/crc32.h> #include "power.h" -#define SWSUSP_SIG "S1SUSPEND" +#define HIBERNATE_SIG "S1SUSPEND" + +/* + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page + * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. + * These structures are stored on the swap and linked together with the + * help of the .next_swap member. + * + * The swap map is created during suspend. The swap map pages are + * allocated and populated one at a time, so we only need one memory + * page to set up the entire structure. + * + * During resume we pick up all swap_map_page structures into a list. + */ + +#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) + +/* + * Number of free pages that are not high. + */ +static inline unsigned long low_free_pages(void) +{ + return nr_free_pages() - nr_free_highpages(); +} + +/* + * Number of pages required to be kept free while writing the image. Always + * half of all available low pages before the writing starts. + */ +static inline unsigned long reqd_free_pages(void) +{ + return low_free_pages() / 2; +} + +struct swap_map_page { + sector_t entries[MAP_PAGE_ENTRIES]; + sector_t next_swap; +}; + +struct swap_map_page_list { + struct swap_map_page *map; + struct swap_map_page_list *next; +}; + +/** + * The swap_map_handle structure is used for handling swap in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + struct swap_map_page_list *maps; + sector_t cur_swap; + sector_t first_sector; + unsigned int k; + unsigned long reqd_free_pages; + u32 crc32; +}; struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; + char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - + sizeof(u32)]; + u32 crc32; sector_t image; unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; -} __attribute__((packed)); +} __packed; static struct swsusp_header *swsusp_header; -/* - * General things +/** + * The following functions are used for tracing the allocated + * swap pages, so that they can be freed in case of an error. */ -static unsigned short root_swap = 0xffff; -static struct block_device *resume_bdev; +struct swsusp_extent { + struct rb_node node; + unsigned long start; + unsigned long end; +}; -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. - */ -static int submit(int rw, pgoff_t page_off, struct page *page, - struct bio **bio_chain) -{ - struct bio *bio; +static struct rb_root swsusp_extents = RB_ROOT; - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - if (!bio) - return -ENOMEM; - bio->bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = resume_bdev; - bio->bi_end_io = end_swap_bio_read; - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %ld\n", - page_off); - bio_put(bio); - return -EFAULT; +static int swsusp_extents_insert(unsigned long swap_offset) +{ + struct rb_node **new = &(swsusp_extents.rb_node); + struct rb_node *parent = NULL; + struct swsusp_extent *ext; + + /* Figure out where to put the new node */ + while (*new) { + ext = rb_entry(*new, struct swsusp_extent, node); + parent = *new; + if (swap_offset < ext->start) { + /* Try to merge */ + if (swap_offset == ext->start - 1) { + ext->start--; + return 0; + } + new = &((*new)->rb_left); + } else if (swap_offset > ext->end) { + /* Try to merge */ + if (swap_offset == ext->end + 1) { + ext->end++; + return 0; + } + new = &((*new)->rb_right); + } else { + /* It already is in the tree */ + return -EINVAL; + } } + /* Add the new node and rebalance the tree. */ + ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); + if (!ext) + return -ENOMEM; - lock_page(page); - bio_get(bio); + ext->start = swap_offset; + ext->end = swap_offset; + rb_link_node(&ext->node, parent, new); + rb_insert_color(&ext->node, &swsusp_extents); + return 0; +} - if (bio_chain == NULL) { - submit_bio(rw | (1 << BIO_RW_SYNC), bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - if (rw == READ) - get_page(page); /* These pages are freed later */ - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(rw | (1 << BIO_RW_SYNC), bio); +/** + * alloc_swapdev_block - allocate a swap page and register that it has + * been allocated, so that it can be freed in case of an error. + */ + +sector_t alloc_swapdev_block(int swap) +{ + unsigned long offset; + + offset = swp_offset(get_swap_page_of_type(swap)); + if (offset) { + if (swsusp_extents_insert(offset)) + swap_free(swp_entry(swap, offset)); + else + return swapdev_block(swap, offset); } return 0; } -static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +/** + * free_all_swap_pages - free swap pages allocated for saving image data. + * It also frees the extents used to register which swap entries had been + * allocated. + */ + +void free_all_swap_pages(int swap) { - return submit(READ, page_off, virt_to_page(addr), bio_chain); + struct rb_node *node; + + while ((node = swsusp_extents.rb_node)) { + struct swsusp_extent *ext; + unsigned long offset; + + ext = container_of(node, struct swsusp_extent, node); + rb_erase(node, &swsusp_extents); + for (offset = ext->start; offset <= ext->end; offset++) + swap_free(swp_entry(swap, offset)); + + kfree(ext); + } } -static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +int swsusp_swap_in_use(void) { - return submit(WRITE, page_off, virt_to_page(addr), bio_chain); + return (swsusp_extents.rb_node != NULL); } -static int wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; +/* + * General things + */ - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; - return ret; -} +static unsigned short root_swap = 0xffff; +struct block_device *hib_resume_bdev; /* * Saving part */ -static int mark_swapfiles(sector_t start, unsigned int flags) +static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - bio_read_page(swsusp_resume_block, swsusp_header, NULL); + hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); - memcpy(swsusp_header->sig,SWSUSP_SIG, 10); - swsusp_header->image = start; + memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); + swsusp_header->image = handle->first_sector; swsusp_header->flags = flags; - error = bio_write_page(swsusp_resume_block, + if (flags & SF_CRC32_MODE) + swsusp_header->crc32 = handle->crc32; + error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); @@ -161,25 +242,26 @@ static int mark_swapfiles(sector_t start, unsigned int flags) /** * swsusp_swap_check - check if the resume device is a swap device * and get its index (if so) + * + * This is called before saving image */ - -static int swsusp_swap_check(void) /* This is called before saving image */ +static int swsusp_swap_check(void) { int res; res = swap_type_of(swsusp_resume_device, swsusp_resume_block, - &resume_bdev); + &hib_resume_bdev); if (res < 0) return res; root_swap = res; - res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR); + res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); if (res) return res; - res = set_blocksize(resume_bdev, PAGE_SIZE); + res = set_blocksize(hib_resume_bdev, PAGE_SIZE); if (res < 0) - blkdev_put(resume_bdev); + blkdev_put(hib_resume_bdev, FMODE_WRITE); return res; } @@ -194,58 +276,37 @@ static int swsusp_swap_check(void) /* This is called before saving image */ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) { void *src; + int ret; if (!offset) return -ENOSPC; if (bio_chain) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | + __GFP_NORETRY); if (src) { - memcpy(src, buf, PAGE_SIZE); + copy_page(src, buf); } else { - WARN_ON_ONCE(1); - bio_chain = NULL; /* Go synchronous */ - src = buf; + ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ + if (ret) + return ret; + src = (void *)__get_free_page(__GFP_WAIT | + __GFP_NOWARN | + __GFP_NORETRY); + if (src) { + copy_page(src, buf); + } else { + WARN_ON_ONCE(1); + bio_chain = NULL; /* Go synchronous */ + src = buf; + } } } else { src = buf; } - return bio_write_page(offset, src, bio_chain); + return hib_bio_write_page(offset, src, bio_chain); } -/* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_SIZE swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. - * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. - * - * During resume we also only need to use one swap_map_page structure - * at a time. - */ - -#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) - -struct swap_map_page { - sector_t entries[MAP_PAGE_ENTRIES]; - sector_t next_swap; -}; - -/** - * The swap_map_handle structure is used for handling swap in - * a file-alike way - */ - -struct swap_map_handle { - struct swap_map_page *cur; - sector_t cur_swap; - unsigned int k; -}; - static void release_swap_writer(struct swap_map_handle *handle) { if (handle->cur) @@ -255,16 +316,34 @@ static void release_swap_writer(struct swap_map_handle *handle) static int get_swap_writer(struct swap_map_handle *handle) { + int ret; + + ret = swsusp_swap_check(); + if (ret) { + if (ret != -ENOSPC) + printk(KERN_ERR "PM: Cannot find swap device, try " + "swapon -a.\n"); + return ret; + } handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); - if (!handle->cur) - return -ENOMEM; + if (!handle->cur) { + ret = -ENOMEM; + goto err_close; + } handle->cur_swap = alloc_swapdev_block(root_swap); if (!handle->cur_swap) { - release_swap_writer(handle); - return -ENOSPC; + ret = -ENOSPC; + goto err_rel; } handle->k = 0; + handle->reqd_free_pages = reqd_free_pages(); + handle->first_sector = handle->cur_swap; return 0; +err_rel: + release_swap_writer(handle); +err_close: + swsusp_close(FMODE_WRITE); + return ret; } static int swap_write_page(struct swap_map_handle *handle, void *buf, @@ -281,19 +360,27 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, return error; handle->cur->entries[handle->k++] = offset; if (handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); - if (error) - goto out; offset = alloc_swapdev_block(root_swap); if (!offset) return -ENOSPC; handle->cur->next_swap = offset; - error = write_page(handle->cur, handle->cur_swap, NULL); + error = write_page(handle->cur, handle->cur_swap, bio_chain); if (error) goto out; - memset(handle->cur, 0, PAGE_SIZE); + clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; + + if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { + error = hib_wait_on_bio_chain(bio_chain); + if (error) + goto out; + /* + * Recalculate the number of required free pages, to + * make sure we never take more than half. + */ + handle->reqd_free_pages = reqd_free_pages(); + } } out: return error; @@ -307,6 +394,44 @@ static int flush_swap_writer(struct swap_map_handle *handle) return -EINVAL; } +static int swap_writer_finish(struct swap_map_handle *handle, + unsigned int flags, int error) +{ + if (!error) { + flush_swap_writer(handle); + printk(KERN_INFO "PM: S"); + error = mark_swapfiles(handle, flags); + printk("|\n"); + } + + if (error) + free_all_swap_pages(root_swap); + release_swap_writer(handle); + swsusp_close(FMODE_WRITE); + + return error; +} + +/* We need to remember how much compressed data we need to read. */ +#define LZO_HEADER sizeof(size_t) + +/* Number of pages/bytes we'll compress at one time. */ +#define LZO_UNC_PAGES 32 +#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE) + +/* Number of pages/bytes we need for compressed data (worst case). */ +#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ + LZO_HEADER, PAGE_SIZE) +#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) + +/* Maximum number of threads for compression/decompression. */ +#define LZO_THREADS 3 + +/* Minimum/maximum number of pages for read buffering. */ +#define LZO_MIN_RD_PAGES 1024 +#define LZO_MAX_RD_PAGES 8192 + + /** * save_image - save the suspend image data */ @@ -317,41 +442,344 @@ static int save_image(struct swap_map_handle *handle, { unsigned int m; int ret; - int error = 0; int nr_pages; int err2; struct bio *bio; struct timeval start; struct timeval stop; - printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ", + printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", nr_to_write); - m = nr_to_write / 100; + m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; bio = NULL; do_gettimeofday(&start); - do { - ret = snapshot_read_next(snapshot, PAGE_SIZE); - if (ret > 0) { - error = swap_write_page(handle, data_of(*snapshot), - &bio); - if (error) + while (1) { + ret = snapshot_read_next(snapshot); + if (ret <= 0) + break; + ret = swap_write_page(handle, data_of(*snapshot), &bio); + if (ret) + break; + if (!(nr_pages % m)) + printk(KERN_INFO "PM: Image saving progress: %3d%%\n", + nr_pages / m * 10); + nr_pages++; + } + err2 = hib_wait_on_bio_chain(&bio); + do_gettimeofday(&stop); + if (!ret) + ret = err2; + if (!ret) + printk(KERN_INFO "PM: Image saving done.\n"); + swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + return ret; +} + +/** + * Structure used for CRC32. + */ +struct crc_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + unsigned run_threads; /* nr current threads */ + wait_queue_head_t go; /* start crc update */ + wait_queue_head_t done; /* crc update done */ + u32 *crc32; /* points to handle's crc32 */ + size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */ + unsigned char *unc[LZO_THREADS]; /* uncompressed data */ +}; + +/** + * CRC32 update function that runs in its own thread. + */ +static int crc32_threadfn(void *data) +{ + struct crc_data *d = data; + unsigned i; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + for (i = 0; i < d->run_threads; i++) + *d->crc32 = crc32_le(*d->crc32, + d->unc[i], *d->unc_len[i]); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} +/** + * Structure used for LZO data compression. + */ +struct cmp_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + int ret; /* return code */ + wait_queue_head_t go; /* start compression */ + wait_queue_head_t done; /* compression done */ + size_t unc_len; /* uncompressed length */ + size_t cmp_len; /* compressed length */ + unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ + unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ +}; + +/** + * Compression function that runs in its own thread. + */ +static int lzo_compress_threadfn(void *data) +{ + struct cmp_data *d = data; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + d->ret = -1; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + d->ret = lzo1x_1_compress(d->unc, d->unc_len, + d->cmp + LZO_HEADER, &d->cmp_len, + d->wrk); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} + +/** + * save_image_lzo - Save the suspend image data compressed with LZO. + * @handle: Swap map handle to use for saving the image. + * @snapshot: Image to read data from. + * @nr_to_write: Number of pages to save. + */ +static int save_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_write) +{ + unsigned int m; + int ret = 0; + int nr_pages; + int err2; + struct bio *bio; + struct timeval start; + struct timeval stop; + size_t off; + unsigned thr, run_threads, nr_threads; + unsigned char *page = NULL; + struct cmp_data *data = NULL; + struct crc_data *crc = NULL; + + /* + * We'll limit the number of threads for compression to limit memory + * footprint. + */ + nr_threads = num_online_cpus() - 1; + nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + + page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + ret = -ENOMEM; + goto out_clean; + } + + data = vmalloc(sizeof(*data) * nr_threads); + if (!data) { + printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + ret = -ENOMEM; + goto out_clean; + } + for (thr = 0; thr < nr_threads; thr++) + memset(&data[thr], 0, offsetof(struct cmp_data, go)); + + crc = kmalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) { + printk(KERN_ERR "PM: Failed to allocate crc\n"); + ret = -ENOMEM; + goto out_clean; + } + memset(crc, 0, offsetof(struct crc_data, go)); + + /* + * Start the compression threads. + */ + for (thr = 0; thr < nr_threads; thr++) { + init_waitqueue_head(&data[thr].go); + init_waitqueue_head(&data[thr].done); + + data[thr].thr = kthread_run(lzo_compress_threadfn, + &data[thr], + "image_compress/%u", thr); + if (IS_ERR(data[thr].thr)) { + data[thr].thr = NULL; + printk(KERN_ERR + "PM: Cannot start compression threads\n"); + ret = -ENOMEM; + goto out_clean; + } + } + + /* + * Start the CRC32 thread. + */ + init_waitqueue_head(&crc->go); + init_waitqueue_head(&crc->done); + + handle->crc32 = 0; + crc->crc32 = &handle->crc32; + for (thr = 0; thr < nr_threads; thr++) { + crc->unc[thr] = data[thr].unc; + crc->unc_len[thr] = &data[thr].unc_len; + } + + crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); + if (IS_ERR(crc->thr)) { + crc->thr = NULL; + printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + ret = -ENOMEM; + goto out_clean; + } + + /* + * Adjust the number of required free pages after all allocations have + * been done. We don't want to run out of pages when writing. + */ + handle->reqd_free_pages = reqd_free_pages(); + + printk(KERN_INFO + "PM: Using %u thread(s) for compression.\n" + "PM: Compressing and saving image data (%u pages)...\n", + nr_threads, nr_to_write); + m = nr_to_write / 10; + if (!m) + m = 1; + nr_pages = 0; + bio = NULL; + do_gettimeofday(&start); + for (;;) { + for (thr = 0; thr < nr_threads; thr++) { + for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { + ret = snapshot_read_next(snapshot); + if (ret < 0) + goto out_finish; + + if (!ret) + break; + + memcpy(data[thr].unc + off, + data_of(*snapshot), PAGE_SIZE); + + if (!(nr_pages % m)) + printk(KERN_INFO + "PM: Image saving progress: " + "%3d%%\n", + nr_pages / m * 10); + nr_pages++; + } + if (!off) break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; + + data[thr].unc_len = off; + + atomic_set(&data[thr].ready, 1); + wake_up(&data[thr].go); + } + + if (!thr) + break; + + crc->run_threads = thr; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); + + for (run_threads = thr, thr = 0; thr < run_threads; thr++) { + wait_event(data[thr].done, + atomic_read(&data[thr].stop)); + atomic_set(&data[thr].stop, 0); + + ret = data[thr].ret; + + if (ret < 0) { + printk(KERN_ERR "PM: LZO compression failed\n"); + goto out_finish; + } + + if (unlikely(!data[thr].cmp_len || + data[thr].cmp_len > + lzo1x_worst_compress(data[thr].unc_len))) { + printk(KERN_ERR + "PM: Invalid LZO compressed length\n"); + ret = -1; + goto out_finish; + } + + *(size_t *)data[thr].cmp = data[thr].cmp_len; + + /* + * Given we are writing one page at a time to disk, we + * copy that much from the buffer, although the last + * bit will likely be smaller than full page. This is + * OK - we saved the length of the compressed data, so + * any garbage at the end will be discarded when we + * read it. + */ + for (off = 0; + off < LZO_HEADER + data[thr].cmp_len; + off += PAGE_SIZE) { + memcpy(page, data[thr].cmp + off, PAGE_SIZE); + + ret = swap_write_page(handle, page, &bio); + if (ret) + goto out_finish; + } } - } while (ret > 0); - err2 = wait_on_bio_chain(&bio); + + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); + } + +out_finish: + err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) - printk("\b\b\b\bdone\n"); + if (!ret) + ret = err2; + if (!ret) + printk(KERN_INFO "PM: Image saving done.\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); - return error; +out_clean: + if (crc) { + if (crc->thr) + kthread_stop(crc->thr); + kfree(crc); + } + if (data) { + for (thr = 0; thr < nr_threads; thr++) + if (data[thr].thr) + kthread_stop(data[thr].thr); + vfree(data); + } + if (page) free_page((unsigned long)page); + + return ret; } /** @@ -361,12 +789,15 @@ static int save_image(struct swap_map_handle *handle, * space avaiable from the resume partition. */ -static int enough_swap(unsigned int nr_pages) +static int enough_swap(unsigned int nr_pages, unsigned int flags) { unsigned int free_swap = count_swap_pages(root_swap, 1); + unsigned int required; pr_debug("PM: Free swap pages: %u\n", free_swap); - return free_swap > nr_pages + PAGES_FOR_IO; + + required = PAGES_FOR_IO + nr_pages; + return free_swap > required; } /** @@ -384,50 +815,39 @@ int swsusp_write(unsigned int flags) struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; + unsigned long pages; int error; - error = swsusp_swap_check(); + pages = snapshot_get_image_size(); + error = get_swap_writer(&handle); if (error) { - printk(KERN_ERR "PM: Cannot find swap device, try " - "swapon -a.\n"); + printk(KERN_ERR "PM: Cannot get swap writer\n"); return error; } + if (flags & SF_NOCOMPRESS_MODE) { + if (!enough_swap(pages, flags)) { + printk(KERN_ERR "PM: Not enough free swap\n"); + error = -ENOSPC; + goto out_finish; + } + } memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_read_next(&snapshot, PAGE_SIZE); + error = snapshot_read_next(&snapshot); if (error < PAGE_SIZE) { if (error >= 0) error = -EFAULT; - goto out; + goto out_finish; } header = (struct swsusp_info *)data_of(snapshot); - if (!enough_swap(header->pages)) { - printk(KERN_ERR "PM: Not enough free swap\n"); - error = -ENOSPC; - goto out; - } - error = get_swap_writer(&handle); + error = swap_write_page(&handle, header, NULL); if (!error) { - sector_t start = handle.cur_swap; - - error = swap_write_page(&handle, header, NULL); - if (!error) - error = save_image(&handle, &snapshot, - header->pages - 1); - - if (!error) { - flush_swap_writer(&handle); - printk(KERN_INFO "PM: S"); - error = mark_swapfiles(start, flags); - printk("|\n"); - } + error = (flags & SF_NOCOMPRESS_MODE) ? + save_image(&handle, &snapshot, pages - 1) : + save_image_lzo(&handle, &snapshot, pages - 1); } - if (error) - free_all_swap_pages(root_swap); - - release_swap_writer(&handle); - out: - swsusp_close(); +out_finish: + error = swap_writer_finish(&handle, flags, error); return error; } @@ -438,28 +858,62 @@ int swsusp_write(unsigned int flags) static void release_swap_reader(struct swap_map_handle *handle) { - if (handle->cur) - free_page((unsigned long)handle->cur); + struct swap_map_page_list *tmp; + + while (handle->maps) { + if (handle->maps->map) + free_page((unsigned long)handle->maps->map); + tmp = handle->maps; + handle->maps = handle->maps->next; + kfree(tmp); + } handle->cur = NULL; } -static int get_swap_reader(struct swap_map_handle *handle, sector_t start) +static int get_swap_reader(struct swap_map_handle *handle, + unsigned int *flags_p) { int error; + struct swap_map_page_list *tmp, *last; + sector_t offset; - if (!start) + *flags_p = swsusp_header->flags; + + if (!swsusp_header->image) /* how can this happen? */ return -EINVAL; - handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); - if (!handle->cur) - return -ENOMEM; + handle->cur = NULL; + last = handle->maps = NULL; + offset = swsusp_header->image; + while (offset) { + tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); + if (!tmp) { + release_swap_reader(handle); + return -ENOMEM; + } + memset(tmp, 0, sizeof(*tmp)); + if (!handle->maps) + handle->maps = tmp; + if (last) + last->next = tmp; + last = tmp; + + tmp->map = (struct swap_map_page *) + __get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!tmp->map) { + release_swap_reader(handle); + return -ENOMEM; + } - error = bio_read_page(start, handle->cur, NULL); - if (error) { - release_swap_reader(handle); - return error; + error = hib_bio_read_page(offset, tmp->map, NULL); + if (error) { + release_swap_reader(handle); + return error; + } + offset = tmp->map->next_swap; } handle->k = 0; + handle->cur = handle->maps->map; return 0; } @@ -468,27 +922,37 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, { sector_t offset; int error; + struct swap_map_page_list *tmp; if (!handle->cur) return -EINVAL; offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = bio_read_page(offset, buf, bio_chain); + error = hib_bio_read_page(offset, buf, bio_chain); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); handle->k = 0; - offset = handle->cur->next_swap; - if (!offset) + free_page((unsigned long)handle->maps->map); + tmp = handle->maps; + handle->maps = handle->maps->next; + kfree(tmp); + if (!handle->maps) release_swap_reader(handle); - else if (!error) - error = bio_read_page(offset, handle->cur, NULL); + else + handle->cur = handle->maps->map; } return error; } +static int swap_reader_finish(struct swap_map_handle *handle) +{ + release_swap_reader(handle); + + return 0; +} + /** * load_image - load the image using the swap map handle * @handle and the snapshot handle @snapshot @@ -500,54 +964,425 @@ static int load_image(struct swap_map_handle *handle, unsigned int nr_to_read) { unsigned int m; - int error = 0; + int ret = 0; struct timeval start; struct timeval stop; struct bio *bio; int err2; unsigned nr_pages; - printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ", + printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); - m = nr_to_read / 100; + m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; bio = NULL; do_gettimeofday(&start); for ( ; ; ) { - error = snapshot_write_next(snapshot, PAGE_SIZE); - if (error <= 0) + ret = snapshot_write_next(snapshot); + if (ret <= 0) break; - error = swap_read_page(handle, data_of(*snapshot), &bio); - if (error) + ret = swap_read_page(handle, data_of(*snapshot), &bio); + if (ret) break; if (snapshot->sync_read) - error = wait_on_bio_chain(&bio); - if (error) + ret = hib_wait_on_bio_chain(&bio); + if (ret) break; if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO "PM: Image loading progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } - err2 = wait_on_bio_chain(&bio); + err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) { - printk("\b\b\b\bdone\n"); + if (!ret) + ret = err2; + if (!ret) { + printk(KERN_INFO "PM: Image loading done.\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) - error = -ENODATA; + ret = -ENODATA; } swsusp_show_speed(&start, &stop, nr_to_read, "Read"); - return error; + return ret; +} + +/** + * Structure used for LZO data decompression. + */ +struct dec_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + int ret; /* return code */ + wait_queue_head_t go; /* start decompression */ + wait_queue_head_t done; /* decompression done */ + size_t unc_len; /* uncompressed length */ + size_t cmp_len; /* compressed length */ + unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ +}; + +/** + * Deompression function that runs in its own thread. + */ +static int lzo_decompress_threadfn(void *data) +{ + struct dec_data *d = data; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + d->ret = -1; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + d->unc_len = LZO_UNC_SIZE; + d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, + d->unc, &d->unc_len); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} + +/** + * load_image_lzo - Load compressed image data and decompress them with LZO. + * @handle: Swap map handle to use for loading data. + * @snapshot: Image to copy uncompressed data into. + * @nr_to_read: Number of pages to load. + */ +static int load_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_read) +{ + unsigned int m; + int ret = 0; + int eof = 0; + struct bio *bio; + struct timeval start; + struct timeval stop; + unsigned nr_pages; + size_t off; + unsigned i, thr, run_threads, nr_threads; + unsigned ring = 0, pg = 0, ring_size = 0, + have = 0, want, need, asked = 0; + unsigned long read_pages = 0; + unsigned char **page = NULL; + struct dec_data *data = NULL; + struct crc_data *crc = NULL; + + /* + * We'll limit the number of threads for decompression to limit memory + * footprint. + */ + nr_threads = num_online_cpus() - 1; + nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + + page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + ret = -ENOMEM; + goto out_clean; + } + + data = vmalloc(sizeof(*data) * nr_threads); + if (!data) { + printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + ret = -ENOMEM; + goto out_clean; + } + for (thr = 0; thr < nr_threads; thr++) + memset(&data[thr], 0, offsetof(struct dec_data, go)); + + crc = kmalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) { + printk(KERN_ERR "PM: Failed to allocate crc\n"); + ret = -ENOMEM; + goto out_clean; + } + memset(crc, 0, offsetof(struct crc_data, go)); + + /* + * Start the decompression threads. + */ + for (thr = 0; thr < nr_threads; thr++) { + init_waitqueue_head(&data[thr].go); + init_waitqueue_head(&data[thr].done); + + data[thr].thr = kthread_run(lzo_decompress_threadfn, + &data[thr], + "image_decompress/%u", thr); + if (IS_ERR(data[thr].thr)) { + data[thr].thr = NULL; + printk(KERN_ERR + "PM: Cannot start decompression threads\n"); + ret = -ENOMEM; + goto out_clean; + } + } + + /* + * Start the CRC32 thread. + */ + init_waitqueue_head(&crc->go); + init_waitqueue_head(&crc->done); + + handle->crc32 = 0; + crc->crc32 = &handle->crc32; + for (thr = 0; thr < nr_threads; thr++) { + crc->unc[thr] = data[thr].unc; + crc->unc_len[thr] = &data[thr].unc_len; + } + + crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); + if (IS_ERR(crc->thr)) { + crc->thr = NULL; + printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + ret = -ENOMEM; + goto out_clean; + } + + /* + * Set the number of pages for read buffering. + * This is complete guesswork, because we'll only know the real + * picture once prepare_image() is called, which is much later on + * during the image load phase. We'll assume the worst case and + * say that none of the image pages are from high memory. + */ + if (low_free_pages() > snapshot_get_image_size()) + read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; + read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); + + for (i = 0; i < read_pages; i++) { + page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? + __GFP_WAIT | __GFP_HIGH : + __GFP_WAIT | __GFP_NOWARN | + __GFP_NORETRY); + + if (!page[i]) { + if (i < LZO_CMP_PAGES) { + ring_size = i; + printk(KERN_ERR + "PM: Failed to allocate LZO pages\n"); + ret = -ENOMEM; + goto out_clean; + } else { + break; + } + } + } + want = ring_size = i; + + printk(KERN_INFO + "PM: Using %u thread(s) for decompression.\n" + "PM: Loading and decompressing image data (%u pages)...\n", + nr_threads, nr_to_read); + m = nr_to_read / 10; + if (!m) + m = 1; + nr_pages = 0; + bio = NULL; + do_gettimeofday(&start); + + ret = snapshot_write_next(snapshot); + if (ret <= 0) + goto out_finish; + + for(;;) { + for (i = 0; !eof && i < want; i++) { + ret = swap_read_page(handle, page[ring], &bio); + if (ret) { + /* + * On real read error, finish. On end of data, + * set EOF flag and just exit the read loop. + */ + if (handle->cur && + handle->cur->entries[handle->k]) { + goto out_finish; + } else { + eof = 1; + break; + } + } + if (++ring >= ring_size) + ring = 0; + } + asked += i; + want -= i; + + /* + * We are out of data, wait for some more. + */ + if (!have) { + if (!asked) + break; + + ret = hib_wait_on_bio_chain(&bio); + if (ret) + goto out_finish; + have += asked; + asked = 0; + if (eof) + eof = 2; + } + + if (crc->run_threads) { + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); + crc->run_threads = 0; + } + + for (thr = 0; have && thr < nr_threads; thr++) { + data[thr].cmp_len = *(size_t *)page[pg]; + if (unlikely(!data[thr].cmp_len || + data[thr].cmp_len > + lzo1x_worst_compress(LZO_UNC_SIZE))) { + printk(KERN_ERR + "PM: Invalid LZO compressed length\n"); + ret = -1; + goto out_finish; + } + + need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER, + PAGE_SIZE); + if (need > have) { + if (eof > 1) { + ret = -1; + goto out_finish; + } + break; + } + + for (off = 0; + off < LZO_HEADER + data[thr].cmp_len; + off += PAGE_SIZE) { + memcpy(data[thr].cmp + off, + page[pg], PAGE_SIZE); + have--; + want++; + if (++pg >= ring_size) + pg = 0; + } + + atomic_set(&data[thr].ready, 1); + wake_up(&data[thr].go); + } + + /* + * Wait for more data while we are decompressing. + */ + if (have < LZO_CMP_PAGES && asked) { + ret = hib_wait_on_bio_chain(&bio); + if (ret) + goto out_finish; + have += asked; + asked = 0; + if (eof) + eof = 2; + } + + for (run_threads = thr, thr = 0; thr < run_threads; thr++) { + wait_event(data[thr].done, + atomic_read(&data[thr].stop)); + atomic_set(&data[thr].stop, 0); + + ret = data[thr].ret; + + if (ret < 0) { + printk(KERN_ERR + "PM: LZO decompression failed\n"); + goto out_finish; + } + + if (unlikely(!data[thr].unc_len || + data[thr].unc_len > LZO_UNC_SIZE || + data[thr].unc_len & (PAGE_SIZE - 1))) { + printk(KERN_ERR + "PM: Invalid LZO uncompressed length\n"); + ret = -1; + goto out_finish; + } + + for (off = 0; + off < data[thr].unc_len; off += PAGE_SIZE) { + memcpy(data_of(*snapshot), + data[thr].unc + off, PAGE_SIZE); + + if (!(nr_pages % m)) + printk(KERN_INFO + "PM: Image loading progress: " + "%3d%%\n", + nr_pages / m * 10); + nr_pages++; + + ret = snapshot_write_next(snapshot); + if (ret <= 0) { + crc->run_threads = thr + 1; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); + goto out_finish; + } + } + } + + crc->run_threads = thr; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); + } + +out_finish: + if (crc->run_threads) { + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); + } + do_gettimeofday(&stop); + if (!ret) { + printk(KERN_INFO "PM: Image loading done.\n"); + snapshot_write_finalize(snapshot); + if (!snapshot_image_loaded(snapshot)) + ret = -ENODATA; + if (!ret) { + if (swsusp_header->flags & SF_CRC32_MODE) { + if(handle->crc32 != swsusp_header->crc32) { + printk(KERN_ERR + "PM: Invalid image CRC32!\n"); + ret = -ENODATA; + } + } + } + } + swsusp_show_speed(&start, &stop, nr_to_read, "Read"); +out_clean: + for (i = 0; i < ring_size; i++) + free_page((unsigned long)page[i]); + if (crc) { + if (crc->thr) + kthread_stop(crc->thr); + kfree(crc); + } + if (data) { + for (thr = 0; thr < nr_threads; thr++) + if (data[thr].thr) + kthread_stop(data[thr].thr); + vfree(data); + } + if (page) vfree(page); + + return ret; } /** * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should - * be written into this memeory location + * be written into this memory location */ int swsusp_read(unsigned int *flags_p) @@ -557,26 +1392,23 @@ int swsusp_read(unsigned int *flags_p) struct snapshot_handle snapshot; struct swsusp_info *header; - *flags_p = swsusp_header->flags; - if (IS_ERR(resume_bdev)) { - pr_debug("PM: Image device not initialised\n"); - return PTR_ERR(resume_bdev); - } - memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_write_next(&snapshot, PAGE_SIZE); + error = snapshot_write_next(&snapshot); if (error < PAGE_SIZE) return error < 0 ? error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); - error = get_swap_reader(&handle, swsusp_header->image); + error = get_swap_reader(&handle, flags_p); + if (error) + goto end; if (!error) error = swap_read_page(&handle, header, NULL); - if (!error) - error = load_image(&handle, &snapshot, header->pages - 1); - release_swap_reader(&handle); - - blkdev_put(resume_bdev); - + if (!error) { + error = (*flags_p & SF_NOCOMPRESS_MODE) ? + load_image(&handle, &snapshot, header->pages - 1) : + load_image_lzo(&handle, &snapshot, header->pages - 1); + } + swap_reader_finish(&handle); +end: if (!error) pr_debug("PM: Image successfully loaded\n"); else @@ -592,33 +1424,36 @@ int swsusp_check(void) { int error; - resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); - if (!IS_ERR(resume_bdev)) { - set_blocksize(resume_bdev, PAGE_SIZE); - memset(swsusp_header, 0, PAGE_SIZE); - error = bio_read_page(swsusp_resume_block, + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + FMODE_READ, NULL); + if (!IS_ERR(hib_resume_bdev)) { + set_blocksize(hib_resume_bdev, PAGE_SIZE); + clear_page(swsusp_header); + error = hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) - return error; + goto put; - if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { + if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = bio_write_page(swsusp_resume_block, + error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { - return -EINVAL; + error = -EINVAL; } + +put: if (error) - blkdev_put(resume_bdev); + blkdev_put(hib_resume_bdev, FMODE_READ); else - pr_debug("PM: Signature found, resuming\n"); + pr_debug("PM: Image signature found, resuming\n"); } else { - error = PTR_ERR(resume_bdev); + error = PTR_ERR(hib_resume_bdev); } if (error) - pr_debug("PM: Error %d checking image file\n", error); + pr_debug("PM: Image not found (code %d)\n", error); return error; } @@ -627,15 +1462,43 @@ int swsusp_check(void) * swsusp_close - close swap device. */ -void swsusp_close(void) +void swsusp_close(fmode_t mode) { - if (IS_ERR(resume_bdev)) { + if (IS_ERR(hib_resume_bdev)) { pr_debug("PM: Image device not initialised\n"); return; } - blkdev_put(resume_bdev); + blkdev_put(hib_resume_bdev, mode); +} + +/** + * swsusp_unmark - Unmark swsusp signature in the resume device + */ + +#ifdef CONFIG_SUSPEND +int swsusp_unmark(void) +{ + int error; + + hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { + memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); + error = hib_bio_write_page(swsusp_resume_block, + swsusp_header, NULL); + } else { + printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); + error = -ENODEV; + } + + /* + * We just returned from suspend, we don't need the image any more. + */ + free_all_swap_pages(root_swap); + + return error; } +#endif static int swsusp_header_init(void) { diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c deleted file mode 100644 index 023ff2a31d8..00000000000 --- a/kernel/power/swsusp.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * linux/kernel/power/swsusp.c - * - * This file provides code to write suspend image to swap and read it back. - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> - * - * This file is released under the GPLv2. - * - * I'd like to thank the following people for their work: - * - * Pavel Machek <pavel@ucw.cz>: - * Modifications, defectiveness pointing, being with me at the very beginning, - * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. - * - * Steve Doddi <dirk@loth.demon.co.uk>: - * Support the possibility of hardware state restoring. - * - * Raph <grey.havens@earthling.net>: - * Support for preserving states of network devices and virtual console - * (including X and svgatextmode) - * - * Kurt Garloff <garloff@suse.de>: - * Straightened the critical function in order to prevent compilers from - * playing tricks with local variables. - * - * Andreas Mohr <a.mohr@mailto.de> - * - * Alex Badea <vampire@go.ro>: - * Fixed runaway init - * - * Rafael J. Wysocki <rjw@sisk.pl> - * Reworked the freeing of memory and the handling of swap - * - * More state savers are welcome. Especially for the scsi layer... - * - * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt - */ - -#include <linux/mm.h> -#include <linux/suspend.h> -#include <linux/spinlock.h> -#include <linux/kernel.h> -#include <linux/major.h> -#include <linux/swap.h> -#include <linux/pm.h> -#include <linux/swapops.h> -#include <linux/bootmem.h> -#include <linux/syscalls.h> -#include <linux/highmem.h> -#include <linux/time.h> -#include <linux/rbtree.h> - -#include "power.h" - -/* - * Preferred image size in bytes (tunable via /sys/power/image_size). - * When it is set to N, swsusp will do its best to ensure the image - * size will not exceed N bytes, but if that is impossible, it will - * try to create the smallest image possible. - */ -unsigned long image_size = 500 * 1024 * 1024; - -int in_suspend __nosavedata = 0; - -/** - * The following functions are used for tracing the allocated - * swap pages, so that they can be freed in case of an error. - */ - -struct swsusp_extent { - struct rb_node node; - unsigned long start; - unsigned long end; -}; - -static struct rb_root swsusp_extents = RB_ROOT; - -static int swsusp_extents_insert(unsigned long swap_offset) -{ - struct rb_node **new = &(swsusp_extents.rb_node); - struct rb_node *parent = NULL; - struct swsusp_extent *ext; - - /* Figure out where to put the new node */ - while (*new) { - ext = container_of(*new, struct swsusp_extent, node); - parent = *new; - if (swap_offset < ext->start) { - /* Try to merge */ - if (swap_offset == ext->start - 1) { - ext->start--; - return 0; - } - new = &((*new)->rb_left); - } else if (swap_offset > ext->end) { - /* Try to merge */ - if (swap_offset == ext->end + 1) { - ext->end++; - return 0; - } - new = &((*new)->rb_right); - } else { - /* It already is in the tree */ - return -EINVAL; - } - } - /* Add the new node and rebalance the tree. */ - ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); - if (!ext) - return -ENOMEM; - - ext->start = swap_offset; - ext->end = swap_offset; - rb_link_node(&ext->node, parent, new); - rb_insert_color(&ext->node, &swsusp_extents); - return 0; -} - -/** - * alloc_swapdev_block - allocate a swap page and register that it has - * been allocated, so that it can be freed in case of an error. - */ - -sector_t alloc_swapdev_block(int swap) -{ - unsigned long offset; - - offset = swp_offset(get_swap_page_of_type(swap)); - if (offset) { - if (swsusp_extents_insert(offset)) - swap_free(swp_entry(swap, offset)); - else - return swapdev_block(swap, offset); - } - return 0; -} - -/** - * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entres had been - * allocated. - */ - -void free_all_swap_pages(int swap) -{ - struct rb_node *node; - - while ((node = swsusp_extents.rb_node)) { - struct swsusp_extent *ext; - unsigned long offset; - - ext = container_of(node, struct swsusp_extent, node); - rb_erase(node, &swsusp_extents); - for (offset = ext->start; offset <= ext->end; offset++) - swap_free(swp_entry(swap, offset)); - - kfree(ext); - } -} - -int swsusp_swap_in_use(void) -{ - return (swsusp_extents.rb_node != NULL); -} - -/** - * swsusp_show_speed - print the time elapsed between two events represented by - * @start and @stop - * - * @nr_pages - number of pages processed between @start and @stop - * @msg - introductory message to print - */ - -void swsusp_show_speed(struct timeval *start, struct timeval *stop, - unsigned nr_pages, char *msg) -{ - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; - - elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); - do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); - centisecs = elapsed_centisecs64; - if (centisecs == 0) - centisecs = 1; /* avoid div-by-zero */ - k = nr_pages * (PAGE_SIZE / 1024); - kps = (k * 100) / centisecs; - printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", - msg, k, - centisecs / 100, centisecs % 100, - kps / 1000, (kps % 1000) / 10); -} - -/** - * swsusp_shrink_memory - Try to free as much memory as needed - * - * ... but do not OOM-kill anyone - * - * Notice: all userland should be stopped before it is called, or - * livelock is possible. - */ - -#define SHRINK_BITE 10000 -static inline unsigned long __shrink_memory(long tmp) -{ - if (tmp > SHRINK_BITE) - tmp = SHRINK_BITE; - return shrink_all_memory(tmp); -} - -int swsusp_shrink_memory(void) -{ - long tmp; - struct zone *zone; - unsigned long pages = 0; - unsigned int i = 0; - char *p = "-\\|/"; - struct timeval start, stop; - - printk(KERN_INFO "PM: Shrinking memory... "); - do_gettimeofday(&start); - do { - long size, highmem_size; - - highmem_size = count_highmem_pages(); - size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; - tmp = size; - size += highmem_size; - for_each_zone (zone) - if (populated_zone(zone)) { - tmp += snapshot_additional_pages(zone); - if (is_highmem(zone)) { - highmem_size -= - zone_page_state(zone, NR_FREE_PAGES); - } else { - tmp -= zone_page_state(zone, NR_FREE_PAGES); - tmp += zone->lowmem_reserve[ZONE_NORMAL]; - } - } - - if (highmem_size < 0) - highmem_size = 0; - - tmp += highmem_size; - if (tmp > 0) { - tmp = __shrink_memory(tmp); - if (!tmp) - return -ENOMEM; - pages += tmp; - } else if (size > image_size / PAGE_SIZE) { - tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); - pages += tmp; - } - printk("\b%c", p[i++%4]); - } while (tmp > 0); - do_gettimeofday(&stop); - printk("\bdone (%lu pages freed)\n", pages); - swsusp_show_speed(&start, &stop, pages, "Freed"); - - return 0; -} diff --git a/kernel/power/user.c b/kernel/power/user.c index f5512cb3aa8..526e8911460 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -20,6 +20,7 @@ #include <linux/swapops.h> #include <linux/pm.h> #include <linux/fs.h> +#include <linux/compat.h> #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> @@ -28,28 +29,6 @@ #include "power.h" -/* - * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and - * will be removed in the future. They are only preserved here for - * compatibility with existing userland utilities. - */ -#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) -#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) - -#define PMOPS_PREPARE 1 -#define PMOPS_ENTER 2 -#define PMOPS_FINISH 3 - -/* - * NOTE: The following ioctl definitions are wrong and have been replaced with - * correct ones. They are only preserved here for compatibility with existing - * userland utilities and will be removed in the future. - */ -#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) -#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) -#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) -#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) - #define SNAPSHOT_MINOR 231 @@ -57,9 +36,10 @@ static struct snapshot_data { struct snapshot_handle handle; int swap; int mode; - char frozen; - char ready; - char platform_support; + bool frozen; + bool ready; + bool platform_support; + bool free_bitmaps; } snapshot_state; atomic_t snapshot_device_available = ATOMIC_INIT(1); @@ -69,62 +49,86 @@ static int snapshot_open(struct inode *inode, struct file *filp) struct snapshot_data *data; int error; - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) - return -EBUSY; + if (!hibernation_available()) + return -EPERM; - if ((filp->f_flags & O_ACCMODE) == O_RDWR) { - atomic_inc(&snapshot_device_available); - return -ENOSYS; + lock_system_sleep(); + + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; } - if(create_basic_memory_bitmaps()) { + + if ((filp->f_flags & O_ACCMODE) == O_RDWR) { atomic_inc(&snapshot_device_available); - return -ENOMEM; + error = -ENOSYS; + goto Unlock; } nonseekable_open(inode, filp); data = &snapshot_state; filp->private_data = data; memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { + /* Hibernating. The image device should be accessible. */ data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + data->free_bitmaps = false; + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); if (error) - pm_notifier_call_chain(PM_POST_RESTORE); + pm_notifier_call_chain(PM_POST_HIBERNATION); } else { + /* + * Resuming. We may need to wait for the image device to + * appear. + */ + wait_for_device_probe(); + data->swap = -1; data->mode = O_WRONLY; - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + if (!error) { + error = create_basic_memory_bitmaps(); + data->free_bitmaps = !error; + } if (error) - pm_notifier_call_chain(PM_POST_HIBERNATION); + pm_notifier_call_chain(PM_POST_RESTORE); } - if (error) { + if (error) atomic_inc(&snapshot_device_available); - return error; - } - data->frozen = 0; - data->ready = 0; - data->platform_support = 0; - return 0; + data->frozen = false; + data->ready = false; + data->platform_support = false; + + Unlock: + unlock_system_sleep(); + + return error; } static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; + lock_system_sleep(); + swsusp_free(); - free_basic_memory_bitmaps(); data = filp->private_data; free_all_swap_pages(data->swap); if (data->frozen) { - mutex_lock(&pm_mutex); + pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); thaw_processes(); - mutex_unlock(&pm_mutex); + } else if (data->free_bitmaps) { + free_basic_memory_bitmaps(); } - pm_notifier_call_chain(data->mode == O_WRONLY ? + pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); atomic_inc(&snapshot_device_available); + + unlock_system_sleep(); + return 0; } @@ -133,17 +137,31 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, { struct snapshot_data *data; ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; + + lock_system_sleep(); data = filp->private_data; - if (!data->ready) - return -ENODATA; - res = snapshot_read_next(&data->handle, count); - if (res > 0) { - if (copy_to_user(buf, data_of(data->handle), res)) - res = -EFAULT; - else - *offp = data->handle.offset; + if (!data->ready) { + res = -ENODATA; + goto Unlock; + } + if (!pg_offp) { /* on page boundary? */ + res = snapshot_read_next(&data->handle); + if (res <= 0) + goto Unlock; + } else { + res = PAGE_SIZE - pg_offp; } + + res = simple_read_from_buffer(buf, count, &pg_offp, + data_of(data->handle), res); + if (res > 0) + *offp += res; + + Unlock: + unlock_system_sleep(); + return res; } @@ -152,20 +170,32 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, { struct snapshot_data *data; ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; + + lock_system_sleep(); data = filp->private_data; - res = snapshot_write_next(&data->handle, count); - if (res > 0) { - if (copy_from_user(data_of(data->handle), buf, res)) - res = -EFAULT; - else - *offp = data->handle.offset; + + if (!pg_offp) { + res = snapshot_write_next(&data->handle); + if (res <= 0) + goto unlock; + } else { + res = PAGE_SIZE - pg_offp; } + + res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, + buf, count); + if (res > 0) + *offp += res; +unlock: + unlock_system_sleep(); + return res; } -static int snapshot_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static long snapshot_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) { int error = 0; struct snapshot_data *data; @@ -179,6 +209,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!mutex_trylock(&pm_mutex)) + return -EBUSY; + + lock_device_hotplug(); data = filp->private_data; switch (cmd) { @@ -186,39 +220,45 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, case SNAPSHOT_FREEZE: if (data->frozen) break; - mutex_lock(&pm_mutex); + printk("Syncing filesystems ... "); sys_sync(); printk("done.\n"); error = freeze_processes(); if (error) + break; + + error = create_basic_memory_bitmaps(); + if (error) thaw_processes(); - mutex_unlock(&pm_mutex); - if (!error) - data->frozen = 1; + else + data->frozen = true; + break; case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; - mutex_lock(&pm_mutex); + pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); + data->free_bitmaps = false; thaw_processes(); - mutex_unlock(&pm_mutex); - data->frozen = 0; + data->frozen = false; break; case SNAPSHOT_CREATE_IMAGE: - case SNAPSHOT_ATOMIC_SNAPSHOT: if (data->mode != O_RDONLY || !data->frozen || data->ready) { error = -EPERM; break; } + pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (!error) + if (!error) { error = put_user(in_suspend, (int __user *)arg); - if (!error) - data->ready = 1; + data->ready = !freezer_test_done && !error; + freezer_test_done = false; + } break; case SNAPSHOT_ATOMIC_RESTORE: @@ -234,11 +274,19 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, case SNAPSHOT_FREE: swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); - data->ready = 0; + data->ready = false; + /* + * It is necessary to thaw kernel threads here, because + * SNAPSHOT_CREATE_IMAGE may be invoked directly after + * SNAPSHOT_FREE. In that case, if kernel threads were not + * thawed, the preallocation of memory carried out by + * hibernation_snapshot() might run into problems (i.e. it + * might fail or even deadlock). + */ + thaw_kernel_threads(); break; case SNAPSHOT_PREF_IMAGE_SIZE: - case SNAPSHOT_SET_IMAGE_SIZE: image_size = arg; break; @@ -253,14 +301,12 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; case SNAPSHOT_AVAIL_SWAP_SIZE: - case SNAPSHOT_AVAIL_SWAP: size = count_swap_pages(data->swap, 1); size <<= PAGE_SHIFT; error = put_user(size, (loff_t __user *)arg); break; case SNAPSHOT_ALLOC_SWAP_PAGE: - case SNAPSHOT_GET_SWAP_PAGE: if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { error = -ENODEV; break; @@ -282,41 +328,17 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, free_all_swap_pages(data->swap); break; - case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ - if (!swsusp_swap_in_use()) { - /* - * User space encodes device types as two-byte values, - * so we need to recode them - */ - if (old_decode_dev(arg)) { - data->swap = swap_type_of(old_decode_dev(arg), - 0, NULL); - if (data->swap < 0) - error = -ENODEV; - } else { - data->swap = -1; - error = -EINVAL; - } - } else { - error = -EPERM; - } - break; - case SNAPSHOT_S2RAM: if (!data->frozen) { error = -EPERM; break; } - if (!mutex_trylock(&pm_mutex)) { - error = -EBUSY; - break; - } /* * Tasks are frozen and the notifiers have been called with * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); - mutex_unlock(&pm_mutex); + data->ready = false; break; case SNAPSHOT_PLATFORM_SUPPORT: @@ -328,32 +350,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = hibernation_platform_enter(); break; - case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ - error = -EINVAL; - - switch (arg) { - - case PMOPS_PREPARE: - data->platform_support = 1; - error = 0; - break; - - case PMOPS_ENTER: - if (data->platform_support) - error = hibernation_platform_enter(); - break; - - case PMOPS_FINISH: - if (data->platform_support) - error = 0; - break; - - default: - printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); - - } - break; - case SNAPSHOT_SET_SWAP_AREA: if (swsusp_swap_in_use()) { error = -EPERM; @@ -372,7 +368,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, * User space encodes device types as two-byte values, * so we need to recode them */ - swdev = old_decode_dev(swap_area.dev); + swdev = new_decode_dev(swap_area.dev); if (swdev) { offset = swap_area.offset; data->swap = swap_type_of(swdev, offset, NULL); @@ -390,16 +386,82 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, } + unlock_device_hotplug(); + mutex_unlock(&pm_mutex); + return error; } +#ifdef CONFIG_COMPAT + +struct compat_resume_swap_area { + compat_loff_t offset; + u32 dev; +} __packed; + +static long +snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t)); + + switch (cmd) { + case SNAPSHOT_GET_IMAGE_SIZE: + case SNAPSHOT_AVAIL_SWAP_SIZE: + case SNAPSHOT_ALLOC_SWAP_PAGE: { + compat_loff_t __user *uoffset = compat_ptr(arg); + loff_t offset; + mm_segment_t old_fs; + int err; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = snapshot_ioctl(file, cmd, (unsigned long) &offset); + set_fs(old_fs); + if (!err && put_user(offset, uoffset)) + err = -EFAULT; + return err; + } + + case SNAPSHOT_CREATE_IMAGE: + return snapshot_ioctl(file, cmd, + (unsigned long) compat_ptr(arg)); + + case SNAPSHOT_SET_SWAP_AREA: { + struct compat_resume_swap_area __user *u_swap_area = + compat_ptr(arg); + struct resume_swap_area swap_area; + mm_segment_t old_fs; + int err; + + err = get_user(swap_area.offset, &u_swap_area->offset); + err |= get_user(swap_area.dev, &u_swap_area->dev); + if (err) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA, + (unsigned long) &swap_area); + set_fs(old_fs); + return err; + } + + default: + return snapshot_ioctl(file, cmd, arg); + } +} + +#endif /* CONFIG_COMPAT */ + static const struct file_operations snapshot_fops = { .open = snapshot_open, .release = snapshot_release, .read = snapshot_read, .write = snapshot_write, .llseek = no_llseek, - .ioctl = snapshot_ioctl, + .unlocked_ioctl = snapshot_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = snapshot_compat_ioctl, +#endif }; static struct miscdevice snapshot_device = { diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c new file mode 100644 index 00000000000..019069c84ff --- /dev/null +++ b/kernel/power/wakelock.c @@ -0,0 +1,268 @@ +/* + * kernel/power/wakelock.c + * + * User space wakeup sources support. + * + * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> + * + * This code is based on the analogous interface allowing user space to + * manipulate wakelocks on Android. + */ + +#include <linux/capability.h> +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/slab.h> + +#include "power.h" + +static DEFINE_MUTEX(wakelocks_lock); + +struct wakelock { + char *name; + struct rb_node node; + struct wakeup_source ws; +#ifdef CONFIG_PM_WAKELOCKS_GC + struct list_head lru; +#endif +}; + +static struct rb_root wakelocks_tree = RB_ROOT; + +ssize_t pm_show_wakelocks(char *buf, bool show_active) +{ + struct rb_node *node; + struct wakelock *wl; + char *str = buf; + char *end = buf + PAGE_SIZE; + + mutex_lock(&wakelocks_lock); + + for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { + wl = rb_entry(node, struct wakelock, node); + if (wl->ws.active == show_active) + str += scnprintf(str, end - str, "%s ", wl->name); + } + if (str > buf) + str--; + + str += scnprintf(str, end - str, "\n"); + + mutex_unlock(&wakelocks_lock); + return (str - buf); +} + +#if CONFIG_PM_WAKELOCKS_LIMIT > 0 +static unsigned int number_of_wakelocks; + +static inline bool wakelocks_limit_exceeded(void) +{ + return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT; +} + +static inline void increment_wakelocks_number(void) +{ + number_of_wakelocks++; +} + +static inline void decrement_wakelocks_number(void) +{ + number_of_wakelocks--; +} +#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */ +static inline bool wakelocks_limit_exceeded(void) { return false; } +static inline void increment_wakelocks_number(void) {} +static inline void decrement_wakelocks_number(void) {} +#endif /* CONFIG_PM_WAKELOCKS_LIMIT */ + +#ifdef CONFIG_PM_WAKELOCKS_GC +#define WL_GC_COUNT_MAX 100 +#define WL_GC_TIME_SEC 300 + +static LIST_HEAD(wakelocks_lru_list); +static unsigned int wakelocks_gc_count; + +static inline void wakelocks_lru_add(struct wakelock *wl) +{ + list_add(&wl->lru, &wakelocks_lru_list); +} + +static inline void wakelocks_lru_most_recent(struct wakelock *wl) +{ + list_move(&wl->lru, &wakelocks_lru_list); +} + +static void wakelocks_gc(void) +{ + struct wakelock *wl, *aux; + ktime_t now; + + if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) + return; + + now = ktime_get(); + list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { + u64 idle_time_ns; + bool active; + + spin_lock_irq(&wl->ws.lock); + idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); + active = wl->ws.active; + spin_unlock_irq(&wl->ws.lock); + + if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) + break; + + if (!active) { + wakeup_source_remove(&wl->ws); + rb_erase(&wl->node, &wakelocks_tree); + list_del(&wl->lru); + kfree(wl->name); + kfree(wl); + decrement_wakelocks_number(); + } + } + wakelocks_gc_count = 0; +} +#else /* !CONFIG_PM_WAKELOCKS_GC */ +static inline void wakelocks_lru_add(struct wakelock *wl) {} +static inline void wakelocks_lru_most_recent(struct wakelock *wl) {} +static inline void wakelocks_gc(void) {} +#endif /* !CONFIG_PM_WAKELOCKS_GC */ + +static struct wakelock *wakelock_lookup_add(const char *name, size_t len, + bool add_if_not_found) +{ + struct rb_node **node = &wakelocks_tree.rb_node; + struct rb_node *parent = *node; + struct wakelock *wl; + + while (*node) { + int diff; + + parent = *node; + wl = rb_entry(*node, struct wakelock, node); + diff = strncmp(name, wl->name, len); + if (diff == 0) { + if (wl->name[len]) + diff = -1; + else + return wl; + } + if (diff < 0) + node = &(*node)->rb_left; + else + node = &(*node)->rb_right; + } + if (!add_if_not_found) + return ERR_PTR(-EINVAL); + + if (wakelocks_limit_exceeded()) + return ERR_PTR(-ENOSPC); + + /* Not found, we have to add a new one. */ + wl = kzalloc(sizeof(*wl), GFP_KERNEL); + if (!wl) + return ERR_PTR(-ENOMEM); + + wl->name = kstrndup(name, len, GFP_KERNEL); + if (!wl->name) { + kfree(wl); + return ERR_PTR(-ENOMEM); + } + wl->ws.name = wl->name; + wakeup_source_add(&wl->ws); + rb_link_node(&wl->node, parent, node); + rb_insert_color(&wl->node, &wakelocks_tree); + wakelocks_lru_add(wl); + increment_wakelocks_number(); + return wl; +} + +int pm_wake_lock(const char *buf) +{ + const char *str = buf; + struct wakelock *wl; + u64 timeout_ns = 0; + size_t len; + int ret = 0; + + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + + while (*str && !isspace(*str)) + str++; + + len = str - buf; + if (!len) + return -EINVAL; + + if (*str && *str != '\n') { + /* Find out if there's a valid timeout string appended. */ + ret = kstrtou64(skip_spaces(str), 10, &timeout_ns); + if (ret) + return -EINVAL; + } + + mutex_lock(&wakelocks_lock); + + wl = wakelock_lookup_add(buf, len, true); + if (IS_ERR(wl)) { + ret = PTR_ERR(wl); + goto out; + } + if (timeout_ns) { + u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; + + do_div(timeout_ms, NSEC_PER_MSEC); + __pm_wakeup_event(&wl->ws, timeout_ms); + } else { + __pm_stay_awake(&wl->ws); + } + + wakelocks_lru_most_recent(wl); + + out: + mutex_unlock(&wakelocks_lock); + return ret; +} + +int pm_wake_unlock(const char *buf) +{ + struct wakelock *wl; + size_t len; + int ret = 0; + + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + + len = strlen(buf); + if (!len) + return -EINVAL; + + if (buf[len-1] == '\n') + len--; + + if (!len) + return -EINVAL; + + mutex_lock(&wakelocks_lock); + + wl = wakelock_lookup_add(buf, len, false); + if (IS_ERR(wl)) { + ret = PTR_ERR(wl); + goto out; + } + __pm_relax(&wl->ws); + + wakelocks_lru_most_recent(wl); + wakelocks_gc(); + + out: + mutex_unlock(&wakelocks_lock); + return ret; +} |
