From 1eede070a59e1cc73da51e1aaa00d9ab86572cfc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 20 May 2008 23:00:01 +0200 Subject: Introduce new top level suspend and hibernation callbacks Introduce 'struct pm_ops' and 'struct pm_ext_ops' ('ext' meaning 'extended') representing suspend and hibernation operations for bus types, device classes, device types and device drivers. Modify the PM core to use 'struct pm_ops' and 'struct pm_ext_ops' objects, if defined, instead of the ->suspend(), ->resume(), ->suspend_late(), and ->resume_early() callbacks (the old callbacks will be considered as legacy and gradually phased out). The main purpose of doing this is to separate suspend (aka S2RAM and standby) callbacks from hibernation callbacks in such a way that the new callbacks won't take arguments and the semantics of each of them will be clearly specified. This has been requested for multiple times by many people, including Linus himself, and the reason is that within the current scheme if ->resume() is called, for example, it's difficult to say why it's been called (ie. is it a resume from RAM or from hibernation or a suspend/hibernation failure etc.?). The second purpose is to make the suspend/hibernation callbacks more flexible so that device drivers can handle more than they can within the current scheme. For example, some drivers may need to prevent new children of the device from being registered before their ->suspend() callbacks are executed or they may want to carry out some operations requiring the availability of some other devices, not directly bound via the parent-child relationship, in order to prepare for the execution of ->suspend(), etc. Ultimately, we'd like to stop using the freezing of tasks for suspend and therefore the drivers' suspend/hibernation code will have to take care of the handling of the user space during suspend/hibernation. That, in turn, would be difficult within the current scheme, without the new ->prepare() and ->complete() callbacks. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Jesse Barnes --- kernel/power/disk.c | 22 +++++++++++++++------- kernel/power/main.c | 6 ++++-- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel/power') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 14a656cdc65..d416be0efa8 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -193,6 +193,7 @@ static int create_image(int platform_mode) if (error) return error; + device_pm_lock(); local_irq_disable(); /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* call device_power_down() now. @@ -224,9 +225,11 @@ static int create_image(int platform_mode) /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ - device_power_up(); + device_power_up(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -280,7 +283,8 @@ int hibernation_snapshot(int platform_mode) Finish: platform_finish(platform_mode); Resume_devices: - device_resume(); + device_resume(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Resume_console: resume_console(); Close: @@ -300,8 +304,9 @@ static int resume_target_kernel(void) { int error; + device_pm_lock(); local_irq_disable(); - error = device_power_down(PMSG_PRETHAW); + error = device_power_down(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -329,9 +334,10 @@ static int resume_target_kernel(void) swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); - device_power_up(); + device_power_up(PMSG_RECOVER); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -350,7 +356,7 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - error = device_suspend(PMSG_PRETHAW); + error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -362,7 +368,7 @@ int hibernation_restore(int platform_mode) enable_nonboot_cpus(); } platform_restore_cleanup(platform_mode); - device_resume(); + device_resume(PMSG_RECOVER); Finish: resume_console(); pm_restore_console(); @@ -403,6 +409,7 @@ int hibernation_platform_enter(void) if (error) goto Finish; + device_pm_lock(); local_irq_disable(); error = device_power_down(PMSG_HIBERNATE); if (!error) { @@ -411,6 +418,7 @@ int hibernation_platform_enter(void) while (1); } local_irq_enable(); + device_pm_unlock(); /* * We don't need to reenable the nonboot CPUs or resume consoles, since @@ -419,7 +427,7 @@ int hibernation_platform_enter(void) Finish: hibernation_ops->finish(); Resume_devices: - device_resume(); + device_resume(PMSG_RESTORE); Resume_console: resume_console(); Close: diff --git a/kernel/power/main.c b/kernel/power/main.c index 6a6d5eb3524..d023b6b584e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -228,6 +228,7 @@ static int suspend_enter(suspend_state_t state) { int error = 0; + device_pm_lock(); arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); @@ -239,10 +240,11 @@ static int suspend_enter(suspend_state_t state) if (!suspend_test(TEST_CORE)) error = suspend_ops->enter(state); - device_power_up(); + device_power_up(PMSG_RESUME); Done: arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); + device_pm_unlock(); return error; } @@ -291,7 +293,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); Resume_devices: - device_resume(); + device_resume(PMSG_RESUME); Resume_console: resume_console(); Close: -- cgit v1.2.3-70-g09d2 From d8f3de0d2412bb91639cfefc5b3c79dbf3812212 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 12 Jun 2008 23:24:06 +0200 Subject: Suspend-related patches for 2.6.27 ACPI PM: Add possibility to change suspend sequence There are some systems out there that don't work correctly with our current suspend/hibernation code ordering. Provide a workaround for these systems allowing them to pass 'acpi_sleep=old_ordering' in the kernel command line so that it will use the pre-ACPI 2.0 ("old") suspend code ordering. Unfortunately, this requires us to add a platform hook to the resuming of devices for recovering the platform in case one of the device drivers' .suspend() routines returns error code. Namely, ACPI 1.0 specifies that _PTS should be called before suspending devices, but _WAK still should be called before resuming them in order to undo the changes made by _PTS. However, if there is an error during suspending devices, they are automatically resumed without returning control to the PM core, so the _WAK has to be called from within device_resume() in that cases. The patch also reorders and refactors the ACPI suspend/hibernation code to avoid duplication as far as reasonably possible. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Jesse Barnes --- Documentation/kernel-parameters.txt | 6 +- arch/x86/kernel/acpi/sleep.c | 2 + drivers/acpi/sleep/main.c | 276 +++++++++++++++++++++--------------- drivers/base/power/main.c | 2 - include/linux/acpi.h | 3 + include/linux/suspend.h | 14 +- kernel/power/disk.c | 28 +++- kernel/power/main.c | 10 +- 8 files changed, 215 insertions(+), 126 deletions(-) (limited to 'kernel/power') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9cf7b34f2db..18d793ea0dd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -147,10 +147,14 @@ and is between 256 and 4096 characters. It is defined in the file default: 0 acpi_sleep= [HW,ACPI] Sleep options - Format: { s3_bios, s3_mode, s3_beep } + Format: { s3_bios, s3_mode, s3_beep, old_ordering } See Documentation/power/video.txt for s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep as soon as the kernel's real-mode entry point is called. + old_ordering causes the ACPI 1.0 ordering of the _PTS + control method, wrt putting devices into low power + states, to be enforced (the ACPI 2.0 ordering of _PTS is + used by default). acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode Format: { level | edge | high | low } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index afc25ee9964..882e970032d 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -124,6 +124,8 @@ static int __init acpi_sleep_setup(char *str) acpi_realmode_flags |= 2; if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; + if (strncmp(str, "old_ordering", 12) == 0) + acpi_old_suspend_ordering(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c index 0f2caea2fc8..4addf8ad50a 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -24,10 +24,6 @@ u8 sleep_states[ACPI_S_STATE_COUNT]; -#ifdef CONFIG_PM_SLEEP -static u32 acpi_target_sleep_state = ACPI_STATE_S0; -#endif - static int acpi_sleep_prepare(u32 acpi_state) { #ifdef CONFIG_ACPI_SLEEP @@ -50,9 +46,96 @@ static int acpi_sleep_prepare(u32 acpi_state) return 0; } -#ifdef CONFIG_SUSPEND -static struct platform_suspend_ops acpi_suspend_ops; +#ifdef CONFIG_PM_SLEEP +static u32 acpi_target_sleep_state = ACPI_STATE_S0; + +/* + * ACPI 1.0 wants us to execute _PTS before suspending devices, so we allow the + * user to request that behavior by using the 'acpi_old_suspend_ordering' + * kernel command line option that causes the following variable to be set. + */ +static bool old_suspend_ordering; + +void __init acpi_old_suspend_ordering(void) +{ + old_suspend_ordering = true; +} + +/** + * acpi_pm_disable_gpes - Disable the GPEs. + */ +static int acpi_pm_disable_gpes(void) +{ + acpi_hw_disable_all_gpes(); + return 0; +} + +/** + * __acpi_pm_prepare - Prepare the platform to enter the target state. + * + * If necessary, set the firmware waking vector and do arch-specific + * nastiness to get the wakeup code to the waking vector. + */ +static int __acpi_pm_prepare(void) +{ + int error = acpi_sleep_prepare(acpi_target_sleep_state); + + if (error) + acpi_target_sleep_state = ACPI_STATE_S0; + return error; +} + +/** + * acpi_pm_prepare - Prepare the platform to enter the target sleep + * state and disable the GPEs. + */ +static int acpi_pm_prepare(void) +{ + int error = __acpi_pm_prepare(); + + if (!error) + acpi_hw_disable_all_gpes(); + return error; +} + +/** + * acpi_pm_finish - Instruct the platform to leave a sleep state. + * + * This is called after we wake back up (or if entering the sleep state + * failed). + */ +static void acpi_pm_finish(void) +{ + u32 acpi_state = acpi_target_sleep_state; + + if (acpi_state == ACPI_STATE_S0) + return; + printk(KERN_INFO PREFIX "Waking up from system sleep state S%d\n", + acpi_state); + acpi_disable_wakeup_device(acpi_state); + acpi_leave_sleep_state(acpi_state); + + /* reset firmware waking vector */ + acpi_set_firmware_waking_vector((acpi_physical_address) 0); + + acpi_target_sleep_state = ACPI_STATE_S0; +} + +/** + * acpi_pm_end - Finish up suspend sequence. + */ +static void acpi_pm_end(void) +{ + /* + * This is necessary in case acpi_pm_finish() is not called during a + * failing transition to a sleep state. + */ + acpi_target_sleep_state = ACPI_STATE_S0; +} +#endif /* CONFIG_PM_SLEEP */ + +#ifdef CONFIG_SUSPEND extern void do_suspend_lowlevel(void); static u32 acpi_suspend_states[] = { @@ -66,7 +149,6 @@ static u32 acpi_suspend_states[] = { * acpi_suspend_begin - Set the target system sleep state to the state * associated with given @pm_state, if supported. */ - static int acpi_suspend_begin(suspend_state_t pm_state) { u32 acpi_state = acpi_suspend_states[pm_state]; @@ -82,25 +164,6 @@ static int acpi_suspend_begin(suspend_state_t pm_state) return error; } -/** - * acpi_suspend_prepare - Do preliminary suspend work. - * - * If necessary, set the firmware waking vector and do arch-specific - * nastiness to get the wakeup code to the waking vector. - */ - -static int acpi_suspend_prepare(void) -{ - int error = acpi_sleep_prepare(acpi_target_sleep_state); - - if (error) { - acpi_target_sleep_state = ACPI_STATE_S0; - return error; - } - - return ACPI_SUCCESS(acpi_hw_disable_all_gpes()) ? 0 : -EFAULT; -} - /** * acpi_suspend_enter - Actually enter a sleep state. * @pm_state: ignored @@ -109,7 +172,6 @@ static int acpi_suspend_prepare(void) * assembly, which in turn call acpi_enter_sleep_state(). * It's unfortunate, but it works. Please fix if you're feeling frisky. */ - static int acpi_suspend_enter(suspend_state_t pm_state) { acpi_status status = AE_OK; @@ -166,39 +228,6 @@ static int acpi_suspend_enter(suspend_state_t pm_state) return ACPI_SUCCESS(status) ? 0 : -EFAULT; } -/** - * acpi_suspend_finish - Instruct the platform to leave a sleep state. - * - * This is called after we wake back up (or if entering the sleep state - * failed). - */ - -static void acpi_suspend_finish(void) -{ - u32 acpi_state = acpi_target_sleep_state; - - acpi_disable_wakeup_device(acpi_state); - acpi_leave_sleep_state(acpi_state); - - /* reset firmware waking vector */ - acpi_set_firmware_waking_vector((acpi_physical_address) 0); - - acpi_target_sleep_state = ACPI_STATE_S0; -} - -/** - * acpi_suspend_end - Finish up suspend sequence. - */ - -static void acpi_suspend_end(void) -{ - /* - * This is necessary in case acpi_suspend_finish() is not called during a - * failing transition to a sleep state. - */ - acpi_target_sleep_state = ACPI_STATE_S0; -} - static int acpi_suspend_state_valid(suspend_state_t pm_state) { u32 acpi_state; @@ -218,10 +247,39 @@ static int acpi_suspend_state_valid(suspend_state_t pm_state) static struct platform_suspend_ops acpi_suspend_ops = { .valid = acpi_suspend_state_valid, .begin = acpi_suspend_begin, - .prepare = acpi_suspend_prepare, + .prepare = acpi_pm_prepare, + .enter = acpi_suspend_enter, + .finish = acpi_pm_finish, + .end = acpi_pm_end, +}; + +/** + * acpi_suspend_begin_old - Set the target system sleep state to the + * state associated with given @pm_state, if supported, and + * execute the _PTS control method. This function is used if the + * pre-ACPI 2.0 suspend ordering has been requested. + */ +static int acpi_suspend_begin_old(suspend_state_t pm_state) +{ + int error = acpi_suspend_begin(pm_state); + + if (!error) + error = __acpi_pm_prepare(); + return error; +} + +/* + * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has + * been requested. + */ +static struct platform_suspend_ops acpi_suspend_ops_old = { + .valid = acpi_suspend_state_valid, + .begin = acpi_suspend_begin_old, + .prepare = acpi_pm_disable_gpes, .enter = acpi_suspend_enter, - .finish = acpi_suspend_finish, - .end = acpi_suspend_end, + .finish = acpi_pm_finish, + .end = acpi_pm_end, + .recover = acpi_pm_finish, }; #endif /* CONFIG_SUSPEND */ @@ -229,22 +287,9 @@ static struct platform_suspend_ops acpi_suspend_ops = { static int acpi_hibernation_begin(void) { acpi_target_sleep_state = ACPI_STATE_S4; - return 0; } -static int acpi_hibernation_prepare(void) -{ - int error = acpi_sleep_prepare(ACPI_STATE_S4); - - if (error) { - acpi_target_sleep_state = ACPI_STATE_S0; - return error; - } - - return ACPI_SUCCESS(acpi_hw_disable_all_gpes()) ? 0 : -EFAULT; -} - static int acpi_hibernation_enter(void) { acpi_status status = AE_OK; @@ -274,52 +319,55 @@ static void acpi_hibernation_leave(void) acpi_leave_sleep_state_prep(ACPI_STATE_S4); } -static void acpi_hibernation_finish(void) +static void acpi_pm_enable_gpes(void) { - acpi_disable_wakeup_device(ACPI_STATE_S4); - acpi_leave_sleep_state(ACPI_STATE_S4); - - /* reset firmware waking vector */ - acpi_set_firmware_waking_vector((acpi_physical_address) 0); - - acpi_target_sleep_state = ACPI_STATE_S0; + acpi_hw_enable_all_runtime_gpes(); } -static void acpi_hibernation_end(void) -{ - /* - * This is necessary in case acpi_hibernation_finish() is not called - * during a failing transition to the sleep state. - */ - acpi_target_sleep_state = ACPI_STATE_S0; -} +static struct platform_hibernation_ops acpi_hibernation_ops = { + .begin = acpi_hibernation_begin, + .end = acpi_pm_end, + .pre_snapshot = acpi_pm_prepare, + .finish = acpi_pm_finish, + .prepare = acpi_pm_prepare, + .enter = acpi_hibernation_enter, + .leave = acpi_hibernation_leave, + .pre_restore = acpi_pm_disable_gpes, + .restore_cleanup = acpi_pm_enable_gpes, +}; -static int acpi_hibernation_pre_restore(void) +/** + * acpi_hibernation_begin_old - Set the target system sleep state to + * ACPI_STATE_S4 and execute the _PTS control method. This + * function is used if the pre-ACPI 2.0 suspend ordering has been + * requested. + */ +static int acpi_hibernation_begin_old(void) { - acpi_status status; - - status = acpi_hw_disable_all_gpes(); - - return ACPI_SUCCESS(status) ? 0 : -EFAULT; -} + int error = acpi_sleep_prepare(ACPI_STATE_S4); -static void acpi_hibernation_restore_cleanup(void) -{ - acpi_hw_enable_all_runtime_gpes(); + if (!error) + acpi_target_sleep_state = ACPI_STATE_S4; + return error; } -static struct platform_hibernation_ops acpi_hibernation_ops = { - .begin = acpi_hibernation_begin, - .end = acpi_hibernation_end, - .pre_snapshot = acpi_hibernation_prepare, - .finish = acpi_hibernation_finish, - .prepare = acpi_hibernation_prepare, +/* + * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has + * been requested. + */ +static struct platform_hibernation_ops acpi_hibernation_ops_old = { + .begin = acpi_hibernation_begin_old, + .end = acpi_pm_end, + .pre_snapshot = acpi_pm_disable_gpes, + .finish = acpi_pm_finish, + .prepare = acpi_pm_disable_gpes, .enter = acpi_hibernation_enter, .leave = acpi_hibernation_leave, - .pre_restore = acpi_hibernation_pre_restore, - .restore_cleanup = acpi_hibernation_restore_cleanup, + .pre_restore = acpi_pm_disable_gpes, + .restore_cleanup = acpi_pm_enable_gpes, + .recover = acpi_pm_finish, }; -#endif /* CONFIG_HIBERNATION */ +#endif /* CONFIG_HIBERNATION */ int acpi_suspend(u32 acpi_state) { @@ -461,13 +509,15 @@ int __init acpi_sleep_init(void) } } - suspend_set_ops(&acpi_suspend_ops); + suspend_set_ops(old_suspend_ordering ? + &acpi_suspend_ops_old : &acpi_suspend_ops); #endif #ifdef CONFIG_HIBERNATION status = acpi_get_sleep_type_data(ACPI_STATE_S4, &type_a, &type_b); if (ACPI_SUCCESS(status)) { - hibernation_set_ops(&acpi_hibernation_ops); + hibernation_set_ops(old_suspend_ordering ? + &acpi_hibernation_ops_old : &acpi_hibernation_ops); sleep_states[ACPI_STATE_S4] = 1; printk(" S4"); } diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index d571204aaff..3250c5257b7 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -781,8 +781,6 @@ int device_suspend(pm_message_t state) error = dpm_prepare(state); if (!error) error = dpm_suspend(state); - if (error) - device_resume(resume_event(state)); return error; } EXPORT_SYMBOL_GPL(device_suspend); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 41f7ce7edd7..33adcf91ef4 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -234,6 +234,9 @@ int acpi_check_region(resource_size_t start, resource_size_t n, int acpi_check_mem_region(resource_size_t start, resource_size_t n, const char *name); +#ifdef CONFIG_PM_SLEEP +void __init acpi_old_suspend_ordering(void); +#endif /* CONFIG_PM_SLEEP */ #else /* CONFIG_ACPI */ static inline int early_acpi_boot_init(void) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index a6977423baf..e8e69159af7 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -86,6 +86,11 @@ typedef int __bitwise suspend_state_t; * that implement @begin(), but platforms implementing @begin() should * also provide a @end() which cleans up transitions aborted before * @enter(). + * + * @recover: Recover the platform from a suspend failure. + * Called by the PM core if the suspending of devices fails. + * This callback is optional and should only be implemented by platforms + * which require special recovery actions in that situation. */ struct platform_suspend_ops { int (*valid)(suspend_state_t state); @@ -94,6 +99,7 @@ struct platform_suspend_ops { int (*enter)(suspend_state_t state); void (*finish)(void); void (*end)(void); + void (*recover)(void); }; #ifdef CONFIG_SUSPEND @@ -149,7 +155,7 @@ extern void mark_free_pages(struct zone *zone); * The methods in this structure allow a platform to carry out special * operations required by it during a hibernation transition. * - * All the methods below must be implemented. + * All the methods below, except for @recover(), must be implemented. * * @begin: Tell the platform driver that we're starting hibernation. * Called right after shrinking memory and before freezing devices. @@ -189,6 +195,11 @@ extern void mark_free_pages(struct zone *zone); * @restore_cleanup: Clean up after a failing image restoration. * Called right after the nonboot CPUs have been enabled and before * thawing devices (runs with IRQs on). + * + * @recover: Recover the platform from a failure to suspend devices. + * Called by the PM core if the suspending of devices during hibernation + * fails. This callback is optional and should only be implemented by + * platforms which require special recovery actions in that situation. */ struct platform_hibernation_ops { int (*begin)(void); @@ -200,6 +211,7 @@ struct platform_hibernation_ops { void (*leave)(void); int (*pre_restore)(void); void (*restore_cleanup)(void); + void (*recover)(void); }; #ifdef CONFIG_HIBERNATION diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d416be0efa8..f011e0870b5 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -179,6 +179,17 @@ static void platform_restore_cleanup(int platform_mode) hibernation_ops->restore_cleanup(); } +/** + * platform_recover - recover the platform from a failure to suspend + * devices. + */ + +static void platform_recover(int platform_mode) +{ + if (platform_mode && hibernation_ops && hibernation_ops->recover) + hibernation_ops->recover(); +} + /** * create_image - freeze devices that need to be frozen with interrupts * off, create the hibernation image and thaw those devices. Control @@ -258,10 +269,10 @@ int hibernation_snapshot(int platform_mode) suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) - goto Resume_console; + goto Recover_platform; if (hibernation_test(TEST_DEVICES)) - goto Resume_devices; + goto Recover_platform; error = platform_pre_snapshot(platform_mode); if (error || hibernation_test(TEST_PLATFORM)) @@ -285,11 +296,14 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - Resume_console: resume_console(); Close: platform_end(platform_mode); return error; + + Recover_platform: + platform_recover(platform_mode); + goto Resume_devices; } /** @@ -398,8 +412,11 @@ int hibernation_platform_enter(void) suspend_console(); error = device_suspend(PMSG_HIBERNATE); - if (error) - goto Resume_console; + if (error) { + if (hibernation_ops->recover) + hibernation_ops->recover(); + goto Resume_devices; + } error = hibernation_ops->prepare(); if (error) @@ -428,7 +445,6 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); - Resume_console: resume_console(); Close: hibernation_ops->end(); diff --git a/kernel/power/main.c b/kernel/power/main.c index d023b6b584e..3398f4651aa 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -269,11 +269,11 @@ int suspend_devices_and_enter(suspend_state_t state) error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Resume_console; + goto Recover_platform; } if (suspend_test(TEST_DEVICES)) - goto Resume_devices; + goto Recover_platform; if (suspend_ops->prepare) { error = suspend_ops->prepare(); @@ -294,12 +294,16 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_ops->finish(); Resume_devices: device_resume(PMSG_RESUME); - Resume_console: resume_console(); Close: if (suspend_ops->end) suspend_ops->end(); return error; + + Recover_platform: + if (suspend_ops->recover) + suspend_ops->recover(); + goto Resume_devices; } /** -- cgit v1.2.3-70-g09d2 From 0775b3dbcb6d17b531b36df520ddab735647f3f7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:08 -0700 Subject: suspend, xen: enable PM_SLEEP for CONFIG_XEN Xen save/restore requires PM_SLEEP to be set without requiring SUSPEND or HIBERNATION. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/power') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index b45da40e8d2..1436c47cb39 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -82,7 +82,7 @@ config PM_SLEEP_SMP config PM_SLEEP bool - depends on SUSPEND || HIBERNATION + depends on SUSPEND || HIBERNATION || XEN default y config SUSPEND -- cgit v1.2.3-70-g09d2 From 6717ef1aa750b54ddb9d8854b91707ee21f0ad23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 22:17:01 +0200 Subject: Revert "suspend, xen: enable PM_SLEEP for CONFIG_XEN" This reverts commit 6fbbec428c8e7bb617da2e8a589af2e97bcf3bc4. Rafael doesnt like it - it breaks various assumptions. Signed-off-by: Ingo Molnar --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/power') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 1436c47cb39..b45da40e8d2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -82,7 +82,7 @@ config PM_SLEEP_SMP config PM_SLEEP bool - depends on SUSPEND || HIBERNATION || XEN + depends on SUSPEND || HIBERNATION default y config SUSPEND -- cgit v1.2.3-70-g09d2 From ebb12db51f6c13b30752fcf506baad4c617b153c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 11 Jun 2008 22:04:29 +0200 Subject: Freezer: Introduce PF_FREEZER_NOSIG The freezer currently attempts to distinguish kernel threads from user space tasks by checking if their mm pointer is unset and it does not send fake signals to kernel threads. However, there are kernel threads, mostly related to networking, that behave like user space tasks and may want to be sent a fake signal to be frozen. Introduce the new process flag PF_FREEZER_NOSIG that will be set by default for all kernel threads and make the freezer only send fake signals to the tasks having PF_FREEZER_NOSIG unset. Provide the set_freezable_with_signal() function to be called by the kernel threads that want to be sent a fake signal for freezing. This patch should not change the freezer's observable behavior. Signed-off-by: Rafael J. Wysocki Signed-off-by: Andi Kleen Acked-by: Pavel Machek Signed-off-by: Len Brown --- include/linux/freezer.h | 10 +++++ include/linux/sched.h | 1 + kernel/kthread.c | 2 +- kernel/power/process.c | 97 +++++++++++++++++++++---------------------------- 4 files changed, 54 insertions(+), 56 deletions(-) (limited to 'kernel/power') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 08934995c7a..deddeedf325 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -127,6 +127,15 @@ static inline void set_freezable(void) current->flags &= ~PF_NOFREEZE; } +/* + * Tell the freezer that the current task should be frozen by it and that it + * should send a fake signal to the task to freeze it. + */ +static inline void set_freezable_with_signal(void) +{ + current->flags &= ~(PF_NOFREEZE | PF_FREEZER_NOSIG); +} + /* * Freezer-friendly wrappers around wait_event_interruptible() and * wait_event_interruptible_timeout(), originally defined in @@ -174,6 +183,7 @@ static inline void freezer_do_not_count(void) {} static inline void freezer_count(void) {} static inline int freezer_should_skip(struct task_struct *p) { return 0; } static inline void set_freezable(void) {} +static inline void set_freezable_with_signal(void) {} #define wait_event_freezable(wq, condition) \ wait_event_interruptible(wq, condition) diff --git a/include/linux/sched.h b/include/linux/sched.h index 21349173d14..ba2f859c6e4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1494,6 +1494,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ +#define PF_FREEZER_NOSIG 0x80000000 /* Freezer won't send signals to it */ /* * Only the _current_ task can read/write to tsk->flags, but other diff --git a/kernel/kthread.c b/kernel/kthread.c index 97747cdd37c..ac3fb732664 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -235,7 +235,7 @@ int kthreadd(void *unused) set_user_nice(tsk, KTHREAD_NICE_LEVEL); set_cpus_allowed(tsk, CPU_MASK_ALL); - current->flags |= PF_NOFREEZE; + current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; for (;;) { set_current_state(TASK_INTERRUPTIBLE); diff --git a/kernel/power/process.c b/kernel/power/process.c index f1d0b345c9b..5fb87652f21 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -19,9 +19,6 @@ */ #define TIMEOUT (20 * HZ) -#define FREEZER_KERNEL_THREADS 0 -#define FREEZER_USER_SPACE 1 - static inline int freezeable(struct task_struct * p) { if ((p == current) || @@ -84,63 +81,53 @@ static void fake_signal_wake_up(struct task_struct *p) spin_unlock_irqrestore(&p->sighand->siglock, flags); } -static int has_mm(struct task_struct *p) +static inline bool should_send_signal(struct task_struct *p) { - return (p->mm && !(p->flags & PF_BORROWED_MM)); + return !(p->flags & PF_FREEZER_NOSIG); } /** * freeze_task - send a freeze request to given task * @p: task to send the request to - * @with_mm_only: if set, the request will only be sent if the task has its - * own mm - * Return value: 0, if @with_mm_only is set and the task has no mm of its - * own or the task is frozen, 1, otherwise + * @sig_only: if set, the request will only be sent if the task has the + * PF_FREEZER_NOSIG flag unset + * Return value: 'false', if @sig_only is set and the task has + * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise * - * The freeze request is sent by seting the tasks's TIF_FREEZE flag and + * The freeze request is sent by setting the tasks's TIF_FREEZE flag and * either sending a fake signal to it or waking it up, depending on whether - * or not it has its own mm (ie. it is a user land task). If @with_mm_only - * is set and the task has no mm of its own (ie. it is a kernel thread), - * its TIF_FREEZE flag should not be set. - * - * The task_lock() is necessary to prevent races with exit_mm() or - * use_mm()/unuse_mm() from occuring. + * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task + * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its + * TIF_FREEZE flag will not be set. */ -static int freeze_task(struct task_struct *p, int with_mm_only) +static bool freeze_task(struct task_struct *p, bool sig_only) { - int ret = 1; + /* + * We first check if the task is freezing and next if it has already + * been frozen to avoid the race with frozen_process() which first marks + * the task as frozen and next clears its TIF_FREEZE. + */ + if (!freezing(p)) { + rmb(); + if (frozen(p)) + return false; - task_lock(p); - if (freezing(p)) { - if (has_mm(p)) { - if (!signal_pending(p)) - fake_signal_wake_up(p); - } else { - if (with_mm_only) - ret = 0; - else - wake_up_state(p, TASK_INTERRUPTIBLE); - } + if (!sig_only || should_send_signal(p)) + set_freeze_flag(p); + else + return false; + } + + if (should_send_signal(p)) { + if (!signal_pending(p)) + fake_signal_wake_up(p); + } else if (sig_only) { + return false; } else { - rmb(); - if (frozen(p)) { - ret = 0; - } else { - if (has_mm(p)) { - set_freeze_flag(p); - fake_signal_wake_up(p); - } else { - if (with_mm_only) { - ret = 0; - } else { - set_freeze_flag(p); - wake_up_state(p, TASK_INTERRUPTIBLE); - } - } - } + wake_up_state(p, TASK_INTERRUPTIBLE); } - task_unlock(p); - return ret; + + return true; } static void cancel_freezing(struct task_struct *p) @@ -156,7 +143,7 @@ static void cancel_freezing(struct task_struct *p) } } -static int try_to_freeze_tasks(int freeze_user_space) +static int try_to_freeze_tasks(bool sig_only) { struct task_struct *g, *p; unsigned long end_time; @@ -175,7 +162,7 @@ static int try_to_freeze_tasks(int freeze_user_space) if (frozen(p) || !freezeable(p)) continue; - if (!freeze_task(p, freeze_user_space)) + if (!freeze_task(p, sig_only)) continue; /* @@ -235,13 +222,13 @@ int freeze_processes(void) int error; printk("Freezing user space processes ... "); - error = try_to_freeze_tasks(FREEZER_USER_SPACE); + error = try_to_freeze_tasks(true); if (error) goto Exit; printk("done.\n"); printk("Freezing remaining freezable tasks ... "); - error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); + error = try_to_freeze_tasks(false); if (error) goto Exit; printk("done."); @@ -251,7 +238,7 @@ int freeze_processes(void) return error; } -static void thaw_tasks(int thaw_user_space) +static void thaw_tasks(bool nosig_only) { struct task_struct *g, *p; @@ -260,7 +247,7 @@ static void thaw_tasks(int thaw_user_space) if (!freezeable(p)) continue; - if (!p->mm == thaw_user_space) + if (nosig_only && should_send_signal(p)) continue; thaw_process(p); @@ -271,8 +258,8 @@ static void thaw_tasks(int thaw_user_space) void thaw_processes(void) { printk("Restarting tasks ... "); - thaw_tasks(FREEZER_KERNEL_THREADS); - thaw_tasks(FREEZER_USER_SPACE); + thaw_tasks(true); + thaw_tasks(false); schedule(); printk("done.\n"); } -- cgit v1.2.3-70-g09d2 From 52d11025dba32bed696eaee1822b26529e764770 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 11 Jun 2008 22:07:52 +0200 Subject: snapshot: Push BKL down into ioctl handlers Push BKL down into ioctl handlers - snapshot device. Signed-off-by: Alan Cox Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown Signed-off-by: Andi Kleen --- kernel/power/user.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel/power') diff --git a/kernel/power/user.c b/kernel/power/user.c index f5512cb3aa8..658262b1599 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -164,8 +165,8 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, return res; } -static int snapshot_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static long snapshot_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) { int error = 0; struct snapshot_data *data; @@ -181,6 +182,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, data = filp->private_data; + lock_kernel(); + switch (cmd) { case SNAPSHOT_FREEZE: @@ -389,7 +392,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -ENOTTY; } - + unlock_kernel(); return error; } @@ -399,7 +402,7 @@ static const struct file_operations snapshot_fops = { .read = snapshot_read, .write = snapshot_write, .llseek = no_llseek, - .ioctl = snapshot_ioctl, + .unlocked_ioctl = snapshot_ioctl, }; static struct miscdevice snapshot_device = { -- cgit v1.2.3-70-g09d2 From 25f2f3daadaf0768a61d02ee3ed3d9a21e9dc46c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 11 Jun 2008 22:09:45 +0200 Subject: snapshot: Use pm_mutex for mutual exclusion We can avoid taking the BKL in snapshot_ioctl() if pm_mutex is used to prevent the ioctls from being executed concurrently. In addition, although it is only possible to open /dev/snapshot once, the task which has done that may spawn a child that will inherit the open descriptor, so in theory they can call snapshot_write(), snapshot_read() and snapshot_release() concurrently. pm_mutex can also be used for mutual exclusion in such cases. Signed-off-by: Rafael J. Wysocki Signed-off-by: Andi Kleen Acked-by: Pavel Machek Signed-off-by: Len Brown --- kernel/power/user.c | 68 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 26 deletions(-) (limited to 'kernel/power') diff --git a/kernel/power/user.c b/kernel/power/user.c index 658262b1599..a6332a31326 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -70,16 +70,22 @@ static int snapshot_open(struct inode *inode, struct file *filp) struct snapshot_data *data; int error; - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) - return -EBUSY; + mutex_lock(&pm_mutex); + + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } if ((filp->f_flags & O_ACCMODE) == O_RDWR) { atomic_inc(&snapshot_device_available); - return -ENOSYS; + error = -ENOSYS; + goto Unlock; } if(create_basic_memory_bitmaps()) { atomic_inc(&snapshot_device_available); - return -ENOMEM; + error = -ENOMEM; + goto Unlock; } nonseekable_open(inode, filp); data = &snapshot_state; @@ -99,33 +105,36 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (error) pm_notifier_call_chain(PM_POST_HIBERNATION); } - if (error) { + if (error) atomic_inc(&snapshot_device_available); - return error; - } data->frozen = 0; data->ready = 0; data->platform_support = 0; - return 0; + Unlock: + mutex_unlock(&pm_mutex); + + return error; } static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; + mutex_lock(&pm_mutex); + swsusp_free(); free_basic_memory_bitmaps(); data = filp->private_data; free_all_swap_pages(data->swap); - if (data->frozen) { - mutex_lock(&pm_mutex); + if (data->frozen) thaw_processes(); - mutex_unlock(&pm_mutex); - } pm_notifier_call_chain(data->mode == O_WRONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); atomic_inc(&snapshot_device_available); + + mutex_unlock(&pm_mutex); + return 0; } @@ -135,9 +144,13 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, struct snapshot_data *data; ssize_t res; + mutex_lock(&pm_mutex); + data = filp->private_data; - if (!data->ready) - return -ENODATA; + if (!data->ready) { + res = -ENODATA; + goto Unlock; + } res = snapshot_read_next(&data->handle, count); if (res > 0) { if (copy_to_user(buf, data_of(data->handle), res)) @@ -145,6 +158,10 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, else *offp = data->handle.offset; } + + Unlock: + mutex_unlock(&pm_mutex); + return res; } @@ -154,6 +171,8 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, struct snapshot_data *data; ssize_t res; + mutex_lock(&pm_mutex); + data = filp->private_data; res = snapshot_write_next(&data->handle, count); if (res > 0) { @@ -162,6 +181,9 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, else *offp = data->handle.offset; } + + mutex_unlock(&pm_mutex); + return res; } @@ -180,16 +202,16 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - data = filp->private_data; + if (!mutex_trylock(&pm_mutex)) + return -EBUSY; - lock_kernel(); + data = filp->private_data; switch (cmd) { case SNAPSHOT_FREEZE: if (data->frozen) break; - mutex_lock(&pm_mutex); printk("Syncing filesystems ... "); sys_sync(); printk("done.\n"); @@ -197,7 +219,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = freeze_processes(); if (error) thaw_processes(); - mutex_unlock(&pm_mutex); if (!error) data->frozen = 1; break; @@ -205,9 +226,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; - mutex_lock(&pm_mutex); thaw_processes(); - mutex_unlock(&pm_mutex); data->frozen = 0; break; @@ -310,16 +329,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = -EPERM; break; } - if (!mutex_trylock(&pm_mutex)) { - error = -EBUSY; - break; - } /* * Tasks are frozen and the notifiers have been called with * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); - mutex_unlock(&pm_mutex); break; case SNAPSHOT_PLATFORM_SUPPORT: @@ -392,7 +406,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = -ENOTTY; } - unlock_kernel(); + + mutex_unlock(&pm_mutex); + return error; } -- cgit v1.2.3-70-g09d2 From 93a0886e2368eafb9df5e2021fb185195cee88b2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 15 Jul 2008 13:43:42 -0700 Subject: x86, xen, power: fix up config dependencies on PM Xen save/restore needs bits of code enabled by PM_SLEEP, and PM_SLEEP depends on PM. So make XEN_SAVE_RESTORE depend on PM and PM_SLEEP depend on XEN_SAVE_RESTORE. Signed-off-by: Jeremy Fitzhardinge Acked-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 5 +++++ kernel/power/Kconfig | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/power') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 20b49729bed..3815e425f47 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -23,3 +23,8 @@ config XEN_MAX_DOMAIN_MEMORY according to the maximum possible memory size of a Xen domain. This array uses 1 page per gigabyte, so there's no need to be too stingy here. + +config XEN_SAVE_RESTORE + bool + depends on PM + default y \ No newline at end of file diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index b45da40e8d2..59dfdf1e1d2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -82,7 +82,7 @@ config PM_SLEEP_SMP config PM_SLEEP bool - depends on SUSPEND || HIBERNATION + depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE default y config SUSPEND -- cgit v1.2.3-70-g09d2 From 77437fd4e61f87cc94d9314baa5cbf50e3ccdf54 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 23 Jul 2008 21:28:33 -0700 Subject: pm: boot time suspend selftest Boot-time test for system suspend states (STR or standby). The generic RTC framework triggers wakeup alarms, which are used to exit those states. - Measures some aspects of suspend time ... this uses "jiffies" until someone converts it to use a timebase that works properly even while timer IRQs are disabled. - Triggered by a command line parameter. By default nothing even vaguely troublesome will happen, but "test_suspend=mem" will give you a brief STR test during system boot. (Or you may need to use "test_suspend=standby" instead, if your hardware needs that.) This isn't without problems. It fires early enough during boot that for example both PCMCIA and MMC stacks have misbehaved. The workaround in those cases was to boot without such media cards inserted. [matthltc@us.ibm.com: fix compile failure in boot time suspend selftest] Signed-off-by: David Brownell Cc: Ingo Molnar Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 9 +- kernel/power/Kconfig | 11 ++ kernel/power/main.c | 194 +++++++++++++++++++++++++++++++++++- 3 files changed, 212 insertions(+), 2 deletions(-) (limited to 'kernel/power') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 01a2992b575..4d705713cab 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -87,7 +87,8 @@ parameter is applicable: SH SuperH architecture is enabled. SMP The kernel is an SMP kernel. SPARC Sparc architecture is enabled. - SWSUSP Software suspend is enabled. + SWSUSP Software suspend (hibernation) is enabled. + SUSPEND System suspend states are enabled. TS Appropriate touchscreen support is enabled. USB USB support is enabled. USBHID USB Human Interface Device support is enabled. @@ -2123,6 +2124,12 @@ and is between 256 and 4096 characters. It is defined in the file tdfx= [HW,DRM] + test_suspend= [SUSPEND] + Specify "mem" (for Suspend-to-RAM) or "standby" (for + standby suspend) as the system sleep state to briefly + enter during system startup. The system is woken from + this state using a wakeup-capable RTC alarm. + thash_entries= [KNL,NET] Set number of hash buckets for TCP connection diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 59dfdf1e1d2..dcd165f92a8 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -94,6 +94,17 @@ config SUSPEND powered and thus its contents are preserved, such as the suspend-to-RAM state (e.g. the ACPI S3 state). +config PM_TEST_SUSPEND + bool "Test suspend/resume and wakealarm during bootup" + depends on SUSPEND && PM_DEBUG && RTC_LIB=y + ---help--- + This option will let you suspend your machine during bootup, and + make it wake up a few seconds later using an RTC wakeup alarm. + Enable this with a kernel parameter like "test_suspend=mem". + + You probably want to have your system's RTC driver statically + linked, ensuring that it's available when this test runs. + config SUSPEND_FREEZER bool "Enable freezer for suspend to RAM/standby" \ if ARCH_WANTS_FREEZER_CONTROL || BROKEN diff --git a/kernel/power/main.c b/kernel/power/main.c index 3398f4651aa..95bff23ecda 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -132,6 +132,61 @@ static inline int suspend_test(int level) { return 0; } #ifdef CONFIG_SUSPEND +#ifdef CONFIG_PM_TEST_SUSPEND + +/* + * We test the system suspend code by setting an RTC wakealarm a short + * time in the future, then suspending. Suspending the devices won't + * normally take long ... some systems only need a few milliseconds. + * + * The time it takes is system-specific though, so when we test this + * during system bootup we allow a LOT of time. + */ +#define TEST_SUSPEND_SECONDS 5 + +static unsigned long suspend_test_start_time; + +static void suspend_test_start(void) +{ + /* FIXME Use better timebase than "jiffies", ideally a clocksource. + * What we want is a hardware counter that will work correctly even + * during the irqs-are-off stages of the suspend/resume cycle... + */ + suspend_test_start_time = jiffies; +} + +static void suspend_test_finish(const char *label) +{ + long nj = jiffies - suspend_test_start_time; + unsigned msec; + + msec = jiffies_to_msecs(abs(nj)); + pr_info("PM: %s took %d.%03d seconds\n", label, + msec / 1000, msec % 1000); + + /* Warning on suspend means the RTC alarm period needs to be + * larger -- the system was sooo slooowwww to suspend that the + * alarm (should have) fired before the system went to sleep! + * + * Warning on either suspend or resume also means the system + * has some performance issues. The stack dump of a WARN_ON + * is more likely to get the right attention than a printk... + */ + WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000)); +} + +#else + +static void suspend_test_start(void) +{ +} + +static void suspend_test_finish(const char *label) +{ +} + +#endif + /* This is just an arbitrary number */ #define FREE_PAGE_NUMBER (100) @@ -266,12 +321,13 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + suspend_test_start(); error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to suspend\n"); goto Recover_platform; } - + suspend_test_finish("suspend devices"); if (suspend_test(TEST_DEVICES)) goto Recover_platform; @@ -293,7 +349,9 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); Resume_devices: + suspend_test_start(); device_resume(PMSG_RESUME); + suspend_test_finish("resume devices"); resume_console(); Close: if (suspend_ops->end) @@ -521,3 +579,137 @@ static int __init pm_init(void) } core_initcall(pm_init); + + +#ifdef CONFIG_PM_TEST_SUSPEND + +#include + +/* + * To test system suspend, we need a hands-off mechanism to resume the + * system. RTCs wake alarms are a common self-contained mechanism. + */ + +static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) +{ + static char err_readtime[] __initdata = + KERN_ERR "PM: can't read %s time, err %d\n"; + static char err_wakealarm [] __initdata = + KERN_ERR "PM: can't set %s wakealarm, err %d\n"; + static char err_suspend[] __initdata = + KERN_ERR "PM: suspend test failed, error %d\n"; + static char info_test[] __initdata = + KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; + + unsigned long now; + struct rtc_wkalrm alm; + int status; + + /* this may fail if the RTC hasn't been initialized */ + status = rtc_read_time(rtc, &alm.time); + if (status < 0) { + printk(err_readtime, rtc->dev.bus_id, status); + return; + } + rtc_tm_to_time(&alm.time, &now); + + memset(&alm, 0, sizeof alm); + rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); + alm.enabled = true; + + status = rtc_set_alarm(rtc, &alm); + if (status < 0) { + printk(err_wakealarm, rtc->dev.bus_id, status); + return; + } + + if (state == PM_SUSPEND_MEM) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + if (status == -ENODEV) + state = PM_SUSPEND_STANDBY; + } + if (state == PM_SUSPEND_STANDBY) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + } + if (status < 0) + printk(err_suspend, status); +} + +static int __init has_wakealarm(struct device *dev, void *name_ptr) +{ + struct rtc_device *candidate = to_rtc_device(dev); + + if (!candidate->ops->set_alarm) + return 0; + if (!device_may_wakeup(candidate->dev.parent)) + return 0; + + *(char **)name_ptr = dev->bus_id; + return 1; +} + +/* + * Kernel options like "test_suspend=mem" force suspend/resume sanity tests + * at startup time. They're normally disabled, for faster boot and because + * we can't know which states really work on this particular system. + */ +static suspend_state_t test_state __initdata = PM_SUSPEND_ON; + +static char warn_bad_state[] __initdata = + KERN_WARNING "PM: can't test '%s' suspend state\n"; + +static int __init setup_test_suspend(char *value) +{ + unsigned i; + + /* "=mem" ==> "mem" */ + value++; + for (i = 0; i < PM_SUSPEND_MAX; i++) { + if (!pm_states[i]) + continue; + if (strcmp(pm_states[i], value) != 0) + continue; + test_state = (__force suspend_state_t) i; + return 0; + } + printk(warn_bad_state, value); + return 0; +} +__setup("test_suspend", setup_test_suspend); + +static int __init test_suspend(void) +{ + static char warn_no_rtc[] __initdata = + KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; + + char *pony = NULL; + struct rtc_device *rtc = NULL; + + /* PM is initialized by now; is that state testable? */ + if (test_state == PM_SUSPEND_ON) + goto done; + if (!valid_state(test_state)) { + printk(warn_bad_state, pm_states[test_state]); + goto done; + } + + /* RTCs have initialized by now too ... can we use one? */ + class_find_device(rtc_class, NULL, &pony, has_wakealarm); + if (pony) + rtc = rtc_class_open(pony); + if (!rtc) { + printk(warn_no_rtc); + goto done; + } + + /* go for it */ + test_wakealarm(rtc, test_state); + rtc_class_close(rtc); +done: + return 0; +} +late_initcall(test_suspend); + +#endif /* CONFIG_PM_TEST_SUSPEND */ -- cgit v1.2.3-70-g09d2 From 0d83304c7e7bd3b05be90281b3a47841bc8f057a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 23 Jul 2008 21:28:38 -0700 Subject: pm: hibernation: simplify memory bitmap This patch simplifies the memory bitmap manipulations. - remove the member size in struct bm_block It is not necessary for struct bm_block to have the number of bit chunks that can be calculated by using end_pfn and start_pfn. - use find_next_bit() for memory_bm_next_pfn No need to invent the bitmap library only for the memory bitmap. Signed-off-by: Akinobu Mita Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 88 ++++++++++++------------------------------------- 1 file changed, 21 insertions(+), 67 deletions(-) (limited to 'kernel/power') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5f91a07c4ea..5d2ab836e99 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -205,8 +205,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * objects. The main list's elements are of type struct zone_bitmap * and each of them corresonds to one zone. For each zone bitmap * object there is a list of objects of type struct bm_block that - * represent each blocks of bit chunks in which information is - * stored. + * represent each blocks of bitmap in which information is stored. * * struct memory_bitmap contains a pointer to the main list of zone * bitmap objects, a struct bm_position used for browsing the bitmap, @@ -224,26 +223,27 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * pfns that correspond to the start and end of the represented zone. * * struct bm_block contains a pointer to the memory page in which - * information is stored (in the form of a block of bit chunks - * of type unsigned long each). It also contains the pfns that - * correspond to the start and end of the represented memory area and - * the number of bit chunks in the block. + * information is stored (in the form of a block of bitmap) + * It also contains the pfns that correspond to the start and end of + * the represented memory area. */ #define BM_END_OF_MAP (~0UL) -#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long)) -#define BM_BITS_PER_CHUNK (sizeof(long) << 3) #define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) struct bm_block { struct bm_block *next; /* next element of the list */ unsigned long start_pfn; /* pfn represented by the first bit */ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ - unsigned int size; /* number of bit chunks */ - unsigned long *data; /* chunks of bits representing pages */ + unsigned long *data; /* bitmap representing pages */ }; +static inline unsigned long bm_block_bits(struct bm_block *bb) +{ + return bb->end_pfn - bb->start_pfn; +} + struct zone_bitmap { struct zone_bitmap *next; /* next element of the list */ unsigned long start_pfn; /* minimal pfn in this zone */ @@ -257,7 +257,6 @@ struct zone_bitmap { struct bm_position { struct zone_bitmap *zone_bm; struct bm_block *block; - int chunk; int bit; }; @@ -272,12 +271,6 @@ struct memory_bitmap { /* Functions that operate on memory bitmaps */ -static inline void memory_bm_reset_chunk(struct memory_bitmap *bm) -{ - bm->cur.chunk = 0; - bm->cur.bit = -1; -} - static void memory_bm_position_reset(struct memory_bitmap *bm) { struct zone_bitmap *zone_bm; @@ -285,7 +278,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm) zone_bm = bm->zone_bm_list; bm->cur.zone_bm = zone_bm; bm->cur.block = zone_bm->bm_blocks; - memory_bm_reset_chunk(bm); + bm->cur.bit = 0; } static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); @@ -394,12 +387,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) bb->start_pfn = pfn; if (nr >= BM_BITS_PER_BLOCK) { pfn += BM_BITS_PER_BLOCK; - bb->size = BM_CHUNKS_PER_BLOCK; nr -= BM_BITS_PER_BLOCK; } else { /* This is executed only once in the loop */ pfn += nr; - bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK); } bb->end_pfn = pfn; bb = bb->next; @@ -478,8 +469,8 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, } zone_bm->cur_block = bb; pfn -= bb->start_pfn; - *bit_nr = pfn % BM_BITS_PER_CHUNK; - *addr = bb->data + pfn / BM_BITS_PER_CHUNK; + *bit_nr = pfn; + *addr = bb->data; return 0; } @@ -528,36 +519,6 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) return test_bit(bit, addr); } -/* Two auxiliary functions for memory_bm_next_pfn */ - -/* Find the first set bit in the given chunk, if there is one */ - -static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p) -{ - bit++; - while (bit < BM_BITS_PER_CHUNK) { - if (test_bit(bit, chunk_p)) - return bit; - - bit++; - } - return -1; -} - -/* Find a chunk containing some bits set in given block of bits */ - -static inline int next_chunk_in_block(int n, struct bm_block *bb) -{ - n++; - while (n < bb->size) { - if (bb->data[n]) - return n; - - n++; - } - return -1; -} - /** * memory_bm_next_pfn - find the pfn that corresponds to the next set bit * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is @@ -571,40 +532,33 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { struct zone_bitmap *zone_bm; struct bm_block *bb; - int chunk; int bit; do { bb = bm->cur.block; do { - chunk = bm->cur.chunk; bit = bm->cur.bit; - do { - bit = next_bit_in_chunk(bit, bb->data + chunk); - if (bit >= 0) - goto Return_pfn; - - chunk = next_chunk_in_block(chunk, bb); - bit = -1; - } while (chunk >= 0); + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + bb = bb->next; bm->cur.block = bb; - memory_bm_reset_chunk(bm); + bm->cur.bit = 0; } while (bb); zone_bm = bm->cur.zone_bm->next; if (zone_bm) { bm->cur.zone_bm = zone_bm; bm->cur.block = zone_bm->bm_blocks; - memory_bm_reset_chunk(bm); + bm->cur.bit = 0; } } while (zone_bm); memory_bm_position_reset(bm); return BM_END_OF_MAP; Return_pfn: - bm->cur.chunk = chunk; - bm->cur.bit = bit; - return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; + bm->cur.bit = bit + 1; + return bb->start_pfn + bit; } /** -- cgit v1.2.3-70-g09d2 From 2f15fc4bdf91eb399da3f47a09c55831d9f22826 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 23 Jul 2008 21:28:40 -0700 Subject: pm: schedule sysrq poweroff on boot cpu schedule sysrq poweroff on boot cpu. sysrq poweroff needs to disable nonboot cpus, and we need to run this on boot cpu to avoid any recursion. http://bugzilla.kernel.org/show_bug.cgi?id=10897 [kosaki.motohiro@jp.fujitsu.com: build fix] Signed-off-by: Zhang Rui Tested-by: Rus Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/poweroff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/power') diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 678ec736076..72016f05147 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -10,6 +10,7 @@ #include #include #include +#include /* * When the user hits Sys-Rq o to power down the machine this is the @@ -25,7 +26,8 @@ static DECLARE_WORK(poweroff_work, do_poweroff); static void handle_poweroff(int key, struct tty_struct *tty) { - schedule_work(&poweroff_work); + /* run sysrq poweroff on boot cpu */ + schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); } static struct sysrq_key_op sysrq_poweroff_op = { -- cgit v1.2.3-70-g09d2 From f0af566da6e9a4a2f5a83c5a70f3d0a772050e21 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 23 Jul 2008 21:28:44 -0700 Subject: pm: fix try_to_freeze_tasks()'s use of do_div() Fix try_to_freeze_tasks()'s use of do_div() on an s64 by making elapsed_csecs64 a u64 instead and dividing that. Possibly this should be guarded lest the interval calculation turn up negative, but the possible negativity of the result of the division is cast away anyway. This was introduced by patch 438e2ce68dfd4af4cfcec2f873564fb921db4bb5. Signed-off-by: David Howells Acked-by: "Rafael J. Wysocki" Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/power') diff --git a/kernel/power/process.c b/kernel/power/process.c index 5fb87652f21..278946aecaf 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -149,7 +149,7 @@ static int try_to_freeze_tasks(bool sig_only) unsigned long end_time; unsigned int todo; struct timeval start, end; - s64 elapsed_csecs64; + u64 elapsed_csecs64; unsigned int elapsed_csecs; do_gettimeofday(&start); -- cgit v1.2.3-70-g09d2 From a2e2e3577c3ef2b5dbb866e97e612aae4adfa32f Mon Sep 17 00:00:00 2001 From: David Brownell Date: Fri, 25 Jul 2008 19:44:38 -0700 Subject: pm selftest: rtc paranoia Cope with a quirk of some RTCs (notably ACPI ones) which aren't guaranteed to implement oneshot behavior when they woke the system from sleeep: forcibly disable the alarm, just in case. Signed-off-by: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/power') diff --git a/kernel/power/main.c b/kernel/power/main.c index 95bff23ecda..0b7476f5d2a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -635,6 +635,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) } if (status < 0) printk(err_suspend, status); + + /* Some platforms can't detect that the alarm triggered the + * wakeup, or (accordingly) disable it after it afterwards. + * It's supposed to give oneshot behavior; cope. + */ + alm.enabled = false; + rtc_set_alarm(rtc, &alm); } static int __init has_wakealarm(struct device *dev, void *name_ptr) -- cgit v1.2.3-70-g09d2 From 89081d17f7bb81d89fa1aa9b70f821c5cf4d39e9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 25 Jul 2008 19:45:10 -0700 Subject: kexec jump: save/restore device state This patch implements devices state save/restore before after kexec. This patch together with features in kexec_jump patch can be used for following: - A simple hibernation implementation without ACPI support. You can kexec a hibernating kernel, save the memory image of original system and shutdown the system. When resuming, you restore the memory image of original system via ordinary kexec load then jump back. - Kernel/system debug through making system snapshot. You can make system snapshot, jump back, do some thing and make another system snapshot. - Cooperative multi-kernel/system. With kexec jump, you can switch between several kernels/systems quickly without boot process except the first time. This appears like swap a whole kernel/system out/in. - A general method to call program in physical mode (paging turning off). This can be used to invoke BIOS code under Linux. The following user-space tools can be used with kexec jump: - kexec-tools needs to be patched to support kexec jump. The patches and the precompiled kexec can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10 - makedumpfile with patches are used as memory image saving tool, it can exclude free pages from original kernel memory image file. The patches and the precompiled makedumpfile can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-src_cvs_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-patches_cvs_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile_cvs_kh10 - An initramfs image can be used as the root file system of kexeced kernel. An initramfs image built with "BuildRoot" can be downloaded from the following URL: initramfs image: http://khibernation.sourceforge.net/download/release_v10/initramfs/rootfs_cvs_kh10.gz All user space tools above are included in the initramfs image. Usage example of simple hibernation: 1. Compile and install patched kernel with following options selected: CONFIG_X86_32=y CONFIG_RELOCATABLE=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y CONFIG_PM=y CONFIG_HIBERNATION=y CONFIG_KEXEC_JUMP=y 2. Build an initramfs image contains kexec-tool and makedumpfile, or download the pre-built initramfs image, called rootfs.gz in following text. 3. Prepare a partition to save memory image of original kernel, called hibernating partition in following text. 4. Boot kernel compiled in step 1 (kernel A). 5. In the kernel A, load kernel compiled in step 1 (kernel B) with /sbin/kexec. The shell command line can be as follow: /sbin/kexec --load-preserve-context /boot/bzImage --mem-min=0x100000 --mem-max=0xffffff --initrd=rootfs.gz 6. Boot the kernel B with following shell command line: /sbin/kexec -e 7. The kernel B will boot as normal kexec. In kernel B the memory image of kernel A can be saved into hibernating partition as follow: jump_back_entry=`cat /proc/cmdline | tr ' ' '\n' | grep kexec_jump_back_entry | cut -d '='` echo $jump_back_entry > kexec_jump_back_entry cp /proc/vmcore dump.elf Then you can shutdown the machine as normal. 8. Boot kernel compiled in step 1 (kernel C). Use the rootfs.gz as root file system. 9. In kernel C, load the memory image of kernel A as follow: /sbin/kexec -l --args-none --entry=`cat kexec_jump_back_entry` dump.elf 10. Jump back to the kernel A as follow: /sbin/kexec -e Then, kernel A is resumed. Implementation point: To support jumping between two kernels, before jumping to (executing) the new kernel and jumping back to the original kernel, the devices are put into quiescent state, and the state of devices and CPU is saved. After jumping back from kexeced kernel and jumping to the new kernel, the state of devices and CPU are restored accordingly. The devices/CPU state save/restore code of software suspend is called to implement corresponding function. Known issues: - Because the segment number supported by sys_kexec_load is limited, hibernation image with many segments may not be load. This is planned to be eliminated by adding a new flag to sys_kexec_load to make a image can be loaded with multiple sys_kexec_load invoking. Now, only the i386 architecture is supported. Signed-off-by: Huang Ying Acked-by: Vivek Goyal Cc: "Eric W. Biederman" Cc: Pavel Machek Cc: Nigel Cunningham Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 5 +++-- arch/x86/kernel/machine_kexec_32.c | 12 ++++++++++++ include/linux/suspend.h | 2 ++ kernel/kexec.c | 39 ++++++++++++++++++++++++++++++++++++++ kernel/power/power.h | 2 -- 5 files changed, 56 insertions(+), 4 deletions(-) (limited to 'kernel/power') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7ecb679f013..6b2debfabdd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1282,9 +1282,10 @@ config CRASH_DUMP config KEXEC_JUMP bool "kexec jump (EXPERIMENTAL)" depends on EXPERIMENTAL - depends on KEXEC && PM_SLEEP && X86_32 + depends on KEXEC && HIBERNATION && X86_32 help - Invoke code in physical address mode via KEXEC + Jump between original kernel and kexeced kernel and invoke + code in physical address mode via KEXEC config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 2b67609d0a1..9fe478d9840 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -125,6 +125,18 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need + * one. + */ + disable_IO_APIC(); +#endif + } + control_page = page_address(image->control_code_page); memcpy(control_page, relocate_kernel, PAGE_SIZE/2); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index e8e69159af7..c6343509597 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -278,4 +278,6 @@ static inline void register_nosave_region_late(unsigned long b, unsigned long e) } #endif +extern struct mutex pm_mutex; + #endif /* _LINUX_SUSPEND_H */ diff --git a/kernel/kexec.c b/kernel/kexec.c index a0d920915b3..c8a4370e2a3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -26,6 +26,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -1441,7 +1445,31 @@ int kernel_kexec(void) if (kexec_image->preserve_context) { #ifdef CONFIG_KEXEC_JUMP + mutex_lock(&pm_mutex); + pm_prepare_console(); + error = freeze_processes(); + if (error) { + error = -EBUSY; + goto Restore_console; + } + suspend_console(); + error = device_suspend(PMSG_FREEZE); + if (error) + goto Resume_console; + error = disable_nonboot_cpus(); + if (error) + goto Resume_devices; local_irq_disable(); + /* At this point, device_suspend() has been called, + * but *not* device_power_down(). We *must* + * device_power_down() now. Otherwise, drivers for + * some devices (e.g. interrupt controllers) become + * desynchronized with the actual state of the + * hardware at resume time, and evil weirdness ensues. + */ + error = device_power_down(PMSG_FREEZE); + if (error) + goto Enable_irqs; save_processor_state(); #endif } else { @@ -1459,7 +1487,18 @@ int kernel_kexec(void) if (kexec_image->preserve_context) { #ifdef CONFIG_KEXEC_JUMP restore_processor_state(); + device_power_up(PMSG_RESTORE); + Enable_irqs: local_irq_enable(); + enable_nonboot_cpus(); + Resume_devices: + device_resume(PMSG_RESTORE); + Resume_console: + resume_console(); + thaw_processes(); + Restore_console: + pm_restore_console(); + mutex_unlock(&pm_mutex); #endif } diff --git a/kernel/power/power.h b/kernel/power/power.h index 700f44ec840..acc0c101dbd 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -53,8 +53,6 @@ extern int hibernation_platform_enter(void); extern int pfn_is_nosave(unsigned long); -extern struct mutex pm_mutex; - #define power_attr(_name) \ static struct kobj_attribute _name##_attr = { \ .attr = { \ -- cgit v1.2.3-70-g09d2