1 files changed, 273 insertions, 103 deletions
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index 8c21d378f5d..51dbace3269 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -10,171 +10,303 @@
  *
  */
 
-#undef DEBUG
-
 #include <linux/kernel.h>
 #include <linux/smp.h>
 #include <linux/reboot.h>
 #include <linux/kexec.h>
-#include <linux/bootmem.h>
+#include <linux/export.h>
 #include <linux/crash_dump.h>
 #include <linux/delay.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-#include <linux/init.h>
+#include <linux/irq.h>
 #include <linux/types.h>
 
 #include <asm/processor.h>
 #include <asm/machdep.h>
+#include <asm/kexec.h>
 #include <asm/kdump.h>
-#include <asm/lmb.h>
-#include <asm/firmware.h>
+#include <asm/prom.h>
 #include <asm/smp.h>
+#include <asm/setjmp.h>
+#include <asm/debug.h>
 
-#ifdef DEBUG
-#include <asm/udbg.h>
-#define DBG(fmt...) udbg_printf(fmt)
-#else
-#define DBG(fmt...)
-#endif
+/*
+ * The primary CPU waits a while for all secondary CPUs to enter. This is to
+ * avoid sending an IPI if the secondary CPUs are entering
+ * crash_kexec_secondary on their own (eg via a system reset).
+ *
+ * The secondary timeout has to be longer than the primary. Both timeouts are
+ * in milliseconds.
+ */
+#define PRIMARY_TIMEOUT		500
+#define SECONDARY_TIMEOUT	1000
 
-/* This keeps a track of which one is crashing cpu. */
-int crashing_cpu = -1;
+#define IPI_TIMEOUT		10000
+#define REAL_MODE_TIMEOUT	10000
 
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
-							       size_t data_len)
-{
-	struct elf_note note;
-
-	note.n_namesz = strlen(name) + 1;
-	note.n_descsz = data_len;
-	note.n_type   = type;
-	memcpy(buf, &note, sizeof(note));
-	buf += (sizeof(note) +3)/4;
-	memcpy(buf, name, note.n_namesz);
-	buf += (note.n_namesz + 3)/4;
-	memcpy(buf, data, note.n_descsz);
-	buf += (note.n_descsz + 3)/4;
-
-	return buf;
-}
+/* This keeps a track of which one is the crashing cpu. */
+int crashing_cpu = -1;
+static int time_to_dump;
 
-static void final_note(u32 *buf)
-{
-	struct elf_note note;
+#define CRASH_HANDLER_MAX 3
+/* NULL terminated list of shutdown handles */
+static crash_shutdown_t crash_shutdown_handles[CRASH_HANDLER_MAX+1];
+static DEFINE_SPINLOCK(crash_handlers_lock);
 
-	note.n_namesz = 0;
-	note.n_descsz = 0;
-	note.n_type   = 0;
-	memcpy(buf, &note, sizeof(note));
-}
+static unsigned long crash_shutdown_buf[JMP_BUF_LEN];
+static int crash_shutdown_cpu = -1;
 
-static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
+static int handle_fault(struct pt_regs *regs)
 {
-	struct elf_prstatus prstatus;
-	u32 *buf;
-
-	if ((cpu < 0) || (cpu >= NR_CPUS))
-		return;
-
-	/* Using ELF notes here is opportunistic.
-	 * I need a well defined structure format
-	 * for the data I pass, and I need tags
-	 * on the data to indicate what information I have
-	 * squirrelled away.  ELF notes happen to provide
-	 * all of that that no need to invent something new.
-	 */
-	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
-	if (!buf) 
-		return;
-
-	memset(&prstatus, 0, sizeof(prstatus));
-	prstatus.pr_pid = current->pid;
-	elf_core_copy_regs(&prstatus.pr_reg, regs);
-	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
-			sizeof(prstatus));
-	final_note(buf);
+	if (crash_shutdown_cpu == smp_processor_id())
+		longjmp(crash_shutdown_buf, 1);
+	return 0;
 }
 
 #ifdef CONFIG_SMP
-static atomic_t waiting_for_crash_ipi;
 
+static atomic_t cpus_in_crash;
 void crash_ipi_callback(struct pt_regs *regs)
 {
-	int cpu = smp_processor_id();
+	static cpumask_t cpus_state_saved = CPU_MASK_NONE;
 
-	if (cpu == crashing_cpu)
-		return;
+	int cpu = smp_processor_id();
 
 	if (!cpu_online(cpu))
 		return;
 
+	hard_irq_disable();
+	if (!cpumask_test_cpu(cpu, &cpus_state_saved)) {
+		crash_save_cpu(regs, cpu);
+		cpumask_set_cpu(cpu, &cpus_state_saved);
+	}
+
+	atomic_inc(&cpus_in_crash);
+	smp_mb__after_atomic();
+
+	/*
+	 * Starting the kdump boot.
+	 * This barrier is needed to make sure that all CPUs are stopped.
+	 */
+	while (!time_to_dump)
+		cpu_relax();
+
 	if (ppc_md.kexec_cpu_down)
 		ppc_md.kexec_cpu_down(1, 1);
 
-	local_irq_disable();
-
-	crash_save_this_cpu(regs, cpu);
-	atomic_dec(&waiting_for_crash_ipi);
+#ifdef CONFIG_PPC64
 	kexec_smp_wait();
+#else
+	for (;;);	/* FIXME */
+#endif
+
 	/* NOTREACHED */
 }
 
-static void crash_kexec_prepare_cpus(void)
+static void crash_kexec_prepare_cpus(int cpu)
 {
 	unsigned int msecs;
+	unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */
+	int tries = 0;
+	int (*old_handler)(struct pt_regs *regs);
 
-	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+	printk(KERN_EMERG "Sending IPI to other CPUs\n");
 
 	crash_send_ipi(crash_ipi_callback);
 	smp_wmb();
 
+again:
 	/*
-	 * FIXME: Until we will have the way to stop other CPUSs reliabally,
+	 * FIXME: Until we will have the way to stop other CPUs reliably,
 	 * the crash CPU will send an IPI and wait for other CPUs to
-	 * respond. If not, proceed the kexec boot even though we failed to
-	 * capture other CPU states.
+	 * respond.
 	 */
-	msecs = 1000000;
-	while ((atomic_read(&waiting_for_crash_ipi) > 0) && (--msecs > 0)) {
-		barrier();
+	msecs = IPI_TIMEOUT;
+	while ((atomic_read(&cpus_in_crash) < ncpus) && (--msecs > 0))
 		mdelay(1);
-	}
 
 	/* Would it be better to replace the trap vector here? */
 
+	if (atomic_read(&cpus_in_crash) >= ncpus) {
+		printk(KERN_EMERG "IPI complete\n");
+		return;
+	}
+
+	printk(KERN_EMERG "ERROR: %d cpu(s) not responding\n",
+		ncpus - atomic_read(&cpus_in_crash));
+
 	/*
-	 * FIXME: In case if we do not get all CPUs, one possibility: ask the
-	 * user to do soft reset such that we get all.
-	 * IPI handler is already set by the panic cpu initially. Therefore,
-	 * all cpus could invoke this handler from die() and the panic CPU
-	 * will call machine_kexec() directly from this handler to do
-	 * kexec boot.
+	 * If we have a panic timeout set then we can't wait indefinitely
+	 * for someone to activate system reset. We also give up on the
+	 * second time through if system reset fail to work.
 	 */
-	if (atomic_read(&waiting_for_crash_ipi))
-		printk(KERN_ALERT "done waiting: %d cpus not responding\n",
-			atomic_read(&waiting_for_crash_ipi));
-	/* Leave the IPI callback set */
+	if ((panic_timeout > 0) || (tries > 0))
+		return;
+
+	/*
+	 * A system reset will cause all CPUs to take an 0x100 exception.
+	 * The primary CPU returns here via setjmp, and the secondary
+	 * CPUs reexecute the crash_kexec_secondary path.
+	 */
+	old_handler = __debugger;
+	__debugger = handle_fault;
+	crash_shutdown_cpu = smp_processor_id();
+
+	if (setjmp(crash_shutdown_buf) == 0) {
+		printk(KERN_EMERG "Activate system reset (dumprestart) "
+				  "to stop other cpu(s)\n");
+
+		/*
+		 * A system reset will force all CPUs to execute the
+		 * crash code again. We need to reset cpus_in_crash so we
+		 * wait for everyone to do this.
+		 */
+		atomic_set(&cpus_in_crash, 0);
+		smp_mb();
+
+		while (atomic_read(&cpus_in_crash) < ncpus)
+			cpu_relax();
+	}
+
+	crash_shutdown_cpu = -1;
+	__debugger = old_handler;
+
+	tries++;
+	goto again;
 }
-#else
-static void crash_kexec_prepare_cpus(void)
+
+/*
+ * This function will be called by secondary cpus.
+ */
+void crash_kexec_secondary(struct pt_regs *regs)
+{
+	unsigned long flags;
+	int msecs = SECONDARY_TIMEOUT;
+
+	local_irq_save(flags);
+
+	/* Wait for the primary crash CPU to signal its progress */
+	while (crashing_cpu < 0) {
+		if (--msecs < 0) {
+			/* No response, kdump image may not have been loaded */
+			local_irq_restore(flags);
+			return;
+		}
+
+		mdelay(1);
+	}
+
+	crash_ipi_callback(regs);
+}
+
+#else	/* ! CONFIG_SMP */
+
+static void crash_kexec_prepare_cpus(int cpu)
 {
 	/*
-	 * move the secondarys to us so that we can copy
+	 * move the secondaries to us so that we can copy
 	 * the new kernel 0-0x100 safely
 	 *
 	 * do this if kexec in setup.c ?
 	 */
+#ifdef CONFIG_PPC64
 	smp_release_cpus();
+#else
+	/* FIXME */
+#endif
 }
 
-#endif
+void crash_kexec_secondary(struct pt_regs *regs)
+{
+}
+#endif	/* CONFIG_SMP */
+
+/* wait for all the CPUs to hit real mode but timeout if they don't come in */
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_64)
+static void crash_kexec_wait_realmode(int cpu)
+{
+	unsigned int msecs;
+	int i;
+
+	msecs = REAL_MODE_TIMEOUT;
+	for (i=0; i < nr_cpu_ids && msecs > 0; i++) {
+		if (i == cpu)
+			continue;
+
+		while (paca[i].kexec_state < KEXEC_STATE_REAL_MODE) {
+			barrier();
+			if (!cpu_possible(i) || !cpu_online(i) || (msecs <= 0))
+				break;
+			msecs--;
+			mdelay(1);
+		}
+	}
+	mb();
+}
+#else
+static inline void crash_kexec_wait_realmode(int cpu) {}
+#endif	/* CONFIG_SMP && CONFIG_PPC_STD_MMU_64 */
+
+/*
+ * Register a function to be called on shutdown.  Only use this if you
+ * can't reset your device in the second kernel.
+ */
+int crash_shutdown_register(crash_shutdown_t handler)
+{
+	unsigned int i, rc;
+
+	spin_lock(&crash_handlers_lock);
+	for (i = 0 ; i < CRASH_HANDLER_MAX; i++)
+		if (!crash_shutdown_handles[i]) {
+			/* Insert handle at first empty entry */
+			crash_shutdown_handles[i] = handler;
+			rc = 0;
+			break;
+		}
+
+	if (i == CRASH_HANDLER_MAX) {
+		printk(KERN_ERR "Crash shutdown handles full, "
+		       "not registered.\n");
+		rc = 1;
+	}
+
+	spin_unlock(&crash_handlers_lock);
+	return rc;
+}
+EXPORT_SYMBOL(crash_shutdown_register);
+
+int crash_shutdown_unregister(crash_shutdown_t handler)
+{
+	unsigned int i, rc;
+
+	spin_lock(&crash_handlers_lock);
+	for (i = 0 ; i < CRASH_HANDLER_MAX; i++)
+		if (crash_shutdown_handles[i] == handler)
+			break;
+
+	if (i == CRASH_HANDLER_MAX) {
+		printk(KERN_ERR "Crash shutdown handle not found\n");
+		rc = 1;
+	} else {
+		/* Shift handles down */
+		for (; crash_shutdown_handles[i]; i++)
+			crash_shutdown_handles[i] =
+				crash_shutdown_handles[i+1];
+		rc = 0;
+	}
+
+	spin_unlock(&crash_handlers_lock);
+	return rc;
+}
+EXPORT_SYMBOL(crash_shutdown_unregister);
 
 void default_machine_crash_shutdown(struct pt_regs *regs)
 {
+	unsigned int i;
+	int (*old_handler)(struct pt_regs *regs);
+
 	/*
 	 * This function is only called after the system
-	 * has paniced or is otherwise in a critical state.
+	 * has panicked or is otherwise in a critical state.
 	 * The minimum amount of code to allow a kexec'd kernel
 	 * to run successfully needs to happen here.
 	 *
@@ -182,16 +314,54 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
 	 * an SMP system.
 	 * The kernel is broken so disable interrupts.
 	 */
-	local_irq_disable();
-
-	if (ppc_md.kexec_cpu_down)
-		ppc_md.kexec_cpu_down(1, 0);
+	hard_irq_disable();
 
 	/*
 	 * Make a note of crashing cpu. Will be used in machine_kexec
 	 * such that another IPI will not be sent.
 	 */
 	crashing_cpu = smp_processor_id();
-	crash_kexec_prepare_cpus();
-	crash_save_this_cpu(regs, crashing_cpu);
+
+	/*
+	 * If we came in via system reset, wait a while for the secondary
+	 * CPUs to enter.
+	 */
+	if (TRAP(regs) == 0x100)
+		mdelay(PRIMARY_TIMEOUT);
+
+	crash_kexec_prepare_cpus(crashing_cpu);
+
+	crash_save_cpu(regs, crashing_cpu);
+
+	time_to_dump = 1;
+
+	crash_kexec_wait_realmode(crashing_cpu);
+
+	machine_kexec_mask_interrupts();
+
+	/*
+	 * Call registered shutdown routines safely.  Swap out
+	 * __debugger_fault_handler, and replace on exit.
+	 */
+	old_handler = __debugger_fault_handler;
+	__debugger_fault_handler = handle_fault;
+	crash_shutdown_cpu = smp_processor_id();
+	for (i = 0; crash_shutdown_handles[i]; i++) {
+		if (setjmp(crash_shutdown_buf) == 0) {
+			/*
+			 * Insert syncs and delay to ensure
+			 * instructions in the dangerous region don't
+			 * leak away from this protected region.
+			 */
+			asm volatile("sync; isync");
+			/* dangerous region */
+			crash_shutdown_handles[i]();
+			asm volatile("sync; isync");
+		}
+	}
+	crash_shutdown_cpu = -1;
+	__debugger_fault_handler = old_handler;
+
+	if (ppc_md.kexec_cpu_down)
+		ppc_md.kexec_cpu_down(1, 0);
 }