From 1767c8f392857694899403a65942cc70b5b7d132 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Wed, 22 Oct 2008 10:39:18 +0000 Subject: powerpc: Kexec exit should not use magic numbers Commit 54622f10a6aabb8bb2bdacf3dd070046f03dc246 ("powerpc: Support for relocatable kdump kernel") added a magic flag value in a register to tell purgatory that it should be a panic kernel. This part is wrong and is reverted by this commit. The kernel gets a list of memory blocks and a entry point from user space. Its job is to copy the blocks into place and then branch to the designated entry point (after turning "off" the mmu). The user space tool inserts a trampoline, called purgatory, that runs before the user supplied code. Its job is to establish the entry environment for the new kernel or other application based on the contents of memory. The purgatory code is compiled and embedded in the tool, where it is later patched using the elf symbol table using elf symbols. Since the tool knows it is creating a purgatory that will run after a kernel crash, it should just patch purgatory (or the kernel directly) if something needs to happen. Signed-off-by: Milton Miller Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/machine_kexec_64.c | 9 ++------- arch/powerpc/kernel/misc_64.S | 9 +++------ 2 files changed, 5 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index e6efec788c4..3c4ca046e85 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -255,14 +255,11 @@ static union thread_union kexec_stack /* Our assembly helper, in kexec_stub.S */ extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start, void *image, void *control, - void (*clear_all)(void), - unsigned long kdump_flag) ATTRIB_NORET; + void (*clear_all)(void)) ATTRIB_NORET; /* too late to fail here */ void default_machine_kexec(struct kimage *image) { - unsigned long kdump_flag = 0; - /* prepare control code if any */ /* @@ -275,8 +272,6 @@ void default_machine_kexec(struct kimage *image) if (crashing_cpu == -1) kexec_prepare_cpus(); - else - kdump_flag = KDUMP_SIGNATURE; /* switch to a staticly allocated stack. Based on irq stack code. * XXX: the task struct will likely be invalid once we do the copy! @@ -289,7 +284,7 @@ void default_machine_kexec(struct kimage *image) */ kexec_sequence(&kexec_stack, image->start, image, page_address(image->control_code_page), - ppc_md.hpte_clear_all, kdump_flag); + ppc_md.hpte_clear_all); /* NOTREACHED */ } diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index a243fd072a7..3053fe5c62f 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -611,12 +611,10 @@ real_mode: /* assume normal blr return */ /* - * kexec_sequence(newstack, start, image, control, clear_all(), kdump_flag) + * kexec_sequence(newstack, start, image, control, clear_all()) * * does the grungy work with stack switching and real mode switches * also does simple calls to other code - * - * kdump_flag says whether the next kernel should be a kdump kernel. */ _GLOBAL(kexec_sequence) @@ -649,7 +647,7 @@ _GLOBAL(kexec_sequence) mr r29,r5 /* image (virt) */ mr r28,r6 /* control, unused */ mr r27,r7 /* clear_all() fn desc */ - mr r26,r8 /* kdump flag */ + mr r26,r8 /* spare */ lhz r25,PACAHWCPUID(r13) /* get our phys cpu from paca */ /* disable interrupts, we are overwriting kernel data next */ @@ -711,6 +709,5 @@ _GLOBAL(kexec_sequence) mr r4,r30 # start, aka phys mem offset mtlr 4 li r5,0 - mr r6,r26 /* kdump_flag */ - blr /* image->start(physid, image->start, 0, kdump_flag); */ + blr /* image->start(physid, image->start, 0); */ #endif /* CONFIG_KEXEC */ -- cgit v1.2.3-18-g5258 From 62a8bd6c9246c0e1f19dfb8fc65ad7c4f7cac8bb Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Wed, 22 Oct 2008 15:39:04 -0500 Subject: powerpc: Use is_kdump_kernel() linux/crash_dump.h defines is_kdump_kernel() to be used by code that needs to know if the previous kernel crashed instead of a (clean) boot or reboot. This updates the just added powerpc code to use it. This is needed for the next commit, which will remove __kdump_flag. Signed-off-by: Milton Miller Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 3857d7e2af0..45f47c97fd1 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -460,7 +461,7 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, static void iommu_table_clear(struct iommu_table *tbl) { - if (!__kdump_flag) { + if (!is_kdump_kernel()) { /* Clear the table in case firmware left allocations in it */ ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size); return; -- cgit v1.2.3-18-g5258 From 8b8b0cc1c736ddca39b60bb098bd0a23daaa495f Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Thu, 23 Oct 2008 18:41:09 +0000 Subject: powerpc/ppc64/kdump: Better flag for running relocatable The __kdump_flag ABI is overly constraining for future development. As of 2.6.27, the kernel entry point has 4 constraints: Offset 0 is the starting point for the master (boot) cpu (entered with r3 pointing to the device tree structure), offset 0x60 is code for the slave cpus (entered with r3 set to their device tree physical id), offset 0x20 is used by the iseries hypervisor, and secondary cpus must be well behaved when the first 256 bytes are copied to address 0. Placing the __kdump_flag at 0x18 is bad because: - It was taking the last 8 bytes before the iseries hypervisor data. - It was 8 bytes for a boolean flag - It had no way of identifying that the flag was present - It does leave any room for the master to add any additional code before branching, which hurts debug. - It will be unnecessarily hard for 32 bit code to be common (8 bytes) Now that we have eliminated the use of __kdump_flag in favor of the standard is_kdump_kernel(), this flag only controls run without relocating the kernel to PHYSICAL_START (0), so rename it __run_at_load. Move the flag to 0x5c, 1 word before the secondary cpu entry point at 0x60. Initialize it with "run0" to say it will run at 0 unless it is set to 1. It only exists if we are relocatable. Signed-off-by: Milton Miller Signed-off-by: Michael Neuling Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/head_64.S | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 69489bd3210..b4bcf5a930f 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -97,12 +97,6 @@ __secondary_hold_spinloop: __secondary_hold_acknowledge: .llong 0x0 - /* This flag is set by purgatory if we should be a kdump kernel. */ - /* Do not move this variable as purgatory knows about it. */ - .globl __kdump_flag -__kdump_flag: - .llong 0x0 - #ifdef CONFIG_PPC_ISERIES /* * At offset 0x20, there is a pointer to iSeries LPAR data. @@ -112,6 +106,20 @@ __kdump_flag: .llong hvReleaseData-KERNELBASE #endif /* CONFIG_PPC_ISERIES */ +#ifdef CONFIG_CRASH_DUMP + /* This flag is set to 1 by a loader if the kernel should run + * at the loaded address instead of the linked address. This + * is used by kexec-tools to keep the the kdump kernel in the + * crash_kernel region. The loader is responsible for + * observing the alignment requirement. + */ + /* Do not move this variable as kexec-tools knows about it. */ + . = 0x5c + .globl __run_at_load +__run_at_load: + .long 0x72756e30 /* "run0" -- relocate to 0 by default */ +#endif + . = 0x60 /* * The following code is used to hold secondary processors @@ -1391,8 +1399,8 @@ _STATIC(__after_prom_start) lis r25,PAGE_OFFSET@highest /* compute virtual base of kernel */ sldi r25,r25,32 #ifdef CONFIG_CRASH_DUMP - ld r7,__kdump_flag-_stext(r26) - cmpldi cr0,r7,1 /* kdump kernel ? - stay where we are */ + lwz r7,__run_at_load-_stext(r26) + cmplwi cr0,r7,1 /* kdump kernel ? - stay where we are */ bne 1f add r25,r25,r26 #endif @@ -1416,11 +1424,11 @@ _STATIC(__after_prom_start) #ifdef CONFIG_CRASH_DUMP /* * Check if the kernel has to be running as relocatable kernel based on the - * variable __kdump_flag, if it is set the kernel is treated as relocatable + * variable __run_at_load, if it is set the kernel is treated as relocatable * kernel, otherwise it will be moved to PHYSICAL_START */ - ld r7,__kdump_flag-_stext(r26) - cmpldi cr0,r7,1 + lwz r7,__run_at_load-_stext(r26) + cmplwi cr0,r7,1 bne 3f li r5,__end_interrupts - _stext /* just copy interrupts */ -- cgit v1.2.3-18-g5258 From 2a4b9c5af82035c591adca951a9af1665ad1a2b0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 22 Oct 2008 18:43:45 +0000 Subject: powerpc: Work around ld bug in older binutils Commit 549e8152de8039506f69c677a4546e5427aa6ae7 ("powerpc: Make the 64-bit kernel as a position-independent executable") added lines to vmlinux.lds.S to add the extra sections needed to implement a relocatable kernel. However, those lines seem to trigger a bug in older versions of GNU ld (such as 2.16.1) when building a non-relocatable kernel. Since ld 2.16.1 is still a popular choice for cross-toolchains, this adds an #ifdef to vmlinux.lds.S so the added lines are only included when building a relocatable kernel. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/vmlinux.lds.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index b39c27ed791..384dca5a9c1 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -187,6 +187,7 @@ SECTIONS *(.machine.desc) __machine_desc_end = . ; } +#ifdef CONFIG_RELOCATABLE . = ALIGN(8); .dynsym : AT(ADDR(.dynsym) - LOAD_OFFSET) { *(.dynsym) } .dynstr : AT(ADDR(.dynstr) - LOAD_OFFSET) { *(.dynstr) } @@ -202,6 +203,7 @@ SECTIONS __rela_dyn_start = .; *(.rela*) } +#endif /* Fake ELF header containing RPA note; for addnote */ .fakeelf : AT(ADDR(.fakeelf) - LOAD_OFFSET) { *(.fakeelf) } -- cgit v1.2.3-18-g5258 From b160544cccb403310cf38ddb3ebc156ea454848a Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 22 Oct 2008 19:39:49 +0000 Subject: powerpc: Fix compiler warning for the relocatable kernel Fixes this warning: arch/powerpc/kernel/setup_64.c:447:5: warning: "kernstart_addr" is not defined which arises because PHYSICAL_START is no longer a constant when CONFIG_RELOCATABLE=y. Signed-off-by: Michael Neuling Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/setup_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 843c0af210d..169d74cef15 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -444,9 +444,9 @@ void __init setup_system(void) if (htab_address) printk("htab_address = 0x%p\n", htab_address); printk("htab_hash_mask = 0x%lx\n", htab_hash_mask); -#if PHYSICAL_START > 0 - printk("physical_start = 0x%lx\n", PHYSICAL_START); -#endif + if (PHYSICAL_START > 0) + printk("physical_start = 0x%lx\n", + PHYSICAL_START); printk("-----------------------------------------------------\n"); DBG(" <- setup_system()\n"); -- cgit v1.2.3-18-g5258 From 16c29d180becc5bdf92fd0fc7314a44a671b5f4e Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Thu, 23 Oct 2008 00:42:36 +0000 Subject: powerpc: Fix swapcontext system for VSX + old ucontext size Since VSX support was added, we now have two sizes of ucontext_t; the older, smaller size without the extra VSX state, and the new larger size with the extra VSX state. A program using the sys_swapcontext system call and supplying smaller ucontext_t structures will currently get an EINVAL error if the task has used VSX (e.g. because of calling library code that uses VSX) and the old_ctx argument is non-NULL (i.e. the program is asking for its current context to be saved). Thus the program will start getting EINVAL errors on calls that previously worked. This commit changes this behaviour so that we don't send an EINVAL in this case. It will now return the smaller context but the VSX MSR bit will always be cleared to indicate that the ucontext_t doesn't include the extra VSX state, even if the task has executed VSX instructions. Both 32 and 64 bit cases are updated. [paulus@samba.org - also fix some access_ok() and get_user() calls] Thanks to Ben Herrenschmidt for noticing this problem. Signed-off-by: Michael Neuling Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/signal_32.c | 36 +++++++++++++++--------------------- arch/powerpc/kernel/signal_64.c | 33 +++++++++++++++------------------ 2 files changed, 30 insertions(+), 39 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 3e80aa32b8b..a6a43103655 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -410,7 +410,7 @@ inline unsigned long copy_fpr_from_user(struct task_struct *task, * altivec/spe instructions at some point. */ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, - int sigret) + int sigret, int ctx_has_vsx_region) { unsigned long msr = regs->msr; @@ -451,7 +451,7 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, * the saved MSR value to indicate that frame->mc_vregs * contains valid data */ - if (current->thread.used_vsr) { + if (current->thread.used_vsr && ctx_has_vsx_region) { __giveup_vsx(current); if (copy_vsx_to_user(&frame->mc_vsregs, current)) return 1; @@ -858,11 +858,11 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, frame = &rt_sf->uc.uc_mcontext; addr = frame; if (vdso32_rt_sigtramp && current->mm->context.vdso_base) { - if (save_user_regs(regs, frame, 0)) + if (save_user_regs(regs, frame, 0, 1)) goto badframe; regs->link = current->mm->context.vdso_base + vdso32_rt_sigtramp; } else { - if (save_user_regs(regs, frame, __NR_rt_sigreturn)) + if (save_user_regs(regs, frame, __NR_rt_sigreturn, 1)) goto badframe; regs->link = (unsigned long) frame->tramp; } @@ -936,12 +936,13 @@ long sys_swapcontext(struct ucontext __user *old_ctx, int ctx_size, int r6, int r7, int r8, struct pt_regs *regs) { unsigned char tmp; + int ctx_has_vsx_region = 0; #ifdef CONFIG_PPC64 unsigned long new_msr = 0; if (new_ctx && - __get_user(new_msr, &new_ctx->uc_mcontext.mc_gregs[PT_MSR])) + get_user(new_msr, &new_ctx->uc_mcontext.mc_gregs[PT_MSR])) return -EFAULT; /* * Check that the context is not smaller than the original @@ -956,16 +957,9 @@ long sys_swapcontext(struct ucontext __user *old_ctx, if ((ctx_size < sizeof(struct ucontext)) && (new_msr & MSR_VSX)) return -EINVAL; -#ifdef CONFIG_VSX - /* - * If userspace doesn't provide enough room for VSX data, - * but current thread has used VSX, we don't have anywhere - * to store the full context back into. - */ - if ((ctx_size < sizeof(struct ucontext)) && - (current->thread.used_vsr && old_ctx)) - return -EINVAL; -#endif + /* Does the context have enough room to store VSX data? */ + if (ctx_size >= sizeof(struct ucontext)) + ctx_has_vsx_region = 1; #else /* Context size is for future use. Right now, we only make sure * we are passed something we understand @@ -985,17 +979,17 @@ long sys_swapcontext(struct ucontext __user *old_ctx, */ mctx = (struct mcontext __user *) ((unsigned long) &old_ctx->uc_mcontext & ~0xfUL); - if (!access_ok(VERIFY_WRITE, old_ctx, sizeof(*old_ctx)) - || save_user_regs(regs, mctx, 0) + if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size) + || save_user_regs(regs, mctx, 0, ctx_has_vsx_region) || put_sigset_t(&old_ctx->uc_sigmask, ¤t->blocked) || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs)) return -EFAULT; } if (new_ctx == NULL) return 0; - if (!access_ok(VERIFY_READ, new_ctx, sizeof(*new_ctx)) + if (!access_ok(VERIFY_READ, new_ctx, ctx_size) || __get_user(tmp, (u8 __user *) new_ctx) - || __get_user(tmp, (u8 __user *) (new_ctx + 1) - 1)) + || __get_user(tmp, (u8 __user *) new_ctx + ctx_size - 1)) return -EFAULT; /* @@ -1196,11 +1190,11 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka, goto badframe; if (vdso32_sigtramp && current->mm->context.vdso_base) { - if (save_user_regs(regs, &frame->mctx, 0)) + if (save_user_regs(regs, &frame->mctx, 0, 1)) goto badframe; regs->link = current->mm->context.vdso_base + vdso32_sigtramp; } else { - if (save_user_regs(regs, &frame->mctx, __NR_sigreturn)) + if (save_user_regs(regs, &frame->mctx, __NR_sigreturn, 1)) goto badframe; regs->link = (unsigned long) frame->mctx.tramp; } diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index c6a8f2326b6..e132891d3ce 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -74,7 +74,8 @@ static const char fmt64[] = KERN_INFO \ */ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, - int signr, sigset_t *set, unsigned long handler) + int signr, sigset_t *set, unsigned long handler, + int ctx_has_vsx_region) { /* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the * process never used altivec yet (MSR_VEC is zero in pt_regs of @@ -121,7 +122,7 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, * then out to userspace. Update v_regs to point after the * VMX data. */ - if (current->thread.used_vsr) { + if (current->thread.used_vsr && ctx_has_vsx_region) { __giveup_vsx(current); v_regs += ELF_NVRREG; err |= copy_vsx_to_user(v_regs, current); @@ -282,9 +283,10 @@ int sys_swapcontext(struct ucontext __user *old_ctx, unsigned char tmp; sigset_t set; unsigned long new_msr = 0; + int ctx_has_vsx_region = 0; if (new_ctx && - __get_user(new_msr, &new_ctx->uc_mcontext.gp_regs[PT_MSR])) + get_user(new_msr, &new_ctx->uc_mcontext.gp_regs[PT_MSR])) return -EFAULT; /* * Check that the context is not smaller than the original @@ -299,28 +301,23 @@ int sys_swapcontext(struct ucontext __user *old_ctx, if ((ctx_size < sizeof(struct ucontext)) && (new_msr & MSR_VSX)) return -EINVAL; -#ifdef CONFIG_VSX - /* - * If userspace doesn't provide enough room for VSX data, - * but current thread has used VSX, we don't have anywhere - * to store the full context back into. - */ - if ((ctx_size < sizeof(struct ucontext)) && - (current->thread.used_vsr && old_ctx)) - return -EINVAL; -#endif + /* Does the context have enough room to store VSX data? */ + if (ctx_size >= sizeof(struct ucontext)) + ctx_has_vsx_region = 1; + if (old_ctx != NULL) { - if (!access_ok(VERIFY_WRITE, old_ctx, sizeof(*old_ctx)) - || setup_sigcontext(&old_ctx->uc_mcontext, regs, 0, NULL, 0) + if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size) + || setup_sigcontext(&old_ctx->uc_mcontext, regs, 0, NULL, 0, + ctx_has_vsx_region) || __copy_to_user(&old_ctx->uc_sigmask, ¤t->blocked, sizeof(sigset_t))) return -EFAULT; } if (new_ctx == NULL) return 0; - if (!access_ok(VERIFY_READ, new_ctx, sizeof(*new_ctx)) + if (!access_ok(VERIFY_READ, new_ctx, ctx_size) || __get_user(tmp, (u8 __user *) new_ctx) - || __get_user(tmp, (u8 __user *) (new_ctx + 1) - 1)) + || __get_user(tmp, (u8 __user *) new_ctx + ctx_size - 1)) return -EFAULT; /* @@ -423,7 +420,7 @@ int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info, &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, signr, NULL, - (unsigned long)ka->sa.sa_handler); + (unsigned long)ka->sa.sa_handler, 1); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) goto badframe; -- cgit v1.2.3-18-g5258 From 6098e2ee14849e0819ffa887ebf470dcfad4a2be Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Sun, 26 Oct 2008 21:51:25 +0000 Subject: OF-device: Don't overwrite numa_node in device registration Currently, the numa_node of OF-devices will be overwritten during device_register, which simply sets the node to -1. On cell machines, this means that devices can't find their IOMMU, which is referenced through the device's numa node. Set the numa node for OF devices with no parent, and use the lower-level device_initialize and device_add functions, so that the node is preserved. We can remove the call to set_dev_node in of_device_alloc, as it will be overwritten during register. Signed-off-by: Jeremy Kerr Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/of_device.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/of_device.c b/arch/powerpc/kernel/of_device.c index 93ae5b169f4..f3c9cae01dd 100644 --- a/arch/powerpc/kernel/of_device.c +++ b/arch/powerpc/kernel/of_device.c @@ -78,7 +78,6 @@ struct of_device *of_device_alloc(struct device_node *np, dev->dev.parent = parent; dev->dev.release = of_release_dev; dev->dev.archdata.of_node = np; - set_dev_node(&dev->dev, of_node_to_nid(np)); if (bus_id) strlcpy(dev->dev.bus_id, bus_id, BUS_ID_SIZE); -- cgit v1.2.3-18-g5258 From e90a13184600ec756875238ad130e2f205cd9a1b Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 27 Oct 2008 19:48:17 +0000 Subject: powerpc/pci: Properly allocate bus resources for hotplug PHBs Resources for PHB's that are dynamically added to a system are not properly allocated in the resource tree. Not having these resources allocated causes an oops when removing the PHB when we try to release them. The diff appears a bit messy, this is mainly due to moving everything one tab to the left in the pcibios_allocate_bus_resources routine. The functionality change in this routine is only that the list_for_each_entry() loop is pulled out and moved to the necessary calling routine. Signed-off-by: Nathan Fontenot Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/pci-common.c | 110 +++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 55 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 1ec73938a00..f36936d9fda 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1239,69 +1239,66 @@ static int __init reparent_resources(struct resource *parent, * as well. */ -static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) +void pcibios_allocate_bus_resources(struct pci_bus *bus) { - struct pci_bus *bus; + struct pci_bus *b; int i; struct resource *res, *pr; - /* Depth-First Search on bus tree */ - list_for_each_entry(bus, bus_list, node) { - for (i = 0; i < PCI_BUS_NUM_RESOURCES; ++i) { - if ((res = bus->resource[i]) == NULL || !res->flags - || res->start > res->end) - continue; - if (bus->parent == NULL) - pr = (res->flags & IORESOURCE_IO) ? - &ioport_resource : &iomem_resource; - else { - /* Don't bother with non-root busses when - * re-assigning all resources. We clear the - * resource flags as if they were colliding - * and as such ensure proper re-allocation - * later. + for (i = 0; i < PCI_BUS_NUM_RESOURCES; ++i) { + if ((res = bus->resource[i]) == NULL || !res->flags + || res->start > res->end) + continue; + if (bus->parent == NULL) + pr = (res->flags & IORESOURCE_IO) ? + &ioport_resource : &iomem_resource; + else { + /* Don't bother with non-root busses when + * re-assigning all resources. We clear the + * resource flags as if they were colliding + * and as such ensure proper re-allocation + * later. + */ + if (ppc_pci_flags & PPC_PCI_REASSIGN_ALL_RSRC) + goto clear_resource; + pr = pci_find_parent_resource(bus->self, res); + if (pr == res) { + /* this happens when the generic PCI + * code (wrongly) decides that this + * bridge is transparent -- paulus */ - if (ppc_pci_flags & PPC_PCI_REASSIGN_ALL_RSRC) - goto clear_resource; - pr = pci_find_parent_resource(bus->self, res); - if (pr == res) { - /* this happens when the generic PCI - * code (wrongly) decides that this - * bridge is transparent -- paulus - */ - continue; - } + continue; } + } - DBG("PCI: %s (bus %d) bridge rsrc %d: %016llx-%016llx " - "[0x%x], parent %p (%s)\n", - bus->self ? pci_name(bus->self) : "PHB", - bus->number, i, - (unsigned long long)res->start, - (unsigned long long)res->end, - (unsigned int)res->flags, - pr, (pr && pr->name) ? pr->name : "nil"); - - if (pr && !(pr->flags & IORESOURCE_UNSET)) { - if (request_resource(pr, res) == 0) - continue; - /* - * Must be a conflict with an existing entry. - * Move that entry (or entries) under the - * bridge resource and try again. - */ - if (reparent_resources(pr, res) == 0) - continue; - } - printk(KERN_WARNING - "PCI: Cannot allocate resource region " - "%d of PCI bridge %d, will remap\n", - i, bus->number); -clear_resource: - res->flags = 0; + DBG("PCI: %s (bus %d) bridge rsrc %d: %016llx-%016llx " + "[0x%x], parent %p (%s)\n", + bus->self ? pci_name(bus->self) : "PHB", + bus->number, i, + (unsigned long long)res->start, + (unsigned long long)res->end, + (unsigned int)res->flags, + pr, (pr && pr->name) ? pr->name : "nil"); + + if (pr && !(pr->flags & IORESOURCE_UNSET)) { + if (request_resource(pr, res) == 0) + continue; + /* + * Must be a conflict with an existing entry. + * Move that entry (or entries) under the + * bridge resource and try again. + */ + if (reparent_resources(pr, res) == 0) + continue; } - pcibios_allocate_bus_resources(&bus->children); + printk(KERN_WARNING "PCI: Cannot allocate resource region " + "%d of PCI bridge %d, will remap\n", i, bus->number); +clear_resource: + res->flags = 0; } + + list_for_each_entry(b, &bus->children, node) + pcibios_allocate_bus_resources(b); } static inline void __devinit alloc_resource(struct pci_dev *dev, int idx) @@ -1372,10 +1369,13 @@ static void __init pcibios_allocate_resources(int pass) void __init pcibios_resource_survey(void) { + struct pci_bus *b; + /* Allocate and assign resources. If we re-assign everything, then * we skip the allocate phase */ - pcibios_allocate_bus_resources(&pci_root_buses); + list_for_each_entry(b, &pci_root_buses, node) + pcibios_allocate_bus_resources(b); if (!(ppc_pci_flags & PPC_PCI_REASSIGN_ALL_RSRC)) { pcibios_allocate_resources(0); -- cgit v1.2.3-18-g5258 From b30115ea8f685bcd1769553fe8511745f985053c Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 27 Oct 2008 19:48:47 +0000 Subject: powerpc/pci: Fix unmapping of IO space on 64-bit A typo/thinko made us pass the wrong argument to __flush_hash_table_range when unplugging bridges, thus not flushing all the translations for the IO space on unplug. The third parameter to __flush_hash_table_range is `end', not `size'. This causes the hypervisor to refuse unplugging slots. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/pci_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 8247cff1cb3..3502b9101e6 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -426,7 +426,7 @@ int pcibios_unmap_io_space(struct pci_bus *bus) pci_name(bus->self)); __flush_hash_table_range(&init_mm, res->start + _IO_BASE, - res->end - res->start + 1); + res->end + _IO_BASE + 1); return 0; } -- cgit v1.2.3-18-g5258 From f9226d572d2f8b5f564596db8c6a13e458c46191 Mon Sep 17 00:00:00 2001 From: Mark Nelson Date: Mon, 27 Oct 2008 20:38:08 +0000 Subject: powerpc: Update remaining dma_mapping_ops to use map/unmap_page After the merge of the 32 and 64bit DMA code, dma_direct_ops lost their map/unmap_single() functions but gained map/unmap_page(). This caused a problem for Cell because Cell's dma_iommu_fixed_ops called the dma_direct_ops if the fixed linear mapping was to be used or the iommu ops if the dynamic window was to be used. So in order to fix this problem we need to update the 64bit DMA code to use map/unmap_page. First, we update the generic IOMMU code so that iommu_map_single() becomes iommu_map_page() and iommu_unmap_single() becomes iommu_unmap_page(). Then we propagate these changes up through all the callers of these two functions and in the process update all the dma_mapping_ops so that they have map/unmap_page rahter than map/unmap_single. We can do this because on 64bit there is no HIGHMEM memory so map/unmap_page ends up performing exactly the same function as map/unmap_single, just taking different arguments. This has no affect on drivers because the dma_map_single_attrs() just ends up calling the map_page() function of the appropriate dma_mapping_ops and similarly the dma_unmap_single_attrs() calls unmap_page(). This fixes an oops on Cell blades, which oops on boot without this because they call dma_direct_ops.map_single, which is NULL. Signed-off-by: Mark Nelson Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/dma-iommu.c | 34 ++++++++++++++++------------------ arch/powerpc/kernel/ibmebus.c | 27 ++++++++++++++------------- arch/powerpc/kernel/iommu.c | 22 ++++++++++++---------- arch/powerpc/kernel/vio.c | 25 +++++++++++++------------ 4 files changed, 55 insertions(+), 53 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 49248f89ce2..14183af1b3f 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -30,28 +30,26 @@ static void dma_iommu_free_coherent(struct device *dev, size_t size, } /* Creates TCEs for a user provided buffer. The user buffer must be - * contiguous real kernel storage (not vmalloc). The address of the buffer - * passed here is the kernel (virtual) address of the buffer. The buffer - * need not be page aligned, the dma_addr_t returned will point to the same - * byte within the page as vaddr. + * contiguous real kernel storage (not vmalloc). The address passed here + * comprises a page address and offset into that page. The dma_addr_t + * returned will point to the same byte within the page as was passed in. */ -static dma_addr_t dma_iommu_map_single(struct device *dev, void *vaddr, - size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) +static dma_addr_t dma_iommu_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) { - return iommu_map_single(dev, dev->archdata.dma_data, vaddr, size, - device_to_mask(dev), direction, attrs); + return iommu_map_page(dev, dev->archdata.dma_data, page, offset, size, + device_to_mask(dev), direction, attrs); } -static void dma_iommu_unmap_single(struct device *dev, dma_addr_t dma_handle, - size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) +static void dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction, + struct dma_attrs *attrs) { - iommu_unmap_single(dev->archdata.dma_data, dma_handle, size, direction, - attrs); + iommu_unmap_page(dev->archdata.dma_data, dma_handle, size, direction, + attrs); } @@ -94,10 +92,10 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask) struct dma_mapping_ops dma_iommu_ops = { .alloc_coherent = dma_iommu_alloc_coherent, .free_coherent = dma_iommu_free_coherent, - .map_single = dma_iommu_map_single, - .unmap_single = dma_iommu_unmap_single, .map_sg = dma_iommu_map_sg, .unmap_sg = dma_iommu_unmap_sg, .dma_supported = dma_iommu_dma_supported, + .map_page = dma_iommu_map_page, + .unmap_page = dma_iommu_unmap_page, }; EXPORT_SYMBOL(dma_iommu_ops); diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c index a06362223f8..64299d28f36 100644 --- a/arch/powerpc/kernel/ibmebus.c +++ b/arch/powerpc/kernel/ibmebus.c @@ -79,20 +79,21 @@ static void ibmebus_free_coherent(struct device *dev, kfree(vaddr); } -static dma_addr_t ibmebus_map_single(struct device *dev, - void *ptr, - size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) +static dma_addr_t ibmebus_map_page(struct device *dev, + struct page *page, + unsigned long offset, + size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) { - return (dma_addr_t)(ptr); + return (dma_addr_t)(page_address(page) + offset); } -static void ibmebus_unmap_single(struct device *dev, - dma_addr_t dma_addr, - size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) +static void ibmebus_unmap_page(struct device *dev, + dma_addr_t dma_addr, + size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) { return; } @@ -129,11 +130,11 @@ static int ibmebus_dma_supported(struct device *dev, u64 mask) static struct dma_mapping_ops ibmebus_dma_ops = { .alloc_coherent = ibmebus_alloc_coherent, .free_coherent = ibmebus_free_coherent, - .map_single = ibmebus_map_single, - .unmap_single = ibmebus_unmap_single, .map_sg = ibmebus_map_sg, .unmap_sg = ibmebus_unmap_sg, .dma_supported = ibmebus_dma_supported, + .map_page = ibmebus_map_page, + .unmap_page = ibmebus_unmap_page, }; static int ibmebus_match_path(struct device *dev, void *data) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 45f47c97fd1..1bfa706b96e 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -565,21 +565,23 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) } /* Creates TCEs for a user provided buffer. The user buffer must be - * contiguous real kernel storage (not vmalloc). The address of the buffer - * passed here is the kernel (virtual) address of the buffer. The buffer - * need not be page aligned, the dma_addr_t returned will point to the same - * byte within the page as vaddr. + * contiguous real kernel storage (not vmalloc). The address passed here + * comprises a page address and offset into that page. The dma_addr_t + * returned will point to the same byte within the page as was passed in. */ -dma_addr_t iommu_map_single(struct device *dev, struct iommu_table *tbl, - void *vaddr, size_t size, unsigned long mask, - enum dma_data_direction direction, struct dma_attrs *attrs) +dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, + struct page *page, unsigned long offset, size_t size, + unsigned long mask, enum dma_data_direction direction, + struct dma_attrs *attrs) { dma_addr_t dma_handle = DMA_ERROR_CODE; + void *vaddr; unsigned long uaddr; unsigned int npages, align; BUG_ON(direction == DMA_NONE); + vaddr = page_address(page) + offset; uaddr = (unsigned long)vaddr; npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE); @@ -605,9 +607,9 @@ dma_addr_t iommu_map_single(struct device *dev, struct iommu_table *tbl, return dma_handle; } -void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle, - size_t size, enum dma_data_direction direction, - struct dma_attrs *attrs) +void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction, + struct dma_attrs *attrs) { unsigned int npages; diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 434c92a85c0..a11e6bc59b3 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -516,10 +516,10 @@ static void vio_dma_iommu_free_coherent(struct device *dev, size_t size, vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE)); } -static dma_addr_t vio_dma_iommu_map_single(struct device *dev, void *vaddr, - size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) +static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) { struct vio_dev *viodev = to_vio_dev(dev); dma_addr_t ret = DMA_ERROR_CODE; @@ -529,7 +529,7 @@ static dma_addr_t vio_dma_iommu_map_single(struct device *dev, void *vaddr, return ret; } - ret = dma_iommu_ops.map_single(dev, vaddr, size, direction, attrs); + ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs); if (unlikely(dma_mapping_error(dev, ret))) { vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE)); atomic_inc(&viodev->cmo.allocs_failed); @@ -538,14 +538,14 @@ static dma_addr_t vio_dma_iommu_map_single(struct device *dev, void *vaddr, return ret; } -static void vio_dma_iommu_unmap_single(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) +static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, + size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) { struct vio_dev *viodev = to_vio_dev(dev); - dma_iommu_ops.unmap_single(dev, dma_handle, size, direction, attrs); + dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs); vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE)); } @@ -603,10 +603,11 @@ static void vio_dma_iommu_unmap_sg(struct device *dev, struct dma_mapping_ops vio_dma_mapping_ops = { .alloc_coherent = vio_dma_iommu_alloc_coherent, .free_coherent = vio_dma_iommu_free_coherent, - .map_single = vio_dma_iommu_map_single, - .unmap_single = vio_dma_iommu_unmap_single, .map_sg = vio_dma_iommu_map_sg, .unmap_sg = vio_dma_iommu_unmap_sg, + .map_page = vio_dma_iommu_map_page, + .unmap_page = vio_dma_iommu_unmap_page, + }; /** -- cgit v1.2.3-18-g5258 From 5663a1232bd557b4b2141ad345dd56785fa51c2a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 31 Oct 2008 22:27:17 +1100 Subject: Revert "powerpc: Sync RPA note in zImage with kernel's RPA note" This reverts commit 91a00302959545a9ae423e99732b1e46eb19e877, plus commit 0dcd440120ef12879ff34fc78d7e4abf171c79e4 ("powerpc: Revert CHRP boot wrapper to real-base = 12MB on 32-bit") which depended on it. Commit 91a00302 was causing NVRAM corruption on some pSeries machines, for as-yet unknown reasons, so this reverts it until the cause is identified. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/prom_init.c | 10 +++++----- arch/powerpc/kernel/vmlinux.lds.S | 3 --- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 23e0db20332..2445945d376 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -671,7 +671,7 @@ static struct fake_elf { u32 ignore_me; } rpadesc; } rpanote; -} fake_elf __section(.fakeelf) = { +} fake_elf = { .elfhdr = { .e_ident = { 0x7f, 'E', 'L', 'F', ELFCLASS32, ELFDATA2MSB, EV_CURRENT }, @@ -713,13 +713,13 @@ static struct fake_elf { .type = 0x12759999, .name = "IBM,RPA-Client-Config", .rpadesc = { - .lpar_affinity = 1, - .min_rmo_size = 128, /* in megabytes */ + .lpar_affinity = 0, + .min_rmo_size = 64, /* in megabytes */ .min_rmo_percent = 0, - .max_pft_size = 46, /* 2^46 bytes max PFT size */ + .max_pft_size = 48, /* 2^48 bytes max PFT size */ .splpar = 1, .min_load = ~0U, - .new_mem_def = 1 + .new_mem_def = 0 } } }; diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 384dca5a9c1..2412c056baa 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -205,9 +205,6 @@ SECTIONS } #endif - /* Fake ELF header containing RPA note; for addnote */ - .fakeelf : AT(ADDR(.fakeelf) - LOAD_OFFSET) { *(.fakeelf) } - /* freed after init ends here */ . = ALIGN(PAGE_SIZE); __init_end = .; -- cgit v1.2.3-18-g5258