diff options
Diffstat (limited to 'arch/ia64/kernel')
85 files changed, 14515 insertions, 7752 deletions
diff --git a/arch/ia64/kernel/.gitignore b/arch/ia64/kernel/.gitignore new file mode 100644 index 00000000000..21cb0da5ded --- /dev/null +++ b/arch/ia64/kernel/.gitignore @@ -0,0 +1,2 @@ +gate.lds +vmlinux.lds diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 307514f7a28..20678a9ed11 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -2,17 +2,20 @@ # Makefile for the linux kernel. # +ifdef CONFIG_DYNAMIC_FTRACE +CFLAGS_REMOVE_ftrace.o = -pg +endif + extra-y := head.o init_task.o vmlinux.lds -obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ - irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \ - salinfo.o semaphore.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ - unwind.o mca.o mca_asm.o topology.o +obj-y := entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ + irq_lsapic.o ivt.o machvec.o pal.o paravirt_patchlist.o patch.o process.o perfmon.o ptrace.o sal.o \ + salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ + unwind.o mca.o mca_asm.o topology.o dma-mapping.o +obj-$(CONFIG_ACPI) += acpi.o acpi-ext.o obj-$(CONFIG_IA64_BRL_EMU) += brl_emu.o -obj-$(CONFIG_IA64_GENERIC) += acpi-ext.o -obj-$(CONFIG_IA64_HP_ZX1) += acpi-ext.o -obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acpi-ext.o + obj-$(CONFIG_IA64_PALINFO) += palinfo.o obj-$(CONFIG_IOSAPIC) += iosapic.o obj-$(CONFIG_MODULES) += module.o @@ -20,37 +23,91 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o obj-$(CONFIG_IA64_CYCLONE) += cyclone.o -obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o +obj-$(CONFIG_AUDIT) += audit.o +obj-$(CONFIG_PCI_MSI) += msi_ia64.o mca_recovery-y += mca_drv.o mca_drv_asm.o +obj-$(CONFIG_IA64_MC_ERR_INJECT)+= err_inject.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o -# The gate DSO image is built using a special linker script. -targets += gate.so gate-syms.o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirtentry.o \ + paravirt_patch.o -extra-y += gate.so gate-syms.o gate.lds gate.o +obj-$(CONFIG_IA64_ESI) += esi.o +ifneq ($(CONFIG_IA64_ESI),) +obj-y += esi_stub.o # must be in kernel proper +endif +obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o +obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o + +obj-$(CONFIG_BINFMT_ELF) += elfcore.o # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state. CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 -CPPFLAGS_gate.lds := -P -C -U$(ARCH) +# The gate DSO image is built using a special linker script. +include $(srctree)/arch/ia64/kernel/Makefile.gate +# tell compiled for native +CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_NATIVE + +# Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config +define sed-y + "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}" +endef +quiet_cmd_nr_irqs = GEN $@ +define cmd_nr_irqs + (set -e; \ + echo "#ifndef __ASM_NR_IRQS_H__"; \ + echo "#define __ASM_NR_IRQS_H__"; \ + echo "/*"; \ + echo " * DO NOT MODIFY."; \ + echo " *"; \ + echo " * This file was generated by Kbuild"; \ + echo " *"; \ + echo " */"; \ + echo ""; \ + sed -ne $(sed-y) $<; \ + echo ""; \ + echo "#endif" ) > $@ +endef -quiet_cmd_gate = GATE $@ - cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@ +# We use internal kbuild rules to avoid the "is up to date" message from make +arch/$(SRCARCH)/kernel/nr-irqs.s: arch/$(SRCARCH)/kernel/nr-irqs.c + $(Q)mkdir -p $(dir $@) + $(call if_changed_dep,cc_s_c) -GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 -$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE - $(call if_changed,gate) +include/generated/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s + $(Q)mkdir -p $(dir $@) + $(call cmd,nr_irqs) -$(obj)/built-in.o: $(obj)/gate-syms.o -$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o +# +# native ivt.S, entry.S and fsys.S +# +ASM_PARAVIRT_OBJS = ivt.o entry.o fsys.o +define paravirtualized_native +AFLAGS_$(1) += -D__IA64_ASM_PARAVIRTUALIZED_NATIVE +AFLAGS_pvchk-sed-$(1) += -D__IA64_ASM_PARAVIRTUALIZED_PVCHECK +extra-y += pvchk-$(1) +endef +$(foreach obj,$(ASM_PARAVIRT_OBJS),$(eval $(call paravirtualized_native,$(obj)))) -GATECFLAGS_gate-syms.o = -r -$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE - $(call if_changed,gate) +# +# Checker for paravirtualizations of privileged operations. +# +quiet_cmd_pv_check_sed = PVCHK $@ +define cmd_pv_check_sed + sed -f $(srctree)/arch/$(SRCARCH)/scripts/pvcheck.sed $< > $@ +endef -# gate-data.o contains the gate DSO image as data in section .data.gate. -# We must build gate.so before we can assemble it. -# Note: kbuild does not track this dependency due to usage of .incbin -$(obj)/gate-data.o: $(obj)/gate.so +$(obj)/pvchk-sed-%.s: $(src)/%.S $(srctree)/arch/$(SRCARCH)/scripts/pvcheck.sed FORCE + $(call if_changed_dep,as_s_S) +$(obj)/pvchk-%.s: $(obj)/pvchk-sed-%.s FORCE + $(call if_changed,pv_check_sed) +$(obj)/pvchk-%.o: $(obj)/pvchk-%.s FORCE + $(call if_changed,as_o_S) +.PRECIOUS: $(obj)/pvchk-sed-%.s $(obj)/pvchk-%.s $(obj)/pvchk-%.o diff --git a/arch/ia64/kernel/Makefile.gate b/arch/ia64/kernel/Makefile.gate new file mode 100644 index 00000000000..ceeffc50976 --- /dev/null +++ b/arch/ia64/kernel/Makefile.gate @@ -0,0 +1,27 @@ +# The gate DSO image is built using a special linker script. + +targets += gate.so gate-syms.o + +extra-y += gate.so gate-syms.o gate.lds gate.o + +CPPFLAGS_gate.lds := -P -C -U$(ARCH) + +quiet_cmd_gate = GATE $@ + cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@ + +GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \ + $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) +$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE + $(call if_changed,gate) + +$(obj)/built-in.o: $(obj)/gate-syms.o +$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o + +GATECFLAGS_gate-syms.o = -r +$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE + $(call if_changed,gate) + +# gate-data.o contains the gate DSO image as data in section .data..gate. +# We must build gate.so before we can assemble it. +# Note: kbuild does not track this dependency due to usage of .incbin +$(obj)/gate-data.o: $(obj)/gate.so diff --git a/arch/ia64/kernel/acpi-ext.c b/arch/ia64/kernel/acpi-ext.c index 13a5b3b49bf..8b9318d311a 100644 --- a/arch/ia64/kernel/acpi-ext.c +++ b/arch/ia64/kernel/acpi-ext.c @@ -1,105 +1,104 @@ /* - * arch/ia64/kernel/acpi-ext.c + * (c) Copyright 2003, 2006 Hewlett-Packard Development Company, L.P. + * Alex Williamson <alex.williamson@hp.com> + * Bjorn Helgaas <bjorn.helgaas@hp.com> * - * Copyright (C) 2003 Hewlett-Packard - * Copyright (C) Alex Williamson - * Copyright (C) Bjorn Helgaas - * - * Vendor specific extensions to ACPI. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> +#include <linux/slab.h> #include <linux/acpi.h> -#include <linux/efi.h> #include <asm/acpi-ext.h> -struct acpi_vendor_descriptor { - u8 guid_id; - efi_guid_t guid; -}; +/* + * Device CSRs that do not appear in PCI config space should be described + * via ACPI. This would normally be done with Address Space Descriptors + * marked as "consumer-only," but old versions of Windows and Linux ignore + * the producer/consumer flag, so HP invented a vendor-defined resource to + * describe the location and size of CSR space. + */ -struct acpi_vendor_info { - struct acpi_vendor_descriptor *descriptor; - u8 *data; - u32 length; +struct acpi_vendor_uuid hp_ccsr_uuid = { + .subtype = 2, + .data = { 0xf9, 0xad, 0xe9, 0x69, 0x4f, 0x92, 0x5f, 0xab, 0xf6, 0x4a, + 0x24, 0xd2, 0x01, 0x37, 0x0e, 0xad }, }; -acpi_status -acpi_vendor_resource_match(struct acpi_resource *resource, void *context) +static acpi_status hp_ccsr_locate(acpi_handle obj, u64 *base, u64 *length) { - struct acpi_vendor_info *info = (struct acpi_vendor_info *)context; - struct acpi_resource_vendor *vendor; - struct acpi_vendor_descriptor *descriptor; - u32 length; - - if (resource->id != ACPI_RSTYPE_VENDOR) - return AE_OK; - - vendor = (struct acpi_resource_vendor *)&resource->data; - descriptor = (struct acpi_vendor_descriptor *)vendor->reserved; - if (vendor->length <= sizeof(*info->descriptor) || - descriptor->guid_id != info->descriptor->guid_id || - efi_guidcmp(descriptor->guid, info->descriptor->guid)) - return AE_OK; - - length = vendor->length - sizeof(struct acpi_vendor_descriptor); - info->data = acpi_os_allocate(length); - if (!info->data) - return AE_NO_MEMORY; - - memcpy(info->data, - vendor->reserved + sizeof(struct acpi_vendor_descriptor), - length); - info->length = length; - return AE_CTRL_TERMINATE; -} + acpi_status status; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_resource *resource; + struct acpi_resource_vendor_typed *vendor; -acpi_status -acpi_find_vendor_resource(acpi_handle obj, struct acpi_vendor_descriptor * id, - u8 ** data, u32 * length) -{ - struct acpi_vendor_info info; + status = acpi_get_vendor_resource(obj, METHOD_NAME__CRS, &hp_ccsr_uuid, + &buffer); - info.descriptor = id; - info.data = NULL; + resource = buffer.pointer; + vendor = &resource->data.vendor_typed; - acpi_walk_resources(obj, METHOD_NAME__CRS, acpi_vendor_resource_match, - &info); - if (!info.data) - return AE_NOT_FOUND; + if (ACPI_FAILURE(status) || vendor->byte_length < 16) { + status = AE_NOT_FOUND; + goto exit; + } - *data = info.data; - *length = info.length; - return AE_OK; + memcpy(base, vendor->byte_data, sizeof(*base)); + memcpy(length, vendor->byte_data + 8, sizeof(*length)); + + exit: + kfree(buffer.pointer); + return status; } -struct acpi_vendor_descriptor hp_ccsr_descriptor = { - .guid_id = 2, - .guid = - EFI_GUID(0x69e9adf9, 0x924f, 0xab5f, 0xf6, 0x4a, 0x24, 0xd2, 0x01, - 0x37, 0x0e, 0xad) +struct csr_space { + u64 base; + u64 length; }; -acpi_status hp_acpi_csr_space(acpi_handle obj, u64 * csr_base, u64 * csr_length) +static acpi_status find_csr_space(struct acpi_resource *resource, void *data) { + struct csr_space *space = data; + struct acpi_resource_address64 addr; acpi_status status; - u8 *data; - u32 length; - status = - acpi_find_vendor_resource(obj, &hp_ccsr_descriptor, &data, &length); + status = acpi_resource_to_address64(resource, &addr); + if (ACPI_SUCCESS(status) && + addr.resource_type == ACPI_MEMORY_RANGE && + addr.address_length && + addr.producer_consumer == ACPI_CONSUMER) { + space->base = addr.minimum; + space->length = addr.address_length; + return AE_CTRL_TERMINATE; + } + return AE_OK; /* keep looking */ +} - if (ACPI_FAILURE(status) || length != 16) - return AE_NOT_FOUND; +static acpi_status hp_crs_locate(acpi_handle obj, u64 *base, u64 *length) +{ + struct csr_space space = { 0, 0 }; - memcpy(csr_base, data, sizeof(*csr_base)); - memcpy(csr_length, data + 8, sizeof(*csr_length)); - acpi_os_free(data); + acpi_walk_resources(obj, METHOD_NAME__CRS, find_csr_space, &space); + if (!space.length) + return AE_NOT_FOUND; + *base = space.base; + *length = space.length; return AE_OK; } +acpi_status hp_acpi_csr_space(acpi_handle obj, u64 *csr_base, u64 *csr_length) +{ + acpi_status status; + + status = hp_ccsr_locate(obj, csr_base, csr_length); + if (ACPI_SUCCESS(status)) + return status; + + return hp_crs_locate(obj, csr_base, csr_length); +} EXPORT_SYMBOL(hp_acpi_csr_space); diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 9ad94ddf668..615ef81def4 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -32,7 +32,6 @@ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -#include <linux/config.h> #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> @@ -45,46 +44,47 @@ #include <linux/efi.h> #include <linux/mmzone.h> #include <linux/nodemask.h> +#include <linux/slab.h> +#include <acpi/processor.h> #include <asm/io.h> #include <asm/iosapic.h> #include <asm/machvec.h> #include <asm/page.h> -#include <asm/system.h> #include <asm/numa.h> #include <asm/sal.h> #include <asm/cyclone.h> -#define BAD_MADT_ENTRY(entry, end) ( \ - (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ - ((acpi_table_entry_header *)entry)->length != sizeof(*entry)) - #define PREFIX "ACPI: " -void (*pm_idle) (void); -EXPORT_SYMBOL(pm_idle); -void (*pm_power_off) (void); -EXPORT_SYMBOL(pm_power_off); - -unsigned char acpi_kbd_controller_present = 1; -unsigned char acpi_legacy_devices; - -static unsigned int __initdata acpi_madt_rev; - +int acpi_lapic; unsigned int acpi_cpei_override; unsigned int acpi_cpei_phys_cpuid; -#define MAX_SAPICS 256 -u16 ia64_acpiid_to_sapicid[MAX_SAPICS] = {[0 ... MAX_SAPICS - 1] = -1 }; +unsigned long acpi_wakeup_address = 0; -EXPORT_SYMBOL(ia64_acpiid_to_sapicid); +#ifdef CONFIG_IA64_GENERIC +static unsigned long __init acpi_find_rsdp(void) +{ + unsigned long rsdp_phys = 0; + + if (efi.acpi20 != EFI_INVALID_TABLE_ADDR) + rsdp_phys = efi.acpi20; + else if (efi.acpi != EFI_INVALID_TABLE_ADDR) + printk(KERN_WARNING PREFIX + "v1.0/r0.71 tables no longer supported\n"); + return rsdp_phys; +} -const char *acpi_get_sysname(void) +const char __init * +acpi_get_sysname(void) { -#ifdef CONFIG_IA64_GENERIC unsigned long rsdp_phys; - struct acpi20_table_rsdp *rsdp; + struct acpi_table_rsdp *rsdp; struct acpi_table_xsdt *xsdt; struct acpi_table_header *hdr; +#ifdef CONFIG_INTEL_IOMMU + u64 i, nentries; +#endif rsdp_phys = acpi_find_rsdp(); if (!rsdp_phys) { @@ -93,16 +93,16 @@ const char *acpi_get_sysname(void) return "dig"; } - rsdp = (struct acpi20_table_rsdp *)__va(rsdp_phys); - if (strncmp(rsdp->signature, RSDP_SIG, sizeof(RSDP_SIG) - 1)) { + rsdp = (struct acpi_table_rsdp *)__va(rsdp_phys); + if (strncmp(rsdp->signature, ACPI_SIG_RSDP, sizeof(ACPI_SIG_RSDP) - 1)) { printk(KERN_ERR "ACPI 2.0 RSDP signature incorrect, default to \"dig\"\n"); return "dig"; } - xsdt = (struct acpi_table_xsdt *)__va(rsdp->xsdt_address); + xsdt = (struct acpi_table_xsdt *)__va(rsdp->xsdt_physical_address); hdr = &xsdt->header; - if (strncmp(hdr->signature, XSDT_SIG, sizeof(XSDT_SIG) - 1)) { + if (strncmp(hdr->signature, ACPI_SIG_XSDT, sizeof(ACPI_SIG_XSDT) - 1)) { printk(KERN_ERR "ACPI 2.0 XSDT signature incorrect, default to \"dig\"\n"); return "dig"; @@ -111,28 +111,27 @@ const char *acpi_get_sysname(void) if (!strcmp(hdr->oem_id, "HP")) { return "hpzx1"; } else if (!strcmp(hdr->oem_id, "SGI")) { - return "sn2"; + if (!strcmp(hdr->oem_table_id + 4, "UV")) + return "uv"; + else + return "sn2"; } - return "dig"; -#else -# if defined (CONFIG_IA64_HP_SIM) - return "hpsim"; -# elif defined (CONFIG_IA64_HP_ZX1) - return "hpzx1"; -# elif defined (CONFIG_IA64_HP_ZX1_SWIOTLB) - return "hpzx1_swiotlb"; -# elif defined (CONFIG_IA64_SGI_SN2) - return "sn2"; -# elif defined (CONFIG_IA64_DIG) - return "dig"; -# else -# error Unknown platform. Fix acpi.c. -# endif +#ifdef CONFIG_INTEL_IOMMU + /* Look for Intel IOMMU */ + nentries = (hdr->length - sizeof(*hdr)) / + sizeof(xsdt->table_offset_entry[0]); + for (i = 0; i < nentries; i++) { + hdr = __va(xsdt->table_offset_entry[i]); + if (strncmp(hdr->signature, ACPI_SIG_DMAR, + sizeof(ACPI_SIG_DMAR) - 1) == 0) + return "dig_vtd"; + } #endif -} -#ifdef CONFIG_ACPI + return "dig"; +} +#endif /* CONFIG_IA64_GENERIC */ #define ACPI_MAX_PLATFORM_INTERRUPTS 256 @@ -160,27 +159,30 @@ int acpi_request_vector(u32 int_type) return vector; } -char *__acpi_map_table(unsigned long phys_addr, unsigned long size) +char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size) { return __va(phys_addr); } +void __init __acpi_unmap_table(char *map, unsigned long size) +{ +} + /* -------------------------------------------------------------------------- Boot-time Table Parsing -------------------------------------------------------------------------- */ -static int total_cpus __initdata; static int available_cpus __initdata; struct acpi_table_madt *acpi_madt __initdata; static u8 has_8259; static int __init -acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header, +acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, const unsigned long end) { - struct acpi_table_lapic_addr_ovr *lapic; + struct acpi_madt_local_apic_override *lapic; - lapic = (struct acpi_table_lapic_addr_ovr *)header; + lapic = (struct acpi_madt_local_apic_override *)header; if (BAD_MADT_ENTRY(lapic, end)) return -EINVAL; @@ -193,22 +195,19 @@ acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header, } static int __init -acpi_parse_lsapic(acpi_table_entry_header * header, const unsigned long end) +acpi_parse_lsapic(struct acpi_subtable_header * header, const unsigned long end) { - struct acpi_table_lsapic *lsapic; + struct acpi_madt_local_sapic *lsapic; - lsapic = (struct acpi_table_lsapic *)header; + lsapic = (struct acpi_madt_local_sapic *)header; - if (BAD_MADT_ENTRY(lsapic, end)) - return -EINVAL; + /*Skip BAD_MADT_ENTRY check, as lsapic size could vary */ - if (lsapic->flags.enabled) { + if (lsapic->lapic_flags & ACPI_MADT_ENABLED) { #ifdef CONFIG_SMP smp_boot_data.cpu_phys_id[available_cpus] = (lsapic->id << 8) | lsapic->eid; #endif - ia64_acpiid_to_sapicid[lsapic->acpi_id] = - (lsapic->id << 8) | lsapic->eid; ++available_cpus; } @@ -217,11 +216,11 @@ acpi_parse_lsapic(acpi_table_entry_header * header, const unsigned long end) } static int __init -acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end) +acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end) { - struct acpi_table_lapic_nmi *lacpi_nmi; + struct acpi_madt_local_apic_nmi *lacpi_nmi; - lacpi_nmi = (struct acpi_table_lapic_nmi *)header; + lacpi_nmi = (struct acpi_madt_local_apic_nmi *)header; if (BAD_MADT_ENTRY(lacpi_nmi, end)) return -EINVAL; @@ -231,11 +230,11 @@ acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end) } static int __init -acpi_parse_iosapic(acpi_table_entry_header * header, const unsigned long end) +acpi_parse_iosapic(struct acpi_subtable_header * header, const unsigned long end) { - struct acpi_table_iosapic *iosapic; + struct acpi_madt_io_sapic *iosapic; - iosapic = (struct acpi_table_iosapic *)header; + iosapic = (struct acpi_madt_io_sapic *)header; if (BAD_MADT_ENTRY(iosapic, end)) return -EINVAL; @@ -243,14 +242,16 @@ acpi_parse_iosapic(acpi_table_entry_header * header, const unsigned long end) return iosapic_init(iosapic->address, iosapic->global_irq_base); } +static unsigned int __initdata acpi_madt_rev; + static int __init -acpi_parse_plat_int_src(acpi_table_entry_header * header, +acpi_parse_plat_int_src(struct acpi_subtable_header * header, const unsigned long end) { - struct acpi_table_plat_int_src *plintsrc; + struct acpi_madt_interrupt_source *plintsrc; int vector; - plintsrc = (struct acpi_table_plat_int_src *)header; + plintsrc = (struct acpi_madt_interrupt_source *)header; if (BAD_MADT_ENTRY(plintsrc, end)) return -EINVAL; @@ -261,19 +262,19 @@ acpi_parse_plat_int_src(acpi_table_entry_header * header, */ vector = iosapic_register_platform_intr(plintsrc->type, plintsrc->global_irq, - plintsrc->iosapic_vector, + plintsrc->io_sapic_vector, plintsrc->eid, plintsrc->id, - (plintsrc->flags.polarity == - 1) ? IOSAPIC_POL_HIGH : - IOSAPIC_POL_LOW, - (plintsrc->flags.trigger == - 1) ? IOSAPIC_EDGE : - IOSAPIC_LEVEL); + ((plintsrc->inti_flags & ACPI_MADT_POLARITY_MASK) == + ACPI_MADT_POLARITY_ACTIVE_HIGH) ? + IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW, + ((plintsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) == + ACPI_MADT_TRIGGER_EDGE) ? + IOSAPIC_EDGE : IOSAPIC_LEVEL); platform_intr_list[plintsrc->type] = vector; if (acpi_madt_rev > 1) { - acpi_cpei_override = plintsrc->plint_flags.cpei_override_flag; + acpi_cpei_override = plintsrc->flags & ACPI_MADT_CPEI_OVERRIDE; } /* @@ -284,19 +285,24 @@ acpi_parse_plat_int_src(acpi_table_entry_header * header, return 0; } +#ifdef CONFIG_HOTPLUG_CPU unsigned int can_cpei_retarget(void) { extern int cpe_vector; + extern unsigned int force_cpei_retarget; /* * Only if CPEI is supported and the override flag * is present, otherwise return that its re-targettable * if we are in polling mode. */ - if (cpe_vector > 0 && !acpi_cpei_override) - return 0; - else - return 1; + if (cpe_vector > 0) { + if (acpi_cpei_override || force_cpei_retarget) + return 1; + else + return 0; + } + return 1; } unsigned int is_cpu_cpei_target(unsigned int cpu) @@ -315,6 +321,7 @@ void set_cpei_target_cpu(unsigned int cpu) { acpi_cpei_phys_cpuid = cpu_physical_id(cpu); } +#endif unsigned int get_cpei_target_cpu(void) { @@ -322,30 +329,32 @@ unsigned int get_cpei_target_cpu(void) } static int __init -acpi_parse_int_src_ovr(acpi_table_entry_header * header, +acpi_parse_int_src_ovr(struct acpi_subtable_header * header, const unsigned long end) { - struct acpi_table_int_src_ovr *p; + struct acpi_madt_interrupt_override *p; - p = (struct acpi_table_int_src_ovr *)header; + p = (struct acpi_madt_interrupt_override *)header; if (BAD_MADT_ENTRY(p, end)) return -EINVAL; - iosapic_override_isa_irq(p->bus_irq, p->global_irq, - (p->flags.polarity == - 1) ? IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW, - (p->flags.trigger == - 1) ? IOSAPIC_EDGE : IOSAPIC_LEVEL); + iosapic_override_isa_irq(p->source_irq, p->global_irq, + ((p->inti_flags & ACPI_MADT_POLARITY_MASK) == + ACPI_MADT_POLARITY_ACTIVE_LOW) ? + IOSAPIC_POL_LOW : IOSAPIC_POL_HIGH, + ((p->inti_flags & ACPI_MADT_TRIGGER_MASK) == + ACPI_MADT_TRIGGER_LEVEL) ? + IOSAPIC_LEVEL : IOSAPIC_EDGE); return 0; } static int __init -acpi_parse_nmi_src(acpi_table_entry_header * header, const unsigned long end) +acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end) { - struct acpi_table_nmi_src *nmi_src; + struct acpi_madt_nmi_source *nmi_src; - nmi_src = (struct acpi_table_nmi_src *)header; + nmi_src = (struct acpi_madt_nmi_source *)header; if (BAD_MADT_ENTRY(nmi_src, end)) return -EINVAL; @@ -369,12 +378,12 @@ static void __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) } } -static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size) +static int __init acpi_parse_madt(struct acpi_table_header *table) { - if (!phys_addr || !size) + if (!table) return -EINVAL; - acpi_madt = (struct acpi_table_madt *)__va(phys_addr); + acpi_madt = (struct acpi_table_madt *)table; acpi_madt_rev = acpi_madt->header.revision; @@ -382,14 +391,14 @@ static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size) #ifdef CONFIG_ITANIUM has_8259 = 1; /* Firmware on old Itanium systems is broken */ #else - has_8259 = acpi_madt->flags.pcat_compat; + has_8259 = acpi_madt->flags & ACPI_MADT_PCAT_COMPAT; #endif iosapic_system_init(has_8259); /* Get base address of IPI Message Block */ - if (acpi_madt->lapic_address) - ipi_base_addr = ioremap(acpi_madt->lapic_address, 0); + if (acpi_madt->address) + ipi_base_addr = ioremap(acpi_madt->address, 0); printk(KERN_INFO PREFIX "Local APIC address %p\n", ipi_base_addr); @@ -406,13 +415,34 @@ static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size) #define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32) static int __initdata srat_num_cpus; /* number of cpus */ -static u32 __devinitdata pxm_flag[PXM_FLAG_LEN]; +static u32 pxm_flag[PXM_FLAG_LEN]; #define pxm_bit_set(bit) (set_bit(bit,(void *)pxm_flag)) #define pxm_bit_test(bit) (test_bit(bit,(void *)pxm_flag)) -/* maps to convert between proximity domain and logical node ID */ -int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS]; -int __initdata nid_to_pxm_map[MAX_NUMNODES]; static struct acpi_table_slit __initdata *slit_table; +cpumask_t early_cpu_possible_map = CPU_MASK_NONE; + +static int __init +get_processor_proximity_domain(struct acpi_srat_cpu_affinity *pa) +{ + int pxm; + + pxm = pa->proximity_domain_lo; + if (ia64_platform_is("sn2") || acpi_srat_revision >= 2) + pxm += pa->proximity_domain_hi[0] << 8; + return pxm; +} + +static int __init +get_memory_proximity_domain(struct acpi_srat_mem_affinity *ma) +{ + int pxm; + + pxm = ma->proximity_domain; + if (!ia64_platform_is("sn2") && acpi_srat_revision <= 1) + pxm &= 0xff; + + return pxm; +} /* * ACPI 2.0 SLIT (System Locality Information Table) @@ -423,48 +453,59 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) u32 len; len = sizeof(struct acpi_table_header) + 8 - + slit->localities * slit->localities; + + slit->locality_count * slit->locality_count; if (slit->header.length != len) { printk(KERN_ERR "ACPI 2.0 SLIT: size mismatch: %d expected, %d actual\n", len, slit->header.length); - memset(numa_slit, 10, sizeof(numa_slit)); return; } slit_table = slit; } void __init -acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { + int pxm; + + if (!(pa->flags & ACPI_SRAT_CPU_ENABLED)) + return; + + if (srat_num_cpus >= ARRAY_SIZE(node_cpuid)) { + printk_once(KERN_WARNING + "node_cpuid[%ld] is too small, may not be able to use all cpus\n", + ARRAY_SIZE(node_cpuid)); + return; + } + pxm = get_processor_proximity_domain(pa); + /* record this node in proximity bitmap */ - pxm_bit_set(pa->proximity_domain); + pxm_bit_set(pxm); node_cpuid[srat_num_cpus].phys_id = - (pa->apic_id << 8) | (pa->lsapic_eid); + (pa->apic_id << 8) | (pa->local_sapic_eid); /* nid should be overridden as logical node id later */ - node_cpuid[srat_num_cpus].nid = pa->proximity_domain; + node_cpuid[srat_num_cpus].nid = pxm; + cpu_set(srat_num_cpus, early_cpu_possible_map); srat_num_cpus++; } -void __init -acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) +int __init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { unsigned long paddr, size; - u8 pxm; + int pxm; struct node_memblk_s *p, *q, *pend; - pxm = ma->proximity_domain; + pxm = get_memory_proximity_domain(ma); /* fill node memory chunk structure */ - paddr = ma->base_addr_hi; - paddr = (paddr << 32) | ma->base_addr_lo; - size = ma->length_hi; - size = (size << 32) | ma->length_lo; + paddr = ma->base_address; + size = ma->length; /* Ignore disabled entries */ - if (!ma->flags.enabled) - return; + if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) + return -1; /* record this node in proximity bitmap */ pxm_bit_set(pxm); @@ -483,6 +524,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) p->size = size; p->nid = pxm; num_node_memblks++; + return 0; } void __init acpi_numa_arch_fixup(void) @@ -500,22 +542,17 @@ void __init acpi_numa_arch_fixup(void) * MCD - This can probably be dropped now. No need for pxm ID to node ID * mapping with sparse node numbering iff MAX_PXM_DOMAINS <= MAX_NUMNODES. */ - /* calculate total number of nodes in system from PXM bitmap */ - memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map)); - memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map)); nodes_clear(node_online_map); for (i = 0; i < MAX_PXM_DOMAINS; i++) { if (pxm_bit_test(i)) { - int nid = num_online_nodes(); - pxm_to_nid_map[i] = nid; - nid_to_pxm_map[nid] = i; + int nid = acpi_map_pxm_to_node(i); node_set_online(nid); } } /* set logical node id in memory chunk structure */ for (i = 0; i < num_node_memblks; i++) - node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid]; + node_memblk[i].nid = pxm_to_node(node_memblk[i].nid); /* assign memory bank numbers for each chunk on each node */ for_each_online_node(i) { @@ -528,27 +565,33 @@ void __init acpi_numa_arch_fixup(void) } /* set logical node id in cpu structure */ - for (i = 0; i < srat_num_cpus; i++) - node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].nid]; + for_each_possible_early_cpu(i) + node_cpuid[i].nid = pxm_to_node(node_cpuid[i].nid); printk(KERN_INFO "Number of logical nodes in system = %d\n", num_online_nodes()); printk(KERN_INFO "Number of memory chunks in system = %d\n", num_node_memblks); - if (!slit_table) + if (!slit_table) { + for (i = 0; i < MAX_NUMNODES; i++) + for (j = 0; j < MAX_NUMNODES; j++) + node_distance(i, j) = i == j ? LOCAL_DISTANCE : + REMOTE_DISTANCE; return; + } + memset(numa_slit, -1, sizeof(numa_slit)); - for (i = 0; i < slit_table->localities; i++) { + for (i = 0; i < slit_table->locality_count; i++) { if (!pxm_bit_test(i)) continue; - node_from = pxm_to_nid_map[i]; - for (j = 0; j < slit_table->localities; j++) { + node_from = pxm_to_node(i); + for (j = 0; j < slit_table->locality_count; j++) { if (!pxm_bit_test(j)) continue; - node_to = pxm_to_nid_map[j]; + node_to = pxm_to_node(j); node_distance(node_from, node_to) = - slit_table->entry[i * slit_table->localities + j]; + slit_table->entry[i * slit_table->locality_count + j]; } } @@ -567,63 +610,91 @@ void __init acpi_numa_arch_fixup(void) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(u32 gsi, int edge_level, int active_high_low) +int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { + if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM) + return gsi; + if (has_8259 && gsi < 16) return isa_irq_to_vector(gsi); return iosapic_register_intr(gsi, - (active_high_low == + (polarity == ACPI_ACTIVE_HIGH) ? IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW, - (edge_level == + (triggering == ACPI_EDGE_SENSITIVE) ? IOSAPIC_EDGE : IOSAPIC_LEVEL); } - -EXPORT_SYMBOL(acpi_register_gsi); +EXPORT_SYMBOL_GPL(acpi_register_gsi); void acpi_unregister_gsi(u32 gsi) { + if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM) + return; + + if (has_8259 && gsi < 16) + return; + iosapic_unregister_intr(gsi); } +EXPORT_SYMBOL_GPL(acpi_unregister_gsi); -EXPORT_SYMBOL(acpi_unregister_gsi); - -static int __init acpi_parse_fadt(unsigned long phys_addr, unsigned long size) +static int __init acpi_parse_fadt(struct acpi_table_header *table) { struct acpi_table_header *fadt_header; - struct fadt_descriptor_rev2 *fadt; + struct acpi_table_fadt *fadt; - if (!phys_addr || !size) + if (!table) return -EINVAL; - fadt_header = (struct acpi_table_header *)__va(phys_addr); + fadt_header = (struct acpi_table_header *)table; if (fadt_header->revision != 3) return -ENODEV; /* Only deal with ACPI 2.0 FADT */ - fadt = (struct fadt_descriptor_rev2 *)fadt_header; - - if (!(fadt->iapc_boot_arch & BAF_8042_KEYBOARD_CONTROLLER)) - acpi_kbd_controller_present = 0; - - if (fadt->iapc_boot_arch & BAF_LEGACY_DEVICES) - acpi_legacy_devices = 1; + fadt = (struct acpi_table_fadt *)fadt_header; - acpi_register_gsi(fadt->sci_int, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); + acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, + ACPI_ACTIVE_LOW); return 0; } -unsigned long __init acpi_find_rsdp(void) +int __init early_acpi_boot_init(void) { - unsigned long rsdp_phys = 0; + int ret; - if (efi.acpi20) - rsdp_phys = __pa(efi.acpi20); - else if (efi.acpi) - printk(KERN_WARNING PREFIX - "v1.0/r0.71 tables no longer supported\n"); - return rsdp_phys; + /* + * do a partial walk of MADT to determine how many CPUs + * we have including offline CPUs + */ + if (acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) { + printk(KERN_ERR PREFIX "Can't find MADT\n"); + return 0; + } + + ret = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, + acpi_parse_lsapic, NR_CPUS); + if (ret < 1) + printk(KERN_ERR PREFIX + "Error parsing MADT - no LAPIC entries\n"); + else + acpi_lapic = 1; + +#ifdef CONFIG_SMP + if (available_cpus == 0) { + printk(KERN_INFO "ACPI: Found 0 CPUS; assuming 1\n"); + printk(KERN_INFO "CPU 0 (0x%04x)", hard_smp_processor_id()); + smp_boot_data.cpu_phys_id[available_cpus] = + hard_smp_processor_id(); + available_cpus = 1; /* We've got at least one of these, no? */ + } + smp_boot_data.cpu_count = available_cpus; +#endif + /* Make boot-up look pretty */ + printk(KERN_INFO "%d CPUs available, %d CPUs total\n", available_cpus, + total_cpus); + + return 0; } int __init acpi_boot_init(void) @@ -637,7 +708,7 @@ int __init acpi_boot_init(void) * information -- the successor to MPS tables. */ - if (acpi_table_parse(ACPI_APIC, acpi_parse_madt) < 1) { + if (acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) { printk(KERN_ERR PREFIX "Can't find MADT\n"); goto skip_madt; } @@ -645,40 +716,37 @@ int __init acpi_boot_init(void) /* Local APIC */ if (acpi_table_parse_madt - (ACPI_MADT_LAPIC_ADDR_OVR, acpi_parse_lapic_addr_ovr, 0) < 0) + (ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, acpi_parse_lapic_addr_ovr, 0) < 0) printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n"); - if (acpi_table_parse_madt(ACPI_MADT_LSAPIC, acpi_parse_lsapic, NR_CPUS) - < 1) - printk(KERN_ERR PREFIX - "Error parsing MADT - no LAPIC entries\n"); - - if (acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0) + if (acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0) < 0) printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); /* I/O APIC */ if (acpi_table_parse_madt - (ACPI_MADT_IOSAPIC, acpi_parse_iosapic, NR_IOSAPICS) < 1) - printk(KERN_ERR PREFIX - "Error parsing MADT - no IOSAPIC entries\n"); + (ACPI_MADT_TYPE_IO_SAPIC, acpi_parse_iosapic, NR_IOSAPICS) < 1) { + if (!ia64_platform_is("sn2")) + printk(KERN_ERR PREFIX + "Error parsing MADT - no IOSAPIC entries\n"); + } /* System-Level Interrupt Routing */ if (acpi_table_parse_madt - (ACPI_MADT_PLAT_INT_SRC, acpi_parse_plat_int_src, + (ACPI_MADT_TYPE_INTERRUPT_SOURCE, acpi_parse_plat_int_src, ACPI_MAX_PLATFORM_INTERRUPTS) < 0) printk(KERN_ERR PREFIX "Error parsing platform interrupt source entry\n"); if (acpi_table_parse_madt - (ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, 0) < 0) + (ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, 0) < 0) printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n"); - if (acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, 0) < 0) + if (acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, 0) < 0) printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); skip_madt: @@ -688,21 +756,11 @@ int __init acpi_boot_init(void) * gets interrupts such as power and sleep buttons. If it's not * on a Legacy interrupt, it needs to be setup. */ - if (acpi_table_parse(ACPI_FADT, acpi_parse_fadt) < 1) + if (acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt)) printk(KERN_ERR PREFIX "Can't find FADT\n"); +#ifdef CONFIG_ACPI_NUMA #ifdef CONFIG_SMP - if (available_cpus == 0) { - printk(KERN_INFO "ACPI: Found 0 CPUS; assuming 1\n"); - printk(KERN_INFO "CPU 0 (0x%04x)", hard_smp_processor_id()); - smp_boot_data.cpu_phys_id[available_cpus] = - hard_smp_processor_id(); - available_cpus = 1; /* We've got at least one of these, no? */ - } - smp_boot_data.cpu_count = available_cpus; - - smp_build_cpu_map(); -# ifdef CONFIG_ACPI_NUMA if (srat_num_cpus == 0) { int cpu, i = 1; for (cpu = 0; cpu < smp_boot_data.cpu_count; cpu++) @@ -711,121 +769,140 @@ int __init acpi_boot_init(void) node_cpuid[i++].phys_id = smp_boot_data.cpu_phys_id[cpu]; } -# endif #endif -#ifdef CONFIG_ACPI_NUMA build_cpu_to_node_map(); #endif - /* Make boot-up look pretty */ - printk(KERN_INFO "%d CPUs available, %d CPUs total\n", available_cpus, - total_cpus); return 0; } int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) { - int vector; + int tmp; if (has_8259 && gsi < 16) *irq = isa_irq_to_vector(gsi); else { - vector = gsi_to_vector(gsi); - if (vector == -1) + tmp = gsi_to_irq(gsi); + if (tmp == -1) return -1; - - *irq = vector; + *irq = tmp; } return 0; } +int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) +{ + if (isa_irq >= 16) + return -1; + *gsi = isa_irq; + return 0; +} + /* * ACPI based hotplug CPU support */ #ifdef CONFIG_ACPI_HOTPLUG_CPU -static -int acpi_map_cpu2node(acpi_handle handle, int cpu, long physid) +static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA - int pxm_id; - - pxm_id = acpi_get_pxm(handle); - /* - * Assuming that the container driver would have set the proximity - * domain and would have initialized pxm_to_nid_map[pxm_id] && pxm_flag + * We don't have cpu-only-node hotadd. But if the system equips + * SRAT table, pxm is already found and node is ready. + * So, just pxm_to_nid(pxm) is OK. + * This code here is for the system which doesn't have full SRAT + * table for possible cpus. */ - node_cpuid[cpu].nid = (pxm_id < 0) ? 0 : pxm_to_nid_map[pxm_id]; - node_cpuid[cpu].phys_id = physid; + node_cpuid[cpu].nid = acpi_get_node(handle); #endif - return (0); + return 0; } -int acpi_map_lsapic(acpi_handle handle, int *pcpu) +int additional_cpus __initdata = -1; + +static __init int setup_additional_cpus(char *s) { - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - union acpi_object *obj; - struct acpi_table_lsapic *lsapic; - cpumask_t tmp_map; - long physid; - int cpu; + if (s) + additional_cpus = simple_strtol(s, NULL, 0); - if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) - return -EINVAL; + return 0; +} - if (!buffer.length || !buffer.pointer) - return -EINVAL; +early_param("additional_cpus", setup_additional_cpus); - obj = buffer.pointer; - if (obj->type != ACPI_TYPE_BUFFER || - obj->buffer.length < sizeof(*lsapic)) { - acpi_os_free(buffer.pointer); - return -EINVAL; - } +/* + * cpu_possible_mask should be static, it cannot change as CPUs + * are onlined, or offlined. The reason is per-cpu data-structures + * are allocated by some modules at init time, and dont expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_mask on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * - Ashok Raj + * + * Three ways to find out the number of additional hotplug CPUs: + * - If the BIOS specified disabled CPUs in ACPI/mptables use that. + * - The user can overwrite it with additional_cpus=NUM + * - Otherwise don't reserve additional CPUs. + */ +__init void prefill_possible_map(void) +{ + int i; + int possible, disabled_cpus; - lsapic = (struct acpi_table_lsapic *)obj->buffer.pointer; + disabled_cpus = total_cpus - available_cpus; - if ((lsapic->header.type != ACPI_MADT_LSAPIC) || - (!lsapic->flags.enabled)) { - acpi_os_free(buffer.pointer); - return -EINVAL; - } + if (additional_cpus == -1) { + if (disabled_cpus > 0) + additional_cpus = disabled_cpus; + else + additional_cpus = 0; + } - physid = ((lsapic->id << 8) | (lsapic->eid)); + possible = available_cpus + additional_cpus; - acpi_os_free(buffer.pointer); - buffer.length = ACPI_ALLOCATE_BUFFER; - buffer.pointer = NULL; + if (possible > nr_cpu_ids) + possible = nr_cpu_ids; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); - if (cpu >= NR_CPUS) + printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", + possible, max((possible - available_cpus), 0)); + + for (i = 0; i < possible; i++) + set_cpu_possible(i, true); +} + +static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) +{ + cpumask_t tmp_map; + int cpu; + + cpumask_complement(&tmp_map, cpu_present_mask); + cpu = cpumask_first(&tmp_map); + if (cpu >= nr_cpu_ids) return -EINVAL; acpi_map_cpu2node(handle, cpu, physid); - cpu_set(cpu, cpu_present_map); + set_cpu_present(cpu, true); ia64_cpu_to_sapicid[cpu] = physid; - ia64_acpiid_to_sapicid[lsapic->acpi_id] = ia64_cpu_to_sapicid[cpu]; + + acpi_processor_set_pdc(handle); *pcpu = cpu; return (0); } +/* wrapper to silence section mismatch warning */ +int __ref acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) +{ + return _acpi_map_lsapic(handle, physid, pcpu); +} EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { - int i; - - for (i = 0; i < MAX_SAPICS; i++) { - if (ia64_acpiid_to_sapicid[i] == ia64_cpu_to_sapicid[cpu]) { - ia64_acpiid_to_sapicid[i] = -1; - break; - } - } ia64_cpu_to_sapicid[cpu] = -1; - cpu_clear(cpu, cpu_present_map); + set_cpu_present(cpu, false); #ifdef CONFIG_ACPI_NUMA /* NUMA specific cleanup's */ @@ -838,14 +915,14 @@ EXPORT_SYMBOL(acpi_unmap_lsapic); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ #ifdef CONFIG_ACPI_NUMA -static acpi_status __devinit -acpi_map_iosapic(acpi_handle handle, u32 depth, void *context, void **ret) +static acpi_status acpi_map_iosapic(acpi_handle handle, u32 depth, + void *context, void **ret) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *obj; - struct acpi_table_iosapic *iosapic; + struct acpi_madt_io_sapic *iosapic; unsigned int gsi_base; - int pxm, node; + int node; /* Only care about objects w/ a method that returns the MADT */ if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) @@ -857,33 +934,25 @@ acpi_map_iosapic(acpi_handle handle, u32 depth, void *context, void **ret) obj = buffer.pointer; if (obj->type != ACPI_TYPE_BUFFER || obj->buffer.length < sizeof(*iosapic)) { - acpi_os_free(buffer.pointer); + kfree(buffer.pointer); return AE_OK; } - iosapic = (struct acpi_table_iosapic *)obj->buffer.pointer; + iosapic = (struct acpi_madt_io_sapic *)obj->buffer.pointer; - if (iosapic->header.type != ACPI_MADT_IOSAPIC) { - acpi_os_free(buffer.pointer); + if (iosapic->header.type != ACPI_MADT_TYPE_IO_SAPIC) { + kfree(buffer.pointer); return AE_OK; } gsi_base = iosapic->global_irq_base; - acpi_os_free(buffer.pointer); - - /* - * OK, it's an IOSAPIC MADT entry, look for a _PXM value to tell - * us which node to associate this with. - */ - pxm = acpi_get_pxm(handle); - if (pxm < 0) - return AE_OK; - - node = pxm_to_nid_map[pxm]; + kfree(buffer.pointer); - if (node >= MAX_NUMNODES || !node_online(node) || - cpus_empty(node_to_cpumask(node))) + /* OK, it's an IOSAPIC MADT entry; associate it with a node */ + node = acpi_get_node(handle); + if (node == NUMA_NO_NODE || !node_online(node) || + cpumask_empty(cpumask_of_node(node))) return AE_OK; /* We know a gsi to node mapping! */ @@ -901,7 +970,7 @@ acpi_map_iosapics (void) fs_initcall(acpi_map_iosapics); #endif /* CONFIG_ACPI_NUMA */ -int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) +int __ref acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) { int err; @@ -924,4 +993,9 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) EXPORT_SYMBOL(acpi_unregister_ioapic); -#endif /* CONFIG_ACPI */ +/* + * acpi_suspend_lowlevel() - save kernel state and suspend. + * + * TBD when when IA64 starts to support suspend... + */ +int acpi_suspend_lowlevel(void) { return 0; } diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index 77225659e96..60ef83e6db7 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -5,22 +5,19 @@ */ #define ASM_OFFSETS_C 1 -#include <linux/config.h> #include <linux/sched.h> - -#include <asm-ia64/processor.h> -#include <asm-ia64/ptrace.h> -#include <asm-ia64/siginfo.h> -#include <asm-ia64/sigcontext.h> -#include <asm-ia64/mca.h> +#include <linux/pid.h> +#include <linux/clocksource.h> +#include <linux/kbuild.h> +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/siginfo.h> +#include <asm/sigcontext.h> +#include <asm/mca.h> #include "../kernel/sigframe.h" - -#define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) - -#define BLANK() asm volatile("\n->" : : ) +#include "../kernel/fsyscall_gtod_data.h" void foo(void) { @@ -33,16 +30,29 @@ void foo(void) DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe)); DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info)); + BUILD_BUG_ON(sizeof(struct upid) != 32); + DEFINE(IA64_UPID_SHIFT, 5); + BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); + DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + DEFINE(TI_AC_STAMP, offsetof(struct thread_info, ac_stamp)); + DEFINE(TI_AC_LEAVE, offsetof(struct thread_info, ac_leave)); + DEFINE(TI_AC_STIME, offsetof(struct thread_info, ac_stime)); + DEFINE(TI_AC_UTIME, offsetof(struct thread_info, ac_utime)); +#endif BLANK(); DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked)); DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader)); + DEFINE(IA64_TASK_TGIDLINK_OFFSET, offsetof (struct task_struct, pids[PIDTYPE_PID].pid)); + DEFINE(IA64_PID_LEVEL_OFFSET, offsetof (struct pid, level)); + DEFINE(IA64_PID_UPID_OFFSET, offsetof (struct pid, numbers[0])); DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending)); DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid)); DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent)); @@ -217,16 +227,24 @@ void foo(void) DEFINE(IA64_MCA_CPU_INIT_STACK_OFFSET, offsetof (struct ia64_mca_cpu, init_stack)); BLANK(); - DEFINE(IA64_SAL_OS_STATE_COMMON_OFFSET, - offsetof (struct ia64_sal_os_state, sal_ra)); DEFINE(IA64_SAL_OS_STATE_OS_GP_OFFSET, offsetof (struct ia64_sal_os_state, os_gp)); - DEFINE(IA64_SAL_OS_STATE_PAL_MIN_STATE_OFFSET, - offsetof (struct ia64_sal_os_state, pal_min_state)); DEFINE(IA64_SAL_OS_STATE_PROC_STATE_PARAM_OFFSET, offsetof (struct ia64_sal_os_state, proc_state_param)); + DEFINE(IA64_SAL_OS_STATE_SAL_RA_OFFSET, + offsetof (struct ia64_sal_os_state, sal_ra)); + DEFINE(IA64_SAL_OS_STATE_SAL_GP_OFFSET, + offsetof (struct ia64_sal_os_state, sal_gp)); + DEFINE(IA64_SAL_OS_STATE_PAL_MIN_STATE_OFFSET, + offsetof (struct ia64_sal_os_state, pal_min_state)); + DEFINE(IA64_SAL_OS_STATE_OS_STATUS_OFFSET, + offsetof (struct ia64_sal_os_state, os_status)); + DEFINE(IA64_SAL_OS_STATE_CONTEXT_OFFSET, + offsetof (struct ia64_sal_os_state, context)); DEFINE(IA64_SAL_OS_STATE_SIZE, sizeof (struct ia64_sal_os_state)); + BLANK(); + DEFINE(IA64_PMSA_GR_OFFSET, offsetof (struct pal_min_state_area_s, pmsa_gr)); DEFINE(IA64_PMSA_BANK1_GR_OFFSET, @@ -248,17 +266,25 @@ void foo(void) BLANK(); /* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */ - DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr)); - DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source)); - DEFINE(IA64_TIME_INTERPOLATOR_SHIFT_OFFSET, offsetof (struct time_interpolator, shift)); - DEFINE(IA64_TIME_INTERPOLATOR_NSEC_OFFSET, offsetof (struct time_interpolator, nsec_per_cyc)); - DEFINE(IA64_TIME_INTERPOLATOR_OFFSET_OFFSET, offsetof (struct time_interpolator, offset)); - DEFINE(IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET, offsetof (struct time_interpolator, last_cycle)); - DEFINE(IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET, offsetof (struct time_interpolator, last_counter)); - DEFINE(IA64_TIME_INTERPOLATOR_JITTER_OFFSET, offsetof (struct time_interpolator, jitter)); - DEFINE(IA64_TIME_INTERPOLATOR_MASK_OFFSET, offsetof (struct time_interpolator, mask)); - DEFINE(IA64_TIME_SOURCE_CPU, TIME_SOURCE_CPU); - DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64); - DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32); - DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec)); + DEFINE(IA64_GTOD_SEQ_OFFSET, + offsetof (struct fsyscall_gtod_data_t, seq)); + DEFINE(IA64_GTOD_WALL_TIME_OFFSET, + offsetof (struct fsyscall_gtod_data_t, wall_time)); + DEFINE(IA64_GTOD_MONO_TIME_OFFSET, + offsetof (struct fsyscall_gtod_data_t, monotonic_time)); + DEFINE(IA64_CLKSRC_MASK_OFFSET, + offsetof (struct fsyscall_gtod_data_t, clk_mask)); + DEFINE(IA64_CLKSRC_MULT_OFFSET, + offsetof (struct fsyscall_gtod_data_t, clk_mult)); + DEFINE(IA64_CLKSRC_SHIFT_OFFSET, + offsetof (struct fsyscall_gtod_data_t, clk_shift)); + DEFINE(IA64_CLKSRC_MMIO_OFFSET, + offsetof (struct fsyscall_gtod_data_t, clk_fsys_mmio)); + DEFINE(IA64_CLKSRC_CYCLE_LAST_OFFSET, + offsetof (struct fsyscall_gtod_data_t, clk_cycle_last)); + DEFINE(IA64_ITC_JITTER_OFFSET, + offsetof (struct itc_jitter_data_t, itc_jitter)); + DEFINE(IA64_ITC_LASTCYCLE_OFFSET, + offsetof (struct itc_jitter_data_t, itc_lastcycle)); + } diff --git a/arch/ia64/kernel/audit.c b/arch/ia64/kernel/audit.c new file mode 100644 index 00000000000..96a9d18ff4c --- /dev/null +++ b/arch/ia64/kernel/audit.c @@ -0,0 +1,60 @@ +#include <linux/init.h> +#include <linux/types.h> +#include <linux/audit.h> +#include <asm/unistd.h> + +static unsigned dir_class[] = { +#include <asm-generic/audit_dir_write.h> +~0U +}; + +static unsigned read_class[] = { +#include <asm-generic/audit_read.h> +~0U +}; + +static unsigned write_class[] = { +#include <asm-generic/audit_write.h> +~0U +}; + +static unsigned chattr_class[] = { +#include <asm-generic/audit_change_attr.h> +~0U +}; + +static unsigned signal_class[] = { +#include <asm-generic/audit_signal.h> +~0U +}; + +int audit_classify_arch(int arch) +{ + return 0; +} + +int audit_classify_syscall(int abi, unsigned syscall) +{ + switch(syscall) { + case __NR_open: + return 2; + case __NR_openat: + return 3; + case __NR_execve: + return 5; + default: + return 0; + } +} + +static int __init audit_classes_init(void) +{ + audit_register_class(AUDIT_CLASS_WRITE, write_class); + audit_register_class(AUDIT_CLASS_READ, read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); + audit_register_class(AUDIT_CLASS_CHATTR, chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL, signal_class); + return 0; +} + +__initcall(audit_classes_init); diff --git a/arch/ia64/kernel/cpufreq/Kconfig b/arch/ia64/kernel/cpufreq/Kconfig deleted file mode 100644 index 2d9d5279b98..00000000000 --- a/arch/ia64/kernel/cpufreq/Kconfig +++ /dev/null @@ -1,29 +0,0 @@ - -# -# CPU Frequency scaling -# - -menu "CPU Frequency scaling" - -source "drivers/cpufreq/Kconfig" - -if CPU_FREQ - -comment "CPUFreq processor drivers" - -config IA64_ACPI_CPUFREQ - tristate "ACPI Processor P-States driver" - select CPU_FREQ_TABLE - depends on ACPI_PROCESSOR - help - This driver adds a CPUFreq driver which utilizes the ACPI - Processor Performance States. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -endif # CPU_FREQ - -endmenu - diff --git a/arch/ia64/kernel/cpufreq/Makefile b/arch/ia64/kernel/cpufreq/Makefile deleted file mode 100644 index f748d34c02f..00000000000 --- a/arch/ia64/kernel/cpufreq/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_IA64_ACPI_CPUFREQ) += acpi-cpufreq.o diff --git a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c deleted file mode 100644 index da4d5cf80a4..00000000000 --- a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c +++ /dev/null @@ -1,499 +0,0 @@ -/* - * arch/ia64/kernel/cpufreq/acpi-cpufreq.c - * This file provides the ACPI based P-state support. This - * module works with generic cpufreq infrastructure. Most of - * the code is based on i386 version - * (arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c) - * - * Copyright (C) 2005 Intel Corp - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> - */ - -#include <linux/config.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <asm/io.h> -#include <asm/uaccess.h> -#include <asm/pal.h> - -#include <linux/acpi.h> -#include <acpi/processor.h> - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg) - -MODULE_AUTHOR("Venkatesh Pallipadi"); -MODULE_DESCRIPTION("ACPI Processor P-States Driver"); -MODULE_LICENSE("GPL"); - - -struct cpufreq_acpi_io { - struct acpi_processor_performance acpi_data; - struct cpufreq_frequency_table *freq_table; - unsigned int resume; -}; - -static struct cpufreq_acpi_io *acpi_io_data[NR_CPUS]; - -static struct cpufreq_driver acpi_cpufreq_driver; - - -static int -processor_set_pstate ( - u32 value) -{ - s64 retval; - - dprintk("processor_set_pstate\n"); - - retval = ia64_pal_set_pstate((u64)value); - - if (retval) { - dprintk("Failed to set freq to 0x%x, with error 0x%x\n", - value, retval); - return -ENODEV; - } - return (int)retval; -} - - -static int -processor_get_pstate ( - u32 *value) -{ - u64 pstate_index = 0; - s64 retval; - - dprintk("processor_get_pstate\n"); - - retval = ia64_pal_get_pstate(&pstate_index); - *value = (u32) pstate_index; - - if (retval) - dprintk("Failed to get current freq with " - "error 0x%x, idx 0x%x\n", retval, *value); - - return (int)retval; -} - - -/* To be used only after data->acpi_data is initialized */ -static unsigned -extract_clock ( - struct cpufreq_acpi_io *data, - unsigned value, - unsigned int cpu) -{ - unsigned long i; - - dprintk("extract_clock\n"); - - for (i = 0; i < data->acpi_data.state_count; i++) { - if (value >= data->acpi_data.states[i].control) - return data->acpi_data.states[i].core_frequency; - } - return data->acpi_data.states[i-1].core_frequency; -} - - -static unsigned int -processor_get_freq ( - struct cpufreq_acpi_io *data, - unsigned int cpu) -{ - int ret = 0; - u32 value = 0; - cpumask_t saved_mask; - unsigned long clock_freq; - - dprintk("processor_get_freq\n"); - - saved_mask = current->cpus_allowed; - set_cpus_allowed(current, cpumask_of_cpu(cpu)); - if (smp_processor_id() != cpu) { - ret = -EAGAIN; - goto migrate_end; - } - - /* - * processor_get_pstate gets the average frequency since the - * last get. So, do two PAL_get_freq()... - */ - ret = processor_get_pstate(&value); - ret = processor_get_pstate(&value); - - if (ret) { - set_cpus_allowed(current, saved_mask); - printk(KERN_WARNING "get performance failed with error %d\n", - ret); - ret = -EAGAIN; - goto migrate_end; - } - clock_freq = extract_clock(data, value, cpu); - ret = (clock_freq*1000); - -migrate_end: - set_cpus_allowed(current, saved_mask); - return ret; -} - - -static int -processor_set_freq ( - struct cpufreq_acpi_io *data, - unsigned int cpu, - int state) -{ - int ret = 0; - u32 value = 0; - struct cpufreq_freqs cpufreq_freqs; - cpumask_t saved_mask; - int retval; - - dprintk("processor_set_freq\n"); - - saved_mask = current->cpus_allowed; - set_cpus_allowed(current, cpumask_of_cpu(cpu)); - if (smp_processor_id() != cpu) { - retval = -EAGAIN; - goto migrate_end; - } - - if (state == data->acpi_data.state) { - if (unlikely(data->resume)) { - dprintk("Called after resume, resetting to P%d\n", state); - data->resume = 0; - } else { - dprintk("Already at target state (P%d)\n", state); - retval = 0; - goto migrate_end; - } - } - - dprintk("Transitioning from P%d to P%d\n", - data->acpi_data.state, state); - - /* cpufreq frequency struct */ - cpufreq_freqs.cpu = cpu; - cpufreq_freqs.old = data->freq_table[data->acpi_data.state].frequency; - cpufreq_freqs.new = data->freq_table[state].frequency; - - /* notify cpufreq */ - cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE); - - /* - * First we write the target state's 'control' value to the - * control_register. - */ - - value = (u32) data->acpi_data.states[state].control; - - dprintk("Transitioning to state: 0x%08x\n", value); - - ret = processor_set_pstate(value); - if (ret) { - unsigned int tmp = cpufreq_freqs.new; - cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE); - cpufreq_freqs.new = cpufreq_freqs.old; - cpufreq_freqs.old = tmp; - cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE); - cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE); - printk(KERN_WARNING "Transition failed with error %d\n", ret); - retval = -ENODEV; - goto migrate_end; - } - - cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE); - - data->acpi_data.state = state; - - retval = 0; - -migrate_end: - set_cpus_allowed(current, saved_mask); - return (retval); -} - - -static unsigned int -acpi_cpufreq_get ( - unsigned int cpu) -{ - struct cpufreq_acpi_io *data = acpi_io_data[cpu]; - - dprintk("acpi_cpufreq_get\n"); - - return processor_get_freq(data, cpu); -} - - -static int -acpi_cpufreq_target ( - struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; - unsigned int next_state = 0; - unsigned int result = 0; - - dprintk("acpi_cpufreq_setpolicy\n"); - - result = cpufreq_frequency_table_target(policy, - data->freq_table, target_freq, relation, &next_state); - if (result) - return (result); - - result = processor_set_freq(data, policy->cpu, next_state); - - return (result); -} - - -static int -acpi_cpufreq_verify ( - struct cpufreq_policy *policy) -{ - unsigned int result = 0; - struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; - - dprintk("acpi_cpufreq_verify\n"); - - result = cpufreq_frequency_table_verify(policy, - data->freq_table); - - return (result); -} - - -/* - * processor_init_pdc - let BIOS know about the SMP capabilities - * of this driver - * @perf: processor-specific acpi_io_data struct - * @cpu: CPU being initialized - * - * To avoid issues with legacy OSes, some BIOSes require to be informed of - * the SMP capabilities of OS P-state driver. Here we set the bits in _PDC - * accordingly. Actual call to _PDC is done in driver/acpi/processor.c - */ -static void -processor_init_pdc ( - struct acpi_processor_performance *perf, - unsigned int cpu, - struct acpi_object_list *obj_list - ) -{ - union acpi_object *obj; - u32 *buf; - - dprintk("processor_init_pdc\n"); - - perf->pdc = NULL; - /* Initialize pdc. It will be used later. */ - if (!obj_list) - return; - - if (!(obj_list->count && obj_list->pointer)) - return; - - obj = obj_list->pointer; - if ((obj->buffer.length == 12) && obj->buffer.pointer) { - buf = (u32 *)obj->buffer.pointer; - buf[0] = ACPI_PDC_REVISION_ID; - buf[1] = 1; - buf[2] = ACPI_PDC_EST_CAPABILITY_SMP; - perf->pdc = obj_list; - } - return; -} - - -static int -acpi_cpufreq_cpu_init ( - struct cpufreq_policy *policy) -{ - unsigned int i; - unsigned int cpu = policy->cpu; - struct cpufreq_acpi_io *data; - unsigned int result = 0; - - union acpi_object arg0 = {ACPI_TYPE_BUFFER}; - u32 arg0_buf[3]; - struct acpi_object_list arg_list = {1, &arg0}; - - dprintk("acpi_cpufreq_cpu_init\n"); - /* setup arg_list for _PDC settings */ - arg0.buffer.length = 12; - arg0.buffer.pointer = (u8 *) arg0_buf; - - data = kmalloc(sizeof(struct cpufreq_acpi_io), GFP_KERNEL); - if (!data) - return (-ENOMEM); - - memset(data, 0, sizeof(struct cpufreq_acpi_io)); - - acpi_io_data[cpu] = data; - - processor_init_pdc(&data->acpi_data, cpu, &arg_list); - result = acpi_processor_register_performance(&data->acpi_data, cpu); - data->acpi_data.pdc = NULL; - - if (result) - goto err_free; - - /* capability check */ - if (data->acpi_data.state_count <= 1) { - dprintk("No P-States\n"); - result = -ENODEV; - goto err_unreg; - } - - if ((data->acpi_data.control_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE) || - (data->acpi_data.status_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE)) { - dprintk("Unsupported address space [%d, %d]\n", - (u32) (data->acpi_data.control_register.space_id), - (u32) (data->acpi_data.status_register.space_id)); - result = -ENODEV; - goto err_unreg; - } - - /* alloc freq_table */ - data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) * - (data->acpi_data.state_count + 1), - GFP_KERNEL); - if (!data->freq_table) { - result = -ENOMEM; - goto err_unreg; - } - - /* detect transition latency */ - policy->cpuinfo.transition_latency = 0; - for (i=0; i<data->acpi_data.state_count; i++) { - if ((data->acpi_data.states[i].transition_latency * 1000) > - policy->cpuinfo.transition_latency) { - policy->cpuinfo.transition_latency = - data->acpi_data.states[i].transition_latency * 1000; - } - } - policy->governor = CPUFREQ_DEFAULT_GOVERNOR; - - policy->cur = processor_get_freq(data, policy->cpu); - - /* table init */ - for (i = 0; i <= data->acpi_data.state_count; i++) - { - data->freq_table[i].index = i; - if (i < data->acpi_data.state_count) { - data->freq_table[i].frequency = - data->acpi_data.states[i].core_frequency * 1000; - } else { - data->freq_table[i].frequency = CPUFREQ_TABLE_END; - } - } - - result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); - if (result) { - goto err_freqfree; - } - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - printk(KERN_INFO "acpi-cpufreq: CPU%u - ACPI performance management " - "activated.\n", cpu); - - for (i = 0; i < data->acpi_data.state_count; i++) - dprintk(" %cP%d: %d MHz, %d mW, %d uS, %d uS, 0x%x 0x%x\n", - (i == data->acpi_data.state?'*':' '), i, - (u32) data->acpi_data.states[i].core_frequency, - (u32) data->acpi_data.states[i].power, - (u32) data->acpi_data.states[i].transition_latency, - (u32) data->acpi_data.states[i].bus_master_latency, - (u32) data->acpi_data.states[i].status, - (u32) data->acpi_data.states[i].control); - - cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu); - - /* the first call to ->target() should result in us actually - * writing something to the appropriate registers. */ - data->resume = 1; - - return (result); - - err_freqfree: - kfree(data->freq_table); - err_unreg: - acpi_processor_unregister_performance(&data->acpi_data, cpu); - err_free: - kfree(data); - acpi_io_data[cpu] = NULL; - - return (result); -} - - -static int -acpi_cpufreq_cpu_exit ( - struct cpufreq_policy *policy) -{ - struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; - - dprintk("acpi_cpufreq_cpu_exit\n"); - - if (data) { - cpufreq_frequency_table_put_attr(policy->cpu); - acpi_io_data[policy->cpu] = NULL; - acpi_processor_unregister_performance(&data->acpi_data, - policy->cpu); - kfree(data); - } - - return (0); -} - - -static struct freq_attr* acpi_cpufreq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver acpi_cpufreq_driver = { - .verify = acpi_cpufreq_verify, - .target = acpi_cpufreq_target, - .get = acpi_cpufreq_get, - .init = acpi_cpufreq_cpu_init, - .exit = acpi_cpufreq_cpu_exit, - .name = "acpi-cpufreq", - .owner = THIS_MODULE, - .attr = acpi_cpufreq_attr, -}; - - -static int __init -acpi_cpufreq_init (void) -{ - dprintk("acpi_cpufreq_init\n"); - - return cpufreq_register_driver(&acpi_cpufreq_driver); -} - - -static void __exit -acpi_cpufreq_exit (void) -{ - dprintk("acpi_cpufreq_exit\n"); - - cpufreq_unregister_driver(&acpi_cpufreq_driver); - return; -} - - -late_initcall(acpi_cpufreq_init); -module_exit(acpi_cpufreq_exit); - diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c new file mode 100644 index 00000000000..2955f359e2a --- /dev/null +++ b/arch/ia64/kernel/crash.c @@ -0,0 +1,286 @@ +/* + * arch/ia64/kernel/crash.c + * + * Architecture specific (ia64) functions for kexec based crash dumps. + * + * Created by: Khalid Aziz <khalid.aziz@hp.com> + * Copyright (C) 2005 Hewlett-Packard Development Company, L.P. + * Copyright (C) 2005 Intel Corp Zou Nan hai <nanhai.zou@intel.com> + * + */ +#include <linux/smp.h> +#include <linux/delay.h> +#include <linux/crash_dump.h> +#include <linux/bootmem.h> +#include <linux/kexec.h> +#include <linux/elfcore.h> +#include <linux/sysctl.h> +#include <linux/init.h> +#include <linux/kdebug.h> + +#include <asm/mca.h> + +int kdump_status[NR_CPUS]; +static atomic_t kdump_cpu_frozen; +atomic_t kdump_in_progress; +static int kdump_freeze_monarch; +static int kdump_on_init = 1; +static int kdump_on_fatal_mca = 1; + +static inline Elf64_Word +*append_elf_note(Elf64_Word *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + struct elf_note *note = (struct elf_note *)buf; + note->n_namesz = strlen(name) + 1; + note->n_descsz = data_len; + note->n_type = type; + buf += (sizeof(*note) + 3)/4; + memcpy(buf, name, note->n_namesz); + buf += (note->n_namesz + 3)/4; + memcpy(buf, data, data_len); + buf += (data_len + 3)/4; + return buf; +} + +static void +final_note(void *buf) +{ + memset(buf, 0, sizeof(struct elf_note)); +} + +extern void ia64_dump_cpu_regs(void *); + +static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus); + +void +crash_save_this_cpu(void) +{ + void *buf; + unsigned long cfm, sof, sol; + + int cpu = smp_processor_id(); + struct elf_prstatus *prstatus = &per_cpu(elf_prstatus, cpu); + + elf_greg_t *dst = (elf_greg_t *)&(prstatus->pr_reg); + memset(prstatus, 0, sizeof(*prstatus)); + prstatus->pr_pid = current->pid; + + ia64_dump_cpu_regs(dst); + cfm = dst[43]; + sol = (cfm >> 7) & 0x7f; + sof = cfm & 0x7f; + dst[46] = (unsigned long)ia64_rse_skip_regs((unsigned long *)dst[46], + sof - sol); + + buf = (u64 *) per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, prstatus, + sizeof(*prstatus)); + final_note(buf); +} + +#ifdef CONFIG_SMP +static int +kdump_wait_cpu_freeze(void) +{ + int cpu_num = num_online_cpus() - 1; + int timeout = 1000; + while(timeout-- > 0) { + if (atomic_read(&kdump_cpu_frozen) == cpu_num) + return 0; + udelay(1000); + } + return 1; +} +#endif + +void +machine_crash_shutdown(struct pt_regs *pt) +{ + /* This function is only called after the system + * has paniced or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means shooting down the other cpus in + * an SMP system. + */ + kexec_disable_iosapic(); +#ifdef CONFIG_SMP + /* + * If kdump_on_init is set and an INIT is asserted here, kdump will + * be started again via INIT monarch. + */ + local_irq_disable(); + ia64_set_psr_mc(); /* mask MCA/INIT */ + if (atomic_inc_return(&kdump_in_progress) != 1) + unw_init_running(kdump_cpu_freeze, NULL); + + /* + * Now this cpu is ready for kdump. + * Stop all others by IPI or INIT. They could receive INIT from + * outside and might be INIT monarch, but only thing they have to + * do is falling into kdump_cpu_freeze(). + * + * If an INIT is asserted here: + * - All receivers might be slaves, since some of cpus could already + * be frozen and INIT might be masked on monarch. In this case, + * all slaves will be frozen soon since kdump_in_progress will let + * them into DIE_INIT_SLAVE_LEAVE. + * - One might be a monarch, but INIT rendezvous will fail since + * at least this cpu already have INIT masked so it never join + * to the rendezvous. In this case, all slaves and monarch will + * be frozen soon with no wait since the INIT rendezvous is skipped + * by kdump_in_progress. + */ + kdump_smp_send_stop(); + /* not all cpu response to IPI, send INIT to freeze them */ + if (kdump_wait_cpu_freeze()) { + kdump_smp_send_init(); + /* wait again, don't go ahead if possible */ + kdump_wait_cpu_freeze(); + } +#endif +} + +static void +machine_kdump_on_init(void) +{ + crash_save_vmcoreinfo(); + local_irq_disable(); + kexec_disable_iosapic(); + machine_kexec(ia64_kimage); +} + +void +kdump_cpu_freeze(struct unw_frame_info *info, void *arg) +{ + int cpuid; + + local_irq_disable(); + cpuid = smp_processor_id(); + crash_save_this_cpu(); + current->thread.ksp = (__u64)info->sw - 16; + + ia64_set_psr_mc(); /* mask MCA/INIT and stop reentrance */ + + atomic_inc(&kdump_cpu_frozen); + kdump_status[cpuid] = 1; + mb(); + for (;;) + cpu_relax(); +} + +static int +kdump_init_notifier(struct notifier_block *self, unsigned long val, void *data) +{ + struct ia64_mca_notify_die *nd; + struct die_args *args = data; + + if (atomic_read(&kdump_in_progress)) { + switch (val) { + case DIE_INIT_MONARCH_LEAVE: + if (!kdump_freeze_monarch) + break; + /* fall through */ + case DIE_INIT_SLAVE_LEAVE: + case DIE_INIT_MONARCH_ENTER: + case DIE_MCA_RENDZVOUS_LEAVE: + unw_init_running(kdump_cpu_freeze, NULL); + break; + } + } + + if (!kdump_on_init && !kdump_on_fatal_mca) + return NOTIFY_DONE; + + if (!ia64_kimage) { + if (val == DIE_INIT_MONARCH_LEAVE) + ia64_mca_printk(KERN_NOTICE + "%s: kdump not configured\n", + __func__); + return NOTIFY_DONE; + } + + if (val != DIE_INIT_MONARCH_LEAVE && + val != DIE_INIT_MONARCH_PROCESS && + val != DIE_MCA_MONARCH_LEAVE) + return NOTIFY_DONE; + + nd = (struct ia64_mca_notify_die *)args->err; + + switch (val) { + case DIE_INIT_MONARCH_PROCESS: + /* Reason code 1 means machine check rendezvous*/ + if (kdump_on_init && (nd->sos->rv_rc != 1)) { + if (atomic_inc_return(&kdump_in_progress) != 1) + kdump_freeze_monarch = 1; + } + break; + case DIE_INIT_MONARCH_LEAVE: + /* Reason code 1 means machine check rendezvous*/ + if (kdump_on_init && (nd->sos->rv_rc != 1)) + machine_kdump_on_init(); + break; + case DIE_MCA_MONARCH_LEAVE: + /* *(nd->data) indicate if MCA is recoverable */ + if (kdump_on_fatal_mca && !(*(nd->data))) { + if (atomic_inc_return(&kdump_in_progress) == 1) + machine_kdump_on_init(); + /* We got fatal MCA while kdump!? No way!! */ + } + break; + } + return NOTIFY_DONE; +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table kdump_ctl_table[] = { + { + .procname = "kdump_on_init", + .data = &kdump_on_init, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "kdump_on_fatal_mca", + .data = &kdump_on_fatal_mca, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_table sys_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = kdump_ctl_table, + }, + { } +}; +#endif + +static int +machine_crash_setup(void) +{ + /* be notified before default_monarch_init_process */ + static struct notifier_block kdump_init_notifier_nb = { + .notifier_call = kdump_init_notifier, + .priority = 1, + }; + int ret; + if((ret = register_die_notifier(&kdump_init_notifier_nb)) != 0) + return ret; +#ifdef CONFIG_SYSCTL + register_sysctl_table(sys_table); +#endif + return 0; +} + +__initcall(machine_crash_setup); + diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c new file mode 100644 index 00000000000..c8c9298666f --- /dev/null +++ b/arch/ia64/kernel/crash_dump.c @@ -0,0 +1,50 @@ +/* + * kernel/crash_dump.c - Memory preserving reboot related code. + * + * Created by: Simon Horman <horms@verge.net.au> + * Original code moved from kernel/crash.c + * Original code comment copied from the i386 version of this file + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/crash_dump.h> + +#include <asm/page.h> +#include <asm/uaccess.h> + +/** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. + * + * Calling copy_to_user() in atomic context is not desirable. Hence first + * copying the data to a pre-allocated kernel page and then copying to user + * space in non-atomic context. + */ +ssize_t +copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + vaddr = __va(pfn<<PAGE_SHIFT); + if (userbuf) { + if (copy_to_user(buf, (vaddr + offset), csize)) { + return -EFAULT; + } + } else + memcpy(buf, (vaddr + offset), csize); + return csize; +} + diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c index 6ade3790ce0..4826ff957a3 100644 --- a/arch/ia64/kernel/cyclone.c +++ b/arch/ia64/kernel/cyclone.c @@ -3,6 +3,7 @@ #include <linux/time.h> #include <linux/errno.h> #include <linux/timex.h> +#include <linux/clocksource.h> #include <asm/io.h> /* IBM Summit (EXA) Cyclone counter code*/ @@ -18,49 +19,58 @@ void __init cyclone_setup(void) use_cyclone = 1; } +static void __iomem *cyclone_mc; -struct time_interpolator cyclone_interpolator = { - .source = TIME_SOURCE_MMIO64, - .shift = 16, - .frequency = CYCLONE_TIMER_FREQ, - .drift = -100, - .mask = (1LL << 40) - 1 +static cycle_t read_cyclone(struct clocksource *cs) +{ + return (cycle_t)readq((void __iomem *)cyclone_mc); +} + +static struct clocksource clocksource_cyclone = { + .name = "cyclone", + .rating = 300, + .read = read_cyclone, + .mask = (1LL << 40) - 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; int __init init_cyclone_clock(void) { - u64* reg; + u64 __iomem *reg; u64 base; /* saved cyclone base address */ u64 offset; /* offset from pageaddr to cyclone_timer register */ int i; - u32* volatile cyclone_timer; /* Cyclone MPMC0 register */ + u32 __iomem *cyclone_timer; /* Cyclone MPMC0 register */ if (!use_cyclone) - return -ENODEV; + return 0; printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); /* find base address */ offset = (CYCLONE_CBAR_ADDR); - reg = (u64*)ioremap_nocache(offset, sizeof(u64)); + reg = ioremap_nocache(offset, sizeof(u64)); if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); + printk(KERN_ERR "Summit chipset: Could not find valid CBAR" + " register.\n"); use_cyclone = 0; return -ENODEV; } base = readq(reg); + iounmap(reg); if(!base){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); + printk(KERN_ERR "Summit chipset: Could not find valid CBAR" + " value.\n"); use_cyclone = 0; return -ENODEV; } - iounmap(reg); /* setup PMCC */ offset = (base + CYCLONE_PMCC_OFFSET); - reg = (u64*)ioremap_nocache(offset, sizeof(u64)); + reg = ioremap_nocache(offset, sizeof(u64)); if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); + printk(KERN_ERR "Summit chipset: Could not find valid PMCC" + " register.\n"); use_cyclone = 0; return -ENODEV; } @@ -69,9 +79,10 @@ int __init init_cyclone_clock(void) /* setup MPCS */ offset = (base + CYCLONE_MPCS_OFFSET); - reg = (u64*)ioremap_nocache(offset, sizeof(u64)); + reg = ioremap_nocache(offset, sizeof(u64)); if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); + printk(KERN_ERR "Summit chipset: Could not find valid MPCS" + " register.\n"); use_cyclone = 0; return -ENODEV; } @@ -80,9 +91,10 @@ int __init init_cyclone_clock(void) /* map in cyclone_timer */ offset = (base + CYCLONE_MPMC_OFFSET); - cyclone_timer = (u32*)ioremap_nocache(offset, sizeof(u32)); + cyclone_timer = ioremap_nocache(offset, sizeof(u32)); if(!cyclone_timer){ - printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); + printk(KERN_ERR "Summit chipset: Could not find valid MPMC" + " register.\n"); use_cyclone = 0; return -ENODEV; } @@ -93,16 +105,18 @@ int __init init_cyclone_clock(void) int stall = 100; while(stall--) barrier(); if(readl(cyclone_timer) == old){ - printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); + printk(KERN_ERR "Summit chipset: Counter not counting!" + " DISABLED\n"); iounmap(cyclone_timer); - cyclone_timer = 0; + cyclone_timer = NULL; use_cyclone = 0; return -ENODEV; } } /* initialize last tick */ - cyclone_interpolator.addr = cyclone_timer; - register_time_interpolator(&cyclone_interpolator); + cyclone_mc = cyclone_timer; + clocksource_cyclone.archdata.fsys_mmio = cyclone_timer; + clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ); return 0; } diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c new file mode 100644 index 00000000000..7f791623820 --- /dev/null +++ b/arch/ia64/kernel/dma-mapping.c @@ -0,0 +1,24 @@ +#include <linux/dma-mapping.h> +#include <linux/export.h> + +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly; + +struct dma_map_ops *dma_ops; +EXPORT_SYMBOL(dma_ops); + +#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) + +static int __init dma_init(void) +{ + dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); + + return 0; +} +fs_initcall(dma_init); + +struct dma_map_ops *dma_get_ops(struct device *dev) +{ + return dma_ops; +} +EXPORT_SYMBOL(dma_get_ops); diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index f72ea6aebcb..741b99c1a0b 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -1,13 +1,16 @@ /* * Extensible Firmware Interface * - * Based on Extensible Firmware Interface Specification version 0.9 April 30, 1999 + * Based on Extensible Firmware Interface Specification version 0.9 + * April 30, 1999 * * Copyright (C) 1999 VA Linux Systems * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> * Copyright (C) 1999-2003 Hewlett-Packard Co. * David Mosberger-Tang <davidm@hpl.hp.com> * Stephane Eranian <eranian@hpl.hp.com> + * (c) Copyright 2006 Hewlett-Packard Development Company, L.P. + * Bjorn Helgaas <bjorn.helgaas@hp.com> * * All EFI Runtime Services are not implemented yet as EFI only * supports physical mode addressing on SoftSDV. This is to be fixed @@ -18,13 +21,17 @@ * Goutham Rao: <goutham.rao@intel.com> * Skip non-WB memory and ignore empty memory ranges. */ -#include <linux/config.h> #include <linux/module.h> +#include <linux/bootmem.h> +#include <linux/crash_dump.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/types.h> +#include <linux/slab.h> #include <linux/time.h> #include <linux/efi.h> +#include <linux/kexec.h> +#include <linux/mm.h> #include <asm/io.h> #include <asm/kregs.h> @@ -32,157 +39,176 @@ #include <asm/pgtable.h> #include <asm/processor.h> #include <asm/mca.h> +#include <asm/setup.h> +#include <asm/tlbflush.h> #define EFI_DEBUG 0 +static __initdata unsigned long palo_phys; + +static __initdata efi_config_table_type_t arch_tables[] = { + {PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID, "PALO", &palo_phys}, + {NULL_GUID, NULL, 0}, +}; + extern efi_status_t efi_call_phys (void *, ...); -struct efi efi; -EXPORT_SYMBOL(efi); static efi_runtime_services_t *runtime; -static unsigned long mem_limit = ~0UL, max_addr = ~0UL; +static u64 mem_limit = ~0UL, max_addr = ~0UL, min_addr = 0UL; #define efi_call_virt(f, args...) (*(f))(args) -#define STUB_GET_TIME(prefix, adjust_arg) \ -static efi_status_t \ -prefix##_get_time (efi_time_t *tm, efi_time_cap_t *tc) \ -{ \ - struct ia64_fpreg fr[6]; \ - efi_time_cap_t *atc = NULL; \ - efi_status_t ret; \ - \ - if (tc) \ - atc = adjust_arg(tc); \ - ia64_save_scratch_fpregs(fr); \ - ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time), adjust_arg(tm), atc); \ - ia64_load_scratch_fpregs(fr); \ - return ret; \ -} - -#define STUB_SET_TIME(prefix, adjust_arg) \ -static efi_status_t \ -prefix##_set_time (efi_time_t *tm) \ -{ \ - struct ia64_fpreg fr[6]; \ - efi_status_t ret; \ - \ - ia64_save_scratch_fpregs(fr); \ - ret = efi_call_##prefix((efi_set_time_t *) __va(runtime->set_time), adjust_arg(tm)); \ - ia64_load_scratch_fpregs(fr); \ - return ret; \ -} - -#define STUB_GET_WAKEUP_TIME(prefix, adjust_arg) \ -static efi_status_t \ -prefix##_get_wakeup_time (efi_bool_t *enabled, efi_bool_t *pending, efi_time_t *tm) \ -{ \ - struct ia64_fpreg fr[6]; \ - efi_status_t ret; \ - \ - ia64_save_scratch_fpregs(fr); \ - ret = efi_call_##prefix((efi_get_wakeup_time_t *) __va(runtime->get_wakeup_time), \ - adjust_arg(enabled), adjust_arg(pending), adjust_arg(tm)); \ - ia64_load_scratch_fpregs(fr); \ - return ret; \ -} - -#define STUB_SET_WAKEUP_TIME(prefix, adjust_arg) \ -static efi_status_t \ -prefix##_set_wakeup_time (efi_bool_t enabled, efi_time_t *tm) \ -{ \ - struct ia64_fpreg fr[6]; \ - efi_time_t *atm = NULL; \ - efi_status_t ret; \ - \ - if (tm) \ - atm = adjust_arg(tm); \ - ia64_save_scratch_fpregs(fr); \ - ret = efi_call_##prefix((efi_set_wakeup_time_t *) __va(runtime->set_wakeup_time), \ - enabled, atm); \ - ia64_load_scratch_fpregs(fr); \ - return ret; \ -} - -#define STUB_GET_VARIABLE(prefix, adjust_arg) \ -static efi_status_t \ -prefix##_get_variable (efi_char16_t *name, efi_guid_t *vendor, u32 *attr, \ - unsigned long *data_size, void *data) \ -{ \ - struct ia64_fpreg fr[6]; \ - u32 *aattr = NULL; \ - efi_status_t ret; \ - \ - if (attr) \ - aattr = adjust_arg(attr); \ - ia64_save_scratch_fpregs(fr); \ - ret = efi_call_##prefix((efi_get_variable_t *) __va(runtime->get_variable), \ - adjust_arg(name), adjust_arg(vendor), aattr, \ - adjust_arg(data_size), adjust_arg(data)); \ - ia64_load_scratch_fpregs(fr); \ - return ret; \ -} - -#define STUB_GET_NEXT_VARIABLE(prefix, adjust_arg) \ -static efi_status_t \ -prefix##_get_next_variable (unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor) \ -{ \ - struct ia64_fpreg fr[6]; \ - efi_status_t ret; \ - \ - ia64_save_scratch_fpregs(fr); \ - ret = efi_call_##prefix((efi_get_next_variable_t *) __va(runtime->get_next_variable), \ - adjust_arg(name_size), adjust_arg(name), adjust_arg(vendor)); \ - ia64_load_scratch_fpregs(fr); \ - return ret; \ -} - -#define STUB_SET_VARIABLE(prefix, adjust_arg) \ -static efi_status_t \ -prefix##_set_variable (efi_char16_t *name, efi_guid_t *vendor, unsigned long attr, \ - unsigned long data_size, void *data) \ -{ \ - struct ia64_fpreg fr[6]; \ - efi_status_t ret; \ - \ - ia64_save_scratch_fpregs(fr); \ - ret = efi_call_##prefix((efi_set_variable_t *) __va(runtime->set_variable), \ - adjust_arg(name), adjust_arg(vendor), attr, data_size, \ - adjust_arg(data)); \ - ia64_load_scratch_fpregs(fr); \ - return ret; \ -} - -#define STUB_GET_NEXT_HIGH_MONO_COUNT(prefix, adjust_arg) \ -static efi_status_t \ -prefix##_get_next_high_mono_count (u32 *count) \ -{ \ - struct ia64_fpreg fr[6]; \ - efi_status_t ret; \ - \ - ia64_save_scratch_fpregs(fr); \ - ret = efi_call_##prefix((efi_get_next_high_mono_count_t *) \ - __va(runtime->get_next_high_mono_count), adjust_arg(count)); \ - ia64_load_scratch_fpregs(fr); \ - return ret; \ -} - -#define STUB_RESET_SYSTEM(prefix, adjust_arg) \ -static void \ -prefix##_reset_system (int reset_type, efi_status_t status, \ - unsigned long data_size, efi_char16_t *data) \ -{ \ - struct ia64_fpreg fr[6]; \ - efi_char16_t *adata = NULL; \ - \ - if (data) \ - adata = adjust_arg(data); \ - \ - ia64_save_scratch_fpregs(fr); \ - efi_call_##prefix((efi_reset_system_t *) __va(runtime->reset_system), \ - reset_type, status, data_size, adata); \ - /* should not return, but just in case... */ \ - ia64_load_scratch_fpregs(fr); \ +#define STUB_GET_TIME(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_time (efi_time_t *tm, efi_time_cap_t *tc) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_time_cap_t *atc = NULL; \ + efi_status_t ret; \ + \ + if (tc) \ + atc = adjust_arg(tc); \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time), \ + adjust_arg(tm), atc); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_SET_TIME(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_set_time (efi_time_t *tm) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_status_t ret; \ + \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix((efi_set_time_t *) __va(runtime->set_time), \ + adjust_arg(tm)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_GET_WAKEUP_TIME(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_wakeup_time (efi_bool_t *enabled, efi_bool_t *pending, \ + efi_time_t *tm) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_status_t ret; \ + \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix( \ + (efi_get_wakeup_time_t *) __va(runtime->get_wakeup_time), \ + adjust_arg(enabled), adjust_arg(pending), adjust_arg(tm)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_SET_WAKEUP_TIME(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_set_wakeup_time (efi_bool_t enabled, efi_time_t *tm) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_time_t *atm = NULL; \ + efi_status_t ret; \ + \ + if (tm) \ + atm = adjust_arg(tm); \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix( \ + (efi_set_wakeup_time_t *) __va(runtime->set_wakeup_time), \ + enabled, atm); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_GET_VARIABLE(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_variable (efi_char16_t *name, efi_guid_t *vendor, u32 *attr, \ + unsigned long *data_size, void *data) \ +{ \ + struct ia64_fpreg fr[6]; \ + u32 *aattr = NULL; \ + efi_status_t ret; \ + \ + if (attr) \ + aattr = adjust_arg(attr); \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix( \ + (efi_get_variable_t *) __va(runtime->get_variable), \ + adjust_arg(name), adjust_arg(vendor), aattr, \ + adjust_arg(data_size), adjust_arg(data)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_GET_NEXT_VARIABLE(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_next_variable (unsigned long *name_size, efi_char16_t *name, \ + efi_guid_t *vendor) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_status_t ret; \ + \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix( \ + (efi_get_next_variable_t *) __va(runtime->get_next_variable), \ + adjust_arg(name_size), adjust_arg(name), adjust_arg(vendor)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_SET_VARIABLE(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_set_variable (efi_char16_t *name, efi_guid_t *vendor, \ + u32 attr, unsigned long data_size, \ + void *data) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_status_t ret; \ + \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix( \ + (efi_set_variable_t *) __va(runtime->set_variable), \ + adjust_arg(name), adjust_arg(vendor), attr, data_size, \ + adjust_arg(data)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_GET_NEXT_HIGH_MONO_COUNT(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_next_high_mono_count (u32 *count) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_status_t ret; \ + \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix((efi_get_next_high_mono_count_t *) \ + __va(runtime->get_next_high_mono_count), \ + adjust_arg(count)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_RESET_SYSTEM(prefix, adjust_arg) \ +static void \ +prefix##_reset_system (int reset_type, efi_status_t status, \ + unsigned long data_size, efi_char16_t *data) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_char16_t *adata = NULL; \ + \ + if (data) \ + adata = adjust_arg(data); \ + \ + ia64_save_scratch_fpregs(fr); \ + efi_call_##prefix( \ + (efi_reset_system_t *) __va(runtime->reset_system), \ + reset_type, status, data_size, adata); \ + /* should not return, but just in case... */ \ + ia64_load_scratch_fpregs(fr); \ } #define phys_ptr(arg) ((__typeof__(arg)) ia64_tpa(arg)) @@ -214,16 +240,18 @@ efi_gettimeofday (struct timespec *ts) { efi_time_t tm; - memset(ts, 0, sizeof(ts)); - if ((*efi.get_time)(&tm, NULL) != EFI_SUCCESS) + if ((*efi.get_time)(&tm, NULL) != EFI_SUCCESS) { + memset(ts, 0, sizeof(*ts)); return; + } - ts->tv_sec = mktime(tm.year, tm.month, tm.day, tm.hour, tm.minute, tm.second); + ts->tv_sec = mktime(tm.year, tm.month, tm.day, + tm.hour, tm.minute, tm.second); ts->tv_nsec = tm.nanosecond; } static int -is_available_memory (efi_memory_desc_t *md) +is_memory_available (efi_memory_desc_t *md) { if (!(md->attribute & EFI_MEMORY_WB)) return 0; @@ -247,6 +275,32 @@ typedef struct kern_memdesc { static kern_memdesc_t *kern_memmap; +#define efi_md_size(md) (md->num_pages << EFI_PAGE_SHIFT) + +static inline u64 +kmd_end(kern_memdesc_t *kmd) +{ + return (kmd->start + (kmd->num_pages << EFI_PAGE_SHIFT)); +} + +static inline u64 +efi_md_end(efi_memory_desc_t *md) +{ + return (md->phys_addr + efi_md_size(md)); +} + +static inline int +efi_wb(efi_memory_desc_t *md) +{ + return (md->attribute & EFI_MEMORY_WB); +} + +static inline int +efi_uc(efi_memory_desc_t *md) +{ + return (md->attribute & EFI_MEMORY_UC); +} + static void walk (efi_freemem_callback_t callback, void *arg, u64 attr) { @@ -266,8 +320,8 @@ walk (efi_freemem_callback_t callback, void *arg, u64 attr) } /* - * Walks the EFI memory map and calls CALLBACK once for each EFI memory descriptor that - * has memory that is available for OS use. + * Walk the EFI memory map and call CALLBACK once for each EFI memory + * descriptor that has memory that is available for OS use. */ void efi_memmap_walk (efi_freemem_callback_t callback, void *arg) @@ -276,8 +330,8 @@ efi_memmap_walk (efi_freemem_callback_t callback, void *arg) } /* - * Walks the EFI memory map and calls CALLBACK once for each EFI memory descriptor that - * has memory that is available for uncached allocator. + * Walk the EFI memory map and call CALLBACK once for each EFI memory + * descriptor that has memory that is available for uncached allocator. */ void efi_memmap_walk_uc (efi_freemem_callback_t callback, void *arg) @@ -286,11 +340,10 @@ efi_memmap_walk_uc (efi_freemem_callback_t callback, void *arg) } /* - * Look for the PAL_CODE region reported by EFI and maps it using an + * Look for the PAL_CODE region reported by EFI and map it using an * ITR to enable safe PAL calls in virtual mode. See IA-64 Processor * Abstraction Layer chapter 11 in ADAG */ - void * efi_get_pal_addr (void) { @@ -310,53 +363,90 @@ efi_get_pal_addr (void) continue; if (++pal_code_count > 1) { - printk(KERN_ERR "Too many EFI Pal Code memory ranges, dropped @ %lx\n", - md->phys_addr); + printk(KERN_ERR "Too many EFI Pal Code memory ranges, " + "dropped @ %llx\n", md->phys_addr); continue; } /* - * The only ITLB entry in region 7 that is used is the one installed by - * __start(). That entry covers a 64MB range. + * The only ITLB entry in region 7 that is used is the one + * installed by __start(). That entry covers a 64MB range. */ mask = ~((1 << KERNEL_TR_PAGE_SHIFT) - 1); vaddr = PAGE_OFFSET + md->phys_addr; /* - * We must check that the PAL mapping won't overlap with the kernel - * mapping. + * We must check that the PAL mapping won't overlap with the + * kernel mapping. * - * PAL code is guaranteed to be aligned on a power of 2 between 4k and - * 256KB and that only one ITR is needed to map it. This implies that the - * PAL code is always aligned on its size, i.e., the closest matching page - * size supported by the TLB. Therefore PAL code is guaranteed never to - * cross a 64MB unless it is bigger than 64MB (very unlikely!). So for - * now the following test is enough to determine whether or not we need a - * dedicated ITR for the PAL code. + * PAL code is guaranteed to be aligned on a power of 2 between + * 4k and 256KB and that only one ITR is needed to map it. This + * implies that the PAL code is always aligned on its size, + * i.e., the closest matching page size supported by the TLB. + * Therefore PAL code is guaranteed never to cross a 64MB unless + * it is bigger than 64MB (very unlikely!). So for now the + * following test is enough to determine whether or not we need + * a dedicated ITR for the PAL code. */ if ((vaddr & mask) == (KERNEL_START & mask)) { printk(KERN_INFO "%s: no need to install ITR for PAL code\n", - __FUNCTION__); + __func__); continue; } - if (md->num_pages << EFI_PAGE_SHIFT > IA64_GRANULE_SIZE) - panic("Woah! PAL code size bigger than a granule!"); + if (efi_md_size(md) > IA64_GRANULE_SIZE) + panic("Whoa! PAL code size bigger than a granule!"); #if EFI_DEBUG mask = ~((1 << IA64_GRANULE_SHIFT) - 1); - printk(KERN_INFO "CPU %d: mapping PAL code [0x%lx-0x%lx) into [0x%lx-0x%lx)\n", - smp_processor_id(), md->phys_addr, - md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), - vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE); + printk(KERN_INFO "CPU %d: mapping PAL code " + "[0x%lx-0x%lx) into [0x%lx-0x%lx)\n", + smp_processor_id(), md->phys_addr, + md->phys_addr + efi_md_size(md), + vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE); #endif return __va(md->phys_addr); } - printk(KERN_WARNING "%s: no PAL-code memory-descriptor found", - __FUNCTION__); + printk(KERN_WARNING "%s: no PAL-code memory-descriptor found\n", + __func__); return NULL; } + +static u8 __init palo_checksum(u8 *buffer, u32 length) +{ + u8 sum = 0; + u8 *end = buffer + length; + + while (buffer < end) + sum = (u8) (sum + *(buffer++)); + + return sum; +} + +/* + * Parse and handle PALO table which is published at: + * http://www.dig64.org/home/DIG64_PALO_R1_0.pdf + */ +static void __init handle_palo(unsigned long phys_addr) +{ + struct palo_table *palo = __va(phys_addr); + u8 checksum; + + if (strncmp(palo->signature, PALO_SIG, sizeof(PALO_SIG) - 1)) { + printk(KERN_INFO "PALO signature incorrect.\n"); + return; + } + + checksum = palo_checksum((u8 *)palo, palo->length); + if (checksum) { + printk(KERN_INFO "PALO checksum incorrect.\n"); + return; + } + + setup_ptcg_sem(palo->max_tlb_purges, NPTCG_FROM_PALO); +} + void efi_map_pal_code (void) { @@ -370,38 +460,37 @@ efi_map_pal_code (void) * Cannot write to CRx with PSR.ic=1 */ psr = ia64_clear_ic(); - ia64_itr(0x1, IA64_TR_PALCODE, GRANULEROUNDDOWN((unsigned long) pal_vaddr), + ia64_itr(0x1, IA64_TR_PALCODE, + GRANULEROUNDDOWN((unsigned long) pal_vaddr), pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)), IA64_GRANULE_SHIFT); + paravirt_dv_serialize_data(); ia64_set_psr(psr); /* restore psr */ - ia64_srlz_i(); } void __init efi_init (void) { void *efi_map_start, *efi_map_end; - efi_config_table_t *config_tables; efi_char16_t *c16; u64 efi_desc_size; - char *cp, *end, vendor[100] = "unknown"; - extern char saved_command_line[]; + char *cp, vendor[100] = "unknown"; int i; - /* it's too early to be able to use the standard kernel command line support... */ - for (cp = saved_command_line; *cp; ) { + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); + + /* + * It's too early to be able to use the standard kernel command line + * support... + */ + for (cp = boot_command_line; *cp; ) { if (memcmp(cp, "mem=", 4) == 0) { - cp += 4; - mem_limit = memparse(cp, &end); - if (end != cp) - break; - cp = end; + mem_limit = memparse(cp + 4, &cp); } else if (memcmp(cp, "max_addr=", 9) == 0) { - cp += 9; - max_addr = GRANULEROUNDDOWN(memparse(cp, &end)); - if (end != cp) - break; - cp = end; + max_addr = GRANULEROUNDDOWN(memparse(cp + 9, &cp)); + } else if (memcmp(cp, "min_addr=", 9) == 0) { + min_addr = GRANULEROUNDDOWN(memparse(cp + 9, &cp)); } else { while (*cp != ' ' && *cp) ++cp; @@ -409,8 +498,12 @@ efi_init (void) ++cp; } } + if (min_addr != 0UL) + printk(KERN_INFO "Ignoring memory below %lluMB\n", + min_addr >> 20); if (max_addr != ~0UL) - printk(KERN_INFO "Ignoring memory above %luMB\n", max_addr >> 20); + printk(KERN_INFO "Ignoring memory above %lluMB\n", + max_addr >> 20); efi.systab = __va(ia64_boot_param->efi_systab); @@ -418,50 +511,36 @@ efi_init (void) * Verify the EFI Table */ if (efi.systab == NULL) - panic("Woah! Can't find EFI system table.\n"); + panic("Whoa! Can't find EFI system table.\n"); if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) - panic("Woah! EFI system table signature incorrect\n"); - if ((efi.systab->hdr.revision ^ EFI_SYSTEM_TABLE_REVISION) >> 16 != 0) - printk(KERN_WARNING "Warning: EFI system table major version mismatch: " - "got %d.%02d, expected %d.%02d\n", - efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, - EFI_SYSTEM_TABLE_REVISION >> 16, EFI_SYSTEM_TABLE_REVISION & 0xffff); - - config_tables = __va(efi.systab->tables); + panic("Whoa! EFI system table signature incorrect\n"); + if ((efi.systab->hdr.revision >> 16) == 0) + printk(KERN_WARNING "Warning: EFI system table version " + "%d.%02d, expected 1.00 or greater\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff); /* Show what we know for posterity */ c16 = __va(efi.systab->fw_vendor); if (c16) { - for (i = 0;i < (int) sizeof(vendor) && *c16; ++i) + for (i = 0;i < (int) sizeof(vendor) - 1 && *c16; ++i) vendor[i] = *c16++; vendor[i] = '\0'; } printk(KERN_INFO "EFI v%u.%.02u by %s:", - efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); - - for (i = 0; i < (int) efi.systab->nr_tables; i++) { - if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) { - efi.mps = __va(config_tables[i].table); - printk(" MPS=0x%lx", config_tables[i].table); - } else if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) { - efi.acpi20 = __va(config_tables[i].table); - printk(" ACPI 2.0=0x%lx", config_tables[i].table); - } else if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) { - efi.acpi = __va(config_tables[i].table); - printk(" ACPI=0x%lx", config_tables[i].table); - } else if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) { - efi.smbios = __va(config_tables[i].table); - printk(" SMBIOS=0x%lx", config_tables[i].table); - } else if (efi_guidcmp(config_tables[i].guid, SAL_SYSTEM_TABLE_GUID) == 0) { - efi.sal_systab = __va(config_tables[i].table); - printk(" SALsystab=0x%lx", config_tables[i].table); - } else if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) { - efi.hcdp = __va(config_tables[i].table); - printk(" HCDP=0x%lx", config_tables[i].table); - } - } - printk("\n"); + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); + + set_bit(EFI_SYSTEM_TABLES, &efi.flags); + + palo_phys = EFI_INVALID_TABLE_ADDR; + + if (efi_config_init(arch_tables) != 0) + return; + + if (palo_phys != EFI_INVALID_TABLE_ADDR) + handle_palo(palo_phys); runtime = __va(efi.systab->runtime); efi.get_time = phys_get_time; @@ -484,12 +563,33 @@ efi_init (void) efi_memory_desc_t *md; void *p; - for (i = 0, p = efi_map_start; p < efi_map_end; ++i, p += efi_desc_size) { + for (i = 0, p = efi_map_start; p < efi_map_end; + ++i, p += efi_desc_size) + { + const char *unit; + unsigned long size; + md = p; - printk("mem%02u: type=%u, attr=0x%lx, range=[0x%016lx-0x%016lx) (%luMB)\n", + size = md->num_pages << EFI_PAGE_SHIFT; + + if ((size >> 40) > 0) { + size >>= 40; + unit = "TB"; + } else if ((size >> 30) > 0) { + size >>= 30; + unit = "GB"; + } else if ((size >> 20) > 0) { + size >>= 20; + unit = "MB"; + } else { + size >>= 10; + unit = "KB"; + } + + printk("mem%02d: type=%2u, attr=0x%016lx, " + "range=[0x%016lx-0x%016lx) (%4lu%s)\n", i, md->type, md->attribute, md->phys_addr, - md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), - md->num_pages >> (20 - EFI_PAGE_SHIFT)); + md->phys_addr + efi_md_size(md), size, unit); } } #endif @@ -514,8 +614,8 @@ efi_enter_virtual_mode (void) md = p; if (md->attribute & EFI_MEMORY_RUNTIME) { /* - * Some descriptors have multiple bits set, so the order of - * the tests is relevant. + * Some descriptors have multiple bits set, so the + * order of the tests is relevant. */ if (md->attribute & EFI_MEMORY_WB) { md->virt_addr = (u64) __va(md->phys_addr); @@ -523,21 +623,26 @@ efi_enter_virtual_mode (void) md->virt_addr = (u64) ioremap(md->phys_addr, 0); } else if (md->attribute & EFI_MEMORY_WC) { #if 0 - md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P - | _PAGE_D - | _PAGE_MA_WC - | _PAGE_PL_0 - | _PAGE_AR_RW)); + md->virt_addr = ia64_remap(md->phys_addr, + (_PAGE_A | + _PAGE_P | + _PAGE_D | + _PAGE_MA_WC | + _PAGE_PL_0 | + _PAGE_AR_RW)); #else printk(KERN_INFO "EFI_MEMORY_WC mapping\n"); md->virt_addr = (u64) ioremap(md->phys_addr, 0); #endif } else if (md->attribute & EFI_MEMORY_WT) { #if 0 - md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P - | _PAGE_D | _PAGE_MA_WT - | _PAGE_PL_0 - | _PAGE_AR_RW)); + md->virt_addr = ia64_remap(md->phys_addr, + (_PAGE_A | + _PAGE_P | + _PAGE_D | + _PAGE_MA_WT | + _PAGE_PL_0 | + _PAGE_AR_RW)); #else printk(KERN_INFO "EFI_MEMORY_WT mapping\n"); md->virt_addr = (u64) ioremap(md->phys_addr, 0); @@ -548,16 +653,20 @@ efi_enter_virtual_mode (void) status = efi_call_phys(__va(runtime->set_virtual_address_map), ia64_boot_param->efi_memmap_size, - efi_desc_size, ia64_boot_param->efi_memdesc_version, + efi_desc_size, + ia64_boot_param->efi_memdesc_version, ia64_boot_param->efi_memmap); if (status != EFI_SUCCESS) { - printk(KERN_WARNING "warning: unable to switch EFI into virtual mode " - "(status=%lu)\n", status); + printk(KERN_WARNING "warning: unable to switch EFI into " + "virtual mode (status=%lu)\n", status); return; } + set_bit(EFI_RUNTIME_SERVICES, &efi.flags); + /* - * Now that EFI is in virtual mode, we call the EFI functions more efficiently: + * Now that EFI is in virtual mode, we call the EFI functions more + * efficiently: */ efi.get_time = virt_get_time; efi.set_time = virt_set_time; @@ -571,8 +680,8 @@ efi_enter_virtual_mode (void) } /* - * Walk the EFI memory map looking for the I/O port range. There can only be one entry of - * this type, other I/O port ranges should be described via ACPI. + * Walk the EFI memory map looking for the I/O port range. There can only be + * one entry of this type, other I/O port ranges should be described via ACPI. */ u64 efi_get_iobase (void) @@ -595,8 +704,20 @@ efi_get_iobase (void) return 0; } -u32 -efi_mem_type (unsigned long phys_addr) +static struct kern_memdesc * +kern_memory_descriptor (unsigned long phys_addr) +{ + struct kern_memdesc *md; + + for (md = kern_memmap; md->start != ~0UL; md++) { + if (phys_addr - md->start < (md->num_pages << EFI_PAGE_SHIFT)) + return md; + } + return NULL; +} + +static efi_memory_desc_t * +efi_memory_descriptor (unsigned long phys_addr) { void *efi_map_start, *efi_map_end, *p; efi_memory_desc_t *md; @@ -609,59 +730,193 @@ efi_mem_type (unsigned long phys_addr) for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { md = p; - if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) - return md->type; + if (phys_addr - md->phys_addr < efi_md_size(md)) + return md; } - return 0; + return NULL; } -u64 -efi_mem_attributes (unsigned long phys_addr) +static int +efi_memmap_intersects (unsigned long phys_addr, unsigned long size) { void *efi_map_start, *efi_map_end, *p; efi_memory_desc_t *md; u64 efi_desc_size; + unsigned long end; efi_map_start = __va(ia64_boot_param->efi_memmap); efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; efi_desc_size = ia64_boot_param->efi_memdesc_size; + end = phys_addr + size; + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { md = p; - - if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) - return md->attribute; + if (md->phys_addr < end && efi_md_end(md) > phys_addr) + return 1; } return 0; } + +u32 +efi_mem_type (unsigned long phys_addr) +{ + efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); + + if (md) + return md->type; + return 0; +} + +u64 +efi_mem_attributes (unsigned long phys_addr) +{ + efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); + + if (md) + return md->attribute; + return 0; +} EXPORT_SYMBOL(efi_mem_attributes); -int -valid_phys_addr_range (unsigned long phys_addr, unsigned long *size) +u64 +efi_mem_attribute (unsigned long phys_addr, unsigned long size) { - void *efi_map_start, *efi_map_end, *p; - efi_memory_desc_t *md; - u64 efi_desc_size; + unsigned long end = phys_addr + size; + efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); + u64 attr; - efi_map_start = __va(ia64_boot_param->efi_memmap); - efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; - efi_desc_size = ia64_boot_param->efi_memdesc_size; + if (!md) + return 0; - for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { - md = p; + /* + * EFI_MEMORY_RUNTIME is not a memory attribute; it just tells + * the kernel that firmware needs this region mapped. + */ + attr = md->attribute & ~EFI_MEMORY_RUNTIME; + do { + unsigned long md_end = efi_md_end(md); + + if (end <= md_end) + return attr; + + md = efi_memory_descriptor(md_end); + if (!md || (md->attribute & ~EFI_MEMORY_RUNTIME) != attr) + return 0; + } while (md); + return 0; /* never reached */ +} - if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) { - if (!(md->attribute & EFI_MEMORY_WB)) - return 0; +u64 +kern_mem_attribute (unsigned long phys_addr, unsigned long size) +{ + unsigned long end = phys_addr + size; + struct kern_memdesc *md; + u64 attr; - if (*size > md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr) - *size = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr; - return 1; - } + /* + * This is a hack for ioremap calls before we set up kern_memmap. + * Maybe we should do efi_memmap_init() earlier instead. + */ + if (!kern_memmap) { + attr = efi_mem_attribute(phys_addr, size); + if (attr & EFI_MEMORY_WB) + return EFI_MEMORY_WB; + return 0; } + + md = kern_memory_descriptor(phys_addr); + if (!md) + return 0; + + attr = md->attribute; + do { + unsigned long md_end = kmd_end(md); + + if (end <= md_end) + return attr; + + md = kern_memory_descriptor(md_end); + if (!md || md->attribute != attr) + return 0; + } while (md); + return 0; /* never reached */ +} +EXPORT_SYMBOL(kern_mem_attribute); + +int +valid_phys_addr_range (phys_addr_t phys_addr, unsigned long size) +{ + u64 attr; + + /* + * /dev/mem reads and writes use copy_to_user(), which implicitly + * uses a granule-sized kernel identity mapping. It's really + * only safe to do this for regions in kern_memmap. For more + * details, see Documentation/ia64/aliasing.txt. + */ + attr = kern_mem_attribute(phys_addr, size); + if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC) + return 1; return 0; } +int +valid_mmap_phys_addr_range (unsigned long pfn, unsigned long size) +{ + unsigned long phys_addr = pfn << PAGE_SHIFT; + u64 attr; + + attr = efi_mem_attribute(phys_addr, size); + + /* + * /dev/mem mmap uses normal user pages, so we don't need the entire + * granule, but the entire region we're mapping must support the same + * attribute. + */ + if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC) + return 1; + + /* + * Intel firmware doesn't tell us about all the MMIO regions, so + * in general we have to allow mmap requests. But if EFI *does* + * tell us about anything inside this region, we should deny it. + * The user can always map a smaller region to avoid the overlap. + */ + if (efi_memmap_intersects(phys_addr, size)) + return 0; + + return 1; +} + +pgprot_t +phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, + pgprot_t vma_prot) +{ + unsigned long phys_addr = pfn << PAGE_SHIFT; + u64 attr; + + /* + * For /dev/mem mmap, we use user mappings, but if the region is + * in kern_memmap (and hence may be covered by a kernel mapping), + * we must use the same attribute as the kernel mapping. + */ + attr = kern_mem_attribute(phys_addr, size); + if (attr & EFI_MEMORY_WB) + return pgprot_cacheable(vma_prot); + else if (attr & EFI_MEMORY_UC) + return pgprot_noncached(vma_prot); + + /* + * Some chipsets don't support UC access to memory. If + * WB is supported, we prefer that. + */ + if (efi_mem_attribute(phys_addr, size) & EFI_MEMORY_WB) + return pgprot_cacheable(vma_prot); + + return pgprot_noncached(vma_prot); +} + int __init efi_uart_console_only(void) { @@ -701,38 +956,12 @@ efi_uart_console_only(void) return 1; uart = 0; } - hdr = (struct efi_generic_dev_path *) ((u8 *) hdr + hdr->length); + hdr = (struct efi_generic_dev_path *)((u8 *) hdr + hdr->length); } printk(KERN_ERR "Malformed %s value\n", name); return 0; } -#define efi_md_size(md) (md->num_pages << EFI_PAGE_SHIFT) - -static inline u64 -kmd_end(kern_memdesc_t *kmd) -{ - return (kmd->start + (kmd->num_pages << EFI_PAGE_SHIFT)); -} - -static inline u64 -efi_md_end(efi_memory_desc_t *md) -{ - return (md->phys_addr + efi_md_size(md)); -} - -static inline int -efi_wb(efi_memory_desc_t *md) -{ - return (md->attribute & EFI_MEMORY_WB); -} - -static inline int -efi_uc(efi_memory_desc_t *md) -{ - return (md->attribute & EFI_MEMORY_UC); -} - /* * Look for the first granule aligned memory descriptor memory * that is big enough to hold EFI memory map. Make sure this @@ -765,10 +994,12 @@ find_memmap_space (void) if (!efi_wb(md)) { continue; } - if (pmd == NULL || !efi_wb(pmd) || efi_md_end(pmd) != md->phys_addr) { + if (pmd == NULL || !efi_wb(pmd) || + efi_md_end(pmd) != md->phys_addr) { contig_low = GRANULEROUNDUP(md->phys_addr); contig_high = efi_md_end(md); - for (q = p + efi_desc_size; q < efi_map_end; q += efi_desc_size) { + for (q = p + efi_desc_size; q < efi_map_end; + q += efi_desc_size) { check_md = q; if (!efi_wb(check_md)) break; @@ -778,14 +1009,15 @@ find_memmap_space (void) } contig_high = GRANULEROUNDDOWN(contig_high); } - if (!is_available_memory(md) || md->type == EFI_LOADER_DATA) + if (!is_memory_available(md) || md->type == EFI_LOADER_DATA) continue; /* Round ends inward to granule boundaries */ as = max(contig_low, md->phys_addr); ae = min(contig_high, efi_md_end(md)); - /* keep within max_addr= command line arg */ + /* keep within max_addr= and min_addr= command line arg */ + as = max(as, min_addr); ae = min(ae, max_addr); if (ae <= as) continue; @@ -811,10 +1043,10 @@ find_memmap_space (void) * to use. We can allocate partial granules only if the unavailable * parts exist, and are WB. */ -void -efi_memmap_init(unsigned long *s, unsigned long *e) +unsigned long +efi_memmap_init(u64 *s, u64 *e) { - struct kern_memdesc *k, *prev = 0; + struct kern_memdesc *k, *prev = NULL; u64 contig_low=0, contig_high=0; u64 as, ae, lim; void *efi_map_start, *efi_map_end, *p, *q; @@ -831,8 +1063,9 @@ efi_memmap_init(unsigned long *s, unsigned long *e) for (p = efi_map_start; p < efi_map_end; pmd = md, p += efi_desc_size) { md = p; if (!efi_wb(md)) { - if (efi_uc(md) && (md->type == EFI_CONVENTIONAL_MEMORY || - md->type == EFI_BOOT_SERVICES_DATA)) { + if (efi_uc(md) && + (md->type == EFI_CONVENTIONAL_MEMORY || + md->type == EFI_BOOT_SERVICES_DATA)) { k->attribute = EFI_MEMORY_UC; k->start = md->phys_addr; k->num_pages = md->num_pages; @@ -840,10 +1073,12 @@ efi_memmap_init(unsigned long *s, unsigned long *e) } continue; } - if (pmd == NULL || !efi_wb(pmd) || efi_md_end(pmd) != md->phys_addr) { + if (pmd == NULL || !efi_wb(pmd) || + efi_md_end(pmd) != md->phys_addr) { contig_low = GRANULEROUNDUP(md->phys_addr); contig_high = efi_md_end(md); - for (q = p + efi_desc_size; q < efi_map_end; q += efi_desc_size) { + for (q = p + efi_desc_size; q < efi_map_end; + q += efi_desc_size) { check_md = q; if (!efi_wb(check_md)) break; @@ -853,7 +1088,7 @@ efi_memmap_init(unsigned long *s, unsigned long *e) } contig_high = GRANULEROUNDDOWN(contig_high); } - if (!is_available_memory(md)) + if (!is_memory_available(md)) continue; /* @@ -863,13 +1098,17 @@ efi_memmap_init(unsigned long *s, unsigned long *e) if (md->phys_addr < contig_low) { lim = min(efi_md_end(md), contig_low); if (efi_uc(md)) { - if (k > kern_memmap && (k-1)->attribute == EFI_MEMORY_UC && + if (k > kern_memmap && + (k-1)->attribute == EFI_MEMORY_UC && kmd_end(k-1) == md->phys_addr) { - (k-1)->num_pages += (lim - md->phys_addr) >> EFI_PAGE_SHIFT; + (k-1)->num_pages += + (lim - md->phys_addr) + >> EFI_PAGE_SHIFT; } else { k->attribute = EFI_MEMORY_UC; k->start = md->phys_addr; - k->num_pages = (lim - md->phys_addr) >> EFI_PAGE_SHIFT; + k->num_pages = (lim - md->phys_addr) + >> EFI_PAGE_SHIFT; k++; } } @@ -887,7 +1126,8 @@ efi_memmap_init(unsigned long *s, unsigned long *e) } else { k->attribute = EFI_MEMORY_UC; k->start = lim; - k->num_pages = (efi_md_end(md) - lim) >> EFI_PAGE_SHIFT; + k->num_pages = (efi_md_end(md) - lim) + >> EFI_PAGE_SHIFT; k++; } } @@ -895,7 +1135,8 @@ efi_memmap_init(unsigned long *s, unsigned long *e) } else ae = efi_md_end(md); - /* keep within max_addr= command line arg */ + /* keep within max_addr= and min_addr= command line arg */ + as = max(as, min_addr); ae = min(ae, max_addr); if (ae <= as) continue; @@ -922,11 +1163,14 @@ efi_memmap_init(unsigned long *s, unsigned long *e) /* reserve the memory we are using for kern_memmap */ *s = (u64)kern_memmap; *e = (u64)++k; + + return total_mem; } void efi_initialize_iomem_resources(struct resource *code_resource, - struct resource *data_resource) + struct resource *data_resource, + struct resource *bss_resource) { struct resource *res; void *efi_map_start, *efi_map_end, *p; @@ -947,7 +1191,7 @@ efi_initialize_iomem_resources(struct resource *code_resource, if (md->num_pages == 0) /* should not happen */ continue; - flags = IORESOURCE_MEM; + flags = IORESOURCE_MEM | IORESOURCE_BUSY; switch (md->type) { case EFI_MEMORY_MAPPED_IO: @@ -962,19 +1206,19 @@ efi_initialize_iomem_resources(struct resource *code_resource, if (md->attribute & EFI_MEMORY_WP) { name = "System ROM"; flags |= IORESOURCE_READONLY; - } else { + } else if (md->attribute == EFI_MEMORY_UC) + name = "Uncached RAM"; + else name = "System RAM"; - } break; case EFI_ACPI_MEMORY_NVS: name = "ACPI Non-volatile Storage"; - flags |= IORESOURCE_BUSY; break; case EFI_UNUSABLE_MEMORY: name = "reserved"; - flags |= IORESOURCE_BUSY | IORESOURCE_DISABLED; + flags |= IORESOURCE_DISABLED; break; case EFI_RESERVED_TYPE: @@ -983,18 +1227,19 @@ efi_initialize_iomem_resources(struct resource *code_resource, case EFI_ACPI_RECLAIM_MEMORY: default: name = "reserved"; - flags |= IORESOURCE_BUSY; break; } - if ((res = kcalloc(1, sizeof(struct resource), GFP_KERNEL)) == NULL) { - printk(KERN_ERR "failed to alocate resource for iomem\n"); + if ((res = kzalloc(sizeof(struct resource), + GFP_KERNEL)) == NULL) { + printk(KERN_ERR + "failed to allocate resource for iomem\n"); return; } res->name = name; res->start = md->phys_addr; - res->end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; + res->end = md->phys_addr + efi_md_size(md) - 1; res->flags = flags; if (insert_resource(&iomem_resource, res) < 0) @@ -1007,6 +1252,89 @@ efi_initialize_iomem_resources(struct resource *code_resource, */ insert_resource(res, code_resource); insert_resource(res, data_resource); + insert_resource(res, bss_resource); +#ifdef CONFIG_KEXEC + insert_resource(res, &efi_memmap_res); + insert_resource(res, &boot_param_res); + if (crashk_res.end > crashk_res.start) + insert_resource(res, &crashk_res); +#endif + } + } +} + +#ifdef CONFIG_KEXEC +/* find a block of memory aligned to 64M exclude reserved regions + rsvd_regions are sorted + */ +unsigned long __init +kdump_find_rsvd_region (unsigned long size, struct rsvd_region *r, int n) +{ + int i; + u64 start, end; + u64 alignment = 1UL << _PAGE_SIZE_64M; + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + if (!efi_wb(md)) + continue; + start = ALIGN(md->phys_addr, alignment); + end = efi_md_end(md); + for (i = 0; i < n; i++) { + if (__pa(r[i].start) >= start && __pa(r[i].end) < end) { + if (__pa(r[i].start) > start + size) + return start; + start = ALIGN(__pa(r[i].end), alignment); + if (i < n-1 && + __pa(r[i+1].start) < start + size) + continue; + else + break; + } + } + if (end > start + size) + return start; + } + + printk(KERN_WARNING + "Cannot reserve 0x%lx byte of memory for crashdump\n", size); + return ~0UL; +} +#endif + +#ifdef CONFIG_CRASH_DUMP +/* locate the size find a the descriptor at a certain address */ +unsigned long __init +vmcore_find_descriptor_size (unsigned long address) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + unsigned long ret = 0; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + if (efi_wb(md) && md->type == EFI_LOADER_DATA + && md->phys_addr == address) { + ret = efi_md_size(md); + break; } } + + if (ret == 0) + printk(KERN_WARNING "Cannot locate EFI vmcore descriptor\n"); + + return ret; } +#endif diff --git a/arch/ia64/kernel/efi_stub.S b/arch/ia64/kernel/efi_stub.S index 5a7fe70212a..a56e161d751 100644 --- a/arch/ia64/kernel/efi_stub.S +++ b/arch/ia64/kernel/efi_stub.S @@ -61,7 +61,7 @@ GLOBAL_ENTRY(efi_call_phys) or loc3=loc3,r17 mov b6=r2 ;; - andcm r16=loc3,r16 // get psr with IT, DT, and RT bits cleared + andcm r16=loc3,r16 // get psr with IT, DT, and RT bits cleared br.call.sptk.many rp=ia64_switch_mode_phys .ret0: mov out4=in5 mov out0=in1 diff --git a/arch/ia64/kernel/elfcore.c b/arch/ia64/kernel/elfcore.c new file mode 100644 index 00000000000..04bc8fd5f89 --- /dev/null +++ b/arch/ia64/kernel/elfcore.c @@ -0,0 +1,76 @@ +#include <linux/elf.h> +#include <linux/coredump.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/elf.h> + + +Elf64_Half elf_core_extra_phdrs(void) +{ + return GATE_EHDR->e_phnum; +} + +int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + Elf64_Off ofs = 0; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + struct elf_phdr phdr = gate_phdrs[i]; + + if (phdr.p_type == PT_LOAD) { + phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); + phdr.p_filesz = phdr.p_memsz; + if (ofs == 0) { + ofs = phdr.p_offset = offset; + offset += phdr.p_filesz; + } else { + phdr.p_offset = ofs; + } + } else { + phdr.p_offset += ofs; + } + phdr.p_paddr = 0; /* match other core phdrs */ + if (!dump_emit(cprm, &phdr, sizeof(phdr))) + return 0; + } + return 1; +} + +int elf_core_write_extra_data(struct coredump_params *cprm) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + if (gate_phdrs[i].p_type == PT_LOAD) { + void *addr = (void *)gate_phdrs[i].p_vaddr; + size_t memsz = PAGE_ALIGN(gate_phdrs[i].p_memsz); + + if (!dump_emit(cprm, addr, memsz)) + return 0; + break; + } + } + return 1; +} + +size_t elf_core_extra_data_size(void) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + size_t size = 0; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + if (gate_phdrs[i].p_type == PT_LOAD) { + size += PAGE_ALIGN(gate_phdrs[i].p_memsz); + break; + } + } + return size; +} diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 0741b066b98..ba3d03503e8 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1,5 +1,5 @@ /* - * ia64/kernel/entry.S + * arch/ia64/kernel/entry.S * * Kernel entry points. * @@ -23,6 +23,11 @@ * 11/07/2000 */ /* + * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * pv_ops. + */ +/* * Global (preserved) predicate usage on syscall entry/exit path: * * pKStk: See entry.h. @@ -31,7 +36,6 @@ * pNonSys: !pSys */ -#include <linux/config.h> #include <asm/asmmacro.h> #include <asm/cache.h> @@ -43,9 +47,11 @@ #include <asm/processor.h> #include <asm/thread_info.h> #include <asm/unistd.h> +#include <asm/ftrace.h> #include "minstate.h" +#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE /* * execve() is special because in case of success, we need to * setup a null register window frame. @@ -55,31 +61,20 @@ ENTRY(ia64_execve) * Allocate 8 input registers since ptrace() may clobber them */ .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) - alloc loc1=ar.pfs,8,2,4,0 + alloc loc1=ar.pfs,8,2,3,0 mov loc0=rp .body mov out0=in0 // filename ;; // stop bit between alloc and call mov out1=in1 // argv mov out2=in2 // envp - add out3=16,sp // regs br.call.sptk.many rp=sys_execve .ret0: -#ifdef CONFIG_IA32_SUPPORT - /* - * Check if we're returning to ia32 mode. If so, we need to restore ia32 registers - * from pt_regs. - */ - adds r16=PT(CR_IPSR)+16,sp - ;; - ld8 r16=[r16] -#endif cmp4.ge p6,p7=r8,r0 mov ar.pfs=loc1 // restore ar.pfs sxt4 r8=r8 // return 64-bit result ;; stf.spill [sp]=f0 -(p6) cmp.ne pKStk,pUStk=r0,r0 // a successful execve() lands us in user-mode... mov rp=loc0 (p6) mov ar.pfs=r0 // clear ar.pfs on success (p7) br.ret.sptk.many rp @@ -102,12 +97,6 @@ ENTRY(ia64_execve) ldf.fill f23=[sp]; ldf.fill f24=[sp]; mov f25=f0 ldf.fill f26=[sp]; ldf.fill f27=[sp]; mov f28=f0 ldf.fill f29=[sp]; ldf.fill f30=[sp]; mov f31=f0 -#ifdef CONFIG_IA32_SUPPORT - tbit.nz p6,p0=r16, IA64_PSR_IS_BIT - movl loc0=ia64_ret_from_ia32_execve - ;; -(p6) mov rp=loc0 -#endif br.ret.sptk.many rp END(ia64_execve) @@ -127,13 +116,12 @@ GLOBAL_ENTRY(sys_clone2) mov loc1=r16 // save ar.pfs across do_fork .body mov out1=in1 - mov out3=in2 + mov out2=in2 tbit.nz p6,p0=in0,CLONE_SETTLS_BIT - mov out4=in3 // parent_tidptr: valid only w/CLONE_PARENT_SETTID + mov out3=in3 // parent_tidptr: valid only w/CLONE_PARENT_SETTID ;; (p6) st8 [r2]=in5 // store TLS in r16 for copy_thread() - mov out5=in4 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID - adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = ®s + mov out4=in4 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID mov out0=in0 // out0 = clone_flags br.call.sptk.many rp=do_fork .ret1: .restore sp @@ -159,13 +147,12 @@ GLOBAL_ENTRY(sys_clone) mov loc1=r16 // save ar.pfs across do_fork .body mov out1=in1 - mov out3=16 // stacksize (compensates for 16-byte scratch area) + mov out2=16 // stacksize (compensates for 16-byte scratch area) tbit.nz p6,p0=in0,CLONE_SETTLS_BIT - mov out4=in2 // parent_tidptr: valid only w/CLONE_PARENT_SETTID + mov out3=in2 // parent_tidptr: valid only w/CLONE_PARENT_SETTID ;; (p6) st8 [r2]=in4 // store TLS in r13 (tp) - mov out5=in3 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID - adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = ®s + mov out4=in3 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID mov out0=in0 // out0 = clone_flags br.call.sptk.many rp=do_fork .ret2: .restore sp @@ -174,6 +161,7 @@ GLOBAL_ENTRY(sys_clone) mov rp=loc0 br.ret.sptk.many rp END(sys_clone) +#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ /* * prev_task <- ia64_switch_to(struct task_struct *next) @@ -181,7 +169,7 @@ END(sys_clone) * called. The code starting at .map relies on this. The rest of the code * doesn't care about the interrupt masking status. */ -GLOBAL_ENTRY(ia64_switch_to) +GLOBAL_ENTRY(__paravirt_switch_to) .prologue alloc r16=ar.pfs,1,0,0,0 DO_SAVE_SWITCH_STACK @@ -205,7 +193,7 @@ GLOBAL_ENTRY(ia64_switch_to) ;; .done: ld8 sp=[r21] // load kernel stack pointer of new task - mov IA64_KR(CURRENT)=in0 // update "current" application register + MOV_TO_KR(CURRENT, in0, r8, r9) // update "current" application register mov r8=r13 // return pointer to previously running task mov r13=in0 // set "current" pointer ;; @@ -217,26 +205,25 @@ GLOBAL_ENTRY(ia64_switch_to) br.ret.sptk.many rp // boogie on out in new context .map: - rsm psr.ic // interrupts (psr.i) are already disabled here + RSM_PSR_IC(r25) // interrupts (psr.i) are already disabled here movl r25=PAGE_KERNEL ;; srlz.d or r23=r25,r20 // construct PA | page properties mov r25=IA64_GRANULE_SHIFT<<2 ;; - mov cr.itir=r25 - mov cr.ifa=in0 // VA of next task... + MOV_TO_ITIR(p0, r25, r8) + MOV_TO_IFA(in0, r8) // VA of next task... ;; mov r25=IA64_TR_CURRENT_STACK - mov IA64_KR(CURRENT_STACK)=r26 // remember last page we mapped... + MOV_TO_KR(CURRENT_STACK, r26, r8, r9) // remember last page we mapped... ;; itr.d dtr[r25]=r23 // wire in new mapping... - ssm psr.ic // reenable the psr.ic bit - ;; - srlz.d + SSM_PSR_IC_AND_SRLZ_D(r8, r9) // reenable the psr.ic bit br.cond.sptk .done -END(ia64_switch_to) +END(__paravirt_switch_to) +#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE /* * Note that interrupts are enabled during save_switch_stack and load_switch_stack. This * means that we may get an interrupt with "sp" pointing to the new kernel stack while @@ -376,7 +363,7 @@ END(save_switch_stack) * - b7 holds address to return to * - must not touch r8-r11 */ -ENTRY(load_switch_stack) +GLOBAL_ENTRY(load_switch_stack) .prologue .altrp b7 @@ -493,18 +480,6 @@ GLOBAL_ENTRY(prefetch_stack) br.ret.sptk.many rp END(prefetch_stack) -GLOBAL_ENTRY(execve) - mov r15=__NR_execve // put syscall number in place - break __BREAK_SYSCALL - br.ret.sptk.many rp -END(execve) - -GLOBAL_ENTRY(clone) - mov r15=__NR_clone // put syscall number in place - break __BREAK_SYSCALL - br.ret.sptk.many rp -END(clone) - /* * Invoke a system call, but do some tracing before and after the call. * We MUST preserve the current register frame throughout this routine @@ -529,6 +504,11 @@ GLOBAL_ENTRY(ia64_trace_syscall) stf.spill [r16]=f10 stf.spill [r17]=f11 br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args + cmp.lt p6,p0=r8,r0 // check tracehook + adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8 + adds r3=PT(R10)+16,sp // r3 = &pt_regs.r10 + mov r10=0 +(p6) br.cond.sptk strace_error // syscall failed -> adds r16=PT(F6)+16,sp adds r17=PT(F7)+16,sp ;; @@ -569,7 +549,10 @@ GLOBAL_ENTRY(ia64_trace_syscall) .mem.offset 0,0; st8.spill [r2]=r8 // store return value in slot for r8 .mem.offset 8,0; st8.spill [r3]=r10 // clear error indication in slot for r10 br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value -.ret3: br.cond.sptk .work_pending_syscall_end +.ret3: +(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk +(pUStk) rsm psr.i // disable interrupts + br.cond.sptk ia64_work_pending_syscall_end strace_error: ld8 r3=[r2] // load pt_regs.r8 @@ -600,6 +583,27 @@ GLOBAL_ENTRY(ia64_strace_leave_kernel) .ret4: br.cond.sptk ia64_leave_kernel END(ia64_strace_leave_kernel) +ENTRY(call_payload) + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(0) + /* call the kernel_thread payload; fn is in r4, arg - in r5 */ + alloc loc1=ar.pfs,0,3,1,0 + mov loc0=rp + mov loc2=gp + mov out0=r5 // arg + ld8 r14 = [r4], 8 // fn.address + ;; + mov b6 = r14 + ld8 gp = [r4] // fn.gp + ;; + br.call.sptk.many rp=b6 // fn(arg) +.ret12: mov gp=loc2 + mov rp=loc0 + mov ar.pfs=loc1 + /* ... and if it has returned, we are going to userland */ + cmp.ne pKStk,pUStk=r0,r0 + br.ret.sptk.many rp +END(call_payload) + GLOBAL_ENTRY(ia64_ret_from_clone) PT_REGS_UNWIND_INFO(0) { /* @@ -616,6 +620,7 @@ GLOBAL_ENTRY(ia64_ret_from_clone) br.call.sptk.many rp=ia64_invoke_schedule_tail } .ret8: +(pKStk) br.call.sptk.many rp=call_payload adds r2=TI_FLAGS+IA64_TASK_SIZE,r13 ;; ld4 r2=[r2] @@ -634,8 +639,17 @@ GLOBAL_ENTRY(ia64_ret_from_syscall) adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8 mov r10=r0 // clear error indication in r10 (p7) br.cond.spnt handle_syscall_error // handle potential syscall failure +#ifdef CONFIG_PARAVIRT + ;; + br.cond.sptk.few ia64_leave_syscall + ;; +#endif /* CONFIG_PARAVIRT */ END(ia64_ret_from_syscall) +#ifndef CONFIG_PARAVIRT // fall through +#endif +#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ + /* * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't * need to switch to bank 0 and doesn't restore the scratch registers. @@ -680,7 +694,7 @@ END(ia64_ret_from_syscall) * ar.csd: cleared * ar.ssd: cleared */ -ENTRY(ia64_leave_syscall) +GLOBAL_ENTRY(__paravirt_leave_syscall) PT_REGS_UNWIND_INFO(0) /* * work.need_resched etc. mustn't get changed by this CPU before it returns to @@ -690,11 +704,11 @@ ENTRY(ia64_leave_syscall) * extra work. We always check for extra work when returning to user-level. * With CONFIG_PREEMPT, we also check for extra work when the preempt_count * is 0. After extra work processing has been completed, execution - * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check + * resumes at ia64_work_processed_syscall with p6 set to 1 if the extra-work-check * needs to be redone. */ #ifdef CONFIG_PREEMPT - rsm psr.i // disable interrupts + RSM_PSR_I(p0, r2, r18) // disable interrupts cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall (pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 ;; @@ -704,11 +718,22 @@ ENTRY(ia64_leave_syscall) ;; cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0) #else /* !CONFIG_PREEMPT */ -(pUStk) rsm psr.i + RSM_PSR_I(pUStk, r2, r18) cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall (pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk #endif -.work_processed_syscall: +.global __paravirt_work_processed_syscall; +__paravirt_work_processed_syscall: +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + adds r2=PT(LOADRS)+16,r12 + MOV_FROM_ITC(pUStk, p9, r22, r19) // fetch time at leave + adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 + ;; +(p6) ld4 r31=[r18] // load current_thread_info()->flags + ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs" + adds r3=PT(AR_BSPSTORE)+16,r12 // deferred + ;; +#else adds r2=PT(LOADRS)+16,r12 adds r3=PT(AR_BSPSTORE)+16,r12 adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 @@ -717,6 +742,7 @@ ENTRY(ia64_leave_syscall) ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs" nop.i 0 ;; +#endif mov r16=ar.bsp // M2 get existing backing store pointer ld8 r18=[r2],PT(R9)-PT(B6) // load b6 (p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE? @@ -731,19 +757,28 @@ ENTRY(ia64_leave_syscall) (pNonSys) break 0 // bug check: we shouldn't be here if pNonSys is TRUE! ;; invala // M0|1 invalidate ALAT - rsm psr.i | psr.ic // M2 turn off interrupts and interruption collection + RSM_PSR_I_IC(r28, r29, r30) // M2 turn off interrupts and interruption collection cmp.eq p9,p0=r0,r0 // A set p9 to indicate that we should restore cr.ifs ld8 r29=[r2],16 // M0|1 load cr.ipsr ld8 r28=[r3],16 // M0|1 load cr.iip +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +(pUStk) add r14=TI_AC_LEAVE+IA64_TASK_SIZE,r13 + ;; + ld8 r30=[r2],16 // M0|1 load cr.ifs + ld8 r25=[r3],16 // M0|1 load ar.unat +(pUStk) add r15=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 + ;; +#else mov r22=r0 // A clear r22 ;; ld8 r30=[r2],16 // M0|1 load cr.ifs ld8 r25=[r3],16 // M0|1 load ar.unat (pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 ;; +#endif ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs -(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled + MOV_FROM_PSR(pKStk, r22, r21) // M2 read PSR now that interrupts are disabled nop 0 ;; ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // M0|1 load b0 @@ -758,7 +793,11 @@ ENTRY(ia64_leave_syscall) ld8.fill r1=[r3],16 // M0|1 load r1 (pUStk) mov r17=1 // A ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +(pUStk) st1 [r15]=r17 // M2|3 +#else (pUStk) st1 [r14]=r17 // M2|3 +#endif ld8.fill r13=[r3],16 // M0|1 mov f8=f0 // F clear f8 ;; @@ -766,21 +805,30 @@ ENTRY(ia64_leave_syscall) ld8.fill r15=[r3] // M0|1 restore r15 mov b6=r18 // I0 restore b6 - addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 // A + LOAD_PHYS_STACK_REG_SIZE(r17) mov f9=f0 // F clear f9 (pKStk) br.cond.dpnt.many skip_rbs_switch // B srlz.d // M0 ensure interruption collection is off (for cover) shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition - cover // B add current frame into dirty partition & set cr.ifs + COVER // B add current frame into dirty partition & set cr.ifs + ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + mov r19=ar.bsp // M2 get new backing store pointer + st8 [r14]=r22 // M save time at leave + mov f10=f0 // F clear f10 + + mov r22=r0 // A clear r22 + movl r14=__kernel_syscall_via_epc // X ;; -(pUStk) ld4 r17=[r17] // M0|1 r17 = cpu_data->phys_stacked_size_p8 +#else mov r19=ar.bsp // M2 get new backing store pointer mov f10=f0 // F clear f10 nop.m 0 movl r14=__kernel_syscall_via_epc // X ;; +#endif mov.m ar.csd=r0 // M2 clear ar.csd mov.m ar.ccv=r0 // M2 clear ar.ccv mov b7=r14 // I0 clear b7 (hint with __kernel_syscall_via_epc) @@ -788,22 +836,9 @@ ENTRY(ia64_leave_syscall) mov.m ar.ssd=r0 // M2 clear ar.ssd mov f11=f0 // F clear f11 br.cond.sptk.many rbs_switch // B -END(ia64_leave_syscall) +END(__paravirt_leave_syscall) -#ifdef CONFIG_IA32_SUPPORT -GLOBAL_ENTRY(ia64_ret_from_ia32_execve) - PT_REGS_UNWIND_INFO(0) - adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8 - adds r3=PT(R10)+16,sp // r3 = &pt_regs.r10 - ;; - .mem.offset 0,0 - st8.spill [r2]=r8 // store return value in slot for r8 and set unat bit - .mem.offset 8,0 - st8.spill [r3]=r0 // clear error indication in slot for r10 and set unat bit -END(ia64_ret_from_ia32_execve) - // fall through -#endif /* CONFIG_IA32_SUPPORT */ -GLOBAL_ENTRY(ia64_leave_kernel) +GLOBAL_ENTRY(__paravirt_leave_kernel) PT_REGS_UNWIND_INFO(0) /* * work.need_resched etc. mustn't get changed by this CPU before it returns to @@ -817,7 +852,7 @@ GLOBAL_ENTRY(ia64_leave_kernel) * needs to be redone. */ #ifdef CONFIG_PREEMPT - rsm psr.i // disable interrupts + RSM_PSR_I(p0, r17, r31) // disable interrupts cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel (pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 ;; @@ -827,7 +862,7 @@ GLOBAL_ENTRY(ia64_leave_kernel) ;; cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0) #else -(pUStk) rsm psr.i + RSM_PSR_I(pUStk, r17, r31) cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel (pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk #endif @@ -875,7 +910,7 @@ GLOBAL_ENTRY(ia64_leave_kernel) mov ar.csd=r30 mov ar.ssd=r31 ;; - rsm psr.i | psr.ic // initiate turning off of interrupt and interruption collection + RSM_PSR_I_IC(r23, r22, r25) // initiate turning off of interrupt and interruption collection invala // invalidate ALAT ;; ld8.fill r22=[r2],24 @@ -907,16 +942,24 @@ GLOBAL_ENTRY(ia64_leave_kernel) mov ar.ccv=r15 ;; ldf.fill f11=[r2] - bsw.0 // switch back to bank 0 (no stop bit required beforehand...) + BSW_0(r2, r3, r15) // switch back to bank 0 (no stop bit required beforehand...) ;; (pUStk) mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency) adds r16=PT(CR_IPSR)+16,r12 adds r17=PT(CR_IIP)+16,r12 -(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + .pred.rel.mutex pUStk,pKStk + MOV_FROM_PSR(pKStk, r22, r29) // M2 read PSR now that interrupts are disabled + MOV_FROM_ITC(pUStk, p9, r22, r29) // M fetch time at leave + nop.i 0 + ;; +#else + MOV_FROM_PSR(pKStk, r22, r29) // M2 read PSR now that interrupts are disabled nop.i 0 nop.i 0 ;; +#endif ld8 r29=[r16],16 // load cr.ipsr ld8 r28=[r17],16 // load cr.iip ;; @@ -938,24 +981,45 @@ GLOBAL_ENTRY(ia64_leave_kernel) ;; ld8.fill r12=[r16],16 ld8.fill r13=[r17],16 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +(pUStk) adds r3=TI_AC_LEAVE+IA64_TASK_SIZE,r18 +#else (pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18 +#endif ;; ld8 r20=[r16],16 // ar.fpsr ld8.fill r15=[r17],16 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +(pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18 // deferred +#endif ;; ld8.fill r14=[r16],16 ld8.fill r2=[r17] (pUStk) mov r17=1 ;; - ld8.fill r3=[r16] +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + // mmi_ : ld8 st1 shr;; mmi_ : st8 st1 shr;; + // mib : mov add br -> mib : ld8 add br + // bbb_ : br nop cover;; mbb_ : mov br cover;; + // + // no one require bsp in r16 if (pKStk) branch is selected. +(pUStk) st8 [r3]=r22 // save time at leave (pUStk) st1 [r18]=r17 // restore current->thread.on_ustack shr.u r18=r19,16 // get byte size of existing "dirty" partition ;; + ld8.fill r3=[r16] // deferred + LOAD_PHYS_STACK_REG_SIZE(r17) +(pKStk) br.cond.dpnt skip_rbs_switch mov r16=ar.bsp // get existing backing store pointer - addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 +#else + ld8.fill r3=[r16] +(pUStk) st1 [r18]=r17 // restore current->thread.on_ustack + shr.u r18=r19,16 // get byte size of existing "dirty" partition ;; - ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8 + mov r16=ar.bsp // get existing backing store pointer + LOAD_PHYS_STACK_REG_SIZE(r17) (pKStk) br.cond.dpnt skip_rbs_switch +#endif /* * Restore user backing store. @@ -963,7 +1027,7 @@ GLOBAL_ENTRY(ia64_leave_kernel) * NOTE: alloc, loadrs, and cover can't be predicated. */ (pNonSys) br.cond.dpnt dont_preserve_current_frame - cover // add current frame into dirty partition and set cr.ifs + COVER // add current frame into dirty partition and set cr.ifs ;; mov r19=ar.bsp // get new backing store pointer rbs_switch: @@ -1066,16 +1130,16 @@ skip_rbs_switch: (pKStk) dep r29=r22,r29,21,1 // I0 update ipsr.pp with psr.pp (pLvSys)mov r16=r0 // A clear r16 for leave_syscall, no-op otherwise ;; - mov cr.ipsr=r29 // M2 + MOV_TO_IPSR(p0, r29, r25) // M2 mov ar.pfs=r26 // I0 (pLvSys)mov r17=r0 // A clear r17 for leave_syscall, no-op otherwise -(p9) mov cr.ifs=r30 // M2 + MOV_TO_IFS(p9, r30, r25)// M2 mov b0=r21 // I0 (pLvSys)mov r18=r0 // A clear r18 for leave_syscall, no-op otherwise mov ar.fpsr=r20 // M2 - mov cr.iip=r28 // M2 + MOV_TO_IIP(r28, r25) // M2 nop 0 ;; (pUStk) mov ar.rnat=r24 // M2 must happen with RSE in lazy mode @@ -1084,7 +1148,7 @@ skip_rbs_switch: mov ar.rsc=r27 // M2 mov pr=r31,-1 // I0 - rfi // B + RFI // B /* * On entry: @@ -1092,6 +1156,9 @@ skip_rbs_switch: * r31 = current->thread_info->flags * On exit: * p6 = TRUE if work-pending-check needs to be redone + * + * Interrupts are disabled on entry, reenabled depend on work, and + * disabled on exit. */ .work_pending_syscall: add r2=-8,r2 @@ -1100,56 +1167,30 @@ skip_rbs_switch: st8 [r2]=r8 st8 [r3]=r10 .work_pending: - tbit.nz p6,p0=r31,TIF_SIGDELAYED // signal delayed from MCA/INIT/NMI/PMI context? -(p6) br.cond.sptk.few .sigdelayed - ;; - tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? + tbit.z p6,p0=r31,TIF_NEED_RESCHED // is resched not needed? (p6) br.cond.sptk.few .notify -#ifdef CONFIG_PREEMPT -(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1 - ;; -(pKStk) st4 [r20]=r21 - ssm psr.i // enable interrupts -#endif - br.call.spnt.many rp=schedule -.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1 - rsm psr.i // disable interrupts - ;; -#ifdef CONFIG_PREEMPT -(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 - ;; -(pKStk) st4 [r20]=r0 // preempt_count() <- 0 -#endif -(pLvSys)br.cond.sptk.few .work_pending_syscall_end - br.cond.sptk.many .work_processed_kernel // re-check + br.call.spnt.many rp=preempt_schedule_irq +.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1 (re-check) +(pLvSys)br.cond.sptk.few __paravirt_pending_syscall_end + br.cond.sptk.many .work_processed_kernel .notify: (pUStk) br.call.spnt.many rp=notify_resume_user -.ret10: cmp.ne p6,p0=r0,r0 // p6 <- 0 -(pLvSys)br.cond.sptk.few .work_pending_syscall_end - br.cond.sptk.many .work_processed_kernel // don't re-check - -// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where -// it could not be delivered. Deliver it now. The signal might be for us and -// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed -// signal. - -.sigdelayed: - br.call.sptk.many rp=do_sigdelayed - cmp.eq p6,p0=r0,r0 // p6 <- 1, always re-check -(pLvSys)br.cond.sptk.few .work_pending_syscall_end - br.cond.sptk.many .work_processed_kernel // re-check - -.work_pending_syscall_end: +.ret10: cmp.ne p6,p0=r0,r0 // p6 <- 0 (don't re-check) +(pLvSys)br.cond.sptk.few __paravirt_pending_syscall_end + br.cond.sptk.many .work_processed_kernel + +.global __paravirt_pending_syscall_end; +__paravirt_pending_syscall_end: adds r2=PT(R8)+16,r12 adds r3=PT(R10)+16,r12 ;; ld8 r8=[r2] ld8 r10=[r3] - br.cond.sptk.many .work_processed_syscall // re-check - -END(ia64_leave_kernel) + br.cond.sptk.many __paravirt_work_processed_syscall_target +END(__paravirt_leave_kernel) +#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE ENTRY(handle_syscall_error) /* * Some system calls (e.g., ptrace, mmap) can return arbitrary values which could @@ -1184,11 +1225,14 @@ GLOBAL_ENTRY(ia64_invoke_schedule_tail) END(ia64_invoke_schedule_tail) /* - * Setup stack and call do_notify_resume_user(). Note that pSys and pNonSys need to - * be set up by the caller. We declare 8 input registers so the system call - * args get preserved, in case we need to restart a system call. + * Setup stack and call do_notify_resume_user(), keeping interrupts + * disabled. + * + * Note that pSys and pNonSys need to be set up by the caller. + * We declare 8 input registers so the system call args get preserved, + * in case we need to restart a system call. */ -ENTRY(notify_resume_user) +GLOBAL_ENTRY(notify_resume_user) .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart! mov r9=ar.unat @@ -1215,32 +1259,6 @@ ENTRY(notify_resume_user) br.ret.sptk.many rp END(notify_resume_user) -GLOBAL_ENTRY(sys_rt_sigsuspend) - .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) - alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart! - mov r9=ar.unat - mov loc0=rp // save return address - mov out0=in0 // mask - mov out1=in1 // sigsetsize - adds out2=8,sp // out2=&sigscratch->ar_pfs - ;; - .fframe 16 - .spillsp ar.unat, 16 - st8 [sp]=r9,-16 // allocate space for ar.unat and save it - st8 [out2]=loc1,-8 // save ar.pfs, out2=&sigscratch - .body - br.call.sptk.many rp=ia64_rt_sigsuspend -.ret17: .restore sp - adds sp=16,sp // pop scratch stack space - ;; - ld8 r9=[sp] // load new unat from sw->caller_unat - mov rp=loc0 - ;; - mov ar.unat=r9 - mov ar.pfs=loc1 - br.ret.sptk.many rp -END(sys_rt_sigsuspend) - ENTRY(sys_rt_sigreturn) PT_REGS_UNWIND_INFO(0) /* @@ -1276,7 +1294,7 @@ ENTRY(sys_rt_sigreturn) adds sp=16,sp ;; ld8 r9=[sp] // load new ar.unat - mov.sptk b7=r8,ia64_leave_kernel + mov.sptk b7=r8,ia64_native_leave_kernel ;; mov ar.unat=r9 br.many b7 @@ -1340,6 +1358,105 @@ GLOBAL_ENTRY(unw_init_running) br.ret.sptk.many rp END(unw_init_running) +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE +GLOBAL_ENTRY(_mcount) + br ftrace_stub +END(_mcount) + +.here: + br.ret.sptk.many b0 + +GLOBAL_ENTRY(ftrace_caller) + alloc out0 = ar.pfs, 8, 0, 4, 0 + mov out3 = r0 + ;; + mov out2 = b0 + add r3 = 0x20, r3 + mov out1 = r1; + br.call.sptk.many b0 = ftrace_patch_gp + //this might be called from module, so we must patch gp +ftrace_patch_gp: + movl gp=__gp + mov b0 = r3 + ;; +.global ftrace_call; +ftrace_call: +{ + .mlx + nop.m 0x0 + movl r3 = .here;; +} + alloc loc0 = ar.pfs, 4, 4, 2, 0 + ;; + mov loc1 = b0 + mov out0 = b0 + mov loc2 = r8 + mov loc3 = r15 + ;; + adds out0 = -MCOUNT_INSN_SIZE, out0 + mov out1 = in2 + mov b6 = r3 + + br.call.sptk.many b0 = b6 + ;; + mov ar.pfs = loc0 + mov b0 = loc1 + mov r8 = loc2 + mov r15 = loc3 + br ftrace_stub + ;; +END(ftrace_caller) + +#else +GLOBAL_ENTRY(_mcount) + movl r2 = ftrace_stub + movl r3 = ftrace_trace_function;; + ld8 r3 = [r3];; + ld8 r3 = [r3];; + cmp.eq p7,p0 = r2, r3 +(p7) br.sptk.many ftrace_stub + ;; + + alloc loc0 = ar.pfs, 4, 4, 2, 0 + ;; + mov loc1 = b0 + mov out0 = b0 + mov loc2 = r8 + mov loc3 = r15 + ;; + adds out0 = -MCOUNT_INSN_SIZE, out0 + mov out1 = in2 + mov b6 = r3 + + br.call.sptk.many b0 = b6 + ;; + mov ar.pfs = loc0 + mov b0 = loc1 + mov r8 = loc2 + mov r15 = loc3 + br ftrace_stub + ;; +END(_mcount) +#endif + +GLOBAL_ENTRY(ftrace_stub) + mov r3 = b0 + movl r2 = _mcount_ret_helper + ;; + mov b6 = r2 + mov b7 = r3 + br.ret.sptk.many b6 + +_mcount_ret_helper: + mov b0 = r42 + mov r1 = r41 + mov ar.pfs = r40 + br b7 +END(ftrace_stub) + +#endif /* CONFIG_FUNCTION_TRACER */ + .rodata .align 8 .globl sys_call_table @@ -1378,7 +1495,7 @@ sys_call_table: data8 sys_mkdir // 1055 data8 sys_rmdir data8 sys_dup - data8 sys_pipe + data8 sys_ia64_pipe data8 sys_times data8 ia64_brk // 1060 data8 sys_setgid @@ -1489,7 +1606,7 @@ sys_call_table: data8 sys_sched_get_priority_min data8 sys_sched_rr_get_interval data8 sys_nanosleep - data8 sys_nfsservctl + data8 sys_ni_syscall // old nfsservctl data8 sys_prctl // 1170 data8 sys_getpagesize data8 sys_mmap2 @@ -1588,7 +1705,7 @@ sys_call_table: data8 sys_mq_timedreceive // 1265 data8 sys_mq_notify data8 sys_mq_getsetattr - data8 sys_ni_syscall // reserved for kexec_load + data8 sys_kexec_load data8 sys_ni_syscall // reserved for vserver data8 sys_waitid // 1270 data8 sys_add_key @@ -1596,9 +1713,69 @@ sys_call_table: data8 sys_keyctl data8 sys_ioprio_set data8 sys_ioprio_get // 1275 - data8 sys_ni_syscall + data8 sys_move_pages data8 sys_inotify_init data8 sys_inotify_add_watch data8 sys_inotify_rm_watch + data8 sys_migrate_pages // 1280 + data8 sys_openat + data8 sys_mkdirat + data8 sys_mknodat + data8 sys_fchownat + data8 sys_futimesat // 1285 + data8 sys_newfstatat + data8 sys_unlinkat + data8 sys_renameat + data8 sys_linkat + data8 sys_symlinkat // 1290 + data8 sys_readlinkat + data8 sys_fchmodat + data8 sys_faccessat + data8 sys_pselect6 + data8 sys_ppoll // 1295 + data8 sys_unshare + data8 sys_splice + data8 sys_set_robust_list + data8 sys_get_robust_list + data8 sys_sync_file_range // 1300 + data8 sys_tee + data8 sys_vmsplice + data8 sys_fallocate + data8 sys_getcpu + data8 sys_epoll_pwait // 1305 + data8 sys_utimensat + data8 sys_signalfd + data8 sys_ni_syscall + data8 sys_eventfd + data8 sys_timerfd_create // 1310 + data8 sys_timerfd_settime + data8 sys_timerfd_gettime + data8 sys_signalfd4 + data8 sys_eventfd2 + data8 sys_epoll_create1 // 1315 + data8 sys_dup3 + data8 sys_pipe2 + data8 sys_inotify_init1 + data8 sys_preadv + data8 sys_pwritev // 1320 + data8 sys_rt_tgsigqueueinfo + data8 sys_recvmmsg + data8 sys_fanotify_init + data8 sys_fanotify_mark + data8 sys_prlimit64 // 1325 + data8 sys_name_to_handle_at + data8 sys_open_by_handle_at + data8 sys_clock_adjtime + data8 sys_syncfs + data8 sys_setns // 1330 + data8 sys_sendmmsg + data8 sys_process_vm_readv + data8 sys_process_vm_writev + data8 sys_accept4 + data8 sys_finit_module // 1335 + data8 sys_sched_setattr + data8 sys_sched_getattr + data8 sys_renameat2 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls +#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ diff --git a/arch/ia64/kernel/entry.h b/arch/ia64/kernel/entry.h index 78eeb079341..b83edac0296 100644 --- a/arch/ia64/kernel/entry.h +++ b/arch/ia64/kernel/entry.h @@ -1,4 +1,3 @@ -#include <linux/config.h> /* * Preserved registers that are shared between code in ivt.S and @@ -23,6 +22,7 @@ #define PT(f) (IA64_PT_REGS_##f##_OFFSET) #define SW(f) (IA64_SWITCH_STACK_##f##_OFFSET) +#define SOS(f) (IA64_SAL_OS_STATE_##f##_OFFSET) #define PT_REGS_SAVES(off) \ .unwabi 3, 'i'; \ diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c new file mode 100644 index 00000000000..0c161ed6d18 --- /dev/null +++ b/arch/ia64/kernel/err_inject.c @@ -0,0 +1,314 @@ +/* + * err_inject.c - + * 1.) Inject errors to a processor. + * 2.) Query error injection capabilities. + * This driver along with user space code can be acting as an error + * injection tool. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Written by: Fenghua Yu <fenghua.yu@intel.com>, Intel Corporation + * Copyright (C) 2006, Intel Corp. All rights reserved. + * + */ +#include <linux/device.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/module.h> + +#define ERR_INJ_DEBUG + +#define ERR_DATA_BUFFER_SIZE 3 // Three 8-byte; + +#define define_one_ro(name) \ +static DEVICE_ATTR(name, 0444, show_##name, NULL) + +#define define_one_rw(name) \ +static DEVICE_ATTR(name, 0644, show_##name, store_##name) + +static u64 call_start[NR_CPUS]; +static u64 phys_addr[NR_CPUS]; +static u64 err_type_info[NR_CPUS]; +static u64 err_struct_info[NR_CPUS]; +static struct { + u64 data1; + u64 data2; + u64 data3; +} __attribute__((__aligned__(16))) err_data_buffer[NR_CPUS]; +static s64 status[NR_CPUS]; +static u64 capabilities[NR_CPUS]; +static u64 resources[NR_CPUS]; + +#define show(name) \ +static ssize_t \ +show_##name(struct device *dev, struct device_attribute *attr, \ + char *buf) \ +{ \ + u32 cpu=dev->id; \ + return sprintf(buf, "%lx\n", name[cpu]); \ +} + +#define store(name) \ +static ssize_t \ +store_##name(struct device *dev, struct device_attribute *attr, \ + const char *buf, size_t size) \ +{ \ + unsigned int cpu=dev->id; \ + name[cpu] = simple_strtoull(buf, NULL, 16); \ + return size; \ +} + +show(call_start) + +/* It's user's responsibility to call the PAL procedure on a specific + * processor. The cpu number in driver is only used for storing data. + */ +static ssize_t +store_call_start(struct device *dev, struct device_attribute *attr, + const char *buf, size_t size) +{ + unsigned int cpu=dev->id; + unsigned long call_start = simple_strtoull(buf, NULL, 16); + +#ifdef ERR_INJ_DEBUG + printk(KERN_DEBUG "pal_mc_err_inject for cpu%d:\n", cpu); + printk(KERN_DEBUG "err_type_info=%lx,\n", err_type_info[cpu]); + printk(KERN_DEBUG "err_struct_info=%lx,\n", err_struct_info[cpu]); + printk(KERN_DEBUG "err_data_buffer=%lx, %lx, %lx.\n", + err_data_buffer[cpu].data1, + err_data_buffer[cpu].data2, + err_data_buffer[cpu].data3); +#endif + switch (call_start) { + case 0: /* Do nothing. */ + break; + case 1: /* Call pal_mc_error_inject in physical mode. */ + status[cpu]=ia64_pal_mc_error_inject_phys(err_type_info[cpu], + err_struct_info[cpu], + ia64_tpa(&err_data_buffer[cpu]), + &capabilities[cpu], + &resources[cpu]); + break; + case 2: /* Call pal_mc_error_inject in virtual mode. */ + status[cpu]=ia64_pal_mc_error_inject_virt(err_type_info[cpu], + err_struct_info[cpu], + ia64_tpa(&err_data_buffer[cpu]), + &capabilities[cpu], + &resources[cpu]); + break; + default: + status[cpu] = -EINVAL; + break; + } + +#ifdef ERR_INJ_DEBUG + printk(KERN_DEBUG "Returns: status=%d,\n", (int)status[cpu]); + printk(KERN_DEBUG "capapbilities=%lx,\n", capabilities[cpu]); + printk(KERN_DEBUG "resources=%lx\n", resources[cpu]); +#endif + return size; +} + +show(err_type_info) +store(err_type_info) + +static ssize_t +show_virtual_to_phys(struct device *dev, struct device_attribute *attr, + char *buf) +{ + unsigned int cpu=dev->id; + return sprintf(buf, "%lx\n", phys_addr[cpu]); +} + +static ssize_t +store_virtual_to_phys(struct device *dev, struct device_attribute *attr, + const char *buf, size_t size) +{ + unsigned int cpu=dev->id; + u64 virt_addr=simple_strtoull(buf, NULL, 16); + int ret; + + ret = get_user_pages(current, current->mm, virt_addr, + 1, VM_READ, 0, NULL, NULL); + if (ret<=0) { +#ifdef ERR_INJ_DEBUG + printk("Virtual address %lx is not existing.\n",virt_addr); +#endif + return -EINVAL; + } + + phys_addr[cpu] = ia64_tpa(virt_addr); + return size; +} + +show(err_struct_info) +store(err_struct_info) + +static ssize_t +show_err_data_buffer(struct device *dev, + struct device_attribute *attr, char *buf) +{ + unsigned int cpu=dev->id; + + return sprintf(buf, "%lx, %lx, %lx\n", + err_data_buffer[cpu].data1, + err_data_buffer[cpu].data2, + err_data_buffer[cpu].data3); +} + +static ssize_t +store_err_data_buffer(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + unsigned int cpu=dev->id; + int ret; + +#ifdef ERR_INJ_DEBUG + printk("write err_data_buffer=[%lx,%lx,%lx] on cpu%d\n", + err_data_buffer[cpu].data1, + err_data_buffer[cpu].data2, + err_data_buffer[cpu].data3, + cpu); +#endif + ret=sscanf(buf, "%lx, %lx, %lx", + &err_data_buffer[cpu].data1, + &err_data_buffer[cpu].data2, + &err_data_buffer[cpu].data3); + if (ret!=ERR_DATA_BUFFER_SIZE) + return -EINVAL; + + return size; +} + +show(status) +show(capabilities) +show(resources) + +define_one_rw(call_start); +define_one_rw(err_type_info); +define_one_rw(err_struct_info); +define_one_rw(err_data_buffer); +define_one_rw(virtual_to_phys); +define_one_ro(status); +define_one_ro(capabilities); +define_one_ro(resources); + +static struct attribute *default_attrs[] = { + &dev_attr_call_start.attr, + &dev_attr_virtual_to_phys.attr, + &dev_attr_err_type_info.attr, + &dev_attr_err_struct_info.attr, + &dev_attr_err_data_buffer.attr, + &dev_attr_status.attr, + &dev_attr_capabilities.attr, + &dev_attr_resources.attr, + NULL +}; + +static struct attribute_group err_inject_attr_group = { + .attrs = default_attrs, + .name = "err_inject" +}; +/* Add/Remove err_inject interface for CPU device */ +static int err_inject_add_dev(struct device *sys_dev) +{ + return sysfs_create_group(&sys_dev->kobj, &err_inject_attr_group); +} + +static int err_inject_remove_dev(struct device *sys_dev) +{ + sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group); + return 0; +} +static int err_inject_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct device *sys_dev; + + sys_dev = get_cpu_device(cpu); + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + err_inject_add_dev(sys_dev); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + err_inject_remove_dev(sys_dev); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block err_inject_cpu_notifier = +{ + .notifier_call = err_inject_cpu_callback, +}; + +static int __init +err_inject_init(void) +{ + int i; + +#ifdef ERR_INJ_DEBUG + printk(KERN_INFO "Enter error injection driver.\n"); +#endif + + cpu_notifier_register_begin(); + + for_each_online_cpu(i) { + err_inject_cpu_callback(&err_inject_cpu_notifier, CPU_ONLINE, + (void *)(long)i); + } + + __register_hotcpu_notifier(&err_inject_cpu_notifier); + + cpu_notifier_register_done(); + + return 0; +} + +static void __exit +err_inject_exit(void) +{ + int i; + struct device *sys_dev; + +#ifdef ERR_INJ_DEBUG + printk(KERN_INFO "Exit error injection driver.\n"); +#endif + + cpu_notifier_register_begin(); + + for_each_online_cpu(i) { + sys_dev = get_cpu_device(i); + sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group); + } + + __unregister_hotcpu_notifier(&err_inject_cpu_notifier); + + cpu_notifier_register_done(); +} + +module_init(err_inject_init); +module_exit(err_inject_exit); + +MODULE_AUTHOR("Fenghua Yu <fenghua.yu@intel.com>"); +MODULE_DESCRIPTION("MC error injection kernel sysfs interface"); +MODULE_LICENSE("GPL"); diff --git a/arch/ia64/kernel/esi.c b/arch/ia64/kernel/esi.c new file mode 100644 index 00000000000..b091111270c --- /dev/null +++ b/arch/ia64/kernel/esi.c @@ -0,0 +1,205 @@ +/* + * Extensible SAL Interface (ESI) support routines. + * + * Copyright (C) 2006 Hewlett-Packard Co + * Alex Williamson <alex.williamson@hp.com> + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> + +#include <asm/esi.h> +#include <asm/sal.h> + +MODULE_AUTHOR("Alex Williamson <alex.williamson@hp.com>"); +MODULE_DESCRIPTION("Extensible SAL Interface (ESI) support"); +MODULE_LICENSE("GPL"); + +#define MODULE_NAME "esi" + +#define ESI_TABLE_GUID \ + EFI_GUID(0x43EA58DC, 0xCF28, 0x4b06, 0xB3, \ + 0x91, 0xB7, 0x50, 0x59, 0x34, 0x2B, 0xD4) + +enum esi_systab_entry_type { + ESI_DESC_ENTRY_POINT = 0 +}; + +/* + * Entry type: Size: + * 0 48 + */ +#define ESI_DESC_SIZE(type) "\060"[(unsigned) (type)] + +typedef struct ia64_esi_desc_entry_point { + u8 type; + u8 reserved1[15]; + u64 esi_proc; + u64 gp; + efi_guid_t guid; +} ia64_esi_desc_entry_point_t; + +struct pdesc { + void *addr; + void *gp; +}; + +static struct ia64_sal_systab *esi_systab; + +static int __init esi_init (void) +{ + efi_config_table_t *config_tables; + struct ia64_sal_systab *systab; + unsigned long esi = 0; + char *p; + int i; + + config_tables = __va(efi.systab->tables); + + for (i = 0; i < (int) efi.systab->nr_tables; ++i) { + if (efi_guidcmp(config_tables[i].guid, ESI_TABLE_GUID) == 0) { + esi = config_tables[i].table; + break; + } + } + + if (!esi) + return -ENODEV; + + systab = __va(esi); + + if (strncmp(systab->signature, "ESIT", 4) != 0) { + printk(KERN_ERR "bad signature in ESI system table!"); + return -ENODEV; + } + + p = (char *) (systab + 1); + for (i = 0; i < systab->entry_count; i++) { + /* + * The first byte of each entry type contains the type + * descriptor. + */ + switch (*p) { + case ESI_DESC_ENTRY_POINT: + break; + default: + printk(KERN_WARNING "Unknown table type %d found in " + "ESI table, ignoring rest of table\n", *p); + return -ENODEV; + } + + p += ESI_DESC_SIZE(*p); + } + + esi_systab = systab; + return 0; +} + + +int ia64_esi_call (efi_guid_t guid, struct ia64_sal_retval *isrvp, + enum esi_proc_type proc_type, u64 func, + u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, + u64 arg7) +{ + struct ia64_fpreg fr[6]; + unsigned long flags = 0; + int i; + char *p; + + if (!esi_systab) + return -1; + + p = (char *) (esi_systab + 1); + for (i = 0; i < esi_systab->entry_count; i++) { + if (*p == ESI_DESC_ENTRY_POINT) { + ia64_esi_desc_entry_point_t *esi = (void *)p; + if (!efi_guidcmp(guid, esi->guid)) { + ia64_sal_handler esi_proc; + struct pdesc pdesc; + + pdesc.addr = __va(esi->esi_proc); + pdesc.gp = __va(esi->gp); + + esi_proc = (ia64_sal_handler) &pdesc; + + ia64_save_scratch_fpregs(fr); + if (proc_type == ESI_PROC_SERIALIZED) + spin_lock_irqsave(&sal_lock, flags); + else if (proc_type == ESI_PROC_MP_SAFE) + local_irq_save(flags); + else + preempt_disable(); + *isrvp = (*esi_proc)(func, arg1, arg2, arg3, + arg4, arg5, arg6, arg7); + if (proc_type == ESI_PROC_SERIALIZED) + spin_unlock_irqrestore(&sal_lock, + flags); + else if (proc_type == ESI_PROC_MP_SAFE) + local_irq_restore(flags); + else + preempt_enable(); + ia64_load_scratch_fpregs(fr); + return 0; + } + } + p += ESI_DESC_SIZE(*p); + } + return -1; +} +EXPORT_SYMBOL_GPL(ia64_esi_call); + +int ia64_esi_call_phys (efi_guid_t guid, struct ia64_sal_retval *isrvp, + u64 func, u64 arg1, u64 arg2, u64 arg3, u64 arg4, + u64 arg5, u64 arg6, u64 arg7) +{ + struct ia64_fpreg fr[6]; + unsigned long flags; + u64 esi_params[8]; + char *p; + int i; + + if (!esi_systab) + return -1; + + p = (char *) (esi_systab + 1); + for (i = 0; i < esi_systab->entry_count; i++) { + if (*p == ESI_DESC_ENTRY_POINT) { + ia64_esi_desc_entry_point_t *esi = (void *)p; + if (!efi_guidcmp(guid, esi->guid)) { + ia64_sal_handler esi_proc; + struct pdesc pdesc; + + pdesc.addr = (void *)esi->esi_proc; + pdesc.gp = (void *)esi->gp; + + esi_proc = (ia64_sal_handler) &pdesc; + + esi_params[0] = func; + esi_params[1] = arg1; + esi_params[2] = arg2; + esi_params[3] = arg3; + esi_params[4] = arg4; + esi_params[5] = arg5; + esi_params[6] = arg6; + esi_params[7] = arg7; + ia64_save_scratch_fpregs(fr); + spin_lock_irqsave(&sal_lock, flags); + *isrvp = esi_call_phys(esi_proc, esi_params); + spin_unlock_irqrestore(&sal_lock, flags); + ia64_load_scratch_fpregs(fr); + return 0; + } + } + p += ESI_DESC_SIZE(*p); + } + return -1; +} +EXPORT_SYMBOL_GPL(ia64_esi_call_phys); + +static void __exit esi_exit (void) +{ +} + +module_init(esi_init); +module_exit(esi_exit); /* makes module removable... */ diff --git a/arch/ia64/kernel/esi_stub.S b/arch/ia64/kernel/esi_stub.S new file mode 100644 index 00000000000..6b3d6c1f99b --- /dev/null +++ b/arch/ia64/kernel/esi_stub.S @@ -0,0 +1,96 @@ +/* + * ESI call stub. + * + * Copyright (C) 2005 Hewlett-Packard Co + * Alex Williamson <alex.williamson@hp.com> + * + * Based on EFI call stub by David Mosberger. The stub is virtually + * identical to the one for EFI phys-mode calls, except that ESI + * calls may have up to 8 arguments, so they get passed to this routine + * through memory. + * + * This stub allows us to make ESI calls in physical mode with interrupts + * turned off. ESI calls may not support calling from virtual mode. + * + * Google for "Extensible SAL specification" for a document describing the + * ESI standard. + */ + +/* + * PSR settings as per SAL spec (Chapter 8 in the "IA-64 System + * Abstraction Layer Specification", revision 2.6e). Note that + * psr.dfl and psr.dfh MUST be cleared, despite what this manual says. + * Otherwise, SAL dies whenever it's trying to do an IA-32 BIOS call + * (the br.ia instruction fails unless psr.dfl and psr.dfh are + * cleared). Fortunately, SAL promises not to touch the floating + * point regs, so at least we don't have to save f2-f127. + */ +#define PSR_BITS_TO_CLEAR \ + (IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT | \ + IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED | \ + IA64_PSR_DFL | IA64_PSR_DFH) + +#define PSR_BITS_TO_SET \ + (IA64_PSR_BN) + +#include <asm/processor.h> +#include <asm/asmmacro.h> + +/* + * Inputs: + * in0 = address of function descriptor of ESI routine to call + * in1 = address of array of ESI parameters + * + * Outputs: + * r8 = result returned by called function + */ +GLOBAL_ENTRY(esi_call_phys) + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2) + alloc loc1=ar.pfs,2,7,8,0 + ld8 r2=[in0],8 // load ESI function's entry point + mov loc0=rp + .body + ;; + ld8 out0=[in1],8 // ESI params loaded from array + ;; // passing all as inputs doesn't work + ld8 out1=[in1],8 + ;; + ld8 out2=[in1],8 + ;; + ld8 out3=[in1],8 + ;; + ld8 out4=[in1],8 + ;; + ld8 out5=[in1],8 + ;; + ld8 out6=[in1],8 + ;; + ld8 out7=[in1] + mov loc2=gp // save global pointer + mov loc4=ar.rsc // save RSE configuration + mov ar.rsc=0 // put RSE in enforced lazy, LE mode + ;; + ld8 gp=[in0] // load ESI function's global pointer + movl r16=PSR_BITS_TO_CLEAR + mov loc3=psr // save processor status word + movl r17=PSR_BITS_TO_SET + ;; + or loc3=loc3,r17 + mov b6=r2 + ;; + andcm r16=loc3,r16 // get psr with IT, DT, and RT bits cleared + br.call.sptk.many rp=ia64_switch_mode_phys +.ret0: mov loc5=r19 // old ar.bsp + mov loc6=r20 // old sp + br.call.sptk.many rp=b6 // call the ESI function +.ret1: mov ar.rsc=0 // put RSE in enforced lazy, LE mode + mov r16=loc3 // save virtual mode psr + mov r19=loc5 // save virtual mode bspstore + mov r20=loc6 // save virtual mode sp + br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode +.ret2: mov ar.rsc=loc4 // restore RSE configuration + mov ar.pfs=loc1 + mov rp=loc0 + mov gp=loc2 + br.ret.sptk.many rp +END(esi_call_phys) diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 2ddbac6f499..abc6dee3799 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -10,6 +10,8 @@ * probably broke it along the way... ;-) * 13-Jul-04 clameter Implement fsys_clock_gettime and revise fsys_gettimeofday to make * it capable of using memory based clocks without falling back to C code. + * 08-Feb-07 Fenghua Yu Implement fsys_getcpu. + * */ #include <asm/asmmacro.h> @@ -19,10 +21,10 @@ #include <asm/thread_info.h> #include <asm/sal.h> #include <asm/signal.h> -#include <asm/system.h> #include <asm/unistd.h> #include "entry.h" +#include "paravirt_inst.h" /* * See Documentation/ia64/fsys.txt for details on fsyscalls. @@ -59,80 +61,59 @@ ENTRY(fsys_getpid) .prologue .altrp b6 .body - add r9=TI_FLAGS+IA64_TASK_SIZE,r16 - ;; - ld4 r9=[r9] - add r8=IA64_TASK_TGID_OFFSET,r16 - ;; - and r9=TIF_ALLWORK_MASK,r9 - ld4 r8=[r8] // r8 = current->tgid - ;; - cmp.ne p8,p0=0,r9 -(p8) br.spnt.many fsys_fallback_syscall - FSYS_RETURN -END(fsys_getpid) - -ENTRY(fsys_getppid) - .prologue - .altrp b6 - .body add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16 ;; ld8 r17=[r17] // r17 = current->group_leader add r9=TI_FLAGS+IA64_TASK_SIZE,r16 ;; - ld4 r9=[r9] - add r17=IA64_TASK_REAL_PARENT_OFFSET,r17 // r17 = ¤t->group_leader->real_parent + add r17=IA64_TASK_TGIDLINK_OFFSET,r17 ;; and r9=TIF_ALLWORK_MASK,r9 - -1: ld8 r18=[r17] // r18 = current->group_leader->real_parent + ld8 r17=[r17] // r17 = current->group_leader->pids[PIDTYPE_PID].pid ;; - cmp.ne p8,p0=0,r9 - add r8=IA64_TASK_TGID_OFFSET,r18 // r8 = ¤t->group_leader->real_parent->tgid + add r8=IA64_PID_LEVEL_OFFSET,r17 ;; - - /* - * The .acq is needed to ensure that the read of tgid has returned its data before - * we re-check "real_parent". - */ - ld4.acq r8=[r8] // r8 = current->group_leader->real_parent->tgid -#ifdef CONFIG_SMP - /* - * Re-read current->group_leader->real_parent. - */ - ld8 r19=[r17] // r19 = current->group_leader->real_parent -(p8) br.spnt.many fsys_fallback_syscall + ld4 r8=[r8] // r8 = pid->level + add r17=IA64_PID_UPID_OFFSET,r17 // r17 = &pid->numbers[0] ;; - cmp.ne p6,p0=r18,r19 // did real_parent change? - mov r19=0 // i must not leak kernel bits... -(p6) br.cond.spnt.few 1b // yes -> redo the read of tgid and the check + shl r8=r8,IA64_UPID_SHIFT ;; - mov r17=0 // i must not leak kernel bits... - mov r18=0 // i must not leak kernel bits... -#else - mov r17=0 // i must not leak kernel bits... - mov r18=0 // i must not leak kernel bits... - mov r19=0 // i must not leak kernel bits... -#endif + add r17=r17,r8 // r17 = &pid->numbers[pid->level] + ;; + ld4 r8=[r17] // r8 = pid->numbers[pid->level].nr + ;; + mov r17=0 + ;; + cmp.ne p8,p0=0,r9 +(p8) br.spnt.many fsys_fallback_syscall FSYS_RETURN -END(fsys_getppid) +END(fsys_getpid) ENTRY(fsys_set_tid_address) .prologue .altrp b6 .body add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + add r17=IA64_TASK_TGIDLINK_OFFSET,r16 ;; ld4 r9=[r9] tnat.z p6,p7=r32 // check argument register for being NaT + ld8 r17=[r17] // r17 = current->pids[PIDTYPE_PID].pid ;; and r9=TIF_ALLWORK_MASK,r9 - add r8=IA64_TASK_PID_OFFSET,r16 + add r8=IA64_PID_LEVEL_OFFSET,r17 add r18=IA64_TASK_CLEAR_CHILD_TID_OFFSET,r16 ;; - ld4 r8=[r8] + ld4 r8=[r8] // r8 = pid->level + add r17=IA64_PID_UPID_OFFSET,r17 // r17 = &pid->numbers[0] + ;; + shl r8=r8,IA64_UPID_SHIFT + ;; + add r17=r17,r8 // r17 = &pid->numbers[pid->level] + ;; + ld4 r8=[r17] // r8 = pid->numbers[pid->level].nr + ;; cmp.ne p8,p0=0,r9 mov r17=-1 ;; @@ -145,12 +126,11 @@ ENTRY(fsys_set_tid_address) FSYS_RETURN END(fsys_set_tid_address) -/* - * Ensure that the time interpolator structure is compatible with the asm code - */ -#if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \ - || IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4 -#error fsys_gettimeofday incompatible with changes to struct time_interpolator +#if IA64_GTOD_SEQ_OFFSET !=0 +#error fsys_gettimeofday incompatible with changes to struct fsyscall_gtod_data_t +#endif +#if IA64_ITC_JITTER_OFFSET !=0 +#error fsys_gettimeofday incompatible with changes to struct itc_jitter_data_t #endif #define CLOCK_REALTIME 0 #define CLOCK_MONOTONIC 1 @@ -177,124 +157,120 @@ ENTRY(fsys_gettimeofday) // r11 = preserved: saved ar.pfs // r12 = preserved: memory stack // r13 = preserved: thread pointer - // r14 = address of mask / mask + // r14 = address of mask / mask value // r15 = preserved: system call number // r16 = preserved: current task pointer - // r17 = wall to monotonic use - // r18 = time_interpolator->offset - // r19 = address of wall_to_monotonic - // r20 = pointer to struct time_interpolator / pointer to time_interpolator->address - // r21 = shift factor - // r22 = address of time interpolator->last_counter - // r23 = address of time_interpolator->last_cycle - // r24 = adress of time_interpolator->offset - // r25 = last_cycle value - // r26 = last_counter value - // r27 = pointer to xtime + // r17 = (not used) + // r18 = (not used) + // r19 = address of itc_lastcycle + // r20 = struct fsyscall_gtod_data (= address of gtod_lock.sequence) + // r21 = address of mmio_ptr + // r22 = address of wall_time or monotonic_time + // r23 = address of shift / value + // r24 = address mult factor / cycle_last value + // r25 = itc_lastcycle value + // r26 = address clocksource cycle_last + // r27 = (not used) // r28 = sequence number at the beginning of critcal section - // r29 = address of seqlock + // r29 = address of itc_jitter // r30 = time processing flags / memory address // r31 = pointer to result // Predicates // p6,p7 short term use // p8 = timesource ar.itc // p9 = timesource mmio64 - // p10 = timesource mmio32 + // p10 = timesource mmio32 - not used // p11 = timesource not to be handled by asm code - // p12 = memory time source ( = p9 | p10) - // p13 = do cmpxchg with time_interpolator_last_cycle + // p12 = memory time source ( = p9 | p10) - not used + // p13 = do cmpxchg with itc_lastcycle // p14 = Divide by 1000 // p15 = Add monotonic // - // Note that instructions are optimized for McKinley. McKinley can process two - // bundles simultaneously and therefore we continuously try to feed the CPU - // two bundles and then a stop. - tnat.nz p6,p0 = r31 // branch deferred since it does not fit into bundle structure - mov pr = r30,0xc000 // Set predicates according to function + // Note that instructions are optimized for McKinley. McKinley can + // process two bundles simultaneously and therefore we continuously + // try to feed the CPU two bundles and then a stop. + add r2 = TI_FLAGS+IA64_TASK_SIZE,r16 - movl r20 = time_interpolator - ;; - ld8 r20 = [r20] // get pointer to time_interpolator structure - movl r29 = xtime_lock - ld4 r2 = [r2] // process work pending flags - movl r27 = xtime - ;; // only one bundle here - ld8 r21 = [r20] // first quad with control information + tnat.nz p6,p0 = r31 // guard against Nat argument +(p6) br.cond.spnt.few .fail_einval + movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address + ;; + ld4 r2 = [r2] // process work pending flags + movl r29 = itc_jitter_data // itc_jitter + add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20 // wall_time + add r21 = IA64_CLKSRC_MMIO_OFFSET,r20 + mov pr = r30,0xc000 // Set predicates according to function + ;; and r2 = TIF_ALLWORK_MASK,r2 -(p6) br.cond.spnt.few .fail_einval // deferred branch + add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29 +(p15) add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20 // monotonic_time ;; - add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20 - extr r3 = r21,32,32 // time_interpolator->nsec_per_cyc - extr r8 = r21,0,16 // time_interpolator->source + add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled -(p6) br.cond.spnt.many fsys_fallback_syscall +(p6) br.cond.spnt.many fsys_fallback_syscall ;; - cmp.eq p8,p12 = 0,r8 // Check for cpu timer - cmp.eq p9,p0 = 1,r8 // MMIO64 ? - extr r2 = r21,24,8 // time_interpolator->jitter - cmp.eq p10,p0 = 2,r8 // MMIO32 ? - cmp.ltu p11,p0 = 2,r8 // function or other clock -(p11) br.cond.spnt.many fsys_fallback_syscall + // Begin critical section +.time_redo: + ld4.acq r28 = [r20] // gtod_lock.sequence, Must take first ;; - setf.sig f7 = r3 // Setup for scaling of counter -(p15) movl r19 = wall_to_monotonic -(p12) ld8 r30 = [r10] - cmp.ne p13,p0 = r2,r0 // need jitter compensation? - extr r21 = r21,16,8 // shift factor + and r28 = ~1,r28 // And make sequence even to force retry if odd ;; -.time_redo: - .pred.rel.mutex p8,p9,p10 - ld4.acq r28 = [r29] // xtime_lock.sequence. Must come first for locking purposes -(p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! - add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20 -(p9) ld8 r2 = [r30] // readq(ti->address). Could also have latency issues.. -(p10) ld4 r2 = [r30] // readw(ti->address) -(p13) add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20 - ;; // could be removed by moving the last add upward - ld8 r26 = [r22] // time_interpolator->last_counter -(p13) ld8 r25 = [r23] // time interpolator->last_cycle - add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20 -(p15) ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET - ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET - add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20 - ;; - ld8 r18 = [r24] // time_interpolator->offset - ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET // xtime.tv_nsec -(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm) - ;; - ld8 r14 = [r14] // time_interpolator->mask -(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared - sub r10 = r2,r26 // current_counter - last_counter - ;; -(p6) sub r10 = r25,r26 // time we got was less than last_cycle + ld8 r30 = [r21] // clocksource->mmio_ptr + add r24 = IA64_CLKSRC_MULT_OFFSET,r20 + ld4 r2 = [r29] // itc_jitter value + add r23 = IA64_CLKSRC_SHIFT_OFFSET,r20 + add r14 = IA64_CLKSRC_MASK_OFFSET,r20 + ;; + ld4 r3 = [r24] // clocksource mult value + ld8 r14 = [r14] // clocksource mask value + cmp.eq p8,p9 = 0,r30 // use cpu timer if no mmio_ptr + ;; + setf.sig f7 = r3 // Setup for mult scaling of counter +(p8) cmp.ne p13,p0 = r2,r0 // need itc_jitter compensation, set p13 + ld4 r23 = [r23] // clocksource shift value + ld8 r24 = [r26] // get clksrc_cycle_last value +(p9) cmp.eq p13,p0 = 0,r30 // if mmio_ptr, clear p13 jitter control + ;; + .pred.rel.mutex p8,p9 + MOV_FROM_ITC(p8, p6, r2, r10) // CPU_TIMER. 36 clocks latency!!! +(p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues.. +(p13) ld8 r25 = [r19] // get itc_lastcycle value + ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec + ;; + ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET // tv_nsec +(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm) + ;; +(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared + sub r10 = r2,r24 // current_cycle - last_cycle + ;; +(p6) sub r10 = r25,r24 // time we got was less than last_cycle (p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg ;; +(p7) cmpxchg8.rel r3 = [r19],r2,ar.ccv + ;; +(p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful + ;; +(p7) sub r10 = r3,r24 // then use new last_cycle instead + ;; and r10 = r10,r14 // Apply mask ;; setf.sig f8 = r10 nop.i 123 ;; -(p7) cmpxchg8.rel r3 = [r23],r2,ar.ccv -EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare time + // fault check takes 5 cycles and we have spare time +EX(.fail_efault, probe.w.fault r31, 3) xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter) -(p15) add r9 = r9,r17 // Add wall to monotonic.secs to result secs ;; -(p15) ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET -(p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful redo - // simulate tbit.nz.or p7,p0 = r28,0 - and r28 = ~1,r28 // Make sequence even to force retry if odd getf.sig r2 = f8 mf - add r8 = r8,r18 // Add time interpolator offset ;; - ld4 r10 = [r29] // xtime_lock.sequence -(p15) add r8 = r8, r17 // Add monotonic.nsecs to nsecs - shr.u r2 = r2,r21 - ;; // overloaded 3 bundles! - // End critical section. + ld4 r10 = [r20] // gtod_lock.sequence + shr.u r2 = r2,r23 // shift by factor + ;; add r8 = r8,r2 // Add xtime.nsecs - cmp4.ne.or p7,p0 = r28,r10 -(p7) br.cond.dpnt.few .time_redo // sequence number changed ? + cmp4.ne p7,p0 = r28,r10 +(p7) br.cond.dpnt.few .time_redo // sequence number changed, redo + // End critical section. // Now r8=tv->tv_nsec and r9=tv->tv_sec mov r10 = r0 movl r2 = 1000000000 @@ -304,23 +280,23 @@ EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare .time_normalize: mov r21 = r8 cmp.ge p6,p0 = r8,r2 -(p14) shr.u r20 = r8, 3 // We can repeat this if necessary just wasting some time +(p14) shr.u r20 = r8, 3 // We can repeat this if necessary just wasting time ;; (p14) setf.sig f8 = r20 (p6) sub r8 = r8,r2 -(p6) add r9 = 1,r9 // two nops before the branch. -(p14) setf.sig f7 = r3 // Chances for repeats are 1 in 10000 for gettod +(p6) add r9 = 1,r9 // two nops before the branch. +(p14) setf.sig f7 = r3 // Chances for repeats are 1 in 10000 for gettod (p6) br.cond.dpnt.few .time_normalize ;; // Divided by 8 though shift. Now divide by 125 // The compiler was able to do that with a multiply // and a shift and we do the same -EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles -(p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it... +EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles +(p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it ;; - mov r8 = r0 (p14) getf.sig r2 = f8 ;; + mov r8 = r0 (p14) shr.u r21 = r2, 4 ;; EX(.fail_efault, st8 [r31] = r9) @@ -349,161 +325,61 @@ ENTRY(fsys_clock_gettime) END(fsys_clock_gettime) /* - * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize). + * fsys_getcpu doesn't use the third parameter in this implementation. It reads + * current_thread_info()->cpu and corresponding node in cpu_to_node_map. */ -#if _NSIG_WORDS != 1 -# error Sorry, fsys_rt_sigprocmask() needs to be updated for _NSIG_WORDS != 1. -#endif -ENTRY(fsys_rt_sigprocmask) +ENTRY(fsys_getcpu) .prologue .altrp b6 .body - - add r2=IA64_TASK_BLOCKED_OFFSET,r16 - add r9=TI_FLAGS+IA64_TASK_SIZE,r16 - cmp4.ltu p6,p0=SIG_SETMASK,r32 - - cmp.ne p15,p0=r0,r34 // oset != NULL? - tnat.nz p8,p0=r34 - add r31=IA64_TASK_SIGHAND_OFFSET,r16 ;; - ld8 r3=[r2] // read/prefetch current->blocked - ld4 r9=[r9] - tnat.nz.or p6,p0=r35 - - cmp.ne.or p6,p0=_NSIG_WORDS*8,r35 - tnat.nz.or p6,p0=r32 -(p6) br.spnt.few .fail_einval // fail with EINVAL + add r2=TI_FLAGS+IA64_TASK_SIZE,r16 + tnat.nz p6,p0 = r32 // guard against NaT argument + add r3=TI_CPU+IA64_TASK_SIZE,r16 ;; -#ifdef CONFIG_SMP - ld8 r31=[r31] // r31 <- current->sighand -#endif - and r9=TIF_ALLWORK_MASK,r9 - tnat.nz.or p8,p0=r33 - ;; - cmp.ne p7,p0=0,r9 - cmp.eq p6,p0=r0,r33 // set == NULL? - add r31=IA64_SIGHAND_SIGLOCK_OFFSET,r31 // r31 <- current->sighand->siglock -(p8) br.spnt.few .fail_efault // fail with EFAULT -(p7) br.spnt.many fsys_fallback_syscall // got pending kernel work... -(p6) br.dpnt.many .store_mask // -> short-circuit to just reading the signal mask - - /* Argh, we actually have to do some work and _update_ the signal mask: */ - -EX(.fail_efault, probe.r.fault r33, 3) // verify user has read-access to *set -EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set - mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1)) + ld4 r3=[r3] // M r3 = thread_info->cpu + ld4 r2=[r2] // M r2 = thread_info->flags +(p6) br.cond.spnt.few .fail_einval // B ;; - - rsm psr.i // mask interrupt delivery - mov ar.ccv=0 - andcm r14=r14,r17 // filter out SIGKILL & SIGSTOP - -#ifdef CONFIG_SMP - mov r17=1 + tnat.nz p7,p0 = r33 // I guard against NaT argument +(p7) br.cond.spnt.few .fail_einval // B ;; - cmpxchg4.acq r18=[r31],r17,ar.ccv // try to acquire the lock - mov r8=EINVAL // default to EINVAL + cmp.ne p6,p0=r32,r0 + cmp.ne p7,p0=r33,r0 ;; - ld8 r3=[r2] // re-read current->blocked now that we hold the lock - cmp4.ne p6,p0=r18,r0 -(p6) br.cond.spnt.many .lock_contention +#ifdef CONFIG_NUMA + movl r17=cpu_to_node_map ;; -#else - ld8 r3=[r2] // re-read current->blocked now that we hold the lock - mov r8=EINVAL // default to EINVAL -#endif - add r18=IA64_TASK_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r16 - add r19=IA64_TASK_SIGNAL_OFFSET,r16 - cmp4.eq p6,p0=SIG_BLOCK,r32 - ;; - ld8 r19=[r19] // r19 <- current->signal - cmp4.eq p7,p0=SIG_UNBLOCK,r32 - cmp4.eq p8,p0=SIG_SETMASK,r32 - ;; - ld8 r18=[r18] // r18 <- current->pending.signal - .pred.rel.mutex p6,p7,p8 -(p6) or r14=r3,r14 // SIG_BLOCK -(p7) andcm r14=r3,r14 // SIG_UNBLOCK - -(p8) mov r14=r14 // SIG_SETMASK -(p6) mov r8=0 // clear error code - // recalc_sigpending() - add r17=IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,r19 - - add r19=IA64_SIGNAL_SHARED_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r19 +EX(.fail_efault, (p6) probe.w.fault r32, 3) // M This takes 5 cycles +EX(.fail_efault, (p7) probe.w.fault r33, 3) // M This takes 5 cycles + shladd r18=r3,1,r17 ;; - ld4 r17=[r17] // r17 <- current->signal->group_stop_count -(p7) mov r8=0 // clear error code - - ld8 r19=[r19] // r19 <- current->signal->shared_pending + ld2 r20=[r18] // r20 = cpu_to_node_map[cpu] + and r2 = TIF_ALLWORK_MASK,r2 ;; - cmp4.gt p6,p7=r17,r0 // p6/p7 <- (current->signal->group_stop_count > 0)? -(p8) mov r8=0 // clear error code - - or r18=r18,r19 // r18 <- current->pending | current->signal->shared_pending + cmp.ne p8,p0=0,r2 +(p8) br.spnt.many fsys_fallback_syscall ;; - // r18 <- (current->pending | current->signal->shared_pending) & ~current->blocked: - andcm r18=r18,r14 - add r9=TI_FLAGS+IA64_TASK_SIZE,r16 ;; - -(p7) cmp.ne.or.andcm p6,p7=r18,r0 // p6/p7 <- signal pending - mov r19=0 // i must not leak kernel bits... -(p6) br.cond.dpnt.many .sig_pending +EX(.fail_efault, (p6) st4 [r32] = r3) +EX(.fail_efault, (p7) st2 [r33] = r20) + mov r8=0 ;; - -1: ld4 r17=[r9] // r17 <- current->thread_info->flags +#else +EX(.fail_efault, (p6) probe.w.fault r32, 3) // M This takes 5 cycles +EX(.fail_efault, (p7) probe.w.fault r33, 3) // M This takes 5 cycles + and r2 = TIF_ALLWORK_MASK,r2 ;; - mov ar.ccv=r17 - and r18=~_TIF_SIGPENDING,r17 // r18 <- r17 & ~(1 << TIF_SIGPENDING) + cmp.ne p8,p0=0,r2 +(p8) br.spnt.many fsys_fallback_syscall ;; - - st8 [r2]=r14 // update current->blocked with new mask - cmpxchg4.acq r8=[r9],r18,ar.ccv // current->thread_info->flags <- r18 +EX(.fail_efault, (p6) st4 [r32] = r3) +EX(.fail_efault, (p7) st2 [r33] = r0) + mov r8=0 ;; - cmp.ne p6,p0=r17,r8 // update failed? -(p6) br.cond.spnt.few 1b // yes -> retry - -#ifdef CONFIG_SMP - st4.rel [r31]=r0 // release the lock #endif - ssm psr.i - ;; - - srlz.d // ensure psr.i is set again - mov r18=0 // i must not leak kernel bits... - -.store_mask: -EX(.fail_efault, (p15) probe.w.fault r34, 3) // verify user has write-access to *oset -EX(.fail_efault, (p15) st8 [r34]=r3) - mov r2=0 // i must not leak kernel bits... - mov r3=0 // i must not leak kernel bits... - mov r8=0 // return 0 - mov r9=0 // i must not leak kernel bits... - mov r14=0 // i must not leak kernel bits... - mov r17=0 // i must not leak kernel bits... - mov r31=0 // i must not leak kernel bits... FSYS_RETURN - -.sig_pending: -#ifdef CONFIG_SMP - st4.rel [r31]=r0 // release the lock -#endif - ssm psr.i - ;; - srlz.d - br.sptk.many fsys_fallback_syscall // with signal pending, do the heavy-weight syscall - -#ifdef CONFIG_SMP -.lock_contention: - /* Rather than spinning here, fall back on doing a heavy-weight syscall. */ - ssm psr.i - ;; - srlz.d - br.sptk.many fsys_fallback_syscall -#endif -END(fsys_rt_sigprocmask) +END(fsys_getcpu) ENTRY(fsys_fallback_syscall) .prologue @@ -516,17 +392,17 @@ ENTRY(fsys_fallback_syscall) adds r17=-1024,r15 movl r14=sys_call_table ;; - rsm psr.i + RSM_PSR_I(p0, r26, r27) shladd r18=r17,3,r14 ;; ld8 r18=[r18] // load normal (heavy-weight) syscall entry-point - mov r29=psr // read psr (12 cyc load latency) + MOV_FROM_PSR(p0, r29, r26) // read psr (12 cyc load latency) mov r27=ar.rsc mov r21=ar.fpsr mov r26=ar.pfs END(fsys_fallback_syscall) /* FALL THROUGH */ -GLOBAL_ENTRY(fsys_bubble_down) +GLOBAL_ENTRY(paravirt_fsys_bubble_down) .prologue .altrp b6 .body @@ -564,7 +440,7 @@ GLOBAL_ENTRY(fsys_bubble_down) * * PSR.BE : already is turned off in __kernel_syscall_via_epc() * PSR.AC : don't care (kernel normally turns PSR.AC on) - * PSR.I : already turned off by the time fsys_bubble_down gets + * PSR.I : already turned off by the time paravirt_fsys_bubble_down gets * invoked * PSR.DFL: always 0 (kernel never turns it on) * PSR.DFH: don't care --- kernel never touches f32-f127 on its own @@ -574,7 +450,7 @@ GLOBAL_ENTRY(fsys_bubble_down) * PSR.DB : don't care --- kernel never enables kernel-level * breakpoints * PSR.TB : must be 0 already; if it wasn't zero on entry to - * __kernel_syscall_via_epc, the branch to fsys_bubble_down + * __kernel_syscall_via_epc, the branch to paravirt_fsys_bubble_down * will trigger a taken branch; the taken-trap-handler then * converts the syscall into a break-based system-call. */ @@ -606,7 +482,11 @@ GLOBAL_ENTRY(fsys_bubble_down) nop.i 0 ;; mov ar.rsc=0 // M2 set enforced lazy mode, pl 0, LE, loadrs=0 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + MOV_FROM_ITC(p0, p6, r30, r23) // M get cycle for accounting +#else nop.m 0 +#endif nop.i 0 ;; mov r23=ar.bspstore // M2 (12 cyc) save ar.bspstore @@ -628,25 +508,47 @@ GLOBAL_ENTRY(fsys_bubble_down) cmp.ne pKStk,pUStk=r0,r0 // A set pKStk <- 0, pUStk <- 1 br.call.sptk.many b7=ia64_syscall_setup // B ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + // mov.m r30=ar.itc is called in advance + add r16=TI_AC_STAMP+IA64_TASK_SIZE,r2 + add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r2 + ;; + ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // time at last check in kernel + ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // time at leave kernel + ;; + ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME // cumulated stime + ld8 r21=[r17] // cumulated utime + sub r22=r19,r18 // stime before leave kernel + ;; + st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP // update stamp + sub r18=r30,r19 // elapsed time in user mode + ;; + add r20=r20,r22 // sum stime + add r21=r21,r18 // sum utime + ;; + st8 [r16]=r20 // update stime + st8 [r17]=r21 // update utime + ;; +#endif mov ar.rsc=0x3 // M2 set eager mode, pl 0, LE, loadrs=0 mov rp=r14 // I0 set the real return addr and r3=_TIF_SYSCALL_TRACEAUDIT,r3 // A ;; - ssm psr.i // M2 we're on kernel stacks now, reenable irqs + SSM_PSR_I(p0, p6, r22) // M2 we're on kernel stacks now, reenable irqs cmp.eq p8,p0=r3,r0 // A (p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT nop.m 0 (p8) br.call.sptk.many b6=b6 // B (ignore return address) br.cond.spnt ia64_trace_syscall // B -END(fsys_bubble_down) +END(paravirt_fsys_bubble_down) .rodata .align 8 - .globl fsyscall_table + .globl paravirt_fsyscall_table - data8 fsys_bubble_down -fsyscall_table: + data8 paravirt_fsys_bubble_down +paravirt_fsyscall_table: data8 fsys_ni_syscall data8 0 // exit // 1025 data8 0 // read @@ -665,7 +567,7 @@ fsyscall_table: data8 0 // chown data8 0 // lseek // 1040 data8 fsys_getpid // getpid - data8 fsys_getppid // getppid + data8 0 // getppid data8 0 // mount data8 0 // umount data8 0 // setuid // 1045 @@ -802,7 +704,7 @@ fsyscall_table: data8 0 // sigaltstack data8 0 // rt_sigaction data8 0 // rt_sigpending - data8 fsys_rt_sigprocmask // rt_sigprocmask + data8 0 // rt_sigprocmask data8 0 // rt_sigqueueinfo // 1180 data8 0 // rt_sigreturn data8 0 // rt_sigsuspend @@ -882,9 +784,9 @@ fsyscall_table: data8 0 // clock_nanosleep data8 0 // fstatfs64 data8 0 // statfs64 - data8 0 - data8 0 // 1260 - data8 0 + data8 0 // mbind + data8 0 // get_mempolicy // 1260 + data8 0 // set_mempolicy data8 0 // mq_open data8 0 // mq_unlink data8 0 // mq_timedsend @@ -892,16 +794,43 @@ fsyscall_table: data8 0 // mq_notify data8 0 // mq_getsetattr data8 0 // kexec_load + data8 0 // vserver + data8 0 // waitid // 1270 + data8 0 // add_key + data8 0 // request_key + data8 0 // keyctl + data8 0 // ioprio_set + data8 0 // ioprio_get // 1275 + data8 0 // move_pages + data8 0 // inotify_init + data8 0 // inotify_add_watch + data8 0 // inotify_rm_watch + data8 0 // migrate_pages // 1280 + data8 0 // openat + data8 0 // mkdirat + data8 0 // mknodat + data8 0 // fchownat + data8 0 // futimesat // 1285 + data8 0 // newfstatat + data8 0 // unlinkat + data8 0 // renameat + data8 0 // linkat + data8 0 // symlinkat // 1290 + data8 0 // readlinkat + data8 0 // fchmodat + data8 0 // faccessat data8 0 - data8 0 // 1270 - data8 0 - data8 0 - data8 0 - data8 0 - data8 0 // 1275 - data8 0 - data8 0 - data8 0 + data8 0 // 1295 + data8 0 // unshare + data8 0 // splice + data8 0 // set_robust_list + data8 0 // get_robust_list + data8 0 // sync_file_range // 1300 + data8 0 // tee + data8 0 // vmsplice data8 0 + data8 fsys_getcpu // getcpu // 1304 - .org fsyscall_table + 8*NR_syscalls // guard against failures to increase NR_syscalls + // fill in zeros for the remaining entries + .zero: + .space paravirt_fsyscall_table + 8*NR_syscalls - .zero, 0 diff --git a/arch/ia64/kernel/fsyscall_gtod_data.h b/arch/ia64/kernel/fsyscall_gtod_data.h new file mode 100644 index 00000000000..146b15b5fec --- /dev/null +++ b/arch/ia64/kernel/fsyscall_gtod_data.h @@ -0,0 +1,23 @@ +/* + * (c) Copyright 2007 Hewlett-Packard Development Company, L.P. + * Contributed by Peter Keilty <peter.keilty@hp.com> + * + * fsyscall gettimeofday data + */ + +struct fsyscall_gtod_data_t { + seqcount_t seq; + struct timespec wall_time; + struct timespec monotonic_time; + cycle_t clk_mask; + u32 clk_mult; + u32 clk_shift; + void *clk_fsys_mmio; + cycle_t clk_cycle_last; +} ____cacheline_aligned; + +struct itc_jitter_data_t { + int itc_jitter; + cycle_t itc_lastcycle; +} ____cacheline_aligned; + diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c new file mode 100644 index 00000000000..3b0c2aa0785 --- /dev/null +++ b/arch/ia64/kernel/ftrace.c @@ -0,0 +1,204 @@ +/* + * Dynamic function tracing support. + * + * Copyright (C) 2008 Shaohua Li <shaohua.li@intel.com> + * + * For licencing details, see COPYING. + * + * Defines low-level handling of mcount calls when the kernel + * is compiled with the -pg flag. When using dynamic ftrace, the + * mcount call-sites get patched lazily with NOP till they are + * enabled. All code mutation routines here take effect atomically. + */ + +#include <linux/uaccess.h> +#include <linux/ftrace.h> + +#include <asm/cacheflush.h> +#include <asm/patch.h> + +/* In IA64, each function will be added below two bundles with -pg option */ +static unsigned char __attribute__((aligned(8))) +ftrace_orig_code[MCOUNT_INSN_SIZE] = { + 0x02, 0x40, 0x31, 0x10, 0x80, 0x05, /* alloc r40=ar.pfs,12,8,0 */ + 0xb0, 0x02, 0x00, 0x00, 0x42, 0x40, /* mov r43=r0;; */ + 0x05, 0x00, 0xc4, 0x00, /* mov r42=b0 */ + 0x11, 0x48, 0x01, 0x02, 0x00, 0x21, /* mov r41=r1 */ + 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* nop.i 0x0 */ + 0x08, 0x00, 0x00, 0x50 /* br.call.sptk.many b0 = _mcount;; */ +}; + +struct ftrace_orig_insn { + u64 dummy1, dummy2, dummy3; + u64 dummy4:64-41+13; + u64 imm20:20; + u64 dummy5:3; + u64 sign:1; + u64 dummy6:4; +}; + +/* mcount stub will be converted below for nop */ +static unsigned char ftrace_nop_code[MCOUNT_INSN_SIZE] = { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */ + 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */ + 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */ + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* nop.x 0x0;; */ + 0x00, 0x00, 0x04, 0x00 +}; + +static unsigned char *ftrace_nop_replace(void) +{ + return ftrace_nop_code; +} + +/* + * mcount stub will be converted below for call + * Note: Just the last instruction is changed against nop + * */ +static unsigned char __attribute__((aligned(8))) +ftrace_call_code[MCOUNT_INSN_SIZE] = { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */ + 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */ + 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */ + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */ + 0xff, 0xff, 0xff, 0xff, 0x7f, 0x00, /* brl.many .;;*/ + 0xf8, 0xff, 0xff, 0xc8 +}; + +struct ftrace_call_insn { + u64 dummy1, dummy2; + u64 dummy3:48; + u64 imm39_l:16; + u64 imm39_h:23; + u64 dummy4:13; + u64 imm20:20; + u64 dummy5:3; + u64 i:1; + u64 dummy6:4; +}; + +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + struct ftrace_call_insn *code = (void *)ftrace_call_code; + unsigned long offset = addr - (ip + 0x10); + + code->imm39_l = offset >> 24; + code->imm39_h = offset >> 40; + code->imm20 = offset >> 4; + code->i = offset >> 63; + return ftrace_call_code; +} + +static int +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code, int do_check) +{ + unsigned char replaced[MCOUNT_INSN_SIZE]; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. We do this by using the + * probe_kernel_* functions. + * + * No real locking needed, this code is run through + * kstop_machine, or before SMP starts. + */ + + if (!do_check) + goto skip_check; + + /* read the text we want to modify */ + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + +skip_check: + /* replace the text with the new text */ + if (probe_kernel_write(((void *)ip), new_code, MCOUNT_INSN_SIZE)) + return -EPERM; + flush_icache_range(ip, ip + MCOUNT_INSN_SIZE); + + return 0; +} + +static int ftrace_make_nop_check(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char __attribute__((aligned(8))) replaced[MCOUNT_INSN_SIZE]; + unsigned long ip = rec->ip; + + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + if (rec->flags & FTRACE_FL_CONVERTED) { + struct ftrace_call_insn *call_insn, *tmp_call; + + call_insn = (void *)ftrace_call_code; + tmp_call = (void *)replaced; + call_insn->imm39_l = tmp_call->imm39_l; + call_insn->imm39_h = tmp_call->imm39_h; + call_insn->imm20 = tmp_call->imm20; + call_insn->i = tmp_call->i; + if (memcmp(replaced, ftrace_call_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + return 0; + } else { + struct ftrace_orig_insn *call_insn, *tmp_call; + + call_insn = (void *)ftrace_orig_code; + tmp_call = (void *)replaced; + call_insn->sign = tmp_call->sign; + call_insn->imm20 = tmp_call->imm20; + if (memcmp(replaced, ftrace_orig_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + return 0; + } +} + +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + int ret; + char *new; + + ret = ftrace_make_nop_check(rec, addr); + if (ret) + return ret; + new = ftrace_nop_replace(); + return ftrace_modify_code(rec->ip, NULL, new, 0); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned char *old, *new; + + old= ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + return ftrace_modify_code(ip, old, new, 1); +} + +/* in IA64, _mcount can't directly call ftrace_stub. Only jump is ok */ +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip; + unsigned long addr = ((struct fnptr *)ftrace_call)->ip; + + if (func == ftrace_stub) + return 0; + ip = ((struct fnptr *)func)->ip; + + ia64_patch_imm64(addr + 2, ip); + + flush_icache_range(addr, addr + 16); + return 0; +} + +/* run from kstop_machine */ +int __init ftrace_dyn_arch_init(void) +{ + return 0; +} diff --git a/arch/ia64/kernel/gate-data.S b/arch/ia64/kernel/gate-data.S index 258c0a3238f..b3ef1c72e13 100644 --- a/arch/ia64/kernel/gate-data.S +++ b/arch/ia64/kernel/gate-data.S @@ -1,3 +1,3 @@ - .section .data.gate, "aw" + .section .data..gate, "aw" .incbin "arch/ia64/kernel/gate.so" diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S index 86064ca9895..b5f8bdd8618 100644 --- a/arch/ia64/kernel/gate.S +++ b/arch/ia64/kernel/gate.S @@ -6,14 +6,15 @@ * David Mosberger-Tang <davidm@hpl.hp.com> */ -#include <linux/config.h> #include <asm/asmmacro.h> #include <asm/errno.h> #include <asm/asm-offsets.h> #include <asm/sigcontext.h> -#include <asm/system.h> #include <asm/unistd.h> +#include <asm/kregs.h> +#include <asm/page.h> +#include "paravirt_inst.h" /* * We can't easily refer to symbols inside the kernel. To avoid full runtime relocation, @@ -21,17 +22,18 @@ * to targets outside the shared object) and to avoid multi-phase kernel builds, we * simply create minimalistic "patch lists" in special ELF sections. */ - .section ".data.patch.fsyscall_table", "a" + .section ".data..patch.fsyscall_table", "a" .previous #define LOAD_FSYSCALL_TABLE(reg) \ [1:] movl reg=0; \ - .xdata4 ".data.patch.fsyscall_table", 1b-. + .xdata4 ".data..patch.fsyscall_table", 1b-. - .section ".data.patch.brl_fsys_bubble_down", "a" + .section ".data..patch.brl_fsys_bubble_down", "a" .previous #define BRL_COND_FSYS_BUBBLE_DOWN(pr) \ [1:](pr)brl.cond.sptk 0; \ - .xdata4 ".data.patch.brl_fsys_bubble_down", 1b-. + ;; \ + .xdata4 ".data..patch.brl_fsys_bubble_down", 1b-. GLOBAL_ENTRY(__kernel_syscall_via_break) .prologue @@ -48,87 +50,6 @@ GLOBAL_ENTRY(__kernel_syscall_via_break) } END(__kernel_syscall_via_break) -/* - * On entry: - * r11 = saved ar.pfs - * r15 = system call # - * b0 = saved return address - * b6 = return address - * On exit: - * r11 = saved ar.pfs - * r15 = system call # - * b0 = saved return address - * all other "scratch" registers: undefined - * all "preserved" registers: same as on entry - */ - -GLOBAL_ENTRY(__kernel_syscall_via_epc) - .prologue - .altrp b6 - .body -{ - /* - * Note: the kernel cannot assume that the first two instructions in this - * bundle get executed. The remaining code must be safe even if - * they do not get executed. - */ - adds r17=-1024,r15 // A - mov r10=0 // A default to successful syscall execution - epc // B causes split-issue -} - ;; - rsm psr.be | psr.i // M2 (5 cyc to srlz.d) - LOAD_FSYSCALL_TABLE(r14) // X - ;; - mov r16=IA64_KR(CURRENT) // M2 (12 cyc) - shladd r18=r17,3,r14 // A - mov r19=NR_syscalls-1 // A - ;; - lfetch [r18] // M0|1 - mov r29=psr // M2 (12 cyc) - // If r17 is a NaT, p6 will be zero - cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? - ;; - mov r21=ar.fpsr // M2 (12 cyc) - tnat.nz p10,p9=r15 // I0 - mov.i r26=ar.pfs // I0 (would stall anyhow due to srlz.d...) - ;; - srlz.d // M0 (forces split-issue) ensure PSR.BE==0 -(p6) ld8 r18=[r18] // M0|1 - nop.i 0 - ;; - nop.m 0 -(p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) - nop.i 0 - ;; -(p8) ssm psr.i -(p6) mov b7=r18 // I0 -(p8) br.dptk.many b7 // B - - mov r27=ar.rsc // M2 (12 cyc) -/* - * brl.cond doesn't work as intended because the linker would convert this branch - * into a branch to a PLT. Perhaps there will be a way to avoid this with some - * future version of the linker. In the meantime, we just use an indirect branch - * instead. - */ -#ifdef CONFIG_ITANIUM -(p6) add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry - ;; -(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down - ;; -(p6) mov b7=r14 -(p6) br.sptk.many b7 -#else - BRL_COND_FSYS_BUBBLE_DOWN(p6) -#endif - ssm psr.i - mov r10=-1 -(p10) mov r8=EINVAL -(p9) mov r8=ENOSYS - FSYS_RETURN -END(__kernel_syscall_via_epc) - # define ARG0_OFF (16 + IA64_SIGFRAME_ARG0_OFFSET) # define ARG1_OFF (16 + IA64_SIGFRAME_ARG1_OFFSET) # define ARG2_OFF (16 + IA64_SIGFRAME_ARG2_OFFSET) @@ -374,3 +295,92 @@ restore_rbs: // invala not necessary as that will happen when returning to user-mode br.cond.sptk back_from_restore_rbs END(__kernel_sigtramp) + +/* + * On entry: + * r11 = saved ar.pfs + * r15 = system call # + * b0 = saved return address + * b6 = return address + * On exit: + * r11 = saved ar.pfs + * r15 = system call # + * b0 = saved return address + * all other "scratch" registers: undefined + * all "preserved" registers: same as on entry + */ + +GLOBAL_ENTRY(__kernel_syscall_via_epc) + .prologue + .altrp b6 + .body +{ + /* + * Note: the kernel cannot assume that the first two instructions in this + * bundle get executed. The remaining code must be safe even if + * they do not get executed. + */ + adds r17=-1024,r15 // A + mov r10=0 // A default to successful syscall execution + epc // B causes split-issue +} + ;; + RSM_PSR_BE_I(r20, r22) // M2 (5 cyc to srlz.d) + LOAD_FSYSCALL_TABLE(r14) // X + ;; + mov r16=IA64_KR(CURRENT) // M2 (12 cyc) + shladd r18=r17,3,r14 // A + mov r19=NR_syscalls-1 // A + ;; + lfetch [r18] // M0|1 + MOV_FROM_PSR(p0, r29, r8) // M2 (12 cyc) + // If r17 is a NaT, p6 will be zero + cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? + ;; + mov r21=ar.fpsr // M2 (12 cyc) + tnat.nz p10,p9=r15 // I0 + mov.i r26=ar.pfs // I0 (would stall anyhow due to srlz.d...) + ;; + srlz.d // M0 (forces split-issue) ensure PSR.BE==0 +(p6) ld8 r18=[r18] // M0|1 + nop.i 0 + ;; + nop.m 0 +(p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) + nop.i 0 + ;; + SSM_PSR_I(p8, p14, r25) +(p6) mov b7=r18 // I0 +(p8) br.dptk.many b7 // B + + mov r27=ar.rsc // M2 (12 cyc) +/* + * brl.cond doesn't work as intended because the linker would convert this branch + * into a branch to a PLT. Perhaps there will be a way to avoid this with some + * future version of the linker. In the meantime, we just use an indirect branch + * instead. + */ +#ifdef CONFIG_ITANIUM +(p6) add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry + ;; +(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down + ;; +(p6) mov b7=r14 +(p6) br.sptk.many b7 +#else + BRL_COND_FSYS_BUBBLE_DOWN(p6) +#endif + SSM_PSR_I(p0, p14, r10) + mov r10=-1 +(p10) mov r8=EINVAL +(p9) mov r8=ENOSYS + FSYS_RETURN + +#ifdef CONFIG_PARAVIRT + /* + * padd to make the size of this symbol constant + * independent of paravirtualization. + */ + .align PAGE_SIZE / 8 +#endif +END(__kernel_syscall_via_epc) diff --git a/arch/ia64/kernel/gate.lds.S b/arch/ia64/kernel/gate.lds.S index e1e4aba9ecd..e518f7902af 100644 --- a/arch/ia64/kernel/gate.lds.S +++ b/arch/ia64/kernel/gate.lds.S @@ -1,79 +1,92 @@ /* - * Linker script for gate DSO. The gate pages are an ELF shared object prelinked to its - * virtual address, with only one read-only segment and one execute-only segment (both fit - * in one page). This script controls its layout. + * Linker script for gate DSO. The gate pages are an ELF shared object + * prelinked to its virtual address, with only one read-only segment and + * one execute-only segment (both fit in one page). This script controls + * its layout. */ -#include <linux/config.h> - -#include <asm/system.h> +#include <asm/page.h> +#include "paravirt_patchlist.h" SECTIONS { - . = GATE_ADDR + SIZEOF_HEADERS; - - .hash : { *(.hash) } :readable - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - .dynamic : { *(.dynamic) } :readable :dynamic - - /* - * This linker script is used both with -r and with -shared. For the layouts to match, - * we need to skip more than enough space for the dynamic symbol table et al. If this - * amount is insufficient, ld -shared will barf. Just increase it here. - */ - . = GATE_ADDR + 0x500; - - .data.patch : { - __start_gate_mckinley_e9_patchlist = .; - *(.data.patch.mckinley_e9) - __end_gate_mckinley_e9_patchlist = .; - - __start_gate_vtop_patchlist = .; - *(.data.patch.vtop) - __end_gate_vtop_patchlist = .; - - __start_gate_fsyscall_patchlist = .; - *(.data.patch.fsyscall_table) - __end_gate_fsyscall_patchlist = .; - - __start_gate_brl_fsys_bubble_down_patchlist = .; - *(.data.patch.brl_fsys_bubble_down) - __end_gate_brl_fsys_bubble_down_patchlist = .; - } :readable - .IA_64.unwind_info : { *(.IA_64.unwind_info*) } - .IA_64.unwind : { *(.IA_64.unwind*) } :readable :unwind + . = GATE_ADDR + SIZEOF_HEADERS; + + .hash : { *(.hash) } :readable + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note*) } :readable :note + + .dynamic : { *(.dynamic) } :readable :dynamic + + /* + * This linker script is used both with -r and with -shared. For + * the layouts to match, we need to skip more than enough space for + * the dynamic symbol table et al. If this amount is insufficient, + * ld -shared will barf. Just increase it here. + */ + . = GATE_ADDR + 0x600; + + .data..patch : { + __paravirt_start_gate_mckinley_e9_patchlist = .; + *(.data..patch.mckinley_e9) + __paravirt_end_gate_mckinley_e9_patchlist = .; + + __paravirt_start_gate_vtop_patchlist = .; + *(.data..patch.vtop) + __paravirt_end_gate_vtop_patchlist = .; + + __paravirt_start_gate_fsyscall_patchlist = .; + *(.data..patch.fsyscall_table) + __paravirt_end_gate_fsyscall_patchlist = .; + + __paravirt_start_gate_brl_fsys_bubble_down_patchlist = .; + *(.data..patch.brl_fsys_bubble_down) + __paravirt_end_gate_brl_fsys_bubble_down_patchlist = .; + } :readable + + .IA_64.unwind_info : { *(.IA_64.unwind_info*) } + .IA_64.unwind : { *(.IA_64.unwind*) } :readable :unwind #ifdef HAVE_BUGGY_SEGREL - .text (GATE_ADDR + PAGE_SIZE) : { *(.text) *(.text.*) } :readable + .text (GATE_ADDR + PAGE_SIZE) : { *(.text) *(.text.*) } :readable #else - . = ALIGN (PERCPU_PAGE_SIZE) + (. & (PERCPU_PAGE_SIZE - 1)); - .text : { *(.text) *(.text.*) } :epc + . = ALIGN(PERCPU_PAGE_SIZE) + (. & (PERCPU_PAGE_SIZE - 1)); + .text : { *(.text) *(.text.*) } :epc #endif - /DISCARD/ : { - *(.got.plt) *(.got) - *(.data .data.* .gnu.linkonce.d.*) - *(.dynbss) - *(.bss .bss.* .gnu.linkonce.b.*) - *(__ex_table) - } + /DISCARD/ : { + *(.got.plt) *(.got) + *(.data .data.* .gnu.linkonce.d.*) + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(__ex_table) + *(__mca_table) + } } /* + * ld does not recognize this name token; use the constant. + */ +#define PT_IA_64_UNWIND 0x70000001 + +/* * We must supply the ELF program headers explicitly to get just one * PT_LOAD segment, and set the flags explicitly to make segments read-only. */ PHDRS { - readable PT_LOAD FILEHDR PHDRS FLAGS(4); /* PF_R */ + readable PT_LOAD FILEHDR PHDRS FLAGS(4); /* PF_R */ #ifndef HAVE_BUGGY_SEGREL - epc PT_LOAD FILEHDR PHDRS FLAGS(1); /* PF_X */ + epc PT_LOAD FILEHDR PHDRS FLAGS(1); /* PF_X */ #endif - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - unwind 0x70000001; /* PT_IA_64_UNWIND, but ld doesn't match the name */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + unwind PT_IA_64_UNWIND; } /* @@ -81,14 +94,14 @@ PHDRS */ VERSION { - LINUX_2.5 { - global: - __kernel_syscall_via_break; - __kernel_syscall_via_epc; - __kernel_sigtramp; - - local: *; - }; + LINUX_2.5 { + global: + __kernel_syscall_via_break; + __kernel_syscall_via_epc; + __kernel_sigtramp; + + local: *; + }; } /* The ELF entry point can be used to set the AT_SYSINFO value. */ diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S index bfe65b2e862..a4acddad0c7 100644 --- a/arch/ia64/kernel/head.S +++ b/arch/ia64/kernel/head.S @@ -19,7 +19,6 @@ * Support for CPU Hotplug */ -#include <linux/config.h> #include <asm/asmmacro.h> #include <asm/fpu.h> @@ -27,11 +26,13 @@ #include <asm/mmu_context.h> #include <asm/asm-offsets.h> #include <asm/pal.h> +#include <asm/paravirt.h> #include <asm/pgtable.h> #include <asm/processor.h> #include <asm/ptrace.h> -#include <asm/system.h> #include <asm/mca_asm.h> +#include <linux/init.h> +#include <linux/linkage.h> #ifdef CONFIG_HOTPLUG_CPU #define SAL_PSR_BITS_TO_SET \ @@ -165,7 +166,7 @@ RestRR: \ mov _tmp2=((ia64_rid(IA64_REGION_ID_KERNEL, (num<<61)) << 8) | (pgsize << 2) | vhpt);; \ mov rr[_tmp1]=_tmp2 - .section __special_page_section,"ax" + __PAGE_ALIGNED_DATA .global empty_zero_page empty_zero_page: @@ -179,7 +180,7 @@ swapper_pg_dir: halt_msg: stringz "Halting kernel\n" - .text + __REF .global start_ap @@ -198,6 +199,11 @@ start_ap: ;; srlz.i ;; + { + flushrs // must be first insn in group + srlz.i + } + ;; /* * Save the region registers, predicate before they get clobbered */ @@ -352,6 +358,32 @@ start_ap: mov ar.rsc=0 // place RSE in enforced lazy mode ;; loadrs // clear the dirty partition + movl r19=__phys_per_cpu_start + mov r18=PERCPU_PAGE_SIZE + ;; +#ifndef CONFIG_SMP + add r19=r19,r18 + ;; +#else +(isAP) br.few 2f + movl r20=__cpu0_per_cpu + ;; + shr.u r18=r18,3 +1: + ld8 r21=[r19],8;; + st8[r20]=r21,8 + adds r18=-1,r18;; + cmp4.lt p7,p6=0,r18 +(p7) br.cond.dptk.few 1b + mov r19=r20 + ;; +2: +#endif + tpa r19=r19 + ;; + .pred.rel.mutex isBP,isAP +(isBP) mov IA64_KR(PER_CPU_DATA)=r19 // per-CPU base for cpu0 +(isAP) mov IA64_KR(PER_CPU_DATA)=r0 // clear physical per-CPU base ;; mov ar.bspstore=r2 // establish the new RSE stack ;; @@ -362,6 +394,41 @@ start_ap: ;; (isBP) st8 [r2]=r28 // save the address of the boot param area passed by the bootloader +#ifdef CONFIG_PARAVIRT + + movl r14=hypervisor_setup_hooks + movl r15=hypervisor_type + mov r16=num_hypervisor_hooks + ;; + ld8 r2=[r15] + ;; + cmp.ltu p7,p0=r2,r16 // array size check + shladd r8=r2,3,r14 + ;; +(p7) ld8 r9=[r8] + ;; +(p7) mov b1=r9 +(p7) cmp.ne.unc p7,p0=r9,r0 // no actual branch to NULL + ;; +(p7) br.call.sptk.many rp=b1 + + __INITDATA + +default_setup_hook = 0 // Currently nothing needs to be done. + + .global hypervisor_type +hypervisor_type: + data8 PARAVIRT_HYPERVISOR_TYPE_DEFAULT + + // must have the same order with PARAVIRT_HYPERVISOR_TYPE_xxx + +hypervisor_setup_hooks: + data8 default_setup_hook +num_hypervisor_hooks = (. - hypervisor_setup_hooks) / 8 + .previous + +#endif + #ifdef CONFIG_SMP (isAP) br.call.sptk.many rp=start_secondary .ret0: @@ -387,6 +454,8 @@ self: hint @pause br.sptk.many self // endless loop END(_start) + .text + GLOBAL_ENTRY(ia64_save_debug_regs) alloc r16=ar.pfs,1,0,0,0 mov r20=ar.lc // preserve ar.lc @@ -853,7 +922,6 @@ END(__ia64_init_fpu) */ GLOBAL_ENTRY(ia64_switch_mode_phys) { - alloc r2=ar.pfs,0,0,0,0 rsm psr.i | psr.ic // disable interrupts and interrupt collection mov r15=ip } @@ -902,7 +970,6 @@ END(ia64_switch_mode_phys) */ GLOBAL_ENTRY(ia64_switch_mode_virt) { - alloc r2=ar.pfs,0,0,0,0 rsm psr.i | psr.ic // disable interrupts and interrupt collection mov r15=ip } @@ -965,7 +1032,7 @@ END(ia64_delay_loop) * Return a CPU-local timestamp in nano-seconds. This timestamp is * NOT synchronized across CPUs its return value must never be * compared against the values returned on another CPU. The usage in - * kernel/sched.c ensures that. + * kernel/sched/core.c ensures that. * * The return-value of sched_clock() is NOT supposed to wrap-around. * If it did, it would cause some scheduling hiccups (at the worst). @@ -979,8 +1046,8 @@ END(ia64_delay_loop) * except that the multiplication and the shift are done with 128-bit * intermediate precision so that we can produce a full 64-bit result. */ -GLOBAL_ENTRY(sched_clock) - addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 +GLOBAL_ENTRY(ia64_native_sched_clock) + addl r8=THIS_CPU(ia64_cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 mov.m r9=ar.itc // fetch cycle-counter (35 cyc) ;; ldf8 f8=[r8] @@ -995,20 +1062,33 @@ GLOBAL_ENTRY(sched_clock) ;; shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT br.ret.sptk.many rp -END(sched_clock) +END(ia64_native_sched_clock) +#ifndef CONFIG_PARAVIRT + //unsigned long long + //sched_clock(void) __attribute__((alias("ia64_native_sched_clock"))); + .global sched_clock +sched_clock = ia64_native_sched_clock +#endif -GLOBAL_ENTRY(start_kernel_thread) - .prologue - .save rp, r0 // this is the end of the call-chain - .body - alloc r2 = ar.pfs, 0, 0, 2, 0 - mov out0 = r9 - mov out1 = r11;; - br.call.sptk.many rp = kernel_thread_helper;; - mov out0 = r8 - br.call.sptk.many rp = sys_exit;; -1: br.sptk.few 1b // not reached -END(start_kernel_thread) +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +GLOBAL_ENTRY(cycle_to_cputime) + alloc r16=ar.pfs,1,0,0,0 + addl r8=THIS_CPU(ia64_cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 + ;; + ldf8 f8=[r8] + ;; + setf.sig f9=r32 + ;; + xmpy.lu f10=f9,f8 // calculate low 64 bits of 128-bit product (4 cyc) + xmpy.hu f11=f9,f8 // calculate high 64 bits of 128-bit product + ;; + getf.sig r8=f10 // (5 cyc) + getf.sig r9=f11 + ;; + shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT + br.ret.sptk.many rp +END(cycle_to_cputime) +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_IA64_BRL_EMU @@ -1033,95 +1113,6 @@ SET_REG(b5); #endif /* CONFIG_IA64_BRL_EMU */ #ifdef CONFIG_SMP - /* - * This routine handles spinlock contention. It uses a non-standard calling - * convention to avoid converting leaf routines into interior routines. Because - * of this special convention, there are several restrictions: - * - * - do not use gp relative variables, this code is called from the kernel - * and from modules, r1 is undefined. - * - do not use stacked registers, the caller owns them. - * - do not use the scratch stack space, the caller owns it. - * - do not use any registers other than the ones listed below - * - * Inputs: - * ar.pfs - saved CFM of caller - * ar.ccv - 0 (and available for use) - * r27 - flags from spin_lock_irqsave or 0. Must be preserved. - * r28 - available for use. - * r29 - available for use. - * r30 - available for use. - * r31 - address of lock, available for use. - * b6 - return address - * p14 - available for use. - * p15 - used to track flag status. - * - * If you patch this code to use more registers, do not forget to update - * the clobber lists for spin_lock() in include/asm-ia64/spinlock.h. - */ - -#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) - -GLOBAL_ENTRY(ia64_spinlock_contention_pre3_4) - .prologue - .save ar.pfs, r0 // this code effectively has a zero frame size - .save rp, r28 - .body - nop 0 - tbit.nz p15,p0=r27,IA64_PSR_I_BIT - .restore sp // pop existing prologue after next insn - mov b6 = r28 - .prologue - .save ar.pfs, r0 - .altrp b6 - .body - ;; -(p15) ssm psr.i // reenable interrupts if they were on - // DavidM says that srlz.d is slow and is not required in this case -.wait: - // exponential backoff, kdb, lockmeter etc. go in here - hint @pause - ld4 r30=[r31] // don't use ld4.bias; if it's contended, we won't write the word - nop 0 - ;; - cmp4.ne p14,p0=r30,r0 -(p14) br.cond.sptk.few .wait -(p15) rsm psr.i // disable interrupts if we reenabled them - br.cond.sptk.few b6 // lock is now free, try to acquire - .global ia64_spinlock_contention_pre3_4_end // for kernprof -ia64_spinlock_contention_pre3_4_end: -END(ia64_spinlock_contention_pre3_4) - -#else - -GLOBAL_ENTRY(ia64_spinlock_contention) - .prologue - .altrp b6 - .body - tbit.nz p15,p0=r27,IA64_PSR_I_BIT - ;; -.wait: -(p15) ssm psr.i // reenable interrupts if they were on - // DavidM says that srlz.d is slow and is not required in this case -.wait2: - // exponential backoff, kdb, lockmeter etc. go in here - hint @pause - ld4 r30=[r31] // don't use ld4.bias; if it's contended, we won't write the word - ;; - cmp4.ne p14,p0=r30,r0 - mov r30 = 1 -(p14) br.cond.sptk.few .wait2 -(p15) rsm psr.i // disable interrupts if we reenabled them - ;; - cmpxchg4.acq r30=[r31], r30, ar.ccv - ;; - cmp4.ne p14,p0=r0,r30 -(p14) br.cond.sptk.few .wait - - br.ret.sptk.many b6 // lock is now taken -END(ia64_spinlock_contention) - -#endif #ifdef CONFIG_HOTPLUG_CPU GLOBAL_ENTRY(ia64_jump_to_sal) @@ -1145,7 +1136,7 @@ GLOBAL_ENTRY(ia64_jump_to_sal) movl r16=SAL_PSR_BITS_TO_SET;; mov cr.ipsr=r16 mov cr.ifs=r0;; - rfi;; + rfi;; // note: this unmask MCA/INIT (psr.mc) 1: /* * Invalidate all TLB data/inst @@ -1171,6 +1162,7 @@ tlb_purge_done: RESTORE_REG(cr.dcr, r25, r17);; RESTORE_REG(cr.iva, r25, r17);; RESTORE_REG(cr.pta, r25, r17);; + srlz.d;; // required not to violate RAW dependency RESTORE_REG(cr.itv, r25, r17);; RESTORE_REG(cr.pmv, r25, r17);; RESTORE_REG(cr.cmcv, r25, r17);; diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 01572814abe..5b7791dd396 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -5,50 +5,34 @@ * All other exports should be put directly after the definition. */ -#include <linux/config.h> #include <linux/module.h> #include <linux/string.h> EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(memchr); -EXPORT_SYMBOL(memcmp); EXPORT_SYMBOL(memcpy); -EXPORT_SYMBOL(memmove); -EXPORT_SYMBOL(memscan); -EXPORT_SYMBOL(strcat); -EXPORT_SYMBOL(strchr); -EXPORT_SYMBOL(strcmp); -EXPORT_SYMBOL(strcpy); EXPORT_SYMBOL(strlen); -EXPORT_SYMBOL(strncat); -EXPORT_SYMBOL(strncmp); -EXPORT_SYMBOL(strncpy); -EXPORT_SYMBOL(strnlen); -EXPORT_SYMBOL(strrchr); -EXPORT_SYMBOL(strstr); -EXPORT_SYMBOL(strpbrk); + +#include <asm/pgtable.h> +EXPORT_SYMBOL_GPL(empty_zero_page); #include <asm/checksum.h> EXPORT_SYMBOL(ip_fast_csum); /* hand-coded assembly */ - -#include <asm/semaphore.h> -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); -EXPORT_SYMBOL(__down_trylock); -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(csum_ipv6_magic); #include <asm/page.h> EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(copy_page); #ifdef CONFIG_VIRTUAL_MEM_MAP #include <linux/bootmem.h> +EXPORT_SYMBOL(min_low_pfn); /* defined by bootmem.c, but not exported by generic code */ EXPORT_SYMBOL(max_low_pfn); /* defined by bootmem.c, but not exported by generic code */ #endif #include <asm/processor.h> -EXPORT_SYMBOL(per_cpu__cpu_info); +EXPORT_SYMBOL(ia64_cpu_info); #ifdef CONFIG_SMP -EXPORT_SYMBOL(per_cpu__local_per_cpu_offset); +EXPORT_SYMBOL(local_per_cpu_offset); #endif #include <asm/uaccess.h> @@ -77,7 +61,7 @@ EXPORT_SYMBOL(__udivdi3); EXPORT_SYMBOL(__moddi3); EXPORT_SYMBOL(__umoddi3); -#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE) +#if defined(CONFIG_MD_RAID456) || defined(CONFIG_MD_RAID456_MODULE) extern void xor_ia64_2(void); extern void xor_ia64_3(void); extern void xor_ia64_4(void); @@ -100,25 +84,15 @@ EXPORT_SYMBOL(ia64_save_scratch_fpregs); #include <asm/unwind.h> EXPORT_SYMBOL(unw_init_running); -#ifdef ASM_SUPPORTED -# ifdef CONFIG_SMP -# if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) -/* - * This is not a normal routine and we don't want a function descriptor for it, so we use - * a fake declaration here. - */ -extern char ia64_spinlock_contention_pre3_4; -EXPORT_SYMBOL(ia64_spinlock_contention_pre3_4); -# else -/* - * This is not a normal routine and we don't want a function descriptor for it, so we use - * a fake declaration here. - */ -extern char ia64_spinlock_contention; -EXPORT_SYMBOL(ia64_spinlock_contention); -# endif -# endif +#if defined(CONFIG_IA64_ESI) || defined(CONFIG_IA64_ESI_MODULE) +extern void esi_call_phys (void); +EXPORT_SYMBOL_GPL(esi_call_phys); #endif - extern char ia64_ivt[]; EXPORT_SYMBOL(ia64_ivt); + +#include <asm/ftrace.h> +#ifdef CONFIG_FUNCTION_TRACER +/* mcount is defined in assembly */ +EXPORT_SYMBOL(_mcount); +#endif diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c index b69c397ed1b..f9efe9739d3 100644 --- a/arch/ia64/kernel/init_task.c +++ b/arch/ia64/kernel/init_task.c @@ -8,6 +8,7 @@ #include <linux/init.h> #include <linux/mm.h> +#include <linux/fs.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/init_task.h> @@ -16,19 +17,13 @@ #include <asm/uaccess.h> #include <asm/pgtable.h> -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * * We need to make sure that this is properly aligned due to the way process stacks are - * handled. This is done by having a special ".data.init_task" section... + * handled. This is done by having a special ".data..init_task" section... */ #define init_thread_info init_task_mem.s.thread_info @@ -38,7 +33,8 @@ union { struct thread_info thread_info; } s; unsigned long stack[KERNEL_STACK_SIZE/sizeof (unsigned long)]; -} init_task_mem asm ("init_task") __attribute__((section(".data.init_task"))) = {{ +} init_task_mem asm ("init_task") __init_task_data = + {{ .task = INIT_TASK(init_task_mem.s.task), .thread_info = INIT_THREAD_INFO(init_task_mem.s.task) }}; diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 574084f343f..cd44a57c73b 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -9,66 +9,76 @@ * Copyright (C) 1999 VA Linux Systems * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com> * - * 00/04/19 D. Mosberger Rewritten to mirror more closely the x86 I/O APIC code. - * In particular, we now have separate handlers for edge - * and level triggered interrupts. - * 00/10/27 Asit Mallick, Goutham Rao <goutham.rao@intel.com> IRQ vector allocation - * PCI to vector mapping, shared PCI interrupts. - * 00/10/27 D. Mosberger Document things a bit more to make them more understandable. - * Clean up much of the old IOSAPIC cruft. - * 01/07/27 J.I. Lee PCI irq routing, Platform/Legacy interrupts and fixes for - * ACPI S5(SoftOff) support. + * 00/04/19 D. Mosberger Rewritten to mirror more closely the x86 I/O + * APIC code. In particular, we now have separate + * handlers for edge and level triggered + * interrupts. + * 00/10/27 Asit Mallick, Goutham Rao <goutham.rao@intel.com> IRQ vector + * allocation PCI to vector mapping, shared PCI + * interrupts. + * 00/10/27 D. Mosberger Document things a bit more to make them more + * understandable. Clean up much of the old + * IOSAPIC cruft. + * 01/07/27 J.I. Lee PCI irq routing, Platform/Legacy interrupts + * and fixes for ACPI S5(SoftOff) support. * 02/01/23 J.I. Lee iosapic pgm fixes for PCI irq routing from _PRT - * 02/01/07 E. Focht <efocht@ess.nec.de> Redirectable interrupt vectors in - * iosapic_set_affinity(), initializations for - * /proc/irq/#/smp_affinity + * 02/01/07 E. Focht <efocht@ess.nec.de> Redirectable interrupt + * vectors in iosapic_set_affinity(), + * initializations for /proc/irq/#/smp_affinity * 02/04/02 P. Diefenbaugh Cleaned up ACPI PCI IRQ routing. * 02/04/18 J.I. Lee bug fix in iosapic_init_pci_irq - * 02/04/30 J.I. Lee bug fix in find_iosapic to fix ACPI PCI IRQ to IOSAPIC mapping - * error + * 02/04/30 J.I. Lee bug fix in find_iosapic to fix ACPI PCI IRQ to + * IOSAPIC mapping error * 02/07/29 T. Kochi Allocate interrupt vectors dynamically - * 02/08/04 T. Kochi Cleaned up terminology (irq, global system interrupt, vector, etc.) - * 02/09/20 D. Mosberger Simplified by taking advantage of ACPI's pci_irq code. + * 02/08/04 T. Kochi Cleaned up terminology (irq, global system + * interrupt, vector, etc.) + * 02/09/20 D. Mosberger Simplified by taking advantage of ACPI's + * pci_irq code. * 03/02/19 B. Helgaas Make pcat_compat system-wide, not per-IOSAPIC. - * Remove iosapic_address & gsi_base from external interfaces. - * Rationalize __init/__devinit attributes. + * Remove iosapic_address & gsi_base from + * external interfaces. Rationalize + * __init/__devinit attributes. * 04/12/04 Ashok Raj <ashok.raj@intel.com> Intel Corporation 2004 - * Updated to work with irq migration necessary for CPU Hotplug + * Updated to work with irq migration necessary + * for CPU Hotplug */ /* - * Here is what the interrupt logic between a PCI device and the kernel looks like: + * Here is what the interrupt logic between a PCI device and the kernel looks + * like: * - * (1) A PCI device raises one of the four interrupt pins (INTA, INTB, INTC, INTD). The - * device is uniquely identified by its bus--, and slot-number (the function - * number does not matter here because all functions share the same interrupt - * lines). + * (1) A PCI device raises one of the four interrupt pins (INTA, INTB, INTC, + * INTD). The device is uniquely identified by its bus-, and slot-number + * (the function number does not matter here because all functions share + * the same interrupt lines). * - * (2) The motherboard routes the interrupt line to a pin on a IOSAPIC controller. - * Multiple interrupt lines may have to share the same IOSAPIC pin (if they're level - * triggered and use the same polarity). Each interrupt line has a unique Global - * System Interrupt (GSI) number which can be calculated as the sum of the controller's - * base GSI number and the IOSAPIC pin number to which the line connects. + * (2) The motherboard routes the interrupt line to a pin on a IOSAPIC + * controller. Multiple interrupt lines may have to share the same + * IOSAPIC pin (if they're level triggered and use the same polarity). + * Each interrupt line has a unique Global System Interrupt (GSI) number + * which can be calculated as the sum of the controller's base GSI number + * and the IOSAPIC pin number to which the line connects. * - * (3) The IOSAPIC uses an internal routing table entries (RTEs) to map the IOSAPIC pin - * into the IA-64 interrupt vector. This interrupt vector is then sent to the CPU. + * (3) The IOSAPIC uses an internal routing table entries (RTEs) to map the + * IOSAPIC pin into the IA-64 interrupt vector. This interrupt vector is then + * sent to the CPU. * - * (4) The kernel recognizes an interrupt as an IRQ. The IRQ interface is used as - * architecture-independent interrupt handling mechanism in Linux. As an - * IRQ is a number, we have to have IA-64 interrupt vector number <-> IRQ number - * mapping. On smaller systems, we use one-to-one mapping between IA-64 vector and - * IRQ. A platform can implement platform_irq_to_vector(irq) and + * (4) The kernel recognizes an interrupt as an IRQ. The IRQ interface is + * used as architecture-independent interrupt handling mechanism in Linux. + * As an IRQ is a number, we have to have + * IA-64 interrupt vector number <-> IRQ number mapping. On smaller + * systems, we use one-to-one mapping between IA-64 vector and IRQ. A + * platform can implement platform_irq_to_vector(irq) and * platform_local_vector_to_irq(vector) APIs to differentiate the mapping. - * Please see also include/asm-ia64/hw_irq.h for those APIs. + * Please see also arch/ia64/include/asm/hw_irq.h for those APIs. * * To sum up, there are three levels of mappings involved: * * PCI pin -> global system interrupt (GSI) -> IA-64 vector <-> IRQ * - * Note: The term "IRQ" is loosely used everywhere in Linux kernel to describe interrupts. - * Now we use "IRQ" only for Linux IRQ's. ISA IRQ (isa_irq) is the only exception in this - * source code. + * Note: The term "IRQ" is loosely used everywhere in Linux kernel to + * describe interrupts. Now we use "IRQ" only for Linux IRQ's. ISA IRQ + * (isa_irq) is the only exception in this source code. */ -#include <linux/config.h> #include <linux/acpi.h> #include <linux/init.h> @@ -76,8 +86,8 @@ #include <linux/kernel.h> #include <linux/list.h> #include <linux/pci.h> +#include <linux/slab.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/string.h> #include <linux/bootmem.h> @@ -88,8 +98,6 @@ #include <asm/machvec.h> #include <asm/processor.h> #include <asm/ptrace.h> -#include <asm/system.h> - #undef DEBUG_INTERRUPT_ROUTING @@ -99,46 +107,57 @@ #define DBG(fmt...) #endif -#define NR_PREALLOCATE_RTE_ENTRIES (PAGE_SIZE / sizeof(struct iosapic_rte_info)) -#define RTE_PREALLOCATED (1) - static DEFINE_SPINLOCK(iosapic_lock); -/* These tables map IA-64 vectors to the IOSAPIC pin that generates this vector. */ +/* + * These tables map IA-64 vectors to the IOSAPIC pin that generates this + * vector. + */ + +#define NO_REF_RTE 0 -struct iosapic_rte_info { - struct list_head rte_list; /* node in list of RTEs sharing the same vector */ +static struct iosapic { char __iomem *addr; /* base address of IOSAPIC */ - unsigned int gsi_base; /* first GSI assigned to this IOSAPIC */ + unsigned int gsi_base; /* GSI base */ + unsigned short num_rte; /* # of RTEs on this IOSAPIC */ + int rtes_inuse; /* # of RTEs in use on this IOSAPIC */ +#ifdef CONFIG_NUMA + unsigned short node; /* numa node association via pxm */ +#endif + spinlock_t lock; /* lock for indirect reg access */ +} iosapic_lists[NR_IOSAPICS]; + +struct iosapic_rte_info { + struct list_head rte_list; /* RTEs sharing the same vector */ char rte_index; /* IOSAPIC RTE index */ int refcnt; /* reference counter */ - unsigned int flags; /* flags */ + struct iosapic *iosapic; } ____cacheline_aligned; static struct iosapic_intr_info { - struct list_head rtes; /* RTEs using this vector (empty => not an IOSAPIC interrupt) */ - int count; /* # of RTEs that shares this vector */ - u32 low32; /* current value of low word of Redirection table entry */ + struct list_head rtes; /* RTEs using this vector (empty => + * not an IOSAPIC interrupt) */ + int count; /* # of registered RTEs */ + u32 low32; /* current value of low word of + * Redirection table entry */ unsigned int dest; /* destination CPU physical ID */ unsigned char dmode : 3; /* delivery mode (see iosapic.h) */ - unsigned char polarity: 1; /* interrupt polarity (see iosapic.h) */ + unsigned char polarity: 1; /* interrupt polarity + * (see iosapic.h) */ unsigned char trigger : 1; /* trigger mode (see iosapic.h) */ -} iosapic_intr_info[IA64_NUM_VECTORS]; +} iosapic_intr_info[NR_IRQS]; -static struct iosapic { - char __iomem *addr; /* base address of IOSAPIC */ - unsigned int gsi_base; /* first GSI assigned to this IOSAPIC */ - unsigned short num_rte; /* number of RTE in this IOSAPIC */ - int rtes_inuse; /* # of RTEs in use on this IOSAPIC */ -#ifdef CONFIG_NUMA - unsigned short node; /* numa node association via pxm */ -#endif -} iosapic_lists[NR_IOSAPICS]; +static unsigned char pcat_compat; /* 8259 compatibility flag */ -static unsigned char pcat_compat __devinitdata; /* 8259 compatibility flag */ +static inline void +iosapic_write(struct iosapic *iosapic, unsigned int reg, u32 val) +{ + unsigned long flags; -static int iosapic_kmalloc_ok; -static LIST_HEAD(free_rte_list); + spin_lock_irqsave(&iosapic->lock, flags); + __iosapic_write(iosapic->addr, reg, val); + spin_unlock_irqrestore(&iosapic->lock, flags); +} /* * Find an IOSAPIC associated with a GSI @@ -149,98 +168,76 @@ find_iosapic (unsigned int gsi) int i; for (i = 0; i < NR_IOSAPICS; i++) { - if ((unsigned) (gsi - iosapic_lists[i].gsi_base) < iosapic_lists[i].num_rte) + if ((unsigned) (gsi - iosapic_lists[i].gsi_base) < + iosapic_lists[i].num_rte) return i; } return -1; } -static inline int -_gsi_to_vector (unsigned int gsi) +static inline int __gsi_to_irq(unsigned int gsi) { + int irq; struct iosapic_intr_info *info; struct iosapic_rte_info *rte; - for (info = iosapic_intr_info; info < iosapic_intr_info + IA64_NUM_VECTORS; ++info) + for (irq = 0; irq < NR_IRQS; irq++) { + info = &iosapic_intr_info[irq]; list_for_each_entry(rte, &info->rtes, rte_list) - if (rte->gsi_base + rte->rte_index == gsi) - return info - iosapic_intr_info; + if (rte->iosapic->gsi_base + rte->rte_index == gsi) + return irq; + } return -1; } -/* - * Translate GSI number to the corresponding IA-64 interrupt vector. If no - * entry exists, return -1. - */ -inline int -gsi_to_vector (unsigned int gsi) -{ - return _gsi_to_vector(gsi); -} - int gsi_to_irq (unsigned int gsi) { unsigned long flags; int irq; - /* - * XXX fix me: this assumes an identity mapping vetween IA-64 vector and Linux irq - * numbers... - */ + spin_lock_irqsave(&iosapic_lock, flags); - { - irq = _gsi_to_vector(gsi); - } + irq = __gsi_to_irq(gsi); spin_unlock_irqrestore(&iosapic_lock, flags); - return irq; } -static struct iosapic_rte_info *gsi_vector_to_rte(unsigned int gsi, unsigned int vec) +static struct iosapic_rte_info *find_rte(unsigned int irq, unsigned int gsi) { struct iosapic_rte_info *rte; - list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) - if (rte->gsi_base + rte->rte_index == gsi) + list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) + if (rte->iosapic->gsi_base + rte->rte_index == gsi) return rte; return NULL; } static void -set_rte (unsigned int gsi, unsigned int vector, unsigned int dest, int mask) +set_rte (unsigned int gsi, unsigned int irq, unsigned int dest, int mask) { unsigned long pol, trigger, dmode; u32 low32, high32; - char __iomem *addr; int rte_index; char redir; struct iosapic_rte_info *rte; + ia64_vector vector = irq_to_vector(irq); DBG(KERN_DEBUG"IOSAPIC: routing vector %d to 0x%x\n", vector, dest); - rte = gsi_vector_to_rte(gsi, vector); + rte = find_rte(irq, gsi); if (!rte) return; /* not an IOSAPIC interrupt */ rte_index = rte->rte_index; - addr = rte->addr; - pol = iosapic_intr_info[vector].polarity; - trigger = iosapic_intr_info[vector].trigger; - dmode = iosapic_intr_info[vector].dmode; + pol = iosapic_intr_info[irq].polarity; + trigger = iosapic_intr_info[irq].trigger; + dmode = iosapic_intr_info[irq].dmode; redir = (dmode == IOSAPIC_LOWEST_PRIORITY) ? 1 : 0; #ifdef CONFIG_SMP - { - unsigned int irq; - - for (irq = 0; irq < NR_IRQS; ++irq) - if (irq_to_vector(irq) == vector) { - set_irq_affinity_info(irq, (int)(dest & 0xffff), redir); - break; - } - } + set_irq_affinity_info(irq, (int)(dest & 0xffff), redir); #endif low32 = ((pol << IOSAPIC_POLARITY_SHIFT) | @@ -252,120 +249,132 @@ set_rte (unsigned int gsi, unsigned int vector, unsigned int dest, int mask) /* dest contains both id and eid */ high32 = (dest << IOSAPIC_DEST_SHIFT); - iosapic_write(addr, IOSAPIC_RTE_HIGH(rte_index), high32); - iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32); - iosapic_intr_info[vector].low32 = low32; - iosapic_intr_info[vector].dest = dest; + iosapic_write(rte->iosapic, IOSAPIC_RTE_HIGH(rte_index), high32); + iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32); + iosapic_intr_info[irq].low32 = low32; + iosapic_intr_info[irq].dest = dest; } static void -nop (unsigned int vector) +nop (struct irq_data *data) { /* do nothing... */ } + +#ifdef CONFIG_KEXEC +void +kexec_disable_iosapic(void) +{ + struct iosapic_intr_info *info; + struct iosapic_rte_info *rte; + ia64_vector vec; + int irq; + + for (irq = 0; irq < NR_IRQS; irq++) { + info = &iosapic_intr_info[irq]; + vec = irq_to_vector(irq); + list_for_each_entry(rte, &info->rtes, + rte_list) { + iosapic_write(rte->iosapic, + IOSAPIC_RTE_LOW(rte->rte_index), + IOSAPIC_MASK|vec); + iosapic_eoi(rte->iosapic->addr, vec); + } + } +} +#endif + static void -mask_irq (unsigned int irq) +mask_irq (struct irq_data *data) { - unsigned long flags; - char __iomem *addr; + unsigned int irq = data->irq; u32 low32; int rte_index; - ia64_vector vec = irq_to_vector(irq); struct iosapic_rte_info *rte; - if (list_empty(&iosapic_intr_info[vec].rtes)) + if (!iosapic_intr_info[irq].count) return; /* not an IOSAPIC interrupt! */ - spin_lock_irqsave(&iosapic_lock, flags); - { - /* set only the mask bit */ - low32 = iosapic_intr_info[vec].low32 |= IOSAPIC_MASK; - list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) { - addr = rte->addr; - rte_index = rte->rte_index; - iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32); - } + /* set only the mask bit */ + low32 = iosapic_intr_info[irq].low32 |= IOSAPIC_MASK; + list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) { + rte_index = rte->rte_index; + iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32); } - spin_unlock_irqrestore(&iosapic_lock, flags); } static void -unmask_irq (unsigned int irq) +unmask_irq (struct irq_data *data) { - unsigned long flags; - char __iomem *addr; + unsigned int irq = data->irq; u32 low32; int rte_index; - ia64_vector vec = irq_to_vector(irq); struct iosapic_rte_info *rte; - if (list_empty(&iosapic_intr_info[vec].rtes)) + if (!iosapic_intr_info[irq].count) return; /* not an IOSAPIC interrupt! */ - spin_lock_irqsave(&iosapic_lock, flags); - { - low32 = iosapic_intr_info[vec].low32 &= ~IOSAPIC_MASK; - list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) { - addr = rte->addr; - rte_index = rte->rte_index; - iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32); - } + low32 = iosapic_intr_info[irq].low32 &= ~IOSAPIC_MASK; + list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) { + rte_index = rte->rte_index; + iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32); } - spin_unlock_irqrestore(&iosapic_lock, flags); } -static void -iosapic_set_affinity (unsigned int irq, cpumask_t mask) +static int +iosapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { #ifdef CONFIG_SMP - unsigned long flags; + unsigned int irq = data->irq; u32 high32, low32; - int dest, rte_index; - char __iomem *addr; + int cpu, dest, rte_index; int redir = (irq & IA64_IRQ_REDIRECTED) ? 1 : 0; - ia64_vector vec; struct iosapic_rte_info *rte; + struct iosapic *iosapic; irq &= (~IA64_IRQ_REDIRECTED); - vec = irq_to_vector(irq); - if (cpus_empty(mask)) - return; + cpu = cpumask_first_and(cpu_online_mask, mask); + if (cpu >= nr_cpu_ids) + return -1; - dest = cpu_physical_id(first_cpu(mask)); + if (irq_prepare_move(irq, cpu)) + return -1; + + dest = cpu_physical_id(cpu); - if (list_empty(&iosapic_intr_info[vec].rtes)) - return; /* not an IOSAPIC interrupt */ + if (!iosapic_intr_info[irq].count) + return -1; /* not an IOSAPIC interrupt */ set_irq_affinity_info(irq, dest, redir); /* dest contains both id and eid */ high32 = dest << IOSAPIC_DEST_SHIFT; - spin_lock_irqsave(&iosapic_lock, flags); - { - low32 = iosapic_intr_info[vec].low32 & ~(7 << IOSAPIC_DELIVERY_SHIFT); - - if (redir) - /* change delivery mode to lowest priority */ - low32 |= (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT); - else - /* change delivery mode to fixed */ - low32 |= (IOSAPIC_FIXED << IOSAPIC_DELIVERY_SHIFT); - - iosapic_intr_info[vec].low32 = low32; - iosapic_intr_info[vec].dest = dest; - list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) { - addr = rte->addr; - rte_index = rte->rte_index; - iosapic_write(addr, IOSAPIC_RTE_HIGH(rte_index), high32); - iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32); - } + low32 = iosapic_intr_info[irq].low32 & ~(7 << IOSAPIC_DELIVERY_SHIFT); + if (redir) + /* change delivery mode to lowest priority */ + low32 |= (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT); + else + /* change delivery mode to fixed */ + low32 |= (IOSAPIC_FIXED << IOSAPIC_DELIVERY_SHIFT); + low32 &= IOSAPIC_VECTOR_MASK; + low32 |= irq_to_vector(irq); + + iosapic_intr_info[irq].low32 = low32; + iosapic_intr_info[irq].dest = dest; + list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) { + iosapic = rte->iosapic; + rte_index = rte->rte_index; + iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32); + iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32); } - spin_unlock_irqrestore(&iosapic_lock, flags); + #endif + return 0; } /* @@ -373,21 +382,34 @@ iosapic_set_affinity (unsigned int irq, cpumask_t mask) */ static unsigned int -iosapic_startup_level_irq (unsigned int irq) +iosapic_startup_level_irq (struct irq_data *data) { - unmask_irq(irq); + unmask_irq(data); return 0; } static void -iosapic_end_level_irq (unsigned int irq) +iosapic_unmask_level_irq (struct irq_data *data) { + unsigned int irq = data->irq; ia64_vector vec = irq_to_vector(irq); struct iosapic_rte_info *rte; + int do_unmask_irq = 0; + + irq_complete_move(irq); + if (unlikely(irqd_is_setaffinity_pending(data))) { + do_unmask_irq = 1; + mask_irq(data); + } else + unmask_irq(data); + + list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) + iosapic_eoi(rte->iosapic->addr, vec); - move_irq(irq); - list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) - iosapic_eoi(rte->addr, vec); + if (unlikely(do_unmask_irq)) { + irq_move_masked_irq(data); + unmask_irq(data); + } } #define iosapic_shutdown_level_irq mask_irq @@ -395,15 +417,16 @@ iosapic_end_level_irq (unsigned int irq) #define iosapic_disable_level_irq mask_irq #define iosapic_ack_level_irq nop -struct hw_interrupt_type irq_type_iosapic_level = { - .typename = "IO-SAPIC-level", - .startup = iosapic_startup_level_irq, - .shutdown = iosapic_shutdown_level_irq, - .enable = iosapic_enable_level_irq, - .disable = iosapic_disable_level_irq, - .ack = iosapic_ack_level_irq, - .end = iosapic_end_level_irq, - .set_affinity = iosapic_set_affinity +static struct irq_chip irq_type_iosapic_level = { + .name = "IO-SAPIC-level", + .irq_startup = iosapic_startup_level_irq, + .irq_shutdown = iosapic_shutdown_level_irq, + .irq_enable = iosapic_enable_level_irq, + .irq_disable = iosapic_disable_level_irq, + .irq_ack = iosapic_ack_level_irq, + .irq_mask = mask_irq, + .irq_unmask = iosapic_unmask_level_irq, + .irq_set_affinity = iosapic_set_affinity }; /* @@ -411,9 +434,9 @@ struct hw_interrupt_type irq_type_iosapic_level = { */ static unsigned int -iosapic_startup_edge_irq (unsigned int irq) +iosapic_startup_edge_irq (struct irq_data *data) { - unmask_irq(irq); + unmask_irq(data); /* * IOSAPIC simply drops interrupts pended while the * corresponding pin was masked, so we can't know if an @@ -423,36 +446,28 @@ iosapic_startup_edge_irq (unsigned int irq) } static void -iosapic_ack_edge_irq (unsigned int irq) +iosapic_ack_edge_irq (struct irq_data *data) { - irq_desc_t *idesc = irq_descp(irq); - - move_irq(irq); - /* - * Once we have recorded IRQ_PENDING already, we can mask the - * interrupt for real. This prevents IRQ storms from unhandled - * devices. - */ - if ((idesc->status & (IRQ_PENDING|IRQ_DISABLED)) == (IRQ_PENDING|IRQ_DISABLED)) - mask_irq(irq); + irq_complete_move(data->irq); + irq_move_irq(data); } #define iosapic_enable_edge_irq unmask_irq #define iosapic_disable_edge_irq nop -#define iosapic_end_edge_irq nop - -struct hw_interrupt_type irq_type_iosapic_edge = { - .typename = "IO-SAPIC-edge", - .startup = iosapic_startup_edge_irq, - .shutdown = iosapic_disable_edge_irq, - .enable = iosapic_enable_edge_irq, - .disable = iosapic_disable_edge_irq, - .ack = iosapic_ack_edge_irq, - .end = iosapic_end_edge_irq, - .set_affinity = iosapic_set_affinity + +static struct irq_chip irq_type_iosapic_edge = { + .name = "IO-SAPIC-edge", + .irq_startup = iosapic_startup_edge_irq, + .irq_shutdown = iosapic_disable_edge_irq, + .irq_enable = iosapic_enable_edge_irq, + .irq_disable = iosapic_disable_edge_irq, + .irq_ack = iosapic_ack_edge_irq, + .irq_mask = mask_irq, + .irq_unmask = unmask_irq, + .irq_set_affinity = iosapic_set_affinity }; -unsigned int +static unsigned int iosapic_version (char __iomem *addr) { /* @@ -464,12 +479,12 @@ iosapic_version (char __iomem *addr) * unsigned int reserved2 : 8; * } */ - return iosapic_read(addr, IOSAPIC_VERSION); + return __iosapic_read(addr, IOSAPIC_VERSION); } -static int iosapic_find_sharable_vector (unsigned long trigger, unsigned long pol) +static int iosapic_find_sharable_irq(unsigned long trigger, unsigned long pol) { - int i, vector = -1, min_count = -1; + int i, irq = -ENOSPC, min_count = -1; struct iosapic_intr_info *info; /* @@ -477,20 +492,21 @@ static int iosapic_find_sharable_vector (unsigned long trigger, unsigned long po * supported yet */ if (trigger == IOSAPIC_EDGE) - return -1; + return -EINVAL; - for (i = IA64_FIRST_DEVICE_VECTOR; i <= IA64_LAST_DEVICE_VECTOR; i++) { + for (i = 0; i < NR_IRQS; i++) { info = &iosapic_intr_info[i]; if (info->trigger == trigger && info->polarity == pol && - (info->dmode == IOSAPIC_FIXED || info->dmode == IOSAPIC_LOWEST_PRIORITY)) { + (info->dmode == IOSAPIC_FIXED || + info->dmode == IOSAPIC_LOWEST_PRIORITY) && + can_request_irq(i, IRQF_SHARED)) { if (min_count == -1 || info->count < min_count) { - vector = i; + irq = i; min_count = info->count; } } } - - return vector; + return irq; } /* @@ -498,146 +514,122 @@ static int iosapic_find_sharable_vector (unsigned long trigger, unsigned long po * assign a new vector for the other and make the vector available */ static void __init -iosapic_reassign_vector (int vector) +iosapic_reassign_vector (int irq) { - int new_vector; - - if (!list_empty(&iosapic_intr_info[vector].rtes)) { - new_vector = assign_irq_vector(AUTO_ASSIGN); - if (new_vector < 0) - panic("%s: out of interrupt vectors!\n", __FUNCTION__); - printk(KERN_INFO "Reassigning vector %d to %d\n", vector, new_vector); - memcpy(&iosapic_intr_info[new_vector], &iosapic_intr_info[vector], + int new_irq; + + if (iosapic_intr_info[irq].count) { + new_irq = create_irq(); + if (new_irq < 0) + panic("%s: out of interrupt vectors!\n", __func__); + printk(KERN_INFO "Reassigning vector %d to %d\n", + irq_to_vector(irq), irq_to_vector(new_irq)); + memcpy(&iosapic_intr_info[new_irq], &iosapic_intr_info[irq], sizeof(struct iosapic_intr_info)); - INIT_LIST_HEAD(&iosapic_intr_info[new_vector].rtes); - list_move(iosapic_intr_info[vector].rtes.next, &iosapic_intr_info[new_vector].rtes); - memset(&iosapic_intr_info[vector], 0, sizeof(struct iosapic_intr_info)); - iosapic_intr_info[vector].low32 = IOSAPIC_MASK; - INIT_LIST_HEAD(&iosapic_intr_info[vector].rtes); + INIT_LIST_HEAD(&iosapic_intr_info[new_irq].rtes); + list_move(iosapic_intr_info[irq].rtes.next, + &iosapic_intr_info[new_irq].rtes); + memset(&iosapic_intr_info[irq], 0, + sizeof(struct iosapic_intr_info)); + iosapic_intr_info[irq].low32 = IOSAPIC_MASK; + INIT_LIST_HEAD(&iosapic_intr_info[irq].rtes); } } -static struct iosapic_rte_info *iosapic_alloc_rte (void) +static inline int irq_is_shared (int irq) { - int i; - struct iosapic_rte_info *rte; - int preallocated = 0; - - if (!iosapic_kmalloc_ok && list_empty(&free_rte_list)) { - rte = alloc_bootmem(sizeof(struct iosapic_rte_info) * NR_PREALLOCATE_RTE_ENTRIES); - if (!rte) - return NULL; - for (i = 0; i < NR_PREALLOCATE_RTE_ENTRIES; i++, rte++) - list_add(&rte->rte_list, &free_rte_list); - } - - if (!list_empty(&free_rte_list)) { - rte = list_entry(free_rte_list.next, struct iosapic_rte_info, rte_list); - list_del(&rte->rte_list); - preallocated++; - } else { - rte = kmalloc(sizeof(struct iosapic_rte_info), GFP_ATOMIC); - if (!rte) - return NULL; - } - - memset(rte, 0, sizeof(struct iosapic_rte_info)); - if (preallocated) - rte->flags |= RTE_PREALLOCATED; - - return rte; + return (iosapic_intr_info[irq].count > 1); } -static void iosapic_free_rte (struct iosapic_rte_info *rte) +struct irq_chip* +ia64_native_iosapic_get_irq_chip(unsigned long trigger) { - if (rte->flags & RTE_PREALLOCATED) - list_add_tail(&rte->rte_list, &free_rte_list); + if (trigger == IOSAPIC_EDGE) + return &irq_type_iosapic_edge; else - kfree(rte); -} - -static inline int vector_is_shared (int vector) -{ - return (iosapic_intr_info[vector].count > 1); + return &irq_type_iosapic_level; } static int -register_intr (unsigned int gsi, int vector, unsigned char delivery, +register_intr (unsigned int gsi, int irq, unsigned char delivery, unsigned long polarity, unsigned long trigger) { - irq_desc_t *idesc; - struct hw_interrupt_type *irq_type; - int rte_index; + struct irq_chip *chip, *irq_type; int index; - unsigned long gsi_base; - void __iomem *iosapic_address; struct iosapic_rte_info *rte; index = find_iosapic(gsi); if (index < 0) { - printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n", __FUNCTION__, gsi); + printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n", + __func__, gsi); return -ENODEV; } - iosapic_address = iosapic_lists[index].addr; - gsi_base = iosapic_lists[index].gsi_base; - - rte = gsi_vector_to_rte(gsi, vector); + rte = find_rte(irq, gsi); if (!rte) { - rte = iosapic_alloc_rte(); + rte = kzalloc(sizeof (*rte), GFP_ATOMIC); if (!rte) { - printk(KERN_WARNING "%s: cannot allocate memory\n", __FUNCTION__); + printk(KERN_WARNING "%s: cannot allocate memory\n", + __func__); return -ENOMEM; } - rte_index = gsi - gsi_base; - rte->rte_index = rte_index; - rte->addr = iosapic_address; - rte->gsi_base = gsi_base; + rte->iosapic = &iosapic_lists[index]; + rte->rte_index = gsi - rte->iosapic->gsi_base; rte->refcnt++; - list_add_tail(&rte->rte_list, &iosapic_intr_info[vector].rtes); - iosapic_intr_info[vector].count++; + list_add_tail(&rte->rte_list, &iosapic_intr_info[irq].rtes); + iosapic_intr_info[irq].count++; iosapic_lists[index].rtes_inuse++; } - else if (vector_is_shared(vector)) { - struct iosapic_intr_info *info = &iosapic_intr_info[vector]; - if (info->trigger != trigger || info->polarity != polarity) { - printk (KERN_WARNING "%s: cannot override the interrupt\n", __FUNCTION__); + else if (rte->refcnt == NO_REF_RTE) { + struct iosapic_intr_info *info = &iosapic_intr_info[irq]; + if (info->count > 0 && + (info->trigger != trigger || info->polarity != polarity)){ + printk (KERN_WARNING + "%s: cannot override the interrupt\n", + __func__); return -EINVAL; } + rte->refcnt++; + iosapic_intr_info[irq].count++; + iosapic_lists[index].rtes_inuse++; } - iosapic_intr_info[vector].polarity = polarity; - iosapic_intr_info[vector].dmode = delivery; - iosapic_intr_info[vector].trigger = trigger; + iosapic_intr_info[irq].polarity = polarity; + iosapic_intr_info[irq].dmode = delivery; + iosapic_intr_info[irq].trigger = trigger; - if (trigger == IOSAPIC_EDGE) - irq_type = &irq_type_iosapic_edge; - else - irq_type = &irq_type_iosapic_level; - - idesc = irq_descp(vector); - if (idesc->handler != irq_type) { - if (idesc->handler != &no_irq_type) - printk(KERN_WARNING "%s: changing vector %d from %s to %s\n", - __FUNCTION__, vector, idesc->handler->typename, irq_type->typename); - idesc->handler = irq_type; + irq_type = iosapic_get_irq_chip(trigger); + + chip = irq_get_chip(irq); + if (irq_type != NULL && chip != irq_type) { + if (chip != &no_irq_chip) + printk(KERN_WARNING + "%s: changing vector %d from %s to %s\n", + __func__, irq_to_vector(irq), + chip->name, irq_type->name); + chip = irq_type; } + __irq_set_chip_handler_name_locked(irq, chip, trigger == IOSAPIC_EDGE ? + handle_edge_irq : handle_level_irq, + NULL); return 0; } static unsigned int -get_target_cpu (unsigned int gsi, int vector) +get_target_cpu (unsigned int gsi, int irq) { #ifdef CONFIG_SMP static int cpu = -1; + extern int cpe_vector; + cpumask_t domain = irq_to_domain(irq); /* * In case of vector shared by multiple RTEs, all RTEs that * share the vector need to use the same destination CPU. */ - if (!list_empty(&iosapic_intr_info[vector].rtes)) - return iosapic_intr_info[vector].dest; + if (iosapic_intr_info[irq].count) + return iosapic_intr_info[irq].dest; /* * If the platform supports redirection via XTP, let it @@ -653,35 +645,39 @@ get_target_cpu (unsigned int gsi, int vector) if (!cpu_online(smp_processor_id())) return cpu_physical_id(smp_processor_id()); +#ifdef CONFIG_ACPI + if (cpe_vector > 0 && irq_to_vector(irq) == IA64_CPEP_VECTOR) + return get_cpei_target_cpu(); +#endif + #ifdef CONFIG_NUMA { int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0; - cpumask_t cpu_mask; + const struct cpumask *cpu_mask; iosapic_index = find_iosapic(gsi); if (iosapic_index < 0 || iosapic_lists[iosapic_index].node == MAX_NUMNODES) goto skip_numa_setup; - cpu_mask = node_to_cpumask(iosapic_lists[iosapic_index].node); - - for_each_cpu_mask(numa_cpu, cpu_mask) { - if (!cpu_online(numa_cpu)) - cpu_clear(numa_cpu, cpu_mask); + cpu_mask = cpumask_of_node(iosapic_lists[iosapic_index].node); + num_cpus = 0; + for_each_cpu_and(numa_cpu, cpu_mask, &domain) { + if (cpu_online(numa_cpu)) + num_cpus++; } - num_cpus = cpus_weight(cpu_mask); - if (!num_cpus) goto skip_numa_setup; - /* Use vector assigment to distribute across cpus in node */ - cpu_index = vector % num_cpus; + /* Use irq assignment to distribute across cpus in node */ + cpu_index = irq % num_cpus; - for (numa_cpu = first_cpu(cpu_mask) ; i < cpu_index ; i++) - numa_cpu = next_cpu(numa_cpu, cpu_mask); + for_each_cpu_and(numa_cpu, cpu_mask, &domain) + if (cpu_online(numa_cpu) && i++ >= cpu_index) + break; - if (numa_cpu != NR_CPUS) + if (numa_cpu < nr_cpu_ids) return cpu_physical_id(numa_cpu); } skip_numa_setup: @@ -692,16 +688,25 @@ skip_numa_setup: * case of NUMA.) */ do { - if (++cpu >= NR_CPUS) + if (++cpu >= nr_cpu_ids) cpu = 0; - } while (!cpu_online(cpu)); + } while (!cpu_online(cpu) || !cpu_isset(cpu, domain)); return cpu_physical_id(cpu); -#else +#else /* CONFIG_SMP */ return cpu_physical_id(smp_processor_id()); #endif } +static inline unsigned char choose_dmode(void) +{ +#ifdef CONFIG_SMP + if (smp_int_redirect & SMP_IRQ_REDIRECTION) + return IOSAPIC_LOWEST_PRIORITY; +#endif + return IOSAPIC_FIXED; +} + /* * ACPI can describe IOSAPIC interrupts via static tables and namespace * methods. This provides an interface to register those interrupts and @@ -711,83 +716,76 @@ int iosapic_register_intr (unsigned int gsi, unsigned long polarity, unsigned long trigger) { - int vector, mask = 1, err; + int irq, mask = 1, err; unsigned int dest; unsigned long flags; struct iosapic_rte_info *rte; u32 low32; -again: + unsigned char dmode; + struct irq_desc *desc; + /* * If this GSI has already been registered (i.e., it's a * shared interrupt, or we lost a race to register it), * don't touch the RTE. */ spin_lock_irqsave(&iosapic_lock, flags); - { - vector = gsi_to_vector(gsi); - if (vector > 0) { - rte = gsi_vector_to_rte(gsi, vector); + irq = __gsi_to_irq(gsi); + if (irq > 0) { + rte = find_rte(irq, gsi); + if(iosapic_intr_info[irq].count == 0) { + assign_irq_vector(irq); + irq_init_desc(irq); + } else if (rte->refcnt != NO_REF_RTE) { rte->refcnt++; - spin_unlock_irqrestore(&iosapic_lock, flags); - return vector; + goto unlock_iosapic_lock; } - } - spin_unlock_irqrestore(&iosapic_lock, flags); + } else + irq = create_irq(); /* If vector is running out, we try to find a sharable vector */ - vector = assign_irq_vector(AUTO_ASSIGN); - if (vector < 0) { - vector = iosapic_find_sharable_vector(trigger, polarity); - if (vector < 0) - return -ENOSPC; + if (irq < 0) { + irq = iosapic_find_sharable_irq(trigger, polarity); + if (irq < 0) + goto unlock_iosapic_lock; } - spin_lock_irqsave(&irq_descp(vector)->lock, flags); - spin_lock(&iosapic_lock); - { - if (gsi_to_vector(gsi) > 0) { - if (list_empty(&iosapic_intr_info[vector].rtes)) - free_irq_vector(vector); - spin_unlock(&iosapic_lock); - spin_unlock_irqrestore(&irq_descp(vector)->lock, flags); - goto again; - } - - dest = get_target_cpu(gsi, vector); - err = register_intr(gsi, vector, IOSAPIC_LOWEST_PRIORITY, - polarity, trigger); - if (err < 0) { - spin_unlock(&iosapic_lock); - spin_unlock_irqrestore(&irq_descp(vector)->lock, flags); - return err; - } - - /* - * If the vector is shared and already unmasked for - * other interrupt sources, don't mask it. - */ - low32 = iosapic_intr_info[vector].low32; - if (vector_is_shared(vector) && !(low32 & IOSAPIC_MASK)) - mask = 0; - set_rte(gsi, vector, dest, mask); + desc = irq_to_desc(irq); + raw_spin_lock(&desc->lock); + dest = get_target_cpu(gsi, irq); + dmode = choose_dmode(); + err = register_intr(gsi, irq, dmode, polarity, trigger); + if (err < 0) { + raw_spin_unlock(&desc->lock); + irq = err; + goto unlock_iosapic_lock; } - spin_unlock(&iosapic_lock); - spin_unlock_irqrestore(&irq_descp(vector)->lock, flags); + + /* + * If the vector is shared and already unmasked for other + * interrupt sources, don't mask it. + */ + low32 = iosapic_intr_info[irq].low32; + if (irq_is_shared(irq) && !(low32 & IOSAPIC_MASK)) + mask = 0; + set_rte(gsi, irq, dest, mask); printk(KERN_INFO "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n", gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"), (polarity == IOSAPIC_POL_HIGH ? "high" : "low"), - cpu_logical_id(dest), dest, vector); + cpu_logical_id(dest), dest, irq_to_vector(irq)); - return vector; + raw_spin_unlock(&desc->lock); + unlock_iosapic_lock: + spin_unlock_irqrestore(&iosapic_lock, flags); + return irq; } void iosapic_unregister_intr (unsigned int gsi) { unsigned long flags; - int irq, vector, index; - irq_desc_t *idesc; + int irq, index; u32 low32; unsigned long trigger, polarity; unsigned int dest; @@ -800,74 +798,64 @@ iosapic_unregister_intr (unsigned int gsi) */ irq = gsi_to_irq(gsi); if (irq < 0) { - printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", gsi); + printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", + gsi); WARN_ON(1); return; } - vector = irq_to_vector(irq); - idesc = irq_descp(irq); - spin_lock_irqsave(&idesc->lock, flags); - spin_lock(&iosapic_lock); - { - if ((rte = gsi_vector_to_rte(gsi, vector)) == NULL) { - printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", gsi); - WARN_ON(1); - goto out; - } + spin_lock_irqsave(&iosapic_lock, flags); + if ((rte = find_rte(irq, gsi)) == NULL) { + printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", + gsi); + WARN_ON(1); + goto out; + } - if (--rte->refcnt > 0) - goto out; + if (--rte->refcnt > 0) + goto out; - /* Mask the interrupt */ - low32 = iosapic_intr_info[vector].low32 | IOSAPIC_MASK; - iosapic_write(rte->addr, IOSAPIC_RTE_LOW(rte->rte_index), low32); - - /* Remove the rte entry from the list */ - list_del(&rte->rte_list); - iosapic_intr_info[vector].count--; - iosapic_free_rte(rte); - index = find_iosapic(gsi); - iosapic_lists[index].rtes_inuse--; - WARN_ON(iosapic_lists[index].rtes_inuse < 0); - - trigger = iosapic_intr_info[vector].trigger; - polarity = iosapic_intr_info[vector].polarity; - dest = iosapic_intr_info[vector].dest; - printk(KERN_INFO "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d unregistered\n", - gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"), - (polarity == IOSAPIC_POL_HIGH ? "high" : "low"), - cpu_logical_id(dest), dest, vector); - - if (list_empty(&iosapic_intr_info[vector].rtes)) { - /* Sanity check */ - BUG_ON(iosapic_intr_info[vector].count); - - /* Clear the interrupt controller descriptor */ - idesc->handler = &no_irq_type; - - /* Clear the interrupt information */ - memset(&iosapic_intr_info[vector], 0, sizeof(struct iosapic_intr_info)); - iosapic_intr_info[vector].low32 |= IOSAPIC_MASK; - INIT_LIST_HEAD(&iosapic_intr_info[vector].rtes); - - if (idesc->action) { - printk(KERN_ERR "interrupt handlers still exist on IRQ %u\n", irq); - WARN_ON(1); - } + rte->refcnt = NO_REF_RTE; - /* Free the interrupt vector */ - free_irq_vector(vector); - } + /* Mask the interrupt */ + low32 = iosapic_intr_info[irq].low32 | IOSAPIC_MASK; + iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte->rte_index), low32); + + iosapic_intr_info[irq].count--; + index = find_iosapic(gsi); + iosapic_lists[index].rtes_inuse--; + WARN_ON(iosapic_lists[index].rtes_inuse < 0); + + trigger = iosapic_intr_info[irq].trigger; + polarity = iosapic_intr_info[irq].polarity; + dest = iosapic_intr_info[irq].dest; + printk(KERN_INFO + "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d unregistered\n", + gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"), + (polarity == IOSAPIC_POL_HIGH ? "high" : "low"), + cpu_logical_id(dest), dest, irq_to_vector(irq)); + + if (iosapic_intr_info[irq].count == 0) { +#ifdef CONFIG_SMP + /* Clear affinity */ + cpumask_setall(irq_get_irq_data(irq)->affinity); +#endif + /* Clear the interrupt information */ + iosapic_intr_info[irq].dest = 0; + iosapic_intr_info[irq].dmode = 0; + iosapic_intr_info[irq].polarity = 0; + iosapic_intr_info[irq].trigger = 0; + iosapic_intr_info[irq].low32 |= IOSAPIC_MASK; + + /* Destroy and reserve IRQ */ + destroy_and_reserve_irq(irq); } out: - spin_unlock(&iosapic_lock); - spin_unlock_irqrestore(&idesc->lock, flags); + spin_unlock_irqrestore(&iosapic_lock, flags); } /* * ACPI calls this when it finds an entry for a platform interrupt. - * Note that the irq_base and IOSAPIC address must be set in iosapic_init(). */ int __init iosapic_register_platform_intr (u32 int_type, unsigned int gsi, @@ -876,94 +864,110 @@ iosapic_register_platform_intr (u32 int_type, unsigned int gsi, { static const char * const name[] = {"unknown", "PMI", "INIT", "CPEI"}; unsigned char delivery; - int vector, mask = 0; + int irq, vector, mask = 0; unsigned int dest = ((id << 8) | eid) & 0xffff; switch (int_type) { case ACPI_INTERRUPT_PMI: - vector = iosapic_vector; + irq = vector = iosapic_vector; + bind_irq_vector(irq, vector, CPU_MASK_ALL); /* * since PMI vector is alloc'd by FW(ACPI) not by kernel, * we need to make sure the vector is available */ - iosapic_reassign_vector(vector); + iosapic_reassign_vector(irq); delivery = IOSAPIC_PMI; break; case ACPI_INTERRUPT_INIT: - vector = assign_irq_vector(AUTO_ASSIGN); - if (vector < 0) - panic("%s: out of interrupt vectors!\n", __FUNCTION__); + irq = create_irq(); + if (irq < 0) + panic("%s: out of interrupt vectors!\n", __func__); + vector = irq_to_vector(irq); delivery = IOSAPIC_INIT; break; case ACPI_INTERRUPT_CPEI: - vector = IA64_CPE_VECTOR; - delivery = IOSAPIC_LOWEST_PRIORITY; + irq = vector = IA64_CPE_VECTOR; + BUG_ON(bind_irq_vector(irq, vector, CPU_MASK_ALL)); + delivery = IOSAPIC_FIXED; mask = 1; break; default: - printk(KERN_ERR "iosapic_register_platform_irq(): invalid int type 0x%x\n", int_type); + printk(KERN_ERR "%s: invalid int type 0x%x\n", __func__, + int_type); return -1; } - register_intr(gsi, vector, delivery, polarity, trigger); + register_intr(gsi, irq, delivery, polarity, trigger); - printk(KERN_INFO "PLATFORM int %s (0x%x): GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n", + printk(KERN_INFO + "PLATFORM int %s (0x%x): GSI %u (%s, %s) -> CPU %d (0x%04x)" + " vector %d\n", int_type < ARRAY_SIZE(name) ? name[int_type] : "unknown", int_type, gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"), (polarity == IOSAPIC_POL_HIGH ? "high" : "low"), cpu_logical_id(dest), dest, vector); - set_rte(gsi, vector, dest, mask); + set_rte(gsi, irq, dest, mask); return vector; } - /* * ACPI calls this when it finds an entry for a legacy ISA IRQ override. - * Note that the gsi_base and IOSAPIC address must be set in iosapic_init(). */ -void __init -iosapic_override_isa_irq (unsigned int isa_irq, unsigned int gsi, - unsigned long polarity, - unsigned long trigger) +void iosapic_override_isa_irq(unsigned int isa_irq, unsigned int gsi, + unsigned long polarity, unsigned long trigger) { - int vector; + int vector, irq; unsigned int dest = cpu_physical_id(smp_processor_id()); + unsigned char dmode; - vector = isa_irq_to_vector(isa_irq); - - register_intr(gsi, vector, IOSAPIC_LOWEST_PRIORITY, polarity, trigger); + irq = vector = isa_irq_to_vector(isa_irq); + BUG_ON(bind_irq_vector(irq, vector, CPU_MASK_ALL)); + dmode = choose_dmode(); + register_intr(gsi, irq, dmode, polarity, trigger); DBG("ISA: IRQ %u -> GSI %u (%s,%s) -> CPU %d (0x%04x) vector %d\n", isa_irq, gsi, trigger == IOSAPIC_EDGE ? "edge" : "level", polarity == IOSAPIC_POL_HIGH ? "high" : "low", cpu_logical_id(dest), dest, vector); - set_rte(gsi, vector, dest, 1); + set_rte(gsi, irq, dest, 1); } void __init -iosapic_system_init (int system_pcat_compat) +ia64_native_iosapic_pcat_compat_init(void) { - int vector; - - for (vector = 0; vector < IA64_NUM_VECTORS; ++vector) { - iosapic_intr_info[vector].low32 = IOSAPIC_MASK; - INIT_LIST_HEAD(&iosapic_intr_info[vector].rtes); /* mark as unused */ - } - - pcat_compat = system_pcat_compat; if (pcat_compat) { /* - * Disable the compatibility mode interrupts (8259 style), needs IN/OUT support - * enabled. + * Disable the compatibility mode interrupts (8259 style), + * needs IN/OUT support enabled. */ - printk(KERN_INFO "%s: Disabling PC-AT compatible 8259 interrupts\n", __FUNCTION__); + printk(KERN_INFO + "%s: Disabling PC-AT compatible 8259 interrupts\n", + __func__); outb(0xff, 0xA1); outb(0xff, 0x21); } } +void __init +iosapic_system_init (int system_pcat_compat) +{ + int irq; + + for (irq = 0; irq < NR_IRQS; ++irq) { + iosapic_intr_info[irq].low32 = IOSAPIC_MASK; + /* mark as unused */ + INIT_LIST_HEAD(&iosapic_intr_info[irq].rtes); + + iosapic_intr_info[irq].count = 0; + } + + pcat_compat = system_pcat_compat; + if (pcat_compat) + iosapic_pcat_compat_init(); +} + static inline int iosapic_alloc (void) { @@ -973,7 +977,7 @@ iosapic_alloc (void) if (!iosapic_lists[index].addr) return index; - printk(KERN_WARNING "%s: failed to allocate iosapic\n", __FUNCTION__); + printk(KERN_WARNING "%s: failed to allocate iosapic\n", __func__); return -1; } @@ -998,10 +1002,7 @@ iosapic_check_gsi_range (unsigned int gsi_base, unsigned int ver) base = iosapic_lists[index].gsi_base; end = base + iosapic_lists[index].num_rte - 1; - if (gsi_base < base && gsi_end < base) - continue;/* OK */ - - if (gsi_base > end && gsi_end > end) + if (gsi_end < base || end < gsi_base) continue; /* OK */ return -EBUSY; @@ -1009,8 +1010,27 @@ iosapic_check_gsi_range (unsigned int gsi_base, unsigned int ver) return 0; } -int __devinit -iosapic_init (unsigned long phys_addr, unsigned int gsi_base) +static int +iosapic_delete_rte(unsigned int irq, unsigned int gsi) +{ + struct iosapic_rte_info *rte, *temp; + + list_for_each_entry_safe(rte, temp, &iosapic_intr_info[irq].rtes, + rte_list) { + if (rte->iosapic->gsi_base + rte->rte_index == gsi) { + if (rte->refcnt) + return -EBUSY; + + list_del(&rte->rte_list); + kfree(rte); + return 0; + } + } + + return -EINVAL; +} + +int iosapic_init(unsigned long phys_addr, unsigned int gsi_base) { int num_rte, err, index; unsigned int isa_irq, ver; @@ -1018,97 +1038,104 @@ iosapic_init (unsigned long phys_addr, unsigned int gsi_base) unsigned long flags; spin_lock_irqsave(&iosapic_lock, flags); - { - addr = ioremap(phys_addr, 0); - ver = iosapic_version(addr); + index = find_iosapic(gsi_base); + if (index >= 0) { + spin_unlock_irqrestore(&iosapic_lock, flags); + return -EBUSY; + } - if ((err = iosapic_check_gsi_range(gsi_base, ver))) { - iounmap(addr); - spin_unlock_irqrestore(&iosapic_lock, flags); - return err; - } + addr = ioremap(phys_addr, 0); + if (addr == NULL) { + spin_unlock_irqrestore(&iosapic_lock, flags); + return -ENOMEM; + } + ver = iosapic_version(addr); + if ((err = iosapic_check_gsi_range(gsi_base, ver))) { + iounmap(addr); + spin_unlock_irqrestore(&iosapic_lock, flags); + return err; + } - /* - * The MAX_REDIR register holds the highest input pin - * number (starting from 0). - * We add 1 so that we can use it for number of pins (= RTEs) - */ - num_rte = ((ver >> 16) & 0xff) + 1; + /* + * The MAX_REDIR register holds the highest input pin number + * (starting from 0). We add 1 so that we can use it for + * number of pins (= RTEs) + */ + num_rte = ((ver >> 16) & 0xff) + 1; - index = iosapic_alloc(); - iosapic_lists[index].addr = addr; - iosapic_lists[index].gsi_base = gsi_base; - iosapic_lists[index].num_rte = num_rte; + index = iosapic_alloc(); + iosapic_lists[index].addr = addr; + iosapic_lists[index].gsi_base = gsi_base; + iosapic_lists[index].num_rte = num_rte; #ifdef CONFIG_NUMA - iosapic_lists[index].node = MAX_NUMNODES; + iosapic_lists[index].node = MAX_NUMNODES; #endif - } + spin_lock_init(&iosapic_lists[index].lock); spin_unlock_irqrestore(&iosapic_lock, flags); if ((gsi_base == 0) && pcat_compat) { /* - * Map the legacy ISA devices into the IOSAPIC data. Some of these may - * get reprogrammed later on with data from the ACPI Interrupt Source - * Override table. + * Map the legacy ISA devices into the IOSAPIC data. Some of + * these may get reprogrammed later on with data from the ACPI + * Interrupt Source Override table. */ for (isa_irq = 0; isa_irq < 16; ++isa_irq) - iosapic_override_isa_irq(isa_irq, isa_irq, IOSAPIC_POL_HIGH, IOSAPIC_EDGE); + iosapic_override_isa_irq(isa_irq, isa_irq, + IOSAPIC_POL_HIGH, + IOSAPIC_EDGE); } return 0; } -#ifdef CONFIG_HOTPLUG -int -iosapic_remove (unsigned int gsi_base) +int iosapic_remove(unsigned int gsi_base) { - int index, err = 0; + int i, irq, index, err = 0; unsigned long flags; spin_lock_irqsave(&iosapic_lock, flags); - { - index = find_iosapic(gsi_base); - if (index < 0) { - printk(KERN_WARNING "%s: No IOSAPIC for GSI base %u\n", - __FUNCTION__, gsi_base); - goto out; - } + index = find_iosapic(gsi_base); + if (index < 0) { + printk(KERN_WARNING "%s: No IOSAPIC for GSI base %u\n", + __func__, gsi_base); + goto out; + } - if (iosapic_lists[index].rtes_inuse) { - err = -EBUSY; - printk(KERN_WARNING "%s: IOSAPIC for GSI base %u is busy\n", - __FUNCTION__, gsi_base); - goto out; - } + if (iosapic_lists[index].rtes_inuse) { + err = -EBUSY; + printk(KERN_WARNING "%s: IOSAPIC for GSI base %u is busy\n", + __func__, gsi_base); + goto out; + } + + for (i = gsi_base; i < gsi_base + iosapic_lists[index].num_rte; i++) { + irq = __gsi_to_irq(i); + if (irq < 0) + continue; - iounmap(iosapic_lists[index].addr); - iosapic_free(index); + err = iosapic_delete_rte(irq, i); + if (err) + goto out; } + + iounmap(iosapic_lists[index].addr); + iosapic_free(index); out: spin_unlock_irqrestore(&iosapic_lock, flags); return err; } -#endif /* CONFIG_HOTPLUG */ #ifdef CONFIG_NUMA -void __devinit -map_iosapic_to_node(unsigned int gsi_base, int node) +void map_iosapic_to_node(unsigned int gsi_base, int node) { int index; index = find_iosapic(gsi_base); if (index < 0) { printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n", - __FUNCTION__, gsi_base); + __func__, gsi_base); return; } iosapic_lists[index].node = node; return; } #endif - -static int __init iosapic_enable_kmalloc (void) -{ - iosapic_kmalloc_ok = 1; - return 0; -} -core_initcall (iosapic_enable_kmalloc); diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c index d33244c3275..f2c41828113 100644 --- a/arch/ia64/kernel/irq.c +++ b/arch/ia64/kernel/irq.c @@ -4,7 +4,7 @@ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar * * This file contains the code used by various IRQ handling routines: - * asking for different IRQ's should be done through these routines + * asking for different IRQs should be done through these routines * instead of just grabbing them. Thus setups with different IRQ numbers * shouldn't result in any weird surprises, and installing new handlers * should be easier. @@ -12,7 +12,7 @@ * Copyright (C) Ashok Raj<ashok.raj@intel.com>, Intel Corporation 2004 * * 4/14/2004: Added code to handle cpu migration and do safe irq - * migration without lossing interrupts for iosapic + * migration without losing interrupts for iosapic * architecture. */ @@ -23,6 +23,8 @@ #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <asm/mca.h> + /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. @@ -33,9 +35,14 @@ void ack_bad_irq(unsigned int irq) } #ifdef CONFIG_IA64_GENERIC +ia64_vector __ia64_irq_to_vector(int irq) +{ + return irq_cfg[irq].vector; +} + unsigned int __ia64_local_vector_to_irq (ia64_vector vec) { - return (unsigned int) vec; + return __get_cpu_var(vector_irq)[vec]; } #endif @@ -48,45 +55,9 @@ atomic_t irq_err_count; /* * /proc/interrupts printing: */ - -int show_interrupts(struct seq_file *p, void *v) +int arch_show_interrupts(struct seq_file *p, int prec) { - int i = *(loff_t *) v, j; - struct irqaction * action; - unsigned long flags; - - if (i == 0) { - seq_printf(p, " "); - for_each_online_cpu(j) { - seq_printf(p, "CPU%d ",j); - } - seq_putc(p, '\n'); - } - - if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; - if (!action) - goto skip; - seq_printf(p, "%3d: ",i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for_each_online_cpu(j) { - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); - } -#endif - seq_printf(p, " %14s", irq_desc[i].handler->typename); - seq_printf(p, " %s", action->name); - - for (action=action->next; action; action = action->next) - seq_printf(p, ", %s", action->name); - - seq_putc(p, '\n'); -skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == NR_IRQS) - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); return 0; } @@ -95,33 +66,49 @@ static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 }; void set_irq_affinity_info (unsigned int irq, int hwid, int redir) { - cpumask_t mask = CPU_MASK_NONE; - - cpu_set(cpu_logical_id(hwid), mask); - if (irq < NR_IRQS) { - irq_affinity[irq] = mask; - set_irq_info(irq, mask); + cpumask_copy(irq_get_irq_data(irq)->affinity, + cpumask_of(cpu_logical_id(hwid))); irq_redir[irq] = (char) (redir & 0xff); } } + +bool is_affinity_mask_valid(const struct cpumask *cpumask) +{ + if (ia64_platform_is("sn2")) { + /* Only allow one CPU to be specified in the smp_affinity mask */ + if (cpumask_weight(cpumask) != 1) + return false; + } + return true; +} + #endif /* CONFIG_SMP */ +int __init arch_early_irq_init(void) +{ + ia64_mca_irq_init(); + return 0; +} + #ifdef CONFIG_HOTPLUG_CPU unsigned int vectors_in_migration[NR_IRQS]; /* - * Since cpu_online_map is already updated, we just need to check for + * Since cpu_online_mask is already updated, we just need to check for * affinity that has zeros */ static void migrate_irqs(void) { - cpumask_t mask; - irq_desc_t *desc; int irq, new_cpu; for (irq=0; irq < NR_IRQS; irq++) { - desc = irq_descp(irq); + struct irq_desc *desc = irq_to_desc(irq); + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + + if (irqd_irq_disabled(data)) + continue; /* * No handling for now. @@ -129,31 +116,31 @@ static void migrate_irqs(void) * tell CPU not to respond to these local intr sources. * such as ITV,CPEI,MCA etc. */ - if (desc->status == IRQ_PER_CPU) + if (irqd_is_per_cpu(data)) continue; - cpus_and(mask, irq_affinity[irq], cpu_online_map); - if (any_online_cpu(mask) == NR_CPUS) { + if (cpumask_any_and(data->affinity, cpu_online_mask) + >= nr_cpu_ids) { /* * Save it for phase 2 processing */ vectors_in_migration[irq] = irq; - new_cpu = any_online_cpu(cpu_online_map); - mask = cpumask_of_cpu(new_cpu); + new_cpu = cpumask_any(cpu_online_mask); /* * Al three are essential, currently WARN_ON.. maybe panic? */ - if (desc->handler && desc->handler->disable && - desc->handler->enable && desc->handler->set_affinity) { - desc->handler->disable(irq); - desc->handler->set_affinity(irq, mask); - desc->handler->enable(irq); + if (chip && chip->irq_disable && + chip->irq_enable && chip->irq_set_affinity) { + chip->irq_disable(data); + chip->irq_set_affinity(data, + cpumask_of(new_cpu), false); + chip->irq_enable(data); } else { - WARN_ON((!(desc->handler) || !(desc->handler->disable) || - !(desc->handler->enable) || - !(desc->handler->set_affinity))); + WARN_ON((!chip || !chip->irq_disable || + !chip->irq_enable || + !chip->irq_set_affinity)); } } } @@ -163,10 +150,21 @@ void fixup_irqs(void) { unsigned int irq; extern void ia64_process_pending_intr(void); + extern volatile int time_keeper_id; + + /* Mask ITV to disable timer */ + ia64_set_itv(1 << 16); - ia64_set_itv(1<<16); /* - * Phase 1: Locate irq's bound to this cpu and + * Find a new timesync master + */ + if (smp_processor_id() == time_keeper_id) { + time_keeper_id = cpumask_first(cpu_online_mask); + printk ("CPU %d is now promoted to time-keeper master\n", time_keeper_id); + } + + /* + * Phase 1: Locate IRQs bound to this cpu and * relocate them for cpu removal. */ migrate_irqs(); @@ -184,8 +182,11 @@ void fixup_irqs(void) */ for (irq=0; irq < NR_IRQS; irq++) { if (vectors_in_migration[irq]) { + struct pt_regs *old_regs = set_irq_regs(NULL); + vectors_in_migration[irq]=0; - __do_IRQ(irq, NULL); + generic_handle_irq(irq); + set_irq_regs(old_regs); } } diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 6c4d59fd036..03ea78ed64a 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -1,5 +1,5 @@ /* - * linux/arch/ia64/kernel/irq.c + * linux/arch/ia64/kernel/irq_ia64.c * * Copyright (C) 1998-2001 Hewlett-Packard Co * Stephane Eranian <eranian@hpl.hp.com> @@ -14,7 +14,6 @@ * Added CPU Hotplug handling for IPF. */ -#include <linux/config.h> #include <linux/module.h> #include <linux/jiffies.h> @@ -23,14 +22,15 @@ #include <linux/interrupt.h> #include <linux/ioport.h> #include <linux/kernel_stat.h> -#include <linux/slab.h> #include <linux/ptrace.h> -#include <linux/random.h> /* for rand_initialize_irq() */ #include <linux/signal.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/threads.h> #include <linux/bitops.h> +#include <linux/irq.h> +#include <linux/ratelimit.h> +#include <linux/acpi.h> +#include <linux/sched.h> #include <asm/delay.h> #include <asm/intrinsics.h> @@ -38,7 +38,7 @@ #include <asm/hw_irq.h> #include <asm/machvec.h> #include <asm/pgtable.h> -#include <asm/system.h> +#include <asm/tlbflush.h> #ifdef CONFIG_PERFMON # include <asm/perfmon.h> @@ -46,10 +46,22 @@ #define IRQ_DEBUG 0 +#define IRQ_VECTOR_UNASSIGNED (0) + +#define IRQ_UNUSED (0) +#define IRQ_USED (1) +#define IRQ_RSVD (2) + +/* These can be overridden in platform_irq_init */ +int ia64_first_device_vector = IA64_DEF_FIRST_DEVICE_VECTOR; +int ia64_last_device_vector = IA64_DEF_LAST_DEVICE_VECTOR; + /* default base addr of IPI table */ void __iomem *ipi_base_addr = ((void __iomem *) (__IA64_UNCACHED_OFFSET | IA64_IPI_DEFAULT_BASE_ADDR)); +static cpumask_t vector_allocation_domain(int cpu); + /* * Legacy IRQ to IA-64 vector translation table. */ @@ -60,39 +72,365 @@ __u8 isa_irq_to_vector_map[16] = { }; EXPORT_SYMBOL(isa_irq_to_vector_map); -static unsigned long ia64_vector_mask[BITS_TO_LONGS(IA64_NUM_DEVICE_VECTORS)]; +DEFINE_SPINLOCK(vector_lock); -int -assign_irq_vector (int irq) +struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { + [0 ... NR_IRQS - 1] = { + .vector = IRQ_VECTOR_UNASSIGNED, + .domain = CPU_MASK_NONE + } +}; + +DEFINE_PER_CPU(int[IA64_NUM_VECTORS], vector_irq) = { + [0 ... IA64_NUM_VECTORS - 1] = -1 +}; + +static cpumask_t vector_table[IA64_NUM_VECTORS] = { + [0 ... IA64_NUM_VECTORS - 1] = CPU_MASK_NONE +}; + +static int irq_status[NR_IRQS] = { + [0 ... NR_IRQS -1] = IRQ_UNUSED +}; + +static inline int find_unassigned_irq(void) { + int irq; + + for (irq = IA64_FIRST_DEVICE_VECTOR; irq < NR_IRQS; irq++) + if (irq_status[irq] == IRQ_UNUSED) + return irq; + return -ENOSPC; +} + +static inline int find_unassigned_vector(cpumask_t domain) +{ + cpumask_t mask; int pos, vector; - again: - pos = find_first_zero_bit(ia64_vector_mask, IA64_NUM_DEVICE_VECTORS); - vector = IA64_FIRST_DEVICE_VECTOR + pos; - if (vector > IA64_LAST_DEVICE_VECTOR) - return -ENOSPC; - if (test_and_set_bit(pos, ia64_vector_mask)) - goto again; + + cpumask_and(&mask, &domain, cpu_online_mask); + if (cpus_empty(mask)) + return -EINVAL; + + for (pos = 0; pos < IA64_NUM_DEVICE_VECTORS; pos++) { + vector = IA64_FIRST_DEVICE_VECTOR + pos; + cpus_and(mask, domain, vector_table[vector]); + if (!cpus_empty(mask)) + continue; + return vector; + } + return -ENOSPC; +} + +static int __bind_irq_vector(int irq, int vector, cpumask_t domain) +{ + cpumask_t mask; + int cpu; + struct irq_cfg *cfg = &irq_cfg[irq]; + + BUG_ON((unsigned)irq >= NR_IRQS); + BUG_ON((unsigned)vector >= IA64_NUM_VECTORS); + + cpumask_and(&mask, &domain, cpu_online_mask); + if (cpus_empty(mask)) + return -EINVAL; + if ((cfg->vector == vector) && cpus_equal(cfg->domain, domain)) + return 0; + if (cfg->vector != IRQ_VECTOR_UNASSIGNED) + return -EBUSY; + for_each_cpu_mask(cpu, mask) + per_cpu(vector_irq, cpu)[vector] = irq; + cfg->vector = vector; + cfg->domain = domain; + irq_status[irq] = IRQ_USED; + cpus_or(vector_table[vector], vector_table[vector], domain); + return 0; +} + +int bind_irq_vector(int irq, int vector, cpumask_t domain) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&vector_lock, flags); + ret = __bind_irq_vector(irq, vector, domain); + spin_unlock_irqrestore(&vector_lock, flags); + return ret; +} + +static void __clear_irq_vector(int irq) +{ + int vector, cpu; + cpumask_t mask; + cpumask_t domain; + struct irq_cfg *cfg = &irq_cfg[irq]; + + BUG_ON((unsigned)irq >= NR_IRQS); + BUG_ON(cfg->vector == IRQ_VECTOR_UNASSIGNED); + vector = cfg->vector; + domain = cfg->domain; + cpumask_and(&mask, &cfg->domain, cpu_online_mask); + for_each_cpu_mask(cpu, mask) + per_cpu(vector_irq, cpu)[vector] = -1; + cfg->vector = IRQ_VECTOR_UNASSIGNED; + cfg->domain = CPU_MASK_NONE; + irq_status[irq] = IRQ_UNUSED; + cpus_andnot(vector_table[vector], vector_table[vector], domain); +} + +static void clear_irq_vector(int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&vector_lock, flags); + __clear_irq_vector(irq); + spin_unlock_irqrestore(&vector_lock, flags); +} + +int +ia64_native_assign_irq_vector (int irq) +{ + unsigned long flags; + int vector, cpu; + cpumask_t domain = CPU_MASK_NONE; + + vector = -ENOSPC; + + spin_lock_irqsave(&vector_lock, flags); + for_each_online_cpu(cpu) { + domain = vector_allocation_domain(cpu); + vector = find_unassigned_vector(domain); + if (vector >= 0) + break; + } + if (vector < 0) + goto out; + if (irq == AUTO_ASSIGN) + irq = vector; + BUG_ON(__bind_irq_vector(irq, vector, domain)); + out: + spin_unlock_irqrestore(&vector_lock, flags); return vector; } void -free_irq_vector (int vector) +ia64_native_free_irq_vector (int vector) +{ + if (vector < IA64_FIRST_DEVICE_VECTOR || + vector > IA64_LAST_DEVICE_VECTOR) + return; + clear_irq_vector(vector); +} + +int +reserve_irq_vector (int vector) +{ + if (vector < IA64_FIRST_DEVICE_VECTOR || + vector > IA64_LAST_DEVICE_VECTOR) + return -EINVAL; + return !!bind_irq_vector(vector, vector, CPU_MASK_ALL); +} + +/* + * Initialize vector_irq on a new cpu. This function must be called + * with vector_lock held. + */ +void __setup_vector_irq(int cpu) +{ + int irq, vector; + + /* Clear vector_irq */ + for (vector = 0; vector < IA64_NUM_VECTORS; ++vector) + per_cpu(vector_irq, cpu)[vector] = -1; + /* Mark the inuse vectors */ + for (irq = 0; irq < NR_IRQS; ++irq) { + if (!cpu_isset(cpu, irq_cfg[irq].domain)) + continue; + vector = irq_to_vector(irq); + per_cpu(vector_irq, cpu)[vector] = irq; + } +} + +#if defined(CONFIG_SMP) && (defined(CONFIG_IA64_GENERIC) || defined(CONFIG_IA64_DIG)) + +static enum vector_domain_type { + VECTOR_DOMAIN_NONE, + VECTOR_DOMAIN_PERCPU +} vector_domain_type = VECTOR_DOMAIN_NONE; + +static cpumask_t vector_allocation_domain(int cpu) { - int pos; + if (vector_domain_type == VECTOR_DOMAIN_PERCPU) + return cpumask_of_cpu(cpu); + return CPU_MASK_ALL; +} + +static int __irq_prepare_move(int irq, int cpu) +{ + struct irq_cfg *cfg = &irq_cfg[irq]; + int vector; + cpumask_t domain; + + if (cfg->move_in_progress || cfg->move_cleanup_count) + return -EBUSY; + if (cfg->vector == IRQ_VECTOR_UNASSIGNED || !cpu_online(cpu)) + return -EINVAL; + if (cpu_isset(cpu, cfg->domain)) + return 0; + domain = vector_allocation_domain(cpu); + vector = find_unassigned_vector(domain); + if (vector < 0) + return -ENOSPC; + cfg->move_in_progress = 1; + cfg->old_domain = cfg->domain; + cfg->vector = IRQ_VECTOR_UNASSIGNED; + cfg->domain = CPU_MASK_NONE; + BUG_ON(__bind_irq_vector(irq, vector, domain)); + return 0; +} + +int irq_prepare_move(int irq, int cpu) +{ + unsigned long flags; + int ret; - if (vector < IA64_FIRST_DEVICE_VECTOR || vector > IA64_LAST_DEVICE_VECTOR) + spin_lock_irqsave(&vector_lock, flags); + ret = __irq_prepare_move(irq, cpu); + spin_unlock_irqrestore(&vector_lock, flags); + return ret; +} + +void irq_complete_move(unsigned irq) +{ + struct irq_cfg *cfg = &irq_cfg[irq]; + cpumask_t cleanup_mask; + int i; + + if (likely(!cfg->move_in_progress)) + return; + + if (unlikely(cpu_isset(smp_processor_id(), cfg->old_domain))) return; - pos = vector - IA64_FIRST_DEVICE_VECTOR; - if (!test_and_clear_bit(pos, ia64_vector_mask)) - printk(KERN_WARNING "%s: double free!\n", __FUNCTION__); + cpumask_and(&cleanup_mask, &cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + for_each_cpu_mask(i, cleanup_mask) + platform_send_ipi(i, IA64_IRQ_MOVE_VECTOR, IA64_IPI_DM_INT, 0); + cfg->move_in_progress = 0; +} + +static irqreturn_t smp_irq_move_cleanup_interrupt(int irq, void *dev_id) +{ + int me = smp_processor_id(); + ia64_vector vector; + unsigned long flags; + + for (vector = IA64_FIRST_DEVICE_VECTOR; + vector < IA64_LAST_DEVICE_VECTOR; vector++) { + int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; + irq = __get_cpu_var(vector_irq)[vector]; + if (irq < 0) + continue; + + desc = irq_to_desc(irq); + cfg = irq_cfg + irq; + raw_spin_lock(&desc->lock); + if (!cfg->move_cleanup_count) + goto unlock; + + if (!cpu_isset(me, cfg->old_domain)) + goto unlock; + + spin_lock_irqsave(&vector_lock, flags); + __get_cpu_var(vector_irq)[vector] = -1; + cpu_clear(me, vector_table[vector]); + spin_unlock_irqrestore(&vector_lock, flags); + cfg->move_cleanup_count--; + unlock: + raw_spin_unlock(&desc->lock); + } + return IRQ_HANDLED; +} + +static struct irqaction irq_move_irqaction = { + .handler = smp_irq_move_cleanup_interrupt, + .name = "irq_move" +}; + +static int __init parse_vector_domain(char *arg) +{ + if (!arg) + return -EINVAL; + if (!strcmp(arg, "percpu")) { + vector_domain_type = VECTOR_DOMAIN_PERCPU; + no_int_routing = 1; + } + return 0; +} +early_param("vector", parse_vector_domain); +#else +static cpumask_t vector_allocation_domain(int cpu) +{ + return CPU_MASK_ALL; +} +#endif + + +void destroy_and_reserve_irq(unsigned int irq) +{ + unsigned long flags; + + irq_init_desc(irq); + spin_lock_irqsave(&vector_lock, flags); + __clear_irq_vector(irq); + irq_status[irq] = IRQ_RSVD; + spin_unlock_irqrestore(&vector_lock, flags); +} + +/* + * Dynamic irq allocate and deallocation for MSI + */ +int create_irq(void) +{ + unsigned long flags; + int irq, vector, cpu; + cpumask_t domain = CPU_MASK_NONE; + + irq = vector = -ENOSPC; + spin_lock_irqsave(&vector_lock, flags); + for_each_online_cpu(cpu) { + domain = vector_allocation_domain(cpu); + vector = find_unassigned_vector(domain); + if (vector >= 0) + break; + } + if (vector < 0) + goto out; + irq = find_unassigned_irq(); + if (irq < 0) + goto out; + BUG_ON(__bind_irq_vector(irq, vector, domain)); + out: + spin_unlock_irqrestore(&vector_lock, flags); + if (irq >= 0) + irq_init_desc(irq); + return irq; +} + +void destroy_irq(unsigned int irq) +{ + irq_init_desc(irq); + clear_irq_vector(irq); } #ifdef CONFIG_SMP # define IS_RESCHEDULE(vec) (vec == IA64_IPI_RESCHEDULE) +# define IS_LOCAL_TLB_FLUSH(vec) (vec == IA64_IPI_LOCAL_TLB_FLUSH) #else # define IS_RESCHEDULE(vec) (0) +# define IS_LOCAL_TLB_FLUSH(vec) (0) #endif /* * That's where the IVT branches when we get an external @@ -102,6 +440,7 @@ free_irq_vector (int vector) void ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) { + struct pt_regs *old_regs = set_irq_regs(regs); unsigned long saved_tpr; #if IRQ_DEBUG @@ -119,13 +458,9 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) sp = ia64_getreg(_IA64_REG_SP); if ((sp - bsp) < 1024) { - static unsigned char count; - static long last_time; + static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5); - if (jiffies - last_time > 5*HZ) - count = 0; - if (++count < 5) { - last_time = jiffies; + if (__ratelimit(&ratelimit)) { printk("ia64_handle_irq: DANGER: less than " "1KB of free stack space!!\n" "(bsp=0x%lx, sp=%lx)\n", bsp, sp); @@ -143,11 +478,25 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); ia64_srlz_d(); while (vector != IA64_SPURIOUS_INT_VECTOR) { - if (!IS_RESCHEDULE(vector)) { + int irq = local_vector_to_irq(vector); + + if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) { + smp_local_flush_tlb(); + kstat_incr_irq_this_cpu(irq); + } else if (unlikely(IS_RESCHEDULE(vector))) { + scheduler_ipi(); + kstat_incr_irq_this_cpu(irq); + } else { ia64_setreg(_IA64_REG_CR_TPR, vector); ia64_srlz_d(); - __do_IRQ(local_vector_to_irq(vector), regs); + if (unlikely(irq < 0)) { + printk(KERN_ERR "%s: Unexpected interrupt " + "vector %d on CPU %d is not mapped " + "to any IRQ!\n", __func__, vector, + smp_processor_id()); + } else + generic_handle_irq(irq); /* * Disable interrupts and send EOI: @@ -164,6 +513,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) * come through until ia64_eoi() has been done. */ irq_exit(); + set_irq_regs(old_regs); } #ifdef CONFIG_HOTPLUG_CPU @@ -179,15 +529,24 @@ void ia64_process_pending_intr(void) vector = ia64_get_ivr(); - irq_enter(); - saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); - ia64_srlz_d(); + irq_enter(); + saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); + ia64_srlz_d(); /* * Perform normal interrupt style processing */ while (vector != IA64_SPURIOUS_INT_VECTOR) { - if (!IS_RESCHEDULE(vector)) { + int irq = local_vector_to_irq(vector); + + if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) { + smp_local_flush_tlb(); + kstat_incr_irq_this_cpu(irq); + } else if (unlikely(IS_RESCHEDULE(vector))) { + kstat_incr_irq_this_cpu(irq); + } else { + struct pt_regs *old_regs = set_irq_regs(NULL); + ia64_setreg(_IA64_REG_CR_TPR, vector); ia64_srlz_d(); @@ -197,8 +556,16 @@ void ia64_process_pending_intr(void) * it will work. I hope it works!. * Probably could shared code. */ - vectors_in_migration[local_vector_to_irq(vector)]=0; - __do_IRQ(local_vector_to_irq(vector), NULL); + if (unlikely(irq < 0)) { + printk(KERN_ERR "%s: Unexpected interrupt " + "vector %d on CPU %d not being mapped " + "to any IRQ!!\n", __func__, vector, + smp_processor_id()); + } else { + vectors_in_migration[irq]=0; + generic_handle_irq(irq); + } + set_irq_regs(old_regs); /* * Disable interrupts and send EOI @@ -215,37 +582,69 @@ void ia64_process_pending_intr(void) #ifdef CONFIG_SMP -extern irqreturn_t handle_IPI (int irq, void *dev_id, struct pt_regs *regs); + +static irqreturn_t dummy_handler (int irq, void *dev_id) +{ + BUG(); +} static struct irqaction ipi_irqaction = { .handler = handle_IPI, - .flags = SA_INTERRUPT, .name = "IPI" }; + +/* + * KVM uses this interrupt to force a cpu out of guest mode + */ +static struct irqaction resched_irqaction = { + .handler = dummy_handler, + .name = "resched" +}; + +static struct irqaction tlb_irqaction = { + .handler = dummy_handler, + .name = "tlb_flush" +}; + #endif void -register_percpu_irq (ia64_vector vec, struct irqaction *action) +ia64_native_register_percpu_irq (ia64_vector vec, struct irqaction *action) { - irq_desc_t *desc; unsigned int irq; - for (irq = 0; irq < NR_IRQS; ++irq) - if (irq_to_vector(irq) == vec) { - desc = irq_descp(irq); - desc->status |= IRQ_PER_CPU; - desc->handler = &irq_type_ia64_lsapic; - if (action) - setup_irq(irq, action); - } + irq = vec; + BUG_ON(bind_irq_vector(irq, vec, CPU_MASK_ALL)); + irq_set_status_flags(irq, IRQ_PER_CPU); + irq_set_chip(irq, &irq_type_ia64_lsapic); + if (action) + setup_irq(irq, action); + irq_set_handler(irq, handle_percpu_irq); +} + +void __init +ia64_native_register_ipi(void) +{ +#ifdef CONFIG_SMP + register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction); + register_percpu_irq(IA64_IPI_RESCHEDULE, &resched_irqaction); + register_percpu_irq(IA64_IPI_LOCAL_TLB_FLUSH, &tlb_irqaction); +#endif } void __init init_IRQ (void) { +#ifdef CONFIG_ACPI + acpi_boot_init(); +#endif + ia64_register_ipi(); register_percpu_irq(IA64_SPURIOUS_INT_VECTOR, NULL); #ifdef CONFIG_SMP - register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction); +#if defined(CONFIG_IA64_GENERIC) || defined(CONFIG_IA64_DIG) + if (vector_domain_type != VECTOR_DOMAIN_NONE) + register_percpu_irq(IA64_IRQ_MOVE_VECTOR, &irq_move_irqaction); +#endif #endif #ifdef CONFIG_PERFMON pfm_init_percpu(); @@ -260,11 +659,7 @@ ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect) unsigned long ipi_data; unsigned long phys_cpu_id; -#ifdef CONFIG_SMP phys_cpu_id = cpu_physical_id(cpu); -#else - phys_cpu_id = (ia64_getreg(_IA64_REG_CR_LID) >> 16) & 0xffff; -#endif /* * cpu number is in 8bit ID and 8bit EID diff --git a/arch/ia64/kernel/irq_lsapic.c b/arch/ia64/kernel/irq_lsapic.c index ea14e6a0440..1b3a776e516 100644 --- a/arch/ia64/kernel/irq_lsapic.c +++ b/arch/ia64/kernel/irq_lsapic.c @@ -15,23 +15,30 @@ #include <linux/irq.h> static unsigned int -lsapic_noop_startup (unsigned int irq) +lsapic_noop_startup (struct irq_data *data) { return 0; } static void -lsapic_noop (unsigned int irq) +lsapic_noop (struct irq_data *data) { - /* nuthing to do... */ + /* nothing to do... */ } -struct hw_interrupt_type irq_type_ia64_lsapic = { - .typename = "LSAPIC", - .startup = lsapic_noop_startup, - .shutdown = lsapic_noop, - .enable = lsapic_noop, - .disable = lsapic_noop, - .ack = lsapic_noop, - .end = lsapic_noop +static int lsapic_retrigger(struct irq_data *data) +{ + ia64_resend_irq(data->irq); + + return 1; +} + +struct irq_chip irq_type_ia64_lsapic = { + .name = "LSAPIC", + .irq_startup = lsapic_noop_startup, + .irq_shutdown = lsapic_noop, + .irq_enable = lsapic_noop, + .irq_disable = lsapic_noop, + .irq_ack = lsapic_noop, + .irq_retrigger = lsapic_retrigger, }; diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S index c13ca0d49c4..18e794a5724 100644 --- a/arch/ia64/kernel/ivt.S +++ b/arch/ia64/kernel/ivt.S @@ -12,6 +12,14 @@ * * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> TLB handling for SMP * 00/12/20 David Mosberger-Tang <davidm@hpl.hp.com> DTLB/ITLB handler now uses virtual PT. + * + * Copyright (C) 2005 Hewlett-Packard Co + * Dan Magenheimer <dan.magenheimer@hp.com> + * Xen paravirtualization + * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * pv_ops. + * Yaozu (Eddie) Dong <eddie.dong@intel.com> */ /* * This file defines the interruption vector table used by the CPU. @@ -38,22 +46,19 @@ * Table is based upon EAS2.6 (Oct 1999) */ -#include <linux/config.h> #include <asm/asmmacro.h> #include <asm/break.h> -#include <asm/ia32.h> #include <asm/kregs.h> #include <asm/asm-offsets.h> #include <asm/pgtable.h> #include <asm/processor.h> #include <asm/ptrace.h> -#include <asm/system.h> #include <asm/thread_info.h> #include <asm/unistd.h> #include <asm/errno.h> -#if 1 +#if 0 # define PSR_DEFAULT_BITS psr.ac #else # define PSR_DEFAULT_BITS 0 @@ -76,7 +81,7 @@ mov r19=n;; /* prepare to save predicates */ \ br.sptk.many dispatch_to_fault_handler - .section .text.ivt,"ax" + .section .text..ivt,"ax" .align 32768 // align on 32KB boundary .global ia64_ivt @@ -91,30 +96,31 @@ ENTRY(vhpt_miss) * (the "original") TLB miss, which may either be caused by an instruction * fetch or a data access (or non-access). * - * What we do here is normal TLB miss handing for the _original_ miss, followed - * by inserting the TLB entry for the virtual page table page that the VHPT - * walker was attempting to access. The latter gets inserted as long - * as both L1 and L2 have valid mappings for the faulting address. - * The TLB entry for the original miss gets inserted only if - * the L3 entry indicates that the page is present. + * What we do here is normal TLB miss handing for the _original_ miss, + * followed by inserting the TLB entry for the virtual page table page + * that the VHPT walker was attempting to access. The latter gets + * inserted as long as page table entry above pte level have valid + * mappings for the faulting address. The TLB entry for the original + * miss gets inserted only if the pte entry indicates that the page is + * present. * * do_page_fault gets invoked in the following cases: * - the faulting virtual address uses unimplemented address bits - * - the faulting virtual address has no L1, L2, or L3 mapping + * - the faulting virtual address has no valid page table mapping */ - mov r16=cr.ifa // get address that caused the TLB miss + MOV_FROM_IFA(r16) // get address that caused the TLB miss #ifdef CONFIG_HUGETLB_PAGE movl r18=PAGE_SHIFT - mov r25=cr.itir + MOV_FROM_ITIR(r25) #endif ;; - rsm psr.dt // use physical addressing for data + RSM_PSR_DT // use physical addressing for data mov r31=pr // save the predicate registers mov r19=IA64_KR(PT_BASE) // get page table base address shl r21=r16,3 // shift bit 60 into sign bit shr.u r17=r16,61 // get the region number into r17 ;; - shr r22=r21,3 + shr.u r22=r21,3 #ifdef CONFIG_HUGETLB_PAGE extr.u r26=r25,2,6 ;; @@ -126,7 +132,7 @@ ENTRY(vhpt_miss) #endif ;; cmp.eq p6,p7=5,r17 // is IFA pointing into to region 5? - shr.u r18=r22,PGDIR_SHIFT // get bits 33-63 of the faulting address + shr.u r18=r22,PGDIR_SHIFT // get bottom portion of pgd index bit ;; (p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place @@ -137,38 +143,52 @@ ENTRY(vhpt_miss) (p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT (p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3 ;; -(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 -(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) +(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5 +(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4] cmp.eq p7,p6=0,r21 // unused address bits all zeroes? - shr.u r18=r22,PMD_SHIFT // shift L2 index into position +#ifdef CONFIG_PGTABLE_4 + shr.u r28=r22,PUD_SHIFT // shift pud index into position +#else + shr.u r18=r22,PMD_SHIFT // shift pmd index into position +#endif ;; - ld8 r17=[r17] // fetch the L1 entry (may be 0) + ld8 r17=[r17] // get *pgd (may be 0) ;; -(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? - dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry +(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL? +#ifdef CONFIG_PGTABLE_4 + dep r28=r28,r17,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr) ;; -(p7) ld8 r20=[r17] // fetch the L2 entry (may be 0) - shr.u r19=r22,PAGE_SHIFT // shift L3 index into position + shr.u r18=r22,PMD_SHIFT // shift pmd index into position +(p7) ld8 r29=[r28] // get *pud (may be 0) ;; -(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was L2 entry NULL? - dep r21=r19,r20,3,(PAGE_SHIFT-3) // compute address of L3 page table entry +(p7) cmp.eq.or.andcm p6,p7=r29,r0 // was pud_present(*pud) == NULL? + dep r17=r18,r29,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr) +#else + dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pgd,addr) +#endif ;; -(p7) ld8 r18=[r21] // read the L3 PTE - mov r19=cr.isr // cr.isr bit 0 tells us if this is an insn miss +(p7) ld8 r20=[r17] // get *pmd (may be 0) + shr.u r19=r22,PAGE_SHIFT // shift pte index into position + ;; +(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was pmd_present(*pmd) == NULL? + dep r21=r19,r20,3,(PAGE_SHIFT-3) // r21=pte_offset(pmd,addr) + ;; +(p7) ld8 r18=[r21] // read *pte + MOV_FROM_ISR(r19) // cr.isr bit 32 tells us if this is an insn miss ;; (p7) tbit.z p6,p7=r18,_PAGE_P_BIT // page present bit cleared? - mov r22=cr.iha // get the VHPT address that caused the TLB miss + MOV_FROM_IHA(r22) // get the VHPT address that caused the TLB miss ;; // avoid RAW on p7 (p7) tbit.nz.unc p10,p11=r19,32 // is it an instruction TLB miss? dep r23=0,r20,0,PAGE_SHIFT // clear low bits to get page address ;; -(p10) itc.i r18 // insert the instruction TLB entry -(p11) itc.d r18 // insert the data TLB entry + ITC_I_AND_D(p10, p11, r18, r24) // insert the instruction TLB entry and + // insert the data TLB entry (p6) br.cond.spnt.many page_fault // handle bad address/page not present (page fault) - mov cr.ifa=r22 + MOV_TO_IFA(r22, r24) #ifdef CONFIG_HUGETLB_PAGE -(p8) mov cr.itir=r25 // change to default page-size for VHPT + MOV_TO_ITIR(p8, r25, r24) // change to default page-size for VHPT #endif /* @@ -178,7 +198,7 @@ ENTRY(vhpt_miss) */ adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23 ;; -(p7) itc.d r24 + ITC_D(p7, r24, r25) ;; #ifdef CONFIG_SMP /* @@ -188,24 +208,39 @@ ENTRY(vhpt_miss) dv_serialize_data /* - * Re-check L2 and L3 pagetable. If they changed, we may have received a ptc.g + * Re-check pagetable entry. If they changed, we may have received a ptc.g * between reading the pagetable and the "itc". If so, flush the entry we - * inserted and retry. + * inserted and retry. At this point, we have: + * + * r28 = equivalent of pud_offset(pgd, ifa) + * r17 = equivalent of pmd_offset(pud, ifa) + * r21 = equivalent of pte_offset(pmd, ifa) + * + * r29 = *pud + * r20 = *pmd + * r18 = *pte */ - ld8 r25=[r21] // read L3 PTE again - ld8 r26=[r17] // read L2 entry again + ld8 r25=[r21] // read *pte again + ld8 r26=[r17] // read *pmd again +#ifdef CONFIG_PGTABLE_4 + ld8 r19=[r28] // read *pud again +#endif + cmp.ne p6,p7=r0,r0 ;; - cmp.ne p6,p7=r26,r20 // did L2 entry change + cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change +#ifdef CONFIG_PGTABLE_4 + cmp.ne.or.andcm p6,p7=r19,r29 // did *pud change +#endif mov r27=PAGE_SHIFT<<2 ;; (p6) ptc.l r22,r27 // purge PTE page translation -(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did L3 PTE change +(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did *pte change ;; (p6) ptc.l r16,r27 // purge translation #endif mov pr=r31,-1 // restore predicate registers - rfi + RFI END(vhpt_miss) .org ia64_ivt+0x400 @@ -214,25 +249,25 @@ END(vhpt_miss) ENTRY(itlb_miss) DBG_FAULT(1) /* - * The ITLB handler accesses the L3 PTE via the virtually mapped linear + * The ITLB handler accesses the PTE via the virtually mapped linear * page table. If a nested TLB miss occurs, we switch into physical - * mode, walk the page table, and then re-execute the L3 PTE read - * and go on normally after that. + * mode, walk the page table, and then re-execute the PTE read and + * go on normally after that. */ - mov r16=cr.ifa // get virtual address + MOV_FROM_IFA(r16) // get virtual address mov r29=b0 // save b0 mov r31=pr // save predicates .itlb_fault: - mov r17=cr.iha // get virtual address of L3 PTE + MOV_FROM_IHA(r17) // get virtual address of PTE movl r30=1f // load nested fault continuation point ;; -1: ld8 r18=[r17] // read L3 PTE +1: ld8 r18=[r17] // read *pte ;; mov b0=r29 tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? (p6) br.cond.spnt page_fault ;; - itc.i r18 + ITC_I(p0, r18, r19) ;; #ifdef CONFIG_SMP /* @@ -241,7 +276,7 @@ ENTRY(itlb_miss) */ dv_serialize_data - ld8 r19=[r17] // read L3 PTE again and see if same + ld8 r19=[r17] // read *pte again and see if same mov r20=PAGE_SHIFT<<2 // setup page size for purge ;; cmp.ne p7,p0=r18,r19 @@ -249,7 +284,7 @@ ENTRY(itlb_miss) (p7) ptc.l r16,r20 #endif mov pr=r31,-1 - rfi + RFI END(itlb_miss) .org ia64_ivt+0x0800 @@ -258,25 +293,25 @@ END(itlb_miss) ENTRY(dtlb_miss) DBG_FAULT(2) /* - * The DTLB handler accesses the L3 PTE via the virtually mapped linear + * The DTLB handler accesses the PTE via the virtually mapped linear * page table. If a nested TLB miss occurs, we switch into physical - * mode, walk the page table, and then re-execute the L3 PTE read - * and go on normally after that. + * mode, walk the page table, and then re-execute the PTE read and + * go on normally after that. */ - mov r16=cr.ifa // get virtual address + MOV_FROM_IFA(r16) // get virtual address mov r29=b0 // save b0 mov r31=pr // save predicates dtlb_fault: - mov r17=cr.iha // get virtual address of L3 PTE + MOV_FROM_IHA(r17) // get virtual address of PTE movl r30=1f // load nested fault continuation point ;; -1: ld8 r18=[r17] // read L3 PTE +1: ld8 r18=[r17] // read *pte ;; mov b0=r29 tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? (p6) br.cond.spnt page_fault ;; - itc.d r18 + ITC_D(p0, r18, r19) ;; #ifdef CONFIG_SMP /* @@ -285,7 +320,7 @@ dtlb_fault: */ dv_serialize_data - ld8 r19=[r17] // read L3 PTE again and see if same + ld8 r19=[r17] // read *pte again and see if same mov r20=PAGE_SHIFT<<2 // setup page size for purge ;; cmp.ne p7,p0=r18,r19 @@ -293,7 +328,7 @@ dtlb_fault: (p7) ptc.l r16,r20 #endif mov pr=r31,-1 - rfi + RFI END(dtlb_miss) .org ia64_ivt+0x0c00 @@ -301,9 +336,9 @@ END(dtlb_miss) // 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19) ENTRY(alt_itlb_miss) DBG_FAULT(3) - mov r16=cr.ifa // get address that caused the TLB miss + MOV_FROM_IFA(r16) // get address that caused the TLB miss movl r17=PAGE_KERNEL - mov r21=cr.ipsr + MOV_FROM_IPSR(p0, r21) movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff) mov r31=pr ;; @@ -312,9 +347,9 @@ ENTRY(alt_itlb_miss) ;; cmp.gt p8,p0=6,r22 // user mode ;; -(p8) thash r17=r16 + THASH(p8, r17, r16, r23) ;; -(p8) mov cr.iha=r17 + MOV_TO_IHA(p8, r17, r23) (p8) mov r29=b0 // save b0 (p8) br.cond.dptk .itlb_fault #endif @@ -329,9 +364,9 @@ ENTRY(alt_itlb_miss) or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6 (p8) br.cond.spnt page_fault ;; - itc.i r19 // insert the TLB entry + ITC_I(p0, r19, r18) // insert the TLB entry mov pr=r31,-1 - rfi + RFI END(alt_itlb_miss) .org ia64_ivt+0x1000 @@ -339,45 +374,54 @@ END(alt_itlb_miss) // 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46) ENTRY(alt_dtlb_miss) DBG_FAULT(4) - mov r16=cr.ifa // get address that caused the TLB miss + MOV_FROM_IFA(r16) // get address that caused the TLB miss movl r17=PAGE_KERNEL - mov r20=cr.isr + MOV_FROM_ISR(r20) movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff) - mov r21=cr.ipsr + MOV_FROM_IPSR(p0, r21) mov r31=pr + mov r24=PERCPU_ADDR ;; #ifdef CONFIG_DISABLE_VHPT shr.u r22=r16,61 // get the region number into r21 ;; cmp.gt p8,p0=6,r22 // access to region 0-5 ;; -(p8) thash r17=r16 + THASH(p8, r17, r16, r25) ;; -(p8) mov cr.iha=r17 + MOV_TO_IHA(p8, r17, r25) (p8) mov r29=b0 // save b0 (p8) br.cond.dptk dtlb_fault #endif + cmp.ge p10,p11=r16,r24 // access to per_cpu_data? + tbit.z p12,p0=r16,61 // access to region 6? + mov r25=PERCPU_PAGE_SHIFT << 2 + mov r26=PERCPU_PAGE_SIZE + nop.m 0 + nop.b 0 + ;; +(p10) mov r19=IA64_KR(PER_CPU_DATA) +(p11) and r19=r19,r16 // clear non-ppn fields extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on? - shr.u r18=r16,57 // move address bit 61 to bit 4 - and r19=r19,r16 // clear ed, reserved bits, and PTE control bits tbit.nz p9,p0=r20,IA64_ISR_NA_BIT // is non-access bit on? ;; - andcm r18=0x10,r18 // bit 4=~address-bit(61) +(p10) sub r19=r19,r26 + MOV_TO_ITIR(p10, r25, r24) cmp.ne p8,p0=r0,r23 (p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field +(p12) dep r17=-1,r17,4,1 // set ma=UC for region 6 addr (p8) br.cond.spnt page_fault dep r21=-1,r21,IA64_PSR_ED_BIT,1 - or r19=r19,r17 // insert PTE control bits into r19 ;; - or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6 -(p6) mov cr.ipsr=r21 + or r19=r19,r17 // insert PTE control bits into r19 + MOV_TO_IPSR(p6, r21, r24) ;; -(p7) itc.d r19 // insert the TLB entry + ITC_D(p7, r19, r18) // insert the TLB entry mov pr=r31,-1 - rfi + RFI END(alt_dtlb_miss) .org ia64_ivt+0x1400 @@ -399,17 +443,17 @@ ENTRY(nested_dtlb_miss) * r30: continuation address * r31: saved pr * - * Output: r17: physical address of L3 PTE of faulting address + * Output: r17: physical address of PTE of faulting address * r29: saved b0 * r30: continuation address * r31: saved pr * * Clobbered: b0, r18, r19, r21, r22, psr.dt (cleared) */ - rsm psr.dt // switch to using physical data addressing + RSM_PSR_DT // switch to using physical data addressing mov r19=IA64_KR(PT_BASE) // get the page table base address shl r21=r16,3 // shift bit 60 into sign bit - mov r18=cr.itir + MOV_FROM_ITIR(r18) ;; shr.u r17=r16,61 // get the region number into r17 extr.u r18=r18,2,6 // get the faulting page size @@ -429,21 +473,33 @@ ENTRY(nested_dtlb_miss) (p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT (p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3 ;; -(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 -(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) +(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5 +(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4] cmp.eq p7,p6=0,r21 // unused address bits all zeroes? - shr.u r18=r22,PMD_SHIFT // shift L2 index into position +#ifdef CONFIG_PGTABLE_4 + shr.u r18=r22,PUD_SHIFT // shift pud index into position +#else + shr.u r18=r22,PMD_SHIFT // shift pmd index into position +#endif + ;; + ld8 r17=[r17] // get *pgd (may be 0) ;; - ld8 r17=[r17] // fetch the L1 entry (may be 0) +(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL? + dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=p[u|m]d_offset(pgd,addr) ;; -(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? - dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry +#ifdef CONFIG_PGTABLE_4 +(p7) ld8 r17=[r17] // get *pud (may be 0) + shr.u r18=r22,PMD_SHIFT // shift pmd index into position ;; -(p7) ld8 r17=[r17] // fetch the L2 entry (may be 0) - shr.u r19=r22,PAGE_SHIFT // shift L3 index into position +(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pud_present(*pud) == NULL? + dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr) ;; -(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L2 entry NULL? - dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry +#endif +(p7) ld8 r17=[r17] // get *pmd (may be 0) + shr.u r19=r22,PAGE_SHIFT // shift pte index into position + ;; +(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pmd_present(*pmd) == NULL? + dep r17=r19,r17,3,(PAGE_SHIFT-3) // r17=pte_offset(pmd,addr); (p6) br.cond.spnt page_fault mov b0=r30 br.sptk.many b0 // return to continuation point @@ -457,33 +513,6 @@ ENTRY(ikey_miss) FAULT(6) END(ikey_miss) - //----------------------------------------------------------------------------------- - // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address) -ENTRY(page_fault) - ssm psr.dt - ;; - srlz.i - ;; - SAVE_MIN_WITH_COVER - alloc r15=ar.pfs,0,0,3,0 - mov out0=cr.ifa - mov out1=cr.isr - adds r3=8,r2 // set up second base pointer - ;; - ssm psr.ic | PSR_DEFAULT_BITS - ;; - srlz.i // guarantee that interruption collectin is on - ;; -(p15) ssm psr.i // restore psr.i - movl r14=ia64_leave_kernel - ;; - SAVE_REST - mov rp=r14 - ;; - adds out2=16,r12 // out2 = pointer to pt_regs - br.call.sptk.many b6=ia64_do_page_fault // ignore return address -END(page_fault) - .org ia64_ivt+0x1c00 ///////////////////////////////////////////////////////////////////////////////////////// // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51) @@ -506,10 +535,10 @@ ENTRY(dirty_bit) * page table TLB entry isn't present, we take a nested TLB miss hit where we look * up the physical address of the L3 PTE and then continue at label 1 below. */ - mov r16=cr.ifa // get the address that caused the fault + MOV_FROM_IFA(r16) // get the address that caused the fault movl r30=1f // load continuation point in case of nested fault ;; - thash r17=r16 // compute virtual address of L3 PTE + THASH(p0, r17, r16, r18) // compute virtual address of L3 PTE mov r29=b0 // save b0 in case of nested fault mov r31=pr // save pr #ifdef CONFIG_SMP @@ -519,13 +548,14 @@ ENTRY(dirty_bit) ;; // avoid RAW on r18 mov ar.ccv=r18 // set compare value for cmpxchg or r25=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits + tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit ;; - cmpxchg8.acq r26=[r17],r25,ar.ccv +(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only update if page is present mov r24=PAGE_SHIFT<<2 ;; - cmp.eq p6,p7=r26,r18 +(p6) cmp.eq p6,p7=r26,r18 // Only compare if page is present ;; -(p6) itc.d r25 // install updated PTE + ITC_D(p6, r25, r18) // install updated PTE ;; /* * Tell the assemblers dependency-violation checker that the above "itc" instructions @@ -548,10 +578,10 @@ ENTRY(dirty_bit) mov b0=r29 // restore b0 ;; st8 [r17]=r18 // store back updated PTE - itc.d r18 // install updated PTE + ITC_D(p0, r18, r16) // install updated PTE #endif mov pr=r31,-1 // restore pr - rfi + RFI END(dirty_bit) .org ia64_ivt+0x2400 @@ -560,22 +590,22 @@ END(dirty_bit) ENTRY(iaccess_bit) DBG_FAULT(9) // Like Entry 8, except for instruction access - mov r16=cr.ifa // get the address that caused the fault + MOV_FROM_IFA(r16) // get the address that caused the fault movl r30=1f // load continuation point in case of nested fault mov r31=pr // save predicates #ifdef CONFIG_ITANIUM /* * Erratum 10 (IFA may contain incorrect address) has "NoFix" status. */ - mov r17=cr.ipsr + MOV_FROM_IPSR(p0, r17) ;; - mov r18=cr.iip + MOV_FROM_IIP(r18) tbit.z p6,p0=r17,IA64_PSR_IS_BIT // IA64 instruction set? ;; (p6) mov r16=r18 // if so, use cr.iip instead of cr.ifa #endif /* CONFIG_ITANIUM */ ;; - thash r17=r16 // compute virtual address of L3 PTE + THASH(p0, r17, r16, r18) // compute virtual address of L3 PTE mov r29=b0 // save b0 in case of nested fault) #ifdef CONFIG_SMP mov r28=ar.ccv // save ar.ccv @@ -584,13 +614,14 @@ ENTRY(iaccess_bit) ;; mov ar.ccv=r18 // set compare value for cmpxchg or r25=_PAGE_A,r18 // set the accessed bit + tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit ;; - cmpxchg8.acq r26=[r17],r25,ar.ccv +(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only if page present mov r24=PAGE_SHIFT<<2 ;; - cmp.eq p6,p7=r26,r18 +(p6) cmp.eq p6,p7=r26,r18 // Only if page present ;; -(p6) itc.i r25 // install updated PTE + ITC_I(p6, r25, r26) // install updated PTE ;; /* * Tell the assemblers dependency-violation checker that the above "itc" instructions @@ -613,10 +644,10 @@ ENTRY(iaccess_bit) mov b0=r29 // restore b0 ;; st8 [r17]=r18 // store back updated PTE - itc.i r18 // install updated PTE + ITC_I(p0, r18, r16) // install updated PTE #endif /* !CONFIG_SMP */ mov pr=r31,-1 - rfi + RFI END(iaccess_bit) .org ia64_ivt+0x2800 @@ -625,10 +656,10 @@ END(iaccess_bit) ENTRY(daccess_bit) DBG_FAULT(10) // Like Entry 8, except for data access - mov r16=cr.ifa // get the address that caused the fault + MOV_FROM_IFA(r16) // get the address that caused the fault movl r30=1f // load continuation point in case of nested fault ;; - thash r17=r16 // compute virtual address of L3 PTE + THASH(p0, r17, r16, r18) // compute virtual address of L3 PTE mov r31=pr mov r29=b0 // save b0 in case of nested fault) #ifdef CONFIG_SMP @@ -638,13 +669,14 @@ ENTRY(daccess_bit) ;; // avoid RAW on r18 mov ar.ccv=r18 // set compare value for cmpxchg or r25=_PAGE_A,r18 // set the dirty bit + tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit ;; - cmpxchg8.acq r26=[r17],r25,ar.ccv +(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only if page is present mov r24=PAGE_SHIFT<<2 ;; - cmp.eq p6,p7=r26,r18 +(p6) cmp.eq p6,p7=r26,r18 // Only if page is present ;; -(p6) itc.d r25 // install updated PTE + ITC_D(p6, r25, r26) // install updated PTE /* * Tell the assemblers dependency-violation checker that the above "itc" instructions * cannot possibly affect the following loads: @@ -664,11 +696,11 @@ ENTRY(daccess_bit) or r18=_PAGE_A,r18 // set the accessed bit ;; st8 [r17]=r18 // store back updated PTE - itc.d r18 // install updated PTE + ITC_D(p0, r18, r16) // install updated PTE #endif mov b0=r29 // restore b0 mov pr=r31,-1 - rfi + RFI END(daccess_bit) .org ia64_ivt+0x2c00 @@ -692,10 +724,10 @@ ENTRY(break_fault) */ DBG_FAULT(11) mov.m r16=IA64_KR(CURRENT) // M2 r16 <- current task (12 cyc) - mov r29=cr.ipsr // M2 (12 cyc) + MOV_FROM_IPSR(p0, r29) // M2 (12 cyc) mov r31=pr // I0 (2 cyc) - mov r17=cr.iim // M2 (2 cyc) + MOV_FROM_IIM(r17) // M2 (2 cyc) mov.m r27=ar.rsc // M2 (12 cyc) mov r18=__IA64_BREAK_SYSCALL // A @@ -714,7 +746,7 @@ ENTRY(break_fault) nop.m 0 movl r30=sys_call_table // X - mov r28=cr.iip // M2 (2 cyc) + MOV_FROM_IIP(r28) // M2 (2 cyc) cmp.eq p0,p7=r18,r17 // I0 is this a system call? (p7) br.cond.spnt non_syscall // B no -> // @@ -752,8 +784,13 @@ ENTRY(break_fault) (p8) adds r28=16,r28 // A switch cr.iip to next bundle (p9) adds r8=1,r8 // A increment ei to next slot +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + ;; + mov b6=r30 // I0 setup syscall handler branch reg early +#else nop.i 0 ;; +#endif mov.m r25=ar.unat // M2 (5 cyc) dep r29=r8,r29,41,2 // I0 insert new ei into cr.ipsr @@ -764,7 +801,11 @@ ENTRY(break_fault) // /////////////////////////////////////////////////////////////////////// st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + MOV_FROM_ITC(p0, p14, r30, r18) // M get cycle for accounting +#else mov b6=r30 // I0 setup syscall handler branch reg early +#endif cmp.eq pKStk,pUStk=r0,r17 // A were we on kernel stacks already? and r9=_TIF_SYSCALL_TRACEAUDIT,r9 // A mask trace or audit @@ -776,20 +817,43 @@ ENTRY(break_fault) cmp.eq p14,p0=r9,r0 // A are syscalls being traced/audited? br.call.sptk.many b7=ia64_syscall_setup // B 1: +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + // mov.m r30=ar.itc is called in advance, and r13 is current + add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13 // A + add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13 // A +(pKStk) br.cond.spnt .skip_accounting // B unlikely skip + ;; + ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // M get last stamp + ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // M time at leave + ;; + ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME // M cumulated stime + ld8 r21=[r17] // M cumulated utime + sub r22=r19,r18 // A stime before leave + ;; + st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP // M update stamp + sub r18=r30,r19 // A elapsed time in user + ;; + add r20=r20,r22 // A sum stime + add r21=r21,r18 // A sum utime + ;; + st8 [r16]=r20 // M update stime + st8 [r17]=r21 // M update utime + ;; +.skip_accounting: +#endif mov ar.rsc=0x3 // M2 set eager mode, pl 0, LE, loadrs=0 nop 0 - bsw.1 // B (6 cyc) regs are saved, switch to bank 1 + BSW_1(r2, r14) // B (6 cyc) regs are saved, switch to bank 1 ;; - ssm psr.ic | PSR_DEFAULT_BITS // M2 now it's safe to re-enable intr.-collection + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r16) // M2 now it's safe to re-enable intr.-collection + // M0 ensure interruption collection is on movl r3=ia64_ret_from_syscall // X ;; - - srlz.i // M0 ensure interruption collection is on mov rp=r3 // I0 set the real return addr (p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT -(p15) ssm psr.i // M2 restore psr.i + SSM_PSR_I(p15, p15, r16) // M2 restore psr.i (p14) br.call.sptk.many b6=b6 // B invoke syscall-handker (ignore return addr) br.cond.spnt.many ia64_trace_syscall // B do syscall-tracing thingamagic // NOT REACHED @@ -809,26 +873,8 @@ END(break_fault) ///////////////////////////////////////////////////////////////////////////////////////// // 0x3000 Entry 12 (size 64 bundles) External Interrupt (4) ENTRY(interrupt) - DBG_FAULT(12) - mov r31=pr // prepare to save predicates - ;; - SAVE_MIN_WITH_COVER // uses r31; defines r2 and r3 - ssm psr.ic | PSR_DEFAULT_BITS - ;; - adds r3=8,r2 // set up second base pointer for SAVE_REST - srlz.i // ensure everybody knows psr.ic is back on - ;; - SAVE_REST - ;; - alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group - mov out0=cr.ivr // pass cr.ivr as first arg - add out1=16,sp // pass pointer to pt_regs as second arg - ;; - srlz.d // make sure we see the effect of cr.ivr - movl r14=ia64_leave_kernel - ;; - mov rp=r14 - br.call.sptk.many b6=ia64_handle_irq + /* interrupt handler has become too big to fit this area. */ + br.sptk.many __interrupt END(interrupt) .org ia64_ivt+0x3400 @@ -874,6 +920,7 @@ END(interrupt) * - r27: saved ar.rsc * - r28: saved cr.iip * - r29: saved cr.ipsr + * - r30: ar.itc for accounting (don't touch) * - r31: saved pr * - b0: original contents (to be saved) * On exit: @@ -890,6 +937,7 @@ END(interrupt) * - ar.fpsr: set to kernel settings * - b6: preserved (same as on entry) */ +#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE GLOBAL_ENTRY(ia64_syscall_setup) #if PT(B6) != 0 # error This code assumes that b6 is the first field in pt_regs. @@ -981,6 +1029,7 @@ GLOBAL_ENTRY(ia64_syscall_setup) (p10) mov r8=-EINVAL br.ret.sptk.many b7 END(ia64_syscall_setup) +#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ .org ia64_ivt+0x3c00 ///////////////////////////////////////////////////////////////////////////////////////// @@ -988,53 +1037,46 @@ END(ia64_syscall_setup) DBG_FAULT(15) FAULT(15) + .org ia64_ivt+0x4000 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x4000 Entry 16 (size 64 bundles) Reserved + DBG_FAULT(16) + FAULT(16) + +#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(__IA64_ASM_PARAVIRTUALIZED_NATIVE) /* - * Squatting in this space ... + * There is no particular reason for this code to be here, other than + * that there happens to be space here that would go unused otherwise. + * If this fault ever gets "unreserved", simply moved the following + * code to a more suitable spot... * - * This special case dispatcher for illegal operation faults allows preserved - * registers to be modified through a callback function (asm only) that is handed - * back from the fault handler in r8. Up to three arguments can be passed to the - * callback function by returning an aggregate with the callback as its first - * element, followed by the arguments. + * account_sys_enter is called from SAVE_MIN* macros if accounting is + * enabled and if the macro is entered from user mode. */ -ENTRY(dispatch_illegal_op_fault) - .prologue - .body - SAVE_MIN_WITH_COVER - ssm psr.ic | PSR_DEFAULT_BITS +GLOBAL_ENTRY(account_sys_enter) + // mov.m r20=ar.itc is called in advance, and r13 is current + add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13 + add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13 ;; - srlz.i // guarantee that interruption collection is on + ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // time at last check in kernel + ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // time at left from kernel + ;; + ld8 r23=[r16],TI_AC_STAMP-TI_AC_STIME // cumulated stime + ld8 r21=[r17] // cumulated utime + sub r22=r19,r18 // stime before leave kernel ;; -(p15) ssm psr.i // restore psr.i - adds r3=8,r2 // set up second base pointer for SAVE_REST + st8 [r16]=r20,TI_AC_STIME-TI_AC_STAMP // update stamp + sub r18=r20,r19 // elapsed time in user mode ;; - alloc r14=ar.pfs,0,0,1,0 // must be first in insn group - mov out0=ar.ec + add r23=r23,r22 // sum stime + add r21=r21,r18 // sum utime ;; - SAVE_REST - PT_REGS_UNWIND_INFO(0) - ;; - br.call.sptk.many rp=ia64_illegal_op_fault -.ret0: ;; - alloc r14=ar.pfs,0,0,3,0 // must be first in insn group - mov out0=r9 - mov out1=r10 - mov out2=r11 - movl r15=ia64_leave_kernel - ;; - mov rp=r15 - mov b6=r8 + st8 [r16]=r23 // update stime + st8 [r17]=r21 // update utime ;; - cmp.ne p6,p0=0,r8 -(p6) br.call.dpnt.many b6=b6 // call returns to ia64_leave_kernel - br.sptk.many ia64_leave_kernel -END(dispatch_illegal_op_fault) - - .org ia64_ivt+0x4000 -///////////////////////////////////////////////////////////////////////////////////////// -// 0x4000 Entry 16 (size 64 bundles) Reserved - DBG_FAULT(16) - FAULT(16) + br.ret.sptk.many rp +END(account_sys_enter) +#endif .org ia64_ivt+0x4400 ///////////////////////////////////////////////////////////////////////////////////////// @@ -1042,110 +1084,18 @@ END(dispatch_illegal_op_fault) DBG_FAULT(17) FAULT(17) -ENTRY(non_syscall) - mov ar.rsc=r27 // restore ar.rsc before SAVE_MIN_WITH_COVER - ;; - SAVE_MIN_WITH_COVER - - // There is no particular reason for this code to be here, other than that - // there happens to be space here that would go unused otherwise. If this - // fault ever gets "unreserved", simply moved the following code to a more - // suitable spot... - - alloc r14=ar.pfs,0,0,2,0 - mov out0=cr.iim - add out1=16,sp - adds r3=8,r2 // set up second base pointer for SAVE_REST - - ssm psr.ic | PSR_DEFAULT_BITS - ;; - srlz.i // guarantee that interruption collection is on - ;; -(p15) ssm psr.i // restore psr.i - movl r15=ia64_leave_kernel - ;; - SAVE_REST - mov rp=r15 - ;; - br.call.sptk.many b6=ia64_bad_break // avoid WAW on CFM and ignore return addr -END(non_syscall) - .org ia64_ivt+0x4800 ///////////////////////////////////////////////////////////////////////////////////////// // 0x4800 Entry 18 (size 64 bundles) Reserved DBG_FAULT(18) FAULT(18) - /* - * There is no particular reason for this code to be here, other than that - * there happens to be space here that would go unused otherwise. If this - * fault ever gets "unreserved", simply moved the following code to a more - * suitable spot... - */ - -ENTRY(dispatch_unaligned_handler) - SAVE_MIN_WITH_COVER - ;; - alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!) - mov out0=cr.ifa - adds out1=16,sp - - ssm psr.ic | PSR_DEFAULT_BITS - ;; - srlz.i // guarantee that interruption collection is on - ;; -(p15) ssm psr.i // restore psr.i - adds r3=8,r2 // set up second base pointer - ;; - SAVE_REST - movl r14=ia64_leave_kernel - ;; - mov rp=r14 - br.sptk.many ia64_prepare_handle_unaligned -END(dispatch_unaligned_handler) - .org ia64_ivt+0x4c00 ///////////////////////////////////////////////////////////////////////////////////////// // 0x4c00 Entry 19 (size 64 bundles) Reserved DBG_FAULT(19) FAULT(19) - /* - * There is no particular reason for this code to be here, other than that - * there happens to be space here that would go unused otherwise. If this - * fault ever gets "unreserved", simply moved the following code to a more - * suitable spot... - */ - -ENTRY(dispatch_to_fault_handler) - /* - * Input: - * psr.ic: off - * r19: fault vector number (e.g., 24 for General Exception) - * r31: contains saved predicates (pr) - */ - SAVE_MIN_WITH_COVER_R19 - alloc r14=ar.pfs,0,0,5,0 - mov out0=r15 - mov out1=cr.isr - mov out2=cr.ifa - mov out3=cr.iim - mov out4=cr.itir - ;; - ssm psr.ic | PSR_DEFAULT_BITS - ;; - srlz.i // guarantee that interruption collection is on - ;; -(p15) ssm psr.i // restore psr.i - adds r3=8,r2 // set up second base pointer for SAVE_REST - ;; - SAVE_REST - movl r14=ia64_leave_kernel - ;; - mov rp=r14 - br.call.sptk.many b6=ia64_fault -END(dispatch_to_fault_handler) - // // --- End of long entries, Beginning of short entries // @@ -1155,8 +1105,8 @@ END(dispatch_to_fault_handler) // 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49) ENTRY(page_not_present) DBG_FAULT(20) - mov r16=cr.ifa - rsm psr.dt + MOV_FROM_IFA(r16) + RSM_PSR_DT /* * The Linux page fault handler doesn't expect non-present pages to be in * the TLB. Flush the existing entry now, so we meet that expectation. @@ -1175,8 +1125,8 @@ END(page_not_present) // 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52) ENTRY(key_permission) DBG_FAULT(21) - mov r16=cr.ifa - rsm psr.dt + MOV_FROM_IFA(r16) + RSM_PSR_DT mov r31=pr ;; srlz.d @@ -1188,8 +1138,8 @@ END(key_permission) // 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26) ENTRY(iaccess_rights) DBG_FAULT(22) - mov r16=cr.ifa - rsm psr.dt + MOV_FROM_IFA(r16) + RSM_PSR_DT mov r31=pr ;; srlz.d @@ -1201,8 +1151,8 @@ END(iaccess_rights) // 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53) ENTRY(daccess_rights) DBG_FAULT(23) - mov r16=cr.ifa - rsm psr.dt + MOV_FROM_IFA(r16) + RSM_PSR_DT mov r31=pr ;; srlz.d @@ -1214,7 +1164,7 @@ END(daccess_rights) // 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39) ENTRY(general_exception) DBG_FAULT(24) - mov r16=cr.isr + MOV_FROM_ISR(r16) mov r31=pr ;; cmp4.eq p6,p0=0,r16 @@ -1243,8 +1193,8 @@ END(disabled_fp_reg) ENTRY(nat_consumption) DBG_FAULT(26) - mov r16=cr.ipsr - mov r17=cr.isr + MOV_FROM_IPSR(p0, r16) + MOV_FROM_ISR(r17) mov r31=pr // save PR ;; and r18=0xf,r17 // r18 = cr.ipsr.code{3:0} @@ -1254,10 +1204,10 @@ ENTRY(nat_consumption) dep r16=-1,r16,IA64_PSR_ED_BIT,1 (p6) br.cond.spnt 1f // branch if (cr.ispr.na == 0 || cr.ipsr.code{3:0} != LFETCH) ;; - mov cr.ipsr=r16 // set cr.ipsr.na + MOV_TO_IPSR(p0, r16, r18) mov pr=r31,-1 ;; - rfi + RFI 1: mov pr=r31,-1 ;; @@ -1279,26 +1229,26 @@ ENTRY(speculation_vector) * * cr.imm contains zero_ext(imm21) */ - mov r18=cr.iim + MOV_FROM_IIM(r18) ;; - mov r17=cr.iip + MOV_FROM_IIP(r17) shl r18=r18,43 // put sign bit in position (43=64-21) ;; - mov r16=cr.ipsr + MOV_FROM_IPSR(p0, r16) shr r18=r18,39 // sign extend (39=43-4) ;; add r17=r17,r18 // now add the offset ;; - mov cr.iip=r17 + MOV_TO_IIP(r17, r19) dep r16=0,r16,41,2 // clear EI ;; - mov cr.ipsr=r16 + MOV_TO_IPSR(p0, r16, r19) ;; - rfi // and go back + RFI END(speculation_vector) .org ia64_ivt+0x5800 @@ -1320,7 +1270,6 @@ END(debug_vector) // 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57) ENTRY(unaligned_access) DBG_FAULT(30) - mov r16=cr.ipsr mov r31=pr // prepare to save predicates ;; br.sptk.many dispatch_unaligned_handler @@ -1435,28 +1384,6 @@ END(ia32_exception) // 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept (30,31,59,70,71) ENTRY(ia32_intercept) DBG_FAULT(46) -#ifdef CONFIG_IA32_SUPPORT - mov r31=pr - mov r16=cr.isr - ;; - extr.u r17=r16,16,8 // get ISR.code - mov r18=ar.eflag - mov r19=cr.iim // old eflag value - ;; - cmp.ne p6,p0=2,r17 -(p6) br.cond.spnt 1f // not a system flag fault - xor r16=r18,r19 - ;; - extr.u r17=r16,18,1 // get the eflags.ac bit - ;; - cmp.eq p6,p0=0,r17 -(p6) br.cond.spnt 1f // eflags.ac bit didn't change - ;; - mov pr=r31,-1 // restore predicate registers - rfi - -1: -#endif // CONFIG_IA32_SUPPORT FAULT(46) END(ia32_intercept) @@ -1465,12 +1392,7 @@ END(ia32_intercept) // 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt (74) ENTRY(ia32_interrupt) DBG_FAULT(47) -#ifdef CONFIG_IA32_SUPPORT - mov r31=pr - br.sptk.many dispatch_to_ia32_handler -#else FAULT(47) -#endif END(ia32_interrupt) .org ia64_ivt+0x6c00 @@ -1593,89 +1515,174 @@ END(ia32_interrupt) DBG_FAULT(67) FAULT(67) -#ifdef CONFIG_IA32_SUPPORT + //----------------------------------------------------------------------------------- + // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address) +ENTRY(page_fault) + SSM_PSR_DT_AND_SRLZ_I + ;; + SAVE_MIN_WITH_COVER + alloc r15=ar.pfs,0,0,3,0 + MOV_FROM_IFA(out0) + MOV_FROM_ISR(out1) + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r14, r3) + adds r3=8,r2 // set up second base pointer + SSM_PSR_I(p15, p15, r14) // restore psr.i + movl r14=ia64_leave_kernel + ;; + SAVE_REST + mov rp=r14 + ;; + adds out2=16,r12 // out2 = pointer to pt_regs + br.call.sptk.many b6=ia64_do_page_fault // ignore return address +END(page_fault) + +ENTRY(non_syscall) + mov ar.rsc=r27 // restore ar.rsc before SAVE_MIN_WITH_COVER + ;; + SAVE_MIN_WITH_COVER - /* - * There is no particular reason for this code to be here, other than that - * there happens to be space here that would go unused otherwise. If this - * fault ever gets "unreserved", simply moved the following code to a more - * suitable spot... - */ + // There is no particular reason for this code to be here, other than that + // there happens to be space here that would go unused otherwise. If this + // fault ever gets "unreserved", simply moved the following code to a more + // suitable spot... - // IA32 interrupt entry point + alloc r14=ar.pfs,0,0,2,0 + MOV_FROM_IIM(out0) + add out1=16,sp + adds r3=8,r2 // set up second base pointer for SAVE_REST -ENTRY(dispatch_to_ia32_handler) - SAVE_MIN + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r15, r24) + // guarantee that interruption collection is on + SSM_PSR_I(p15, p15, r15) // restore psr.i + movl r15=ia64_leave_kernel ;; - mov r14=cr.isr - ssm psr.ic | PSR_DEFAULT_BITS + SAVE_REST + mov rp=r15 ;; - srlz.i // guarantee that interruption collection is on + br.call.sptk.many b6=ia64_bad_break // avoid WAW on CFM and ignore return addr +END(non_syscall) + +ENTRY(__interrupt) + DBG_FAULT(12) + mov r31=pr // prepare to save predicates ;; -(p15) ssm psr.i - adds r3=8,r2 // Base pointer for SAVE_REST + SAVE_MIN_WITH_COVER // uses r31; defines r2 and r3 + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r14) + // ensure everybody knows psr.ic is back on + adds r3=8,r2 // set up second base pointer for SAVE_REST ;; SAVE_REST ;; - mov r15=0x80 - shr r14=r14,16 // Get interrupt number - ;; - cmp.ne p6,p0=r14,r15 -(p6) br.call.dpnt.many b6=non_ia32_syscall - - adds r14=IA64_PT_REGS_R8_OFFSET + 16,sp // 16 byte hole per SW conventions - adds r15=IA64_PT_REGS_R1_OFFSET + 16,sp + MCA_RECOVER_RANGE(interrupt) + alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group + MOV_FROM_IVR(out0, r8) // pass cr.ivr as first arg + add out1=16,sp // pass pointer to pt_regs as second arg ;; - cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0 - ld8 r8=[r14] // get r8 + srlz.d // make sure we see the effect of cr.ivr + movl r14=ia64_leave_kernel ;; - st8 [r15]=r8 // save original EAX in r1 (IA32 procs don't use the GP) + mov rp=r14 + br.call.sptk.many b6=ia64_handle_irq +END(__interrupt) + + /* + * There is no particular reason for this code to be here, other than that + * there happens to be space here that would go unused otherwise. If this + * fault ever gets "unreserved", simply moved the following code to a more + * suitable spot... + */ + +ENTRY(dispatch_unaligned_handler) + SAVE_MIN_WITH_COVER ;; - alloc r15=ar.pfs,0,0,6,0 // must first in an insn group + alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!) + MOV_FROM_IFA(out0) + adds out1=16,sp + + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r24) + // guarantee that interruption collection is on + SSM_PSR_I(p15, p15, r3) // restore psr.i + adds r3=8,r2 // set up second base pointer ;; - ld4 r8=[r14],8 // r8 == eax (syscall number) - mov r15=IA32_NR_syscalls + SAVE_REST + movl r14=ia64_leave_kernel ;; - cmp.ltu.unc p6,p7=r8,r15 - ld4 out1=[r14],8 // r9 == ecx + mov rp=r14 + br.sptk.many ia64_prepare_handle_unaligned +END(dispatch_unaligned_handler) + + /* + * There is no particular reason for this code to be here, other than that + * there happens to be space here that would go unused otherwise. If this + * fault ever gets "unreserved", simply moved the following code to a more + * suitable spot... + */ + +ENTRY(dispatch_to_fault_handler) + /* + * Input: + * psr.ic: off + * r19: fault vector number (e.g., 24 for General Exception) + * r31: contains saved predicates (pr) + */ + SAVE_MIN_WITH_COVER_R19 + alloc r14=ar.pfs,0,0,5,0 + MOV_FROM_ISR(out1) + MOV_FROM_IFA(out2) + MOV_FROM_IIM(out3) + MOV_FROM_ITIR(out4) ;; - ld4 out2=[r14],8 // r10 == edx + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, out0) + // guarantee that interruption collection is on + mov out0=r15 ;; - ld4 out0=[r14] // r11 == ebx - adds r14=(IA64_PT_REGS_R13_OFFSET) + 16,sp + SSM_PSR_I(p15, p15, r3) // restore psr.i + adds r3=8,r2 // set up second base pointer for SAVE_REST ;; - ld4 out5=[r14],PT(R14)-PT(R13) // r13 == ebp + SAVE_REST + movl r14=ia64_leave_kernel ;; - ld4 out3=[r14],PT(R15)-PT(R14) // r14 == esi - adds r2=TI_FLAGS+IA64_TASK_SIZE,r13 + mov rp=r14 + br.call.sptk.many b6=ia64_fault +END(dispatch_to_fault_handler) + + /* + * Squatting in this space ... + * + * This special case dispatcher for illegal operation faults allows preserved + * registers to be modified through a callback function (asm only) that is handed + * back from the fault handler in r8. Up to three arguments can be passed to the + * callback function by returning an aggregate with the callback as its first + * element, followed by the arguments. + */ +ENTRY(dispatch_illegal_op_fault) + .prologue + .body + SAVE_MIN_WITH_COVER + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r24) + // guarantee that interruption collection is on ;; - ld4 out4=[r14] // r15 == edi - movl r16=ia32_syscall_table + SSM_PSR_I(p15, p15, r3) // restore psr.i + adds r3=8,r2 // set up second base pointer for SAVE_REST ;; -(p6) shladd r16=r8,3,r16 // force ni_syscall if not valid syscall number - ld4 r2=[r2] // r2 = current_thread_info()->flags + alloc r14=ar.pfs,0,0,1,0 // must be first in insn group + mov out0=ar.ec ;; - ld8 r16=[r16] - and r2=_TIF_SYSCALL_TRACEAUDIT,r2 // mask trace or audit + SAVE_REST + PT_REGS_UNWIND_INFO(0) ;; - mov b6=r16 - movl r15=ia32_ret_from_syscall - cmp.eq p8,p0=r2,r0 + br.call.sptk.many rp=ia64_illegal_op_fault +.ret0: ;; + alloc r14=ar.pfs,0,0,3,0 // must be first in insn group + mov out0=r9 + mov out1=r10 + mov out2=r11 + movl r15=ia64_leave_kernel ;; mov rp=r15 -(p8) br.call.sptk.many b6=b6 - br.cond.sptk ia32_trace_syscall - -non_ia32_syscall: - alloc r15=ar.pfs,0,0,2,0 - mov out0=r14 // interrupt # - add out1=16,sp // pointer to pt_regs - ;; // avoid WAW on CFM - br.call.sptk.many rp=ia32_bad_interrupt -.ret1: movl r15=ia64_leave_kernel + mov b6=r8 ;; - mov rp=r15 - br.ret.sptk.many rp -END(dispatch_to_ia32_handler) - -#endif /* CONFIG_IA32_SUPPORT */ + cmp.ne p6,p0=0,r8 +(p6) br.call.dpnt.many b6=b6 // call returns to ia64_leave_kernel + br.sptk.many ia64_leave_kernel +END(dispatch_illegal_op_fault) diff --git a/arch/ia64/kernel/jprobes.S b/arch/ia64/kernel/jprobes.S index 2323377e369..f69389c7be1 100644 --- a/arch/ia64/kernel/jprobes.S +++ b/arch/ia64/kernel/jprobes.S @@ -45,13 +45,14 @@ * to the correct location. */ #include <asm/asmmacro.h> +#include <asm/break.h> /* * void jprobe_break(void) */ .section .kprobes.text, "ax" ENTRY(jprobe_break) - break.m 0x80300 + break.m __IA64_BREAK_JPROBE END(jprobe_break) /* @@ -60,3 +61,30 @@ END(jprobe_break) GLOBAL_ENTRY(jprobe_inst_return) br.call.sptk.many b0=jprobe_break END(jprobe_inst_return) + +GLOBAL_ENTRY(invalidate_stacked_regs) + movl r16=invalidate_restore_cfm + ;; + mov b6=r16 + ;; + br.ret.sptk.many b6 + ;; +invalidate_restore_cfm: + mov r16=ar.rsc + ;; + mov ar.rsc=r0 + ;; + loadrs + ;; + mov ar.rsc=r16 + ;; + br.cond.sptk.many rp +END(invalidate_stacked_regs) + +GLOBAL_ENTRY(flush_register_stack) + // flush dirty regs to backing store (must be first in insn group) + flushrs + ;; + br.ret.sptk.many rp +END(flush_register_stack) + diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 471086b808a..074fde49c9e 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -23,28 +23,24 @@ * <anil.s.keshavamurthy@intel.com> adapted from i386 */ -#include <linux/config.h> #include <linux/kprobes.h> #include <linux/ptrace.h> -#include <linux/spinlock.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/preempt.h> #include <linux/moduleloader.h> +#include <linux/kdebug.h> #include <asm/pgtable.h> -#include <asm/kdebug.h> #include <asm/sections.h> +#include <asm/uaccess.h> extern void jprobe_inst_return(void); -/* kprobe_status settings */ -#define KPROBE_HIT_ACTIVE 0x00000001 -#define KPROBE_HIT_SS 0x00000002 +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -static struct kprobe *current_kprobe, *kprobe_prev; -static unsigned long kprobe_status, kprobe_status_prev; -static struct pt_regs jprobe_saved_regs; +struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; enum instruction_type {A, I, M, F, B, L, X, u}; static enum instruction_type bundle_encoding[32][3] = { @@ -82,6 +78,20 @@ static enum instruction_type bundle_encoding[32][3] = { { u, u, u }, /* 1F */ }; +/* Insert a long branch code */ +static void __kprobes set_brl_inst(void *from, void *to) +{ + s64 rel = ((s64) to - (s64) from) >> 4; + bundle_t *brl; + brl = (bundle_t *) ((u64) from & ~0xf); + brl->quad0.template = 0x05; /* [MLX](stop) */ + brl->quad0.slot0 = NOP_M_INST; /* nop.m 0x0 */ + brl->quad0.slot1_p0 = ((rel >> 20) & 0x7fffffffff) << 2; + brl->quad1.slot1_p1 = (((rel >> 20) & 0x7fffffffff) << 2) >> (64 - 46); + /* brl.cond.sptk.many.clr rel<<4 (qp=0) */ + brl->quad1.slot2 = BRL_INST(rel >> 59, rel & 0xfffff); +} + /* * In this function we check to see if the instruction * is IP relative instruction and update the kprobe @@ -94,9 +104,10 @@ static void __kprobes update_kprobe_inst_flag(uint template, uint slot, { p->ainsn.inst_flag = 0; p->ainsn.target_br_reg = 0; + p->ainsn.slot = slot; /* Check for Break instruction - * Bits 37:40 Major opcode to be zero + * Bits 37:40 Major opcode to be zero * Bits 27:32 X6 to be zero * Bits 32:35 X3 to be zero */ @@ -110,19 +121,19 @@ static void __kprobes update_kprobe_inst_flag(uint template, uint slot, switch (major_opcode) { case INDIRECT_CALL_OPCODE: p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; - p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); - break; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; case IP_RELATIVE_PREDICT_OPCODE: case IP_RELATIVE_BRANCH_OPCODE: p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; - break; + break; case IP_RELATIVE_CALL_OPCODE: - p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; - p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; - p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); - break; + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; } - } else if (bundle_encoding[template][slot] == X) { + } else if (bundle_encoding[template][slot] == X) { switch (major_opcode) { case LONG_CALL_OPCODE: p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; @@ -135,20 +146,66 @@ static void __kprobes update_kprobe_inst_flag(uint template, uint slot, /* * In this function we check to see if the instruction + * (qp) cmpx.crel.ctype p1,p2=r2,r3 + * on which we are inserting kprobe is cmp instruction + * with ctype as unc. + */ +static uint __kprobes is_cmp_ctype_unc_inst(uint template, uint slot, + uint major_opcode, + unsigned long kprobe_inst) +{ + cmp_inst_t cmp_inst; + uint ctype_unc = 0; + + if (!((bundle_encoding[template][slot] == I) || + (bundle_encoding[template][slot] == M))) + goto out; + + if (!((major_opcode == 0xC) || (major_opcode == 0xD) || + (major_opcode == 0xE))) + goto out; + + cmp_inst.l = kprobe_inst; + if ((cmp_inst.f.x2 == 0) || (cmp_inst.f.x2 == 1)) { + /* Integer compare - Register Register (A6 type)*/ + if ((cmp_inst.f.tb == 0) && (cmp_inst.f.ta == 0) + &&(cmp_inst.f.c == 1)) + ctype_unc = 1; + } else if ((cmp_inst.f.x2 == 2)||(cmp_inst.f.x2 == 3)) { + /* Integer compare - Immediate Register (A8 type)*/ + if ((cmp_inst.f.ta == 0) &&(cmp_inst.f.c == 1)) + ctype_unc = 1; + } +out: + return ctype_unc; +} + +/* + * In this function we check to see if the instruction * on which we are inserting kprobe is supported. - * Returns 0 if supported + * Returns qp value if supported * Returns -EINVAL if unsupported */ static int __kprobes unsupported_inst(uint template, uint slot, uint major_opcode, unsigned long kprobe_inst, - struct kprobe *p) + unsigned long addr) { - unsigned long addr = (unsigned long)p->addr; + int qp; + + qp = kprobe_inst & 0x3f; + if (is_cmp_ctype_unc_inst(template, slot, major_opcode, kprobe_inst)) { + if (slot == 1 && qp) { + printk(KERN_WARNING "Kprobes on cmp unc " + "instruction on slot 1 at <0x%lx> " + "is not supported\n", addr); + return -EINVAL; - if (bundle_encoding[template][slot] == I) { - switch (major_opcode) { - case 0x0: //I_UNIT_MISC_OPCODE: + } + qp = 0; + } + else if (bundle_encoding[template][slot] == I) { + if (major_opcode == 0) { /* * Check for Integer speculation instruction * - Bit 33-35 to be equal to 0x1 @@ -156,10 +213,9 @@ static int __kprobes unsupported_inst(uint template, uint slot, if (((kprobe_inst >> 33) & 0x7) == 1) { printk(KERN_WARNING "Kprobes on speculation inst at <0x%lx> not supported\n", - addr); + addr); return -EINVAL; } - /* * IP relative mov instruction * - Bit 27-35 to be equal to 0x30 @@ -167,50 +223,74 @@ static int __kprobes unsupported_inst(uint template, uint slot, if (((kprobe_inst >> 27) & 0x1FF) == 0x30) { printk(KERN_WARNING "Kprobes on \"mov r1=ip\" at <0x%lx> not supported\n", - addr); + addr); return -EINVAL; } } + else if ((major_opcode == 5) && !(kprobe_inst & (0xFUl << 33)) && + (kprobe_inst & (0x1UL << 12))) { + /* test bit instructions, tbit,tnat,tf + * bit 33-36 to be equal to 0 + * bit 12 to be equal to 1 + */ + if (slot == 1 && qp) { + printk(KERN_WARNING "Kprobes on test bit " + "instruction on slot at <0x%lx> " + "is not supported\n", addr); + return -EINVAL; + } + qp = 0; + } } - return 0; -} - - -/* - * In this function we check to see if the instruction - * (qp) cmpx.crel.ctype p1,p2=r2,r3 - * on which we are inserting kprobe is cmp instruction - * with ctype as unc. - */ -static uint __kprobes is_cmp_ctype_unc_inst(uint template, uint slot, - uint major_opcode, - unsigned long kprobe_inst) -{ - cmp_inst_t cmp_inst; - uint ctype_unc = 0; - - if (!((bundle_encoding[template][slot] == I) || - (bundle_encoding[template][slot] == M))) - goto out; - - if (!((major_opcode == 0xC) || (major_opcode == 0xD) || - (major_opcode == 0xE))) - goto out; + else if (bundle_encoding[template][slot] == B) { + if (major_opcode == 7) { + /* IP-Relative Predict major code is 7 */ + printk(KERN_WARNING "Kprobes on IP-Relative" + "Predict is not supported\n"); + return -EINVAL; + } + else if (major_opcode == 2) { + /* Indirect Predict, major code is 2 + * bit 27-32 to be equal to 10 or 11 + */ + int x6=(kprobe_inst >> 27) & 0x3F; + if ((x6 == 0x10) || (x6 == 0x11)) { + printk(KERN_WARNING "Kprobes on " + "Indirect Predict is not supported\n"); + return -EINVAL; + } + } + } + /* kernel does not use float instruction, here for safety kprobe + * will judge whether it is fcmp/flass/float approximation instruction + */ + else if (unlikely(bundle_encoding[template][slot] == F)) { + if ((major_opcode == 4 || major_opcode == 5) && + (kprobe_inst & (0x1 << 12))) { + /* fcmp/fclass unc instruction */ + if (slot == 1 && qp) { + printk(KERN_WARNING "Kprobes on fcmp/fclass " + "instruction on slot at <0x%lx> " + "is not supported\n", addr); + return -EINVAL; - cmp_inst.l = kprobe_inst; - if ((cmp_inst.f.x2 == 0) || (cmp_inst.f.x2 == 1)) { - /* Integere compare - Register Register (A6 type)*/ - if ((cmp_inst.f.tb == 0) && (cmp_inst.f.ta == 0) - &&(cmp_inst.f.c == 1)) - ctype_unc = 1; - } else if ((cmp_inst.f.x2 == 2)||(cmp_inst.f.x2 == 3)) { - /* Integere compare - Immediate Register (A8 type)*/ - if ((cmp_inst.f.ta == 0) &&(cmp_inst.f.c == 1)) - ctype_unc = 1; + } + qp = 0; + } + if ((major_opcode == 0 || major_opcode == 1) && + (kprobe_inst & (0x1UL << 33))) { + /* float Approximation instruction */ + if (slot == 1 && qp) { + printk(KERN_WARNING "Kprobes on float Approx " + "instr at <0x%lx> is not supported\n", + addr); + return -EINVAL; + } + qp = 0; + } } -out: - return ctype_unc; + return qp; } /* @@ -220,20 +300,17 @@ out: static void __kprobes prepare_break_inst(uint template, uint slot, uint major_opcode, unsigned long kprobe_inst, - struct kprobe *p) + struct kprobe *p, + int qp) { unsigned long break_inst = BREAK_INST; - bundle_t *bundle = &p->ainsn.insn.bundle; + bundle_t *bundle = &p->opcode.bundle; /* * Copy the original kprobe_inst qualifying predicate(qp) - * to the break instruction iff !is_cmp_ctype_unc_inst - * because for cmp instruction with ctype equal to unc, - * which is a special instruction always needs to be - * executed regradless of qp + * to the break instruction */ - if (!is_cmp_ctype_unc_inst(template, slot, major_opcode, kprobe_inst)) - break_inst |= (0x3f & kprobe_inst); + break_inst |= qp; switch (slot) { case 0: @@ -256,7 +333,7 @@ static void __kprobes prepare_break_inst(uint template, uint slot, update_kprobe_inst_flag(template, slot, major_opcode, kprobe_inst, p); } -static inline void get_kprobe_inst(bundle_t *bundle, uint slot, +static void __kprobes get_kprobe_inst(bundle_t *bundle, uint slot, unsigned long *kprobe_inst, uint *major_opcode) { unsigned long kprobe_inst_p0, kprobe_inst_p1; @@ -266,24 +343,24 @@ static inline void get_kprobe_inst(bundle_t *bundle, uint slot, switch (slot) { case 0: - *major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT); - *kprobe_inst = bundle->quad0.slot0; - break; + *major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT); + *kprobe_inst = bundle->quad0.slot0; + break; case 1: - *major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT); - kprobe_inst_p0 = bundle->quad0.slot1_p0; - kprobe_inst_p1 = bundle->quad1.slot1_p1; - *kprobe_inst = kprobe_inst_p0 | (kprobe_inst_p1 << (64-46)); + *major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT); + kprobe_inst_p0 = bundle->quad0.slot1_p0; + kprobe_inst_p1 = bundle->quad1.slot1_p1; + *kprobe_inst = kprobe_inst_p0 | (kprobe_inst_p1 << (64-46)); break; case 2: - *major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT); - *kprobe_inst = bundle->quad1.slot2; + *major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT); + *kprobe_inst = bundle->quad1.slot2; break; } } /* Returns non-zero if the addr is in the Interrupt Vector Table */ -static inline int in_ivt_functions(unsigned long addr) +static int __kprobes in_ivt_functions(unsigned long addr) { return (addr >= (unsigned long)__start_ivt_text && addr < (unsigned long)__end_ivt_text); @@ -298,36 +375,36 @@ static int __kprobes valid_kprobe_addr(int template, int slot, return -EINVAL; } - if (in_ivt_functions(addr)) { - printk(KERN_WARNING "Kprobes can't be inserted inside " + if (in_ivt_functions(addr)) { + printk(KERN_WARNING "Kprobes can't be inserted inside " "IVT functions at 0x%lx\n", addr); - return -EINVAL; - } - - if (slot == 1 && bundle_encoding[template][1] != L) { - printk(KERN_WARNING "Inserting kprobes on slot #1 " - "is not supported\n"); return -EINVAL; } return 0; } -static inline void save_previous_kprobe(void) +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) { - kprobe_prev = current_kprobe; - kprobe_status_prev = kprobe_status; + unsigned int i; + i = atomic_add_return(1, &kcb->prev_kprobe_index); + kcb->prev_kprobe[i-1].kp = kprobe_running(); + kcb->prev_kprobe[i-1].status = kcb->kprobe_status; } -static inline void restore_previous_kprobe(void) +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) { - current_kprobe = kprobe_prev; - kprobe_status = kprobe_status_prev; + unsigned int i; + i = atomic_read(&kcb->prev_kprobe_index); + __get_cpu_var(current_kprobe) = kcb->prev_kprobe[i-1].kp; + kcb->kprobe_status = kcb->prev_kprobe[i-1].status; + atomic_sub(1, &kcb->prev_kprobe_index); } -static inline void set_current_kprobe(struct kprobe *p) +static void __kprobes set_current_kprobe(struct kprobe *p, + struct kprobe_ctlblk *kcb) { - current_kprobe = p; + __get_cpu_var(current_kprobe) = p; } static void kretprobe_trampoline(void) @@ -345,18 +422,19 @@ static void kretprobe_trampoline(void) int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { struct kretprobe_instance *ri = NULL; - struct hlist_head *head; - struct hlist_node *node, *tmp; - unsigned long orig_ret_address = 0; + struct hlist_head *head, empty_rp; + struct hlist_node *tmp; + unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = ((struct fnptr *)kretprobe_trampoline)->ip; - head = kretprobe_inst_table_head(current); + INIT_HLIST_HEAD(&empty_rp); + kretprobe_hash_lock(current, &head, &flags); /* * It is possible to have multiple instances associated with a given * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more then one return + * have a return probe installed on them, and/or more than one return * return probe was registered for a target function. * * We can handle this because: @@ -366,16 +444,33 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * real return address, and all the rest will point to * kretprobe_trampoline */ - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + orig_ret_address = (unsigned long)ri->ret_addr; + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + regs->cr_iip = orig_ret_address; + + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) /* another task is sharing our hash bucket */ - continue; + continue; if (ri->rp && ri->rp->handler) ri->rp->handler(ri, regs); orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri); + recycle_rp_inst(ri, &empty_rp); if (orig_ret_address != trampoline_address) /* @@ -386,36 +481,101 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) break; } - BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); - regs->cr_iip = orig_ret_address; + kretprobe_assert(ri, orig_ret_address, trampoline_address); - unlock_kprobes(); + reset_current_kprobe(); + kretprobe_hash_unlock(current, &flags); preempt_enable_no_resched(); - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we have handled unlocking - * and re-enabling preemption. - */ - return 1; + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we don't want the post_handler + * to run (and have re-enabled preemption) + */ + return 1; } -void __kprobes arch_prepare_kretprobe(struct kretprobe *rp, +void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { - struct kretprobe_instance *ri; + ri->ret_addr = (kprobe_opcode_t *)regs->b0; + + /* Replace the return addr with trampoline addr */ + regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip; +} - if ((ri = get_free_rp_inst(rp)) != NULL) { - ri->rp = rp; - ri->task = current; - ri->ret_addr = (kprobe_opcode_t *)regs->b0; +/* Check the instruction in the slot is break */ +static int __kprobes __is_ia64_break_inst(bundle_t *bundle, uint slot) +{ + unsigned int major_opcode; + unsigned int template = bundle->quad0.template; + unsigned long kprobe_inst; - /* Replace the return addr with trampoline addr */ - regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip; + /* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */ + if (slot == 1 && bundle_encoding[template][1] == L) + slot++; - add_rp_inst(ri); - } else { - rp->nmissed++; + /* Get Kprobe probe instruction at given slot*/ + get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode); + + /* For break instruction, + * Bits 37:40 Major opcode to be zero + * Bits 27:32 X6 to be zero + * Bits 32:35 X3 to be zero + */ + if (major_opcode || ((kprobe_inst >> 27) & 0x1FF)) { + /* Not a break instruction */ + return 0; + } + + /* Is a break instruction */ + return 1; +} + +/* + * In this function, we check whether the target bundle modifies IP or + * it triggers an exception. If so, it cannot be boostable. + */ +static int __kprobes can_boost(bundle_t *bundle, uint slot, + unsigned long bundle_addr) +{ + unsigned int template = bundle->quad0.template; + + do { + if (search_exception_tables(bundle_addr + slot) || + __is_ia64_break_inst(bundle, slot)) + return 0; /* exception may occur in this bundle*/ + } while ((++slot) < 3); + template &= 0x1e; + if (template >= 0x10 /* including B unit */ || + template == 0x04 /* including X unit */ || + template == 0x06) /* undefined */ + return 0; + + return 1; +} + +/* Prepare long jump bundle and disables other boosters if need */ +static void __kprobes prepare_booster(struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr & ~0xFULL; + unsigned int slot = (unsigned long)p->addr & 0xf; + struct kprobe *other_kp; + + if (can_boost(&p->ainsn.insn[0].bundle, slot, addr)) { + set_brl_inst(&p->ainsn.insn[1].bundle, (bundle_t *)addr + 1); + p->ainsn.inst_flag |= INST_FLAG_BOOSTABLE; + } + + /* disables boosters in previous slots */ + for (; addr < (unsigned long)p->addr; addr++) { + other_kp = get_kprobe((void *)addr); + if (other_kp) + other_kp->ainsn.inst_flag &= ~INST_FLAG_BOOSTABLE; } } @@ -425,79 +585,123 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) unsigned long *kprobe_addr = (unsigned long *)(addr & ~0xFULL); unsigned long kprobe_inst=0; unsigned int slot = addr & 0xf, template, major_opcode = 0; - bundle_t *bundle = &p->ainsn.insn.bundle; - - memcpy(&p->opcode.bundle, kprobe_addr, sizeof(bundle_t)); - memcpy(&p->ainsn.insn.bundle, kprobe_addr, sizeof(bundle_t)); + bundle_t *bundle; + int qp; - template = bundle->quad0.template; + bundle = &((kprobe_opcode_t *)kprobe_addr)->bundle; + template = bundle->quad0.template; if(valid_kprobe_addr(template, slot, addr)) return -EINVAL; /* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */ - if (slot == 1 && bundle_encoding[template][1] == L) - slot++; + if (slot == 1 && bundle_encoding[template][1] == L) + slot++; /* Get kprobe_inst and major_opcode from the bundle */ get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode); - if (unsupported_inst(template, slot, major_opcode, kprobe_inst, p)) - return -EINVAL; + qp = unsupported_inst(template, slot, major_opcode, kprobe_inst, addr); + if (qp < 0) + return -EINVAL; - prepare_break_inst(template, slot, major_opcode, kprobe_inst, p); + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + return -ENOMEM; + memcpy(&p->opcode, kprobe_addr, sizeof(kprobe_opcode_t)); + memcpy(p->ainsn.insn, kprobe_addr, sizeof(kprobe_opcode_t)); + + prepare_break_inst(template, slot, major_opcode, kprobe_inst, p, qp); + + prepare_booster(p); return 0; } void __kprobes arch_arm_kprobe(struct kprobe *p) { - unsigned long addr = (unsigned long)p->addr; - unsigned long arm_addr = addr & ~0xFULL; + unsigned long arm_addr; + bundle_t *src, *dest; + + arm_addr = ((unsigned long)p->addr) & ~0xFUL; + dest = &((kprobe_opcode_t *)arm_addr)->bundle; + src = &p->opcode.bundle; + + flush_icache_range((unsigned long)p->ainsn.insn, + (unsigned long)p->ainsn.insn + + sizeof(kprobe_opcode_t) * MAX_INSN_SIZE); - memcpy((char *)arm_addr, &p->ainsn.insn.bundle, sizeof(bundle_t)); - flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t)); + switch (p->ainsn.slot) { + case 0: + dest->quad0.slot0 = src->quad0.slot0; + break; + case 1: + dest->quad1.slot1_p1 = src->quad1.slot1_p1; + break; + case 2: + dest->quad1.slot2 = src->quad1.slot2; + break; + } + flush_icache_range(arm_addr, arm_addr + sizeof(kprobe_opcode_t)); } void __kprobes arch_disarm_kprobe(struct kprobe *p) { - unsigned long addr = (unsigned long)p->addr; - unsigned long arm_addr = addr & ~0xFULL; - - /* p->opcode contains the original unaltered bundle */ - memcpy((char *) arm_addr, (char *) &p->opcode.bundle, sizeof(bundle_t)); - flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t)); + unsigned long arm_addr; + bundle_t *src, *dest; + + arm_addr = ((unsigned long)p->addr) & ~0xFUL; + dest = &((kprobe_opcode_t *)arm_addr)->bundle; + /* p->ainsn.insn contains the original unaltered kprobe_opcode_t */ + src = &p->ainsn.insn->bundle; + switch (p->ainsn.slot) { + case 0: + dest->quad0.slot0 = src->quad0.slot0; + break; + case 1: + dest->quad1.slot1_p1 = src->quad1.slot1_p1; + break; + case 2: + dest->quad1.slot2 = src->quad1.slot2; + break; + } + flush_icache_range(arm_addr, arm_addr + sizeof(kprobe_opcode_t)); } void __kprobes arch_remove_kprobe(struct kprobe *p) { + if (p->ainsn.insn) { + free_insn_slot(p->ainsn.insn, + p->ainsn.inst_flag & INST_FLAG_BOOSTABLE); + p->ainsn.insn = NULL; + } } - /* * We are resuming execution after a single step fault, so the pt_regs * structure reflects the register state after we executed the instruction - * located in the kprobe (p->ainsn.insn.bundle). We still need to adjust + * located in the kprobe (p->ainsn.insn->bundle). We still need to adjust * the ip to point back to the original stack address. To set the IP address * to original stack address, handle the case where we need to fixup the * relative IP address and/or fixup branch register. */ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs) { - unsigned long bundle_addr = ((unsigned long) (&p->opcode.bundle)) & ~0xFULL; - unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL; - unsigned long template; - int slot = ((unsigned long)p->addr & 0xf); + unsigned long bundle_addr = (unsigned long) (&p->ainsn.insn->bundle); + unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL; + unsigned long template; + int slot = ((unsigned long)p->addr & 0xf); - template = p->opcode.bundle.quad0.template; + template = p->ainsn.insn->bundle.quad0.template; - if (slot == 1 && bundle_encoding[template][1] == L) - slot = 2; + if (slot == 1 && bundle_encoding[template][1] == L) + slot = 2; - if (p->ainsn.inst_flag) { + if (p->ainsn.inst_flag & ~INST_FLAG_BOOSTABLE) { if (p->ainsn.inst_flag & INST_FLAG_FIX_RELATIVE_IP_ADDR) { /* Fix relative IP address */ - regs->cr_iip = (regs->cr_iip - bundle_addr) + resume_addr; + regs->cr_iip = (regs->cr_iip - bundle_addr) + + resume_addr; } if (p->ainsn.inst_flag & INST_FLAG_FIX_BRANCH_REG) { @@ -534,23 +738,23 @@ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs) } if (slot == 2) { - if (regs->cr_iip == bundle_addr + 0x10) { - regs->cr_iip = resume_addr + 0x10; - } - } else { - if (regs->cr_iip == bundle_addr) { - regs->cr_iip = resume_addr; - } + if (regs->cr_iip == bundle_addr + 0x10) { + regs->cr_iip = resume_addr + 0x10; + } + } else { + if (regs->cr_iip == bundle_addr) { + regs->cr_iip = resume_addr; + } } turn_ss_off: - /* Turn off Single Step bit */ - ia64_psr(regs)->ss = 0; + /* Turn off Single Step bit */ + ia64_psr(regs)->ss = 0; } static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs) { - unsigned long bundle_addr = (unsigned long) &p->opcode.bundle; + unsigned long bundle_addr = (unsigned long) &p->ainsn.insn->bundle; unsigned long slot = (unsigned long)p->addr & 0xf; /* single step inline if break instruction */ @@ -571,33 +775,12 @@ static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs) static int __kprobes is_ia64_break_inst(struct pt_regs *regs) { unsigned int slot = ia64_psr(regs)->ri; - unsigned int template, major_opcode; - unsigned long kprobe_inst; unsigned long *kprobe_addr = (unsigned long *)regs->cr_iip; bundle_t bundle; memcpy(&bundle, kprobe_addr, sizeof(bundle_t)); - template = bundle.quad0.template; - - /* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */ - if (slot == 1 && bundle_encoding[template][1] == L) - slot++; - - /* Get Kprobe probe instruction at given slot*/ - get_kprobe_inst(&bundle, slot, &kprobe_inst, &major_opcode); - - /* For break instruction, - * Bits 37:40 Major opcode to be zero - * Bits 27:32 X6 to be zero - * Bits 32:35 X3 to be zero - */ - if (major_opcode || ((kprobe_inst >> 27) & 0x1FF) ) { - /* Not a break instruction */ - return 0; - } - /* Is a break instruction */ - return 1; + return __is_ia64_break_inst(&bundle, slot); } static int __kprobes pre_kprobes_handler(struct die_args *args) @@ -606,17 +789,22 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) int ret = 0; struct pt_regs *regs = args->regs; kprobe_opcode_t *addr = (kprobe_opcode_t *)instruction_pointer(regs); + struct kprobe_ctlblk *kcb; + /* + * We don't want to be preempted for the entire + * duration of kprobe processing + */ preempt_disable(); + kcb = get_kprobe_ctlblk(); /* Handle recursion cases */ if (kprobe_running()) { p = get_kprobe(addr); if (p) { - if ( (kprobe_status == KPROBE_HIT_SS) && + if ((kcb->kprobe_status == KPROBE_HIT_SS) && (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)) { - ia64_psr(regs)->ss = 0; - unlock_kprobes(); + ia64_psr(regs)->ss = 0; goto no_kprobe; } /* We have reentered the pre_kprobe_handler(), since @@ -625,30 +813,35 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) * just single step on the instruction of the new probe * without calling any user handlers. */ - save_previous_kprobe(); - set_current_kprobe(p); - p->nmissed++; + save_previous_kprobe(kcb); + set_current_kprobe(p, kcb); + kprobes_inc_nmissed_count(p); prepare_ss(p, regs); - kprobe_status = KPROBE_REENTER; + kcb->kprobe_status = KPROBE_REENTER; return 1; } else if (args->err == __IA64_BREAK_JPROBE) { /* * jprobe instrumented function just completed */ - p = current_kprobe; + p = __get_cpu_var(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { goto ss_probe; } + } else if (!is_ia64_break_inst(regs)) { + /* The breakpoint instruction was removed by + * another cpu right after we hit, no further + * handling of this interrupt is appropriate + */ + ret = 1; + goto no_kprobe; } else { /* Not our break */ goto no_kprobe; } } - lock_kprobes(); p = get_kprobe(addr); if (!p) { - unlock_kprobes(); if (!is_ia64_break_inst(regs)) { /* * The breakpoint instruction was removed right @@ -665,8 +858,8 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) goto no_kprobe; } - kprobe_status = KPROBE_HIT_ACTIVE; - set_current_kprobe(p); + set_current_kprobe(p, kcb); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (p->pre_handler && p->pre_handler(p, regs)) /* @@ -677,8 +870,21 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) return 1; ss_probe: +#if !defined(CONFIG_PREEMPT) + if (p->ainsn.inst_flag == INST_FLAG_BOOSTABLE && !p->post_handler) { + /* Boost up -- we can execute copied instructions directly */ + ia64_psr(regs)->ri = p->ainsn.slot; + regs->cr_iip = (unsigned long)&p->ainsn.insn->bundle & ~0xFULL; + /* turn single stepping off */ + ia64_psr(regs)->ss = 0; + + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } +#endif prepare_ss(p, regs); - kprobe_status = KPROBE_HIT_SS; + kcb->kprobe_status = KPROBE_HIT_SS; return 1; no_kprobe: @@ -688,42 +894,86 @@ no_kprobe: static int __kprobes post_kprobes_handler(struct pt_regs *regs) { - if (!kprobe_running()) + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) return 0; - if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { - kprobe_status = KPROBE_HIT_SSDONE; - current_kprobe->post_handler(current_kprobe, regs, 0); + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); } - resume_execution(current_kprobe, regs); + resume_execution(cur, regs); /*Restore back the original saved kprobes variables and continue. */ - if (kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(); + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); goto out; } - - unlock_kprobes(); + reset_current_kprobe(); out: preempt_enable_no_resched(); return 1; } -static int __kprobes kprobes_fault_handler(struct pt_regs *regs, int trapnr) +int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) { - if (!kprobe_running()) - return 0; + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - if (current_kprobe->fault_handler && - current_kprobe->fault_handler(current_kprobe, regs, trapnr)) - return 1; - if (kprobe_status & KPROBE_HIT_SS) { - resume_execution(current_kprobe, regs); - unlock_kprobes(); + switch(kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_REENTER: + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the instruction pointer points back to + * the probe address and allow the page fault handler + * to continue as a normal page fault. + */ + regs->cr_iip = ((unsigned long)cur->addr) & ~0xFULL; + ia64_psr(regs)->ri = ((unsigned long)cur->addr) & 0xf; + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); preempt_enable_no_resched(); + break; + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SSDONE: + /* + * We increment the nmissed count for accounting, + * we can also use npre/npostfault count for accounting + * these specific fault cases. + */ + kprobes_inc_nmissed_count(cur); + + /* + * We come here because instructions in the pre/post + * handler caused the page_fault, this could happen + * if handler tries to access user space by + * copy_from_user(), get_user() etc. Let the + * user-specified handler try to fix it first. + */ + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + /* + * In case the user-specified fault handler returned + * zero, try to fix up. + */ + if (ia64_done_with_exception(regs)) + return 1; + + /* + * Let ia64_do_page_fault() fix it. + */ + break; + default: + break; } return 0; @@ -733,31 +983,90 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + + if (args->regs && user_mode(args->regs)) + return ret; + switch(val) { case DIE_BREAK: - if (pre_kprobes_handler(args)) - return NOTIFY_STOP; + /* err is break number from ia64_bad_break() */ + if ((args->err >> 12) == (__IA64_BREAK_KPROBE >> 12) + || args->err == __IA64_BREAK_JPROBE + || args->err == 0) + if (pre_kprobes_handler(args)) + ret = NOTIFY_STOP; break; - case DIE_SS: - if (post_kprobes_handler(args->regs)) - return NOTIFY_STOP; + case DIE_FAULT: + /* err is vector number from ia64_fault() */ + if (args->err == 36) + if (post_kprobes_handler(args->regs)) + ret = NOTIFY_STOP; break; - case DIE_PAGE_FAULT: - if (kprobes_fault_handler(args->regs, args->trapnr)) - return NOTIFY_STOP; default: break; } - return NOTIFY_DONE; + return ret; +} + +struct param_bsp_cfm { + unsigned long ip; + unsigned long *bsp; + unsigned long cfm; +}; + +static void ia64_get_bsp_cfm(struct unw_frame_info *info, void *arg) +{ + unsigned long ip; + struct param_bsp_cfm *lp = arg; + + do { + unw_get_ip(info, &ip); + if (ip == 0) + break; + if (ip == lp->ip) { + unw_get_bsp(info, (unsigned long*)&lp->bsp); + unw_get_cfm(info, (unsigned long*)&lp->cfm); + return; + } + } while (unw_unwind(info) >= 0); + lp->bsp = NULL; + lp->cfm = 0; + return; +} + +unsigned long arch_deref_entry_point(void *entry) +{ + return ((struct fnptr *)entry)->ip; } int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr = ((struct fnptr *)(jp->entry))->ip; + unsigned long addr = arch_deref_entry_point(jp->entry); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + struct param_bsp_cfm pa; + int bytes; + + /* + * Callee owns the argument space and could overwrite it, eg + * tail call optimization. So to be absolutely safe + * we save the argument space before transferring the control + * to instrumented jprobe function which runs in + * the process context + */ + pa.ip = regs->cr_iip; + unw_init_running(ia64_get_bsp_cfm, &pa); + bytes = (char *)ia64_rse_skip_regs(pa.bsp, pa.cfm & 0x3f) + - (char *)pa.bsp; + memcpy( kcb->jprobes_saved_stacked_regs, + pa.bsp, + bytes ); + kcb->bsp = pa.bsp; + kcb->cfm = pa.cfm; /* save architectural state */ - jprobe_saved_regs = *regs; + kcb->jprobe_saved_regs = *regs; /* after rfi, execute the jprobe instrumented function */ regs->cr_iip = addr & ~0xFULL; @@ -768,14 +1077,34 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) * fix the return address to our jprobe_inst_return() function * in the jprobes.S file */ - regs->b0 = ((struct fnptr *)(jprobe_inst_return))->ip; + regs->b0 = ((struct fnptr *)(jprobe_inst_return))->ip; return 1; } +/* ia64 does not need this */ +void __kprobes jprobe_return(void) +{ +} + int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { - *regs = jprobe_saved_regs; + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + int bytes; + + /* restoring architectural state */ + *regs = kcb->jprobe_saved_regs; + + /* restoring the original argument space */ + flush_register_stack(); + bytes = (char *)ia64_rse_skip_regs(kcb->bsp, kcb->cfm & 0x3f) + - (char *)kcb->bsp; + memcpy( kcb->bsp, + kcb->jprobes_saved_stacked_regs, + bytes ); + invalidate_stacked_regs(); + + preempt_enable_no_resched(); return 1; } @@ -789,3 +1118,12 @@ int __init arch_init_kprobes(void) (kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip; return register_kprobe(&trampoline_p); } + +int __kprobes arch_trampoline_kprobe(struct kprobe *p) +{ + if (p->addr == + (kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip) + return 1; + + return 0; +} diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c new file mode 100644 index 00000000000..5151a649c96 --- /dev/null +++ b/arch/ia64/kernel/machine_kexec.c @@ -0,0 +1,170 @@ +/* + * arch/ia64/kernel/machine_kexec.c + * + * Handle transition of Linux booting another kernel + * Copyright (C) 2005 Hewlett-Packard Development Comapny, L.P. + * Copyright (C) 2005 Khalid Aziz <khalid.aziz@hp.com> + * Copyright (C) 2006 Intel Corp, Zou Nan hai <nanhai.zou@intel.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/mm.h> +#include <linux/kexec.h> +#include <linux/cpu.h> +#include <linux/irq.h> +#include <linux/efi.h> +#include <linux/numa.h> +#include <linux/mmzone.h> + +#include <asm/numa.h> +#include <asm/mmu_context.h> +#include <asm/setup.h> +#include <asm/delay.h> +#include <asm/meminit.h> +#include <asm/processor.h> +#include <asm/sal.h> +#include <asm/mca.h> + +typedef void (*relocate_new_kernel_t)( + unsigned long indirection_page, + unsigned long start_address, + struct ia64_boot_param *boot_param, + unsigned long pal_addr) __noreturn; + +struct kimage *ia64_kimage; + +struct resource efi_memmap_res = { + .name = "EFI Memory Map", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +struct resource boot_param_res = { + .name = "Boot parameter", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + + +/* + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. + */ +int machine_kexec_prepare(struct kimage *image) +{ + void *control_code_buffer; + const unsigned long *func; + + func = (unsigned long *)&relocate_new_kernel; + /* Pre-load control code buffer to minimize work in kexec path */ + control_code_buffer = page_address(image->control_code_page); + memcpy((void *)control_code_buffer, (const void *)func[0], + relocate_new_kernel_size); + flush_icache_range((unsigned long)control_code_buffer, + (unsigned long)control_code_buffer + relocate_new_kernel_size); + ia64_kimage = image; + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +static void ia64_machine_kexec(struct unw_frame_info *info, void *arg) +{ + struct kimage *image = arg; + relocate_new_kernel_t rnk; + void *pal_addr = efi_get_pal_addr(); + unsigned long code_addr; + int ii; + u64 fp, gp; + ia64_fptr_t *init_handler = (ia64_fptr_t *)ia64_os_init_on_kdump; + + BUG_ON(!image); + code_addr = (unsigned long)page_address(image->control_code_page); + if (image->type == KEXEC_TYPE_CRASH) { + crash_save_this_cpu(); + current->thread.ksp = (__u64)info->sw - 16; + + /* Register noop init handler */ + fp = ia64_tpa(init_handler->fp); + gp = ia64_tpa(ia64_getreg(_IA64_REG_GP)); + ia64_sal_set_vectors(SAL_VECTOR_OS_INIT, fp, gp, 0, fp, gp, 0); + } else { + /* Unregister init handlers of current kernel */ + ia64_sal_set_vectors(SAL_VECTOR_OS_INIT, 0, 0, 0, 0, 0, 0); + } + + /* Unregister mca handler - No more recovery on current kernel */ + ia64_sal_set_vectors(SAL_VECTOR_OS_MCA, 0, 0, 0, 0, 0, 0); + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* Mask CMC and Performance Monitor interrupts */ + ia64_setreg(_IA64_REG_CR_PMV, 1 << 16); + ia64_setreg(_IA64_REG_CR_CMCV, 1 << 16); + + /* Mask ITV and Local Redirect Registers */ + ia64_set_itv(1 << 16); + ia64_set_lrr0(1 << 16); + ia64_set_lrr1(1 << 16); + + /* terminate possible nested in-service interrupts */ + for (ii = 0; ii < 16; ii++) + ia64_eoi(); + + /* unmask TPR and clear any pending interrupts */ + ia64_setreg(_IA64_REG_CR_TPR, 0); + ia64_srlz_d(); + while (ia64_get_ivr() != IA64_SPURIOUS_INT_VECTOR) + ia64_eoi(); + platform_kernel_launch_event(); + rnk = (relocate_new_kernel_t)&code_addr; + (*rnk)(image->head, image->start, ia64_boot_param, + GRANULEROUNDDOWN((unsigned long) pal_addr)); + BUG(); +} + +void machine_kexec(struct kimage *image) +{ + BUG_ON(!image); + unw_init_running(ia64_machine_kexec, image); + for(;;); +} + +void arch_crash_save_vmcoreinfo(void) +{ +#if defined(CONFIG_DISCONTIGMEM) || defined(CONFIG_SPARSEMEM) + VMCOREINFO_SYMBOL(pgdat_list); + VMCOREINFO_LENGTH(pgdat_list, MAX_NUMNODES); +#endif +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_memblk); + VMCOREINFO_LENGTH(node_memblk, NR_NODE_MEMBLKS); + VMCOREINFO_STRUCT_SIZE(node_memblk_s); + VMCOREINFO_OFFSET(node_memblk_s, start_paddr); + VMCOREINFO_OFFSET(node_memblk_s, size); +#endif +#ifdef CONFIG_PGTABLE_3 + VMCOREINFO_CONFIG(PGTABLE_3); +#elif defined(CONFIG_PGTABLE_4) + VMCOREINFO_CONFIG(PGTABLE_4); +#endif +} + +unsigned long paddr_vmcoreinfo_note(void) +{ + return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note); +} + diff --git a/arch/ia64/kernel/machvec.c b/arch/ia64/kernel/machvec.c index c3a04ee7f4f..f5a1e5246b3 100644 --- a/arch/ia64/kernel/machvec.c +++ b/arch/ia64/kernel/machvec.c @@ -1,8 +1,6 @@ -#include <linux/config.h> #include <linux/module.h> - +#include <linux/dma-mapping.h> #include <asm/machvec.h> -#include <asm/system.h> #ifdef CONFIG_IA64_GENERIC @@ -14,7 +12,7 @@ struct ia64_machine_vector ia64_mv; EXPORT_SYMBOL(ia64_mv); -static struct ia64_machine_vector * +static struct ia64_machine_vector * __init lookup_machvec (const char *name) { extern struct ia64_machine_vector machvec_start[]; @@ -28,19 +26,39 @@ lookup_machvec (const char *name) return 0; } -void +void __init machvec_init (const char *name) { struct ia64_machine_vector *mv; + if (!name) + name = acpi_get_sysname(); mv = lookup_machvec(name); - if (!mv) { - panic("generic kernel failed to find machine vector for platform %s!", name); - } + if (!mv) + panic("generic kernel failed to find machine vector for" + " platform %s!", name); + ia64_mv = *mv; printk(KERN_INFO "booting generic kernel on platform %s\n", name); } +void __init +machvec_init_from_cmdline(const char *cmdline) +{ + char str[64]; + const char *start; + char *end; + + if (! (start = strstr(cmdline, "machvec=")) ) + return machvec_init(NULL); + + strlcpy(str, start + strlen("machvec="), sizeof(str)); + if ( (end = strchr(str, ' ')) ) + *end = '\0'; + + return machvec_init(str); +} + #endif /* CONFIG_IA64_GENERIC */ void @@ -50,20 +68,22 @@ machvec_setup (char **arg) EXPORT_SYMBOL(machvec_setup); void -machvec_timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) +machvec_timer_interrupt (int irq, void *dev_id) { } EXPORT_SYMBOL(machvec_timer_interrupt); void -machvec_dma_sync_single (struct device *hwdev, dma_addr_t dma_handle, size_t size, int dir) +machvec_dma_sync_single(struct device *hwdev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction dir) { mb(); } EXPORT_SYMBOL(machvec_dma_sync_single); void -machvec_dma_sync_sg (struct device *hwdev, struct scatterlist *sg, int n, int dir) +machvec_dma_sync_sg(struct device *hwdev, struct scatterlist *sg, int n, + enum dma_data_direction dir) { mb(); } diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 52c47da1724..db7b36bb068 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -2,64 +2,79 @@ * File: mca.c * Purpose: Generic MCA handling layer * - * Updated for latest kernel * Copyright (C) 2003 Hewlett-Packard Co * David Mosberger-Tang <davidm@hpl.hp.com> * * Copyright (C) 2002 Dell Inc. - * Copyright (C) Matt Domsch (Matt_Domsch@dell.com) + * Copyright (C) Matt Domsch <Matt_Domsch@dell.com> * * Copyright (C) 2002 Intel - * Copyright (C) Jenna Hall (jenna.s.hall@intel.com) + * Copyright (C) Jenna Hall <jenna.s.hall@intel.com> * * Copyright (C) 2001 Intel - * Copyright (C) Fred Lewis (frederick.v.lewis@intel.com) + * Copyright (C) Fred Lewis <frederick.v.lewis@intel.com> * * Copyright (C) 2000 Intel - * Copyright (C) Chuck Fleckenstein (cfleck@co.intel.com) + * Copyright (C) Chuck Fleckenstein <cfleck@co.intel.com> * - * Copyright (C) 1999, 2004 Silicon Graphics, Inc. - * Copyright (C) Vijay Chander(vijay@engr.sgi.com) + * Copyright (C) 1999, 2004-2008 Silicon Graphics, Inc. + * Copyright (C) Vijay Chander <vijay@engr.sgi.com> * - * 03/04/15 D. Mosberger Added INIT backtrace support. - * 02/03/25 M. Domsch GUID cleanups + * Copyright (C) 2006 FUJITSU LIMITED + * Copyright (C) Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> * - * 02/01/04 J. Hall Aligned MCA stack to 16 bytes, added platform vs. CPU - * error flag, set SAL default return values, changed - * error record structure to linked list, added init call - * to sal_get_state_info_size(). + * 2000-03-29 Chuck Fleckenstein <cfleck@co.intel.com> + * Fixed PAL/SAL update issues, began MCA bug fixes, logging issues, + * added min save state dump, added INIT handler. * - * 01/01/03 F. Lewis Added setup of CMCI and CPEI IRQs, logging of corrected - * platform errors, completed code for logging of - * corrected & uncorrected machine check errors, and - * updated for conformance with Nov. 2000 revision of the - * SAL 3.0 spec. - * 00/03/29 C. Fleckenstein Fixed PAL/SAL update issues, began MCA bug fixes, logging issues, - * added min save state dump, added INIT handler. + * 2001-01-03 Fred Lewis <frederick.v.lewis@intel.com> + * Added setup of CMCI and CPEI IRQs, logging of corrected platform + * errors, completed code for logging of corrected & uncorrected + * machine check errors, and updated for conformance with Nov. 2000 + * revision of the SAL 3.0 spec. + * + * 2002-01-04 Jenna Hall <jenna.s.hall@intel.com> + * Aligned MCA stack to 16 bytes, added platform vs. CPU error flag, + * set SAL default return values, changed error record structure to + * linked list, added init call to sal_get_state_info_size(). + * + * 2002-03-25 Matt Domsch <Matt_Domsch@dell.com> + * GUID cleanups. + * + * 2003-04-15 David Mosberger-Tang <davidm@hpl.hp.com> + * Added INIT backtrace support. * * 2003-12-08 Keith Owens <kaos@sgi.com> - * smp_call_function() must not be called from interrupt context (can - * deadlock on tasklist_lock). Use keventd to call smp_call_function(). + * smp_call_function() must not be called from interrupt context + * (can deadlock on tasklist_lock). + * Use keventd to call smp_call_function(). * * 2004-02-01 Keith Owens <kaos@sgi.com> - * Avoid deadlock when using printk() for MCA and INIT records. - * Delete all record printing code, moved to salinfo_decode in user space. - * Mark variables and functions static where possible. - * Delete dead variables and functions. - * Reorder to remove the need for forward declarations and to consolidate - * related code. + * Avoid deadlock when using printk() for MCA and INIT records. + * Delete all record printing code, moved to salinfo_decode in user + * space. Mark variables and functions static where possible. + * Delete dead variables and functions. Reorder to remove the need + * for forward declarations and to consolidate related code. * * 2005-08-12 Keith Owens <kaos@sgi.com> - * Convert MCA/INIT handlers to use per event stacks and SAL/OS state. + * Convert MCA/INIT handlers to use per event stacks and SAL/OS + * state. + * + * 2005-10-07 Keith Owens <kaos@sgi.com> + * Add notify_die() hooks. + * + * 2006-09-15 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> + * Add printing support for MCA/INIT. + * + * 2007-04-27 Russ Anderson <rja@sgi.com> + * Support multiple cpus going through OS_MCA in the same event. */ -#include <linux/config.h> +#include <linux/jiffies.h> #include <linux/types.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/interrupt.h> #include <linux/irq.h> -#include <linux/kallsyms.h> -#include <linux/smp_lock.h> #include <linux/bootmem.h> #include <linux/acpi.h> #include <linux/timer.h> @@ -67,19 +82,25 @@ #include <linux/kernel.h> #include <linux/smp.h> #include <linux/workqueue.h> +#include <linux/cpumask.h> +#include <linux/kdebug.h> +#include <linux/cpu.h> +#include <linux/gfp.h> #include <asm/delay.h> #include <asm/machvec.h> #include <asm/meminit.h> #include <asm/page.h> #include <asm/ptrace.h> -#include <asm/system.h> #include <asm/sal.h> #include <asm/mca.h> +#include <asm/kexec.h> #include <asm/irq.h> #include <asm/hw_irq.h> +#include <asm/tlb.h> +#include "mca_drv.h" #include "entry.h" #if defined(IA64_MCA_DEBUG_INFO) @@ -88,12 +109,26 @@ # define IA64_MCA_DEBUG(fmt...) #endif +#define NOTIFY_INIT(event, regs, arg, spin) \ +do { \ + if ((notify_die((event), "INIT", (regs), (arg), 0, 0) \ + == NOTIFY_STOP) && ((spin) == 1)) \ + ia64_mca_spin(__func__); \ +} while (0) + +#define NOTIFY_MCA(event, regs, arg, spin) \ +do { \ + if ((notify_die((event), "MCA", (regs), (arg), 0, 0) \ + == NOTIFY_STOP) && ((spin) == 1)) \ + ia64_mca_spin(__func__); \ +} while (0) + /* Used by mca_asm.S */ -u32 ia64_mca_serialize; DEFINE_PER_CPU(u64, ia64_mca_data); /* == __per_cpu_mca[smp_processor_id()] */ DEFINE_PER_CPU(u64, ia64_mca_per_cpu_pte); /* PTE to map per-CPU area */ DEFINE_PER_CPU(u64, ia64_mca_pal_pte); /* PTE to map PAL code */ DEFINE_PER_CPU(u64, ia64_mca_pal_base); /* vaddr PAL code granule */ +DEFINE_PER_CPU(u64, ia64_mca_tr_reload); /* Flag for TR reload */ unsigned long __per_cpu_mca[NR_CPUS]; @@ -111,7 +146,9 @@ static ia64_mc_info_t ia64_mc_info; #define CPE_HISTORY_LENGTH 5 #define CMC_HISTORY_LENGTH 5 +#ifdef CONFIG_ACPI static struct timer_list cpe_poll_timer; +#endif static struct timer_list cmc_poll_timer; /* * This variable tells whether we are currently in polling mode. @@ -130,9 +167,181 @@ static int cpe_poll_enabled = 1; extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe); -static int mca_init; +static int mca_init __initdata; + +/* + * limited & delayed printing support for MCA/INIT handler + */ + +#define mprintk(fmt...) ia64_mca_printk(fmt) + +#define MLOGBUF_SIZE (512+256*NR_CPUS) +#define MLOGBUF_MSGMAX 256 +static char mlogbuf[MLOGBUF_SIZE]; +static DEFINE_SPINLOCK(mlogbuf_wlock); /* mca context only */ +static DEFINE_SPINLOCK(mlogbuf_rlock); /* normal context only */ +static unsigned long mlogbuf_start; +static unsigned long mlogbuf_end; +static unsigned int mlogbuf_finished = 0; +static unsigned long mlogbuf_timestamp = 0; + +static int loglevel_save = -1; +#define BREAK_LOGLEVEL(__console_loglevel) \ + oops_in_progress = 1; \ + if (loglevel_save < 0) \ + loglevel_save = __console_loglevel; \ + __console_loglevel = 15; + +#define RESTORE_LOGLEVEL(__console_loglevel) \ + if (loglevel_save >= 0) { \ + __console_loglevel = loglevel_save; \ + loglevel_save = -1; \ + } \ + mlogbuf_finished = 0; \ + oops_in_progress = 0; + +/* + * Push messages into buffer, print them later if not urgent. + */ +void ia64_mca_printk(const char *fmt, ...) +{ + va_list args; + int printed_len; + char temp_buf[MLOGBUF_MSGMAX]; + char *p; + + va_start(args, fmt); + printed_len = vscnprintf(temp_buf, sizeof(temp_buf), fmt, args); + va_end(args); + + /* Copy the output into mlogbuf */ + if (oops_in_progress) { + /* mlogbuf was abandoned, use printk directly instead. */ + printk("%s", temp_buf); + } else { + spin_lock(&mlogbuf_wlock); + for (p = temp_buf; *p; p++) { + unsigned long next = (mlogbuf_end + 1) % MLOGBUF_SIZE; + if (next != mlogbuf_start) { + mlogbuf[mlogbuf_end] = *p; + mlogbuf_end = next; + } else { + /* buffer full */ + break; + } + } + mlogbuf[mlogbuf_end] = '\0'; + spin_unlock(&mlogbuf_wlock); + } +} +EXPORT_SYMBOL(ia64_mca_printk); /* + * Print buffered messages. + * NOTE: call this after returning normal context. (ex. from salinfod) + */ +void ia64_mlogbuf_dump(void) +{ + char temp_buf[MLOGBUF_MSGMAX]; + char *p; + unsigned long index; + unsigned long flags; + unsigned int printed_len; + + /* Get output from mlogbuf */ + while (mlogbuf_start != mlogbuf_end) { + temp_buf[0] = '\0'; + p = temp_buf; + printed_len = 0; + + spin_lock_irqsave(&mlogbuf_rlock, flags); + + index = mlogbuf_start; + while (index != mlogbuf_end) { + *p = mlogbuf[index]; + index = (index + 1) % MLOGBUF_SIZE; + if (!*p) + break; + p++; + if (++printed_len >= MLOGBUF_MSGMAX - 1) + break; + } + *p = '\0'; + if (temp_buf[0]) + printk("%s", temp_buf); + mlogbuf_start = index; + + mlogbuf_timestamp = 0; + spin_unlock_irqrestore(&mlogbuf_rlock, flags); + } +} +EXPORT_SYMBOL(ia64_mlogbuf_dump); + +/* + * Call this if system is going to down or if immediate flushing messages to + * console is required. (ex. recovery was failed, crash dump is going to be + * invoked, long-wait rendezvous etc.) + * NOTE: this should be called from monarch. + */ +static void ia64_mlogbuf_finish(int wait) +{ + BREAK_LOGLEVEL(console_loglevel); + + spin_lock_init(&mlogbuf_rlock); + ia64_mlogbuf_dump(); + printk(KERN_EMERG "mlogbuf_finish: printing switched to urgent mode, " + "MCA/INIT might be dodgy or fail.\n"); + + if (!wait) + return; + + /* wait for console */ + printk("Delaying for 5 seconds...\n"); + udelay(5*1000000); + + mlogbuf_finished = 1; +} + +/* + * Print buffered messages from INIT context. + */ +static void ia64_mlogbuf_dump_from_init(void) +{ + if (mlogbuf_finished) + return; + + if (mlogbuf_timestamp && + time_before(jiffies, mlogbuf_timestamp + 30 * HZ)) { + printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT " + " and the system seems to be messed up.\n"); + ia64_mlogbuf_finish(0); + return; + } + + if (!spin_trylock(&mlogbuf_rlock)) { + printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT. " + "Generated messages other than stack dump will be " + "buffered to mlogbuf and will be printed later.\n"); + printk(KERN_ERR "INIT: If messages would not printed after " + "this INIT, wait 30sec and assert INIT again.\n"); + if (!mlogbuf_timestamp) + mlogbuf_timestamp = jiffies; + return; + } + spin_unlock(&mlogbuf_rlock); + ia64_mlogbuf_dump(); +} + +static void inline +ia64_mca_spin(const char *func) +{ + if (monarch_cpu == smp_processor_id()) + ia64_mlogbuf_finish(0); + mprintk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func); + while (1) + cpu_relax(); +} +/* * IA64_MCA log support */ #define IA64_MAX_LOGS 2 /* Double-buffering for nested MCAs */ @@ -173,7 +382,7 @@ static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES]; * Inputs : info_type (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE}) * Outputs : None */ -static void +static void __init ia64_log_init(int sal_info_type) { u64 max_size = 0; @@ -209,7 +418,7 @@ ia64_log_get(int sal_info_type, u8 **buffer, int irq_safe) { sal_log_record_header_t *log_buffer; u64 total_len = 0; - int s; + unsigned long s; IA64_LOG_LOCK(sal_info_type); @@ -222,8 +431,8 @@ ia64_log_get(int sal_info_type, u8 **buffer, int irq_safe) IA64_LOG_INDEX_INC(sal_info_type); IA64_LOG_UNLOCK(sal_info_type); if (irq_safe) { - IA64_MCA_DEBUG("%s: SAL error record type %d retrieved. " - "Record length = %ld\n", __FUNCTION__, sal_info_type, total_len); + IA64_MCA_DEBUG("%s: SAL error record type %d retrieved. Record length = %ld\n", + __func__, sal_info_type, total_len); } *buffer = (u8 *) log_buffer; return total_len; @@ -271,30 +480,67 @@ ia64_mca_log_sal_error_record(int sal_info_type) } /* - * platform dependent error handling + * search_mca_table + * See if the MCA surfaced in an instruction range + * that has been tagged as recoverable. + * + * Inputs + * first First address range to check + * last Last address range to check + * ip Instruction pointer, address we are looking for + * + * Return value: + * 1 on Success (in the table)/ 0 on Failure (not in the table) */ -#ifndef PLATFORM_MCA_HANDLERS +int +search_mca_table (const struct mca_table_entry *first, + const struct mca_table_entry *last, + unsigned long ip) +{ + const struct mca_table_entry *curr; + u64 curr_start, curr_end; + + curr = first; + while (curr <= last) { + curr_start = (u64) &curr->start_addr + curr->start_addr; + curr_end = (u64) &curr->end_addr + curr->end_addr; + + if ((ip >= curr_start) && (ip <= curr_end)) { + return 1; + } + curr++; + } + return 0; +} + +/* Given an address, look for it in the mca tables. */ +int mca_recover_range(unsigned long addr) +{ + extern struct mca_table_entry __start___mca_table[]; + extern struct mca_table_entry __stop___mca_table[]; + + return search_mca_table(__start___mca_table, __stop___mca_table-1, addr); +} +EXPORT_SYMBOL_GPL(mca_recover_range); #ifdef CONFIG_ACPI int cpe_vector = -1; +int ia64_cpe_irq = -1; static irqreturn_t -ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs) +ia64_mca_cpe_int_handler (int cpe_irq, void *arg) { static unsigned long cpe_history[CPE_HISTORY_LENGTH]; static int index; static DEFINE_SPINLOCK(cpe_history_lock); IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n", - __FUNCTION__, cpe_irq, smp_processor_id()); + __func__, cpe_irq, smp_processor_id()); /* SAL spec states this should run w/ interrupts enabled */ local_irq_enable(); - /* Get the CPE error record and log it */ - ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE); - spin_lock(&cpe_history_lock); if (!cpe_poll_enabled && cpe_vector >= 0) { @@ -323,7 +569,7 @@ ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs) mod_timer(&cpe_poll_timer, jiffies + MIN_CPE_POLL_INTERVAL); /* lock already released, get out now */ - return IRQ_HANDLED; + goto out; } else { cpe_history[index++] = now; if (index == CPE_HISTORY_LENGTH) @@ -331,6 +577,12 @@ ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs) } } spin_unlock(&cpe_history_lock); +out: + /* Get the CPE error record and log it */ + ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE); + + local_irq_disable(); + return IRQ_HANDLED; } @@ -348,7 +600,7 @@ ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs) * Outputs * None */ -static void +void ia64_mca_register_cpev (int cpev) { /* Register the CPE interrupt vector with SAL */ @@ -362,12 +614,10 @@ ia64_mca_register_cpev (int cpev) } IA64_MCA_DEBUG("%s: corrected platform error " - "vector %#x registered\n", __FUNCTION__, cpev); + "vector %#x registered\n", __func__, cpev); } #endif /* CONFIG_ACPI */ -#endif /* PLATFORM_MCA_HANDLERS */ - /* * ia64_mca_cmc_vector_setup * @@ -391,12 +641,11 @@ ia64_mca_cmc_vector_setup (void) cmcv.cmcv_vector = IA64_CMC_VECTOR; ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval); - IA64_MCA_DEBUG("%s: CPU %d corrected " - "machine check vector %#x registered.\n", - __FUNCTION__, smp_processor_id(), IA64_CMC_VECTOR); + IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x registered.\n", + __func__, smp_processor_id(), IA64_CMC_VECTOR); IA64_MCA_DEBUG("%s: CPU %d CMCV = %#016lx\n", - __FUNCTION__, smp_processor_id(), ia64_getreg(_IA64_REG_CR_CMCV)); + __func__, smp_processor_id(), ia64_getreg(_IA64_REG_CR_CMCV)); } /* @@ -421,9 +670,8 @@ ia64_mca_cmc_vector_disable (void *dummy) cmcv.cmcv_mask = 1; /* Mask/disable interrupt */ ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval); - IA64_MCA_DEBUG("%s: CPU %d corrected " - "machine check vector %#x disabled.\n", - __FUNCTION__, smp_processor_id(), cmcv.cmcv_vector); + IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x disabled.\n", + __func__, smp_processor_id(), cmcv.cmcv_vector); } /* @@ -448,9 +696,8 @@ ia64_mca_cmc_vector_enable (void *dummy) cmcv.cmcv_mask = 0; /* Unmask/enable interrupt */ ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval); - IA64_MCA_DEBUG("%s: CPU %d corrected " - "machine check vector %#x enabled.\n", - __FUNCTION__, smp_processor_id(), cmcv.cmcv_vector); + IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x enabled.\n", + __func__, smp_processor_id(), cmcv.cmcv_vector); } /* @@ -460,9 +707,9 @@ ia64_mca_cmc_vector_enable (void *dummy) * disable the cmc interrupt vector. */ static void -ia64_mca_cmc_vector_disable_keventd(void *unused) +ia64_mca_cmc_vector_disable_keventd(struct work_struct *unused) { - on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 1, 0); + on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 0); } /* @@ -472,16 +719,15 @@ ia64_mca_cmc_vector_disable_keventd(void *unused) * enable the cmc interrupt vector. */ static void -ia64_mca_cmc_vector_enable_keventd(void *unused) +ia64_mca_cmc_vector_enable_keventd(struct work_struct *unused) { - on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 1, 0); + on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 0); } /* * ia64_mca_wakeup * - * Send an inter-cpu interrupt to wake-up a particular cpu - * and mark that cpu to be out of rendez. + * Send an inter-cpu interrupt to wake-up a particular cpu. * * Inputs : cpuid * Outputs : None @@ -490,14 +736,12 @@ static void ia64_mca_wakeup(int cpu) { platform_send_ipi(cpu, IA64_MCA_WAKEUP_VECTOR, IA64_IPI_DM_INT, 0); - ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; - } /* * ia64_mca_wakeup_all * - * Wakeup all the cpus which have rendez'ed previously. + * Wakeup all the slave cpus which have rendez'ed previously. * * Inputs : None * Outputs : None @@ -520,30 +764,42 @@ ia64_mca_wakeup_all(void) * * This is handler used to put slave processors into spinloop * while the monarch processor does the mca handling and later - * wake each slave up once the monarch is done. + * wake each slave up once the monarch is done. The state + * IA64_MCA_RENDEZ_CHECKIN_DONE indicates the cpu is rendez'ed + * in SAL. The state IA64_MCA_RENDEZ_CHECKIN_NOTDONE indicates + * the cpu has come out of OS rendezvous. * * Inputs : None * Outputs : None */ static irqreturn_t -ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *ptregs) +ia64_mca_rendez_int_handler(int rendez_irq, void *arg) { unsigned long flags; int cpu = smp_processor_id(); + struct ia64_mca_notify_die nd = + { .sos = NULL, .monarch_cpu = &monarch_cpu }; /* Mask all interrupts */ local_irq_save(flags); + NOTIFY_MCA(DIE_MCA_RENDZVOUS_ENTER, get_irq_regs(), (long)&nd, 1); + ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_DONE; /* Register with the SAL monarch that the slave has * reached SAL */ ia64_sal_mc_rendez(); + NOTIFY_MCA(DIE_MCA_RENDZVOUS_PROCESS, get_irq_regs(), (long)&nd, 1); + /* Wait for the monarch cpu to exit. */ while (monarch_cpu != -1) cpu_relax(); /* spin until monarch leaves */ + NOTIFY_MCA(DIE_MCA_RENDZVOUS_LEAVE, get_irq_regs(), (long)&nd, 1); + + ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; /* Enable all interrupts */ local_irq_restore(flags); return IRQ_HANDLED; @@ -560,12 +816,11 @@ ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *ptregs) * * Inputs : wakeup_irq (Wakeup-interrupt bit) * arg (Interrupt handler specific argument) - * ptregs (Exception frame at the time of the interrupt) * Outputs : None * */ static irqreturn_t -ia64_mca_wakeup_int_handler(int wakeup_irq, void *arg, struct pt_regs *ptregs) +ia64_mca_wakeup_int_handler(int wakeup_irq, void *arg) { return IRQ_HANDLED; } @@ -597,7 +852,7 @@ EXPORT_SYMBOL(ia64_unreg_MCA_extension); static inline void -copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat) +copy_reg(const u64 *fr, u64 fnat, unsigned long *tr, unsigned long *tnat) { u64 fslot, tslot, nat; *tr = *fr; @@ -608,6 +863,91 @@ copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat) *tnat |= (nat << tslot); } +/* Change the comm field on the MCA/INT task to include the pid that + * was interrupted, it makes for easier debugging. If that pid was 0 + * (swapper or nested MCA/INIT) then use the start of the previous comm + * field suffixed with its cpu. + */ + +static void +ia64_mca_modify_comm(const struct task_struct *previous_current) +{ + char *p, comm[sizeof(current->comm)]; + if (previous_current->pid) + snprintf(comm, sizeof(comm), "%s %d", + current->comm, previous_current->pid); + else { + int l; + if ((p = strchr(previous_current->comm, ' '))) + l = p - previous_current->comm; + else + l = strlen(previous_current->comm); + snprintf(comm, sizeof(comm), "%s %*s %d", + current->comm, l, previous_current->comm, + task_thread_info(previous_current)->cpu); + } + memcpy(current->comm, comm, sizeof(current->comm)); +} + +static void +finish_pt_regs(struct pt_regs *regs, struct ia64_sal_os_state *sos, + unsigned long *nat) +{ + const pal_min_state_area_t *ms = sos->pal_min_state; + const u64 *bank; + + /* If ipsr.ic then use pmsa_{iip,ipsr,ifs}, else use + * pmsa_{xip,xpsr,xfs} + */ + if (ia64_psr(regs)->ic) { + regs->cr_iip = ms->pmsa_iip; + regs->cr_ipsr = ms->pmsa_ipsr; + regs->cr_ifs = ms->pmsa_ifs; + } else { + regs->cr_iip = ms->pmsa_xip; + regs->cr_ipsr = ms->pmsa_xpsr; + regs->cr_ifs = ms->pmsa_xfs; + + sos->iip = ms->pmsa_iip; + sos->ipsr = ms->pmsa_ipsr; + sos->ifs = ms->pmsa_ifs; + } + regs->pr = ms->pmsa_pr; + regs->b0 = ms->pmsa_br0; + regs->ar_rsc = ms->pmsa_rsc; + copy_reg(&ms->pmsa_gr[1-1], ms->pmsa_nat_bits, ®s->r1, nat); + copy_reg(&ms->pmsa_gr[2-1], ms->pmsa_nat_bits, ®s->r2, nat); + copy_reg(&ms->pmsa_gr[3-1], ms->pmsa_nat_bits, ®s->r3, nat); + copy_reg(&ms->pmsa_gr[8-1], ms->pmsa_nat_bits, ®s->r8, nat); + copy_reg(&ms->pmsa_gr[9-1], ms->pmsa_nat_bits, ®s->r9, nat); + copy_reg(&ms->pmsa_gr[10-1], ms->pmsa_nat_bits, ®s->r10, nat); + copy_reg(&ms->pmsa_gr[11-1], ms->pmsa_nat_bits, ®s->r11, nat); + copy_reg(&ms->pmsa_gr[12-1], ms->pmsa_nat_bits, ®s->r12, nat); + copy_reg(&ms->pmsa_gr[13-1], ms->pmsa_nat_bits, ®s->r13, nat); + copy_reg(&ms->pmsa_gr[14-1], ms->pmsa_nat_bits, ®s->r14, nat); + copy_reg(&ms->pmsa_gr[15-1], ms->pmsa_nat_bits, ®s->r15, nat); + if (ia64_psr(regs)->bn) + bank = ms->pmsa_bank1_gr; + else + bank = ms->pmsa_bank0_gr; + copy_reg(&bank[16-16], ms->pmsa_nat_bits, ®s->r16, nat); + copy_reg(&bank[17-16], ms->pmsa_nat_bits, ®s->r17, nat); + copy_reg(&bank[18-16], ms->pmsa_nat_bits, ®s->r18, nat); + copy_reg(&bank[19-16], ms->pmsa_nat_bits, ®s->r19, nat); + copy_reg(&bank[20-16], ms->pmsa_nat_bits, ®s->r20, nat); + copy_reg(&bank[21-16], ms->pmsa_nat_bits, ®s->r21, nat); + copy_reg(&bank[22-16], ms->pmsa_nat_bits, ®s->r22, nat); + copy_reg(&bank[23-16], ms->pmsa_nat_bits, ®s->r23, nat); + copy_reg(&bank[24-16], ms->pmsa_nat_bits, ®s->r24, nat); + copy_reg(&bank[25-16], ms->pmsa_nat_bits, ®s->r25, nat); + copy_reg(&bank[26-16], ms->pmsa_nat_bits, ®s->r26, nat); + copy_reg(&bank[27-16], ms->pmsa_nat_bits, ®s->r27, nat); + copy_reg(&bank[28-16], ms->pmsa_nat_bits, ®s->r28, nat); + copy_reg(&bank[29-16], ms->pmsa_nat_bits, ®s->r29, nat); + copy_reg(&bank[30-16], ms->pmsa_nat_bits, ®s->r30, nat); + copy_reg(&bank[31-16], ms->pmsa_nat_bits, ®s->r31, nat); +} + /* On entry to this routine, we are running on the per cpu stack, see * mca_asm.h. The original stack has not been touched by this event. Some of * the original stack's registers will be in the RBS on this stack. This stack @@ -620,29 +960,28 @@ copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat) * that we can do backtrace on the MCA/INIT handler code itself. */ -static task_t * +static struct task_struct * ia64_mca_modify_original_stack(struct pt_regs *regs, const struct switch_stack *sw, struct ia64_sal_os_state *sos, const char *type) { - char *p, comm[sizeof(current->comm)]; + char *p; ia64_va va; extern char ia64_leave_kernel[]; /* Need asm address, not function descriptor */ const pal_min_state_area_t *ms = sos->pal_min_state; - task_t *previous_current; + struct task_struct *previous_current; struct pt_regs *old_regs; struct switch_stack *old_sw; unsigned size = sizeof(struct pt_regs) + sizeof(struct switch_stack) + 16; - u64 *old_bspstore, *old_bsp; - u64 *new_bspstore, *new_bsp; - u64 old_unat, old_rnat, new_rnat, nat; + unsigned long *old_bspstore, *old_bsp; + unsigned long *new_bspstore, *new_bsp; + unsigned long old_unat, old_rnat, new_rnat, nat; u64 slots, loadrs = regs->loadrs; u64 r12 = ms->pmsa_gr[12-1], r13 = ms->pmsa_gr[13-1]; u64 ar_bspstore = regs->ar_bspstore; u64 ar_bsp = regs->ar_bspstore + (loadrs >> 16); - const u64 *bank; const char *msg; int cpu = smp_processor_id(); @@ -689,64 +1028,54 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, * loadrs for the new stack and save it in the new pt_regs, where * ia64_old_stack() can get it. */ - old_bspstore = (u64 *)ar_bspstore; - old_bsp = (u64 *)ar_bsp; + old_bspstore = (unsigned long *)ar_bspstore; + old_bsp = (unsigned long *)ar_bsp; slots = ia64_rse_num_regs(old_bspstore, old_bsp); - new_bspstore = (u64 *)((u64)current + IA64_RBS_OFFSET); + new_bspstore = (unsigned long *)((u64)current + IA64_RBS_OFFSET); new_bsp = ia64_rse_skip_regs(new_bspstore, slots); regs->loadrs = (new_bsp - new_bspstore) * 8 << 16; /* Verify the previous stack state before we change it */ if (user_mode(regs)) { msg = "occurred in user space"; + /* previous_current is guaranteed to be valid when the task was + * in user space, so ... + */ + ia64_mca_modify_comm(previous_current); goto no_mod; } + if (r13 != sos->prev_IA64_KR_CURRENT) { msg = "inconsistent previous current and r13"; goto no_mod; } - if ((r12 - r13) >= KERNEL_STACK_SIZE) { - msg = "inconsistent r12 and r13"; - goto no_mod; - } - if ((ar_bspstore - r13) >= KERNEL_STACK_SIZE) { - msg = "inconsistent ar.bspstore and r13"; - goto no_mod; - } - va.p = old_bspstore; - if (va.f.reg < 5) { - msg = "old_bspstore is in the wrong region"; - goto no_mod; - } - if ((ar_bsp - r13) >= KERNEL_STACK_SIZE) { - msg = "inconsistent ar.bsp and r13"; - goto no_mod; - } - size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8; - if (ar_bspstore + size > r12) { - msg = "no room for blocked state"; - goto no_mod; - } - /* Change the comm field on the MCA/INT task to include the pid that - * was interrupted, it makes for easier debugging. If that pid was 0 - * (swapper or nested MCA/INIT) then use the start of the previous comm - * field suffixed with its cpu. - */ - if (previous_current->pid) - snprintf(comm, sizeof(comm), "%s %d", - current->comm, previous_current->pid); - else { - int l; - if ((p = strchr(previous_current->comm, ' '))) - l = p - previous_current->comm; - else - l = strlen(previous_current->comm); - snprintf(comm, sizeof(comm), "%s %*s %d", - current->comm, l, previous_current->comm, - previous_current->thread_info->cpu); + if (!mca_recover_range(ms->pmsa_iip)) { + if ((r12 - r13) >= KERNEL_STACK_SIZE) { + msg = "inconsistent r12 and r13"; + goto no_mod; + } + if ((ar_bspstore - r13) >= KERNEL_STACK_SIZE) { + msg = "inconsistent ar.bspstore and r13"; + goto no_mod; + } + va.p = old_bspstore; + if (va.f.reg < 5) { + msg = "old_bspstore is in the wrong region"; + goto no_mod; + } + if ((ar_bsp - r13) >= KERNEL_STACK_SIZE) { + msg = "inconsistent ar.bsp and r13"; + goto no_mod; + } + size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8; + if (ar_bspstore + size > r12) { + msg = "no room for blocked state"; + goto no_mod; + } } - memcpy(current->comm, comm, sizeof(current->comm)); + + ia64_mca_modify_comm(previous_current); /* Make the original task look blocked. First stack a struct pt_regs, * describing the state at the time of interrupt. mca_asm.S built a @@ -755,54 +1084,9 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, p = (char *)r12 - sizeof(*regs); old_regs = (struct pt_regs *)p; memcpy(old_regs, regs, sizeof(*regs)); - /* If ipsr.ic then use pmsa_{iip,ipsr,ifs}, else use - * pmsa_{xip,xpsr,xfs} - */ - if (ia64_psr(regs)->ic) { - old_regs->cr_iip = ms->pmsa_iip; - old_regs->cr_ipsr = ms->pmsa_ipsr; - old_regs->cr_ifs = ms->pmsa_ifs; - } else { - old_regs->cr_iip = ms->pmsa_xip; - old_regs->cr_ipsr = ms->pmsa_xpsr; - old_regs->cr_ifs = ms->pmsa_xfs; - } - old_regs->pr = ms->pmsa_pr; - old_regs->b0 = ms->pmsa_br0; old_regs->loadrs = loadrs; - old_regs->ar_rsc = ms->pmsa_rsc; old_unat = old_regs->ar_unat; - copy_reg(&ms->pmsa_gr[1-1], ms->pmsa_nat_bits, &old_regs->r1, &old_unat); - copy_reg(&ms->pmsa_gr[2-1], ms->pmsa_nat_bits, &old_regs->r2, &old_unat); - copy_reg(&ms->pmsa_gr[3-1], ms->pmsa_nat_bits, &old_regs->r3, &old_unat); - copy_reg(&ms->pmsa_gr[8-1], ms->pmsa_nat_bits, &old_regs->r8, &old_unat); - copy_reg(&ms->pmsa_gr[9-1], ms->pmsa_nat_bits, &old_regs->r9, &old_unat); - copy_reg(&ms->pmsa_gr[10-1], ms->pmsa_nat_bits, &old_regs->r10, &old_unat); - copy_reg(&ms->pmsa_gr[11-1], ms->pmsa_nat_bits, &old_regs->r11, &old_unat); - copy_reg(&ms->pmsa_gr[12-1], ms->pmsa_nat_bits, &old_regs->r12, &old_unat); - copy_reg(&ms->pmsa_gr[13-1], ms->pmsa_nat_bits, &old_regs->r13, &old_unat); - copy_reg(&ms->pmsa_gr[14-1], ms->pmsa_nat_bits, &old_regs->r14, &old_unat); - copy_reg(&ms->pmsa_gr[15-1], ms->pmsa_nat_bits, &old_regs->r15, &old_unat); - if (ia64_psr(old_regs)->bn) - bank = ms->pmsa_bank1_gr; - else - bank = ms->pmsa_bank0_gr; - copy_reg(&bank[16-16], ms->pmsa_nat_bits, &old_regs->r16, &old_unat); - copy_reg(&bank[17-16], ms->pmsa_nat_bits, &old_regs->r17, &old_unat); - copy_reg(&bank[18-16], ms->pmsa_nat_bits, &old_regs->r18, &old_unat); - copy_reg(&bank[19-16], ms->pmsa_nat_bits, &old_regs->r19, &old_unat); - copy_reg(&bank[20-16], ms->pmsa_nat_bits, &old_regs->r20, &old_unat); - copy_reg(&bank[21-16], ms->pmsa_nat_bits, &old_regs->r21, &old_unat); - copy_reg(&bank[22-16], ms->pmsa_nat_bits, &old_regs->r22, &old_unat); - copy_reg(&bank[23-16], ms->pmsa_nat_bits, &old_regs->r23, &old_unat); - copy_reg(&bank[24-16], ms->pmsa_nat_bits, &old_regs->r24, &old_unat); - copy_reg(&bank[25-16], ms->pmsa_nat_bits, &old_regs->r25, &old_unat); - copy_reg(&bank[26-16], ms->pmsa_nat_bits, &old_regs->r26, &old_unat); - copy_reg(&bank[27-16], ms->pmsa_nat_bits, &old_regs->r27, &old_unat); - copy_reg(&bank[28-16], ms->pmsa_nat_bits, &old_regs->r28, &old_unat); - copy_reg(&bank[29-16], ms->pmsa_nat_bits, &old_regs->r29, &old_unat); - copy_reg(&bank[30-16], ms->pmsa_nat_bits, &old_regs->r30, &old_unat); - copy_reg(&bank[31-16], ms->pmsa_nat_bits, &old_regs->r31, &old_unat); + finish_pt_regs(old_regs, sos, &old_unat); /* Next stack a struct switch_stack. mca_asm.S built a partial * switch_stack, copy it and fill in the blanks using pt_regs and @@ -870,8 +1154,10 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, return previous_current; no_mod: - printk(KERN_INFO "cpu %d, %s %s, original stack not modified\n", + mprintk(KERN_INFO "cpu %d, %s %s, original stack not modified\n", smp_processor_id(), type, msg); + old_unat = regs->ar_unat; + finish_pt_regs(regs, sos, &old_unat); return previous_current; } @@ -884,28 +1170,92 @@ no_mod: */ static void -ia64_wait_for_slaves(int monarch) +ia64_wait_for_slaves(int monarch, const char *type) { - int c, wait = 0; - for_each_online_cpu(c) { - if (c == monarch) - continue; - if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) { - udelay(1000); /* short wait first */ - wait = 1; - break; + int c, i , wait; + + /* + * wait 5 seconds total for slaves (arbitrary) + */ + for (i = 0; i < 5000; i++) { + wait = 0; + for_each_online_cpu(c) { + if (c == monarch) + continue; + if (ia64_mc_info.imi_rendez_checkin[c] + == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) { + udelay(1000); /* short wait */ + wait = 1; + break; + } } + if (!wait) + goto all_in; } - if (!wait) - return; + + /* + * Maybe slave(s) dead. Print buffered messages immediately. + */ + ia64_mlogbuf_finish(0); + mprintk(KERN_INFO "OS %s slave did not rendezvous on cpu", type); for_each_online_cpu(c) { if (c == monarch) continue; - if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) { - udelay(5*1000000); /* wait 5 seconds for slaves (arbitrary) */ - break; + if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) + mprintk(" %d", c); + } + mprintk("\n"); + return; + +all_in: + mprintk(KERN_INFO "All OS %s slaves have reached rendezvous\n", type); + return; +} + +/* mca_insert_tr + * + * Switch rid when TR reload and needed! + * iord: 1: itr, 2: itr; + * +*/ +static void mca_insert_tr(u64 iord) +{ + + int i; + u64 old_rr; + struct ia64_tr_entry *p; + unsigned long psr; + int cpu = smp_processor_id(); + + if (!ia64_idtrs[cpu]) + return; + + psr = ia64_clear_ic(); + for (i = IA64_TR_ALLOC_BASE; i < IA64_TR_ALLOC_MAX; i++) { + p = ia64_idtrs[cpu] + (iord - 1) * IA64_TR_ALLOC_MAX; + if (p->pte & 0x1) { + old_rr = ia64_get_rr(p->ifa); + if (old_rr != p->rr) { + ia64_set_rr(p->ifa, p->rr); + ia64_srlz_d(); + } + ia64_ptr(iord, p->ifa, p->itir >> 2); + ia64_srlz_i(); + if (iord & 0x1) { + ia64_itr(0x1, i, p->ifa, p->pte, p->itir >> 2); + ia64_srlz_i(); + } + if (iord & 0x2) { + ia64_itr(0x2, i, p->ifa, p->pte, p->itir >> 2); + ia64_srlz_i(); + } + if (old_rr != p->rr) { + ia64_set_rr(p->ifa, old_rr); + ia64_srlz_d(); + } } } + ia64_set_psr(psr); } /* @@ -920,36 +1270,63 @@ ia64_wait_for_slaves(int monarch) * further MCA logging is enabled by clearing logs. * Monarch also has the duty of sending wakeup-IPIs to pull the * slave processors out of rendezvous spinloop. + * + * If multiple processors call into OS_MCA, the first will become + * the monarch. Subsequent cpus will be recorded in the mca_cpu + * bitmask. After the first monarch has processed its MCA, it + * will wake up the next cpu in the mca_cpu bitmask and then go + * into the rendezvous loop. When all processors have serviced + * their MCA, the last monarch frees up the rest of the processors. */ void ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, struct ia64_sal_os_state *sos) { - pal_processor_state_info_t *psp = (pal_processor_state_info_t *) - &sos->proc_state_param; int recover, cpu = smp_processor_id(); - task_t *previous_current; + struct task_struct *previous_current; + struct ia64_mca_notify_die nd = + { .sos = sos, .monarch_cpu = &monarch_cpu, .data = &recover }; + static atomic_t mca_count; + static cpumask_t mca_cpu; + + if (atomic_add_return(1, &mca_count) == 1) { + monarch_cpu = cpu; + sos->monarch = 1; + } else { + cpu_set(cpu, mca_cpu); + sos->monarch = 0; + } + mprintk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d " + "monarch=%ld\n", sos->proc_state_param, cpu, sos->monarch); - oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */ previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA"); - monarch_cpu = cpu; - ia64_wait_for_slaves(cpu); - /* Wakeup all the processors which are spinning in the rendezvous loop. - * They will leave SAL, then spin in the OS with interrupts disabled - * until this monarch cpu leaves the MCA handler. That gets control - * back to the OS so we can backtrace the other cpus, backtrace when - * spinning in SAL does not work. - */ - ia64_mca_wakeup_all(); + NOTIFY_MCA(DIE_MCA_MONARCH_ENTER, regs, (long)&nd, 1); + + ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_CONCURRENT_MCA; + if (sos->monarch) { + ia64_wait_for_slaves(cpu, "MCA"); + + /* Wakeup all the processors which are spinning in the + * rendezvous loop. They will leave SAL, then spin in the OS + * with interrupts disabled until this monarch cpu leaves the + * MCA handler. That gets control back to the OS so we can + * backtrace the other cpus, backtrace when spinning in SAL + * does not work. + */ + ia64_mca_wakeup_all(); + } else { + while (cpu_isset(cpu, mca_cpu)) + cpu_relax(); /* spin until monarch wakes us */ + } + + NOTIFY_MCA(DIE_MCA_MONARCH_PROCESS, regs, (long)&nd, 1); /* Get the MCA error record and log it */ ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA); - /* TLB error is only exist in this SAL error record */ - recover = (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc)) - /* other error recovery */ - || (ia64_mca_ucmc_extension + /* MCA error recovery */ + recover = (ia64_mca_ucmc_extension && ia64_mca_ucmc_extension( IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA), sos)); @@ -959,14 +1336,44 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, rh->severity = sal_log_severity_corrected; ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA); sos->os_status = IA64_MCA_CORRECTED; + } else { + /* Dump buffered message to console */ + ia64_mlogbuf_finish(1); + } + + if (__get_cpu_var(ia64_mca_tr_reload)) { + mca_insert_tr(0x1); /*Reload dynamic itrs*/ + mca_insert_tr(0x2); /*Reload dynamic itrs*/ } + NOTIFY_MCA(DIE_MCA_MONARCH_LEAVE, regs, (long)&nd, 1); + + if (atomic_dec_return(&mca_count) > 0) { + int i; + + /* wake up the next monarch cpu, + * and put this cpu in the rendez loop. + */ + for_each_online_cpu(i) { + if (cpu_isset(i, mca_cpu)) { + monarch_cpu = i; + cpu_clear(i, mca_cpu); /* wake next cpu */ + while (monarch_cpu != -1) + cpu_relax(); /* spin until last cpu leaves */ + set_curr_task(cpu, previous_current); + ia64_mc_info.imi_rendez_checkin[cpu] + = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; + return; + } + } + } set_curr_task(cpu, previous_current); - monarch_cpu = -1; + ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; + monarch_cpu = -1; /* This frees the slaves and previous monarchs */ } -static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd, NULL); -static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd, NULL); +static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd); +static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd); /* * ia64_mca_cmc_int_handler @@ -978,27 +1385,23 @@ static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd, NULL); * Inputs * interrupt number * client data arg ptr - * saved registers ptr * * Outputs * None */ static irqreturn_t -ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs) +ia64_mca_cmc_int_handler(int cmc_irq, void *arg) { static unsigned long cmc_history[CMC_HISTORY_LENGTH]; static int index; static DEFINE_SPINLOCK(cmc_history_lock); IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n", - __FUNCTION__, cmc_irq, smp_processor_id()); + __func__, cmc_irq, smp_processor_id()); /* SAL spec states this should run w/ interrupts enabled */ local_irq_enable(); - /* Get the CMC error record and log it */ - ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC); - spin_lock(&cmc_history_lock); if (!cmc_polling_enabled) { int i, count = 1; /* we know 1 happened now */ @@ -1031,7 +1434,7 @@ ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs) mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL); /* lock already released, get out now */ - return IRQ_HANDLED; + goto out; } else { cmc_history[index++] = now; if (index == CMC_HISTORY_LENGTH) @@ -1039,6 +1442,12 @@ ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs) } } spin_unlock(&cmc_history_lock); +out: + /* Get the CMC error record and log it */ + ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC); + + local_irq_disable(); + return IRQ_HANDLED; } @@ -1052,12 +1461,11 @@ ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs) * Inputs * interrupt number * client data arg ptr - * saved registers ptr * Outputs * handled */ static irqreturn_t -ia64_mca_cmc_int_caller(int cmc_irq, void *arg, struct pt_regs *ptregs) +ia64_mca_cmc_int_caller(int cmc_irq, void *arg) { static int start_count = -1; unsigned int cpuid; @@ -1068,11 +1476,11 @@ ia64_mca_cmc_int_caller(int cmc_irq, void *arg, struct pt_regs *ptregs) if (start_count == -1) start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CMC); - ia64_mca_cmc_int_handler(cmc_irq, arg, ptregs); + ia64_mca_cmc_int_handler(cmc_irq, arg); - for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++); + cpuid = cpumask_next(cpuid+1, cpu_online_mask); - if (cpuid < NR_CPUS) { + if (cpuid < nr_cpu_ids) { platform_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0); } else { /* If no log record, switch out of polling mode */ @@ -1106,7 +1514,8 @@ static void ia64_mca_cmc_poll (unsigned long dummy) { /* Trigger a CMC interrupt cascade */ - platform_send_ipi(first_cpu(cpu_online_map), IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0); + platform_send_ipi(cpumask_first(cpu_online_mask), IA64_CMCP_VECTOR, + IA64_IPI_DM_INT, 0); } /* @@ -1119,14 +1528,13 @@ ia64_mca_cmc_poll (unsigned long dummy) * Inputs * interrupt number * client data arg ptr - * saved registers ptr * Outputs * handled */ #ifdef CONFIG_ACPI static irqreturn_t -ia64_mca_cpe_int_caller(int cpe_irq, void *arg, struct pt_regs *ptregs) +ia64_mca_cpe_int_caller(int cpe_irq, void *arg) { static int start_count = -1; static int poll_time = MIN_CPE_POLL_INTERVAL; @@ -1138,9 +1546,9 @@ ia64_mca_cpe_int_caller(int cpe_irq, void *arg, struct pt_regs *ptregs) if (start_count == -1) start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CPE); - ia64_mca_cpe_int_handler(cpe_irq, arg, ptregs); + ia64_mca_cpe_int_handler(cpe_irq, arg); - for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++); + cpuid = cpumask_next(cpuid+1, cpu_online_mask); if (cpuid < NR_CPUS) { platform_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); @@ -1183,11 +1591,58 @@ static void ia64_mca_cpe_poll (unsigned long dummy) { /* Trigger a CPE interrupt cascade */ - platform_send_ipi(first_cpu(cpu_online_map), IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); + platform_send_ipi(cpumask_first(cpu_online_mask), IA64_CPEP_VECTOR, + IA64_IPI_DM_INT, 0); } #endif /* CONFIG_ACPI */ +static int +default_monarch_init_process(struct notifier_block *self, unsigned long val, void *data) +{ + int c; + struct task_struct *g, *t; + if (val != DIE_INIT_MONARCH_PROCESS) + return NOTIFY_DONE; +#ifdef CONFIG_KEXEC + if (atomic_read(&kdump_in_progress)) + return NOTIFY_DONE; +#endif + + /* + * FIXME: mlogbuf will brim over with INIT stack dumps. + * To enable show_stack from INIT, we use oops_in_progress which should + * be used in real oops. This would cause something wrong after INIT. + */ + BREAK_LOGLEVEL(console_loglevel); + ia64_mlogbuf_dump_from_init(); + + printk(KERN_ERR "Processes interrupted by INIT -"); + for_each_online_cpu(c) { + struct ia64_sal_os_state *s; + t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET); + s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET); + g = s->prev_task; + if (g) { + if (g->pid) + printk(" %d", g->pid); + else + printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g); + } + } + printk("\n\n"); + if (read_trylock(&tasklist_lock)) { + do_each_thread (g, t) { + printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); + show_stack(t, NULL); + } while_each_thread (g, t); + read_unlock(&tasklist_lock); + } + /* FIXME: This will not restore zapped printk locks. */ + RESTORE_LOGLEVEL(console_loglevel); + return NOTIFY_DONE; +} + /* * C portion of the OS INIT handler * @@ -1211,14 +1666,14 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, { static atomic_t slaves; static atomic_t monarchs; - task_t *previous_current; - int cpu = smp_processor_id(), c; - struct task_struct *g, *t; + struct task_struct *previous_current; + int cpu = smp_processor_id(); + struct ia64_mca_notify_die nd = + { .sos = sos, .monarch_cpu = &monarch_cpu }; - oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */ - console_loglevel = 15; /* make sure printks make it to console */ + NOTIFY_INIT(DIE_INIT_ENTER, regs, (long)&nd, 0); - printk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n", + mprintk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n", sos->proc_state_param, cpu, sos->monarch); salinfo_log_wakeup(SAL_INFO_TYPE_INIT, NULL, 0, 0); @@ -1231,8 +1686,8 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, * fix their proms and get their customers updated. */ if (!sos->monarch && atomic_add_return(1, &slaves) == num_online_cpus()) { - printk(KERN_WARNING "%s: Promoting cpu %d to monarch.\n", - __FUNCTION__, cpu); + mprintk(KERN_WARNING "%s: Promoting cpu %d to monarch.\n", + __func__, cpu); atomic_dec(&slaves); sos->monarch = 1; } @@ -1243,19 +1698,37 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, * fix their proms and get their customers updated. */ if (sos->monarch && atomic_add_return(1, &monarchs) > 1) { - printk(KERN_WARNING "%s: Demoting cpu %d to slave.\n", - __FUNCTION__, cpu); + mprintk(KERN_WARNING "%s: Demoting cpu %d to slave.\n", + __func__, cpu); atomic_dec(&monarchs); sos->monarch = 0; } if (!sos->monarch) { ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_INIT; + +#ifdef CONFIG_KEXEC + while (monarch_cpu == -1 && !atomic_read(&kdump_in_progress)) + udelay(1000); +#else while (monarch_cpu == -1) - cpu_relax(); /* spin until monarch enters */ + cpu_relax(); /* spin until monarch enters */ +#endif + + NOTIFY_INIT(DIE_INIT_SLAVE_ENTER, regs, (long)&nd, 1); + NOTIFY_INIT(DIE_INIT_SLAVE_PROCESS, regs, (long)&nd, 1); + +#ifdef CONFIG_KEXEC + while (monarch_cpu != -1 && !atomic_read(&kdump_in_progress)) + udelay(1000); +#else while (monarch_cpu != -1) - cpu_relax(); /* spin until monarch leaves */ - printk("Slave on cpu %d returning to normal service.\n", cpu); + cpu_relax(); /* spin until monarch leaves */ +#endif + + NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1); + + mprintk("Slave on cpu %d returning to normal service.\n", cpu); set_curr_task(cpu, previous_current); ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; atomic_dec(&slaves); @@ -1263,6 +1736,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, } monarch_cpu = cpu; + NOTIFY_INIT(DIE_INIT_MONARCH_ENTER, regs, (long)&nd, 1); /* * Wait for a bit. On some machines (e.g., HP's zx2000 and zx6000, INIT can be @@ -1270,31 +1744,17 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, * same serial line, the user will need some time to switch out of the BMC before * the dump begins. */ - printk("Delaying for 5 seconds...\n"); + mprintk("Delaying for 5 seconds...\n"); udelay(5*1000000); - ia64_wait_for_slaves(cpu); - printk(KERN_ERR "Processes interrupted by INIT -"); - for_each_online_cpu(c) { - struct ia64_sal_os_state *s; - t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET); - s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET); - g = s->prev_task; - if (g) { - if (g->pid) - printk(" %d", g->pid); - else - printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g); - } - } - printk("\n\n"); - if (read_trylock(&tasklist_lock)) { - do_each_thread (g, t) { - printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); - show_stack(t, NULL); - } while_each_thread (g, t); - read_unlock(&tasklist_lock); - } - printk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu); + ia64_wait_for_slaves(cpu, "INIT"); + /* If nobody intercepts DIE_INIT_MONARCH_PROCESS then we drop through + * to default_monarch_init_process() above and just print all the + * tasks. + */ + NOTIFY_INIT(DIE_INIT_MONARCH_PROCESS, regs, (long)&nd, 1); + NOTIFY_INIT(DIE_INIT_MONARCH_LEAVE, regs, (long)&nd, 1); + + mprintk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu); atomic_dec(&monarchs); set_curr_task(cpu, previous_current); monarch_cpu = -1; @@ -1312,38 +1772,32 @@ __setup("disable_cpe_poll", ia64_mca_disable_cpe_polling); static struct irqaction cmci_irqaction = { .handler = ia64_mca_cmc_int_handler, - .flags = SA_INTERRUPT, .name = "cmc_hndlr" }; static struct irqaction cmcp_irqaction = { .handler = ia64_mca_cmc_int_caller, - .flags = SA_INTERRUPT, .name = "cmc_poll" }; static struct irqaction mca_rdzv_irqaction = { .handler = ia64_mca_rendez_int_handler, - .flags = SA_INTERRUPT, .name = "mca_rdzv" }; static struct irqaction mca_wkup_irqaction = { .handler = ia64_mca_wakeup_int_handler, - .flags = SA_INTERRUPT, .name = "mca_wkup" }; #ifdef CONFIG_ACPI static struct irqaction mca_cpe_irqaction = { .handler = ia64_mca_cpe_int_handler, - .flags = SA_INTERRUPT, .name = "cpe_hndlr" }; static struct irqaction mca_cpep_irqaction = { .handler = ia64_mca_cpe_int_caller, - .flags = SA_INTERRUPT, .name = "cpe_poll" }; #endif /* CONFIG_ACPI */ @@ -1361,14 +1815,14 @@ format_mca_init_stack(void *mca_data, unsigned long offset, struct task_struct *p = (struct task_struct *)((char *)mca_data + offset); struct thread_info *ti; memset(p, 0, KERNEL_STACK_SIZE); - ti = (struct thread_info *)((char *)p + IA64_TASK_SIZE); + ti = task_thread_info(p); ti->flags = _TIF_MCA_INIT; ti->preempt_count = 1; ti->task = p; ti->cpu = cpu; - p->thread_info = ti; + p->stack = ti; p->state = TASK_UNINTERRUPTIBLE; - __set_bit(cpu, &p->cpus_allowed); + cpu_set(cpu, p->cpus_allowed); INIT_LIST_HEAD(&p->tasks); p->parent = p->real_parent = p->group_leader = p; INIT_LIST_HEAD(&p->children); @@ -1376,41 +1830,45 @@ format_mca_init_stack(void *mca_data, unsigned long offset, strncpy(p->comm, type, sizeof(p->comm)-1); } -/* Do per-CPU MCA-related initialization. */ +/* Caller prevents this from being called after init */ +static void * __init_refok mca_bootmem(void) +{ + return __alloc_bootmem(sizeof(struct ia64_mca_cpu), + KERNEL_STACK_SIZE, 0); +} -void __devinit +/* Do per-CPU MCA-related initialization. */ +void ia64_mca_cpu_init(void *cpu_data) { void *pal_vaddr; - - if (smp_processor_id() == 0) { - void *mca_data; - int cpu; - - mca_data = alloc_bootmem(sizeof(struct ia64_mca_cpu) - * NR_CPUS + KERNEL_STACK_SIZE); - mca_data = (void *)(((unsigned long)mca_data + - KERNEL_STACK_SIZE - 1) & - (-KERNEL_STACK_SIZE)); - for (cpu = 0; cpu < NR_CPUS; cpu++) { - format_mca_init_stack(mca_data, - offsetof(struct ia64_mca_cpu, mca_stack), - "MCA", cpu); - format_mca_init_stack(mca_data, - offsetof(struct ia64_mca_cpu, init_stack), - "INIT", cpu); - __per_cpu_mca[cpu] = __pa(mca_data); - mca_data += sizeof(struct ia64_mca_cpu); - } - } + void *data; + long sz = sizeof(struct ia64_mca_cpu); + int cpu = smp_processor_id(); + static int first_time = 1; /* - * The MCA info structure was allocated earlier and its - * physical address saved in __per_cpu_mca[cpu]. Copy that - * address * to ia64_mca_data so we can access it as a per-CPU - * variable. + * Structure will already be allocated if cpu has been online, + * then offlined. */ - __get_cpu_var(ia64_mca_data) = __per_cpu_mca[smp_processor_id()]; + if (__per_cpu_mca[cpu]) { + data = __va(__per_cpu_mca[cpu]); + } else { + if (first_time) { + data = mca_bootmem(); + first_time = 0; + } else + data = (void *)__get_free_pages(GFP_KERNEL, + get_order(sz)); + if (!data) + panic("Could not allocate MCA memory for cpu %d\n", + cpu); + } + format_mca_init_stack(data, offsetof(struct ia64_mca_cpu, mca_stack), + "MCA", cpu); + format_mca_init_stack(data, offsetof(struct ia64_mca_cpu, init_stack), + "INIT", cpu); + __get_cpu_var(ia64_mca_data) = __per_cpu_mca[cpu] = __pa(data); /* * Stash away a copy of the PTE needed to map the per-CPU page. @@ -1432,6 +1890,36 @@ ia64_mca_cpu_init(void *cpu_data) PAGE_KERNEL)); } +static void ia64_mca_cmc_vector_adjust(void *dummy) +{ + unsigned long flags; + + local_irq_save(flags); + if (!cmc_polling_enabled) + ia64_mca_cmc_vector_enable(NULL); + local_irq_restore(flags); +} + +static int mca_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int hotcpu = (unsigned long) hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + smp_call_function_single(hotcpu, ia64_mca_cmc_vector_adjust, + NULL, 0); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block mca_cpu_notifier = { + .notifier_call = mca_cpu_callback +}; + /* * ia64_mca_init * @@ -1459,11 +1947,15 @@ ia64_mca_init(void) ia64_fptr_t *init_hldlr_ptr_slave = (ia64_fptr_t *)ia64_os_init_dispatch_slave; ia64_fptr_t *mca_hldlr_ptr = (ia64_fptr_t *)ia64_os_mca_dispatch; int i; - s64 rc; + long rc; struct ia64_sal_retval isrv; - u64 timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */ + unsigned long timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */ + static struct notifier_block default_init_monarch_nb = { + .notifier_call = default_monarch_init_process, + .priority = 0/* we need to notified last */ + }; - IA64_MCA_DEBUG("%s: begin\n", __FUNCTION__); + IA64_MCA_DEBUG("%s: begin\n", __func__); /* Clear the Rendez checkin flag for all cpus */ for(i = 0 ; i < NR_CPUS; i++) @@ -1487,6 +1979,7 @@ ia64_mca_init(void) printk(KERN_INFO "Increasing MCA rendezvous timeout from " "%ld to %ld milliseconds\n", timeout, isrv.v0); timeout = isrv.v0; + NOTIFY_MCA(DIE_MCA_NEW_TIMEOUT, NULL, timeout, 0); continue; } printk(KERN_ERR "Failed to register rendezvous interrupt " @@ -1506,7 +1999,7 @@ ia64_mca_init(void) return; } - IA64_MCA_DEBUG("%s: registered MCA rendezvous spinloop and wakeup mech.\n", __FUNCTION__); + IA64_MCA_DEBUG("%s: registered MCA rendezvous spinloop and wakeup mech.\n", __func__); ia64_mc_info.imi_mca_handler = ia64_tpa(mca_hldlr_ptr->fp); /* @@ -1527,7 +2020,7 @@ ia64_mca_init(void) return; } - IA64_MCA_DEBUG("%s: registered OS MCA handler with SAL at 0x%lx, gp = 0x%lx\n", __FUNCTION__, + IA64_MCA_DEBUG("%s: registered OS MCA handler with SAL at 0x%lx, gp = 0x%lx\n", __func__, ia64_mc_info.imi_mca_handler, ia64_tpa(mca_hldlr_ptr->gp)); /* @@ -1539,7 +2032,7 @@ ia64_mca_init(void) ia64_mc_info.imi_slave_init_handler = ia64_tpa(init_hldlr_ptr_slave->fp); ia64_mc_info.imi_slave_init_handler_size = 0; - IA64_MCA_DEBUG("%s: OS INIT handler at %lx\n", __FUNCTION__, + IA64_MCA_DEBUG("%s: OS INIT handler at %lx\n", __func__, ia64_mc_info.imi_monarch_init_handler); /* Register the os init handler with SAL */ @@ -1555,9 +2048,36 @@ ia64_mca_init(void) "(status %ld)\n", rc); return; } + if (register_die_notifier(&default_init_monarch_nb)) { + printk(KERN_ERR "Failed to register default monarch INIT process\n"); + return; + } + + IA64_MCA_DEBUG("%s: registered OS INIT handler with SAL\n", __func__); + + /* Initialize the areas set aside by the OS to buffer the + * platform/processor error states for MCA/INIT/CMC + * handling. + */ + ia64_log_init(SAL_INFO_TYPE_MCA); + ia64_log_init(SAL_INFO_TYPE_INIT); + ia64_log_init(SAL_INFO_TYPE_CMC); + ia64_log_init(SAL_INFO_TYPE_CPE); + + mca_init = 1; + printk(KERN_INFO "MCA related initialization done\n"); +} - IA64_MCA_DEBUG("%s: registered OS INIT handler with SAL\n", __FUNCTION__); +/* + * These pieces cannot be done in ia64_mca_init() because it is called before + * early_irq_init() which would wipe out our percpu irq registrations. But we + * cannot leave them until ia64_mca_late_init() because by then all the other + * processors have been brought online and have set their own CMC vectors to + * point at a non-existant action. Called from arch_early_irq_init(). + */ +void __init ia64_mca_irq_init(void) +{ /* * Configure the CMCI/P vector and handler. Interrupts for CMC are * per-processor, so AP CMC interrupts are setup in smp_callin() (smpboot.c). @@ -1576,18 +2096,6 @@ ia64_mca_init(void) /* Setup the CPEI/P handler */ register_percpu_irq(IA64_CPEP_VECTOR, &mca_cpep_irqaction); #endif - - /* Initialize the areas set aside by the OS to buffer the - * platform/processor error states for MCA/INIT/CMC - * handling. - */ - ia64_log_init(SAL_INFO_TYPE_MCA); - ia64_log_init(SAL_INFO_TYPE_INIT); - ia64_log_init(SAL_INFO_TYPE_CMC); - ia64_log_init(SAL_INFO_TYPE_CPE); - - mca_init = 1; - printk(KERN_INFO "MCA related initialization done\n"); } /* @@ -1606,6 +2114,8 @@ ia64_mca_late_init(void) if (!mca_init) return 0; + register_hotcpu_notifier(&mca_cpu_notifier); + /* Setup the CMCI/P vector and handler */ init_timer(&cmc_poll_timer); cmc_poll_timer.function = ia64_mca_cmc_poll; @@ -1614,7 +2124,7 @@ ia64_mca_late_init(void) cmc_polling_enabled = 0; schedule_work(&cmc_enable_work); - IA64_MCA_DEBUG("%s: CMCI/P setup and enabled.\n", __FUNCTION__); + IA64_MCA_DEBUG("%s: CMCI/P setup and enabled.\n", __func__); #ifdef CONFIG_ACPI /* Setup the CPEI/P vector and handler */ @@ -1623,26 +2133,29 @@ ia64_mca_late_init(void) cpe_poll_timer.function = ia64_mca_cpe_poll; { - irq_desc_t *desc; unsigned int irq; if (cpe_vector >= 0) { /* If platform supports CPEI, enable the irq. */ - cpe_poll_enabled = 0; - for (irq = 0; irq < NR_IRQS; ++irq) - if (irq_to_vector(irq) == cpe_vector) { - desc = irq_descp(irq); - desc->status |= IRQ_PER_CPU; - setup_irq(irq, &mca_cpe_irqaction); - } - ia64_mca_register_cpev(cpe_vector); - IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", __FUNCTION__); - } else { - /* If platform doesn't support CPEI, get the timer going. */ - if (cpe_poll_enabled) { - ia64_mca_cpe_poll(0UL); - IA64_MCA_DEBUG("%s: CPEP setup and enabled.\n", __FUNCTION__); + irq = local_vector_to_irq(cpe_vector); + if (irq > 0) { + cpe_poll_enabled = 0; + irq_set_status_flags(irq, IRQ_PER_CPU); + setup_irq(irq, &mca_cpe_irqaction); + ia64_cpe_irq = irq; + ia64_mca_register_cpev(cpe_vector); + IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", + __func__); + return 0; } + printk(KERN_ERR "%s: Failed to find irq for CPE " + "interrupt handler, vector %d\n", + __func__, cpe_vector); + } + /* If platform doesn't support CPEI, get the timer going. */ + if (cpe_poll_enabled) { + ia64_mca_cpe_poll(0UL); + IA64_MCA_DEBUG("%s: CPEP setup and enabled.\n", __func__); } } #endif diff --git a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S index db32fc1d393..d5bdf9de36b 100644 --- a/arch/ia64/kernel/mca_asm.S +++ b/arch/ia64/kernel/mca_asm.S @@ -1,25 +1,28 @@ -// -// assembly portion of the IA64 MCA handling -// -// Mods by cfleck to integrate into kernel build -// 00/03/15 davidm Added various stop bits to get a clean compile -// -// 00/03/29 cfleck Added code to save INIT handoff state in pt_regs format, switch to temp -// kstack, switch modes, jump to C INIT handler -// -// 02/01/04 J.Hall <jenna.s.hall@intel.com> -// Before entering virtual mode code: -// 1. Check for TLB CPU error -// 2. Restore current thread pointer to kr6 -// 3. Move stack ptr 16 bytes to conform to C calling convention -// -// 04/11/12 Russ Anderson <rja@sgi.com> -// Added per cpu MCA/INIT stack save areas. -// -// 12/08/05 Keith Owens <kaos@sgi.com> -// Use per cpu MCA/INIT stacks for all data. -// -#include <linux/config.h> +/* + * File: mca_asm.S + * Purpose: assembly portion of the IA64 MCA handling + * + * Mods by cfleck to integrate into kernel build + * + * 2000-03-15 David Mosberger-Tang <davidm@hpl.hp.com> + * Added various stop bits to get a clean compile + * + * 2000-03-29 Chuck Fleckenstein <cfleck@co.intel.com> + * Added code to save INIT handoff state in pt_regs format, + * switch to temp kstack, switch modes, jump to C INIT handler + * + * 2002-01-04 J.Hall <jenna.s.hall@intel.com> + * Before entering virtual mode code: + * 1. Check for TLB CPU error + * 2. Restore current thread pointer to kr6 + * 3. Move stack ptr 16 bytes to conform to C calling convention + * + * 2004-11-12 Russ Anderson <rja@sgi.com> + * Added per cpu MCA/INIT stack save areas. + * + * 2005-12-08 Keith Owens <kaos@sgi.com> + * Use per cpu MCA/INIT stacks for all data. + */ #include <linux/threads.h> #include <asm/asmmacro.h> @@ -37,6 +40,7 @@ .global ia64_do_tlb_purge .global ia64_os_mca_dispatch + .global ia64_os_init_on_kdump .global ia64_os_init_dispatch_monarch .global ia64_os_init_dispatch_slave @@ -55,7 +59,7 @@ ia64_do_tlb_purge: #define O(member) IA64_CPUINFO_##member##_OFFSET - GET_THIS_PADDR(r2, cpu_info) // load phys addr of cpu_info into r2 + GET_THIS_PADDR(r2, ia64_cpu_info) // load phys addr of cpu_info into r2 ;; addl r17=O(PTCE_STRIDE),r2 addl r2=O(PTCE_BASE),r2 @@ -102,14 +106,6 @@ ia64_do_tlb_purge: ;; srlz.d ;; - // 2. Purge DTR for PERCPU data. - movl r16=PERCPU_ADDR - mov r18=PERCPU_PAGE_SHIFT<<2 - ;; - ptr.d r16,r18 - ;; - srlz.d - ;; // 3. Purge ITR for PAL code. GET_THIS_PADDR(r2, ia64_mca_pal_base) ;; @@ -142,14 +138,6 @@ ia64_do_tlb_purge: //StartMain//////////////////////////////////////////////////////////////////// ia64_os_mca_dispatch: - // Serialize all MCA processing - mov r3=1;; - LOAD_PHYSICAL(p0,r2,ia64_mca_serialize);; -ia64_os_mca_spin: - xchg4 r4=[r2],r3;; - cmp.ne p6,p0=r4,r0 -(p6) br ia64_os_mca_spin - mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET // use the MCA stack LOAD_PHYSICAL(p0,r2,1f) // return address mov r19=1 // All MCA events are treated as monarch (for now) @@ -159,7 +147,7 @@ ia64_os_mca_spin: GET_IA64_MCA_DATA(r2) // Using MCA stack, struct ia64_sal_os_state, variable proc_state_param ;; - add r3=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SOS_OFFSET+IA64_SAL_OS_STATE_PROC_STATE_PARAM_OFFSET, r2 + add r3=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SOS_OFFSET+SOS(PROC_STATE_PARAM), r2 ;; ld8 r18=[r3] // Get processor state parameter on existing PALE_CHECK. ;; @@ -197,22 +185,6 @@ ia64_reload_tr: srlz.i srlz.d ;; - // 2. Reload DTR register for PERCPU data. - GET_THIS_PADDR(r2, ia64_mca_per_cpu_pte) - ;; - movl r16=PERCPU_ADDR // vaddr - movl r18=PERCPU_PAGE_SHIFT<<2 - ;; - mov cr.itir=r18 - mov cr.ifa=r16 - ;; - ld8 r18=[r2] // load per-CPU PTE - mov r16=IA64_TR_PERCPU_DATA; - ;; - itr.d dtr[r16]=r18 - ;; - srlz.d - ;; // 3. Reload ITR for PAL code. GET_THIS_PADDR(r2, ia64_mca_pal_pte) ;; @@ -248,8 +220,13 @@ ia64_reload_tr: mov r20=IA64_TR_CURRENT_STACK ;; itr.d dtr[r20]=r16 + GET_THIS_PADDR(r2, ia64_mca_tr_reload) + mov r18 = 1 ;; srlz.d + ;; + st8 [r2] =r18 + ;; done_tlb_purge_and_reload: @@ -316,10 +293,6 @@ END(ia64_os_mca_virtual_begin) mov b0=r12 // SAL_CHECK return address - // release lock - LOAD_PHYSICAL(p0,r3,ia64_mca_serialize);; - st4.rel [r3]=r0 - br b0 //EndMain////////////////////////////////////////////////////////////////////// @@ -327,6 +300,25 @@ END(ia64_os_mca_virtual_begin) //StartMain//////////////////////////////////////////////////////////////////// // +// NOP init handler for kdump. In panic situation, we may receive INIT +// while kernel transition. Since we initialize registers on leave from +// current kernel, no longer monarch/slave handlers of current kernel in +// virtual mode are called safely. +// We can unregister these init handlers from SAL, however then the INIT +// will result in warmboot by SAL and we cannot retrieve the crashdump. +// Therefore register this NOP function to SAL, to prevent entering virtual +// mode and resulting warmboot by SAL. +// +ia64_os_init_on_kdump: + mov r8=r0 // IA64_INIT_RESUME + mov r9=r10 // SAL_GP + mov r22=r17 // *minstate + ;; + mov r10=r0 // return to same context + mov b0=r12 // SAL_CHECK return address + br b0 + +// // SAL to OS entry point for INIT on all processors. This has been defined for // registration purposes with SAL as a part of ia64_mca_init. Monarch and // slave INIT have identical processing, except for the value of the @@ -479,9 +471,11 @@ ia64_state_save: st8 [temp2]=r11,16 // rv_rc mov r11=cr.iipa ;; - st8 [temp1]=r18,16 // proc_state_param - st8 [temp2]=r19,16 // monarch + st8 [temp1]=r18 // proc_state_param + st8 [temp2]=r19 // monarch mov r6=IA64_KR(CURRENT) + add temp1=SOS(SAL_RA), regs + add temp2=SOS(SAL_GP), regs ;; st8 [temp1]=r12,16 // sal_ra st8 [temp2]=r10,16 // sal_gp @@ -503,16 +497,18 @@ ia64_state_save: st8 [temp2]=r11,16 // cr.iipa mov r12=cr.iim ;; - st8 [temp1]=r12,16 // cr.iim + st8 [temp1]=r12 // cr.iim (p1) mov r12=IA64_MCA_COLD_BOOT (p2) mov r12=IA64_INIT_WARM_BOOT mov r6=cr.iha + add temp1=SOS(OS_STATUS), regs ;; - st8 [temp2]=r6,16 // cr.iha + st8 [temp2]=r6 // cr.iha + add temp2=SOS(CONTEXT), regs st8 [temp1]=r12 // os_status, default is cold boot mov r6=IA64_MCA_SAME_CONTEXT ;; - st8 [temp1]=r6 // context, default is same context + st8 [temp2]=r6 // context, default is same context // Save the pt_regs data that is not in minstate. The previous code // left regs at sos. @@ -820,14 +816,14 @@ ia64_state_restore: // Restore the SAL to OS state. The previous code left regs at pt_regs. add regs=MCA_SOS_OFFSET-MCA_PT_REGS_OFFSET, regs ;; - add temp1=IA64_SAL_OS_STATE_COMMON_OFFSET, regs - add temp2=IA64_SAL_OS_STATE_COMMON_OFFSET+8, regs + add temp1=SOS(SAL_RA), regs + add temp2=SOS(SAL_GP), regs ;; ld8 r12=[temp1],16 // sal_ra ld8 r9=[temp2],16 // sal_gp ;; ld8 r22=[temp1],16 // pal_min_state, virtual - ld8 r21=[temp2],16 // prev_IA64_KR_CURRENT + ld8 r13=[temp2],16 // prev_IA64_KR_CURRENT ;; ld8 r16=[temp1],16 // prev_IA64_KR_CURRENT_STACK ld8 r20=[temp2],16 // prev_task @@ -842,13 +838,15 @@ ia64_state_restore: ;; mov cr.itir=temp3 mov cr.iipa=temp4 - ld8 temp3=[temp1],16 // cr.iim - ld8 temp4=[temp2],16 // cr.iha + ld8 temp3=[temp1] // cr.iim + ld8 temp4=[temp2] // cr.iha + add temp1=SOS(OS_STATUS), regs + add temp2=SOS(CONTEXT), regs ;; mov cr.iim=temp3 mov cr.iha=temp4 - dep r22=0,r22,62,2 // pal_min_state, physical, uncached - mov IA64_KR(CURRENT)=r21 + dep r22=0,r22,62,1 // pal_min_state, physical, uncached + mov IA64_KR(CURRENT)=r13 ld8 r8=[temp1] // os_status ld8 r10=[temp2] // context @@ -856,7 +854,7 @@ ia64_state_restore: * avoid any dependencies on the algorithm in ia64_switch_to(), just * purge any existing CURRENT_STACK mapping and insert the new one. * - * r16 contains prev_IA64_KR_CURRENT_STACK, r21 contains + * r16 contains prev_IA64_KR_CURRENT_STACK, r13 contains * prev_IA64_KR_CURRENT, these values may have been changed by the C * code. Do not use r8, r9, r10, r22, they contain values ready for * the return to SAL. @@ -873,7 +871,7 @@ ia64_state_restore: ;; srlz.d - extr.u r19=r21,61,3 // r21 = prev_IA64_KR_CURRENT + extr.u r19=r13,61,3 // r13 = prev_IA64_KR_CURRENT shl r20=r16,IA64_GRANULE_SHIFT // r16 = prev_IA64_KR_CURRENT_STACK movl r21=PAGE_KERNEL // page properties ;; @@ -883,7 +881,7 @@ ia64_state_restore: (p6) br.spnt 1f // the dreaded cpu 0 idle task in region 5:( ;; mov cr.itir=r18 - mov cr.ifa=r21 + mov cr.ifa=r13 mov r20=IA64_TR_CURRENT_STACK ;; itr.d dtr[r20]=r21 @@ -916,7 +914,7 @@ ia64_state_restore: ia64_new_stack: add regs=MCA_PT_REGS_OFFSET, r3 - add temp2=MCA_SOS_OFFSET+IA64_SAL_OS_STATE_PAL_MIN_STATE_OFFSET, r3 + add temp2=MCA_SOS_OFFSET+SOS(PAL_MIN_STATE), r3 mov b0=r2 // save return address GET_IA64_MCA_DATA(temp1) invala @@ -1020,18 +1018,13 @@ ia64_old_stack: ia64_set_kernel_registers: add temp3=MCA_SP_OFFSET, r3 - add temp4=MCA_SOS_OFFSET+IA64_SAL_OS_STATE_OS_GP_OFFSET, r3 mov b0=r2 // save return address GET_IA64_MCA_DATA(temp1) ;; - add temp4=temp4, temp1 // &struct ia64_sal_os_state.os_gp add r12=temp1, temp3 // kernel stack pointer on MCA/INIT stack add r13=temp1, r3 // set current to start of MCA/INIT stack add r20=temp1, r3 // physical start of MCA/INIT stack ;; - ld8 r1=[temp4] // OS GP from SAL OS state - ;; - DATA_PA_TO_VA(r1,temp1) DATA_PA_TO_VA(r12,temp2) DATA_PA_TO_VA(r13,temp3) ;; @@ -1062,6 +1055,10 @@ ia64_set_kernel_registers: mov cr.itir=r18 mov cr.ifa=r13 mov r20=IA64_TR_CURRENT_STACK + + movl r17=FPSR_DEFAULT + ;; + mov.m ar.fpsr=r17 // set ar.fpsr to kernel default value ;; itr.d dtr[r20]=r21 ;; @@ -1096,3 +1093,30 @@ GLOBAL_ENTRY(ia64_get_rnat) mov ar.rsc=3 br.ret.sptk.many rp END(ia64_get_rnat) + + +// void ia64_set_psr_mc(void) +// +// Set psr.mc bit to mask MCA/INIT. +GLOBAL_ENTRY(ia64_set_psr_mc) + rsm psr.i | psr.ic // disable interrupts + ;; + srlz.d + ;; + mov r14 = psr // get psr{36:35,31:0} + movl r15 = 1f + ;; + dep r14 = -1, r14, PSR_MC, 1 // set psr.mc + ;; + dep r14 = -1, r14, PSR_IC, 1 // set psr.ic + ;; + dep r14 = -1, r14, PSR_BN, 1 // keep bank1 in use + ;; + mov cr.ipsr = r14 + mov cr.ifs = r0 + mov cr.iip = r15 + ;; + rfi +1: + br.ret.sptk.many rp +END(ia64_set_psr_mc) diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c index f081c60ab20..94f8bf777af 100644 --- a/arch/ia64/kernel/mca_drv.c +++ b/arch/ia64/kernel/mca_drv.c @@ -3,18 +3,17 @@ * Purpose: Generic MCA handling layer * * Copyright (C) 2004 FUJITSU LIMITED - * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com) + * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> * Copyright (C) 2005 Silicon Graphics, Inc * Copyright (C) 2005 Keith Owens <kaos@sgi.com> + * Copyright (C) 2006 Russ Anderson <rja@sgi.com> */ -#include <linux/config.h> #include <linux/types.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/kallsyms.h> -#include <linux/smp_lock.h> #include <linux/bootmem.h> #include <linux/acpi.h> #include <linux/timer.h> @@ -23,12 +22,12 @@ #include <linux/smp.h> #include <linux/workqueue.h> #include <linux/mm.h> +#include <linux/slab.h> #include <asm/delay.h> #include <asm/machvec.h> #include <asm/page.h> #include <asm/ptrace.h> -#include <asm/system.h> #include <asm/sal.h> #include <asm/mca.h> @@ -61,6 +60,11 @@ typedef enum { ISOLATE_NONE } isolate_status_t; +typedef enum { + MCA_NOT_RECOVERED = 0, + MCA_RECOVERED = 1 +} recovery_status_t; + /* * This pool keeps pointers to the section part of SAL error record */ @@ -70,6 +74,34 @@ static struct { int max_idx; /* Maximum index of section pointer list pool */ } slidx_pool; +static int +fatal_mca(const char *fmt, ...) +{ + va_list args; + char buf[256]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + ia64_mca_printk(KERN_ALERT "MCA: %s\n", buf); + + return MCA_NOT_RECOVERED; +} + +static int +mca_recovered(const char *fmt, ...) +{ + va_list args; + char buf[256]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + ia64_mca_printk(KERN_INFO "MCA: %s\n", buf); + + return MCA_RECOVERED; +} + /** * mca_page_isolate - isolate a poisoned page in order not to use it later * @paddr: poisoned memory location @@ -88,7 +120,7 @@ mca_page_isolate(unsigned long paddr) if (!ia64_phys_addr_valid(paddr)) return ISOLATE_NONE; - if (!pfn_valid(paddr)) + if (!pfn_valid(paddr >> PAGE_SHIFT)) return ISOLATE_NONE; /* convert physical address to physical page number */ @@ -108,6 +140,7 @@ mca_page_isolate(unsigned long paddr) return ISOLATE_NG; /* add attribute 'Reserved' and register the page */ + get_page(p); SetPageReserved(p); page_isolate[num_page_isolate++] = p; @@ -120,10 +153,14 @@ mca_page_isolate(unsigned long paddr) */ void -mca_handler_bh(unsigned long paddr) +mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr) { - printk(KERN_DEBUG "OS_MCA: process [pid: %d](%s) encounters MCA.\n", - current->pid, current->comm); + ia64_mlogbuf_dump(); + printk(KERN_ERR "OS_MCA: process [cpu %d, pid: %d, uid: %d, " + "iip: %p, psr: 0x%lx,paddr: 0x%lx](%s) encounters MCA.\n", + raw_smp_processor_id(), current->pid, + from_kuid(&init_user_ns, current_uid()), + iip, ipsr, paddr, current->comm); spin_lock(&mca_bh_lock); switch (mca_page_isolate(paddr)) { @@ -131,7 +168,7 @@ mca_handler_bh(unsigned long paddr) printk(KERN_DEBUG "Page isolation: ( %lx ) success.\n", paddr); break; case ISOLATE_NG: - printk(KERN_DEBUG "Page isolation: ( %lx ) failure.\n", paddr); + printk(KERN_CRIT "Page isolation: ( %lx ) failure.\n", paddr); break; default: break; @@ -312,7 +349,7 @@ init_record_index_pools(void) /* - 3 - */ slidx_pool.max_idx = (rec_max_size/sect_min_size) * 2 + 1; - slidx_pool.buffer = (slidx_list_t *) + slidx_pool.buffer = kmalloc(slidx_pool.max_idx * sizeof(slidx_list_t), GFP_KERNEL); return slidx_pool.buffer ? 0 : -ENOMEM; @@ -398,6 +435,50 @@ is_mca_global(peidx_table_t *peidx, pal_bus_check_info_t *pbci, } /** + * get_target_identifier - Get the valid Cache or Bus check target identifier. + * @peidx: pointer of index of processor error section + * + * Return value: + * target address on Success / 0 on Failure + */ +static u64 +get_target_identifier(peidx_table_t *peidx) +{ + u64 target_address = 0; + sal_log_mod_error_info_t *smei; + pal_cache_check_info_t *pcci; + int i, level = 9; + + /* + * Look through the cache checks for a valid target identifier + * If more than one valid target identifier, return the one + * with the lowest cache level. + */ + for (i = 0; i < peidx_cache_check_num(peidx); i++) { + smei = (sal_log_mod_error_info_t *)peidx_cache_check(peidx, i); + if (smei->valid.target_identifier && smei->target_identifier) { + pcci = (pal_cache_check_info_t *)&(smei->check_info); + if (!target_address || (pcci->level < level)) { + target_address = smei->target_identifier; + level = pcci->level; + continue; + } + } + } + if (target_address) + return target_address; + + /* + * Look at the bus check for a valid target identifier + */ + smei = peidx_bus_check(peidx, 0); + if (smei && smei->valid.target_identifier) + return smei->target_identifier; + + return 0; +} + +/** * recover_from_read_error - Try to recover the errors which type are "read"s. * @slidx: pointer of index of SAL error record * @peidx: pointer of index of processor error section @@ -413,14 +494,15 @@ recover_from_read_error(slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci, struct ia64_sal_os_state *sos) { - sal_log_mod_error_info_t *smei; + u64 target_identifier; pal_min_state_area_t *pmsa; struct ia64_psr *psr1, *psr2; ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook; /* Is target address valid? */ - if (!pbci->tv) - return 0; + target_identifier = get_target_identifier(peidx); + if (!target_identifier) + return fatal_mca("target address not valid"); /* * cpu read or memory-mapped io read @@ -436,39 +518,46 @@ recover_from_read_error(slidx_table_t *slidx, * the process not have any locks of kernel. */ + /* Is minstate valid? */ + if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate)) + return fatal_mca("minstate not valid"); psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr); + psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr); /* * Check the privilege level of interrupted context. * If it is user-mode, then terminate affected process. */ - if (psr1->cpl != 0) { - smei = peidx_bus_check(peidx, 0); - if (smei->valid.target_identifier) { - /* - * setup for resume to bottom half of MCA, - * "mca_handler_bhhook" - */ - pmsa = sos->pal_min_state; - /* pass to bhhook as 1st argument (gr8) */ - pmsa->pmsa_gr[8-1] = smei->target_identifier; - /* set interrupted return address (but no use) */ - pmsa->pmsa_br0 = pmsa->pmsa_iip; - /* change resume address to bottom half */ - pmsa->pmsa_iip = mca_hdlr_bh->fp; - pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp; - /* set cpl with kernel mode */ - psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr; - psr2->cpl = 0; - psr2->ri = 0; - psr2->i = 0; - - return 1; - } + pmsa = sos->pal_min_state; + if (psr1->cpl != 0 || + ((psr2->cpl != 0) && mca_recover_range(pmsa->pmsa_iip))) { + /* + * setup for resume to bottom half of MCA, + * "mca_handler_bhhook" + */ + /* pass to bhhook as argument (gr8, ...) */ + pmsa->pmsa_gr[8-1] = target_identifier; + pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip; + pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr; + /* set interrupted return address (but no use) */ + pmsa->pmsa_br0 = pmsa->pmsa_iip; + /* change resume address to bottom half */ + pmsa->pmsa_iip = mca_hdlr_bh->fp; + pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp; + /* set cpl with kernel mode */ + psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr; + psr2->cpl = 0; + psr2->ri = 0; + psr2->bn = 1; + psr2->i = 0; + + return mca_recovered("user memory corruption. " + "kill affected process - recovered."); } - return 0; + return fatal_mca("kernel context not recovered, iip 0x%lx\n", + pmsa->pmsa_iip); } /** @@ -513,11 +602,40 @@ recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx, default: break; } + } else if (psp->cc && !psp->bc) { /* Cache error */ + status = recover_from_read_error(slidx, peidx, pbci, sos); } return status; } +/* + * recover_from_tlb_check + * @peidx: pointer of index of processor error section + * + * Return value: + * 1 on Success / 0 on Failure + */ +static int +recover_from_tlb_check(peidx_table_t *peidx) +{ + sal_log_mod_error_info_t *smei; + pal_tlb_check_info_t *ptci; + + smei = (sal_log_mod_error_info_t *)peidx_tlb_check(peidx, 0); + ptci = (pal_tlb_check_info_t *)&(smei->check_info); + + /* + * Look for signature of a duplicate TLB DTC entry, which is + * a SW bug and always fatal. + */ + if (ptci->op == PAL_TLB_CHECK_OP_PURGE + && !(ptci->itr || ptci->dtc || ptci->itc)) + return fatal_mca("Duplicate TLB entry"); + + return mca_recovered("TLB check recovered"); +} + /** * recover_from_processor_error * @platform: whether there are some platform error section or not @@ -529,13 +647,6 @@ recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx, * Return value: * 1 on Success / 0 on Failure */ -/* - * Later we try to recover when below all conditions are satisfied. - * 1. Only one processor error section is exist. - * 2. BUS_CHECK is exist and the others are not exist.(Except TLB_CHECK) - * 3. The entry of BUS_CHECK_INFO is 1. - * 4. "External bus error" flag is set and the others are not set. - */ static int recover_from_processor_error(int platform, slidx_table_t *slidx, @@ -546,43 +657,60 @@ recover_from_processor_error(int platform, slidx_table_t *slidx, (pal_processor_state_info_t*)peidx_psp(peidx); /* - * We cannot recover errors with other than bus_check. + * Processor recovery status must key off of the PAL recovery + * status in the Processor State Parameter. */ - if (psp->cc || psp->rc || psp->uc) - return 0; /* - * If there is no bus error, record is weird but we need not to recover. + * The machine check is corrected. */ - if (psp->bc == 0 || pbci == NULL) - return 1; + if (psp->cm == 1) + return mca_recovered("machine check is already corrected."); /* - * Sorry, we cannot handle so many. + * The error was not contained. Software must be reset. */ - if (peidx_bus_check_num(peidx) > 1) - return 0; + if (psp->us || psp->ci == 0) + return fatal_mca("error not contained"); + /* - * Well, here is only one bus error. + * Look for recoverable TLB check */ - if (pbci->ib || pbci->cc) - return 0; + if (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc)) + return recover_from_tlb_check(peidx); + + /* + * The cache check and bus check bits have four possible states + * cc bc + * 1 1 Memory error, attempt recovery + * 1 0 Cache error, attempt recovery + * 0 1 I/O error, attempt recovery + * 0 0 Other error type, not recovered + */ + if (psp->cc == 0 && (psp->bc == 0 || pbci == NULL)) + return fatal_mca("No cache or bus check"); + + /* + * Cannot handle more than one bus check. + */ + if (peidx_bus_check_num(peidx) > 1) + return fatal_mca("Too many bus checks"); + + if (pbci->ib) + return fatal_mca("Internal Bus error"); if (pbci->eb && pbci->bsi > 0) - return 0; - if (psp->ci == 0) - return 0; + return fatal_mca("External bus check fatal status"); /* - * This is a local MCA and estimated as recoverble external bus error. - * (e.g. a load from poisoned memory) - * This means "there are some platform errors". + * This is a local MCA and estimated as a recoverable error. */ if (platform) return recover_from_platform_error(slidx, peidx, pbci, sos); + /* * On account of strange SAL error record, we cannot recover. */ - return 0; + return fatal_mca("Strange SAL record"); } /** @@ -611,12 +739,10 @@ mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos) /* Now, OS can recover when there is one processor error section */ if (n_proc_err > 1) - return 0; - else if (n_proc_err == 0) { - /* Weird SAL record ... We need not to recover */ - - return 1; - } + return fatal_mca("Too Many Errors"); + else if (n_proc_err == 0) + /* Weird SAL record ... We can't do anything */ + return fatal_mca("Weird SAL record"); /* Make index of processor error section */ mca_make_peidx((sal_log_processor_info_t*) @@ -627,7 +753,7 @@ mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos) /* Check whether MCA is global or not */ if (is_mca_global(&peidx, &pbci, sos)) - return 0; + return fatal_mca("global MCA"); /* Try to recover a processor error */ return recover_from_processor_error(platform_err, &slidx, &peidx, diff --git a/arch/ia64/kernel/mca_drv.h b/arch/ia64/kernel/mca_drv.h index e2f6fa1e0ef..53b8ecb5b4b 100644 --- a/arch/ia64/kernel/mca_drv.h +++ b/arch/ia64/kernel/mca_drv.h @@ -3,7 +3,7 @@ * Purpose: Define helpers for Generic MCA handling * * Copyright (C) 2004 FUJITSU LIMITED - * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com) + * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> */ /* * Processor error section: @@ -111,3 +111,12 @@ typedef struct slidx_table { slidx_foreach_entry(__pos, &((slidx)->sec)) { __count++; }\ __count; }) +struct mca_table_entry { + int start_addr; /* location-relative starting address of MCA recoverable range */ + int end_addr; /* location-relative ending address of MCA recoverable range */ +}; + +extern const struct mca_table_entry *search_mca_tables (unsigned long addr); +extern int mca_recover_range(unsigned long); +extern void ia64_mlogbuf_dump(void); + diff --git a/arch/ia64/kernel/mca_drv_asm.S b/arch/ia64/kernel/mca_drv_asm.S index 3f298ee4d00..767ac2c20d1 100644 --- a/arch/ia64/kernel/mca_drv_asm.S +++ b/arch/ia64/kernel/mca_drv_asm.S @@ -3,9 +3,8 @@ * Purpose: Assembly portion of Generic MCA handling * * Copyright (C) 2004 FUJITSU LIMITED - * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com) + * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> */ -#include <linux/config.h> #include <linux/threads.h> #include <asm/asmmacro.h> @@ -14,15 +13,12 @@ GLOBAL_ENTRY(mca_handler_bhhook) invala // clear RSE ? - ;; cover ;; clrrrb ;; - alloc r16=ar.pfs,0,2,1,0 // make a new frame - ;; + alloc r16=ar.pfs,0,2,3,0 // make a new frame mov ar.rsc=0 - ;; mov r13=IA64_KR(CURRENT) // current task pointer ;; mov r2=r13 @@ -30,7 +26,6 @@ GLOBAL_ENTRY(mca_handler_bhhook) addl r22=IA64_RBS_OFFSET,r2 ;; mov ar.bspstore=r22 - ;; addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 ;; adds r2=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 @@ -40,12 +35,16 @@ GLOBAL_ENTRY(mca_handler_bhhook) movl loc1=mca_handler_bh // recovery C function ;; mov out0=r8 // poisoned address + mov out1=r9 // iip + mov out2=r10 // psr mov b6=loc1 ;; mov loc1=rp + ssm psr.ic ;; - ssm psr.i + srlz.i ;; + ssm psr.i br.call.sptk.many rp=b6 // does not return ... ;; mov ar.pfs=loc0 @@ -53,5 +52,4 @@ GLOBAL_ENTRY(mca_handler_bhhook) ;; mov r8=r0 br.ret.sptk.many rp - ;; END(mca_handler_bhhook) diff --git a/arch/ia64/kernel/minstate.h b/arch/ia64/kernel/minstate.h index 85ed54179af..cc82a7d744c 100644 --- a/arch/ia64/kernel/minstate.h +++ b/arch/ia64/kernel/minstate.h @@ -1,8 +1,23 @@ -#include <linux/config.h> #include <asm/cache.h> #include "entry.h" +#include "paravirt_inst.h" + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +/* read ar.itc in advance, and use it before leaving bank 0 */ +#define ACCOUNT_GET_STAMP \ +(pUStk) mov.m r20=ar.itc; +#define ACCOUNT_SYS_ENTER \ +(pUStk) br.call.spnt rp=account_sys_enter \ + ;; +#else +#define ACCOUNT_GET_STAMP +#define ACCOUNT_SYS_ENTER +#endif + +.section ".data..patch.rse", "a" +.previous /* * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves @@ -29,16 +44,16 @@ * Note that psr.ic is NOT turned on by this macro. This is so that * we can pass interruption state as arguments to a handler. */ -#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA) \ +#define IA64_NATIVE_DO_SAVE_MIN(__COVER,SAVE_IFS,EXTRA,WORKAROUND) \ mov r16=IA64_KR(CURRENT); /* M */ \ mov r27=ar.rsc; /* M */ \ mov r20=r1; /* A */ \ mov r25=ar.unat; /* M */ \ - mov r29=cr.ipsr; /* M */ \ + MOV_FROM_IPSR(p0,r29); /* M */ \ mov r26=ar.pfs; /* I */ \ - mov r28=cr.iip; /* M */ \ + MOV_FROM_IIP(r28); /* M */ \ mov r21=ar.fpsr; /* M */ \ - COVER; /* B;; (or nothing) */ \ + __COVER; /* B;; (or nothing) */ \ ;; \ adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16; \ ;; \ @@ -76,6 +91,7 @@ tbit.nz p15,p0=r29,IA64_PSR_I_BIT; \ mov r29=b0 \ ;; \ + WORKAROUND; \ adds r16=PT(R8),r1; /* initialize first base pointer */ \ adds r17=PT(R9),r1; /* initialize second base pointer */ \ (pKStk) mov r18=r0; /* make sure r18 isn't NaT */ \ @@ -123,11 +139,13 @@ ;; \ .mem.offset 0,0; st8.spill [r16]=r2,16; \ .mem.offset 8,0; st8.spill [r17]=r3,16; \ + ACCOUNT_GET_STAMP \ adds r2=IA64_PT_REGS_R16_OFFSET,r1; \ ;; \ EXTRA; \ movl r1=__gp; /* establish kernel global pointer */ \ ;; \ + ACCOUNT_SYS_ENTER \ bsw.1; /* switch back to bank 1 (must be last in insn group) */ \ ;; @@ -193,6 +211,40 @@ st8 [r25]=r10; /* ar.ssd */ \ ;; -#define SAVE_MIN_WITH_COVER DO_SAVE_MIN(cover, mov r30=cr.ifs,) -#define SAVE_MIN_WITH_COVER_R19 DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19) -#define SAVE_MIN DO_SAVE_MIN( , mov r30=r0, ) +#define RSE_WORKAROUND \ +(pUStk) extr.u r17=r18,3,6; \ +(pUStk) sub r16=r18,r22; \ +[1:](pKStk) br.cond.sptk.many 1f; \ + .xdata4 ".data..patch.rse",1b-. \ + ;; \ + cmp.ge p6,p7 = 33,r17; \ + ;; \ +(p6) mov r17=0x310; \ +(p7) mov r17=0x308; \ + ;; \ + cmp.leu p1,p0=r16,r17; \ +(p1) br.cond.sptk.many 1f; \ + dep.z r17=r26,0,62; \ + movl r16=2f; \ + ;; \ + mov ar.pfs=r17; \ + dep r27=r0,r27,16,14; \ + mov b0=r16; \ + ;; \ + br.ret.sptk b0; \ + ;; \ +2: \ + mov ar.rsc=r0 \ + ;; \ + flushrs; \ + ;; \ + mov ar.bspstore=r22 \ + ;; \ + mov r18=ar.bsp; \ + ;; \ +1: \ + .pred.rel "mutex", pKStk, pUStk + +#define SAVE_MIN_WITH_COVER DO_SAVE_MIN(COVER, mov r30=cr.ifs, , RSE_WORKAROUND) +#define SAVE_MIN_WITH_COVER_R19 DO_SAVE_MIN(COVER, mov r30=cr.ifs, mov r15=r19, RSE_WORKAROUND) +#define SAVE_MIN DO_SAVE_MIN( , mov r30=r0, , ) diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index 7a2f0a798d1..24603be24c1 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -25,7 +25,6 @@ SEGREL64LSB */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -136,15 +135,6 @@ static const char *reloc_name[256] = { #undef N -struct got_entry { - uint64_t val; -}; - -struct fdesc { - uint64_t ip; - uint64_t gp; -}; - /* Opaque struct for insns, to protect against derefs. */ struct insn; @@ -181,7 +171,8 @@ apply_imm60 (struct module *mod, struct insn *insn, uint64_t val) return 0; } if (val + ((uint64_t) 1 << 59) >= (1UL << 60)) { - printk(KERN_ERR "%s: value %ld out of IMM60 range\n", mod->name, (int64_t) val); + printk(KERN_ERR "%s: value %ld out of IMM60 range\n", + mod->name, (long) val); return 0; } ia64_patch_imm60((u64) insn, val); @@ -192,7 +183,8 @@ static int apply_imm22 (struct module *mod, struct insn *insn, uint64_t val) { if (val + (1 << 21) >= (1 << 22)) { - printk(KERN_ERR "%s: value %li out of IMM22 range\n", mod->name, (int64_t)val); + printk(KERN_ERR "%s: value %li out of IMM22 range\n", + mod->name, (long)val); return 0; } ia64_patch((u64) insn, 0x01fffcfe000UL, ( ((val & 0x200000UL) << 15) /* bit 21 -> 36 */ @@ -206,7 +198,8 @@ static int apply_imm21b (struct module *mod, struct insn *insn, uint64_t val) { if (val + (1 << 20) >= (1 << 21)) { - printk(KERN_ERR "%s: value %li out of IMM21b range\n", mod->name, (int64_t)val); + printk(KERN_ERR "%s: value %li out of IMM21b range\n", + mod->name, (long)val); return 0; } ia64_patch((u64) insn, 0x11ffffe000UL, ( ((val & 0x100000UL) << 16) /* bit 20 -> 36 */ @@ -311,18 +304,11 @@ plt_target (struct plt_entry *plt) #endif /* !USE_BRL */ -void * -module_alloc (unsigned long size) -{ - if (!size) - return NULL; - return vmalloc(size); -} - void module_free (struct module *mod, void *module_region) { - if (mod->arch.init_unw_table && module_region == mod->module_init) { + if (mod && mod->arch.init_unw_table && + module_region == mod->module_init) { unw_remove_unwind_table(mod->arch.init_unw_table); mod->arch.init_unw_table = NULL; } @@ -455,6 +441,14 @@ module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, mod->arch.opd = s; else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0) mod->arch.unwind = s; +#ifdef CONFIG_PARAVIRT + else if (strcmp(".paravirt_bundles", + secstrings + s->sh_name) == 0) + mod->arch.paravirt_bundles = s; + else if (strcmp(".paravirt_insts", + secstrings + s->sh_name) == 0) + mod->arch.paravirt_insts = s; +#endif if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) { printk(KERN_ERR "%s: sections missing\n", mod->name); @@ -494,7 +488,7 @@ module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, mod->arch.opd->sh_addralign = 8; mod->arch.opd->sh_size = fdescs * sizeof(struct fdesc); DEBUGP("%s: core.plt=%lx, init.plt=%lx, got=%lx, fdesc=%lx\n", - __FUNCTION__, mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size, + __func__, mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size, mod->arch.got->sh_size, mod->arch.opd->sh_size); return 0; } @@ -534,8 +528,7 @@ get_ltoff (struct module *mod, uint64_t value, int *okp) goto found; /* Not enough GOT entries? */ - if (e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size)) - BUG(); + BUG_ON(e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size)); e->val = value; ++mod->arch.next_got_entry; @@ -586,7 +579,7 @@ get_plt (struct module *mod, const struct insn *insn, uint64_t value, int *okp) #if ARCH_MODULE_DEBUG if (plt_target(plt) != target_ip) { printk("%s: mistargeted PLT: wanted %lx, got %lx\n", - __FUNCTION__, target_ip, plt_target(plt)); + __func__, target_ip, plt_target(plt)); *okp = 0; return 0; } @@ -703,8 +696,9 @@ do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend, case RV_PCREL2: if (r_type == R_IA64_PCREL21BI) { if (!is_internal(mod, val)) { - printk(KERN_ERR "%s: %s reloc against non-local symbol (%lx)\n", - __FUNCTION__, reloc_name[r_type], val); + printk(KERN_ERR "%s: %s reloc against " + "non-local symbol (%lx)\n", __func__, + reloc_name[r_type], (unsigned long)val); return -ENOEXEC; } format = RF_INSN21B; @@ -738,7 +732,7 @@ do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend, case R_IA64_LDXMOV: if (gp_addressable(mod, val)) { /* turn "ld8" into "mov": */ - DEBUGP("%s: patching ld8 at %p to mov\n", __FUNCTION__, location); + DEBUGP("%s: patching ld8 at %p to mov\n", __func__, location); ia64_patch((u64) location, 0x1fff80fe000UL, 0x10000000000UL); } return 0; @@ -772,7 +766,7 @@ do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend, if (!ok) return -ENOEXEC; - DEBUGP("%s: [%p]<-%016lx = %s(%lx)\n", __FUNCTION__, location, val, + DEBUGP("%s: [%p]<-%016lx = %s(%lx)\n", __func__, location, val, reloc_name[r_type] ? reloc_name[r_type] : "?", sym->st_value + addend); switch (format) { @@ -808,7 +802,7 @@ apply_relocate_add (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symind Elf64_Shdr *target_sec; int ret; - DEBUGP("%s: applying section %u (%u relocs) to %u\n", __FUNCTION__, + DEBUGP("%s: applying section %u (%u relocs) to %u\n", __func__, relsec, n, sechdrs[relsec].sh_info); target_sec = sechdrs + sechdrs[relsec].sh_info; @@ -836,7 +830,7 @@ apply_relocate_add (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symind gp = mod->core_size / 2; gp = (uint64_t) mod->module_core + ((gp + 7) & -8); mod->arch.gp = gp; - DEBUGP("%s: placing gp at 0x%lx\n", __FUNCTION__, gp); + DEBUGP("%s: placing gp at 0x%lx\n", __func__, gp); } for (i = 0; i < n; i++) { @@ -851,18 +845,10 @@ apply_relocate_add (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symind return 0; } -int -apply_relocate (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, - unsigned int relsec, struct module *mod) -{ - printk(KERN_ERR "module %s: REL relocs in section %u unsupported\n", mod->name, relsec); - return -ENOEXEC; -} - /* * Modules contain a single unwind table which covers both the core and the init text * sections but since the two are not contiguous, we need to split this table up such that - * we can register (and unregister) each "segment" seperately. Fortunately, this sounds + * we can register (and unregister) each "segment" separately. Fortunately, this sounds * more complicated than it really is. */ static void @@ -904,7 +890,7 @@ register_unwind_table (struct module *mod) init = start + num_core; } - DEBUGP("%s: name=%s, gp=%lx, num_init=%lu, num_core=%lu\n", __FUNCTION__, + DEBUGP("%s: name=%s, gp=%lx, num_init=%lu, num_core=%lu\n", __func__, mod->name, mod->arch.gp, num_init, num_core); /* @@ -913,13 +899,13 @@ register_unwind_table (struct module *mod) if (num_core > 0) { mod->arch.core_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp, core, core + num_core); - DEBUGP("%s: core: handle=%p [%p-%p)\n", __FUNCTION__, + DEBUGP("%s: core: handle=%p [%p-%p)\n", __func__, mod->arch.core_unw_table, core, core + num_core); } if (num_init > 0) { mod->arch.init_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp, init, init + num_init); - DEBUGP("%s: init: handle=%p [%p-%p)\n", __FUNCTION__, + DEBUGP("%s: init: handle=%p [%p-%p)\n", __func__, mod->arch.init_unw_table, init, init + num_init); } } @@ -927,9 +913,33 @@ register_unwind_table (struct module *mod) int module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) { - DEBUGP("%s: init: entry=%p\n", __FUNCTION__, mod->init); + DEBUGP("%s: init: entry=%p\n", __func__, mod->init); if (mod->arch.unwind) register_unwind_table(mod); +#ifdef CONFIG_PARAVIRT + if (mod->arch.paravirt_bundles) { + struct paravirt_patch_site_bundle *start = + (struct paravirt_patch_site_bundle *) + mod->arch.paravirt_bundles->sh_addr; + struct paravirt_patch_site_bundle *end = + (struct paravirt_patch_site_bundle *) + (mod->arch.paravirt_bundles->sh_addr + + mod->arch.paravirt_bundles->sh_size); + + paravirt_patch_apply_bundle(start, end); + } + if (mod->arch.paravirt_insts) { + struct paravirt_patch_site_inst *start = + (struct paravirt_patch_site_inst *) + mod->arch.paravirt_insts->sh_addr; + struct paravirt_patch_site_inst *end = + (struct paravirt_patch_site_inst *) + (mod->arch.paravirt_insts->sh_addr + + mod->arch.paravirt_insts->sh_size); + + paravirt_patch_apply_inst(start, end); + } +#endif return 0; } @@ -941,14 +951,3 @@ module_arch_cleanup (struct module *mod) if (mod->arch.core_unw_table) unw_remove_unwind_table(mod->arch.core_unw_table); } - -#ifdef CONFIG_SMP -void -percpu_modcopy (void *pcpudst, const void *src, unsigned long size) -{ - unsigned int i; - for_each_cpu(i) { - memcpy(pcpudst + __per_cpu_offset[i], src, size); - } -} -#endif /* CONFIG_SMP */ diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c new file mode 100644 index 00000000000..c430f9198d1 --- /dev/null +++ b/arch/ia64/kernel/msi_ia64.c @@ -0,0 +1,208 @@ +/* + * MSI hooks for standard x86 apic + */ + +#include <linux/pci.h> +#include <linux/irq.h> +#include <linux/msi.h> +#include <linux/dmar.h> +#include <asm/smp.h> +#include <asm/msidef.h> + +static struct irq_chip ia64_msi_chip; + +#ifdef CONFIG_SMP +static int ia64_set_msi_irq_affinity(struct irq_data *idata, + const cpumask_t *cpu_mask, bool force) +{ + struct msi_msg msg; + u32 addr, data; + int cpu = cpumask_first_and(cpu_mask, cpu_online_mask); + unsigned int irq = idata->irq; + + if (irq_prepare_move(irq, cpu)) + return -1; + + get_cached_msi_msg(irq, &msg); + + addr = msg.address_lo; + addr &= MSI_ADDR_DEST_ID_MASK; + addr |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu)); + msg.address_lo = addr; + + data = msg.data; + data &= MSI_DATA_VECTOR_MASK; + data |= MSI_DATA_VECTOR(irq_to_vector(irq)); + msg.data = data; + + write_msi_msg(irq, &msg); + cpumask_copy(idata->affinity, cpumask_of(cpu)); + + return 0; +} +#endif /* CONFIG_SMP */ + +int ia64_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc) +{ + struct msi_msg msg; + unsigned long dest_phys_id; + int irq, vector; + cpumask_t mask; + + irq = create_irq(); + if (irq < 0) + return irq; + + irq_set_msi_desc(irq, desc); + cpumask_and(&mask, &(irq_to_domain(irq)), cpu_online_mask); + dest_phys_id = cpu_physical_id(first_cpu(mask)); + vector = irq_to_vector(irq); + + msg.address_hi = 0; + msg.address_lo = + MSI_ADDR_HEADER | + MSI_ADDR_DEST_MODE_PHYS | + MSI_ADDR_REDIRECTION_CPU | + MSI_ADDR_DEST_ID_CPU(dest_phys_id); + + msg.data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + MSI_DATA_DELIVERY_FIXED | + MSI_DATA_VECTOR(vector); + + write_msi_msg(irq, &msg); + irq_set_chip_and_handler(irq, &ia64_msi_chip, handle_edge_irq); + + return 0; +} + +void ia64_teardown_msi_irq(unsigned int irq) +{ + destroy_irq(irq); +} + +static void ia64_ack_msi_irq(struct irq_data *data) +{ + irq_complete_move(data->irq); + irq_move_irq(data); + ia64_eoi(); +} + +static int ia64_msi_retrigger_irq(struct irq_data *data) +{ + unsigned int vector = irq_to_vector(data->irq); + ia64_resend_irq(vector); + + return 1; +} + +/* + * Generic ops used on most IA64 platforms. + */ +static struct irq_chip ia64_msi_chip = { + .name = "PCI-MSI", + .irq_mask = mask_msi_irq, + .irq_unmask = unmask_msi_irq, + .irq_ack = ia64_ack_msi_irq, +#ifdef CONFIG_SMP + .irq_set_affinity = ia64_set_msi_irq_affinity, +#endif + .irq_retrigger = ia64_msi_retrigger_irq, +}; + + +int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc) +{ + if (platform_setup_msi_irq) + return platform_setup_msi_irq(pdev, desc); + + return ia64_setup_msi_irq(pdev, desc); +} + +void arch_teardown_msi_irq(unsigned int irq) +{ + if (platform_teardown_msi_irq) + return platform_teardown_msi_irq(irq); + + return ia64_teardown_msi_irq(irq); +} + +#ifdef CONFIG_INTEL_IOMMU +#ifdef CONFIG_SMP +static int dmar_msi_set_affinity(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + unsigned int irq = data->irq; + struct irq_cfg *cfg = irq_cfg + irq; + struct msi_msg msg; + int cpu = cpumask_first_and(mask, cpu_online_mask); + + if (irq_prepare_move(irq, cpu)) + return -1; + + dmar_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu)); + + dmar_msi_write(irq, &msg); + cpumask_copy(data->affinity, mask); + + return 0; +} +#endif /* CONFIG_SMP */ + +static struct irq_chip dmar_msi_type = { + .name = "DMAR_MSI", + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, + .irq_ack = ia64_ack_msi_irq, +#ifdef CONFIG_SMP + .irq_set_affinity = dmar_msi_set_affinity, +#endif + .irq_retrigger = ia64_msi_retrigger_irq, +}; + +static int +msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +{ + struct irq_cfg *cfg = irq_cfg + irq; + unsigned dest; + cpumask_t mask; + + cpumask_and(&mask, &(irq_to_domain(irq)), cpu_online_mask); + dest = cpu_physical_id(first_cpu(mask)); + + msg->address_hi = 0; + msg->address_lo = + MSI_ADDR_HEADER | + MSI_ADDR_DEST_MODE_PHYS | + MSI_ADDR_REDIRECTION_CPU | + MSI_ADDR_DEST_ID_CPU(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + MSI_DATA_DELIVERY_FIXED | + MSI_DATA_VECTOR(cfg->vector); + return 0; +} + +int arch_setup_dmar_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg); + if (ret < 0) + return ret; + dmar_msi_write(irq, &msg); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif /* CONFIG_INTEL_IOMMU */ + diff --git a/arch/ia64/kernel/nr-irqs.c b/arch/ia64/kernel/nr-irqs.c new file mode 100644 index 00000000000..f6769cd54bd --- /dev/null +++ b/arch/ia64/kernel/nr-irqs.c @@ -0,0 +1,21 @@ +/* + * calculate + * NR_IRQS = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, FOO_NR_IRQS...) + * depending on config. + * This must be calculated before processing asm-offset.c. + */ + +#define ASM_OFFSETS_C 1 + +#include <linux/kbuild.h> +#include <linux/threads.h> +#include <asm/native/irq.h> + +void foo(void) +{ + union paravirt_nr_irqs_max { + char ia64_native_nr_irqs[IA64_NATIVE_NR_IRQS]; + }; + + DEFINE(NR_IRQS, sizeof (union paravirt_nr_irqs_max)); +} diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c index a68ce667809..d288cde9360 100644 --- a/arch/ia64/kernel/numa.c +++ b/arch/ia64/kernel/numa.c @@ -19,16 +19,46 @@ * Copyright (C) 2004 Silicon Graphics, Inc. * Jesse Barnes <jbarnes@sgi.com> */ -#include <linux/config.h> #include <linux/topology.h> #include <linux/module.h> #include <asm/processor.h> #include <asm/smp.h> -u8 cpu_to_node_map[NR_CPUS] __cacheline_aligned; +u16 cpu_to_node_map[NR_CPUS] __cacheline_aligned; EXPORT_SYMBOL(cpu_to_node_map); cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; +EXPORT_SYMBOL(node_to_cpu_mask); + +void map_cpu_to_node(int cpu, int nid) +{ + int oldnid; + if (nid < 0) { /* just initialize by zero */ + cpu_to_node_map[cpu] = 0; + return; + } + /* sanity check first */ + oldnid = cpu_to_node_map[cpu]; + if (cpu_isset(cpu, node_to_cpu_mask[oldnid])) { + return; /* nothing to do */ + } + /* we don't have cpu-driven node hot add yet... + In usual case, node is created from SRAT at boot time. */ + if (!node_online(nid)) + nid = first_online_node; + cpu_to_node_map[cpu] = nid; + cpu_set(cpu, node_to_cpu_mask[nid]); + return; +} + +void unmap_cpu_from_node(int cpu, int nid) +{ + WARN_ON(!cpu_isset(cpu, node_to_cpu_mask[nid])); + WARN_ON(cpu_to_node_map[cpu] != nid); + cpu_to_node_map[cpu] = 0; + cpu_clear(cpu, node_to_cpu_mask[nid]); +} + /** * build_cpu_to_node_map - setup cpu to node and node to cpumask arrays @@ -43,15 +73,13 @@ void __init build_cpu_to_node_map(void) for(node=0; node < MAX_NUMNODES; node++) cpus_clear(node_to_cpu_mask[node]); - for(cpu = 0; cpu < NR_CPUS; ++cpu) { + for_each_possible_early_cpu(cpu) { node = -1; for (i = 0; i < NR_CPUS; ++i) if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) { node = node_cpuid[i].nid; break; } - cpu_to_node_map[cpu] = (node >= 0) ? node : 0; - if (node >= 0) - cpu_set(cpu, node_to_cpu_mask[node]); + map_cpu_to_node(cpu, node); } } diff --git a/arch/ia64/kernel/pal.S b/arch/ia64/kernel/pal.S index 5018c7f2e7a..0b533441c3c 100644 --- a/arch/ia64/kernel/pal.S +++ b/arch/ia64/kernel/pal.S @@ -21,11 +21,12 @@ pal_entry_point: .text /* - * Set the PAL entry point address. This could be written in C code, but we do it here - * to keep it all in one module (besides, it's so trivial that it's + * Set the PAL entry point address. This could be written in C code, but we + * do it here to keep it all in one module (besides, it's so trivial that it's * not a big deal). * - * in0 Address of the PAL entry point (text address, NOT a function descriptor). + * in0 Address of the PAL entry point (text address, NOT a function + * descriptor). */ GLOBAL_ENTRY(ia64_pal_handler_init) alloc r3=ar.pfs,1,0,0,0 @@ -36,9 +37,9 @@ GLOBAL_ENTRY(ia64_pal_handler_init) END(ia64_pal_handler_init) /* - * Default PAL call handler. This needs to be coded in assembly because it uses - * the static calling convention, i.e., the RSE may not be used and calls are - * done via "br.cond" (not "br.call"). + * Default PAL call handler. This needs to be coded in assembly because it + * uses the static calling convention, i.e., the RSE may not be used and + * calls are done via "br.cond" (not "br.call"). */ GLOBAL_ENTRY(ia64_pal_default_handler) mov r8=-1 @@ -50,12 +51,10 @@ END(ia64_pal_default_handler) * * in0 Index of PAL service * in1 - in3 Remaining PAL arguments - * in4 1 ==> clear psr.ic, 0 ==> don't clear psr.ic - * */ GLOBAL_ENTRY(ia64_pal_call_static) - .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5) - alloc loc1 = ar.pfs,5,5,0,0 + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4) + alloc loc1 = ar.pfs,4,5,0,0 movl loc2 = pal_entry_point 1: { mov r28 = in0 @@ -64,7 +63,6 @@ GLOBAL_ENTRY(ia64_pal_call_static) } ;; ld8 loc2 = [loc2] // loc2 <- entry point - tbit.nz p6,p7 = in4, 0 adds r8 = 1f-1b,r8 mov loc4=ar.rsc // save RSE configuration ;; @@ -74,13 +72,11 @@ GLOBAL_ENTRY(ia64_pal_call_static) .body mov r30 = in2 -(p6) rsm psr.i | psr.ic mov r31 = in3 mov b7 = loc2 -(p7) rsm psr.i + rsm psr.i ;; -(p6) srlz.i mov rp = r8 br.cond.sptk.many b7 1: mov psr.l = loc3 @@ -96,8 +92,8 @@ END(ia64_pal_call_static) * Make a PAL call using the stacked registers calling convention. * * Inputs: - * in0 Index of PAL service - * in2 - in3 Remaning PAL arguments + * in0 Index of PAL service + * in2 - in3 Remaining PAL arguments */ GLOBAL_ENTRY(ia64_pal_call_stacked) .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4) @@ -131,18 +127,18 @@ END(ia64_pal_call_stacked) * Make a physical mode PAL call using the static registers calling convention. * * Inputs: - * in0 Index of PAL service - * in2 - in3 Remaning PAL arguments + * in0 Index of PAL service + * in2 - in3 Remaining PAL arguments * * PSR_LP, PSR_TB, PSR_ID, PSR_DA are never set by the kernel. * So we don't need to clear them. */ -#define PAL_PSR_BITS_TO_CLEAR \ - (IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_DB | IA64_PSR_RT | \ - IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED | \ +#define PAL_PSR_BITS_TO_CLEAR \ + (IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_DB | IA64_PSR_RT |\ + IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED | \ IA64_PSR_DFL | IA64_PSR_DFH) -#define PAL_PSR_BITS_TO_SET \ +#define PAL_PSR_BITS_TO_SET \ (IA64_PSR_BN) @@ -178,7 +174,7 @@ GLOBAL_ENTRY(ia64_pal_call_phys_static) ;; andcm r16=loc3,r16 // removes bits to clear from psr br.call.sptk.many rp=ia64_switch_mode_phys -.ret1: mov rp = r8 // install return address (physical) + mov rp = r8 // install return address (physical) mov loc5 = r19 mov loc6 = r20 br.cond.sptk.many b7 @@ -188,7 +184,6 @@ GLOBAL_ENTRY(ia64_pal_call_phys_static) mov r19=loc5 mov r20=loc6 br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode -.ret2: mov psr.l = loc3 // restore init PSR mov ar.pfs = loc1 @@ -203,8 +198,8 @@ END(ia64_pal_call_phys_static) * Make a PAL call using the stacked registers in physical mode. * * Inputs: - * in0 Index of PAL service - * in2 - in3 Remaning PAL arguments + * in0 Index of PAL service + * in2 - in3 Remaining PAL arguments */ GLOBAL_ENTRY(ia64_pal_call_phys_stacked) .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5) @@ -212,17 +207,12 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked) movl loc2 = pal_entry_point 1: { mov r28 = in0 // copy procedure index - mov loc0 = rp // save rp + mov loc0 = rp // save rp } .body ;; ld8 loc2 = [loc2] // loc2 <- entry point - mov out0 = in0 // first argument - mov out1 = in1 // copy arg2 - mov out2 = in2 // copy arg3 - mov out3 = in3 // copy arg3 - ;; - mov loc3 = psr // save psr + mov loc3 = psr // save psr ;; mov loc4=ar.rsc // save RSE configuration dep.z loc2=loc2,0,61 // convert pal entry point to physical @@ -236,18 +226,23 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked) ;; andcm r16=loc3,r16 // removes bits to clear from psr br.call.sptk.many rp=ia64_switch_mode_phys -.ret6: + + mov out0 = in0 // first argument + mov out1 = in1 // copy arg2 + mov out2 = in2 // copy arg3 + mov out3 = in3 // copy arg3 mov loc5 = r19 mov loc6 = r20 + br.call.sptk.many rp=b7 // now make the call -.ret7: + mov ar.rsc=0 // put RSE in enforced lazy, LE mode mov r16=loc3 // r16= original psr mov r19=loc5 mov r20=loc6 - br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode + br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode -.ret8: mov psr.l = loc3 // restore init PSR + mov psr.l = loc3 // restore init PSR mov ar.pfs = loc1 mov rp = loc0 ;; @@ -257,10 +252,11 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked) END(ia64_pal_call_phys_stacked) /* - * Save scratch fp scratch regs which aren't saved in pt_regs already (fp10-fp15). + * Save scratch fp scratch regs which aren't saved in pt_regs already + * (fp10-fp15). * - * NOTE: We need to do this since firmware (SAL and PAL) may use any of the scratch - * regs fp-low partition. + * NOTE: We need to do this since firmware (SAL and PAL) may use any of the + * scratch regs fp-low partition. * * Inputs: * in0 Address of stack storage for fp regs diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c index 89faa603c6b..c39c3cd3ac3 100644 --- a/arch/ia64/kernel/palinfo.c +++ b/arch/ia64/kernel/palinfo.c @@ -16,12 +16,13 @@ * 02/05/2001 S.Eranian fixed module support * 10/23/2001 S.Eranian updated pal_perf_mon_info bug fixes * 03/24/2004 Ashok Raj updated to work with CPU Hotplug + * 10/26/2006 Russ Anderson updated processor features to rev 2.2 spec */ -#include <linux/config.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/efi.h> @@ -41,7 +42,7 @@ MODULE_LICENSE("GPL"); #define PALINFO_VERSION "0.5" -typedef int (*palinfo_func_t)(char*); +typedef int (*palinfo_func_t)(struct seq_file *); typedef struct { const char *name; /* name of the proc entry */ @@ -54,7 +55,7 @@ typedef struct { * A bunch of string array to get pretty printing */ -static char *cache_types[] = { +static const char *cache_types[] = { "", /* not used */ "Instruction", "Data", @@ -122,19 +123,16 @@ static const char *mem_attrib[]={ * - a pointer to the end of the buffer * */ -static char * -bitvector_process(char *p, u64 vector) +static void bitvector_process(struct seq_file *m, u64 vector) { int i,j; - const char *units[]={ "", "K", "M", "G", "T" }; + static const char *units[]={ "", "K", "M", "G", "T" }; for (i=0, j=0; i < 64; i++ , j=i/10) { - if (vector & 0x1) { - p += sprintf(p, "%d%s ", 1 << (i-j*10), units[j]); - } + if (vector & 0x1) + seq_printf(m, "%d%s ", 1 << (i-j*10), units[j]); vector >>= 1; } - return p; } /* @@ -149,8 +147,7 @@ bitvector_process(char *p, u64 vector) * - a pointer to the end of the buffer * */ -static char * -bitregister_process(char *p, u64 *reg_info, int max) +static void bitregister_process(struct seq_file *m, u64 *reg_info, int max) { int i, begin, skip = 0; u64 value = reg_info[0]; @@ -163,9 +160,9 @@ bitregister_process(char *p, u64 *reg_info, int max) if ((value & 0x1) == 0 && skip == 0) { if (begin <= i - 2) - p += sprintf(p, "%d-%d ", begin, i-1); + seq_printf(m, "%d-%d ", begin, i-1); else - p += sprintf(p, "%d ", i-1); + seq_printf(m, "%d ", i-1); skip = 1; begin = -1; } else if ((value & 0x1) && skip == 1) { @@ -176,19 +173,15 @@ bitregister_process(char *p, u64 *reg_info, int max) } if (begin > -1) { if (begin < 127) - p += sprintf(p, "%d-127", begin); + seq_printf(m, "%d-127", begin); else - p += sprintf(p, "127"); + seq_puts(m, "127"); } - - return p; } -static int -power_info(char *page) +static int power_info(struct seq_file *m) { s64 status; - char *p = page; u64 halt_info_buffer[8]; pal_power_mgmt_info_u_t *halt_info =(pal_power_mgmt_info_u_t *)halt_info_buffer; int i; @@ -198,103 +191,103 @@ power_info(char *page) for (i=0; i < 8 ; i++ ) { if (halt_info[i].pal_power_mgmt_info_s.im == 1) { - p += sprintf(p, "Power level %d:\n" - "\tentry_latency : %d cycles\n" - "\texit_latency : %d cycles\n" - "\tpower consumption : %d mW\n" - "\tCache+TLB coherency : %s\n", i, - halt_info[i].pal_power_mgmt_info_s.entry_latency, - halt_info[i].pal_power_mgmt_info_s.exit_latency, - halt_info[i].pal_power_mgmt_info_s.power_consumption, - halt_info[i].pal_power_mgmt_info_s.co ? "Yes" : "No"); + seq_printf(m, + "Power level %d:\n" + "\tentry_latency : %d cycles\n" + "\texit_latency : %d cycles\n" + "\tpower consumption : %d mW\n" + "\tCache+TLB coherency : %s\n", i, + halt_info[i].pal_power_mgmt_info_s.entry_latency, + halt_info[i].pal_power_mgmt_info_s.exit_latency, + halt_info[i].pal_power_mgmt_info_s.power_consumption, + halt_info[i].pal_power_mgmt_info_s.co ? "Yes" : "No"); } else { - p += sprintf(p,"Power level %d: not implemented\n",i); + seq_printf(m,"Power level %d: not implemented\n", i); } } - return p - page; + return 0; } -static int -cache_info(char *page) +static int cache_info(struct seq_file *m) { - char *p = page; - u64 i, levels, unique_caches; + unsigned long i, levels, unique_caches; pal_cache_config_info_t cci; int j, k; - s64 status; + long status; if ((status = ia64_pal_cache_summary(&levels, &unique_caches)) != 0) { printk(KERN_ERR "ia64_pal_cache_summary=%ld\n", status); return 0; } - p += sprintf(p, "Cache levels : %ld\nUnique caches : %ld\n\n", levels, unique_caches); + seq_printf(m, "Cache levels : %ld\nUnique caches : %ld\n\n", + levels, unique_caches); for (i=0; i < levels; i++) { - for (j=2; j >0 ; j--) { - /* even without unification some level may not be present */ - if ((status=ia64_pal_cache_config_info(i,j, &cci)) != 0) { + if ((status=ia64_pal_cache_config_info(i,j, &cci)) != 0) continue; - } - p += sprintf(p, - "%s Cache level %lu:\n" - "\tSize : %lu bytes\n" - "\tAttributes : ", - cache_types[j+cci.pcci_unified], i+1, - cci.pcci_cache_size); - - if (cci.pcci_unified) p += sprintf(p, "Unified "); - - p += sprintf(p, "%s\n", cache_mattrib[cci.pcci_cache_attr]); - - p += sprintf(p, - "\tAssociativity : %d\n" - "\tLine size : %d bytes\n" - "\tStride : %d bytes\n", - cci.pcci_assoc, 1<<cci.pcci_line_size, 1<<cci.pcci_stride); + + seq_printf(m, + "%s Cache level %lu:\n" + "\tSize : %u bytes\n" + "\tAttributes : ", + cache_types[j+cci.pcci_unified], i+1, + cci.pcci_cache_size); + + if (cci.pcci_unified) + seq_puts(m, "Unified "); + + seq_printf(m, "%s\n", cache_mattrib[cci.pcci_cache_attr]); + + seq_printf(m, + "\tAssociativity : %d\n" + "\tLine size : %d bytes\n" + "\tStride : %d bytes\n", + cci.pcci_assoc, + 1<<cci.pcci_line_size, + 1<<cci.pcci_stride); if (j == 1) - p += sprintf(p, "\tStore latency : N/A\n"); + seq_puts(m, "\tStore latency : N/A\n"); else - p += sprintf(p, "\tStore latency : %d cycle(s)\n", - cci.pcci_st_latency); + seq_printf(m, "\tStore latency : %d cycle(s)\n", + cci.pcci_st_latency); - p += sprintf(p, - "\tLoad latency : %d cycle(s)\n" - "\tStore hints : ", cci.pcci_ld_latency); + seq_printf(m, + "\tLoad latency : %d cycle(s)\n" + "\tStore hints : ", cci.pcci_ld_latency); for(k=0; k < 8; k++ ) { if ( cci.pcci_st_hints & 0x1) - p += sprintf(p, "[%s]", cache_st_hints[k]); + seq_printf(m, "[%s]", cache_st_hints[k]); cci.pcci_st_hints >>=1; } - p += sprintf(p, "\n\tLoad hints : "); + seq_puts(m, "\n\tLoad hints : "); for(k=0; k < 8; k++ ) { if (cci.pcci_ld_hints & 0x1) - p += sprintf(p, "[%s]", cache_ld_hints[k]); + seq_printf(m, "[%s]", cache_ld_hints[k]); cci.pcci_ld_hints >>=1; } - p += sprintf(p, - "\n\tAlias boundary : %d byte(s)\n" - "\tTag LSB : %d\n" - "\tTag MSB : %d\n", - 1<<cci.pcci_alias_boundary, cci.pcci_tag_lsb, - cci.pcci_tag_msb); + seq_printf(m, + "\n\tAlias boundary : %d byte(s)\n" + "\tTag LSB : %d\n" + "\tTag MSB : %d\n", + 1<<cci.pcci_alias_boundary, cci.pcci_tag_lsb, + cci.pcci_tag_msb); /* when unified, data(j=2) is enough */ - if (cci.pcci_unified) break; + if (cci.pcci_unified) + break; } } - return p - page; + return 0; } -static int -vm_info(char *page) +static int vm_info(struct seq_file *m) { - char *p = page; u64 tr_pages =0, vw_pages=0, tc_pages; u64 attrib; pal_vm_info_1_u_t vm_info_1; @@ -303,63 +296,70 @@ vm_info(char *page) ia64_ptce_info_t ptce; const char *sep; int i, j; - s64 status; + long status; if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) { printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status); } else { - p += sprintf(p, + seq_printf(m, "Physical Address Space : %d bits\n" "Virtual Address Space : %d bits\n" "Protection Key Registers(PKR) : %d\n" "Implemented bits in PKR.key : %d\n" "Hash Tag ID : 0x%x\n" - "Size of RR.rid : %d\n", + "Size of RR.rid : %d\n" + "Max Purges : ", vm_info_1.pal_vm_info_1_s.phys_add_size, vm_info_2.pal_vm_info_2_s.impl_va_msb+1, vm_info_1.pal_vm_info_1_s.max_pkr+1, vm_info_1.pal_vm_info_1_s.key_size, vm_info_1.pal_vm_info_1_s.hash_tag_id, vm_info_2.pal_vm_info_2_s.rid_size); + if (vm_info_2.pal_vm_info_2_s.max_purges == PAL_MAX_PURGES) + seq_puts(m, "unlimited\n"); + else + seq_printf(m, "%d\n", + vm_info_2.pal_vm_info_2_s.max_purges ? + vm_info_2.pal_vm_info_2_s.max_purges : 1); } if (ia64_pal_mem_attrib(&attrib) == 0) { - p += sprintf(p, "Supported memory attributes : "); + seq_puts(m, "Supported memory attributes : "); sep = ""; for (i = 0; i < 8; i++) { if (attrib & (1 << i)) { - p += sprintf(p, "%s%s", sep, mem_attrib[i]); + seq_printf(m, "%s%s", sep, mem_attrib[i]); sep = ", "; } } - p += sprintf(p, "\n"); + seq_putc(m, '\n'); } if ((status = ia64_pal_vm_page_size(&tr_pages, &vw_pages)) !=0) { printk(KERN_ERR "ia64_pal_vm_page_size=%ld\n", status); } else { - p += sprintf(p, - "\nTLB walker : %simplemented\n" - "Number of DTR : %d\n" - "Number of ITR : %d\n" - "TLB insertable page sizes : ", - vm_info_1.pal_vm_info_1_s.vw ? "" : "not ", - vm_info_1.pal_vm_info_1_s.max_dtr_entry+1, - vm_info_1.pal_vm_info_1_s.max_itr_entry+1); - + seq_printf(m, + "\nTLB walker : %simplemented\n" + "Number of DTR : %d\n" + "Number of ITR : %d\n" + "TLB insertable page sizes : ", + vm_info_1.pal_vm_info_1_s.vw ? "" : "not ", + vm_info_1.pal_vm_info_1_s.max_dtr_entry+1, + vm_info_1.pal_vm_info_1_s.max_itr_entry+1); - p = bitvector_process(p, tr_pages); + bitvector_process(m, tr_pages); - p += sprintf(p, "\nTLB purgeable page sizes : "); + seq_puts(m, "\nTLB purgeable page sizes : "); - p = bitvector_process(p, vw_pages); + bitvector_process(m, vw_pages); } - if ((status=ia64_get_ptce(&ptce)) != 0) { + + if ((status = ia64_get_ptce(&ptce)) != 0) { printk(KERN_ERR "ia64_get_ptce=%ld\n", status); } else { - p += sprintf(p, + seq_printf(m, "\nPurge base address : 0x%016lx\n" "Purge outer loop count : %d\n" "Purge inner loop count : %d\n" @@ -368,7 +368,7 @@ vm_info(char *page) ptce.base, ptce.count[0], ptce.count[1], ptce.stride[0], ptce.stride[1]); - p += sprintf(p, + seq_printf(m, "TC Levels : %d\n" "Unique TC(s) : %d\n", vm_info_1.pal_vm_info_1_s.num_tc_levels, @@ -378,13 +378,11 @@ vm_info(char *page) for (j=2; j>0 ; j--) { tc_pages = 0; /* just in case */ - /* even without unification, some levels may not be present */ - if ((status=ia64_pal_vm_info(i,j, &tc_info, &tc_pages)) != 0) { + if ((status=ia64_pal_vm_info(i,j, &tc_info, &tc_pages)) != 0) continue; - } - p += sprintf(p, + seq_printf(m, "\n%s Translation Cache Level %d:\n" "\tHash sets : %d\n" "\tAssociativity : %d\n" @@ -396,15 +394,15 @@ vm_info(char *page) tc_info.tc_num_entries); if (tc_info.tc_pf) - p += sprintf(p, "PreferredPageSizeOptimized "); + seq_puts(m, "PreferredPageSizeOptimized "); if (tc_info.tc_unified) - p += sprintf(p, "Unified "); + seq_puts(m, "Unified "); if (tc_info.tc_reduce_tr) - p += sprintf(p, "TCReduction"); + seq_puts(m, "TCReduction"); - p += sprintf(p, "\n\tSupported page sizes: "); + seq_puts(m, "\n\tSupported page sizes: "); - p = bitvector_process(p, tc_pages); + bitvector_process(m, tc_pages); /* when unified date (j=2) is enough */ if (tc_info.tc_unified) @@ -412,22 +410,20 @@ vm_info(char *page) } } } - p += sprintf(p, "\n"); - return p - page; + seq_putc(m, '\n'); + return 0; } -static int -register_info(char *page) +static int register_info(struct seq_file *m) { - char *p = page; u64 reg_info[2]; u64 info; - u64 phys_stacked; + unsigned long phys_stacked; pal_hints_u_t hints; - u64 iregs, dregs; - char *info_type[]={ + unsigned long iregs, dregs; + static const char * const info_type[] = { "Implemented AR(s)", "AR(s) with read side-effects", "Implemented CR(s)", @@ -435,40 +431,40 @@ register_info(char *page) }; for(info=0; info < 4; info++) { - - if (ia64_pal_register_info(info, ®_info[0], ®_info[1]) != 0) return 0; - - p += sprintf(p, "%-32s : ", info_type[info]); - - p = bitregister_process(p, reg_info, 128); - - p += sprintf(p, "\n"); + if (ia64_pal_register_info(info, ®_info[0], ®_info[1]) != 0) + return 0; + seq_printf(m, "%-32s : ", info_type[info]); + bitregister_process(m, reg_info, 128); + seq_putc(m, '\n'); } - if (ia64_pal_rse_info(&phys_stacked, &hints) == 0) { + if (ia64_pal_rse_info(&phys_stacked, &hints) == 0) + seq_printf(m, + "RSE stacked physical registers : %ld\n" + "RSE load/store hints : %ld (%s)\n", + phys_stacked, hints.ph_data, + hints.ph_data < RSE_HINTS_COUNT ? rse_hints[hints.ph_data]: "(??)"); - p += sprintf(p, - "RSE stacked physical registers : %ld\n" - "RSE load/store hints : %ld (%s)\n", - phys_stacked, hints.ph_data, - hints.ph_data < RSE_HINTS_COUNT ? rse_hints[hints.ph_data]: "(??)"); - } if (ia64_pal_debug_info(&iregs, &dregs)) return 0; - p += sprintf(p, - "Instruction debug register pairs : %ld\n" - "Data debug register pairs : %ld\n", iregs, dregs); + seq_printf(m, + "Instruction debug register pairs : %ld\n" + "Data debug register pairs : %ld\n", iregs, dregs); - return p - page; + return 0; } -static const char *proc_features[]={ +static const char *const proc_features_0[]={ /* Feature set 0 */ NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL, NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL, - NULL,NULL,NULL,NULL,NULL, + "Unimplemented instruction address fault", + "INIT, PMI, and LINT pins", + "Simple unimplemented instr addresses", + "Variable P-state performance", + "Virtual machine features implemented", "XIP,XPSR,XFS implemented", "XR1-XR3 implemented", "Disable dynamic predicate prediction", @@ -476,7 +472,11 @@ static const char *proc_features[]={ "Disable dynamic data cache prefetch", "Disable dynamic inst cache prefetch", "Disable dynamic branch prediction", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "Disable P-states", + "Enable MCA on Data Poisoning", + "Enable vmsw instruction", + "Enable extern environmental notification", "Disable BINIT on processor time-out", "Disable dynamic power management (DPM)", "Disable coherency", @@ -487,29 +487,91 @@ static const char *proc_features[]={ "Enable BERR promotion" }; +static const char *const proc_features_16[]={ /* Feature set 16 */ + "Disable ETM", + "Enable ETM", + "Enable MCA on half-way timer", + "Enable snoop WC", + NULL, + "Enable Fast Deferral", + "Disable MCA on memory aliasing", + "Enable RSB", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "DP system processor", + "Low Voltage", + "HT supported", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL +}; -static int -processor_info(char *page) +static const char *const *const proc_features[]={ + proc_features_0, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + proc_features_16, + NULL, NULL, NULL, NULL, +}; + +static void feature_set_info(struct seq_file *m, u64 avail, u64 status, u64 control, + unsigned long set) { - char *p = page; - const char **v = proc_features; - u64 avail=1, status=1, control=1; + const char *const *vf, *const *v; int i; - s64 ret; - if ((ret=ia64_pal_proc_get_features(&avail, &status, &control)) != 0) return 0; + vf = v = proc_features[set]; + for(i=0; i < 64; i++, avail >>=1, status >>=1, control >>=1) { - for(i=0; i < 64; i++, v++,avail >>=1, status >>=1, control >>=1) { - if ( ! *v ) continue; - p += sprintf(p, "%-40s : %s%s %s\n", *v, - avail & 0x1 ? "" : "NotImpl", - avail & 0x1 ? (status & 0x1 ? "On" : "Off"): "", - avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): ""); + if (!(control)) /* No remaining bits set */ + break; + if (!(avail & 0x1)) /* Print only bits that are available */ + continue; + if (vf) + v = vf + i; + if ( v && *v ) { + seq_printf(m, "%-40s : %s %s\n", *v, + avail & 0x1 ? (status & 0x1 ? + "On " : "Off"): "", + avail & 0x1 ? (control & 0x1 ? + "Ctrl" : "NoCtrl"): ""); + } else { + seq_printf(m, "Feature set %2ld bit %2d\t\t\t" + " : %s %s\n", + set, i, + avail & 0x1 ? (status & 0x1 ? + "On " : "Off"): "", + avail & 0x1 ? (control & 0x1 ? + "Ctrl" : "NoCtrl"): ""); + } } - return p - page; } -static const char *bus_features[]={ +static int processor_info(struct seq_file *m) +{ + u64 avail=1, status=1, control=1, feature_set=0; + s64 ret; + + do { + ret = ia64_pal_proc_get_features(&avail, &status, &control, + feature_set); + if (ret < 0) + return 0; + + if (ret == 1) { + feature_set++; + continue; + } + + feature_set_info(m, avail, status, control, feature_set); + feature_set++; + } while(1); + + return 0; +} + +static const char *const bus_features[]={ NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL, NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, @@ -535,168 +597,155 @@ static const char *bus_features[]={ }; -static int -bus_info(char *page) +static int bus_info(struct seq_file *m) { - char *p = page; - const char **v = bus_features; + const char *const *v = bus_features; pal_bus_features_u_t av, st, ct; u64 avail, status, control; int i; s64 ret; - if ((ret=ia64_pal_bus_get_features(&av, &st, &ct)) != 0) return 0; + if ((ret=ia64_pal_bus_get_features(&av, &st, &ct)) != 0) + return 0; avail = av.pal_bus_features_val; status = st.pal_bus_features_val; control = ct.pal_bus_features_val; for(i=0; i < 64; i++, v++, avail >>=1, status >>=1, control >>=1) { - if ( ! *v ) continue; - p += sprintf(p, "%-48s : %s%s %s\n", *v, - avail & 0x1 ? "" : "NotImpl", - avail & 0x1 ? (status & 0x1 ? "On" : "Off"): "", - avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): ""); + if ( ! *v ) + continue; + seq_printf(m, "%-48s : %s%s %s\n", *v, + avail & 0x1 ? "" : "NotImpl", + avail & 0x1 ? (status & 0x1 ? "On" : "Off"): "", + avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): ""); } - return p - page; + return 0; } -static int -version_info(char *page) +static int version_info(struct seq_file *m) { pal_version_u_t min_ver, cur_ver; - char *p = page; - /* The PAL_VERSION call is advertised as being able to support - * both physical and virtual mode calls. This seems to be a documentation - * bug rather than firmware bug. In fact, it does only support physical mode. - * So now the code reflects this fact and the pal_version() has been updated - * accordingly. - */ - if (ia64_pal_version(&min_ver, &cur_ver) != 0) return 0; - - p += sprintf(p, - "PAL_vendor : 0x%02x (min=0x%02x)\n" - "PAL_A : %x.%x.%x (min=%x.%x.%x)\n" - "PAL_B : %x.%x.%x (min=%x.%x.%x)\n", - cur_ver.pal_version_s.pv_pal_vendor, min_ver.pal_version_s.pv_pal_vendor, - - cur_ver.pal_version_s.pv_pal_a_model>>4, - cur_ver.pal_version_s.pv_pal_a_model&0xf, cur_ver.pal_version_s.pv_pal_a_rev, - min_ver.pal_version_s.pv_pal_a_model>>4, - min_ver.pal_version_s.pv_pal_a_model&0xf, min_ver.pal_version_s.pv_pal_a_rev, - - cur_ver.pal_version_s.pv_pal_b_model>>4, - cur_ver.pal_version_s.pv_pal_b_model&0xf, cur_ver.pal_version_s.pv_pal_b_rev, - min_ver.pal_version_s.pv_pal_b_model>>4, - min_ver.pal_version_s.pv_pal_b_model&0xf, min_ver.pal_version_s.pv_pal_b_rev); - return p - page; + if (ia64_pal_version(&min_ver, &cur_ver) != 0) + return 0; + + seq_printf(m, + "PAL_vendor : 0x%02x (min=0x%02x)\n" + "PAL_A : %02x.%02x (min=%02x.%02x)\n" + "PAL_B : %02x.%02x (min=%02x.%02x)\n", + cur_ver.pal_version_s.pv_pal_vendor, + min_ver.pal_version_s.pv_pal_vendor, + cur_ver.pal_version_s.pv_pal_a_model, + cur_ver.pal_version_s.pv_pal_a_rev, + min_ver.pal_version_s.pv_pal_a_model, + min_ver.pal_version_s.pv_pal_a_rev, + cur_ver.pal_version_s.pv_pal_b_model, + cur_ver.pal_version_s.pv_pal_b_rev, + min_ver.pal_version_s.pv_pal_b_model, + min_ver.pal_version_s.pv_pal_b_rev); + return 0; } -static int -perfmon_info(char *page) +static int perfmon_info(struct seq_file *m) { - char *p = page; u64 pm_buffer[16]; pal_perf_mon_info_u_t pm_info; - if (ia64_pal_perf_mon_info(pm_buffer, &pm_info) != 0) return 0; - - p += sprintf(p, - "PMC/PMD pairs : %d\n" - "Counter width : %d bits\n" - "Cycle event number : %d\n" - "Retired event number : %d\n" - "Implemented PMC : ", - pm_info.pal_perf_mon_info_s.generic, pm_info.pal_perf_mon_info_s.width, - pm_info.pal_perf_mon_info_s.cycles, pm_info.pal_perf_mon_info_s.retired); + if (ia64_pal_perf_mon_info(pm_buffer, &pm_info) != 0) + return 0; - p = bitregister_process(p, pm_buffer, 256); - p += sprintf(p, "\nImplemented PMD : "); - p = bitregister_process(p, pm_buffer+4, 256); - p += sprintf(p, "\nCycles count capable : "); - p = bitregister_process(p, pm_buffer+8, 256); - p += sprintf(p, "\nRetired bundles count capable : "); + seq_printf(m, + "PMC/PMD pairs : %d\n" + "Counter width : %d bits\n" + "Cycle event number : %d\n" + "Retired event number : %d\n" + "Implemented PMC : ", + pm_info.pal_perf_mon_info_s.generic, + pm_info.pal_perf_mon_info_s.width, + pm_info.pal_perf_mon_info_s.cycles, + pm_info.pal_perf_mon_info_s.retired); + + bitregister_process(m, pm_buffer, 256); + seq_puts(m, "\nImplemented PMD : "); + bitregister_process(m, pm_buffer+4, 256); + seq_puts(m, "\nCycles count capable : "); + bitregister_process(m, pm_buffer+8, 256); + seq_puts(m, "\nRetired bundles count capable : "); #ifdef CONFIG_ITANIUM /* * PAL_PERF_MON_INFO reports that only PMC4 can be used to count CPU_CYCLES * which is wrong, both PMC4 and PMD5 support it. */ - if (pm_buffer[12] == 0x10) pm_buffer[12]=0x30; + if (pm_buffer[12] == 0x10) + pm_buffer[12]=0x30; #endif - p = bitregister_process(p, pm_buffer+12, 256); - - p += sprintf(p, "\n"); - - return p - page; + bitregister_process(m, pm_buffer+12, 256); + seq_putc(m, '\n'); + return 0; } -static int -frequency_info(char *page) +static int frequency_info(struct seq_file *m) { - char *p = page; struct pal_freq_ratio proc, itc, bus; - u64 base; + unsigned long base; if (ia64_pal_freq_base(&base) == -1) - p += sprintf(p, "Output clock : not implemented\n"); + seq_puts(m, "Output clock : not implemented\n"); else - p += sprintf(p, "Output clock : %ld ticks/s\n", base); + seq_printf(m, "Output clock : %ld ticks/s\n", base); if (ia64_pal_freq_ratios(&proc, &bus, &itc) != 0) return 0; - p += sprintf(p, - "Processor/Clock ratio : %ld/%ld\n" - "Bus/Clock ratio : %ld/%ld\n" - "ITC/Clock ratio : %ld/%ld\n", + seq_printf(m, + "Processor/Clock ratio : %d/%d\n" + "Bus/Clock ratio : %d/%d\n" + "ITC/Clock ratio : %d/%d\n", proc.num, proc.den, bus.num, bus.den, itc.num, itc.den); - - return p - page; + return 0; } -static int -tr_info(char *page) +static int tr_info(struct seq_file *m) { - char *p = page; - s64 status; + long status; pal_tr_valid_u_t tr_valid; u64 tr_buffer[4]; pal_vm_info_1_u_t vm_info_1; pal_vm_info_2_u_t vm_info_2; - u64 i, j; - u64 max[3], pgm; + unsigned long i, j; + unsigned long max[3], pgm; struct ifa_reg { - u64 valid:1; - u64 ig:11; - u64 vpn:52; + unsigned long valid:1; + unsigned long ig:11; + unsigned long vpn:52; } *ifa_reg; struct itir_reg { - u64 rv1:2; - u64 ps:6; - u64 key:24; - u64 rv2:32; + unsigned long rv1:2; + unsigned long ps:6; + unsigned long key:24; + unsigned long rv2:32; } *itir_reg; struct gr_reg { - u64 p:1; - u64 rv1:1; - u64 ma:3; - u64 a:1; - u64 d:1; - u64 pl:2; - u64 ar:3; - u64 ppn:38; - u64 rv2:2; - u64 ed:1; - u64 ig:11; + unsigned long p:1; + unsigned long rv1:1; + unsigned long ma:3; + unsigned long a:1; + unsigned long d:1; + unsigned long pl:2; + unsigned long ar:3; + unsigned long ppn:38; + unsigned long rv2:2; + unsigned long ed:1; + unsigned long ig:11; } *gr_reg; struct rid_reg { - u64 ig1:1; - u64 rv1:1; - u64 ig2:6; - u64 rid:24; - u64 rv2:32; + unsigned long ig1:1; + unsigned long rv1:1; + unsigned long ig2:6; + unsigned long rid:24; + unsigned long rv2:32; } *rid_reg; if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) { @@ -718,39 +767,40 @@ tr_info(char *page) ifa_reg = (struct ifa_reg *)&tr_buffer[2]; - if (ifa_reg->valid == 0) continue; + if (ifa_reg->valid == 0) + continue; gr_reg = (struct gr_reg *)tr_buffer; itir_reg = (struct itir_reg *)&tr_buffer[1]; rid_reg = (struct rid_reg *)&tr_buffer[3]; pgm = -1 << (itir_reg->ps - 12); - p += sprintf(p, - "%cTR%lu: av=%d pv=%d dv=%d mv=%d\n" - "\tppn : 0x%lx\n" - "\tvpn : 0x%lx\n" - "\tps : ", - "ID"[i], j, - tr_valid.pal_tr_valid_s.access_rights_valid, - tr_valid.pal_tr_valid_s.priv_level_valid, - tr_valid.pal_tr_valid_s.dirty_bit_valid, - tr_valid.pal_tr_valid_s.mem_attr_valid, - (gr_reg->ppn & pgm)<< 12, (ifa_reg->vpn & pgm)<< 12); - - p = bitvector_process(p, 1<< itir_reg->ps); - - p += sprintf(p, - "\n\tpl : %d\n" - "\tar : %d\n" - "\trid : %x\n" - "\tp : %d\n" - "\tma : %d\n" - "\td : %d\n", - gr_reg->pl, gr_reg->ar, rid_reg->rid, gr_reg->p, gr_reg->ma, - gr_reg->d); + seq_printf(m, + "%cTR%lu: av=%d pv=%d dv=%d mv=%d\n" + "\tppn : 0x%lx\n" + "\tvpn : 0x%lx\n" + "\tps : ", + "ID"[i], j, + tr_valid.pal_tr_valid_s.access_rights_valid, + tr_valid.pal_tr_valid_s.priv_level_valid, + tr_valid.pal_tr_valid_s.dirty_bit_valid, + tr_valid.pal_tr_valid_s.mem_attr_valid, + (gr_reg->ppn & pgm)<< 12, (ifa_reg->vpn & pgm)<< 12); + + bitvector_process(m, 1<< itir_reg->ps); + + seq_printf(m, + "\n\tpl : %d\n" + "\tar : %d\n" + "\trid : %x\n" + "\tp : %d\n" + "\tma : %d\n" + "\td : %d\n", + gr_reg->pl, gr_reg->ar, rid_reg->rid, gr_reg->p, gr_reg->ma, + gr_reg->d); } } - return p - page; + return 0; } @@ -758,7 +808,7 @@ tr_info(char *page) /* * List {name,function} pairs for every entry in /proc/palinfo/cpu* */ -static palinfo_entry_t palinfo_entries[]={ +static const palinfo_entry_t palinfo_entries[]={ { "version_info", version_info, }, { "vm_info", vm_info, }, { "cache_info", cache_info, }, @@ -773,17 +823,6 @@ static palinfo_entry_t palinfo_entries[]={ #define NR_PALINFO_ENTRIES (int) ARRAY_SIZE(palinfo_entries) -/* - * this array is used to keep track of the proc entries we create. This is - * required in the module mode when we need to remove all entries. The procfs code - * does not do recursion of deletion - * - * Notes: - * - +1 accounts for the cpuN directory entry in /proc/pal - */ -#define NR_PALINFO_PROC_ENTRIES (NR_CPUS*(NR_PALINFO_ENTRIES+1)) - -static struct proc_dir_entry *palinfo_proc_entries[NR_PALINFO_PROC_ENTRIES]; static struct proc_dir_entry *palinfo_dir; /* @@ -811,7 +850,7 @@ typedef union { */ typedef struct { palinfo_func_t func; /* pointer to function to call */ - char *page; /* buffer to store results */ + struct seq_file *m; /* buffer to store results */ int ret; /* return value from call */ } palinfo_smp_data_t; @@ -824,13 +863,7 @@ static void palinfo_smp_call(void *info) { palinfo_smp_data_t *data = (palinfo_smp_data_t *)info; - if (data == NULL) { - printk(KERN_ERR "palinfo: data pointer is NULL\n"); - data->ret = 0; /* no output */ - return; - } - /* does this actual call */ - data->ret = (*data->func)(data->page); + data->ret = (*data->func)(data->m); } /* @@ -840,18 +873,18 @@ palinfo_smp_call(void *info) * otherwise how many bytes in the "page" buffer were written */ static -int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page) +int palinfo_handle_smp(struct seq_file *m, pal_func_cpu_u_t *f) { palinfo_smp_data_t ptr; int ret; ptr.func = palinfo_entries[f->func_id].proc_read; - ptr.page = page; + ptr.m = m; ptr.ret = 0; /* just in case */ /* will send IPI to other CPU and wait for completion of remote call */ - if ((ret=smp_call_function_single(f->req_cpu, palinfo_smp_call, &ptr, 0, 1))) { + if ((ret=smp_call_function_single(f->req_cpu, palinfo_smp_call, &ptr, 1))) { printk(KERN_ERR "palinfo: remote CPU call from %d to %d on function %d: " "error %d\n", smp_processor_id(), f->req_cpu, f->func_id, ret); return 0; @@ -860,7 +893,7 @@ int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page) } #else /* ! CONFIG_SMP */ static -int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page) +int palinfo_handle_smp(struct seq_file *m, pal_func_cpu_u_t *f) { printk(KERN_ERR "palinfo: should not be called with non SMP kernel\n"); return 0; @@ -870,115 +903,84 @@ int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page) /* * Entry point routine: all calls go through this function */ -static int -palinfo_read_entry(char *page, char **start, off_t off, int count, int *eof, void *data) +static int proc_palinfo_show(struct seq_file *m, void *v) { - int len=0; - pal_func_cpu_u_t *f = (pal_func_cpu_u_t *)&data; + pal_func_cpu_u_t *f = (pal_func_cpu_u_t *)&m->private; /* * in SMP mode, we may need to call another CPU to get correct * information. PAL, by definition, is processor specific */ if (f->req_cpu == get_cpu()) - len = (*palinfo_entries[f->func_id].proc_read)(page); + (*palinfo_entries[f->func_id].proc_read)(m); else - len = palinfo_handle_smp(f, page); + palinfo_handle_smp(m, f); put_cpu(); + return 0; +} - if (len <= off+count) *eof = 1; - - *start = page + off; - len -= off; - - if (len>count) len = count; - if (len<0) len = 0; - - return len; +static int proc_palinfo_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_palinfo_show, PDE_DATA(inode)); } +static const struct file_operations proc_palinfo_fops = { + .open = proc_palinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static void create_palinfo_proc_entries(unsigned int cpu) { -# define CPUSTR "cpu%d" - pal_func_cpu_u_t f; - struct proc_dir_entry **pdir; struct proc_dir_entry *cpu_dir; int j; - char cpustr[sizeof(CPUSTR)]; - - - /* - * we keep track of created entries in a depth-first order for - * cleanup purposes. Each entry is stored into palinfo_proc_entries - */ - sprintf(cpustr,CPUSTR, cpu); + char cpustr[3+4+1]; /* cpu numbers are up to 4095 on itanic */ + sprintf(cpustr, "cpu%d", cpu); cpu_dir = proc_mkdir(cpustr, palinfo_dir); + if (!cpu_dir) + return; f.req_cpu = cpu; - /* - * Compute the location to store per cpu entries - * We dont store the top level entry in this list, but - * remove it finally after removing all cpu entries. - */ - pdir = &palinfo_proc_entries[cpu*(NR_PALINFO_ENTRIES+1)]; - *pdir++ = cpu_dir; for (j=0; j < NR_PALINFO_ENTRIES; j++) { f.func_id = j; - *pdir = create_proc_read_entry( - palinfo_entries[j].name, 0, cpu_dir, - palinfo_read_entry, (void *)f.value); - if (*pdir) - (*pdir)->owner = THIS_MODULE; - pdir++; + proc_create_data(palinfo_entries[j].name, 0, cpu_dir, + &proc_palinfo_fops, (void *)f.value); } } static void remove_palinfo_proc_entries(unsigned int hcpu) { - int j; - struct proc_dir_entry *cpu_dir, **pdir; - - pdir = &palinfo_proc_entries[hcpu*(NR_PALINFO_ENTRIES+1)]; - cpu_dir = *pdir; - *pdir++=NULL; - for (j=0; j < (NR_PALINFO_ENTRIES); j++) { - if ((*pdir)) { - remove_proc_entry ((*pdir)->name, cpu_dir); - *pdir ++= NULL; - } - } - - if (cpu_dir) { - remove_proc_entry(cpu_dir->name, palinfo_dir); - } + char cpustr[3+4+1]; /* cpu numbers are up to 4095 on itanic */ + sprintf(cpustr, "cpu%d", hcpu); + remove_proc_subtree(cpustr, palinfo_dir); } -static int __devinit palinfo_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int palinfo_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { unsigned int hotcpu = (unsigned long)hcpu; switch (action) { case CPU_ONLINE: + case CPU_ONLINE_FROZEN: create_palinfo_proc_entries(hotcpu); break; -#ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + case CPU_DEAD_FROZEN: remove_palinfo_proc_entries(hotcpu); break; -#endif } return NOTIFY_OK; } -static struct notifier_block palinfo_cpu_notifier = +static struct notifier_block __refdata palinfo_cpu_notifier = { .notifier_call = palinfo_cpu_callback, .priority = 0, @@ -991,6 +993,10 @@ palinfo_init(void) printk(KERN_INFO "PAL Information Facility v%s\n", PALINFO_VERSION); palinfo_dir = proc_mkdir("pal", NULL); + if (!palinfo_dir) + return -ENOMEM; + + cpu_notifier_register_begin(); /* Create palinfo dirs in /proc for all online cpus */ for_each_online_cpu(i) { @@ -998,7 +1004,9 @@ palinfo_init(void) } /* Register for future delivery via notify registration */ - register_cpu_notifier(&palinfo_cpu_notifier); + __register_hotcpu_notifier(&palinfo_cpu_notifier); + + cpu_notifier_register_done(); return 0; } @@ -1006,22 +1014,8 @@ palinfo_init(void) static void __exit palinfo_exit(void) { - int i = 0; - - /* remove all nodes: depth first pass. Could optimize this */ - for_each_online_cpu(i) { - remove_palinfo_proc_entries(i); - } - - /* - * Remove the top level entry finally - */ - remove_proc_entry(palinfo_dir->name, NULL); - - /* - * Unregister from cpu notifier callbacks - */ - unregister_cpu_notifier(&palinfo_cpu_notifier); + unregister_hotcpu_notifier(&palinfo_cpu_notifier); + remove_proc_subtree("pal", NULL); } module_init(palinfo_init); diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c new file mode 100644 index 00000000000..1b22f6de293 --- /dev/null +++ b/arch/ia64/kernel/paravirt.c @@ -0,0 +1,902 @@ +/****************************************************************************** + * arch/ia64/kernel/paravirt.c + * + * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * Yaozu (Eddie) Dong <eddie.dong@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <linux/init.h> + +#include <linux/compiler.h> +#include <linux/io.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/types.h> + +#include <asm/iosapic.h> +#include <asm/paravirt.h> + +/*************************************************************************** + * general info + */ +struct pv_info pv_info = { + .kernel_rpl = 0, + .paravirt_enabled = 0, + .name = "bare hardware" +}; + +/*************************************************************************** + * pv_init_ops + * initialization hooks. + */ + +static void __init +ia64_native_patch_branch(unsigned long tag, unsigned long type); + +struct pv_init_ops pv_init_ops = +{ +#ifdef ASM_SUPPORTED + .patch_bundle = ia64_native_patch_bundle, +#endif + .patch_branch = ia64_native_patch_branch, +}; + +/*************************************************************************** + * pv_cpu_ops + * intrinsics hooks. + */ + +#ifndef ASM_SUPPORTED +/* ia64_native_xxx are macros so that we have to make them real functions */ + +#define DEFINE_VOID_FUNC1(name) \ + static void \ + ia64_native_ ## name ## _func(unsigned long arg) \ + { \ + ia64_native_ ## name(arg); \ + } + +#define DEFINE_VOID_FUNC1_VOID(name) \ + static void \ + ia64_native_ ## name ## _func(void *arg) \ + { \ + ia64_native_ ## name(arg); \ + } + +#define DEFINE_VOID_FUNC2(name) \ + static void \ + ia64_native_ ## name ## _func(unsigned long arg0, \ + unsigned long arg1) \ + { \ + ia64_native_ ## name(arg0, arg1); \ + } + +#define DEFINE_FUNC0(name) \ + static unsigned long \ + ia64_native_ ## name ## _func(void) \ + { \ + return ia64_native_ ## name(); \ + } + +#define DEFINE_FUNC1(name, type) \ + static unsigned long \ + ia64_native_ ## name ## _func(type arg) \ + { \ + return ia64_native_ ## name(arg); \ + } \ + +DEFINE_VOID_FUNC1_VOID(fc); +DEFINE_VOID_FUNC1(intrin_local_irq_restore); + +DEFINE_VOID_FUNC2(ptcga); +DEFINE_VOID_FUNC2(set_rr); + +DEFINE_FUNC0(get_psr_i); + +DEFINE_FUNC1(thash, unsigned long); +DEFINE_FUNC1(get_cpuid, int); +DEFINE_FUNC1(get_pmd, int); +DEFINE_FUNC1(get_rr, unsigned long); + +static void +ia64_native_ssm_i_func(void) +{ + ia64_native_ssm(IA64_PSR_I); +} + +static void +ia64_native_rsm_i_func(void) +{ + ia64_native_rsm(IA64_PSR_I); +} + +static void +ia64_native_set_rr0_to_rr4_func(unsigned long val0, unsigned long val1, + unsigned long val2, unsigned long val3, + unsigned long val4) +{ + ia64_native_set_rr0_to_rr4(val0, val1, val2, val3, val4); +} + +#define CASE_GET_REG(id) \ + case _IA64_REG_ ## id: \ + res = ia64_native_getreg(_IA64_REG_ ## id); \ + break; +#define CASE_GET_AR(id) CASE_GET_REG(AR_ ## id) +#define CASE_GET_CR(id) CASE_GET_REG(CR_ ## id) + +unsigned long +ia64_native_getreg_func(int regnum) +{ + unsigned long res = -1; + switch (regnum) { + CASE_GET_REG(GP); + /*CASE_GET_REG(IP);*/ /* returned ip value shouldn't be constant */ + CASE_GET_REG(PSR); + CASE_GET_REG(TP); + CASE_GET_REG(SP); + + CASE_GET_AR(KR0); + CASE_GET_AR(KR1); + CASE_GET_AR(KR2); + CASE_GET_AR(KR3); + CASE_GET_AR(KR4); + CASE_GET_AR(KR5); + CASE_GET_AR(KR6); + CASE_GET_AR(KR7); + CASE_GET_AR(RSC); + CASE_GET_AR(BSP); + CASE_GET_AR(BSPSTORE); + CASE_GET_AR(RNAT); + CASE_GET_AR(FCR); + CASE_GET_AR(EFLAG); + CASE_GET_AR(CSD); + CASE_GET_AR(SSD); + CASE_GET_AR(CFLAG); + CASE_GET_AR(FSR); + CASE_GET_AR(FIR); + CASE_GET_AR(FDR); + CASE_GET_AR(CCV); + CASE_GET_AR(UNAT); + CASE_GET_AR(FPSR); + CASE_GET_AR(ITC); + CASE_GET_AR(PFS); + CASE_GET_AR(LC); + CASE_GET_AR(EC); + + CASE_GET_CR(DCR); + CASE_GET_CR(ITM); + CASE_GET_CR(IVA); + CASE_GET_CR(PTA); + CASE_GET_CR(IPSR); + CASE_GET_CR(ISR); + CASE_GET_CR(IIP); + CASE_GET_CR(IFA); + CASE_GET_CR(ITIR); + CASE_GET_CR(IIPA); + CASE_GET_CR(IFS); + CASE_GET_CR(IIM); + CASE_GET_CR(IHA); + CASE_GET_CR(LID); + CASE_GET_CR(IVR); + CASE_GET_CR(TPR); + CASE_GET_CR(EOI); + CASE_GET_CR(IRR0); + CASE_GET_CR(IRR1); + CASE_GET_CR(IRR2); + CASE_GET_CR(IRR3); + CASE_GET_CR(ITV); + CASE_GET_CR(PMV); + CASE_GET_CR(CMCV); + CASE_GET_CR(LRR0); + CASE_GET_CR(LRR1); + + default: + printk(KERN_CRIT "wrong_getreg %d\n", regnum); + break; + } + return res; +} + +#define CASE_SET_REG(id) \ + case _IA64_REG_ ## id: \ + ia64_native_setreg(_IA64_REG_ ## id, val); \ + break; +#define CASE_SET_AR(id) CASE_SET_REG(AR_ ## id) +#define CASE_SET_CR(id) CASE_SET_REG(CR_ ## id) + +void +ia64_native_setreg_func(int regnum, unsigned long val) +{ + switch (regnum) { + case _IA64_REG_PSR_L: + ia64_native_setreg(_IA64_REG_PSR_L, val); + ia64_dv_serialize_data(); + break; + CASE_SET_REG(SP); + CASE_SET_REG(GP); + + CASE_SET_AR(KR0); + CASE_SET_AR(KR1); + CASE_SET_AR(KR2); + CASE_SET_AR(KR3); + CASE_SET_AR(KR4); + CASE_SET_AR(KR5); + CASE_SET_AR(KR6); + CASE_SET_AR(KR7); + CASE_SET_AR(RSC); + CASE_SET_AR(BSP); + CASE_SET_AR(BSPSTORE); + CASE_SET_AR(RNAT); + CASE_SET_AR(FCR); + CASE_SET_AR(EFLAG); + CASE_SET_AR(CSD); + CASE_SET_AR(SSD); + CASE_SET_AR(CFLAG); + CASE_SET_AR(FSR); + CASE_SET_AR(FIR); + CASE_SET_AR(FDR); + CASE_SET_AR(CCV); + CASE_SET_AR(UNAT); + CASE_SET_AR(FPSR); + CASE_SET_AR(ITC); + CASE_SET_AR(PFS); + CASE_SET_AR(LC); + CASE_SET_AR(EC); + + CASE_SET_CR(DCR); + CASE_SET_CR(ITM); + CASE_SET_CR(IVA); + CASE_SET_CR(PTA); + CASE_SET_CR(IPSR); + CASE_SET_CR(ISR); + CASE_SET_CR(IIP); + CASE_SET_CR(IFA); + CASE_SET_CR(ITIR); + CASE_SET_CR(IIPA); + CASE_SET_CR(IFS); + CASE_SET_CR(IIM); + CASE_SET_CR(IHA); + CASE_SET_CR(LID); + CASE_SET_CR(IVR); + CASE_SET_CR(TPR); + CASE_SET_CR(EOI); + CASE_SET_CR(IRR0); + CASE_SET_CR(IRR1); + CASE_SET_CR(IRR2); + CASE_SET_CR(IRR3); + CASE_SET_CR(ITV); + CASE_SET_CR(PMV); + CASE_SET_CR(CMCV); + CASE_SET_CR(LRR0); + CASE_SET_CR(LRR1); + default: + printk(KERN_CRIT "wrong setreg %d\n", regnum); + break; + } +} +#else + +#define __DEFINE_FUNC(name, code) \ + extern const char ia64_native_ ## name ## _direct_start[]; \ + extern const char ia64_native_ ## name ## _direct_end[]; \ + asm (".align 32\n" \ + ".proc ia64_native_" #name "_func\n" \ + "ia64_native_" #name "_func:\n" \ + "ia64_native_" #name "_direct_start:\n" \ + code \ + "ia64_native_" #name "_direct_end:\n" \ + "br.cond.sptk.many b6\n" \ + ".endp ia64_native_" #name "_func\n") + +#define DEFINE_VOID_FUNC0(name, code) \ + extern void \ + ia64_native_ ## name ## _func(void); \ + __DEFINE_FUNC(name, code) + +#define DEFINE_VOID_FUNC1(name, code) \ + extern void \ + ia64_native_ ## name ## _func(unsigned long arg); \ + __DEFINE_FUNC(name, code) + +#define DEFINE_VOID_FUNC1_VOID(name, code) \ + extern void \ + ia64_native_ ## name ## _func(void *arg); \ + __DEFINE_FUNC(name, code) + +#define DEFINE_VOID_FUNC2(name, code) \ + extern void \ + ia64_native_ ## name ## _func(unsigned long arg0, \ + unsigned long arg1); \ + __DEFINE_FUNC(name, code) + +#define DEFINE_FUNC0(name, code) \ + extern unsigned long \ + ia64_native_ ## name ## _func(void); \ + __DEFINE_FUNC(name, code) + +#define DEFINE_FUNC1(name, type, code) \ + extern unsigned long \ + ia64_native_ ## name ## _func(type arg); \ + __DEFINE_FUNC(name, code) + +DEFINE_VOID_FUNC1_VOID(fc, + "fc r8\n"); +DEFINE_VOID_FUNC1(intrin_local_irq_restore, + ";;\n" + " cmp.ne p6, p7 = r8, r0\n" + ";;\n" + "(p6) ssm psr.i\n" + "(p7) rsm psr.i\n" + ";;\n" + "(p6) srlz.d\n"); + +DEFINE_VOID_FUNC2(ptcga, + "ptc.ga r8, r9\n"); +DEFINE_VOID_FUNC2(set_rr, + "mov rr[r8] = r9\n"); + +/* ia64_native_getreg(_IA64_REG_PSR) & IA64_PSR_I */ +DEFINE_FUNC0(get_psr_i, + "mov r2 = " __stringify(1 << IA64_PSR_I_BIT) "\n" + "mov r8 = psr\n" + ";;\n" + "and r8 = r2, r8\n"); + +DEFINE_FUNC1(thash, unsigned long, + "thash r8 = r8\n"); +DEFINE_FUNC1(get_cpuid, int, + "mov r8 = cpuid[r8]\n"); +DEFINE_FUNC1(get_pmd, int, + "mov r8 = pmd[r8]\n"); +DEFINE_FUNC1(get_rr, unsigned long, + "mov r8 = rr[r8]\n"); + +DEFINE_VOID_FUNC0(ssm_i, + "ssm psr.i\n"); +DEFINE_VOID_FUNC0(rsm_i, + "rsm psr.i\n"); + +extern void +ia64_native_set_rr0_to_rr4_func(unsigned long val0, unsigned long val1, + unsigned long val2, unsigned long val3, + unsigned long val4); +__DEFINE_FUNC(set_rr0_to_rr4, + "mov rr[r0] = r8\n" + "movl r2 = 0x2000000000000000\n" + ";;\n" + "mov rr[r2] = r9\n" + "shl r3 = r2, 1\n" /* movl r3 = 0x4000000000000000 */ + ";;\n" + "add r2 = r2, r3\n" /* movl r2 = 0x6000000000000000 */ + "mov rr[r3] = r10\n" + ";;\n" + "mov rr[r2] = r11\n" + "shl r3 = r3, 1\n" /* movl r3 = 0x8000000000000000 */ + ";;\n" + "mov rr[r3] = r14\n"); + +extern unsigned long ia64_native_getreg_func(int regnum); +asm(".global ia64_native_getreg_func\n"); +#define __DEFINE_GET_REG(id, reg) \ + "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ + ";;\n" \ + "cmp.eq p6, p0 = r2, r8\n" \ + ";;\n" \ + "(p6) mov r8 = " #reg "\n" \ + "(p6) br.cond.sptk.many b6\n" \ + ";;\n" +#define __DEFINE_GET_AR(id, reg) __DEFINE_GET_REG(AR_ ## id, ar.reg) +#define __DEFINE_GET_CR(id, reg) __DEFINE_GET_REG(CR_ ## id, cr.reg) + +__DEFINE_FUNC(getreg, + __DEFINE_GET_REG(GP, gp) + /*__DEFINE_GET_REG(IP, ip)*/ /* returned ip value shouldn't be constant */ + __DEFINE_GET_REG(PSR, psr) + __DEFINE_GET_REG(TP, tp) + __DEFINE_GET_REG(SP, sp) + + __DEFINE_GET_REG(AR_KR0, ar0) + __DEFINE_GET_REG(AR_KR1, ar1) + __DEFINE_GET_REG(AR_KR2, ar2) + __DEFINE_GET_REG(AR_KR3, ar3) + __DEFINE_GET_REG(AR_KR4, ar4) + __DEFINE_GET_REG(AR_KR5, ar5) + __DEFINE_GET_REG(AR_KR6, ar6) + __DEFINE_GET_REG(AR_KR7, ar7) + __DEFINE_GET_AR(RSC, rsc) + __DEFINE_GET_AR(BSP, bsp) + __DEFINE_GET_AR(BSPSTORE, bspstore) + __DEFINE_GET_AR(RNAT, rnat) + __DEFINE_GET_AR(FCR, fcr) + __DEFINE_GET_AR(EFLAG, eflag) + __DEFINE_GET_AR(CSD, csd) + __DEFINE_GET_AR(SSD, ssd) + __DEFINE_GET_REG(AR_CFLAG, ar27) + __DEFINE_GET_AR(FSR, fsr) + __DEFINE_GET_AR(FIR, fir) + __DEFINE_GET_AR(FDR, fdr) + __DEFINE_GET_AR(CCV, ccv) + __DEFINE_GET_AR(UNAT, unat) + __DEFINE_GET_AR(FPSR, fpsr) + __DEFINE_GET_AR(ITC, itc) + __DEFINE_GET_AR(PFS, pfs) + __DEFINE_GET_AR(LC, lc) + __DEFINE_GET_AR(EC, ec) + + __DEFINE_GET_CR(DCR, dcr) + __DEFINE_GET_CR(ITM, itm) + __DEFINE_GET_CR(IVA, iva) + __DEFINE_GET_CR(PTA, pta) + __DEFINE_GET_CR(IPSR, ipsr) + __DEFINE_GET_CR(ISR, isr) + __DEFINE_GET_CR(IIP, iip) + __DEFINE_GET_CR(IFA, ifa) + __DEFINE_GET_CR(ITIR, itir) + __DEFINE_GET_CR(IIPA, iipa) + __DEFINE_GET_CR(IFS, ifs) + __DEFINE_GET_CR(IIM, iim) + __DEFINE_GET_CR(IHA, iha) + __DEFINE_GET_CR(LID, lid) + __DEFINE_GET_CR(IVR, ivr) + __DEFINE_GET_CR(TPR, tpr) + __DEFINE_GET_CR(EOI, eoi) + __DEFINE_GET_CR(IRR0, irr0) + __DEFINE_GET_CR(IRR1, irr1) + __DEFINE_GET_CR(IRR2, irr2) + __DEFINE_GET_CR(IRR3, irr3) + __DEFINE_GET_CR(ITV, itv) + __DEFINE_GET_CR(PMV, pmv) + __DEFINE_GET_CR(CMCV, cmcv) + __DEFINE_GET_CR(LRR0, lrr0) + __DEFINE_GET_CR(LRR1, lrr1) + + "mov r8 = -1\n" /* unsupported case */ + ); + +extern void ia64_native_setreg_func(int regnum, unsigned long val); +asm(".global ia64_native_setreg_func\n"); +#define __DEFINE_SET_REG(id, reg) \ + "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ + ";;\n" \ + "cmp.eq p6, p0 = r2, r9\n" \ + ";;\n" \ + "(p6) mov " #reg " = r8\n" \ + "(p6) br.cond.sptk.many b6\n" \ + ";;\n" +#define __DEFINE_SET_AR(id, reg) __DEFINE_SET_REG(AR_ ## id, ar.reg) +#define __DEFINE_SET_CR(id, reg) __DEFINE_SET_REG(CR_ ## id, cr.reg) +__DEFINE_FUNC(setreg, + "mov r2 = " __stringify(_IA64_REG_PSR_L) "\n" + ";;\n" + "cmp.eq p6, p0 = r2, r9\n" + ";;\n" + "(p6) mov psr.l = r8\n" +#ifdef HAVE_SERIALIZE_DIRECTIVE + ".serialize.data\n" +#endif + "(p6) br.cond.sptk.many b6\n" + __DEFINE_SET_REG(GP, gp) + __DEFINE_SET_REG(SP, sp) + + __DEFINE_SET_REG(AR_KR0, ar0) + __DEFINE_SET_REG(AR_KR1, ar1) + __DEFINE_SET_REG(AR_KR2, ar2) + __DEFINE_SET_REG(AR_KR3, ar3) + __DEFINE_SET_REG(AR_KR4, ar4) + __DEFINE_SET_REG(AR_KR5, ar5) + __DEFINE_SET_REG(AR_KR6, ar6) + __DEFINE_SET_REG(AR_KR7, ar7) + __DEFINE_SET_AR(RSC, rsc) + __DEFINE_SET_AR(BSP, bsp) + __DEFINE_SET_AR(BSPSTORE, bspstore) + __DEFINE_SET_AR(RNAT, rnat) + __DEFINE_SET_AR(FCR, fcr) + __DEFINE_SET_AR(EFLAG, eflag) + __DEFINE_SET_AR(CSD, csd) + __DEFINE_SET_AR(SSD, ssd) + __DEFINE_SET_REG(AR_CFLAG, ar27) + __DEFINE_SET_AR(FSR, fsr) + __DEFINE_SET_AR(FIR, fir) + __DEFINE_SET_AR(FDR, fdr) + __DEFINE_SET_AR(CCV, ccv) + __DEFINE_SET_AR(UNAT, unat) + __DEFINE_SET_AR(FPSR, fpsr) + __DEFINE_SET_AR(ITC, itc) + __DEFINE_SET_AR(PFS, pfs) + __DEFINE_SET_AR(LC, lc) + __DEFINE_SET_AR(EC, ec) + + __DEFINE_SET_CR(DCR, dcr) + __DEFINE_SET_CR(ITM, itm) + __DEFINE_SET_CR(IVA, iva) + __DEFINE_SET_CR(PTA, pta) + __DEFINE_SET_CR(IPSR, ipsr) + __DEFINE_SET_CR(ISR, isr) + __DEFINE_SET_CR(IIP, iip) + __DEFINE_SET_CR(IFA, ifa) + __DEFINE_SET_CR(ITIR, itir) + __DEFINE_SET_CR(IIPA, iipa) + __DEFINE_SET_CR(IFS, ifs) + __DEFINE_SET_CR(IIM, iim) + __DEFINE_SET_CR(IHA, iha) + __DEFINE_SET_CR(LID, lid) + __DEFINE_SET_CR(IVR, ivr) + __DEFINE_SET_CR(TPR, tpr) + __DEFINE_SET_CR(EOI, eoi) + __DEFINE_SET_CR(IRR0, irr0) + __DEFINE_SET_CR(IRR1, irr1) + __DEFINE_SET_CR(IRR2, irr2) + __DEFINE_SET_CR(IRR3, irr3) + __DEFINE_SET_CR(ITV, itv) + __DEFINE_SET_CR(PMV, pmv) + __DEFINE_SET_CR(CMCV, cmcv) + __DEFINE_SET_CR(LRR0, lrr0) + __DEFINE_SET_CR(LRR1, lrr1) + ); +#endif + +struct pv_cpu_ops pv_cpu_ops = { + .fc = ia64_native_fc_func, + .thash = ia64_native_thash_func, + .get_cpuid = ia64_native_get_cpuid_func, + .get_pmd = ia64_native_get_pmd_func, + .ptcga = ia64_native_ptcga_func, + .get_rr = ia64_native_get_rr_func, + .set_rr = ia64_native_set_rr_func, + .set_rr0_to_rr4 = ia64_native_set_rr0_to_rr4_func, + .ssm_i = ia64_native_ssm_i_func, + .getreg = ia64_native_getreg_func, + .setreg = ia64_native_setreg_func, + .rsm_i = ia64_native_rsm_i_func, + .get_psr_i = ia64_native_get_psr_i_func, + .intrin_local_irq_restore + = ia64_native_intrin_local_irq_restore_func, +}; +EXPORT_SYMBOL(pv_cpu_ops); + +/****************************************************************************** + * replacement of hand written assembly codes. + */ + +void +paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch) +{ + extern unsigned long paravirt_switch_to_targ; + extern unsigned long paravirt_leave_syscall_targ; + extern unsigned long paravirt_work_processed_syscall_targ; + extern unsigned long paravirt_leave_kernel_targ; + + paravirt_switch_to_targ = cpu_asm_switch->switch_to; + paravirt_leave_syscall_targ = cpu_asm_switch->leave_syscall; + paravirt_work_processed_syscall_targ = + cpu_asm_switch->work_processed_syscall; + paravirt_leave_kernel_targ = cpu_asm_switch->leave_kernel; +} + +/*************************************************************************** + * pv_iosapic_ops + * iosapic read/write hooks. + */ + +static unsigned int +ia64_native_iosapic_read(char __iomem *iosapic, unsigned int reg) +{ + return __ia64_native_iosapic_read(iosapic, reg); +} + +static void +ia64_native_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val) +{ + __ia64_native_iosapic_write(iosapic, reg, val); +} + +struct pv_iosapic_ops pv_iosapic_ops = { + .pcat_compat_init = ia64_native_iosapic_pcat_compat_init, + .__get_irq_chip = ia64_native_iosapic_get_irq_chip, + + .__read = ia64_native_iosapic_read, + .__write = ia64_native_iosapic_write, +}; + +/*************************************************************************** + * pv_irq_ops + * irq operations + */ + +struct pv_irq_ops pv_irq_ops = { + .register_ipi = ia64_native_register_ipi, + + .assign_irq_vector = ia64_native_assign_irq_vector, + .free_irq_vector = ia64_native_free_irq_vector, + .register_percpu_irq = ia64_native_register_percpu_irq, + + .resend_irq = ia64_native_resend_irq, +}; + +/*************************************************************************** + * pv_time_ops + * time operations + */ +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; + +static int +ia64_native_do_steal_accounting(unsigned long *new_itm) +{ + return 0; +} + +struct pv_time_ops pv_time_ops = { + .do_steal_accounting = ia64_native_do_steal_accounting, + .sched_clock = ia64_native_sched_clock, +}; + +/*************************************************************************** + * binary pacthing + * pv_init_ops.patch_bundle + */ + +#ifdef ASM_SUPPORTED +#define IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg) \ + __DEFINE_FUNC(get_ ## name, \ + ";;\n" \ + "mov r8 = " #reg "\n" \ + ";;\n") + +#define IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg) \ + __DEFINE_FUNC(set_ ## name, \ + ";;\n" \ + "mov " #reg " = r8\n" \ + ";;\n") + +#define IA64_NATIVE_PATCH_DEFINE_REG(name, reg) \ + IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg); \ + IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg) \ + +#define IA64_NATIVE_PATCH_DEFINE_AR(name, reg) \ + IA64_NATIVE_PATCH_DEFINE_REG(ar_ ## name, ar.reg) + +#define IA64_NATIVE_PATCH_DEFINE_CR(name, reg) \ + IA64_NATIVE_PATCH_DEFINE_REG(cr_ ## name, cr.reg) + + +IA64_NATIVE_PATCH_DEFINE_GET_REG(psr, psr); +IA64_NATIVE_PATCH_DEFINE_GET_REG(tp, tp); + +/* IA64_NATIVE_PATCH_DEFINE_SET_REG(psr_l, psr.l); */ +__DEFINE_FUNC(set_psr_l, + ";;\n" + "mov psr.l = r8\n" +#ifdef HAVE_SERIALIZE_DIRECTIVE + ".serialize.data\n" +#endif + ";;\n"); + +IA64_NATIVE_PATCH_DEFINE_REG(gp, gp); +IA64_NATIVE_PATCH_DEFINE_REG(sp, sp); + +IA64_NATIVE_PATCH_DEFINE_REG(kr0, ar0); +IA64_NATIVE_PATCH_DEFINE_REG(kr1, ar1); +IA64_NATIVE_PATCH_DEFINE_REG(kr2, ar2); +IA64_NATIVE_PATCH_DEFINE_REG(kr3, ar3); +IA64_NATIVE_PATCH_DEFINE_REG(kr4, ar4); +IA64_NATIVE_PATCH_DEFINE_REG(kr5, ar5); +IA64_NATIVE_PATCH_DEFINE_REG(kr6, ar6); +IA64_NATIVE_PATCH_DEFINE_REG(kr7, ar7); + +IA64_NATIVE_PATCH_DEFINE_AR(rsc, rsc); +IA64_NATIVE_PATCH_DEFINE_AR(bsp, bsp); +IA64_NATIVE_PATCH_DEFINE_AR(bspstore, bspstore); +IA64_NATIVE_PATCH_DEFINE_AR(rnat, rnat); +IA64_NATIVE_PATCH_DEFINE_AR(fcr, fcr); +IA64_NATIVE_PATCH_DEFINE_AR(eflag, eflag); +IA64_NATIVE_PATCH_DEFINE_AR(csd, csd); +IA64_NATIVE_PATCH_DEFINE_AR(ssd, ssd); +IA64_NATIVE_PATCH_DEFINE_REG(ar27, ar27); +IA64_NATIVE_PATCH_DEFINE_AR(fsr, fsr); +IA64_NATIVE_PATCH_DEFINE_AR(fir, fir); +IA64_NATIVE_PATCH_DEFINE_AR(fdr, fdr); +IA64_NATIVE_PATCH_DEFINE_AR(ccv, ccv); +IA64_NATIVE_PATCH_DEFINE_AR(unat, unat); +IA64_NATIVE_PATCH_DEFINE_AR(fpsr, fpsr); +IA64_NATIVE_PATCH_DEFINE_AR(itc, itc); +IA64_NATIVE_PATCH_DEFINE_AR(pfs, pfs); +IA64_NATIVE_PATCH_DEFINE_AR(lc, lc); +IA64_NATIVE_PATCH_DEFINE_AR(ec, ec); + +IA64_NATIVE_PATCH_DEFINE_CR(dcr, dcr); +IA64_NATIVE_PATCH_DEFINE_CR(itm, itm); +IA64_NATIVE_PATCH_DEFINE_CR(iva, iva); +IA64_NATIVE_PATCH_DEFINE_CR(pta, pta); +IA64_NATIVE_PATCH_DEFINE_CR(ipsr, ipsr); +IA64_NATIVE_PATCH_DEFINE_CR(isr, isr); +IA64_NATIVE_PATCH_DEFINE_CR(iip, iip); +IA64_NATIVE_PATCH_DEFINE_CR(ifa, ifa); +IA64_NATIVE_PATCH_DEFINE_CR(itir, itir); +IA64_NATIVE_PATCH_DEFINE_CR(iipa, iipa); +IA64_NATIVE_PATCH_DEFINE_CR(ifs, ifs); +IA64_NATIVE_PATCH_DEFINE_CR(iim, iim); +IA64_NATIVE_PATCH_DEFINE_CR(iha, iha); +IA64_NATIVE_PATCH_DEFINE_CR(lid, lid); +IA64_NATIVE_PATCH_DEFINE_CR(ivr, ivr); +IA64_NATIVE_PATCH_DEFINE_CR(tpr, tpr); +IA64_NATIVE_PATCH_DEFINE_CR(eoi, eoi); +IA64_NATIVE_PATCH_DEFINE_CR(irr0, irr0); +IA64_NATIVE_PATCH_DEFINE_CR(irr1, irr1); +IA64_NATIVE_PATCH_DEFINE_CR(irr2, irr2); +IA64_NATIVE_PATCH_DEFINE_CR(irr3, irr3); +IA64_NATIVE_PATCH_DEFINE_CR(itv, itv); +IA64_NATIVE_PATCH_DEFINE_CR(pmv, pmv); +IA64_NATIVE_PATCH_DEFINE_CR(cmcv, cmcv); +IA64_NATIVE_PATCH_DEFINE_CR(lrr0, lrr0); +IA64_NATIVE_PATCH_DEFINE_CR(lrr1, lrr1); + +static const struct paravirt_patch_bundle_elem ia64_native_patch_bundle_elems[] +__initdata_or_module = +{ +#define IA64_NATIVE_PATCH_BUNDLE_ELEM(name, type) \ + { \ + (void*)ia64_native_ ## name ## _direct_start, \ + (void*)ia64_native_ ## name ## _direct_end, \ + PARAVIRT_PATCH_TYPE_ ## type, \ + } + + IA64_NATIVE_PATCH_BUNDLE_ELEM(fc, FC), + IA64_NATIVE_PATCH_BUNDLE_ELEM(thash, THASH), + IA64_NATIVE_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID), + IA64_NATIVE_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD), + IA64_NATIVE_PATCH_BUNDLE_ELEM(ptcga, PTCGA), + IA64_NATIVE_PATCH_BUNDLE_ELEM(get_rr, GET_RR), + IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr, SET_RR), + IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4), + IA64_NATIVE_PATCH_BUNDLE_ELEM(ssm_i, SSM_I), + IA64_NATIVE_PATCH_BUNDLE_ELEM(rsm_i, RSM_I), + IA64_NATIVE_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I), + IA64_NATIVE_PATCH_BUNDLE_ELEM(intrin_local_irq_restore, + INTRIN_LOCAL_IRQ_RESTORE), + +#define IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg) \ + { \ + (void*)ia64_native_get_ ## name ## _direct_start, \ + (void*)ia64_native_get_ ## name ## _direct_end, \ + PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg, \ + } + +#define IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ + { \ + (void*)ia64_native_set_ ## name ## _direct_start, \ + (void*)ia64_native_set_ ## name ## _direct_end, \ + PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg, \ + } + +#define IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(name, reg) \ + IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg), \ + IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ + +#define IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(name, reg) \ + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar_ ## name, AR_ ## reg) + +#define IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(name, reg) \ + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(cr_ ## name, CR_ ## reg) + + IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(psr, PSR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(tp, TP), + + IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(psr_l, PSR_L), + + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(gp, GP), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(sp, SP), + + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr0, AR_KR0), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr1, AR_KR1), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr2, AR_KR2), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr3, AR_KR3), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr4, AR_KR4), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr5, AR_KR5), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr6, AR_KR6), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr7, AR_KR7), + + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rsc, RSC), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bsp, BSP), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bspstore, BSPSTORE), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rnat, RNAT), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fcr, FCR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(eflag, EFLAG), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(csd, CSD), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ssd, SSD), + IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar27, AR_CFLAG), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fsr, FSR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fir, FIR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fdr, FDR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ccv, CCV), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(unat, UNAT), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fpsr, FPSR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(itc, ITC), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(pfs, PFS), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(lc, LC), + IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ec, EC), + + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(dcr, DCR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itm, ITM), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iva, IVA), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pta, PTA), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ipsr, IPSR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(isr, ISR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iip, IIP), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifa, IFA), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itir, ITIR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iipa, IIPA), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifs, IFS), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iim, IIM), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iha, IHA), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lid, LID), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ivr, IVR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(tpr, TPR), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(eoi, EOI), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr0, IRR0), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr1, IRR1), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr2, IRR2), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr3, IRR3), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itv, ITV), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pmv, PMV), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(cmcv, CMCV), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr0, LRR0), + IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr1, LRR1), +}; + +unsigned long __init_or_module +ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type) +{ + const unsigned long nelems = sizeof(ia64_native_patch_bundle_elems) / + sizeof(ia64_native_patch_bundle_elems[0]); + + return __paravirt_patch_apply_bundle(sbundle, ebundle, type, + ia64_native_patch_bundle_elems, + nelems, NULL); +} +#endif /* ASM_SUPPOTED */ + +extern const char ia64_native_switch_to[]; +extern const char ia64_native_leave_syscall[]; +extern const char ia64_native_work_processed_syscall[]; +extern const char ia64_native_leave_kernel[]; + +const struct paravirt_patch_branch_target ia64_native_branch_target[] +__initconst = { +#define PARAVIRT_BR_TARGET(name, type) \ + { \ + ia64_native_ ## name, \ + PARAVIRT_PATCH_TYPE_BR_ ## type, \ + } + PARAVIRT_BR_TARGET(switch_to, SWITCH_TO), + PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL), + PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL), + PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL), +}; + +static void __init +ia64_native_patch_branch(unsigned long tag, unsigned long type) +{ + const unsigned long nelem = + sizeof(ia64_native_branch_target) / + sizeof(ia64_native_branch_target[0]); + __paravirt_patch_apply_branch(tag, type, + ia64_native_branch_target, nelem); +} diff --git a/arch/ia64/kernel/paravirt_inst.h b/arch/ia64/kernel/paravirt_inst.h new file mode 100644 index 00000000000..1ad7512b5f6 --- /dev/null +++ b/arch/ia64/kernel/paravirt_inst.h @@ -0,0 +1,28 @@ +/****************************************************************************** + * linux/arch/ia64/xen/paravirt_inst.h + * + * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifdef __IA64_ASM_PARAVIRTUALIZED_PVCHECK +#include <asm/native/pvchk_inst.h> +#else +#include <asm/native/inst.h> +#endif + diff --git a/arch/ia64/kernel/paravirt_patch.c b/arch/ia64/kernel/paravirt_patch.c new file mode 100644 index 00000000000..bfdfef1b1ff --- /dev/null +++ b/arch/ia64/kernel/paravirt_patch.c @@ -0,0 +1,514 @@ +/****************************************************************************** + * linux/arch/ia64/xen/paravirt_patch.c + * + * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <linux/init.h> +#include <asm/intrinsics.h> +#include <asm/kprobes.h> +#include <asm/paravirt.h> +#include <asm/paravirt_patch.h> + +typedef union ia64_inst { + struct { + unsigned long long qp : 6; + unsigned long long : 31; + unsigned long long opcode : 4; + unsigned long long reserved : 23; + } generic; + unsigned long long l; +} ia64_inst_t; + +/* + * flush_icache_range() can't be used here. + * we are here before cpu_init() which initializes + * ia64_i_cache_stride_shift. flush_icache_range() uses it. + */ +void __init_or_module +paravirt_flush_i_cache_range(const void *instr, unsigned long size) +{ + extern void paravirt_fc_i(const void *addr); + unsigned long i; + + for (i = 0; i < size; i += sizeof(bundle_t)) + paravirt_fc_i(instr + i); +} + +bundle_t* __init_or_module +paravirt_get_bundle(unsigned long tag) +{ + return (bundle_t *)(tag & ~3UL); +} + +unsigned long __init_or_module +paravirt_get_slot(unsigned long tag) +{ + return tag & 3UL; +} + +unsigned long __init_or_module +paravirt_get_num_inst(unsigned long stag, unsigned long etag) +{ + bundle_t *sbundle = paravirt_get_bundle(stag); + unsigned long sslot = paravirt_get_slot(stag); + bundle_t *ebundle = paravirt_get_bundle(etag); + unsigned long eslot = paravirt_get_slot(etag); + + return (ebundle - sbundle) * 3 + eslot - sslot + 1; +} + +unsigned long __init_or_module +paravirt_get_next_tag(unsigned long tag) +{ + unsigned long slot = paravirt_get_slot(tag); + + switch (slot) { + case 0: + case 1: + return tag + 1; + case 2: { + bundle_t *bundle = paravirt_get_bundle(tag); + return (unsigned long)(bundle + 1); + } + default: + BUG(); + } + /* NOTREACHED */ +} + +ia64_inst_t __init_or_module +paravirt_read_slot0(const bundle_t *bundle) +{ + ia64_inst_t inst; + inst.l = bundle->quad0.slot0; + return inst; +} + +ia64_inst_t __init_or_module +paravirt_read_slot1(const bundle_t *bundle) +{ + ia64_inst_t inst; + inst.l = bundle->quad0.slot1_p0 | + ((unsigned long long)bundle->quad1.slot1_p1 << 18UL); + return inst; +} + +ia64_inst_t __init_or_module +paravirt_read_slot2(const bundle_t *bundle) +{ + ia64_inst_t inst; + inst.l = bundle->quad1.slot2; + return inst; +} + +ia64_inst_t __init_or_module +paravirt_read_inst(unsigned long tag) +{ + bundle_t *bundle = paravirt_get_bundle(tag); + unsigned long slot = paravirt_get_slot(tag); + + switch (slot) { + case 0: + return paravirt_read_slot0(bundle); + case 1: + return paravirt_read_slot1(bundle); + case 2: + return paravirt_read_slot2(bundle); + default: + BUG(); + } + /* NOTREACHED */ +} + +void __init_or_module +paravirt_write_slot0(bundle_t *bundle, ia64_inst_t inst) +{ + bundle->quad0.slot0 = inst.l; +} + +void __init_or_module +paravirt_write_slot1(bundle_t *bundle, ia64_inst_t inst) +{ + bundle->quad0.slot1_p0 = inst.l; + bundle->quad1.slot1_p1 = inst.l >> 18UL; +} + +void __init_or_module +paravirt_write_slot2(bundle_t *bundle, ia64_inst_t inst) +{ + bundle->quad1.slot2 = inst.l; +} + +void __init_or_module +paravirt_write_inst(unsigned long tag, ia64_inst_t inst) +{ + bundle_t *bundle = paravirt_get_bundle(tag); + unsigned long slot = paravirt_get_slot(tag); + + switch (slot) { + case 0: + paravirt_write_slot0(bundle, inst); + break; + case 1: + paravirt_write_slot1(bundle, inst); + break; + case 2: + paravirt_write_slot2(bundle, inst); + break; + default: + BUG(); + break; + } + paravirt_flush_i_cache_range(bundle, sizeof(*bundle)); +} + +/* for debug */ +void +paravirt_print_bundle(const bundle_t *bundle) +{ + const unsigned long *quad = (const unsigned long *)bundle; + ia64_inst_t slot0 = paravirt_read_slot0(bundle); + ia64_inst_t slot1 = paravirt_read_slot1(bundle); + ia64_inst_t slot2 = paravirt_read_slot2(bundle); + + printk(KERN_DEBUG + "bundle 0x%p 0x%016lx 0x%016lx\n", bundle, quad[0], quad[1]); + printk(KERN_DEBUG + "bundle template 0x%x\n", + bundle->quad0.template); + printk(KERN_DEBUG + "slot0 0x%lx slot1_p0 0x%lx slot1_p1 0x%lx slot2 0x%lx\n", + (unsigned long)bundle->quad0.slot0, + (unsigned long)bundle->quad0.slot1_p0, + (unsigned long)bundle->quad1.slot1_p1, + (unsigned long)bundle->quad1.slot2); + printk(KERN_DEBUG + "slot0 0x%016llx slot1 0x%016llx slot2 0x%016llx\n", + slot0.l, slot1.l, slot2.l); +} + +static int noreplace_paravirt __init_or_module = 0; + +static int __init setup_noreplace_paravirt(char *str) +{ + noreplace_paravirt = 1; + return 1; +} +__setup("noreplace-paravirt", setup_noreplace_paravirt); + +#ifdef ASM_SUPPORTED +static void __init_or_module +fill_nop_bundle(void *sbundle, void *ebundle) +{ + extern const char paravirt_nop_bundle[]; + extern const unsigned long paravirt_nop_bundle_size; + + void *bundle = sbundle; + + BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0); + BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0); + + while (bundle < ebundle) { + memcpy(bundle, paravirt_nop_bundle, paravirt_nop_bundle_size); + + bundle += paravirt_nop_bundle_size; + } +} + +/* helper function */ +unsigned long __init_or_module +__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type, + const struct paravirt_patch_bundle_elem *elems, + unsigned long nelems, + const struct paravirt_patch_bundle_elem **found) +{ + unsigned long used = 0; + unsigned long i; + + BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0); + BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0); + + found = NULL; + for (i = 0; i < nelems; i++) { + const struct paravirt_patch_bundle_elem *p = &elems[i]; + if (p->type == type) { + unsigned long need = p->ebundle - p->sbundle; + unsigned long room = ebundle - sbundle; + + if (found != NULL) + *found = p; + + if (room < need) { + /* no room to replace. skip it */ + printk(KERN_DEBUG + "the space is too small to put " + "bundles. type %ld need %ld room %ld\n", + type, need, room); + break; + } + + used = need; + memcpy(sbundle, p->sbundle, used); + break; + } + } + + return used; +} + +void __init_or_module +paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start, + const struct paravirt_patch_site_bundle *end) +{ + const struct paravirt_patch_site_bundle *p; + + if (noreplace_paravirt) + return; + if (pv_init_ops.patch_bundle == NULL) + return; + + for (p = start; p < end; p++) { + unsigned long used; + + used = (*pv_init_ops.patch_bundle)(p->sbundle, p->ebundle, + p->type); + if (used == 0) + continue; + + fill_nop_bundle(p->sbundle + used, p->ebundle); + paravirt_flush_i_cache_range(p->sbundle, + p->ebundle - p->sbundle); + } + ia64_sync_i(); + ia64_srlz_i(); +} + +/* + * nop.i, nop.m, nop.f instruction are same format. + * but nop.b has differennt format. + * This doesn't support nop.b for now. + */ +static void __init_or_module +fill_nop_inst(unsigned long stag, unsigned long etag) +{ + extern const bundle_t paravirt_nop_mfi_inst_bundle[]; + unsigned long tag; + const ia64_inst_t nop_inst = + paravirt_read_slot0(paravirt_nop_mfi_inst_bundle); + + for (tag = stag; tag < etag; tag = paravirt_get_next_tag(tag)) + paravirt_write_inst(tag, nop_inst); +} + +void __init_or_module +paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start, + const struct paravirt_patch_site_inst *end) +{ + const struct paravirt_patch_site_inst *p; + + if (noreplace_paravirt) + return; + if (pv_init_ops.patch_inst == NULL) + return; + + for (p = start; p < end; p++) { + unsigned long tag; + bundle_t *sbundle; + bundle_t *ebundle; + + tag = (*pv_init_ops.patch_inst)(p->stag, p->etag, p->type); + if (tag == p->stag) + continue; + + fill_nop_inst(tag, p->etag); + sbundle = paravirt_get_bundle(p->stag); + ebundle = paravirt_get_bundle(p->etag) + 1; + paravirt_flush_i_cache_range(sbundle, (ebundle - sbundle) * + sizeof(bundle_t)); + } + ia64_sync_i(); + ia64_srlz_i(); +} +#endif /* ASM_SUPPOTED */ + +/* brl.cond.sptk.many <target64> X3 */ +typedef union inst_x3_op { + ia64_inst_t inst; + struct { + unsigned long qp: 6; + unsigned long btyp: 3; + unsigned long unused: 3; + unsigned long p: 1; + unsigned long imm20b: 20; + unsigned long wh: 2; + unsigned long d: 1; + unsigned long i: 1; + unsigned long opcode: 4; + }; + unsigned long l; +} inst_x3_op_t; + +typedef union inst_x3_imm { + ia64_inst_t inst; + struct { + unsigned long unused: 2; + unsigned long imm39: 39; + }; + unsigned long l; +} inst_x3_imm_t; + +void __init_or_module +paravirt_patch_reloc_brl(unsigned long tag, const void *target) +{ + unsigned long tag_op = paravirt_get_next_tag(tag); + unsigned long tag_imm = tag; + bundle_t *bundle = paravirt_get_bundle(tag); + + ia64_inst_t inst_op = paravirt_read_inst(tag_op); + ia64_inst_t inst_imm = paravirt_read_inst(tag_imm); + + inst_x3_op_t inst_x3_op = { .l = inst_op.l }; + inst_x3_imm_t inst_x3_imm = { .l = inst_imm.l }; + + unsigned long imm60 = + ((unsigned long)target - (unsigned long)bundle) >> 4; + + BUG_ON(paravirt_get_slot(tag) != 1); /* MLX */ + BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0); + + /* imm60[59] 1bit */ + inst_x3_op.i = (imm60 >> 59) & 1; + /* imm60[19:0] 20bit */ + inst_x3_op.imm20b = imm60 & ((1UL << 20) - 1); + /* imm60[58:20] 39bit */ + inst_x3_imm.imm39 = (imm60 >> 20) & ((1UL << 39) - 1); + + inst_op.l = inst_x3_op.l; + inst_imm.l = inst_x3_imm.l; + + paravirt_write_inst(tag_op, inst_op); + paravirt_write_inst(tag_imm, inst_imm); +} + +/* br.cond.sptk.many <target25> B1 */ +typedef union inst_b1 { + ia64_inst_t inst; + struct { + unsigned long qp: 6; + unsigned long btype: 3; + unsigned long unused: 3; + unsigned long p: 1; + unsigned long imm20b: 20; + unsigned long wh: 2; + unsigned long d: 1; + unsigned long s: 1; + unsigned long opcode: 4; + }; + unsigned long l; +} inst_b1_t; + +void __init +paravirt_patch_reloc_br(unsigned long tag, const void *target) +{ + bundle_t *bundle = paravirt_get_bundle(tag); + ia64_inst_t inst = paravirt_read_inst(tag); + unsigned long target25 = (unsigned long)target - (unsigned long)bundle; + inst_b1_t inst_b1; + + BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0); + + inst_b1.l = inst.l; + if (target25 & (1UL << 63)) + inst_b1.s = 1; + else + inst_b1.s = 0; + + inst_b1.imm20b = target25 >> 4; + inst.l = inst_b1.l; + + paravirt_write_inst(tag, inst); +} + +void __init +__paravirt_patch_apply_branch( + unsigned long tag, unsigned long type, + const struct paravirt_patch_branch_target *entries, + unsigned int nr_entries) +{ + unsigned int i; + for (i = 0; i < nr_entries; i++) { + if (entries[i].type == type) { + paravirt_patch_reloc_br(tag, entries[i].entry); + break; + } + } +} + +static void __init +paravirt_patch_apply_branch(const struct paravirt_patch_site_branch *start, + const struct paravirt_patch_site_branch *end) +{ + const struct paravirt_patch_site_branch *p; + + if (noreplace_paravirt) + return; + if (pv_init_ops.patch_branch == NULL) + return; + + for (p = start; p < end; p++) + (*pv_init_ops.patch_branch)(p->tag, p->type); + + ia64_sync_i(); + ia64_srlz_i(); +} + +void __init +paravirt_patch_apply(void) +{ + extern const char __start_paravirt_bundles[]; + extern const char __stop_paravirt_bundles[]; + extern const char __start_paravirt_insts[]; + extern const char __stop_paravirt_insts[]; + extern const char __start_paravirt_branches[]; + extern const char __stop_paravirt_branches[]; + + paravirt_patch_apply_bundle((const struct paravirt_patch_site_bundle *) + __start_paravirt_bundles, + (const struct paravirt_patch_site_bundle *) + __stop_paravirt_bundles); + paravirt_patch_apply_inst((const struct paravirt_patch_site_inst *) + __start_paravirt_insts, + (const struct paravirt_patch_site_inst *) + __stop_paravirt_insts); + paravirt_patch_apply_branch((const struct paravirt_patch_site_branch *) + __start_paravirt_branches, + (const struct paravirt_patch_site_branch *) + __stop_paravirt_branches); +} + +/* + * Local variables: + * mode: C + * c-set-style: "linux" + * c-basic-offset: 8 + * tab-width: 8 + * indent-tabs-mode: t + * End: + */ diff --git a/arch/ia64/kernel/paravirt_patchlist.c b/arch/ia64/kernel/paravirt_patchlist.c new file mode 100644 index 00000000000..0a70720662e --- /dev/null +++ b/arch/ia64/kernel/paravirt_patchlist.c @@ -0,0 +1,81 @@ +/****************************************************************************** + * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <linux/bug.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <asm/paravirt.h> + +#define DECLARE(name) \ + extern unsigned long \ + __ia64_native_start_gate_##name##_patchlist[]; \ + extern unsigned long \ + __ia64_native_end_gate_##name##_patchlist[] + +DECLARE(fsyscall); +DECLARE(brl_fsys_bubble_down); +DECLARE(vtop); +DECLARE(mckinley_e9); + +extern unsigned long __start_gate_section[]; + +#define ASSIGN(name) \ + .start_##name##_patchlist = \ + (unsigned long)__ia64_native_start_gate_##name##_patchlist, \ + .end_##name##_patchlist = \ + (unsigned long)__ia64_native_end_gate_##name##_patchlist + +struct pv_patchdata pv_patchdata __initdata = { + ASSIGN(fsyscall), + ASSIGN(brl_fsys_bubble_down), + ASSIGN(vtop), + ASSIGN(mckinley_e9), + + .gate_section = (void*)__start_gate_section, +}; + + +unsigned long __init +paravirt_get_gate_patchlist(enum pv_gate_patchlist type) +{ + +#define CASE(NAME, name) \ + case PV_GATE_START_##NAME: \ + return pv_patchdata.start_##name##_patchlist; \ + case PV_GATE_END_##NAME: \ + return pv_patchdata.end_##name##_patchlist; \ + + switch (type) { + CASE(FSYSCALL, fsyscall); + CASE(BRL_FSYS_BUBBLE_DOWN, brl_fsys_bubble_down); + CASE(VTOP, vtop); + CASE(MCKINLEY_E9, mckinley_e9); + default: + BUG(); + break; + } + return 0; +} + +void * __init +paravirt_get_gate_section(void) +{ + return pv_patchdata.gate_section; +} diff --git a/arch/ia64/kernel/paravirt_patchlist.h b/arch/ia64/kernel/paravirt_patchlist.h new file mode 100644 index 00000000000..67cffc3643a --- /dev/null +++ b/arch/ia64/kernel/paravirt_patchlist.h @@ -0,0 +1,24 @@ +/****************************************************************************** + * linux/arch/ia64/xen/paravirt_patchlist.h + * + * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <asm/native/patchlist.h> + diff --git a/arch/ia64/kernel/paravirtentry.S b/arch/ia64/kernel/paravirtentry.S new file mode 100644 index 00000000000..92d880c4d3d --- /dev/null +++ b/arch/ia64/kernel/paravirtentry.S @@ -0,0 +1,121 @@ +/****************************************************************************** + * linux/arch/ia64/xen/paravirtentry.S + * + * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <linux/init.h> +#include <asm/asmmacro.h> +#include <asm/asm-offsets.h> +#include <asm/paravirt_privop.h> +#include <asm/paravirt_patch.h> +#include "entry.h" + +#define DATA8(sym, init_value) \ + .pushsection .data..read_mostly ; \ + .align 8 ; \ + .global sym ; \ + sym: ; \ + data8 init_value ; \ + .popsection + +#define BRANCH(targ, reg, breg, type) \ + PARAVIRT_PATCH_SITE_BR(PARAVIRT_PATCH_TYPE_BR_ ## type) ; \ + ;; \ + movl reg=targ ; \ + ;; \ + ld8 reg=[reg] ; \ + ;; \ + mov breg=reg ; \ + br.cond.sptk.many breg + +#define BRANCH_PROC(sym, reg, breg, type) \ + DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ + GLOBAL_ENTRY(paravirt_ ## sym) ; \ + BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ; \ + END(paravirt_ ## sym) + +#define BRANCH_PROC_UNWINFO(sym, reg, breg, type) \ + DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ + GLOBAL_ENTRY(paravirt_ ## sym) ; \ + PT_REGS_UNWIND_INFO(0) ; \ + BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ; \ + END(paravirt_ ## sym) + + +BRANCH_PROC(switch_to, r22, b7, SWITCH_TO) +BRANCH_PROC_UNWINFO(leave_syscall, r22, b7, LEAVE_SYSCALL) +BRANCH_PROC(work_processed_syscall, r2, b7, WORK_PROCESSED_SYSCALL) +BRANCH_PROC_UNWINFO(leave_kernel, r22, b7, LEAVE_KERNEL) + + +#ifdef CONFIG_MODULES +#define __INIT_OR_MODULE .text +#define __INITDATA_OR_MODULE .data +#else +#define __INIT_OR_MODULE __INIT +#define __INITDATA_OR_MODULE __INITDATA +#endif /* CONFIG_MODULES */ + + __INIT_OR_MODULE + GLOBAL_ENTRY(paravirt_fc_i) + fc.i r32 + br.ret.sptk.many rp + END(paravirt_fc_i) + __FINIT + + __INIT_OR_MODULE + .align 32 + GLOBAL_ENTRY(paravirt_nop_b_inst_bundle) + { + nop.b 0 + nop.b 0 + nop.b 0 + } + END(paravirt_nop_b_inst_bundle) + __FINIT + + /* NOTE: nop.[mfi] has same format */ + __INIT_OR_MODULE + GLOBAL_ENTRY(paravirt_nop_mfi_inst_bundle) + { + nop.m 0 + nop.f 0 + nop.i 0 + } + END(paravirt_nop_mfi_inst_bundle) + __FINIT + + __INIT_OR_MODULE + GLOBAL_ENTRY(paravirt_nop_bundle) +paravirt_nop_bundle_start: + { + nop 0 + nop 0 + nop 0 + } +paravirt_nop_bundle_end: + END(paravirt_nop_bundle) + __FINIT + + __INITDATA_OR_MODULE + .align 8 + .global paravirt_nop_bundle_size +paravirt_nop_bundle_size: + data8 paravirt_nop_bundle_end - paravirt_nop_bundle_start diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c index 6a4ac7d70b3..1cf09179371 100644 --- a/arch/ia64/kernel/patch.c +++ b/arch/ia64/kernel/patch.c @@ -7,10 +7,10 @@ #include <linux/init.h> #include <linux/string.h> +#include <asm/paravirt.h> #include <asm/patch.h> #include <asm/processor.h> #include <asm/sections.h> -#include <asm/system.h> #include <asm/unistd.h> /* @@ -115,7 +115,30 @@ ia64_patch_vtop (unsigned long start, unsigned long end) ia64_srlz_i(); } -void +/* + * Disable the RSE workaround by turning the conditional branch + * that we tagged in each place the workaround was used into an + * unconditional branch. + */ +void __init +ia64_patch_rse (unsigned long start, unsigned long end) +{ + s32 *offp = (s32 *) start; + u64 ip, *b; + + while (offp < (s32 *) end) { + ip = (u64) offp + *offp; + + b = (u64 *)(ip & -16); + b[1] &= ~0xf800000L; + ia64_fc((void *) ip); + ++offp; + } + ia64_sync_i(); + ia64_srlz_i(); +} + +void __init ia64_patch_mckinley_e9 (unsigned long start, unsigned long end) { static int first_time = 1; @@ -129,19 +152,16 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end) first_time = 0; if (need_workaround) printk(KERN_INFO "Leaving McKinley Errata 9 workaround enabled\n"); - else - printk(KERN_INFO "McKinley Errata 9 workaround not needed; " - "disabling it\n"); } if (need_workaround) return; while (offp < (s32 *) end) { wp = (u64 *) ia64_imva((char *) offp + *offp); - wp[0] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */ - wp[1] = 0x0004000000000200UL; - wp[2] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */ - wp[3] = 0x0084006880000200UL; + wp[0] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */ + wp[1] = 0x0084006880000200UL; + wp[2] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */ + wp[3] = 0x0004000000000200UL; ia64_fc(wp); ia64_fc(wp + 2); ++offp; } @@ -149,16 +169,35 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end) ia64_srlz_i(); } -static void +extern unsigned long ia64_native_fsyscall_table[NR_syscalls]; +extern char ia64_native_fsys_bubble_down[]; +struct pv_fsys_data pv_fsys_data __initdata = { + .fsyscall_table = (unsigned long *)ia64_native_fsyscall_table, + .fsys_bubble_down = (void *)ia64_native_fsys_bubble_down, +}; + +unsigned long * __init +paravirt_get_fsyscall_table(void) +{ + return pv_fsys_data.fsyscall_table; +} + +char * __init +paravirt_get_fsys_bubble_down(void) +{ + return pv_fsys_data.fsys_bubble_down; +} + +static void __init patch_fsyscall_table (unsigned long start, unsigned long end) { - extern unsigned long fsyscall_table[NR_syscalls]; + u64 fsyscall_table = (u64)paravirt_get_fsyscall_table(); s32 *offp = (s32 *) start; u64 ip; while (offp < (s32 *) end) { ip = (u64) ia64_imva((char *) offp + *offp); - ia64_patch_imm64(ip, (u64) fsyscall_table); + ia64_patch_imm64(ip, fsyscall_table); ia64_fc((void *) ip); ++offp; } @@ -166,10 +205,10 @@ patch_fsyscall_table (unsigned long start, unsigned long end) ia64_srlz_i(); } -static void +static void __init patch_brl_fsys_bubble_down (unsigned long start, unsigned long end) { - extern char fsys_bubble_down[]; + u64 fsys_bubble_down = (u64)paravirt_get_fsys_bubble_down(); s32 *offp = (s32 *) start; u64 ip; @@ -184,14 +223,34 @@ patch_brl_fsys_bubble_down (unsigned long start, unsigned long end) ia64_srlz_i(); } -void +void __init ia64_patch_gate (void) { -# define START(name) ((unsigned long) __start_gate_##name##_patchlist) -# define END(name) ((unsigned long)__end_gate_##name##_patchlist) +# define START(name) paravirt_get_gate_patchlist(PV_GATE_START_##name) +# define END(name) paravirt_get_gate_patchlist(PV_GATE_END_##name) + + patch_fsyscall_table(START(FSYSCALL), END(FSYSCALL)); + patch_brl_fsys_bubble_down(START(BRL_FSYS_BUBBLE_DOWN), END(BRL_FSYS_BUBBLE_DOWN)); + ia64_patch_vtop(START(VTOP), END(VTOP)); + ia64_patch_mckinley_e9(START(MCKINLEY_E9), END(MCKINLEY_E9)); +} + +void ia64_patch_phys_stack_reg(unsigned long val) +{ + s32 * offp = (s32 *) __start___phys_stack_reg_patchlist; + s32 * end = (s32 *) __end___phys_stack_reg_patchlist; + u64 ip, mask, imm; - patch_fsyscall_table(START(fsyscall), END(fsyscall)); - patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down)); - ia64_patch_vtop(START(vtop), END(vtop)); - ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9)); + /* see instruction format A4: adds r1 = imm13, r3 */ + mask = (0x3fUL << 27) | (0x7f << 13); + imm = (((val >> 7) & 0x3f) << 27) | (val & 0x7f) << 13; + + while (offp < end) { + ip = (u64) offp + *offp; + ia64_patch(ip, mask, imm); + ia64_fc((void *)ip); + ++offp; + } + ia64_sync_i(); + ia64_srlz_i(); } diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c new file mode 100644 index 00000000000..992c1098c52 --- /dev/null +++ b/arch/ia64/kernel/pci-dma.c @@ -0,0 +1,110 @@ +/* + * Dynamic DMA mapping support. + */ + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/module.h> +#include <linux/dmar.h> +#include <asm/iommu.h> +#include <asm/machvec.h> +#include <linux/dma-mapping.h> + + +#ifdef CONFIG_INTEL_IOMMU + +#include <linux/kernel.h> + +#include <asm/page.h> + +dma_addr_t bad_dma_address __read_mostly; +EXPORT_SYMBOL(bad_dma_address); + +static int iommu_sac_force __read_mostly; + +int no_iommu __read_mostly; +#ifdef CONFIG_IOMMU_DEBUG +int force_iommu __read_mostly = 1; +#else +int force_iommu __read_mostly; +#endif + +int iommu_pass_through; + +extern struct dma_map_ops intel_dma_ops; + +static int __init pci_iommu_init(void) +{ + if (iommu_detected) + intel_iommu_init(); + + return 0; +} + +/* Must execute after PCI subsystem */ +fs_initcall(pci_iommu_init); + +void pci_iommu_shutdown(void) +{ + return; +} + +void __init +iommu_dma_init(void) +{ + return; +} + +int iommu_dma_supported(struct device *dev, u64 mask) +{ + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. + The caller just has to use GFP_DMA in this case. */ + if (mask < DMA_BIT_MASK(24)) + return 0; + + /* Tell the device to use SAC when IOMMU force is on. This + allows the driver to use cheaper accesses in some cases. + + Problem with this is that if we overflow the IOMMU area and + return DAC as fallback address the device may not handle it + correctly. + + As a special case some controllers have a 39bit address + mode that is as efficient as 32bit (aic79xx). Don't force + SAC for these. Assume all masks <= 40 bits are of this + type. Normally this doesn't make any difference, but gives + more gentle handling of IOMMU overflow. */ + if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) { + dev_info(dev, "Force SAC with mask %llx\n", mask); + return 0; + } + + return 1; +} +EXPORT_SYMBOL(iommu_dma_supported); + +void __init pci_iommu_alloc(void) +{ + dma_ops = &intel_dma_ops; + + dma_ops->sync_single_for_cpu = machvec_dma_sync_single; + dma_ops->sync_sg_for_cpu = machvec_dma_sync_sg; + dma_ops->sync_single_for_device = machvec_dma_sync_single; + dma_ops->sync_sg_for_device = machvec_dma_sync_sg; + dma_ops->dma_supported = iommu_dma_supported; + + /* + * The order of these functions is important for + * fall-back/fail-over reasons + */ + detect_intel_iommu(); + +#ifdef CONFIG_SWIOTLB + pci_swiotlb_init(); +#endif +} + +#endif diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c new file mode 100644 index 00000000000..939260aeac9 --- /dev/null +++ b/arch/ia64/kernel/pci-swiotlb.c @@ -0,0 +1,67 @@ +/* Glue code to lib/swiotlb.c */ + +#include <linux/pci.h> +#include <linux/gfp.h> +#include <linux/cache.h> +#include <linux/module.h> +#include <linux/dma-mapping.h> + +#include <asm/swiotlb.h> +#include <asm/dma.h> +#include <asm/iommu.h> +#include <asm/machvec.h> + +int swiotlb __read_mostly; +EXPORT_SYMBOL(swiotlb); + +static void *ia64_swiotlb_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + struct dma_attrs *attrs) +{ + if (dev->coherent_dma_mask != DMA_BIT_MASK(64)) + gfp |= GFP_DMA; + return swiotlb_alloc_coherent(dev, size, dma_handle, gfp); +} + +static void ia64_swiotlb_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + struct dma_attrs *attrs) +{ + swiotlb_free_coherent(dev, size, vaddr, dma_addr); +} + +struct dma_map_ops swiotlb_dma_ops = { + .alloc = ia64_swiotlb_alloc_coherent, + .free = ia64_swiotlb_free_coherent, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .dma_supported = swiotlb_dma_supported, + .mapping_error = swiotlb_dma_mapping_error, +}; + +void __init swiotlb_dma_init(void) +{ + dma_ops = &swiotlb_dma_ops; + swiotlb_init(1); +} + +void __init pci_swiotlb_init(void) +{ + if (!iommu_detected) { +#ifdef CONFIG_IA64_GENERIC + swiotlb = 1; + printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n"); + machvec_init("dig"); + swiotlb_init(1); + dma_ops = &swiotlb_dma_ops; +#else + panic("Unable to find Intel IOMMU"); +#endif + } +} diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index f7dfc107cb7..5845ffea67c 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -19,12 +19,10 @@ * http://www.hpl.hp.com/research/linux/perfmon */ -#include <linux/config.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/interrupt.h> -#include <linux/smp_lock.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> @@ -35,10 +33,16 @@ #include <linux/file.h> #include <linux/poll.h> #include <linux/vfs.h> +#include <linux/smp.h> #include <linux/pagemap.h> #include <linux/mount.h> #include <linux/bitops.h> +#include <linux/capability.h> #include <linux/rcupdate.h> +#include <linux/completion.h> +#include <linux/tracehook.h> +#include <linux/slab.h> +#include <linux/cpu.h> #include <asm/errno.h> #include <asm/intrinsics.h> @@ -46,7 +50,6 @@ #include <asm/perfmon.h> #include <asm/processor.h> #include <asm/signal.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/delay.h> @@ -61,6 +64,9 @@ #define PFM_INVALID_ACTIVATION (~0UL) +#define PFM_NUM_PMC_REGS 64 /* PMC save area for ctxsw */ +#define PFM_NUM_PMD_REGS 64 /* PMD save area for ctxsw */ + /* * depth of message queue */ @@ -142,7 +148,7 @@ * in UP: * - we need to protect against PMU overflow interrupts (local_irq_disable) * - * spin_lock_irqsave()/spin_lock_irqrestore(): + * spin_lock_irqsave()/spin_unlock_irqrestore(): * in SMP: local_irq_disable + spin_lock * in UP : local_irq_disable * @@ -154,14 +160,14 @@ */ #define PROTECT_CTX(c, f) \ do { \ - DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, current->pid)); \ + DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, task_pid_nr(current))); \ spin_lock_irqsave(&(c)->ctx_lock, f); \ - DPRINT(("spinlocked ctx %p by [%d]\n", c, current->pid)); \ + DPRINT(("spinlocked ctx %p by [%d]\n", c, task_pid_nr(current))); \ } while(0) #define UNPROTECT_CTX(c, f) \ do { \ - DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, current->pid)); \ + DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, task_pid_nr(current))); \ spin_unlock_irqrestore(&(c)->ctx_lock, f); \ } while(0) @@ -223,12 +229,12 @@ #ifdef PFM_DEBUGGING #define DPRINT(a) \ do { \ - if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ + if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __func__, __LINE__, smp_processor_id(), task_pid_nr(current)); printk a; } \ } while (0) #define DPRINT_ovfl(a) \ do { \ - if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ + if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __func__, __LINE__, smp_processor_id(), task_pid_nr(current)); printk a; } \ } while (0) #endif @@ -285,7 +291,7 @@ typedef struct pfm_context { unsigned long ctx_ovfl_regs[4]; /* which registers overflowed (notification) */ - struct semaphore ctx_restart_sem; /* use for blocking notification mode */ + struct completion ctx_restart_done; /* use for blocking notification mode */ unsigned long ctx_used_pmds[4]; /* bitmask of PMD used */ unsigned long ctx_all_pmds[4]; /* bitmask of all accessible PMDs */ @@ -295,16 +301,19 @@ typedef struct pfm_context { unsigned long ctx_reload_pmcs[4]; /* bitmask of force reload PMC on ctxsw in */ unsigned long ctx_used_monitors[4]; /* bitmask of monitor PMC being used */ - unsigned long ctx_pmcs[IA64_NUM_PMC_REGS]; /* saved copies of PMC values */ + unsigned long ctx_pmcs[PFM_NUM_PMC_REGS]; /* saved copies of PMC values */ unsigned int ctx_used_ibrs[1]; /* bitmask of used IBR (speedup ctxsw in) */ unsigned int ctx_used_dbrs[1]; /* bitmask of used DBR (speedup ctxsw in) */ unsigned long ctx_dbrs[IA64_NUM_DBG_REGS]; /* DBR values (cache) when not loaded */ unsigned long ctx_ibrs[IA64_NUM_DBG_REGS]; /* IBR values (cache) when not loaded */ - pfm_counter_t ctx_pmds[IA64_NUM_PMD_REGS]; /* software state for PMDS */ + pfm_counter_t ctx_pmds[PFM_NUM_PMD_REGS]; /* software state for PMDS */ + + unsigned long th_pmcs[PFM_NUM_PMC_REGS]; /* PMC thread save state */ + unsigned long th_pmds[PFM_NUM_PMD_REGS]; /* PMD thread save state */ - u64 ctx_saved_psr_up; /* only contains psr.up value */ + unsigned long ctx_saved_psr_up; /* only contains psr.up value */ unsigned long ctx_last_activation; /* context last activation number for last_cpu */ unsigned int ctx_last_cpu; /* CPU id of current or last CPU used (SMP only) */ @@ -512,25 +521,56 @@ static pmu_config_t *pmu_conf; pfm_sysctl_t pfm_sysctl; EXPORT_SYMBOL(pfm_sysctl); -static ctl_table pfm_ctl_table[]={ - {1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,}, - {2, "debug_ovfl", &pfm_sysctl.debug_ovfl, sizeof(int), 0666, NULL, &proc_dointvec, NULL,}, - {3, "fastctxsw", &pfm_sysctl.fastctxsw, sizeof(int), 0600, NULL, &proc_dointvec, NULL,}, - {4, "expert_mode", &pfm_sysctl.expert_mode, sizeof(int), 0600, NULL, &proc_dointvec, NULL,}, - { 0, }, +static struct ctl_table pfm_ctl_table[] = { + { + .procname = "debug", + .data = &pfm_sysctl.debug, + .maxlen = sizeof(int), + .mode = 0666, + .proc_handler = proc_dointvec, + }, + { + .procname = "debug_ovfl", + .data = &pfm_sysctl.debug_ovfl, + .maxlen = sizeof(int), + .mode = 0666, + .proc_handler = proc_dointvec, + }, + { + .procname = "fastctxsw", + .data = &pfm_sysctl.fastctxsw, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec, + }, + { + .procname = "expert_mode", + .data = &pfm_sysctl.expert_mode, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec, + }, + {} }; -static ctl_table pfm_sysctl_dir[] = { - {1, "perfmon", NULL, 0, 0755, pfm_ctl_table, }, - {0,}, +static struct ctl_table pfm_sysctl_dir[] = { + { + .procname = "perfmon", + .mode = 0555, + .child = pfm_ctl_table, + }, + {} }; -static ctl_table pfm_sysctl_root[] = { - {1, "kernel", NULL, 0, 0755, pfm_sysctl_dir, }, - {0,}, +static struct ctl_table pfm_sysctl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = pfm_sysctl_dir, + }, + {} }; static struct ctl_table_header *pfm_sysctl_header; static int pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); -static int pfm_flush(struct file *filp); #define pfm_get_cpu_var(v) __ia64_per_cpu_var(v) #define pfm_get_cpu_data(a,b) per_cpu(a, b) @@ -542,21 +582,6 @@ pfm_put_task(struct task_struct *task) } static inline void -pfm_set_task_notify(struct task_struct *task) -{ - struct thread_info *info; - - info = (struct thread_info *) ((char *) task + IA64_TASK_SIZE); - set_bit(TIF_NOTIFY_RESUME, &info->flags); -} - -static inline void -pfm_clear_task_notify(void) -{ - clear_thread_flag(TIF_NOTIFY_RESUME); -} - -static inline void pfm_reserve_page(unsigned long a) { SetPageReserved(vmalloc_to_page((void *)a)); @@ -580,30 +605,22 @@ pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f) spin_unlock(&(x)->ctx_lock); } -static inline unsigned int -pfm_do_munmap(struct mm_struct *mm, unsigned long addr, size_t len, int acct) -{ - return do_munmap(mm, addr, len); -} - -static inline unsigned long -pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec) -{ - return get_unmapped_area(file, addr, len, pgoff, flags); -} - +/* forward declaration */ +static const struct dentry_operations pfmfs_dentry_operations; -static struct super_block * -pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) +static struct dentry * +pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC); + return mount_pseudo(fs_type, "pfm:", NULL, &pfmfs_dentry_operations, + PFMFS_MAGIC); } static struct file_system_type pfm_fs_type = { .name = "pfmfs", - .get_sb = pfmfs_get_sb, + .mount = pfmfs_mount, .kill_sb = kill_anon_super, }; +MODULE_ALIAS_FS("pfmfs"); DEFINE_PER_CPU(unsigned long, pfm_syst_info); DEFINE_PER_CPU(struct task_struct *, pmu_owner); @@ -613,7 +630,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(pfm_syst_info); /* forward declaration */ -static struct file_operations pfm_file_ops; +static const struct file_operations pfm_file_ops; /* * forward declarations @@ -627,9 +644,11 @@ static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, #include "perfmon_itanium.h" #include "perfmon_mckinley.h" +#include "perfmon_montecito.h" #include "perfmon_generic.h" static pmu_config_t *pmu_confs[]={ + &pmu_conf_mont, &pmu_conf_mck, &pmu_conf_ita, &pmu_conf_gen, /* must be last */ @@ -802,10 +821,9 @@ pfm_rvmalloc(unsigned long size) unsigned long addr; size = PAGE_ALIGN(size); - mem = vmalloc(size); + mem = vzalloc(size); if (mem) { //printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem); - memset(mem, 0, size); addr = (unsigned long)mem; while (size > 0) { pfm_reserve_page(addr); @@ -835,7 +853,7 @@ pfm_rvfree(void *mem, unsigned long size) } static pfm_context_t * -pfm_context_alloc(void) +pfm_context_alloc(int ctx_flags) { pfm_context_t *ctx; @@ -843,10 +861,49 @@ pfm_context_alloc(void) * allocate context descriptor * must be able to free with interrupts disabled */ - ctx = kmalloc(sizeof(pfm_context_t), GFP_KERNEL); + ctx = kzalloc(sizeof(pfm_context_t), GFP_KERNEL); if (ctx) { - memset(ctx, 0, sizeof(pfm_context_t)); DPRINT(("alloc ctx @%p\n", ctx)); + + /* + * init context protection lock + */ + spin_lock_init(&ctx->ctx_lock); + + /* + * context is unloaded + */ + ctx->ctx_state = PFM_CTX_UNLOADED; + + /* + * initialization of context's flags + */ + ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; + ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; + ctx->ctx_fl_no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; + /* + * will move to set properties + * ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0; + */ + + /* + * init restart semaphore to locked + */ + init_completion(&ctx->ctx_restart_done); + + /* + * activation is used in SMP only + */ + ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; + SET_LAST_CPU(ctx, -1); + + /* + * initialize notification message queue + */ + ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0; + init_waitqueue_head(&ctx->ctx_msgq_wait); + init_waitqueue_head(&ctx->ctx_zombieq); + } return ctx; } @@ -864,11 +921,10 @@ static void pfm_mask_monitoring(struct task_struct *task) { pfm_context_t *ctx = PFM_GET_CTX(task); - struct thread_struct *th = &task->thread; unsigned long mask, val, ovfl_mask; int i; - DPRINT_ovfl(("masking monitoring for [%d]\n", task->pid)); + DPRINT_ovfl(("masking monitoring for [%d]\n", task_pid_nr(task))); ovfl_mask = pmu_conf->ovfl_val; /* @@ -885,7 +941,7 @@ pfm_mask_monitoring(struct task_struct *task) * So in both cases, the live register contains the owner's * state. We can ONLY touch the PMU registers and NOT the PSR. * - * As a consequence to this call, the thread->pmds[] array + * As a consequence to this call, the ctx->th_pmds[] array * contains stale information which must be ignored * when context is reloaded AND monitoring is active (see * pfm_restart). @@ -920,9 +976,9 @@ pfm_mask_monitoring(struct task_struct *task) mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER; for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) { if ((mask & 0x1) == 0UL) continue; - ia64_set_pmc(i, th->pmcs[i] & ~0xfUL); - th->pmcs[i] &= ~0xfUL; - DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, th->pmcs[i])); + ia64_set_pmc(i, ctx->th_pmcs[i] & ~0xfUL); + ctx->th_pmcs[i] &= ~0xfUL; + DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, ctx->th_pmcs[i])); } /* * make all of this visible @@ -939,7 +995,6 @@ static void pfm_restore_monitoring(struct task_struct *task) { pfm_context_t *ctx = PFM_GET_CTX(task); - struct thread_struct *th = &task->thread; unsigned long mask, ovfl_mask; unsigned long psr, val; int i, is_system; @@ -948,12 +1003,12 @@ pfm_restore_monitoring(struct task_struct *task) ovfl_mask = pmu_conf->ovfl_val; if (task != current) { - printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid); + printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task_pid_nr(task), task_pid_nr(current)); return; } if (ctx->ctx_state != PFM_CTX_MASKED) { printk(KERN_ERR "perfmon.%d: task[%d] current[%d] invalid state=%d\n", __LINE__, - task->pid, current->pid, ctx->ctx_state); + task_pid_nr(task), task_pid_nr(current), ctx->ctx_state); return; } psr = pfm_get_psr(); @@ -1005,9 +1060,10 @@ pfm_restore_monitoring(struct task_struct *task) mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER; for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) { if ((mask & 0x1) == 0UL) continue; - th->pmcs[i] = ctx->ctx_pmcs[i]; - ia64_set_pmc(i, th->pmcs[i]); - DPRINT(("[%d] pmc[%d]=0x%lx\n", task->pid, i, th->pmcs[i])); + ctx->th_pmcs[i] = ctx->ctx_pmcs[i]; + ia64_set_pmc(i, ctx->th_pmcs[i]); + DPRINT(("[%d] pmc[%d]=0x%lx\n", + task_pid_nr(task), i, ctx->th_pmcs[i])); } ia64_srlz_d(); @@ -1066,7 +1122,6 @@ pfm_restore_pmds(unsigned long *pmds, unsigned long mask) static inline void pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx) { - struct thread_struct *thread = &task->thread; unsigned long ovfl_val = pmu_conf->ovfl_val; unsigned long mask = ctx->ctx_all_pmds[0]; unsigned long val; @@ -1088,11 +1143,11 @@ pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx) ctx->ctx_pmds[i].val = val & ~ovfl_val; val &= ovfl_val; } - thread->pmds[i] = val; + ctx->th_pmds[i] = val; DPRINT(("pmd[%d]=0x%lx soft_val=0x%lx\n", i, - thread->pmds[i], + ctx->th_pmds[i], ctx->ctx_pmds[i].val)); } } @@ -1103,7 +1158,6 @@ pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx) static inline void pfm_copy_pmcs(struct task_struct *task, pfm_context_t *ctx) { - struct thread_struct *thread = &task->thread; unsigned long mask = ctx->ctx_all_pmcs[0]; int i; @@ -1111,8 +1165,8 @@ pfm_copy_pmcs(struct task_struct *task, pfm_context_t *ctx) for (i=0; mask; i++, mask>>=1) { /* masking 0 with ovfl_val yields 0 */ - thread->pmcs[i] = ctx->ctx_pmcs[i]; - DPRINT(("pmc[%d]=0x%lx\n", i, thread->pmcs[i])); + ctx->th_pmcs[i] = ctx->ctx_pmcs[i]; + DPRINT(("pmc[%d]=0x%lx\n", i, ctx->th_pmcs[i])); } } @@ -1269,14 +1323,12 @@ out: } EXPORT_SYMBOL(pfm_unregister_buffer_fmt); -extern void update_pal_halt_status(int); - static int pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu) { unsigned long flags; /* - * validy checks on cpu_mask have been done upstream + * validity checks on cpu_mask have been done upstream */ LOCK_PFS(flags); @@ -1318,9 +1370,9 @@ pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu) cpu)); /* - * disable default_idle() to go to PAL_HALT + * Force idle() into poll mode */ - update_pal_halt_status(0); + cpu_idle_poll_ctrl(true); UNLOCK_PFS(flags); @@ -1328,7 +1380,7 @@ pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu) error_conflict: DPRINT(("system wide not possible, conflicting session [%d] on CPU%d\n", - pfm_sessions.pfs_sys_session[cpu]->pid, + task_pid_nr(pfm_sessions.pfs_sys_session[cpu]), cpu)); abort: UNLOCK_PFS(flags); @@ -1342,7 +1394,7 @@ pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu) { unsigned long flags; /* - * validy checks on cpu_mask have been done upstream + * validity checks on cpu_mask have been done upstream */ LOCK_PFS(flags); @@ -1377,11 +1429,8 @@ pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu) is_syswide, cpu)); - /* - * if possible, enable default_idle() to go into PAL_HALT - */ - if (pfm_sessions.pfs_task_sessions == 0 && pfm_sessions.pfs_sys_sessions == 0) - update_pal_halt_status(1); + /* Undo forced polling. Last session reenables pal_halt */ + cpu_idle_poll_ctrl(false); UNLOCK_PFS(flags); @@ -1394,13 +1443,14 @@ pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu) * a PROTECT_CTX() section. */ static int -pfm_remove_smpl_mapping(struct task_struct *task, void *vaddr, unsigned long size) +pfm_remove_smpl_mapping(void *vaddr, unsigned long size) { + struct task_struct *task = current; int r; /* sanity checks */ if (task->mm == NULL || size == 0UL || vaddr == NULL) { - printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task->pid, task->mm); + printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task_pid_nr(task), task->mm); return -EINVAL; } @@ -1409,15 +1459,10 @@ pfm_remove_smpl_mapping(struct task_struct *task, void *vaddr, unsigned long siz /* * does the actual unmapping */ - down_write(&task->mm->mmap_sem); - - DPRINT(("down_write done smpl_vaddr=%p size=%lu\n", vaddr, size)); - - r = pfm_do_munmap(task->mm, (unsigned long)vaddr, size, 0); + r = vm_munmap((unsigned long)vaddr, size); - up_write(&task->mm->mmap_sem); if (r !=0) { - printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task->pid, vaddr, size); + printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task_pid_nr(task), vaddr, size); } DPRINT(("do_unmap(%p, %lu)=%d\n", vaddr, size, r)); @@ -1459,7 +1504,7 @@ pfm_free_smpl_buffer(pfm_context_t *ctx) return 0; invalid_free: - printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", current->pid); + printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", task_pid_nr(current)); return -EINVAL; } #endif @@ -1479,7 +1524,7 @@ pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt) * any operations on the root directory. However, we need a non-trivial * d_name - pfm: will go nicely and kill the special-casing in procfs. */ -static struct vfsmount *pfmfs_mnt; +static struct vfsmount *pfmfs_mnt __read_mostly; static int __init init_pfm_fs(void) @@ -1496,13 +1541,6 @@ init_pfm_fs(void) return err; } -static void __exit -exit_pfm_fs(void) -{ - unregister_filesystem(&pfm_fs_type); - mntput(pfmfs_mnt); -} - static ssize_t pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos) { @@ -1512,13 +1550,13 @@ pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos) unsigned long flags; DECLARE_WAITQUEUE(wait, current); if (PFM_IS_FILE(filp) == 0) { - printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", task_pid_nr(current)); return -EINVAL; } - ctx = (pfm_context_t *)filp->private_data; + ctx = filp->private_data; if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", task_pid_nr(current)); return -EINVAL; } @@ -1572,7 +1610,7 @@ pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos) PROTECT_CTX(ctx, flags); } - DPRINT(("[%d] back to running ret=%ld\n", current->pid, ret)); + DPRINT(("[%d] back to running ret=%ld\n", task_pid_nr(current), ret)); set_current_state(TASK_RUNNING); remove_wait_queue(&ctx->ctx_msgq_wait, &wait); @@ -1581,7 +1619,7 @@ pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos) ret = -EINVAL; msg = pfm_get_next_msg(ctx); if (msg == NULL) { - printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, current->pid); + printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, task_pid_nr(current)); goto abort_locked; } @@ -1612,13 +1650,13 @@ pfm_poll(struct file *filp, poll_table * wait) unsigned int mask = 0; if (PFM_IS_FILE(filp) == 0) { - printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", task_pid_nr(current)); return 0; } - ctx = (pfm_context_t *)filp->private_data; + ctx = filp->private_data; if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", task_pid_nr(current)); return 0; } @@ -1639,8 +1677,8 @@ pfm_poll(struct file *filp, poll_table * wait) return mask; } -static int -pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) +static long +pfm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { DPRINT(("pfm_ioctl called\n")); return -EINVAL; @@ -1657,7 +1695,7 @@ pfm_do_fasync(int fd, struct file *filp, pfm_context_t *ctx, int on) ret = fasync_helper (fd, filp, on, &ctx->ctx_async_queue); DPRINT(("pfm_fasync called by [%d] on ctx_fd=%d on=%d async_queue=%p ret=%d\n", - current->pid, + task_pid_nr(current), fd, on, ctx->ctx_async_queue, ret)); @@ -1672,13 +1710,13 @@ pfm_fasync(int fd, struct file *filp, int on) int ret; if (PFM_IS_FILE(filp) == 0) { - printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", task_pid_nr(current)); return -EBADF; } - ctx = (pfm_context_t *)filp->private_data; + ctx = filp->private_data; if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", task_pid_nr(current)); return -EBADF; } /* @@ -1709,7 +1747,7 @@ static void pfm_syswide_force_stop(void *info) { pfm_context_t *ctx = (pfm_context_t *)info; - struct pt_regs *regs = ia64_task_regs(current); + struct pt_regs *regs = task_pt_regs(current); struct task_struct *owner; unsigned long flags; int ret; @@ -1724,7 +1762,7 @@ pfm_syswide_force_stop(void *info) if (owner != ctx->ctx_task) { printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected owner [%d] instead of [%d]\n", smp_processor_id(), - owner->pid, ctx->ctx_task->pid); + task_pid_nr(owner), task_pid_nr(ctx->ctx_task)); return; } if (GET_PMU_CTX() != ctx) { @@ -1734,7 +1772,7 @@ pfm_syswide_force_stop(void *info) return; } - DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), ctx->ctx_task->pid)); + DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), task_pid_nr(ctx->ctx_task))); /* * the context is already protected in pfm_close(), we simply * need to mask interrupts to avoid a PMU interrupt race on @@ -1759,7 +1797,7 @@ pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx) int ret; DPRINT(("calling CPU%d for cleanup\n", ctx->ctx_cpu)); - ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 0, 1); + ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 1); DPRINT(("called CPU%d for cleanup ret=%d\n", ctx->ctx_cpu, ret)); } #endif /* CONFIG_SMP */ @@ -1769,7 +1807,7 @@ pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx) * When caller is self-monitoring, the context is unloaded. */ static int -pfm_flush(struct file *filp) +pfm_flush(struct file *filp, fl_owner_t id) { pfm_context_t *ctx; struct task_struct *task; @@ -1784,16 +1822,16 @@ pfm_flush(struct file *filp) return -EBADF; } - ctx = (pfm_context_t *)filp->private_data; + ctx = filp->private_data; if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", task_pid_nr(current)); return -EBADF; } /* * remove our file from the async queue, if we use this mode. * This can be done without the context being protected. We come - * here when the context has become unreacheable by other tasks. + * here when the context has become unreachable by other tasks. * * We may still have active monitoring at this point and we may * end up in pfm_overflow_handler(). However, fasync_helper() @@ -1803,18 +1841,13 @@ pfm_flush(struct file *filp) * invoked after, it will find an empty queue and no * signal will be sent. In both case, we are safe */ - if (filp->f_flags & FASYNC) { - DPRINT(("cleaning up async_queue=%p\n", ctx->ctx_async_queue)); - pfm_do_fasync (-1, filp, ctx, 0); - } - PROTECT_CTX(ctx, flags); state = ctx->ctx_state; is_system = ctx->ctx_fl_system; task = PFM_CTX_TASK(ctx); - regs = ia64_task_regs(task); + regs = task_pt_regs(task); DPRINT(("ctx_state=%d is_current=%d\n", state, @@ -1893,7 +1926,7 @@ pfm_flush(struct file *filp) * because some VM function reenables interrupts. * */ - if (smpl_buf_vaddr) pfm_remove_smpl_mapping(current, smpl_buf_vaddr, smpl_buf_size); + if (smpl_buf_vaddr) pfm_remove_smpl_mapping(smpl_buf_vaddr, smpl_buf_size); return 0; } @@ -1932,9 +1965,9 @@ pfm_close(struct inode *inode, struct file *filp) return -EBADF; } - ctx = (pfm_context_t *)filp->private_data; + ctx = filp->private_data; if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", current->pid); + printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", task_pid_nr(current)); return -EBADF; } @@ -1944,7 +1977,7 @@ pfm_close(struct inode *inode, struct file *filp) is_system = ctx->ctx_fl_system; task = PFM_CTX_TASK(ctx); - regs = ia64_task_regs(task); + regs = task_pt_regs(task); DPRINT(("ctx_state=%d is_current=%d\n", state, @@ -1988,7 +2021,7 @@ pfm_close(struct inode *inode, struct file *filp) /* * force task to wake up from MASKED state */ - up(&ctx->ctx_restart_sem); + complete(&ctx->ctx_restart_done); DPRINT(("waking up ctx_state=%d\n", state)); @@ -2031,7 +2064,7 @@ pfm_close(struct inode *inode, struct file *filp) */ ctx->ctx_state = PFM_CTX_ZOMBIE; - DPRINT(("zombie ctx for [%d]\n", task->pid)); + DPRINT(("zombie ctx for [%d]\n", task_pid_nr(task))); /* * cannot free the context on the spot. deferred until * the task notices the ZOMBIE state @@ -2090,7 +2123,7 @@ doit: filp->private_data = NULL; /* - * if we free on the spot, the context is now completely unreacheable + * if we free on the spot, the context is now completely unreachable * from the callers side. The monitored task side is also cut, so we * can freely cut. * @@ -2121,116 +2154,73 @@ pfm_no_open(struct inode *irrelevant, struct file *dontcare) -static struct file_operations pfm_file_ops = { - .llseek = no_llseek, - .read = pfm_read, - .write = pfm_write, - .poll = pfm_poll, - .ioctl = pfm_ioctl, - .open = pfm_no_open, /* special open code to disallow open via /proc */ - .fasync = pfm_fasync, - .release = pfm_close, - .flush = pfm_flush +static const struct file_operations pfm_file_ops = { + .llseek = no_llseek, + .read = pfm_read, + .write = pfm_write, + .poll = pfm_poll, + .unlocked_ioctl = pfm_ioctl, + .open = pfm_no_open, /* special open code to disallow open via /proc */ + .fasync = pfm_fasync, + .release = pfm_close, + .flush = pfm_flush }; -static int -pfmfs_delete_dentry(struct dentry *dentry) +static char *pfmfs_dname(struct dentry *dentry, char *buffer, int buflen) { - return 1; + return dynamic_dname(dentry, buffer, buflen, "pfm:[%lu]", + dentry->d_inode->i_ino); } -static struct dentry_operations pfmfs_dentry_operations = { - .d_delete = pfmfs_delete_dentry, +static const struct dentry_operations pfmfs_dentry_operations = { + .d_delete = always_delete_dentry, + .d_dname = pfmfs_dname, }; -static int -pfm_alloc_fd(struct file **cfile) +static struct file * +pfm_alloc_file(pfm_context_t *ctx) { - int fd, ret = 0; - struct file *file = NULL; - struct inode * inode; - char name[32]; - struct qstr this; - - fd = get_unused_fd(); - if (fd < 0) return -ENFILE; - - ret = -ENFILE; - - file = get_empty_filp(); - if (!file) goto out; + struct file *file; + struct inode *inode; + struct path path; + struct qstr this = { .name = "" }; /* * allocate a new inode */ inode = new_inode(pfmfs_mnt->mnt_sb); - if (!inode) goto out; + if (!inode) + return ERR_PTR(-ENOMEM); DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode)); inode->i_mode = S_IFCHR|S_IRUGO; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; - - sprintf(name, "[%lu]", inode->i_ino); - this.name = name; - this.len = strlen(name); - this.hash = inode->i_ino; - - ret = -ENOMEM; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); /* * allocate a new dcache entry */ - file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); - if (!file->f_dentry) goto out; + path.dentry = d_alloc(pfmfs_mnt->mnt_root, &this); + if (!path.dentry) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + path.mnt = mntget(pfmfs_mnt); - file->f_dentry->d_op = &pfmfs_dentry_operations; + d_add(path.dentry, inode); - d_add(file->f_dentry, inode); - file->f_vfsmnt = mntget(pfmfs_mnt); - file->f_mapping = inode->i_mapping; + file = alloc_file(&path, FMODE_READ, &pfm_file_ops); + if (IS_ERR(file)) { + path_put(&path); + return file; + } - file->f_op = &pfm_file_ops; - file->f_mode = FMODE_READ; file->f_flags = O_RDONLY; - file->f_pos = 0; - - /* - * may have to delay until context is attached? - */ - fd_install(fd, file); + file->private_data = ctx; - /* - * the file structure we will use - */ - *cfile = file; - - return fd; -out: - if (file) put_filp(file); - put_unused_fd(fd); - return ret; -} - -static void -pfm_free_fd(int fd, struct file *file) -{ - struct files_struct *files = current->files; - struct fdtable *fdt; - - /* - * there ie no fd_uninstall(), so we do it here - */ - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - rcu_assign_pointer(fdt->fd[fd], NULL); - spin_unlock(&files->file_lock); - - if (file) - put_filp(file); - put_unused_fd(fd); + return file; } static int @@ -2256,7 +2246,7 @@ pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long ad * allocate a sampling buffer and remaps it into the user address space of the task */ static int -pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned long rsize, void **user_vaddr) +pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t *ctx, unsigned long rsize, void **user_vaddr) { struct mm_struct *mm = task->mm; struct vm_area_struct *vma = NULL; @@ -2279,7 +2269,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur) * return -ENOMEM; */ - if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) + if (size > task_rlimit(task, RLIMIT_MEMLOCK)) return -ENOMEM; /* @@ -2296,18 +2286,19 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon DPRINT(("smpl_buf @%p\n", smpl_buf)); /* allocate vma */ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) { DPRINT(("Cannot allocate vma\n")); goto error_kmem; } - memset(vma, 0, sizeof(*vma)); + INIT_LIST_HEAD(&vma->anon_vma_chain); /* * partially initialize the vma for the sampling buffer */ vma->vm_mm = mm; - vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED; + vma->vm_file = get_file(filp); + vma->vm_flags = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP; vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ /* @@ -2327,8 +2318,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon down_write(&task->mm->mmap_sem); /* find some free area in address space, must have mmap sem held */ - vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0); - if (vma->vm_start == 0UL) { + vma->vm_start = get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS); + if (IS_ERR_VALUE(vma->vm_start)) { DPRINT(("Cannot find unmapped area for size %ld\n", size)); up_write(&task->mm->mmap_sem); goto error; @@ -2351,7 +2342,6 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon */ insert_vm_struct(mm, vma); - mm->total_vm += size >> PAGE_SHIFT; vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, vma_pages(vma)); up_write(&task->mm->mmap_sem); @@ -2378,22 +2368,33 @@ error_kmem: static int pfm_bad_permissions(struct task_struct *task) { + const struct cred *tcred; + kuid_t uid = current_uid(); + kgid_t gid = current_gid(); + int ret; + + rcu_read_lock(); + tcred = __task_cred(task); + /* inspired by ptrace_attach() */ DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n", - current->uid, - current->gid, - task->euid, - task->suid, - task->uid, - task->egid, - task->sgid)); - - return ((current->uid != task->euid) - || (current->uid != task->suid) - || (current->uid != task->uid) - || (current->gid != task->egid) - || (current->gid != task->sgid) - || (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE); + from_kuid(&init_user_ns, uid), + from_kgid(&init_user_ns, gid), + from_kuid(&init_user_ns, tcred->euid), + from_kuid(&init_user_ns, tcred->suid), + from_kuid(&init_user_ns, tcred->uid), + from_kgid(&init_user_ns, tcred->egid), + from_kgid(&init_user_ns, tcred->sgid))); + + ret = ((!uid_eq(uid, tcred->euid)) + || (!uid_eq(uid, tcred->suid)) + || (!uid_eq(uid, tcred->uid)) + || (!gid_eq(gid, tcred->egid)) + || (!gid_eq(gid, tcred->sgid)) + || (!gid_eq(gid, tcred->gid))) && !capable(CAP_SYS_PTRACE); + + rcu_read_unlock(); + return ret; } static int @@ -2422,7 +2423,7 @@ pfarg_is_sane(struct task_struct *task, pfarg_context_t *pfx) } static int -pfm_setup_buffer_fmt(struct task_struct *task, pfm_context_t *ctx, unsigned int ctx_flags, +pfm_setup_buffer_fmt(struct task_struct *task, struct file *filp, pfm_context_t *ctx, unsigned int ctx_flags, unsigned int cpu, pfarg_context_t *arg) { pfm_buffer_fmt_t *fmt = NULL; @@ -2435,7 +2436,7 @@ pfm_setup_buffer_fmt(struct task_struct *task, pfm_context_t *ctx, unsigned int /* invoke and lock buffer format, if found */ fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id); if (fmt == NULL) { - DPRINT(("[%d] cannot find buffer format\n", task->pid)); + DPRINT(("[%d] cannot find buffer format\n", task_pid_nr(task))); return -EINVAL; } @@ -2446,12 +2447,13 @@ pfm_setup_buffer_fmt(struct task_struct *task, pfm_context_t *ctx, unsigned int ret = pfm_buf_fmt_validate(fmt, task, ctx_flags, cpu, fmt_arg); - DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task->pid, ctx_flags, cpu, fmt_arg, ret)); + DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task_pid_nr(task), ctx_flags, cpu, fmt_arg, ret)); if (ret) goto error; /* link buffer format and context */ ctx->ctx_buf_fmt = fmt; + ctx->ctx_fl_is_sampling = 1; /* assume record() is defined */ /* * check if buffer format wants to use perfmon buffer allocation/mapping service @@ -2463,7 +2465,7 @@ pfm_setup_buffer_fmt(struct task_struct *task, pfm_context_t *ctx, unsigned int /* * buffer is always remapped into the caller's address space */ - ret = pfm_smpl_buffer_alloc(current, ctx, size, &uaddr); + ret = pfm_smpl_buffer_alloc(current, filp, ctx, size, &uaddr); if (ret) goto error; /* keep track of user address of buffer */ @@ -2518,7 +2520,7 @@ pfm_reset_pmu_state(pfm_context_t *ctx) ctx->ctx_all_pmcs[0] = pmu_conf->impl_pmcs[0] & ~0x1; /* - * bitmask of all PMDs that are accesible to this context + * bitmask of all PMDs that are accessible to this context */ ctx->ctx_all_pmds[0] = pmu_conf->impl_pmds[0]; @@ -2568,23 +2570,23 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task) * no kernel task or task not owner by caller */ if (task->mm == NULL) { - DPRINT(("task [%d] has not memory context (kernel thread)\n", task->pid)); + DPRINT(("task [%d] has not memory context (kernel thread)\n", task_pid_nr(task))); return -EPERM; } if (pfm_bad_permissions(task)) { - DPRINT(("no permission to attach to [%d]\n", task->pid)); + DPRINT(("no permission to attach to [%d]\n", task_pid_nr(task))); return -EPERM; } /* * cannot block in self-monitoring mode */ if (CTX_OVFL_NOBLOCK(ctx) == 0 && task == current) { - DPRINT(("cannot load a blocking context on self for [%d]\n", task->pid)); + DPRINT(("cannot load a blocking context on self for [%d]\n", task_pid_nr(task))); return -EINVAL; } if (task->exit_state == EXIT_ZOMBIE) { - DPRINT(("cannot attach to zombie task [%d]\n", task->pid)); + DPRINT(("cannot attach to zombie task [%d]\n", task_pid_nr(task))); return -EBUSY; } @@ -2593,14 +2595,14 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task) */ if (task == current) return 0; - if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { - DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state)); + if (!task_is_stopped_or_traced(task)) { + DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task_pid_nr(task), task->state)); return -EBUSY; } /* * make sure the task is off any CPU */ - wait_task_inactive(task); + wait_task_inactive(task, 0); /* more to come... */ @@ -2616,11 +2618,11 @@ pfm_get_task(pfm_context_t *ctx, pid_t pid, struct task_struct **task) /* XXX: need to add more checks here */ if (pid < 2) return -EPERM; - if (pid != current->pid) { + if (pid != task_pid_vnr(current)) { read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = find_task_by_vpid(pid); /* make sure task cannot go away while we operate on it */ if (p) get_task_struct(p); @@ -2646,79 +2648,46 @@ pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg { pfarg_context_t *req = (pfarg_context_t *)arg; struct file *filp; + struct path path; int ctx_flags; + int fd; int ret; /* let's check the arguments first */ ret = pfarg_is_sane(current, req); - if (ret < 0) return ret; + if (ret < 0) + return ret; ctx_flags = req->ctx_flags; ret = -ENOMEM; - ctx = pfm_context_alloc(); - if (!ctx) goto error; + fd = get_unused_fd(); + if (fd < 0) + return fd; - ret = pfm_alloc_fd(&filp); - if (ret < 0) goto error_file; + ctx = pfm_context_alloc(ctx_flags); + if (!ctx) + goto error; - req->ctx_fd = ctx->ctx_fd = ret; + filp = pfm_alloc_file(ctx); + if (IS_ERR(filp)) { + ret = PTR_ERR(filp); + goto error_file; + } - /* - * attach context to file - */ - filp->private_data = ctx; + req->ctx_fd = ctx->ctx_fd = fd; /* * does the user want to sample? */ if (pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) { - ret = pfm_setup_buffer_fmt(current, ctx, ctx_flags, 0, req); - if (ret) goto buffer_error; + ret = pfm_setup_buffer_fmt(current, filp, ctx, ctx_flags, 0, req); + if (ret) + goto buffer_error; } - /* - * init context protection lock - */ - spin_lock_init(&ctx->ctx_lock); - - /* - * context is unloaded - */ - ctx->ctx_state = PFM_CTX_UNLOADED; - - /* - * initialization of context's flags - */ - ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; - ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; - ctx->ctx_fl_is_sampling = ctx->ctx_buf_fmt ? 1 : 0; /* assume record() is defined */ - ctx->ctx_fl_no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; - /* - * will move to set properties - * ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0; - */ - - /* - * init restart semaphore to locked - */ - sema_init(&ctx->ctx_restart_sem, 0); - - /* - * activation is used in SMP only - */ - ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; - SET_LAST_CPU(ctx, -1); - - /* - * initialize notification message queue - */ - ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0; - init_waitqueue_head(&ctx->ctx_msgq_wait); - init_waitqueue_head(&ctx->ctx_zombieq); - - DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n", + DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d\n", ctx, ctx_flags, ctx->ctx_fl_system, @@ -2732,10 +2701,14 @@ pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg */ pfm_reset_pmu_state(ctx); + fd_install(fd, filp); + return 0; buffer_error: - pfm_free_fd(ctx->ctx_fd, filp); + path = filp->f_path; + put_filp(filp); + path_put(&path); if (ctx->ctx_buf_fmt) { pfm_buf_fmt_exit(ctx->ctx_buf_fmt, current, NULL, regs); @@ -2744,6 +2717,7 @@ error_file: pfm_context_free(ctx); error: + put_unused_fd(fd); return ret; } @@ -2856,7 +2830,6 @@ pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset) static int pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) { - struct thread_struct *thread = NULL; struct task_struct *task; pfarg_reg_t *req = (pfarg_reg_t *)arg; unsigned long value, pmc_pm; @@ -2877,7 +2850,6 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) if (state == PFM_CTX_ZOMBIE) return -EINVAL; if (is_loaded) { - thread = &task->thread; /* * In system wide and when the context is loaded, access can only happen * when the caller is running on the CPU being monitored by the session. @@ -3032,7 +3004,7 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) * * The value in ctx_pmcs[] can only be changed in pfm_write_pmcs(). * - * The value in thread->pmcs[] may be modified on overflow, i.e., when + * The value in th_pmcs[] may be modified on overflow, i.e., when * monitoring needs to be stopped. */ if (is_monitor) CTX_USED_MONITOR(ctx, 1UL << cnum); @@ -3046,7 +3018,7 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) /* * write thread state */ - if (is_system == 0) thread->pmcs[cnum] = value; + if (is_system == 0) ctx->th_pmcs[cnum] = value; /* * write hardware register if we can @@ -3098,7 +3070,6 @@ error: static int pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) { - struct thread_struct *thread = NULL; struct task_struct *task; pfarg_reg_t *req = (pfarg_reg_t *)arg; unsigned long value, hw_value, ovfl_mask; @@ -3122,7 +3093,6 @@ pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) * the owner of the local PMU. */ if (likely(is_loaded)) { - thread = &task->thread; /* * In system wide and when the context is loaded, access can only happen * when the caller is running on the CPU being monitored by the session. @@ -3230,7 +3200,7 @@ pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) /* * write thread state */ - if (is_system == 0) thread->pmds[cnum] = hw_value; + if (is_system == 0) ctx->th_pmds[cnum] = hw_value; /* * write hardware register if we can @@ -3296,7 +3266,6 @@ abort_mission: static int pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) { - struct thread_struct *thread = NULL; struct task_struct *task; unsigned long val = 0UL, lval, ovfl_mask, sval; pfarg_reg_t *req = (pfarg_reg_t *)arg; @@ -3320,7 +3289,6 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) if (state == PFM_CTX_ZOMBIE) return -EINVAL; if (likely(is_loaded)) { - thread = &task->thread; /* * In system wide and when the context is loaded, access can only happen * when the caller is running on the CPU being monitored by the session. @@ -3357,7 +3325,7 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) if (unlikely(!PMD_IS_IMPL(cnum))) goto error; /* * we can only read the register that we use. That includes - * the one we explicitely initialize AND the one we want included + * the one we explicitly initialize AND the one we want included * in the sampling buffer (smpl_regs). * * Having this restriction allows optimization in the ctxsw routine @@ -3382,7 +3350,7 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) * if context is zombie, then task does not exist anymore. * In this case, we use the full value saved in the context (pfm_flush_regs()). */ - val = is_loaded ? thread->pmds[cnum] : 0UL; + val = is_loaded ? ctx->th_pmds[cnum] : 0UL; } rd_func = pmu_conf->pmd_desc[cnum].read_check; @@ -3481,7 +3449,7 @@ pfm_use_debug_registers(struct task_struct *task) if (pmu_conf->use_rr_dbregs == 0) return 0; - DPRINT(("called for [%d]\n", task->pid)); + DPRINT(("called for [%d]\n", task_pid_nr(task))); /* * do it only once @@ -3512,7 +3480,7 @@ pfm_use_debug_registers(struct task_struct *task) DPRINT(("ptrace_use_dbregs=%u sys_use_dbregs=%u by [%d] ret = %d\n", pfm_sessions.pfs_ptrace_use_dbregs, pfm_sessions.pfs_sys_use_dbregs, - task->pid, ret)); + task_pid_nr(task), ret)); UNLOCK_PFS(flags); @@ -3524,7 +3492,7 @@ pfm_use_debug_registers(struct task_struct *task) * IA64_THREAD_DBG_VALID set. This indicates a task which was * able to use the debug registers for debugging purposes via * ptrace(). Therefore we know it was not using them for - * perfmormance monitoring, so we only decrement the number + * performance monitoring, so we only decrement the number * of "ptraced" debug register users to keep the count up to date */ int @@ -3537,7 +3505,7 @@ pfm_release_debug_registers(struct task_struct *task) LOCK_PFS(flags); if (pfm_sessions.pfs_ptrace_use_dbregs == 0) { - printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task->pid); + printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task_pid_nr(task)); ret = -1; } else { pfm_sessions.pfs_ptrace_use_dbregs--; @@ -3589,7 +3557,7 @@ pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) /* sanity check */ if (unlikely(task == NULL)) { - printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", current->pid); + printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", task_pid_nr(current)); return -EINVAL; } @@ -3598,7 +3566,7 @@ pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) fmt = ctx->ctx_buf_fmt; DPRINT(("restarting self %d ovfl=0x%lx\n", - task->pid, + task_pid_nr(task), ctx->ctx_ovfl_regs[0])); if (CTX_HAS_SMPL(ctx)) { @@ -3622,11 +3590,11 @@ pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET); if (rst_ctrl.bits.mask_monitoring == 0) { - DPRINT(("resuming monitoring for [%d]\n", task->pid)); + DPRINT(("resuming monitoring for [%d]\n", task_pid_nr(task))); if (state == PFM_CTX_MASKED) pfm_restore_monitoring(task); } else { - DPRINT(("keeping monitoring stopped for [%d]\n", task->pid)); + DPRINT(("keeping monitoring stopped for [%d]\n", task_pid_nr(task))); // cannot use pfm_stop_monitoring(task, regs); } @@ -3677,22 +3645,22 @@ pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) * if non-blocking, then we ensure that the task will go into * pfm_handle_work() before returning to user mode. * - * We cannot explicitely reset another task, it MUST always + * We cannot explicitly reset another task, it MUST always * be done by the task itself. This works for system wide because * the tool that is controlling the session is logically doing * "self-monitoring". */ if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) { - DPRINT(("unblocking [%d] \n", task->pid)); - up(&ctx->ctx_restart_sem); + DPRINT(("unblocking [%d]\n", task_pid_nr(task))); + complete(&ctx->ctx_restart_done); } else { - DPRINT(("[%d] armed exit trap\n", task->pid)); + DPRINT(("[%d] armed exit trap\n", task_pid_nr(task))); ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_RESET; PFM_SET_WORK_PENDING(task, 1); - pfm_set_task_notify(task); + set_notify_resume(task); /* * XXX: send reschedule if task runs on another CPU @@ -3774,7 +3742,7 @@ pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_ * don't bother if we are loaded and task is being debugged */ if (is_loaded && (thread->flags & IA64_THREAD_DBG_VALID) != 0) { - DPRINT(("debug registers already in use for [%d]\n", task->pid)); + DPRINT(("debug registers already in use for [%d]\n", task_pid_nr(task))); return -EBUSY; } @@ -3815,7 +3783,7 @@ pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_ * is shared by all processes running on it */ if (first_time && can_access_pmu) { - DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid)); + DPRINT(("[%d] clearing ibrs, dbrs\n", task_pid_nr(task))); for (i=0; i < pmu_conf->num_ibrs; i++) { ia64_set_ibr(i, 0UL); ia64_dv_serialize_instruction(); @@ -4004,7 +3972,7 @@ pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) return -EBUSY; } DPRINT(("task [%d] ctx_state=%d is_system=%d\n", - PFM_CTX_TASK(ctx)->pid, + task_pid_nr(PFM_CTX_TASK(ctx)), state, is_system)); /* @@ -4051,7 +4019,7 @@ pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) */ ia64_psr(regs)->up = 0; } else { - tregs = ia64_task_regs(task); + tregs = task_pt_regs(task); /* * stop monitoring at the user level @@ -4062,7 +4030,7 @@ pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) * monitoring disabled in kernel at next reschedule */ ctx->ctx_saved_psr_up = 0; - DPRINT(("task=[%d]\n", task->pid)); + DPRINT(("task=[%d]\n", task_pid_nr(task))); } return 0; } @@ -4133,7 +4101,7 @@ pfm_start(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) ia64_psr(regs)->up = 1; } else { - tregs = ia64_task_regs(ctx->ctx_task); + tregs = task_pt_regs(ctx->ctx_task); /* * start monitoring at the kernel level the next @@ -4187,10 +4155,10 @@ pfm_check_task_exist(pfm_context_t *ctx) do_each_thread (g, t) { if (t->thread.pfm_context == ctx) { ret = 0; - break; + goto out; } } while_each_thread (g, t); - +out: read_unlock(&tasklist_lock); DPRINT(("pfm_check_task_exist: ret=%d ctx=%p\n", ret, ctx)); @@ -4267,11 +4235,12 @@ pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) if (is_system) { if (pfm_sessions.pfs_ptrace_use_dbregs) { - DPRINT(("cannot load [%d] dbregs in use\n", task->pid)); + DPRINT(("cannot load [%d] dbregs in use\n", + task_pid_nr(task))); ret = -EBUSY; } else { pfm_sessions.pfs_sys_use_dbregs++; - DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task->pid, pfm_sessions.pfs_sys_use_dbregs)); + DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task_pid_nr(task), pfm_sessions.pfs_sys_use_dbregs)); set_dbregs = 1; } } @@ -4351,8 +4320,8 @@ pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) pfm_copy_pmds(task, ctx); pfm_copy_pmcs(task, ctx); - pmcs_source = thread->pmcs; - pmds_source = thread->pmds; + pmcs_source = ctx->th_pmcs; + pmds_source = ctx->th_pmds; /* * always the case for system-wide @@ -4363,7 +4332,7 @@ pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) /* allow user level control */ ia64_psr(regs)->sp = 0; - DPRINT(("clearing psr.sp for [%d]\n", task->pid)); + DPRINT(("clearing psr.sp for [%d]\n", task_pid_nr(task))); SET_LAST_CPU(ctx, smp_processor_id()); INC_ACTIVATION(); @@ -4398,12 +4367,12 @@ pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) */ SET_PMU_OWNER(task, ctx); - DPRINT(("context loaded on PMU for [%d]\n", task->pid)); + DPRINT(("context loaded on PMU for [%d]\n", task_pid_nr(task))); } else { /* * when not current, task MUST be stopped, so this is safe */ - regs = ia64_task_regs(task); + regs = task_pt_regs(task); /* force a full reload */ ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; @@ -4462,7 +4431,7 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg int prev_state, is_system; int ret; - DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1)); + DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task_pid_nr(task) : -1)); prev_state = ctx->ctx_state; is_system = ctx->ctx_fl_system; @@ -4529,7 +4498,7 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg /* * per-task mode */ - tregs = task == current ? regs : ia64_task_regs(task); + tregs = task == current ? regs : task_pt_regs(task); if (task == current) { /* @@ -4537,7 +4506,7 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg */ ia64_psr(regs)->sp = 1; - DPRINT(("setting psr.sp for [%d]\n", task->pid)); + DPRINT(("setting psr.sp for [%d]\n", task_pid_nr(task))); } /* * save PMDs to context @@ -4577,7 +4546,7 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg ctx->ctx_fl_can_restart = 0; ctx->ctx_fl_going_zombie = 0; - DPRINT(("disconnected [%d] from context\n", task->pid)); + DPRINT(("disconnected [%d] from context\n", task_pid_nr(task))); return 0; } @@ -4592,7 +4561,7 @@ pfm_exit_thread(struct task_struct *task) { pfm_context_t *ctx; unsigned long flags; - struct pt_regs *regs = ia64_task_regs(task); + struct pt_regs *regs = task_pt_regs(task); int ret, state; int free_ok = 0; @@ -4600,22 +4569,22 @@ pfm_exit_thread(struct task_struct *task) PROTECT_CTX(ctx, flags); - DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task->pid)); + DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task_pid_nr(task))); state = ctx->ctx_state; switch(state) { case PFM_CTX_UNLOADED: /* - * only comes to thios function if pfm_context is not NULL, i.e., cannot + * only comes to this function if pfm_context is not NULL, i.e., cannot * be in unloaded state */ - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task->pid); + printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task_pid_nr(task)); break; case PFM_CTX_LOADED: case PFM_CTX_MASKED: ret = pfm_context_unload(ctx, NULL, 0, regs); if (ret) { - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret); + printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task_pid_nr(task), state, ret); } DPRINT(("ctx unloaded for current state was %d\n", state)); @@ -4624,12 +4593,12 @@ pfm_exit_thread(struct task_struct *task) case PFM_CTX_ZOMBIE: ret = pfm_context_unload(ctx, NULL, 0, regs); if (ret) { - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret); + printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task_pid_nr(task), state, ret); } free_ok = 1; break; default: - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task->pid, state); + printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task_pid_nr(task), state); break; } UNPROTECT_CTX(ctx, flags); @@ -4713,7 +4682,7 @@ recheck: DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n", ctx->ctx_fd, state, - task->pid, + task_pid_nr(task), task->state, PFM_CMD_STOPPED(cmd))); /* @@ -4759,8 +4728,8 @@ recheck: * the task must be stopped. */ if (PFM_CMD_STOPPED(cmd)) { - if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { - DPRINT(("[%d] task not in stopped state\n", task->pid)); + if (!task_is_stopped_or_traced(task)) { + DPRINT(("[%d] task not in stopped state\n", task_pid_nr(task))); return -EBUSY; } /* @@ -4781,7 +4750,7 @@ recheck: UNPROTECT_CTX(ctx, flags); - wait_task_inactive(task); + wait_task_inactive(task, 0); PROTECT_CTX(ctx, flags); @@ -4802,7 +4771,7 @@ recheck: asmlinkage long sys_perfmonctl (int fd, int cmd, void __user *arg, int count) { - struct file *file = NULL; + struct fd f = {NULL, 0}; pfm_context_t *ctx = NULL; unsigned long flags = 0UL; void *args_k = NULL; @@ -4853,7 +4822,7 @@ restart_args: * limit abuse to min page size */ if (unlikely(sz > PFM_MAX_ARGSIZE)) { - printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", current->pid, sz); + printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", task_pid_nr(current), sz); return -E2BIG; } @@ -4899,17 +4868,17 @@ restart_args: ret = -EBADF; - file = fget(fd); - if (unlikely(file == NULL)) { + f = fdget(fd); + if (unlikely(f.file == NULL)) { DPRINT(("invalid fd %d\n", fd)); goto error_args; } - if (unlikely(PFM_IS_FILE(file) == 0)) { + if (unlikely(PFM_IS_FILE(f.file) == 0)) { DPRINT(("fd %d not related to perfmon\n", fd)); goto error_args; } - ctx = (pfm_context_t *)file->private_data; + ctx = f.file->private_data; if (unlikely(ctx == NULL)) { DPRINT(("no context for fd %d\n", fd)); goto error_args; @@ -4925,7 +4894,7 @@ restart_args: if (unlikely(ret)) goto abort_locked; skip_fd: - ret = (*func)(ctx, args_k, count, ia64_task_regs(current)); + ret = (*func)(ctx, args_k, count, task_pt_regs(current)); call_made = 1; @@ -4933,14 +4902,16 @@ abort_locked: if (likely(ctx)) { DPRINT(("context unlocked\n")); UNPROTECT_CTX(ctx, flags); - fput(file); } /* copy argument back to user, if needed */ if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT; error_args: - if (args_k) kfree(args_k); + if (f.file) + fdput(f); + + kfree(args_k); DPRINT(("cmd=%s ret=%ld\n", PFM_CMD_NAME(cmd), ret)); @@ -4998,11 +4969,11 @@ pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs) { int ret; - DPRINT(("entering for [%d]\n", current->pid)); + DPRINT(("entering for [%d]\n", task_pid_nr(current))); ret = pfm_context_unload(ctx, NULL, 0, regs); if (ret) { - printk(KERN_ERR "pfm_context_force_terminate: [%d] unloaded failed with %d\n", current->pid, ret); + printk(KERN_ERR "pfm_context_force_terminate: [%d] unloaded failed with %d\n", task_pid_nr(current), ret); } /* @@ -5018,12 +4989,13 @@ pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs) } static int pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds); + /* * pfm_handle_work() can be called with interrupts enabled * (TIF_NEED_RESCHED) or disabled. The down_interruptible * call may sleep, therefore we must re-enable interrupts * to avoid deadlocks. It is safe to do so because this function - * is called ONLY when returning to user level (PUStk=1), in which case + * is called ONLY when returning to user level (pUStk=1), in which case * there is no risk of kernel stack overflow due to deep * interrupt nesting. */ @@ -5039,7 +5011,8 @@ pfm_handle_work(void) ctx = PFM_GET_CTX(current); if (ctx == NULL) { - printk(KERN_ERR "perfmon: [%d] has no PFM context\n", current->pid); + printk(KERN_ERR "perfmon: [%d] has no PFM context\n", + task_pid_nr(current)); return; } @@ -5047,9 +5020,7 @@ pfm_handle_work(void) PFM_SET_WORK_PENDING(current, 0); - pfm_clear_task_notify(); - - regs = ia64_task_regs(current); + regs = task_pt_regs(current); /* * extract reason for being here and clear @@ -5063,11 +5034,12 @@ pfm_handle_work(void) /* * must be done before we check for simple-reset mode */ - if (ctx->ctx_fl_going_zombie || ctx->ctx_state == PFM_CTX_ZOMBIE) goto do_zombie; - + if (ctx->ctx_fl_going_zombie || ctx->ctx_state == PFM_CTX_ZOMBIE) + goto do_zombie; //if (CTX_OVFL_NOBLOCK(ctx)) goto skip_blocking; - if (reason == PFM_TRAP_REASON_RESET) goto skip_blocking; + if (reason == PFM_TRAP_REASON_RESET) + goto skip_blocking; /* * restore interrupt mask to what it was on entry. @@ -5086,7 +5058,7 @@ pfm_handle_work(void) * may go through without blocking on SMP systems * if restart has been received already by the time we call down() */ - ret = down_interruptible(&ctx->ctx_restart_sem); + ret = wait_for_completion_interruptible(&ctx->ctx_restart_done); DPRINT(("after block sleeping ret=%d\n", ret)); @@ -5115,7 +5087,8 @@ do_zombie: /* * in case of interruption of down() we don't restart anything */ - if (ret < 0) goto nothing_to_do; + if (ret < 0) + goto nothing_to_do; skip_blocking: pfm_resume_after_ovfl(ctx, ovfl_regs, regs); @@ -5207,10 +5180,10 @@ pfm_end_notify_user(pfm_context_t *ctx) /* * main overflow processing routine. - * it can be called from the interrupt path or explicitely during the context switch code + * it can be called from the interrupt path or explicitly during the context switch code */ -static void -pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs) +static void pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, + unsigned long pmc0, struct pt_regs *regs) { pfm_ovfl_arg_t *ovfl_arg; unsigned long mask; @@ -5236,7 +5209,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s " "used_pmds=0x%lx\n", pmc0, - task ? task->pid: -1, + task ? task_pid_nr(task): -1, (regs ? regs->cr_iip : 0), CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking", ctx->ctx_used_pmds[0])); @@ -5415,7 +5388,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str * when coming from ctxsw, current still points to the * previous task, therefore we must work with task and not current. */ - pfm_set_task_notify(task); + set_notify_resume(task); } /* * defer until state is changed (shorten spin window). the context is locked @@ -5425,7 +5398,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str } DPRINT_ovfl(("owner [%d] pending=%ld reason=%u ovfl_pmds=0x%lx ovfl_notify=0x%lx masked=%d\n", - GET_PMU_OWNER() ? GET_PMU_OWNER()->pid : -1, + GET_PMU_OWNER() ? task_pid_nr(GET_PMU_OWNER()) : -1, PFM_GET_WORK_PENDING(task), ctx->ctx_fl_trap_reason, ovfl_pmds, @@ -5450,7 +5423,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str sanity_check: printk(KERN_ERR "perfmon: CPU%d overflow handler [%d] pmc0=0x%lx\n", smp_processor_id(), - task ? task->pid : -1, + task ? task_pid_nr(task) : -1, pmc0); return; @@ -5483,7 +5456,7 @@ stop_monitoring: * * Overall pretty hairy stuff.... */ - DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task->pid: -1)); + DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task_pid_nr(task): -1)); pfm_clear_psr_up(); ia64_psr(regs)->up = 0; ia64_psr(regs)->sp = 1; @@ -5491,7 +5464,7 @@ stop_monitoring: } static int -pfm_do_interrupt_handler(int irq, void *arg, struct pt_regs *regs) +pfm_do_interrupt_handler(void *arg, struct pt_regs *regs) { struct task_struct *task; pfm_context_t *ctx; @@ -5544,24 +5517,25 @@ pfm_do_interrupt_handler(int irq, void *arg, struct pt_regs *regs) report_spurious1: printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d has no PFM context\n", - this_cpu, task->pid); + this_cpu, task_pid_nr(task)); pfm_unfreeze_pmu(); return -1; report_spurious2: printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d, invalid flag\n", this_cpu, - task->pid); + task_pid_nr(task)); pfm_unfreeze_pmu(); return -1; } static irqreturn_t -pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs) +pfm_interrupt_handler(int irq, void *arg) { unsigned long start_cycles, total_cycles; unsigned long min, max; int this_cpu; int ret; + struct pt_regs *regs = get_irq_regs(); this_cpu = get_cpu(); if (likely(!pfm_alt_intr_handler)) { @@ -5570,7 +5544,7 @@ pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs) start_cycles = ia64_get_itc(); - ret = pfm_do_interrupt_handler(irq, arg, regs); + ret = pfm_do_interrupt_handler(arg, regs); total_cycles = ia64_get_itc(); @@ -5590,7 +5564,7 @@ pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs) (*pfm_alt_intr_handler->handler)(irq, arg, regs); } - put_cpu_no_resched(); + put_cpu(); return IRQ_HANDLED; } @@ -5598,7 +5572,7 @@ pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs) * /proc/perfmon interface, for debug only */ -#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1) +#define PFM_PROC_SHOW_HEADER ((void *)(long)nr_cpu_ids+1) static void * pfm_proc_start(struct seq_file *m, loff_t *pos) @@ -5607,7 +5581,7 @@ pfm_proc_start(struct seq_file *m, loff_t *pos) return PFM_PROC_SHOW_HEADER; } - while (*pos <= NR_CPUS) { + while (*pos <= nr_cpu_ids) { if (cpu_online(*pos - 1)) { return (void *)*pos; } @@ -5667,24 +5641,8 @@ pfm_proc_show_header(struct seq_file *m) list_for_each(pos, &pfm_buffer_fmt_list) { entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); - seq_printf(m, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n", - entry->fmt_uuid[0], - entry->fmt_uuid[1], - entry->fmt_uuid[2], - entry->fmt_uuid[3], - entry->fmt_uuid[4], - entry->fmt_uuid[5], - entry->fmt_uuid[6], - entry->fmt_uuid[7], - entry->fmt_uuid[8], - entry->fmt_uuid[9], - entry->fmt_uuid[10], - entry->fmt_uuid[11], - entry->fmt_uuid[12], - entry->fmt_uuid[13], - entry->fmt_uuid[14], - entry->fmt_uuid[15], - entry->fmt_name); + seq_printf(m, "format : %16phD %s\n", + entry->fmt_uuid, entry->fmt_name); } spin_unlock(&pfm_buffer_fmt_lock); @@ -5759,7 +5717,7 @@ pfm_proc_show(struct seq_file *m, void *v) return 0; } -struct seq_operations pfm_seq_ops = { +const struct seq_operations pfm_seq_ops = { .start = pfm_proc_start, .next = pfm_proc_next, .stop = pfm_proc_stop, @@ -5793,7 +5751,7 @@ pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_c * on every CPU, so we can rely on the pid to identify the idle task. */ if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 || task->pid) { - regs = ia64_task_regs(task); + regs = task_pt_regs(task); ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0; return; } @@ -5836,7 +5794,8 @@ pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs) ia64_psr(regs)->sp = 1; if (GET_PMU_OWNER() == task) { - DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid)); + DPRINT(("cleared ownership for [%d]\n", + task_pid_nr(ctx->ctx_task))); SET_PMU_OWNER(NULL, NULL); } @@ -5848,7 +5807,7 @@ pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs) task->thread.pfm_context = NULL; task->thread.flags &= ~IA64_THREAD_PM_VALID; - DPRINT(("force cleanup for [%d]\n", task->pid)); + DPRINT(("force cleanup for [%d]\n", task_pid_nr(task))); } @@ -5859,14 +5818,12 @@ void pfm_save_regs(struct task_struct *task) { pfm_context_t *ctx; - struct thread_struct *t; unsigned long flags; u64 psr; ctx = PFM_GET_CTX(task); if (ctx == NULL) return; - t = &task->thread; /* * we always come here with interrupts ALREADY disabled by @@ -5876,7 +5833,7 @@ pfm_save_regs(struct task_struct *task) flags = pfm_protect_ctx_ctxsw(ctx); if (ctx->ctx_state == PFM_CTX_ZOMBIE) { - struct pt_regs *regs = ia64_task_regs(task); + struct pt_regs *regs = task_pt_regs(task); pfm_clear_psr_up(); @@ -5924,19 +5881,19 @@ pfm_save_regs(struct task_struct *task) * guarantee we will be schedule at that same * CPU again. */ - pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]); + pfm_save_pmds(ctx->th_pmds, ctx->ctx_used_pmds[0]); /* * save pmc0 ia64_srlz_d() done in pfm_save_pmds() * we will need it on the restore path to check * for pending overflow. */ - t->pmcs[0] = ia64_get_pmc(0); + ctx->th_pmcs[0] = ia64_get_pmc(0); /* * unfreeze PMU if had pending overflows */ - if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); + if (ctx->th_pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); /* * finally, allow context access. @@ -5981,7 +5938,6 @@ static void pfm_lazy_save_regs (struct task_struct *task) { pfm_context_t *ctx; - struct thread_struct *t; unsigned long flags; { u64 psr = pfm_get_psr(); @@ -5989,7 +5945,6 @@ pfm_lazy_save_regs (struct task_struct *task) } ctx = PFM_GET_CTX(task); - t = &task->thread; /* * we need to mask PMU overflow here to @@ -6014,19 +5969,19 @@ pfm_lazy_save_regs (struct task_struct *task) /* * save all the pmds we use */ - pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]); + pfm_save_pmds(ctx->th_pmds, ctx->ctx_used_pmds[0]); /* * save pmc0 ia64_srlz_d() done in pfm_save_pmds() * it is needed to check for pended overflow * on the restore path */ - t->pmcs[0] = ia64_get_pmc(0); + ctx->th_pmcs[0] = ia64_get_pmc(0); /* * unfreeze PMU if had pending overflows */ - if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); + if (ctx->th_pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); /* * now get can unmask PMU interrupts, they will @@ -6045,7 +6000,6 @@ void pfm_load_regs (struct task_struct *task) { pfm_context_t *ctx; - struct thread_struct *t; unsigned long pmc_mask = 0UL, pmd_mask = 0UL; unsigned long flags; u64 psr, psr_up; @@ -6056,11 +6010,10 @@ pfm_load_regs (struct task_struct *task) BUG_ON(GET_PMU_OWNER()); - t = &task->thread; /* * possible on unload */ - if (unlikely((t->flags & IA64_THREAD_PM_VALID) == 0)) return; + if (unlikely((task->thread.flags & IA64_THREAD_PM_VALID) == 0)) return; /* * we always come here with interrupts ALREADY disabled by @@ -6076,7 +6029,7 @@ pfm_load_regs (struct task_struct *task) BUG_ON(psr & IA64_PSR_I); if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) { - struct pt_regs *regs = ia64_task_regs(task); + struct pt_regs *regs = task_pt_regs(task); BUG_ON(ctx->ctx_smpl_hdr); @@ -6142,26 +6095,26 @@ pfm_load_regs (struct task_struct *task) * * XXX: optimize here */ - if (pmd_mask) pfm_restore_pmds(t->pmds, pmd_mask); - if (pmc_mask) pfm_restore_pmcs(t->pmcs, pmc_mask); + if (pmd_mask) pfm_restore_pmds(ctx->th_pmds, pmd_mask); + if (pmc_mask) pfm_restore_pmcs(ctx->th_pmcs, pmc_mask); /* * check for pending overflow at the time the state * was saved. */ - if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) { + if (unlikely(PMC0_HAS_OVFL(ctx->th_pmcs[0]))) { /* * reload pmc0 with the overflow information * On McKinley PMU, this will trigger a PMU interrupt */ - ia64_set_pmc(0, t->pmcs[0]); + ia64_set_pmc(0, ctx->th_pmcs[0]); ia64_srlz_d(); - t->pmcs[0] = 0UL; + ctx->th_pmcs[0] = 0UL; /* * will replay the PMU interrupt */ - if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR); + if (need_irq_resend) ia64_resend_irq(IA64_PERFMON_VECTOR); pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; } @@ -6209,7 +6162,6 @@ pfm_load_regs (struct task_struct *task) void pfm_load_regs (struct task_struct *task) { - struct thread_struct *t; pfm_context_t *ctx; struct task_struct *owner; unsigned long pmd_mask, pmc_mask; @@ -6218,7 +6170,6 @@ pfm_load_regs (struct task_struct *task) owner = GET_PMU_OWNER(); ctx = PFM_GET_CTX(task); - t = &task->thread; psr = pfm_get_psr(); BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); @@ -6281,27 +6232,27 @@ pfm_load_regs (struct task_struct *task) */ pmc_mask = ctx->ctx_all_pmcs[0]; - pfm_restore_pmds(t->pmds, pmd_mask); - pfm_restore_pmcs(t->pmcs, pmc_mask); + pfm_restore_pmds(ctx->th_pmds, pmd_mask); + pfm_restore_pmcs(ctx->th_pmcs, pmc_mask); /* * check for pending overflow at the time the state * was saved. */ - if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) { + if (unlikely(PMC0_HAS_OVFL(ctx->th_pmcs[0]))) { /* * reload pmc0 with the overflow information * On McKinley PMU, this will trigger a PMU interrupt */ - ia64_set_pmc(0, t->pmcs[0]); + ia64_set_pmc(0, ctx->th_pmcs[0]); ia64_srlz_d(); - t->pmcs[0] = 0UL; + ctx->th_pmcs[0] = 0UL; /* * will replay the PMU interrupt */ - if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR); + if (need_irq_resend) ia64_resend_irq(IA64_PERFMON_VECTOR); pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; } @@ -6371,11 +6322,11 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) */ pfm_unfreeze_pmu(); } else { - pmc0 = task->thread.pmcs[0]; + pmc0 = ctx->th_pmcs[0]; /* * clear whatever overflow status bits there were */ - task->thread.pmcs[0] = 0; + ctx->th_pmcs[0] = 0; } ovfl_val = pmu_conf->ovfl_val; /* @@ -6396,11 +6347,11 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) /* * can access PMU always true in system wide mode */ - val = pmd_val = can_access_pmu ? ia64_get_pmd(i) : task->thread.pmds[i]; + val = pmd_val = can_access_pmu ? ia64_get_pmd(i) : ctx->th_pmds[i]; if (PMD_IS_COUNTING(i)) { DPRINT(("[%d] pmd[%d] ctx_pmd=0x%lx hw_pmd=0x%lx\n", - task->pid, + task_pid_nr(task), i, ctx->ctx_pmds[i].val, val & ovfl_val)); @@ -6422,13 +6373,13 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) */ if (pmc0 & (1UL << i)) { val += 1 + ovfl_val; - DPRINT(("[%d] pmd[%d] overflowed\n", task->pid, i)); + DPRINT(("[%d] pmd[%d] overflowed\n", task_pid_nr(task), i)); } } - DPRINT(("[%d] ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task->pid, i, val, pmd_val)); + DPRINT(("[%d] ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task_pid_nr(task), i, val, pmd_val)); - if (is_self) task->thread.pmds[i] = pmd_val; + if (is_self) ctx->th_pmds[i] = pmd_val; ctx->ctx_pmds[i].val = val; } @@ -6436,7 +6387,6 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) static struct irqaction perfmon_irqaction = { .handler = pfm_interrupt_handler, - .flags = SA_INTERRUPT, .name = "perfmon" }; @@ -6445,7 +6395,7 @@ pfm_alt_save_pmu_state(void *data) { struct pt_regs *regs; - regs = ia64_task_regs(current); + regs = task_pt_regs(current); DPRINT(("called\n")); @@ -6471,7 +6421,7 @@ pfm_alt_restore_pmu_state(void *data) { struct pt_regs *regs; - regs = ia64_task_regs(current); + regs = task_pt_regs(current); DPRINT(("called\n")); @@ -6515,7 +6465,7 @@ pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) } /* save the current system wide pmu states */ - ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 0, 1); + ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 1); if (ret) { DPRINT(("on_each_cpu() failed: %d\n", ret)); goto cleanup_reserve; @@ -6560,7 +6510,7 @@ pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) pfm_alt_intr_handler = NULL; - ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 0, 1); + ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 1); if (ret) { DPRINT(("on_each_cpu() failed: %d\n", ret)); } @@ -6603,7 +6553,7 @@ found: return 0; } -static struct file_operations pfm_proc_fops = { +static const struct file_operations pfm_proc_fops = { .open = pfm_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -6672,7 +6622,7 @@ pfm_init(void) ffz(pmu_conf->ovfl_val)); /* sanity check */ - if (pmu_conf->num_pmds >= IA64_NUM_PMD_REGS || pmu_conf->num_pmcs >= IA64_NUM_PMC_REGS) { + if (pmu_conf->num_pmds >= PFM_NUM_PMD_REGS || pmu_conf->num_pmcs >= PFM_NUM_PMC_REGS) { printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n"); pmu_conf = NULL; return -1; @@ -6681,21 +6631,17 @@ pfm_init(void) /* * create /proc/perfmon (mostly for debugging purposes) */ - perfmon_dir = create_proc_entry("perfmon", S_IRUGO, NULL); + perfmon_dir = proc_create("perfmon", S_IRUGO, NULL, &pfm_proc_fops); if (perfmon_dir == NULL) { printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n"); pmu_conf = NULL; return -1; } - /* - * install customized file operations for /proc/perfmon entry - */ - perfmon_dir->proc_fops = &pfm_proc_fops; /* * create /proc/sys/kernel/perfmon (for debugging purposes) */ - pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0); + pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root); /* * initialize all our spinlocks @@ -6718,6 +6664,7 @@ __initcall(pfm_init); void pfm_init_percpu (void) { + static int first_time=1; /* * make sure no measurement is active * (may inherit programmed PMCs from EFI). @@ -6730,8 +6677,10 @@ pfm_init_percpu (void) */ pfm_unfreeze_pmu(); - if (smp_processor_id() == 0) + if (first_time) { register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); + first_time=0; + } ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); ia64_srlz_d(); @@ -6744,7 +6693,6 @@ void dump_pmu_state(const char *from) { struct task_struct *task; - struct thread_struct *t; struct pt_regs *regs; pfm_context_t *ctx; unsigned long psr, dcr, info, flags; @@ -6753,7 +6701,7 @@ dump_pmu_state(const char *from) local_irq_save(flags); this_cpu = smp_processor_id(); - regs = ia64_task_regs(current); + regs = task_pt_regs(current); info = PFM_CPUINFO_GET(); dcr = ia64_getreg(_IA64_REG_CR_DCR); @@ -6765,14 +6713,14 @@ dump_pmu_state(const char *from) printk("CPU%d from %s() current [%d] iip=0x%lx %s\n", this_cpu, from, - current->pid, + task_pid_nr(current), regs->cr_iip, current->comm); task = GET_PMU_OWNER(); ctx = GET_PMU_CTX(); - printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task->pid : -1, ctx); + printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task_pid_nr(task) : -1, ctx); psr = pfm_get_psr(); @@ -6789,16 +6737,14 @@ dump_pmu_state(const char *from) ia64_psr(regs)->up = 0; ia64_psr(regs)->pp = 0; - t = ¤t->thread; - for (i=1; PMC_IS_LAST(i) == 0; i++) { if (PMC_IS_IMPL(i) == 0) continue; - printk("->CPU%d pmc[%d]=0x%lx thread_pmc[%d]=0x%lx\n", this_cpu, i, ia64_get_pmc(i), i, t->pmcs[i]); + printk("->CPU%d pmc[%d]=0x%lx thread_pmc[%d]=0x%lx\n", this_cpu, i, ia64_get_pmc(i), i, ctx->th_pmcs[i]); } for (i=1; PMD_IS_LAST(i) == 0; i++) { if (PMD_IS_IMPL(i) == 0) continue; - printk("->CPU%d pmd[%d]=0x%lx thread_pmd[%d]=0x%lx\n", this_cpu, i, ia64_get_pmd(i), i, t->pmds[i]); + printk("->CPU%d pmd[%d]=0x%lx thread_pmd[%d]=0x%lx\n", this_cpu, i, ia64_get_pmd(i), i, ctx->th_pmds[i]); } if (ctx) { @@ -6822,7 +6768,7 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs) { struct thread_struct *thread; - DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task->pid)); + DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task_pid_nr(task))); thread = &task->thread; diff --git a/arch/ia64/kernel/perfmon_default_smpl.c b/arch/ia64/kernel/perfmon_default_smpl.c index 344941db0a9..30c644ea44c 100644 --- a/arch/ia64/kernel/perfmon_default_smpl.c +++ b/arch/ia64/kernel/perfmon_default_smpl.c @@ -8,7 +8,6 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> -#include <linux/config.h> #include <linux/init.h> #include <asm/delay.h> #include <linux/smp.h> @@ -25,12 +24,12 @@ MODULE_LICENSE("GPL"); #ifdef DEFAULT_DEBUG #define DPRINT(a) \ do { \ - if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \ + if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d ", __func__, __LINE__, smp_processor_id()); printk a; } \ } while (0) #define DPRINT_ovfl(a) \ do { \ - if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \ + if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d ", __func__, __LINE__, smp_processor_id()); printk a; } \ } while (0) #else @@ -45,11 +44,11 @@ default_validate(struct task_struct *task, unsigned int flags, int cpu, void *da int ret = 0; if (data == NULL) { - DPRINT(("[%d] no argument passed\n", task->pid)); + DPRINT(("[%d] no argument passed\n", task_pid_nr(task))); return -EINVAL; } - DPRINT(("[%d] validate flags=0x%x CPU%d\n", task->pid, flags, cpu)); + DPRINT(("[%d] validate flags=0x%x CPU%d\n", task_pid_nr(task), flags, cpu)); /* * must hold at least the buffer header + one minimally sized entry @@ -89,7 +88,7 @@ default_init(struct task_struct *task, void *buf, unsigned int flags, int cpu, v hdr->hdr_count = 0UL; DPRINT(("[%d] buffer=%p buf_size=%lu hdr_size=%lu hdr_version=%u cur_offs=%lu\n", - task->pid, + task_pid_nr(task), buf, hdr->hdr_buf_size, sizeof(*hdr), @@ -151,7 +150,7 @@ default_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct * current = task running at the time of the overflow. * * per-task mode: - * - this is ususally the task being monitored. + * - this is usually the task being monitored. * Under certain conditions, it might be a different task * * system-wide: @@ -246,7 +245,7 @@ default_restart(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, stru static int default_exit(struct task_struct *task, void *buf, struct pt_regs *regs) { - DPRINT(("[%d] exit(%p)\n", task->pid, buf)); + DPRINT(("[%d] exit(%p)\n", task_pid_nr(task), buf)); return 0; } diff --git a/arch/ia64/kernel/perfmon_mckinley.h b/arch/ia64/kernel/perfmon_mckinley.h index 9becccda289..c4bec7a9d18 100644 --- a/arch/ia64/kernel/perfmon_mckinley.h +++ b/arch/ia64/kernel/perfmon_mckinley.h @@ -181,7 +181,7 @@ static pmu_config_t pmu_conf_mck={ .pmc_desc = pfm_mck_pmc_desc, .num_ibrs = 8, .num_dbrs = 8, - .use_rr_dbregs = 1 /* debug register are use for range retrictions */ + .use_rr_dbregs = 1 /* debug register are use for range restrictions */ }; diff --git a/arch/ia64/kernel/perfmon_montecito.h b/arch/ia64/kernel/perfmon_montecito.h new file mode 100644 index 00000000000..7f8da4c7ca6 --- /dev/null +++ b/arch/ia64/kernel/perfmon_montecito.h @@ -0,0 +1,269 @@ +/* + * This file contains the Montecito PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + */ +static int pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); + +#define RDEP_MONT_ETB (RDEP(38)|RDEP(39)|RDEP(48)|RDEP(49)|RDEP(50)|RDEP(51)|RDEP(52)|RDEP(53)|RDEP(54)|\ + RDEP(55)|RDEP(56)|RDEP(57)|RDEP(58)|RDEP(59)|RDEP(60)|RDEP(61)|RDEP(62)|RDEP(63)) +#define RDEP_MONT_DEAR (RDEP(32)|RDEP(33)|RDEP(36)) +#define RDEP_MONT_IEAR (RDEP(34)|RDEP(35)) + +static pfm_reg_desc_t pfm_mont_pmc_desc[PMU_MAX_PMCS]={ +/* pmc0 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, +/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, +/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, +/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, +/* pmc4 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(4),0, 0, 0}, {0,0, 0, 0}}, +/* pmc5 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(5),0, 0, 0}, {0,0, 0, 0}}, +/* pmc6 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(6),0, 0, 0}, {0,0, 0, 0}}, +/* pmc7 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(7),0, 0, 0}, {0,0, 0, 0}}, +/* pmc8 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(8),0, 0, 0}, {0,0, 0, 0}}, +/* pmc9 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(9),0, 0, 0}, {0,0, 0, 0}}, +/* pmc10 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(10),0, 0, 0}, {0,0, 0, 0}}, +/* pmc11 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(11),0, 0, 0}, {0,0, 0, 0}}, +/* pmc12 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(12),0, 0, 0}, {0,0, 0, 0}}, +/* pmc13 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(13),0, 0, 0}, {0,0, 0, 0}}, +/* pmc14 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(14),0, 0, 0}, {0,0, 0, 0}}, +/* pmc15 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(15),0, 0, 0}, {0,0, 0, 0}}, +/* pmc16 */ { PFM_REG_NOTIMPL, }, +/* pmc17 */ { PFM_REG_NOTIMPL, }, +/* pmc18 */ { PFM_REG_NOTIMPL, }, +/* pmc19 */ { PFM_REG_NOTIMPL, }, +/* pmc20 */ { PFM_REG_NOTIMPL, }, +/* pmc21 */ { PFM_REG_NOTIMPL, }, +/* pmc22 */ { PFM_REG_NOTIMPL, }, +/* pmc23 */ { PFM_REG_NOTIMPL, }, +/* pmc24 */ { PFM_REG_NOTIMPL, }, +/* pmc25 */ { PFM_REG_NOTIMPL, }, +/* pmc26 */ { PFM_REG_NOTIMPL, }, +/* pmc27 */ { PFM_REG_NOTIMPL, }, +/* pmc28 */ { PFM_REG_NOTIMPL, }, +/* pmc29 */ { PFM_REG_NOTIMPL, }, +/* pmc30 */ { PFM_REG_NOTIMPL, }, +/* pmc31 */ { PFM_REG_NOTIMPL, }, +/* pmc32 */ { PFM_REG_CONFIG, 0, 0x30f01ffffffffffUL, 0x30f01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, +/* pmc33 */ { PFM_REG_CONFIG, 0, 0x0, 0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, +/* pmc34 */ { PFM_REG_CONFIG, 0, 0xf01ffffffffffUL, 0xf01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, +/* pmc35 */ { PFM_REG_CONFIG, 0, 0x0, 0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, +/* pmc36 */ { PFM_REG_CONFIG, 0, 0xfffffff0, 0xf, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, +/* pmc37 */ { PFM_REG_MONITOR, 4, 0x0, 0x3fff, NULL, pfm_mont_pmc_check, {RDEP_MONT_IEAR, 0, 0, 0}, {0, 0, 0, 0}}, +/* pmc38 */ { PFM_REG_CONFIG, 0, 0xdb6, 0x2492, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, +/* pmc39 */ { PFM_REG_MONITOR, 6, 0x0, 0xffcf, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}}, +/* pmc40 */ { PFM_REG_MONITOR, 6, 0x2000000, 0xf01cf, NULL, pfm_mont_pmc_check, {RDEP_MONT_DEAR,0, 0, 0}, {0,0, 0, 0}}, +/* pmc41 */ { PFM_REG_CONFIG, 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, +/* pmc42 */ { PFM_REG_MONITOR, 6, 0x0, 0x7ff4f, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}}, + { PFM_REG_END , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */ +}; + +static pfm_reg_desc_t pfm_mont_pmd_desc[PMU_MAX_PMDS]={ +/* pmd0 */ { PFM_REG_NOTIMPL, }, +/* pmd1 */ { PFM_REG_NOTIMPL, }, +/* pmd2 */ { PFM_REG_NOTIMPL, }, +/* pmd3 */ { PFM_REG_NOTIMPL, }, +/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(4),0, 0, 0}}, +/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(5),0, 0, 0}}, +/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(6),0, 0, 0}}, +/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(7),0, 0, 0}}, +/* pmd8 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(8),0, 0, 0}}, +/* pmd9 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(9),0, 0, 0}}, +/* pmd10 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(10),0, 0, 0}}, +/* pmd11 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(11),0, 0, 0}}, +/* pmd12 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(12),0, 0, 0}}, +/* pmd13 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(13),0, 0, 0}}, +/* pmd14 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(14),0, 0, 0}}, +/* pmd15 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(15),0, 0, 0}}, +/* pmd16 */ { PFM_REG_NOTIMPL, }, +/* pmd17 */ { PFM_REG_NOTIMPL, }, +/* pmd18 */ { PFM_REG_NOTIMPL, }, +/* pmd19 */ { PFM_REG_NOTIMPL, }, +/* pmd20 */ { PFM_REG_NOTIMPL, }, +/* pmd21 */ { PFM_REG_NOTIMPL, }, +/* pmd22 */ { PFM_REG_NOTIMPL, }, +/* pmd23 */ { PFM_REG_NOTIMPL, }, +/* pmd24 */ { PFM_REG_NOTIMPL, }, +/* pmd25 */ { PFM_REG_NOTIMPL, }, +/* pmd26 */ { PFM_REG_NOTIMPL, }, +/* pmd27 */ { PFM_REG_NOTIMPL, }, +/* pmd28 */ { PFM_REG_NOTIMPL, }, +/* pmd29 */ { PFM_REG_NOTIMPL, }, +/* pmd30 */ { PFM_REG_NOTIMPL, }, +/* pmd31 */ { PFM_REG_NOTIMPL, }, +/* pmd32 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(33)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}}, +/* pmd33 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}}, +/* pmd34 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(35),0, 0, 0}, {RDEP(37),0, 0, 0}}, +/* pmd35 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(34),0, 0, 0}, {RDEP(37),0, 0, 0}}, +/* pmd36 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(33),0, 0, 0}, {RDEP(40),0, 0, 0}}, +/* pmd37 */ { PFM_REG_NOTIMPL, }, +/* pmd38 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd39 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd40 */ { PFM_REG_NOTIMPL, }, +/* pmd41 */ { PFM_REG_NOTIMPL, }, +/* pmd42 */ { PFM_REG_NOTIMPL, }, +/* pmd43 */ { PFM_REG_NOTIMPL, }, +/* pmd44 */ { PFM_REG_NOTIMPL, }, +/* pmd45 */ { PFM_REG_NOTIMPL, }, +/* pmd46 */ { PFM_REG_NOTIMPL, }, +/* pmd47 */ { PFM_REG_NOTIMPL, }, +/* pmd48 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd49 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd50 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd51 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd52 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd53 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd54 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd55 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd56 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd57 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd58 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd59 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd60 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd61 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd62 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +/* pmd63 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, + { PFM_REG_END , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */ +}; + +/* + * PMC reserved fields must have their power-up values preserved + */ +static int +pfm_mont_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs) +{ + unsigned long tmp1, tmp2, ival = *val; + + /* remove reserved areas from user value */ + tmp1 = ival & PMC_RSVD_MASK(cnum); + + /* get reserved fields values */ + tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum); + + *val = tmp1 | tmp2; + + DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n", + cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val)); + return 0; +} + +/* + * task can be NULL if the context is unloaded + */ +static int +pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) +{ + int ret = 0; + unsigned long val32 = 0, val38 = 0, val41 = 0; + unsigned long tmpval; + int check_case1 = 0; + int is_loaded; + + /* first preserve the reserved fields */ + pfm_mont_reserved(cnum, val, regs); + + tmpval = *val; + + /* sanity check */ + if (ctx == NULL) return -EINVAL; + + is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; + + /* + * we must clear the debug registers if pmc41 has a value which enable + * memory pipeline event constraints. In this case we need to clear the + * the debug registers if they have not yet been accessed. This is required + * to avoid picking stale state. + * PMC41 is "active" if: + * one of the pmc41.cfg_dtagXX field is different from 0x3 + * AND + * at the corresponding pmc41.en_dbrpXX is set. + * AND + * ctx_fl_using_dbreg == 0 (i.e., dbr not yet used) + */ + DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, tmpval, ctx->ctx_fl_using_dbreg, is_loaded)); + + if (cnum == 41 && is_loaded + && (tmpval & 0x1e00000000000UL) && (tmpval & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) { + + DPRINT(("pmc[%d]=0x%lx has active pmc41 settings, clearing dbr\n", cnum, tmpval)); + + /* don't mix debug with perfmon */ + if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; + + /* + * a count of 0 will mark the debug registers if: + * AND + */ + ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs); + if (ret) return ret; + } + /* + * we must clear the (instruction) debug registers if: + * pmc38.ig_ibrpX is 0 (enabled) + * AND + * ctx_fl_using_dbreg == 0 (i.e., dbr not yet used) + */ + if (cnum == 38 && is_loaded && ((tmpval & 0x492UL) != 0x492UL) && ctx->ctx_fl_using_dbreg == 0) { + + DPRINT(("pmc38=0x%lx has active pmc38 settings, clearing ibr\n", tmpval)); + + /* don't mix debug with perfmon */ + if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; + + /* + * a count of 0 will mark the debug registers as in use and also + * ensure that they are properly cleared. + */ + ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs); + if (ret) return ret; + + } + switch(cnum) { + case 32: val32 = *val; + val38 = ctx->ctx_pmcs[38]; + val41 = ctx->ctx_pmcs[41]; + check_case1 = 1; + break; + case 38: val38 = *val; + val32 = ctx->ctx_pmcs[32]; + val41 = ctx->ctx_pmcs[41]; + check_case1 = 1; + break; + case 41: val41 = *val; + val32 = ctx->ctx_pmcs[32]; + val38 = ctx->ctx_pmcs[38]; + check_case1 = 1; + break; + } + /* check illegal configuration which can produce inconsistencies in tagging + * i-side events in L1D and L2 caches + */ + if (check_case1) { + ret = (((val41 >> 45) & 0xf) == 0 && ((val32>>57) & 0x1) == 0) + && ((((val38>>1) & 0x3) == 0x2 || ((val38>>1) & 0x3) == 0) + || (((val38>>4) & 0x3) == 0x2 || ((val38>>4) & 0x3) == 0)); + if (ret) { + DPRINT(("invalid config pmc38=0x%lx pmc41=0x%lx pmc32=0x%lx\n", val38, val41, val32)); + return -EINVAL; + } + } + *val = tmpval; + return 0; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! + */ +static pmu_config_t pmu_conf_mont={ + .pmu_name = "Montecito", + .pmu_family = 0x20, + .flags = PFM_PMU_IRQ_RESEND, + .ovfl_val = (1UL << 47) - 1, + .pmd_desc = pfm_mont_pmd_desc, + .pmc_desc = pfm_mont_pmc_desc, + .num_ibrs = 8, + .num_dbrs = 8, + .use_rr_dbregs = 1 /* debug register are use for range retrictions */ +}; diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 051e050359e..55d4ba47a90 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -4,10 +4,10 @@ * Copyright (C) 1998-2003 Hewlett-Packard Co * David Mosberger-Tang <davidm@hpl.hp.com> * 04/11/17 Ashok Raj <ashok.raj@intel.com> Added CPU Hotplug Support + * + * 2005-10-07 Keith Owens <kaos@sgi.com> + * Add notify_die() hooks. */ -#define __KERNEL_SYSCALLS__ /* see <asm/unistd.h> */ -#include <linux/config.h> - #include <linux/cpu.h> #include <linux/pm.h> #include <linux/elf.h> @@ -15,28 +15,31 @@ #include <linux/kallsyms.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/personality.h> #include <linux/sched.h> -#include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/stddef.h> #include <linux/thread_info.h> #include <linux/unistd.h> #include <linux/efi.h> #include <linux/interrupt.h> #include <linux/delay.h> -#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/utsname.h> +#include <linux/tracehook.h> +#include <linux/rcupdate.h> #include <asm/cpu.h> #include <asm/delay.h> #include <asm/elf.h> -#include <asm/ia32.h> #include <asm/irq.h> +#include <asm/kexec.h> #include <asm/pgalloc.h> #include <asm/processor.h> #include <asm/sal.h> +#include <asm/switch_to.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> #include <asm/unwind.h> @@ -51,10 +54,11 @@ #include "sigframe.h" void (*ia64_mark_idle)(int); -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); -unsigned long boot_option_idle_override = 0; +unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); +void (*pm_power_off) (void); +EXPORT_SYMBOL(pm_power_off); void ia64_do_show_stack (struct unw_frame_info *info, void *arg) @@ -92,22 +96,16 @@ show_stack (struct task_struct *task, unsigned long *sp) } void -dump_stack (void) -{ - show_stack(NULL, NULL); -} - -EXPORT_SYMBOL(dump_stack); - -void show_regs (struct pt_regs *regs) { unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; print_modules(); - printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm); - printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", - regs->cr_ipsr, regs->cr_ifs, ip, print_tainted()); + printk("\n"); + show_regs_print_info(KERN_DEFAULT); + printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s (%s)\n", + regs->cr_ipsr, regs->cr_ifs, ip, print_tainted(), + init_utsname()->release); print_symbol("ip is at %s\n", ip); printk("unat: %016lx pfs : %016lx rsc : %016lx\n", regs->ar_unat, regs->ar_pfs, regs->ar_rsc); @@ -154,11 +152,21 @@ show_regs (struct pt_regs *regs) show_stack(NULL, NULL); } +/* local support for deprecated console_print */ +void +console_print(const char *s) +{ + printk(KERN_EMERG "%s", s); +} + void -do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall) +do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall) { if (fsys_mode(current, &scr->pt)) { - /* defer signal-handling etc. until we return to privilege-level 0. */ + /* + * defer signal-handling etc. until we return to + * privilege-level 0. + */ if (!ia64_psr(&scr->pt)->lp) ia64_psr(&scr->pt)->lp = 1; return; @@ -166,49 +174,44 @@ do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall #ifdef CONFIG_PERFMON if (current->thread.pfm_needs_checking) + /* + * Note: pfm_handle_work() allow us to call it with interrupts + * disabled, and may enable interrupts within the function. + */ pfm_handle_work(); #endif /* deal with pending signal delivery */ - if (test_thread_flag(TIF_SIGPENDING)) - ia64_do_signal(oldset, scr, in_syscall); -} + if (test_thread_flag(TIF_SIGPENDING)) { + local_irq_enable(); /* force interrupt enable */ + ia64_do_signal(scr, in_syscall); + } -static int pal_halt = 1; -static int can_do_pal_halt = 1; + if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) { + local_irq_enable(); /* force interrupt enable */ + tracehook_notify_resume(&scr->pt); + } -static int __init nohalt_setup(char * str) -{ - pal_halt = can_do_pal_halt = 0; - return 1; -} -__setup("nohalt", nohalt_setup); + /* copy user rbs to kernel rbs */ + if (unlikely(test_thread_flag(TIF_RESTORE_RSE))) { + local_irq_enable(); /* force interrupt enable */ + ia64_sync_krbs(); + } -void -update_pal_halt_status(int status) -{ - can_do_pal_halt = pal_halt && status; + local_irq_disable(); /* force interrupt disable */ } -/* - * We use this if we don't have any better idle routine.. - */ -void -default_idle (void) +static int __init nohalt_setup(char * str) { - local_irq_enable(); - while (!need_resched()) - if (can_do_pal_halt) - safe_halt(); - else - cpu_relax(); + cpu_idle_poll_ctrl(true); + return 1; } +__setup("nohalt", nohalt_setup); #ifdef CONFIG_HOTPLUG_CPU /* We don't actually take CPU down, just spin without interrupts. */ static inline void play_dead(void) { - extern void ia64_cpu_local_tick (void); unsigned int this_cpu = smp_processor_id(); /* Ack it */ @@ -231,72 +234,29 @@ static inline void play_dead(void) } #endif /* CONFIG_HOTPLUG_CPU */ -void cpu_idle_wait(void) +void arch_cpu_idle_dead(void) { - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - } while (!cpus_empty(map)); + play_dead(); } -EXPORT_SYMBOL_GPL(cpu_idle_wait); -void __attribute__((noreturn)) -cpu_idle (void) +void arch_cpu_idle(void) { void (*mark_idle)(int) = ia64_mark_idle; - /* endless idle loop with no priority at all */ - while (1) { #ifdef CONFIG_SMP - if (!need_resched()) - min_xtp(); + min_xtp(); #endif - while (!need_resched()) { - void (*idle)(void); - - if (__get_cpu_var(cpu_idle_state)) - __get_cpu_var(cpu_idle_state) = 0; - - rmb(); - if (mark_idle) - (*mark_idle)(1); + rmb(); + if (mark_idle) + (*mark_idle)(1); - idle = pm_idle; - if (!idle) - idle = default_idle; - (*idle)(); - } - - if (mark_idle) - (*mark_idle)(0); + safe_halt(); + if (mark_idle) + (*mark_idle)(0); #ifdef CONFIG_SMP - normal_xtp(); + normal_xtp(); #endif - schedule(); - check_pgt_cache(); - if (cpu_is_offline(smp_processor_id())) - play_dead(); - } } void @@ -317,11 +277,6 @@ ia64_save_extra (struct task_struct *task) if (info & PFM_CPUINFO_SYST_WIDE) pfm_syst_wide_update_task(task, info, 0); #endif - -#ifdef CONFIG_IA32_SUPPORT - if (IS_IA32_PROCESS(ia64_task_regs(task))) - ia32_save_state(task); -#endif } void @@ -342,11 +297,6 @@ ia64_load_extra (struct task_struct *task) if (info & PFM_CPUINFO_SYST_WIDE) pfm_syst_wide_update_task(task, info, 1); #endif - -#ifdef CONFIG_IA32_SUPPORT - if (IS_IA32_PROCESS(ia64_task_regs(task))) - ia32_load_state(task); -#endif } /* @@ -381,77 +331,26 @@ ia64_load_extra (struct task_struct *task) * so there is nothing to worry about. */ int -copy_thread (int nr, unsigned long clone_flags, +copy_thread(unsigned long clone_flags, unsigned long user_stack_base, unsigned long user_stack_size, - struct task_struct *p, struct pt_regs *regs) + struct task_struct *p) { - extern char ia64_ret_from_clone, ia32_ret_from_clone; + extern char ia64_ret_from_clone; struct switch_stack *child_stack, *stack; unsigned long rbs, child_rbs, rbs_size; struct pt_regs *child_ptregs; + struct pt_regs *regs = current_pt_regs(); int retval = 0; -#ifdef CONFIG_SMP - /* - * For SMP idle threads, fork_by_hand() calls do_fork with - * NULL regs. - */ - if (!regs) - return 0; -#endif - - stack = ((struct switch_stack *) regs) - 1; - child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1; child_stack = (struct switch_stack *) child_ptregs - 1; - /* copy parent's switch_stack & pt_regs to child: */ - memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack)); - rbs = (unsigned long) current + IA64_RBS_OFFSET; child_rbs = (unsigned long) p + IA64_RBS_OFFSET; - rbs_size = stack->ar_bspstore - rbs; - - /* copy the parent's register backing store to the child: */ - memcpy((void *) child_rbs, (void *) rbs, rbs_size); - - if (likely(user_mode(child_ptregs))) { - if ((clone_flags & CLONE_SETTLS) && !IS_IA32_PROCESS(regs)) - child_ptregs->r13 = regs->r16; /* see sys_clone2() in entry.S */ - if (user_stack_base) { - child_ptregs->r12 = user_stack_base + user_stack_size - 16; - child_ptregs->ar_bspstore = user_stack_base; - child_ptregs->ar_rnat = 0; - child_ptregs->loadrs = 0; - } - } else { - /* - * Note: we simply preserve the relative position of - * the stack pointer here. There is no need to - * allocate a scratch area here, since that will have - * been taken care of by the caller of sys_clone() - * already. - */ - child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */ - child_ptregs->r13 = (unsigned long) p; /* set `current' pointer */ - } - child_stack->ar_bspstore = child_rbs + rbs_size; - if (IS_IA32_PROCESS(regs)) - child_stack->b0 = (unsigned long) &ia32_ret_from_clone; - else - child_stack->b0 = (unsigned long) &ia64_ret_from_clone; /* copy parts of thread_struct: */ p->thread.ksp = (unsigned long) child_stack - 16; - /* stop some PSR bits from being inherited. - * the psr.up/psr.pp bits must be cleared on fork but inherited on execve() - * therefore we must specify them explicitly here and not include them in - * IA64_PSR_BITS_TO_CLEAR. - */ - child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) - & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); - /* * NOTE: The calling convention considers all floating point * registers in the high partition (fph) to be scratch. Since @@ -473,22 +372,65 @@ copy_thread (int nr, unsigned long clone_flags, # define THREAD_FLAGS_TO_SET 0 p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR) | THREAD_FLAGS_TO_SET); + ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ -#ifdef CONFIG_IA32_SUPPORT - /* - * If we're cloning an IA32 task then save the IA32 extra - * state from the current task to the new task - */ - if (IS_IA32_PROCESS(ia64_task_regs(current))) { - ia32_save_state(p); - if (clone_flags & CLONE_SETTLS) - retval = ia32_clone_tls(p, child_ptregs); - - /* Copy partially mapped page list */ - if (!retval) - retval = ia32_copy_partial_page_list(p, clone_flags); + + if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(!user_stack_base)) { + /* fork_idle() called us */ + return 0; + } + memset(child_stack, 0, sizeof(*child_ptregs) + sizeof(*child_stack)); + child_stack->r4 = user_stack_base; /* payload */ + child_stack->r5 = user_stack_size; /* argument */ + /* + * Preserve PSR bits, except for bits 32-34 and 37-45, + * which we can't read. + */ + child_ptregs->cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; + /* mark as valid, empty frame */ + child_ptregs->cr_ifs = 1UL << 63; + child_stack->ar_fpsr = child_ptregs->ar_fpsr + = ia64_getreg(_IA64_REG_AR_FPSR); + child_stack->pr = (1 << PRED_KERNEL_STACK); + child_stack->ar_bspstore = child_rbs; + child_stack->b0 = (unsigned long) &ia64_ret_from_clone; + + /* stop some PSR bits from being inherited. + * the psr.up/psr.pp bits must be cleared on fork but inherited on execve() + * therefore we must specify them explicitly here and not include them in + * IA64_PSR_BITS_TO_CLEAR. + */ + child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) + & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); + + return 0; } -#endif + stack = ((struct switch_stack *) regs) - 1; + /* copy parent's switch_stack & pt_regs to child: */ + memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack)); + + /* copy the parent's register backing store to the child: */ + rbs_size = stack->ar_bspstore - rbs; + memcpy((void *) child_rbs, (void *) rbs, rbs_size); + if (clone_flags & CLONE_SETTLS) + child_ptregs->r13 = regs->r16; /* see sys_clone2() in entry.S */ + if (user_stack_base) { + child_ptregs->r12 = user_stack_base + user_stack_size - 16; + child_ptregs->ar_bspstore = user_stack_base; + child_ptregs->ar_rnat = 0; + child_ptregs->loadrs = 0; + } + child_stack->ar_bspstore = child_rbs + rbs_size; + child_stack->b0 = (unsigned long) &ia64_ret_from_clone; + + /* stop some PSR bits from being inherited. + * the psr.up/psr.pp bits must be cleared on fork but inherited on execve() + * therefore we must specify them explicitly here and not include them in + * IA64_PSR_BITS_TO_CLEAR. + */ + child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) + & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); #ifdef CONFIG_PERFMON if (current->thread.pfm_context) @@ -500,7 +442,8 @@ copy_thread (int nr, unsigned long clone_flags, static void do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg) { - unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm; + unsigned long mask, sp, nat_bits = 0, ar_rnat, urbs_end, cfm; + unsigned long uninitialized_var(ip); /* GCC be quiet */ elf_greg_t *dst = arg; struct pt_regs *pt; char nat; @@ -602,21 +545,6 @@ do_dump_fpu (struct unw_frame_info *info, void *arg) do_dump_task_fpu(current, info, arg); } -int -dump_task_regs(struct task_struct *task, elf_gregset_t *regs) -{ - struct unw_frame_info tcore_info; - - if (current == task) { - unw_init_running(do_copy_regs, regs); - } else { - memset(&tcore_info, 0, sizeof(tcore_info)); - unw_init_from_blocked_task(&tcore_info, task); - do_copy_task_regs(task, &tcore_info, regs); - } - return 1; -} - void ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst) { @@ -624,103 +552,21 @@ ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst) } int -dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst) -{ - struct unw_frame_info tcore_info; - - if (current == task) { - unw_init_running(do_dump_fpu, dst); - } else { - memset(&tcore_info, 0, sizeof(tcore_info)); - unw_init_from_blocked_task(&tcore_info, task); - do_dump_task_fpu(task, &tcore_info, dst); - } - return 1; -} - -int dump_fpu (struct pt_regs *pt, elf_fpregset_t dst) { unw_init_running(do_dump_fpu, dst); return 1; /* f0-f31 are always valid so we always return 1 */ } -long -sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp, - struct pt_regs *regs) -{ - char *fname; - int error; - - fname = getname(filename); - error = PTR_ERR(fname); - if (IS_ERR(fname)) - goto out; - error = do_execve(fname, argv, envp, regs); - putname(fname); -out: - return error; -} - -pid_t -kernel_thread (int (*fn)(void *), void *arg, unsigned long flags) -{ - extern void start_kernel_thread (void); - unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; - struct { - struct switch_stack sw; - struct pt_regs pt; - } regs; - - memset(®s, 0, sizeof(regs)); - regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ - regs.pt.r1 = helper_fptr[1]; /* set GP */ - regs.pt.r9 = (unsigned long) fn; /* 1st argument */ - regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ - /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ - regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; - regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ - regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); - regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; - regs.sw.pr = (1 << PRED_KERNEL_STACK); - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s.pt, 0, NULL, NULL); -} -EXPORT_SYMBOL(kernel_thread); - -/* This gets called from kernel_thread() via ia64_invoke_thread_helper(). */ -int -kernel_thread_helper (int (*fn)(void *), void *arg) -{ -#ifdef CONFIG_IA32_SUPPORT - if (IS_IA32_PROCESS(ia64_task_regs(current))) { - /* A kernel thread is always a 64-bit process. */ - current->thread.map_base = DEFAULT_MAP_BASE; - current->thread.task_size = DEFAULT_TASK_SIZE; - ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob); - ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1); - } -#endif - return (*fn)(arg); -} - /* * Flush thread state. This is called when a thread does an execve(). */ void flush_thread (void) { - /* - * Remove function-return probe instances associated with this task - * and put them back on the free list. Do not insert an exit probe for - * this function, it will be disabled by kprobe_flush_task if you do. - */ - kprobe_flush_task(current); - /* drop floating-point and debug-register state if it exists: */ current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID); ia64_drop_fpu(current); - if (IS_IA32_PROCESS(ia64_task_regs(current))) - ia32_drop_partial_page_list(current); } /* @@ -731,13 +577,6 @@ void exit_thread (void) { - /* - * Remove function-return probe instances associated with this task - * and put them back on the free list. Do not insert an exit probe for - * this function, it will be disabled by kprobe_flush_task if you do. - */ - kprobe_flush_task(current); - ia64_drop_fpu(current); #ifdef CONFIG_PERFMON /* if needed, stop monitoring and flush state to perfmon context */ @@ -748,8 +587,6 @@ exit_thread (void) if (current->thread.flags & IA64_THREAD_DBG_VALID) pfm_release_debug_registers(current); #endif - if (IS_IA32_PROCESS(ia64_task_regs(current))) - ia32_drop_partial_page_list(current); } unsigned long @@ -759,6 +596,9 @@ get_wchan (struct task_struct *p) unsigned long ip; int count = 0; + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + /* * Note: p may not be a blocked task (it could be current or * another process running on some other CPU. Rather than @@ -769,6 +609,8 @@ get_wchan (struct task_struct *p) */ unw_init_from_blocked_task(&info, p); do { + if (p->state == TASK_RUNNING) + return 0; if (unw_unwind(&info) < 0) return 0; unw_get_ip(&info, &ip); @@ -801,15 +643,32 @@ cpu_halt (void) ia64_pal_halt(min_power_state); } +void machine_shutdown(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + int cpu; + + for_each_online_cpu(cpu) { + if (cpu != smp_processor_id()) + cpu_down(cpu); + } +#endif +#ifdef CONFIG_KEXEC + kexec_disable_iosapic(); +#endif +} + void machine_restart (char *restart_cmd) { + (void) notify_die(DIE_MACHINE_RESTART, restart_cmd, NULL, 0, 0, 0); (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL); } void machine_halt (void) { + (void) notify_die(DIE_MACHINE_HALT, "", NULL, 0, 0, 0); cpu_halt(); } diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 4b19d041063..b7a5fffe092 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -3,27 +3,29 @@ * * Copyright (C) 1999-2005 Hewlett-Packard Co * David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 2006 Intel Co + * 2006-08-12 - IA64 Native Utrace implementation support added by + * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> * * Derived from the x86 and Alpha versions. */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/slab.h> #include <linux/mm.h> #include <linux/errno.h> #include <linux/ptrace.h> -#include <linux/smp_lock.h> #include <linux/user.h> #include <linux/security.h> #include <linux/audit.h> #include <linux/signal.h> +#include <linux/regset.h> +#include <linux/elf.h> +#include <linux/tracehook.h> #include <asm/pgtable.h> #include <asm/processor.h> #include <asm/ptrace_offsets.h> #include <asm/rse.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/unwind.h> #ifdef CONFIG_PERFMON @@ -254,7 +256,7 @@ get_rnat (struct task_struct *task, struct switch_stack *sw, long num_regs, nbits; struct pt_regs *pt; - pt = ia64_task_regs(task); + pt = task_pt_regs(task); kbsp = (unsigned long *) sw->ar_bspstore; ubspstore = (unsigned long *) pt->ar_bspstore; @@ -314,7 +316,7 @@ put_rnat (struct task_struct *task, struct switch_stack *sw, struct pt_regs *pt; unsigned long cfm, *urbs_kargs; - pt = ia64_task_regs(task); + pt = task_pt_regs(task); kbsp = (unsigned long *) sw->ar_bspstore; ubspstore = (unsigned long *) pt->ar_bspstore; @@ -407,7 +409,7 @@ ia64_peek (struct task_struct *child, struct switch_stack *child_stack, urbs_end = (long *) user_rbs_end; laddr = (unsigned long *) addr; - child_regs = ia64_task_regs(child); + child_regs = task_pt_regs(child); bspstore = (unsigned long *) child_regs->ar_bspstore; krbs = (unsigned long *) child + IA64_RBS_OFFSET/8; if (on_kernel_rbs(addr, (unsigned long) bspstore, @@ -467,7 +469,7 @@ ia64_poke (struct task_struct *child, struct switch_stack *child_stack, struct pt_regs *child_regs; laddr = (unsigned long *) addr; - child_regs = ia64_task_regs(child); + child_regs = task_pt_regs(child); bspstore = (unsigned long *) child_regs->ar_bspstore; krbs = (unsigned long *) child + IA64_RBS_OFFSET/8; if (on_kernel_rbs(addr, (unsigned long) bspstore, @@ -548,77 +550,126 @@ ia64_sync_user_rbs (struct task_struct *child, struct switch_stack *sw, return 0; } -static inline int -thread_matches (struct task_struct *thread, unsigned long addr) +static long +ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw, + unsigned long user_rbs_start, unsigned long user_rbs_end) { - unsigned long thread_rbs_end; - struct pt_regs *thread_regs; + unsigned long addr, val; + long ret; - if (ptrace_check_attach(thread, 0) < 0) - /* - * If the thread is not in an attachable state, we'll - * ignore it. The net effect is that if ADDR happens - * to overlap with the portion of the thread's - * register backing store that is currently residing - * on the thread's kernel stack, then ptrace() may end - * up accessing a stale value. But if the thread - * isn't stopped, that's a problem anyhow, so we're - * doing as well as we can... - */ - return 0; + /* now copy word for word from user rbs to kernel rbs: */ + for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) { + if (access_process_vm(child, addr, &val, sizeof(val), 0) + != sizeof(val)) + return -EIO; - thread_regs = ia64_task_regs(thread); - thread_rbs_end = ia64_get_user_rbs_end(thread, thread_regs, NULL); - if (!on_kernel_rbs(addr, thread_regs->ar_bspstore, thread_rbs_end)) - return 0; + ret = ia64_poke(child, sw, user_rbs_end, addr, val); + if (ret < 0) + return ret; + } + return 0; +} + +typedef long (*syncfunc_t)(struct task_struct *, struct switch_stack *, + unsigned long, unsigned long); + +static void do_sync_rbs(struct unw_frame_info *info, void *arg) +{ + struct pt_regs *pt; + unsigned long urbs_end; + syncfunc_t fn = arg; - return 1; /* looks like we've got a winner */ + if (unw_unwind_to_user(info) < 0) + return; + pt = task_pt_regs(info->task); + urbs_end = ia64_get_user_rbs_end(info->task, pt, NULL); + + fn(info->task, info->sw, pt->ar_bspstore, urbs_end); } /* - * GDB apparently wants to be able to read the register-backing store - * of any thread when attached to a given process. If we are peeking - * or poking an address that happens to reside in the kernel-backing - * store of another thread, we need to attach to that thread, because - * otherwise we end up accessing stale data. - * - * task_list_lock must be read-locked before calling this routine! + * when a thread is stopped (ptraced), debugger might change thread's user + * stack (change memory directly), and we must avoid the RSE stored in kernel + * to override user stack (user space's RSE is newer than kernel's in the + * case). To workaround the issue, we copy kernel RSE to user RSE before the + * task is stopped, so user RSE has updated data. we then copy user RSE to + * kernel after the task is resummed from traced stop and kernel will use the + * newer RSE to return to user. TIF_RESTORE_RSE is the flag to indicate we need + * synchronize user RSE to kernel. */ -static struct task_struct * -find_thread_for_addr (struct task_struct *child, unsigned long addr) +void ia64_ptrace_stop(void) { - struct task_struct *p; - struct mm_struct *mm; - struct list_head *this, *next; - int mm_users; + if (test_and_set_tsk_thread_flag(current, TIF_RESTORE_RSE)) + return; + set_notify_resume(current); + unw_init_running(do_sync_rbs, ia64_sync_user_rbs); +} - if (!(mm = get_task_mm(child))) - return child; +/* + * This is called to read back the register backing store. + */ +void ia64_sync_krbs(void) +{ + clear_tsk_thread_flag(current, TIF_RESTORE_RSE); - /* -1 because of our get_task_mm(): */ - mm_users = atomic_read(&mm->mm_users) - 1; - if (mm_users <= 1) - goto out; /* not multi-threaded */ + unw_init_running(do_sync_rbs, ia64_sync_kernel_rbs); +} + +/* + * After PTRACE_ATTACH, a thread's register backing store area in user + * space is assumed to contain correct data whenever the thread is + * stopped. arch_ptrace_stop takes care of this on tracing stops. + * But if the child was already stopped for job control when we attach + * to it, then it might not ever get into ptrace_stop by the time we + * want to examine the user memory containing the RBS. + */ +void +ptrace_attach_sync_user_rbs (struct task_struct *child) +{ + int stopped = 0; + struct unw_frame_info info; /* - * Traverse the current process' children list. Every task that - * one attaches to becomes a child. And it is only attached children - * of the debugger that are of interest (ptrace_check_attach checks - * for this). + * If the child is in TASK_STOPPED, we need to change that to + * TASK_TRACED momentarily while we operate on it. This ensures + * that the child won't be woken up and return to user mode while + * we are doing the sync. (It can only be woken up for SIGKILL.) */ - list_for_each_safe(this, next, ¤t->children) { - p = list_entry(this, struct task_struct, sibling); - if (p->mm != mm) - continue; - if (thread_matches(p, addr)) { - child = p; - goto out; + + read_lock(&tasklist_lock); + if (child->sighand) { + spin_lock_irq(&child->sighand->siglock); + if (child->state == TASK_STOPPED && + !test_and_set_tsk_thread_flag(child, TIF_RESTORE_RSE)) { + set_notify_resume(child); + + child->state = TASK_TRACED; + stopped = 1; } + spin_unlock_irq(&child->sighand->siglock); } + read_unlock(&tasklist_lock); - out: - mmput(mm); - return child; + if (!stopped) + return; + + unw_init_from_blocked_task(&info, child); + do_sync_rbs(&info, ia64_sync_user_rbs); + + /* + * Now move the child back into TASK_STOPPED if it should be in a + * job control stop, so that SIGCONT can be used to wake it up. + */ + read_lock(&tasklist_lock); + if (child->sighand) { + spin_lock_irq(&child->sighand->siglock); + if (child->state == TASK_TRACED && + (child->signal->flags & SIGNAL_STOP_STOPPED)) { + child->state = TASK_STOPPED; + } + spin_unlock_irq(&child->sighand->siglock); + } + read_unlock(&tasklist_lock); } /* @@ -627,7 +678,7 @@ find_thread_for_addr (struct task_struct *child, unsigned long addr) inline void ia64_flush_fph (struct task_struct *task) { - struct ia64_psr *psr = ia64_psr(ia64_task_regs(task)); + struct ia64_psr *psr = ia64_psr(task_pt_regs(task)); /* * Prevent migrating this task while @@ -653,7 +704,7 @@ ia64_flush_fph (struct task_struct *task) void ia64_sync_fph (struct task_struct *task) { - struct ia64_psr *psr = ia64_psr(ia64_task_regs(task)); + struct ia64_psr *psr = ia64_psr(task_pt_regs(task)); ia64_flush_fph(task); if (!(task->thread.flags & IA64_THREAD_FPH_VALID)) { @@ -664,25 +715,6 @@ ia64_sync_fph (struct task_struct *task) psr->dfh = 1; } -static int -access_fr (struct unw_frame_info *info, int regnum, int hi, - unsigned long *data, int write_access) -{ - struct ia64_fpreg fpval; - int ret; - - ret = unw_get_fr(info, regnum, &fpval); - if (ret < 0) - return ret; - - if (write_access) { - fpval.u.bits[hi] = *data; - ret = unw_set_fr(info, regnum, fpval); - } else - *data = fpval.u.bits[hi]; - return ret; -} - /* * Change the machine-state of CHILD such that it will return via the normal * kernel exit-path, rather than the syscall-exit path. @@ -704,14 +736,14 @@ convert_to_non_syscall (struct task_struct *child, struct pt_regs *pt, if ((long)((unsigned long)child + IA64_STK_OFFSET - sp) < IA64_PT_REGS_SIZE) { dprintk("ptrace.%s: ran off the top of the kernel " - "stack\n", __FUNCTION__); + "stack\n", __func__); return; } if (unw_get_pr (&prev_info, &pr) < 0) { unw_get_rp(&prev_info, &ip); dprintk("ptrace.%s: failed to read " "predicate register (ip=0x%lx)\n", - __FUNCTION__, ip); + __func__, ip); return; } if (unw_is_intr_frame(&info) @@ -784,326 +816,7 @@ access_nat_bits (struct task_struct *child, struct pt_regs *pt, static int access_uarea (struct task_struct *child, unsigned long addr, - unsigned long *data, int write_access) -{ - unsigned long *ptr, regnum, urbs_end, rnat_addr, cfm; - struct switch_stack *sw; - struct pt_regs *pt; -# define pt_reg_addr(pt, reg) ((void *) \ - ((unsigned long) (pt) \ - + offsetof(struct pt_regs, reg))) - - - pt = ia64_task_regs(child); - sw = (struct switch_stack *) (child->thread.ksp + 16); - - if ((addr & 0x7) != 0) { - dprintk("ptrace: unaligned register address 0x%lx\n", addr); - return -1; - } - - if (addr < PT_F127 + 16) { - /* accessing fph */ - if (write_access) - ia64_sync_fph(child); - else - ia64_flush_fph(child); - ptr = (unsigned long *) - ((unsigned long) &child->thread.fph + addr); - } else if ((addr >= PT_F10) && (addr < PT_F11 + 16)) { - /* scratch registers untouched by kernel (saved in pt_regs) */ - ptr = pt_reg_addr(pt, f10) + (addr - PT_F10); - } else if (addr >= PT_F12 && addr < PT_F15 + 16) { - /* - * Scratch registers untouched by kernel (saved in - * switch_stack). - */ - ptr = (unsigned long *) ((long) sw - + (addr - PT_NAT_BITS - 32)); - } else if (addr < PT_AR_LC + 8) { - /* preserved state: */ - struct unw_frame_info info; - char nat = 0; - int ret; - - unw_init_from_blocked_task(&info, child); - if (unw_unwind_to_user(&info) < 0) - return -1; - - switch (addr) { - case PT_NAT_BITS: - return access_nat_bits(child, pt, &info, - data, write_access); - - case PT_R4: case PT_R5: case PT_R6: case PT_R7: - if (write_access) { - /* read NaT bit first: */ - unsigned long dummy; - - ret = unw_get_gr(&info, (addr - PT_R4)/8 + 4, - &dummy, &nat); - if (ret < 0) - return ret; - } - return unw_access_gr(&info, (addr - PT_R4)/8 + 4, data, - &nat, write_access); - - case PT_B1: case PT_B2: case PT_B3: - case PT_B4: case PT_B5: - return unw_access_br(&info, (addr - PT_B1)/8 + 1, data, - write_access); - - case PT_AR_EC: - return unw_access_ar(&info, UNW_AR_EC, data, - write_access); - - case PT_AR_LC: - return unw_access_ar(&info, UNW_AR_LC, data, - write_access); - - default: - if (addr >= PT_F2 && addr < PT_F5 + 16) - return access_fr(&info, (addr - PT_F2)/16 + 2, - (addr & 8) != 0, data, - write_access); - else if (addr >= PT_F16 && addr < PT_F31 + 16) - return access_fr(&info, - (addr - PT_F16)/16 + 16, - (addr & 8) != 0, - data, write_access); - else { - dprintk("ptrace: rejecting access to register " - "address 0x%lx\n", addr); - return -1; - } - } - } else if (addr < PT_F9+16) { - /* scratch state */ - switch (addr) { - case PT_AR_BSP: - /* - * By convention, we use PT_AR_BSP to refer to - * the end of the user-level backing store. - * Use ia64_rse_skip_regs(PT_AR_BSP, -CFM.sof) - * to get the real value of ar.bsp at the time - * the kernel was entered. - * - * Furthermore, when changing the contents of - * PT_AR_BSP (or PT_CFM) we MUST copy any - * users-level stacked registers that are - * stored on the kernel stack back to - * user-space because otherwise, we might end - * up clobbering kernel stacked registers. - * Also, if this happens while the task is - * blocked in a system call, which convert the - * state such that the non-system-call exit - * path is used. This ensures that the proper - * state will be picked up when resuming - * execution. However, it *also* means that - * once we write PT_AR_BSP/PT_CFM, it won't be - * possible to modify the syscall arguments of - * the pending system call any longer. This - * shouldn't be an issue because modifying - * PT_AR_BSP/PT_CFM generally implies that - * we're either abandoning the pending system - * call or that we defer it's re-execution - * (e.g., due to GDB doing an inferior - * function call). - */ - urbs_end = ia64_get_user_rbs_end(child, pt, &cfm); - if (write_access) { - if (*data != urbs_end) { - if (ia64_sync_user_rbs(child, sw, - pt->ar_bspstore, - urbs_end) < 0) - return -1; - if (in_syscall(pt)) - convert_to_non_syscall(child, - pt, - cfm); - /* - * Simulate user-level write - * of ar.bsp: - */ - pt->loadrs = 0; - pt->ar_bspstore = *data; - } - } else - *data = urbs_end; - return 0; - - case PT_CFM: - urbs_end = ia64_get_user_rbs_end(child, pt, &cfm); - if (write_access) { - if (((cfm ^ *data) & PFM_MASK) != 0) { - if (ia64_sync_user_rbs(child, sw, - pt->ar_bspstore, - urbs_end) < 0) - return -1; - if (in_syscall(pt)) - convert_to_non_syscall(child, - pt, - cfm); - pt->cr_ifs = ((pt->cr_ifs & ~PFM_MASK) - | (*data & PFM_MASK)); - } - } else - *data = cfm; - return 0; - - case PT_CR_IPSR: - if (write_access) - pt->cr_ipsr = ((*data & IPSR_MASK) - | (pt->cr_ipsr & ~IPSR_MASK)); - else - *data = (pt->cr_ipsr & IPSR_MASK); - return 0; - - case PT_AR_RSC: - if (write_access) - pt->ar_rsc = *data | (3 << 2); /* force PL3 */ - else - *data = pt->ar_rsc; - return 0; - - case PT_AR_RNAT: - urbs_end = ia64_get_user_rbs_end(child, pt, NULL); - rnat_addr = (long) ia64_rse_rnat_addr((long *) - urbs_end); - if (write_access) - return ia64_poke(child, sw, urbs_end, - rnat_addr, *data); - else - return ia64_peek(child, sw, urbs_end, - rnat_addr, data); - - case PT_R1: - ptr = pt_reg_addr(pt, r1); - break; - case PT_R2: case PT_R3: - ptr = pt_reg_addr(pt, r2) + (addr - PT_R2); - break; - case PT_R8: case PT_R9: case PT_R10: case PT_R11: - ptr = pt_reg_addr(pt, r8) + (addr - PT_R8); - break; - case PT_R12: case PT_R13: - ptr = pt_reg_addr(pt, r12) + (addr - PT_R12); - break; - case PT_R14: - ptr = pt_reg_addr(pt, r14); - break; - case PT_R15: - ptr = pt_reg_addr(pt, r15); - break; - case PT_R16: case PT_R17: case PT_R18: case PT_R19: - case PT_R20: case PT_R21: case PT_R22: case PT_R23: - case PT_R24: case PT_R25: case PT_R26: case PT_R27: - case PT_R28: case PT_R29: case PT_R30: case PT_R31: - ptr = pt_reg_addr(pt, r16) + (addr - PT_R16); - break; - case PT_B0: - ptr = pt_reg_addr(pt, b0); - break; - case PT_B6: - ptr = pt_reg_addr(pt, b6); - break; - case PT_B7: - ptr = pt_reg_addr(pt, b7); - break; - case PT_F6: case PT_F6+8: case PT_F7: case PT_F7+8: - case PT_F8: case PT_F8+8: case PT_F9: case PT_F9+8: - ptr = pt_reg_addr(pt, f6) + (addr - PT_F6); - break; - case PT_AR_BSPSTORE: - ptr = pt_reg_addr(pt, ar_bspstore); - break; - case PT_AR_UNAT: - ptr = pt_reg_addr(pt, ar_unat); - break; - case PT_AR_PFS: - ptr = pt_reg_addr(pt, ar_pfs); - break; - case PT_AR_CCV: - ptr = pt_reg_addr(pt, ar_ccv); - break; - case PT_AR_FPSR: - ptr = pt_reg_addr(pt, ar_fpsr); - break; - case PT_CR_IIP: - ptr = pt_reg_addr(pt, cr_iip); - break; - case PT_PR: - ptr = pt_reg_addr(pt, pr); - break; - /* scratch register */ - - default: - /* disallow accessing anything else... */ - dprintk("ptrace: rejecting access to register " - "address 0x%lx\n", addr); - return -1; - } - } else if (addr <= PT_AR_SSD) { - ptr = pt_reg_addr(pt, ar_csd) + (addr - PT_AR_CSD); - } else { - /* access debug registers */ - - if (addr >= PT_IBR) { - regnum = (addr - PT_IBR) >> 3; - ptr = &child->thread.ibr[0]; - } else { - regnum = (addr - PT_DBR) >> 3; - ptr = &child->thread.dbr[0]; - } - - if (regnum >= 8) { - dprintk("ptrace: rejecting access to register " - "address 0x%lx\n", addr); - return -1; - } -#ifdef CONFIG_PERFMON - /* - * Check if debug registers are used by perfmon. This - * test must be done once we know that we can do the - * operation, i.e. the arguments are all valid, but - * before we start modifying the state. - * - * Perfmon needs to keep a count of how many processes - * are trying to modify the debug registers for system - * wide monitoring sessions. - * - * We also include read access here, because they may - * cause the PMU-installed debug register state - * (dbr[], ibr[]) to be reset. The two arrays are also - * used by perfmon, but we do not use - * IA64_THREAD_DBG_VALID. The registers are restored - * by the PMU context switch code. - */ - if (pfm_use_debug_registers(child)) return -1; -#endif - - if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) { - child->thread.flags |= IA64_THREAD_DBG_VALID; - memset(child->thread.dbr, 0, - sizeof(child->thread.dbr)); - memset(child->thread.ibr, 0, - sizeof(child->thread.ibr)); - } - - ptr += regnum; - - if ((regnum & 1) && write_access) { - /* don't let the user set kernel-level breakpoints: */ - *ptr = *data & ~(7UL << 56); - return 0; - } - } - if (write_access) - *ptr = *data; - else - *data = *ptr; - return 0; -} + unsigned long *data, int write_access); static long ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) @@ -1120,7 +833,7 @@ ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) if (!access_ok(VERIFY_WRITE, ppr, sizeof(struct pt_all_user_regs))) return -EIO; - pt = ia64_task_regs(child); + pt = task_pt_regs(child); sw = (struct switch_stack *) (child->thread.ksp + 16); unw_init_from_blocked_task(&info, child); if (unw_unwind_to_user(&info) < 0) { @@ -1265,7 +978,7 @@ ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) if (!access_ok(VERIFY_READ, ppr, sizeof(struct pt_all_user_regs))) return -EIO; - pt = ia64_task_regs(child); + pt = task_pt_regs(child); sw = (struct switch_stack *) (child->thread.ksp + 16); unw_init_from_blocked_task(&info, child); if (unw_unwind_to_user(&info) < 0) { @@ -1395,6 +1108,35 @@ ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) return ret; } +void +user_enable_single_step (struct task_struct *child) +{ + struct ia64_psr *child_psr = ia64_psr(task_pt_regs(child)); + + set_tsk_thread_flag(child, TIF_SINGLESTEP); + child_psr->ss = 1; +} + +void +user_enable_block_step (struct task_struct *child) +{ + struct ia64_psr *child_psr = ia64_psr(task_pt_regs(child)); + + set_tsk_thread_flag(child, TIF_SINGLESTEP); + child_psr->tb = 1; +} + +void +user_disable_single_step (struct task_struct *child) +{ + struct ia64_psr *child_psr = ia64_psr(task_pt_regs(child)); + + /* make sure the single step/taken-branch trap bits are not set: */ + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + child_psr->ss = 0; + child_psr->tb = 0; +} + /* * Called by kernel/ptrace.c when detaching.. * @@ -1403,270 +1145,1050 @@ ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) void ptrace_disable (struct task_struct *child) { - struct ia64_psr *child_psr = ia64_psr(ia64_task_regs(child)); + user_disable_single_step(child); +} - /* make sure the single step/taken-branch trap bits are not set: */ - child_psr->ss = 0; - child_psr->tb = 0; +long +arch_ptrace (struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + switch (request) { + case PTRACE_PEEKTEXT: + case PTRACE_PEEKDATA: + /* read word at location addr */ + if (access_process_vm(child, addr, &data, sizeof(data), 0) + != sizeof(data)) + return -EIO; + /* ensure return value is not mistaken for error code */ + force_successful_syscall_return(); + return data; + + /* PTRACE_POKETEXT and PTRACE_POKEDATA is handled + * by the generic ptrace_request(). + */ + + case PTRACE_PEEKUSR: + /* read the word at addr in the USER area */ + if (access_uarea(child, addr, &data, 0) < 0) + return -EIO; + /* ensure return value is not mistaken for error code */ + force_successful_syscall_return(); + return data; + + case PTRACE_POKEUSR: + /* write the word at addr in the USER area */ + if (access_uarea(child, addr, &data, 1) < 0) + return -EIO; + return 0; + + case PTRACE_OLD_GETSIGINFO: + /* for backwards-compatibility */ + return ptrace_request(child, PTRACE_GETSIGINFO, addr, data); + + case PTRACE_OLD_SETSIGINFO: + /* for backwards-compatibility */ + return ptrace_request(child, PTRACE_SETSIGINFO, addr, data); + + case PTRACE_GETREGS: + return ptrace_getregs(child, + (struct pt_all_user_regs __user *) data); + + case PTRACE_SETREGS: + return ptrace_setregs(child, + (struct pt_all_user_regs __user *) data); + + default: + return ptrace_request(child, request, addr, data); + } } + +/* "asmlinkage" so the input arguments are preserved... */ + asmlinkage long -sys_ptrace (long request, pid_t pid, unsigned long addr, unsigned long data) +syscall_trace_enter (long arg0, long arg1, long arg2, long arg3, + long arg4, long arg5, long arg6, long arg7, + struct pt_regs regs) +{ + if (test_thread_flag(TIF_SYSCALL_TRACE)) + if (tracehook_report_syscall_entry(®s)) + return -ENOSYS; + + /* copy user rbs to kernel rbs */ + if (test_thread_flag(TIF_RESTORE_RSE)) + ia64_sync_krbs(); + + + audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3); + + return 0; +} + +/* "asmlinkage" so the input arguments are preserved... */ + +asmlinkage void +syscall_trace_leave (long arg0, long arg1, long arg2, long arg3, + long arg4, long arg5, long arg6, long arg7, + struct pt_regs regs) +{ + int step; + + audit_syscall_exit(®s); + + step = test_thread_flag(TIF_SINGLESTEP); + if (step || test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(®s, step); + + /* copy user rbs to kernel rbs */ + if (test_thread_flag(TIF_RESTORE_RSE)) + ia64_sync_krbs(); +} + +/* Utrace implementation starts here */ +struct regset_get { + void *kbuf; + void __user *ubuf; +}; + +struct regset_set { + const void *kbuf; + const void __user *ubuf; +}; + +struct regset_getset { + struct task_struct *target; + const struct user_regset *regset; + union { + struct regset_get get; + struct regset_set set; + } u; + unsigned int pos; + unsigned int count; + int ret; +}; + +static int +access_elf_gpreg(struct task_struct *target, struct unw_frame_info *info, + unsigned long addr, unsigned long *data, int write_access) { struct pt_regs *pt; - unsigned long urbs_end, peek_or_poke; - struct task_struct *child; - struct switch_stack *sw; - long ret; + unsigned long *ptr = NULL; + int ret; + char nat = 0; - lock_kernel(); - ret = -EPERM; - if (request == PTRACE_TRACEME) { - /* are we already being traced? */ - if (current->ptrace & PT_PTRACED) - goto out; - ret = security_ptrace(current->parent, current); - if (ret) - goto out; - current->ptrace |= PT_PTRACED; - ret = 0; - goto out; + pt = task_pt_regs(target); + switch (addr) { + case ELF_GR_OFFSET(1): + ptr = &pt->r1; + break; + case ELF_GR_OFFSET(2): + case ELF_GR_OFFSET(3): + ptr = (void *)&pt->r2 + (addr - ELF_GR_OFFSET(2)); + break; + case ELF_GR_OFFSET(4) ... ELF_GR_OFFSET(7): + if (write_access) { + /* read NaT bit first: */ + unsigned long dummy; + + ret = unw_get_gr(info, addr/8, &dummy, &nat); + if (ret < 0) + return ret; + } + return unw_access_gr(info, addr/8, data, &nat, write_access); + case ELF_GR_OFFSET(8) ... ELF_GR_OFFSET(11): + ptr = (void *)&pt->r8 + addr - ELF_GR_OFFSET(8); + break; + case ELF_GR_OFFSET(12): + case ELF_GR_OFFSET(13): + ptr = (void *)&pt->r12 + addr - ELF_GR_OFFSET(12); + break; + case ELF_GR_OFFSET(14): + ptr = &pt->r14; + break; + case ELF_GR_OFFSET(15): + ptr = &pt->r15; } + if (write_access) + *ptr = *data; + else + *data = *ptr; + return 0; +} - peek_or_poke = (request == PTRACE_PEEKTEXT - || request == PTRACE_PEEKDATA - || request == PTRACE_POKETEXT - || request == PTRACE_POKEDATA); - ret = -ESRCH; - read_lock(&tasklist_lock); - { - child = find_task_by_pid(pid); - if (child) { - if (peek_or_poke) - child = find_thread_for_addr(child, addr); - get_task_struct(child); +static int +access_elf_breg(struct task_struct *target, struct unw_frame_info *info, + unsigned long addr, unsigned long *data, int write_access) +{ + struct pt_regs *pt; + unsigned long *ptr = NULL; + + pt = task_pt_regs(target); + switch (addr) { + case ELF_BR_OFFSET(0): + ptr = &pt->b0; + break; + case ELF_BR_OFFSET(1) ... ELF_BR_OFFSET(5): + return unw_access_br(info, (addr - ELF_BR_OFFSET(0))/8, + data, write_access); + case ELF_BR_OFFSET(6): + ptr = &pt->b6; + break; + case ELF_BR_OFFSET(7): + ptr = &pt->b7; + } + if (write_access) + *ptr = *data; + else + *data = *ptr; + return 0; +} + +static int +access_elf_areg(struct task_struct *target, struct unw_frame_info *info, + unsigned long addr, unsigned long *data, int write_access) +{ + struct pt_regs *pt; + unsigned long cfm, urbs_end; + unsigned long *ptr = NULL; + + pt = task_pt_regs(target); + if (addr >= ELF_AR_RSC_OFFSET && addr <= ELF_AR_SSD_OFFSET) { + switch (addr) { + case ELF_AR_RSC_OFFSET: + /* force PL3 */ + if (write_access) + pt->ar_rsc = *data | (3 << 2); + else + *data = pt->ar_rsc; + return 0; + case ELF_AR_BSP_OFFSET: + /* + * By convention, we use PT_AR_BSP to refer to + * the end of the user-level backing store. + * Use ia64_rse_skip_regs(PT_AR_BSP, -CFM.sof) + * to get the real value of ar.bsp at the time + * the kernel was entered. + * + * Furthermore, when changing the contents of + * PT_AR_BSP (or PT_CFM) while the task is + * blocked in a system call, convert the state + * so that the non-system-call exit + * path is used. This ensures that the proper + * state will be picked up when resuming + * execution. However, it *also* means that + * once we write PT_AR_BSP/PT_CFM, it won't be + * possible to modify the syscall arguments of + * the pending system call any longer. This + * shouldn't be an issue because modifying + * PT_AR_BSP/PT_CFM generally implies that + * we're either abandoning the pending system + * call or that we defer it's re-execution + * (e.g., due to GDB doing an inferior + * function call). + */ + urbs_end = ia64_get_user_rbs_end(target, pt, &cfm); + if (write_access) { + if (*data != urbs_end) { + if (in_syscall(pt)) + convert_to_non_syscall(target, + pt, + cfm); + /* + * Simulate user-level write + * of ar.bsp: + */ + pt->loadrs = 0; + pt->ar_bspstore = *data; + } + } else + *data = urbs_end; + return 0; + case ELF_AR_BSPSTORE_OFFSET: + ptr = &pt->ar_bspstore; + break; + case ELF_AR_RNAT_OFFSET: + ptr = &pt->ar_rnat; + break; + case ELF_AR_CCV_OFFSET: + ptr = &pt->ar_ccv; + break; + case ELF_AR_UNAT_OFFSET: + ptr = &pt->ar_unat; + break; + case ELF_AR_FPSR_OFFSET: + ptr = &pt->ar_fpsr; + break; + case ELF_AR_PFS_OFFSET: + ptr = &pt->ar_pfs; + break; + case ELF_AR_LC_OFFSET: + return unw_access_ar(info, UNW_AR_LC, data, + write_access); + case ELF_AR_EC_OFFSET: + return unw_access_ar(info, UNW_AR_EC, data, + write_access); + case ELF_AR_CSD_OFFSET: + ptr = &pt->ar_csd; + break; + case ELF_AR_SSD_OFFSET: + ptr = &pt->ar_ssd; } + } else if (addr >= ELF_CR_IIP_OFFSET && addr <= ELF_CR_IPSR_OFFSET) { + switch (addr) { + case ELF_CR_IIP_OFFSET: + ptr = &pt->cr_iip; + break; + case ELF_CFM_OFFSET: + urbs_end = ia64_get_user_rbs_end(target, pt, &cfm); + if (write_access) { + if (((cfm ^ *data) & PFM_MASK) != 0) { + if (in_syscall(pt)) + convert_to_non_syscall(target, + pt, + cfm); + pt->cr_ifs = ((pt->cr_ifs & ~PFM_MASK) + | (*data & PFM_MASK)); + } + } else + *data = cfm; + return 0; + case ELF_CR_IPSR_OFFSET: + if (write_access) { + unsigned long tmp = *data; + /* psr.ri==3 is a reserved value: SDM 2:25 */ + if ((tmp & IA64_PSR_RI) == IA64_PSR_RI) + tmp &= ~IA64_PSR_RI; + pt->cr_ipsr = ((tmp & IPSR_MASK) + | (pt->cr_ipsr & ~IPSR_MASK)); + } else + *data = (pt->cr_ipsr & IPSR_MASK); + return 0; + } + } else if (addr == ELF_NAT_OFFSET) + return access_nat_bits(target, pt, info, + data, write_access); + else if (addr == ELF_PR_OFFSET) + ptr = &pt->pr; + else + return -1; + + if (write_access) + *ptr = *data; + else + *data = *ptr; + + return 0; +} + +static int +access_elf_reg(struct task_struct *target, struct unw_frame_info *info, + unsigned long addr, unsigned long *data, int write_access) +{ + if (addr >= ELF_GR_OFFSET(1) && addr <= ELF_GR_OFFSET(15)) + return access_elf_gpreg(target, info, addr, data, write_access); + else if (addr >= ELF_BR_OFFSET(0) && addr <= ELF_BR_OFFSET(7)) + return access_elf_breg(target, info, addr, data, write_access); + else + return access_elf_areg(target, info, addr, data, write_access); +} + +void do_gpregs_get(struct unw_frame_info *info, void *arg) +{ + struct pt_regs *pt; + struct regset_getset *dst = arg; + elf_greg_t tmp[16]; + unsigned int i, index, min_copy; + + if (unw_unwind_to_user(info) < 0) + return; + + /* + * coredump format: + * r0-r31 + * NaT bits (for r0-r31; bit N == 1 iff rN is a NaT) + * predicate registers (p0-p63) + * b0-b7 + * ip cfm user-mask + * ar.rsc ar.bsp ar.bspstore ar.rnat + * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec + */ + + + /* Skip r0 */ + if (dst->count > 0 && dst->pos < ELF_GR_OFFSET(1)) { + dst->ret = user_regset_copyout_zero(&dst->pos, &dst->count, + &dst->u.get.kbuf, + &dst->u.get.ubuf, + 0, ELF_GR_OFFSET(1)); + if (dst->ret || dst->count == 0) + return; } - read_unlock(&tasklist_lock); - if (!child) - goto out; - ret = -EPERM; - if (pid == 1) /* no messing around with init! */ - goto out_tsk; - - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); - goto out_tsk; + + /* gr1 - gr15 */ + if (dst->count > 0 && dst->pos < ELF_GR_OFFSET(16)) { + index = (dst->pos - ELF_GR_OFFSET(1)) / sizeof(elf_greg_t); + min_copy = ELF_GR_OFFSET(16) > (dst->pos + dst->count) ? + (dst->pos + dst->count) : ELF_GR_OFFSET(16); + for (i = dst->pos; i < min_copy; i += sizeof(elf_greg_t), + index++) + if (access_elf_reg(dst->target, info, i, + &tmp[index], 0) < 0) { + dst->ret = -EIO; + return; + } + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, tmp, + ELF_GR_OFFSET(1), ELF_GR_OFFSET(16)); + if (dst->ret || dst->count == 0) + return; } - ret = ptrace_check_attach(child, request == PTRACE_KILL); - if (ret < 0) - goto out_tsk; + /* r16-r31 */ + if (dst->count > 0 && dst->pos < ELF_NAT_OFFSET) { + pt = task_pt_regs(dst->target); + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, &pt->r16, + ELF_GR_OFFSET(16), ELF_NAT_OFFSET); + if (dst->ret || dst->count == 0) + return; + } - pt = ia64_task_regs(child); - sw = (struct switch_stack *) (child->thread.ksp + 16); + /* nat, pr, b0 - b7 */ + if (dst->count > 0 && dst->pos < ELF_CR_IIP_OFFSET) { + index = (dst->pos - ELF_NAT_OFFSET) / sizeof(elf_greg_t); + min_copy = ELF_CR_IIP_OFFSET > (dst->pos + dst->count) ? + (dst->pos + dst->count) : ELF_CR_IIP_OFFSET; + for (i = dst->pos; i < min_copy; i += sizeof(elf_greg_t), + index++) + if (access_elf_reg(dst->target, info, i, + &tmp[index], 0) < 0) { + dst->ret = -EIO; + return; + } + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, tmp, + ELF_NAT_OFFSET, ELF_CR_IIP_OFFSET); + if (dst->ret || dst->count == 0) + return; + } - switch (request) { - case PTRACE_PEEKTEXT: - case PTRACE_PEEKDATA: - /* read word at location addr */ - urbs_end = ia64_get_user_rbs_end(child, pt, NULL); - ret = ia64_peek(child, sw, urbs_end, addr, &data); - if (ret == 0) { - ret = data; - /* ensure "ret" is not mistaken as an error code: */ - force_successful_syscall_return(); - } - goto out_tsk; + /* ip cfm psr ar.rsc ar.bsp ar.bspstore ar.rnat + * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec ar.csd ar.ssd + */ + if (dst->count > 0 && dst->pos < (ELF_AR_END_OFFSET)) { + index = (dst->pos - ELF_CR_IIP_OFFSET) / sizeof(elf_greg_t); + min_copy = ELF_AR_END_OFFSET > (dst->pos + dst->count) ? + (dst->pos + dst->count) : ELF_AR_END_OFFSET; + for (i = dst->pos; i < min_copy; i += sizeof(elf_greg_t), + index++) + if (access_elf_reg(dst->target, info, i, + &tmp[index], 0) < 0) { + dst->ret = -EIO; + return; + } + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, tmp, + ELF_CR_IIP_OFFSET, ELF_AR_END_OFFSET); + } +} - case PTRACE_POKETEXT: - case PTRACE_POKEDATA: - /* write the word at location addr */ - urbs_end = ia64_get_user_rbs_end(child, pt, NULL); - ret = ia64_poke(child, sw, urbs_end, addr, data); - goto out_tsk; +void do_gpregs_set(struct unw_frame_info *info, void *arg) +{ + struct pt_regs *pt; + struct regset_getset *dst = arg; + elf_greg_t tmp[16]; + unsigned int i, index; - case PTRACE_PEEKUSR: - /* read the word at addr in the USER area */ - if (access_uarea(child, addr, &data, 0) < 0) { - ret = -EIO; - goto out_tsk; - } - ret = data; - /* ensure "ret" is not mistaken as an error code */ - force_successful_syscall_return(); - goto out_tsk; + if (unw_unwind_to_user(info) < 0) + return; - case PTRACE_POKEUSR: - /* write the word at addr in the USER area */ - if (access_uarea(child, addr, &data, 1) < 0) { - ret = -EIO; - goto out_tsk; - } - ret = 0; - goto out_tsk; + /* Skip r0 */ + if (dst->count > 0 && dst->pos < ELF_GR_OFFSET(1)) { + dst->ret = user_regset_copyin_ignore(&dst->pos, &dst->count, + &dst->u.set.kbuf, + &dst->u.set.ubuf, + 0, ELF_GR_OFFSET(1)); + if (dst->ret || dst->count == 0) + return; + } - case PTRACE_OLD_GETSIGINFO: - /* for backwards-compatibility */ - ret = ptrace_request(child, PTRACE_GETSIGINFO, addr, data); - goto out_tsk; + /* gr1-gr15 */ + if (dst->count > 0 && dst->pos < ELF_GR_OFFSET(16)) { + i = dst->pos; + index = (dst->pos - ELF_GR_OFFSET(1)) / sizeof(elf_greg_t); + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, tmp, + ELF_GR_OFFSET(1), ELF_GR_OFFSET(16)); + if (dst->ret) + return; + for ( ; i < dst->pos; i += sizeof(elf_greg_t), index++) + if (access_elf_reg(dst->target, info, i, + &tmp[index], 1) < 0) { + dst->ret = -EIO; + return; + } + if (dst->count == 0) + return; + } - case PTRACE_OLD_SETSIGINFO: - /* for backwards-compatibility */ - ret = ptrace_request(child, PTRACE_SETSIGINFO, addr, data); - goto out_tsk; - - case PTRACE_SYSCALL: - /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: - /* restart after signal. */ - ret = -EIO; - if (!valid_signal(data)) - goto out_tsk; - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + /* gr16-gr31 */ + if (dst->count > 0 && dst->pos < ELF_NAT_OFFSET) { + pt = task_pt_regs(dst->target); + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, &pt->r16, + ELF_GR_OFFSET(16), ELF_NAT_OFFSET); + if (dst->ret || dst->count == 0) + return; + } + + /* nat, pr, b0 - b7 */ + if (dst->count > 0 && dst->pos < ELF_CR_IIP_OFFSET) { + i = dst->pos; + index = (dst->pos - ELF_NAT_OFFSET) / sizeof(elf_greg_t); + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, tmp, + ELF_NAT_OFFSET, ELF_CR_IIP_OFFSET); + if (dst->ret) + return; + for (; i < dst->pos; i += sizeof(elf_greg_t), index++) + if (access_elf_reg(dst->target, info, i, + &tmp[index], 1) < 0) { + dst->ret = -EIO; + return; + } + if (dst->count == 0) + return; + } + + /* ip cfm psr ar.rsc ar.bsp ar.bspstore ar.rnat + * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec ar.csd ar.ssd + */ + if (dst->count > 0 && dst->pos < (ELF_AR_END_OFFSET)) { + i = dst->pos; + index = (dst->pos - ELF_CR_IIP_OFFSET) / sizeof(elf_greg_t); + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, tmp, + ELF_CR_IIP_OFFSET, ELF_AR_END_OFFSET); + if (dst->ret) + return; + for ( ; i < dst->pos; i += sizeof(elf_greg_t), index++) + if (access_elf_reg(dst->target, info, i, + &tmp[index], 1) < 0) { + dst->ret = -EIO; + return; + } + } +} + +#define ELF_FP_OFFSET(i) (i * sizeof(elf_fpreg_t)) + +void do_fpregs_get(struct unw_frame_info *info, void *arg) +{ + struct regset_getset *dst = arg; + struct task_struct *task = dst->target; + elf_fpreg_t tmp[30]; + int index, min_copy, i; + + if (unw_unwind_to_user(info) < 0) + return; + + /* Skip pos 0 and 1 */ + if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(2)) { + dst->ret = user_regset_copyout_zero(&dst->pos, &dst->count, + &dst->u.get.kbuf, + &dst->u.get.ubuf, + 0, ELF_FP_OFFSET(2)); + if (dst->count == 0 || dst->ret) + return; + } + + /* fr2-fr31 */ + if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(32)) { + index = (dst->pos - ELF_FP_OFFSET(2)) / sizeof(elf_fpreg_t); + + min_copy = min(((unsigned int)ELF_FP_OFFSET(32)), + dst->pos + dst->count); + for (i = dst->pos; i < min_copy; i += sizeof(elf_fpreg_t), + index++) + if (unw_get_fr(info, i / sizeof(elf_fpreg_t), + &tmp[index])) { + dst->ret = -EIO; + return; + } + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, tmp, + ELF_FP_OFFSET(2), ELF_FP_OFFSET(32)); + if (dst->count == 0 || dst->ret) + return; + } + + /* fph */ + if (dst->count > 0) { + ia64_flush_fph(dst->target); + if (task->thread.flags & IA64_THREAD_FPH_VALID) + dst->ret = user_regset_copyout( + &dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, + &dst->target->thread.fph, + ELF_FP_OFFSET(32), -1); else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - child->exit_code = data; + /* Zero fill instead. */ + dst->ret = user_regset_copyout_zero( + &dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, + ELF_FP_OFFSET(32), -1); + } +} - /* - * Make sure the single step/taken-branch trap bits - * are not set: - */ - ia64_psr(pt)->ss = 0; - ia64_psr(pt)->tb = 0; +void do_fpregs_set(struct unw_frame_info *info, void *arg) +{ + struct regset_getset *dst = arg; + elf_fpreg_t fpreg, tmp[30]; + int index, start, end; - wake_up_process(child); - ret = 0; - goto out_tsk; + if (unw_unwind_to_user(info) < 0) + return; - case PTRACE_KILL: - /* - * Make the child exit. Best I can do is send it a - * sigkill. Perhaps it should be put in the status - * that it wants to exit. - */ - if (child->exit_state == EXIT_ZOMBIE) - /* already dead */ - goto out_tsk; - child->exit_code = SIGKILL; - - ptrace_disable(child); - wake_up_process(child); - ret = 0; - goto out_tsk; - - case PTRACE_SINGLESTEP: - /* let child execute for one instruction */ - case PTRACE_SINGLEBLOCK: - ret = -EIO; - if (!valid_signal(data)) - goto out_tsk; - - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - if (request == PTRACE_SINGLESTEP) { - ia64_psr(pt)->ss = 1; - } else { - ia64_psr(pt)->tb = 1; + /* Skip pos 0 and 1 */ + if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(2)) { + dst->ret = user_regset_copyin_ignore(&dst->pos, &dst->count, + &dst->u.set.kbuf, + &dst->u.set.ubuf, + 0, ELF_FP_OFFSET(2)); + if (dst->count == 0 || dst->ret) + return; + } + + /* fr2-fr31 */ + if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(32)) { + start = dst->pos; + end = min(((unsigned int)ELF_FP_OFFSET(32)), + dst->pos + dst->count); + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, tmp, + ELF_FP_OFFSET(2), ELF_FP_OFFSET(32)); + if (dst->ret) + return; + + if (start & 0xF) { /* only write high part */ + if (unw_get_fr(info, start / sizeof(elf_fpreg_t), + &fpreg)) { + dst->ret = -EIO; + return; + } + tmp[start / sizeof(elf_fpreg_t) - 2].u.bits[0] + = fpreg.u.bits[0]; + start &= ~0xFUL; + } + if (end & 0xF) { /* only write low part */ + if (unw_get_fr(info, end / sizeof(elf_fpreg_t), + &fpreg)) { + dst->ret = -EIO; + return; + } + tmp[end / sizeof(elf_fpreg_t) - 2].u.bits[1] + = fpreg.u.bits[1]; + end = (end + 0xF) & ~0xFUL; } - child->exit_code = data; - - /* give it a chance to run. */ - wake_up_process(child); - ret = 0; - goto out_tsk; - - case PTRACE_DETACH: - /* detach a process that was attached. */ - ret = ptrace_detach(child, data); - goto out_tsk; - - case PTRACE_GETREGS: - ret = ptrace_getregs(child, - (struct pt_all_user_regs __user *) data); - goto out_tsk; - - case PTRACE_SETREGS: - ret = ptrace_setregs(child, - (struct pt_all_user_regs __user *) data); - goto out_tsk; - - default: - ret = ptrace_request(child, request, addr, data); - goto out_tsk; + + for ( ; start < end ; start += sizeof(elf_fpreg_t)) { + index = start / sizeof(elf_fpreg_t); + if (unw_set_fr(info, index, tmp[index - 2])) { + dst->ret = -EIO; + return; + } + } + if (dst->ret || dst->count == 0) + return; + } + + /* fph */ + if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(128)) { + ia64_sync_fph(dst->target); + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, + &dst->u.set.ubuf, + &dst->target->thread.fph, + ELF_FP_OFFSET(32), -1); } - out_tsk: - put_task_struct(child); - out: - unlock_kernel(); - return ret; } +static int +do_regset_call(void (*call)(struct unw_frame_info *, void *), + struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct regset_getset info = { .target = target, .regset = regset, + .pos = pos, .count = count, + .u.set = { .kbuf = kbuf, .ubuf = ubuf }, + .ret = 0 }; -void -syscall_trace (void) + if (target == current) + unw_init_running(call, &info); + else { + struct unw_frame_info ufi; + memset(&ufi, 0, sizeof(ufi)); + unw_init_from_blocked_task(&ufi, target); + (*call)(&ufi, &info); + } + + return info.ret; +} + +static int +gpregs_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) { - if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return; - if (!(current->ptrace & PT_PTRACED)) - return; - /* - * The 0x80 provides a way for the tracing parent to - * distinguish between a syscall stop and SIGTRAP delivery. - */ - ptrace_notify(SIGTRAP - | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0)); + return do_regset_call(do_gpregs_get, target, regset, pos, count, + kbuf, ubuf); +} +static int gpregs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return do_regset_call(do_gpregs_set, target, regset, pos, count, + kbuf, ubuf); +} + +static void do_gpregs_writeback(struct unw_frame_info *info, void *arg) +{ + do_sync_rbs(info, ia64_sync_user_rbs); +} + +/* + * This is called to write back the register backing store. + * ptrace does this before it stops, so that a tracer reading the user + * memory after the thread stops will get the current register data. + */ +static int +gpregs_writeback(struct task_struct *target, + const struct user_regset *regset, + int now) +{ + if (test_and_set_tsk_thread_flag(target, TIF_RESTORE_RSE)) + return 0; + set_notify_resume(target); + return do_regset_call(do_gpregs_writeback, target, regset, 0, 0, + NULL, NULL); +} + +static int +fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return (target->thread.flags & IA64_THREAD_FPH_VALID) ? 128 : 32; +} + +static int fpregs_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + return do_regset_call(do_fpregs_get, target, regset, pos, count, + kbuf, ubuf); +} + +static int fpregs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return do_regset_call(do_fpregs_set, target, regset, pos, count, + kbuf, ubuf); +} + +static int +access_uarea(struct task_struct *child, unsigned long addr, + unsigned long *data, int write_access) +{ + unsigned int pos = -1; /* an invalid value */ + int ret; + unsigned long *ptr, regnum; + + if ((addr & 0x7) != 0) { + dprintk("ptrace: unaligned register address 0x%lx\n", addr); + return -1; + } + if ((addr >= PT_NAT_BITS + 8 && addr < PT_F2) || + (addr >= PT_R7 + 8 && addr < PT_B1) || + (addr >= PT_AR_LC + 8 && addr < PT_CR_IPSR) || + (addr >= PT_AR_SSD + 8 && addr < PT_DBR)) { + dprintk("ptrace: rejecting access to register " + "address 0x%lx\n", addr); + return -1; + } + + switch (addr) { + case PT_F32 ... (PT_F127 + 15): + pos = addr - PT_F32 + ELF_FP_OFFSET(32); + break; + case PT_F2 ... (PT_F5 + 15): + pos = addr - PT_F2 + ELF_FP_OFFSET(2); + break; + case PT_F10 ... (PT_F31 + 15): + pos = addr - PT_F10 + ELF_FP_OFFSET(10); + break; + case PT_F6 ... (PT_F9 + 15): + pos = addr - PT_F6 + ELF_FP_OFFSET(6); + break; + } + + if (pos != -1) { + if (write_access) + ret = fpregs_set(child, NULL, pos, + sizeof(unsigned long), data, NULL); + else + ret = fpregs_get(child, NULL, pos, + sizeof(unsigned long), data, NULL); + if (ret != 0) + return -1; + return 0; + } + + switch (addr) { + case PT_NAT_BITS: + pos = ELF_NAT_OFFSET; + break; + case PT_R4 ... PT_R7: + pos = addr - PT_R4 + ELF_GR_OFFSET(4); + break; + case PT_B1 ... PT_B5: + pos = addr - PT_B1 + ELF_BR_OFFSET(1); + break; + case PT_AR_EC: + pos = ELF_AR_EC_OFFSET; + break; + case PT_AR_LC: + pos = ELF_AR_LC_OFFSET; + break; + case PT_CR_IPSR: + pos = ELF_CR_IPSR_OFFSET; + break; + case PT_CR_IIP: + pos = ELF_CR_IIP_OFFSET; + break; + case PT_CFM: + pos = ELF_CFM_OFFSET; + break; + case PT_AR_UNAT: + pos = ELF_AR_UNAT_OFFSET; + break; + case PT_AR_PFS: + pos = ELF_AR_PFS_OFFSET; + break; + case PT_AR_RSC: + pos = ELF_AR_RSC_OFFSET; + break; + case PT_AR_RNAT: + pos = ELF_AR_RNAT_OFFSET; + break; + case PT_AR_BSPSTORE: + pos = ELF_AR_BSPSTORE_OFFSET; + break; + case PT_PR: + pos = ELF_PR_OFFSET; + break; + case PT_B6: + pos = ELF_BR_OFFSET(6); + break; + case PT_AR_BSP: + pos = ELF_AR_BSP_OFFSET; + break; + case PT_R1 ... PT_R3: + pos = addr - PT_R1 + ELF_GR_OFFSET(1); + break; + case PT_R12 ... PT_R15: + pos = addr - PT_R12 + ELF_GR_OFFSET(12); + break; + case PT_R8 ... PT_R11: + pos = addr - PT_R8 + ELF_GR_OFFSET(8); + break; + case PT_R16 ... PT_R31: + pos = addr - PT_R16 + ELF_GR_OFFSET(16); + break; + case PT_AR_CCV: + pos = ELF_AR_CCV_OFFSET; + break; + case PT_AR_FPSR: + pos = ELF_AR_FPSR_OFFSET; + break; + case PT_B0: + pos = ELF_BR_OFFSET(0); + break; + case PT_B7: + pos = ELF_BR_OFFSET(7); + break; + case PT_AR_CSD: + pos = ELF_AR_CSD_OFFSET; + break; + case PT_AR_SSD: + pos = ELF_AR_SSD_OFFSET; + break; + } + + if (pos != -1) { + if (write_access) + ret = gpregs_set(child, NULL, pos, + sizeof(unsigned long), data, NULL); + else + ret = gpregs_get(child, NULL, pos, + sizeof(unsigned long), data, NULL); + if (ret != 0) + return -1; + return 0; + } + + /* access debug registers */ + if (addr >= PT_IBR) { + regnum = (addr - PT_IBR) >> 3; + ptr = &child->thread.ibr[0]; + } else { + regnum = (addr - PT_DBR) >> 3; + ptr = &child->thread.dbr[0]; + } + + if (regnum >= 8) { + dprintk("ptrace: rejecting access to register " + "address 0x%lx\n", addr); + return -1; + } +#ifdef CONFIG_PERFMON /* - * This isn't the same as continuing with a signal, but it - * will do for normal use. strace only continues with a - * signal if the stopping signal is not SIGTRAP. -brl + * Check if debug registers are used by perfmon. This + * test must be done once we know that we can do the + * operation, i.e. the arguments are all valid, but + * before we start modifying the state. + * + * Perfmon needs to keep a count of how many processes + * are trying to modify the debug registers for system + * wide monitoring sessions. + * + * We also include read access here, because they may + * cause the PMU-installed debug register state + * (dbr[], ibr[]) to be reset. The two arrays are also + * used by perfmon, but we do not use + * IA64_THREAD_DBG_VALID. The registers are restored + * by the PMU context switch code. */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; + if (pfm_use_debug_registers(child)) + return -1; +#endif + + if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) { + child->thread.flags |= IA64_THREAD_DBG_VALID; + memset(child->thread.dbr, 0, + sizeof(child->thread.dbr)); + memset(child->thread.ibr, 0, + sizeof(child->thread.ibr)); + } + + ptr += regnum; + + if ((regnum & 1) && write_access) { + /* don't let the user set kernel-level breakpoints: */ + *ptr = *data & ~(7UL << 56); + return 0; } + if (write_access) + *ptr = *data; + else + *data = *ptr; + return 0; } -/* "asmlinkage" so the input arguments are preserved... */ +static const struct user_regset native_regsets[] = { + { + .core_note_type = NT_PRSTATUS, + .n = ELF_NGREG, + .size = sizeof(elf_greg_t), .align = sizeof(elf_greg_t), + .get = gpregs_get, .set = gpregs_set, + .writeback = gpregs_writeback + }, + { + .core_note_type = NT_PRFPREG, + .n = ELF_NFPREG, + .size = sizeof(elf_fpreg_t), .align = sizeof(elf_fpreg_t), + .get = fpregs_get, .set = fpregs_set, .active = fpregs_active + }, +}; + +static const struct user_regset_view user_ia64_view = { + .name = "ia64", + .e_machine = EM_IA_64, + .regsets = native_regsets, .n = ARRAY_SIZE(native_regsets) +}; + +const struct user_regset_view *task_user_regset_view(struct task_struct *tsk) +{ + return &user_ia64_view; +} -asmlinkage void -syscall_trace_enter (long arg0, long arg1, long arg2, long arg3, - long arg4, long arg5, long arg6, long arg7, - struct pt_regs regs) +struct syscall_get_set_args { + unsigned int i; + unsigned int n; + unsigned long *args; + struct pt_regs *regs; + int rw; +}; + +static void syscall_get_set_args_cb(struct unw_frame_info *info, void *data) { - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) - syscall_trace(); - - if (unlikely(current->audit_context)) { - long syscall; - int arch; - - if (IS_IA32_PROCESS(®s)) { - syscall = regs.r1; - arch = AUDIT_ARCH_I386; - } else { - syscall = regs.r15; - arch = AUDIT_ARCH_IA64; - } + struct syscall_get_set_args *args = data; + struct pt_regs *pt = args->regs; + unsigned long *krbs, cfm, ndirty; + int i, count; - audit_syscall_entry(current, arch, syscall, arg0, arg1, arg2, arg3); + if (unw_unwind_to_user(info) < 0) + return; + + cfm = pt->cr_ifs; + krbs = (unsigned long *)info->task + IA64_RBS_OFFSET/8; + ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19)); + + count = 0; + if (in_syscall(pt)) + count = min_t(int, args->n, cfm & 0x7f); + + for (i = 0; i < count; i++) { + if (args->rw) + *ia64_rse_skip_regs(krbs, ndirty + i + args->i) = + args->args[i]; + else + args->args[i] = *ia64_rse_skip_regs(krbs, + ndirty + i + args->i); } + if (!args->rw) { + while (i < args->n) { + args->args[i] = 0; + i++; + } + } } -/* "asmlinkage" so the input arguments are preserved... */ - -asmlinkage void -syscall_trace_leave (long arg0, long arg1, long arg2, long arg3, - long arg4, long arg5, long arg6, long arg7, - struct pt_regs regs) +void ia64_syscall_get_set_arguments(struct task_struct *task, + struct pt_regs *regs, unsigned int i, unsigned int n, + unsigned long *args, int rw) { - if (unlikely(current->audit_context)) - audit_syscall_exit(current, AUDITSC_RESULT(regs.r10), regs.r8); - - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) - syscall_trace(); + struct syscall_get_set_args data = { + .i = i, + .n = n, + .args = args, + .regs = regs, + .rw = rw, + }; + + if (task == current) + unw_init_running(syscall_get_set_args_cb, &data); + else { + struct unw_frame_info ufi; + memset(&ufi, 0, sizeof(ufi)); + unw_init_from_blocked_task(&ufi, task); + syscall_get_set_args_cb(&ufi, &data); + } } diff --git a/arch/ia64/kernel/relocate_kernel.S b/arch/ia64/kernel/relocate_kernel.S new file mode 100644 index 00000000000..c370e02f006 --- /dev/null +++ b/arch/ia64/kernel/relocate_kernel.S @@ -0,0 +1,325 @@ +/* + * arch/ia64/kernel/relocate_kernel.S + * + * Relocate kexec'able kernel and start it + * + * Copyright (C) 2005 Hewlett-Packard Development Company, L.P. + * Copyright (C) 2005 Khalid Aziz <khalid.aziz@hp.com> + * Copyright (C) 2005 Intel Corp, Zou Nan hai <nanhai.zou@intel.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ +#include <asm/asmmacro.h> +#include <asm/kregs.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/mca_asm.h> + + /* Must be relocatable PIC code callable as a C function + */ +GLOBAL_ENTRY(relocate_new_kernel) + .prologue + alloc r31=ar.pfs,4,0,0,0 + .body +.reloc_entry: +{ + rsm psr.i| psr.ic + mov r2=ip +} + ;; +{ + flushrs // must be first insn in group + srlz.i +} + ;; + dep r2=0,r2,61,3 //to physical address + ;; + //first switch to physical mode + add r3=1f-.reloc_entry, r2 + movl r16 = IA64_PSR_AC|IA64_PSR_BN|IA64_PSR_IC + mov ar.rsc=0 // put RSE in enforced lazy mode + ;; + add sp=(memory_stack_end - 16 - .reloc_entry),r2 + add r8=(register_stack - .reloc_entry),r2 + ;; + mov r18=ar.rnat + mov ar.bspstore=r8 + ;; + mov cr.ipsr=r16 + mov cr.iip=r3 + mov cr.ifs=r0 + srlz.i + ;; + mov ar.rnat=r18 + rfi // note: this unmask MCA/INIT (psr.mc) + ;; +1: + //physical mode code begin + mov b6=in1 + dep r28=0,in2,61,3 //to physical address + + // purge all TC entries +#define O(member) IA64_CPUINFO_##member##_OFFSET + GET_THIS_PADDR(r2, ia64_cpu_info) // load phys addr of cpu_info into r2 + ;; + addl r17=O(PTCE_STRIDE),r2 + addl r2=O(PTCE_BASE),r2 + ;; + ld8 r18=[r2],(O(PTCE_COUNT)-O(PTCE_BASE));; // r18=ptce_base + ld4 r19=[r2],4 // r19=ptce_count[0] + ld4 r21=[r17],4 // r21=ptce_stride[0] + ;; + ld4 r20=[r2] // r20=ptce_count[1] + ld4 r22=[r17] // r22=ptce_stride[1] + mov r24=r0 + ;; + adds r20=-1,r20 + ;; +#undef O +2: + cmp.ltu p6,p7=r24,r19 +(p7) br.cond.dpnt.few 4f + mov ar.lc=r20 +3: + ptc.e r18 + ;; + add r18=r22,r18 + br.cloop.sptk.few 3b + ;; + add r18=r21,r18 + add r24=1,r24 + ;; + br.sptk.few 2b +4: + srlz.i + ;; + // purge TR entry for kernel text and data + movl r16=KERNEL_START + mov r18=KERNEL_TR_PAGE_SHIFT<<2 + ;; + ptr.i r16, r18 + ptr.d r16, r18 + ;; + srlz.i + ;; + + // purge TR entry for pal code + mov r16=in3 + mov r18=IA64_GRANULE_SHIFT<<2 + ;; + ptr.i r16,r18 + ;; + srlz.i + ;; + + // purge TR entry for stack + mov r16=IA64_KR(CURRENT_STACK) + ;; + shl r16=r16,IA64_GRANULE_SHIFT + movl r19=PAGE_OFFSET + ;; + add r16=r19,r16 + mov r18=IA64_GRANULE_SHIFT<<2 + ;; + ptr.d r16,r18 + ;; + srlz.i + ;; + + //copy segments + movl r16=PAGE_MASK + mov r30=in0 // in0 is page_list + br.sptk.few .dest_page + ;; +.loop: + ld8 r30=[in0], 8;; +.dest_page: + tbit.z p0, p6=r30, 0;; // 0x1 dest page +(p6) and r17=r30, r16 +(p6) br.cond.sptk.few .loop;; + + tbit.z p0, p6=r30, 1;; // 0x2 indirect page +(p6) and in0=r30, r16 +(p6) br.cond.sptk.few .loop;; + + tbit.z p0, p6=r30, 2;; // 0x4 end flag +(p6) br.cond.sptk.few .end_loop;; + + tbit.z p6, p0=r30, 3;; // 0x8 source page +(p6) br.cond.sptk.few .loop + + and r18=r30, r16 + + // simple copy page, may optimize later + movl r14=PAGE_SIZE/8 - 1;; + mov ar.lc=r14;; +1: + ld8 r14=[r18], 8;; + st8 [r17]=r14;; + fc.i r17 + add r17=8, r17 + br.ctop.sptk.few 1b + br.sptk.few .loop + ;; + +.end_loop: + sync.i // for fc.i + ;; + srlz.i + ;; + srlz.d + ;; + br.call.sptk.many b0=b6;; + +.align 32 +memory_stack: + .fill 8192, 1, 0 +memory_stack_end: +register_stack: + .fill 8192, 1, 0 +register_stack_end: +relocate_new_kernel_end: +END(relocate_new_kernel) + +.global relocate_new_kernel_size +relocate_new_kernel_size: + data8 relocate_new_kernel_end - relocate_new_kernel + +GLOBAL_ENTRY(ia64_dump_cpu_regs) + .prologue + alloc loc0=ar.pfs,1,2,0,0 + .body + mov ar.rsc=0 // put RSE in enforced lazy mode + add loc1=4*8, in0 // save r4 and r5 first + ;; +{ + flushrs // flush dirty regs to backing store + srlz.i +} + st8 [loc1]=r4, 8 + ;; + st8 [loc1]=r5, 8 + ;; + add loc1=32*8, in0 + mov r4=ar.rnat + ;; + st8 [in0]=r0, 8 // r0 + st8 [loc1]=r4, 8 // rnat + mov r5=pr + ;; + st8 [in0]=r1, 8 // r1 + st8 [loc1]=r5, 8 // pr + mov r4=b0 + ;; + st8 [in0]=r2, 8 // r2 + st8 [loc1]=r4, 8 // b0 + mov r5=b1; + ;; + st8 [in0]=r3, 24 // r3 + st8 [loc1]=r5, 8 // b1 + mov r4=b2 + ;; + st8 [in0]=r6, 8 // r6 + st8 [loc1]=r4, 8 // b2 + mov r5=b3 + ;; + st8 [in0]=r7, 8 // r7 + st8 [loc1]=r5, 8 // b3 + mov r4=b4 + ;; + st8 [in0]=r8, 8 // r8 + st8 [loc1]=r4, 8 // b4 + mov r5=b5 + ;; + st8 [in0]=r9, 8 // r9 + st8 [loc1]=r5, 8 // b5 + mov r4=b6 + ;; + st8 [in0]=r10, 8 // r10 + st8 [loc1]=r5, 8 // b6 + mov r5=b7 + ;; + st8 [in0]=r11, 8 // r11 + st8 [loc1]=r5, 8 // b7 + mov r4=b0 + ;; + st8 [in0]=r12, 8 // r12 + st8 [loc1]=r4, 8 // ip + mov r5=loc0 + ;; + st8 [in0]=r13, 8 // r13 + extr.u r5=r5, 0, 38 // ar.pfs.pfm + mov r4=r0 // user mask + ;; + st8 [in0]=r14, 8 // r14 + st8 [loc1]=r5, 8 // cfm + ;; + st8 [in0]=r15, 8 // r15 + st8 [loc1]=r4, 8 // user mask + mov r5=ar.rsc + ;; + st8 [in0]=r16, 8 // r16 + st8 [loc1]=r5, 8 // ar.rsc + mov r4=ar.bsp + ;; + st8 [in0]=r17, 8 // r17 + st8 [loc1]=r4, 8 // ar.bsp + mov r5=ar.bspstore + ;; + st8 [in0]=r18, 8 // r18 + st8 [loc1]=r5, 8 // ar.bspstore + mov r4=ar.rnat + ;; + st8 [in0]=r19, 8 // r19 + st8 [loc1]=r4, 8 // ar.rnat + mov r5=ar.ccv + ;; + st8 [in0]=r20, 8 // r20 + st8 [loc1]=r5, 8 // ar.ccv + mov r4=ar.unat + ;; + st8 [in0]=r21, 8 // r21 + st8 [loc1]=r4, 8 // ar.unat + mov r5 = ar.fpsr + ;; + st8 [in0]=r22, 8 // r22 + st8 [loc1]=r5, 8 // ar.fpsr + mov r4 = ar.unat + ;; + st8 [in0]=r23, 8 // r23 + st8 [loc1]=r4, 8 // unat + mov r5 = ar.fpsr + ;; + st8 [in0]=r24, 8 // r24 + st8 [loc1]=r5, 8 // fpsr + mov r4 = ar.pfs + ;; + st8 [in0]=r25, 8 // r25 + st8 [loc1]=r4, 8 // ar.pfs + mov r5 = ar.lc + ;; + st8 [in0]=r26, 8 // r26 + st8 [loc1]=r5, 8 // ar.lc + mov r4 = ar.ec + ;; + st8 [in0]=r27, 8 // r27 + st8 [loc1]=r4, 8 // ar.ec + mov r5 = ar.csd + ;; + st8 [in0]=r28, 8 // r28 + st8 [loc1]=r5, 8 // ar.csd + mov r4 = ar.ssd + ;; + st8 [in0]=r29, 8 // r29 + st8 [loc1]=r4, 8 // ar.ssd + ;; + st8 [in0]=r30, 8 // r30 + ;; + st8 [in0]=r31, 8 // r31 + mov ar.pfs=loc0 + ;; + br.ret.sptk.many rp +END(ia64_dump_cpu_regs) + + diff --git a/arch/ia64/kernel/sal.c b/arch/ia64/kernel/sal.c index acc0f132f86..0464173ea56 100644 --- a/arch/ia64/kernel/sal.c +++ b/arch/ia64/kernel/sal.c @@ -6,7 +6,6 @@ * Copyright (C) 1999 VA Linux Systems * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/init.h> @@ -14,6 +13,7 @@ #include <linux/spinlock.h> #include <linux/string.h> +#include <asm/delay.h> #include <asm/page.h> #include <asm/sal.h> #include <asm/pal.h> @@ -109,6 +109,13 @@ check_versions (struct ia64_sal_systab *systab) sal_revision = SAL_VERSION_CODE(2, 8); sal_version = SAL_VERSION_CODE(0, 0); } + + if (ia64_platform_is("sn2") && (sal_revision == SAL_VERSION_CODE(2, 9))) + /* + * SGI Altix has hard-coded version 2.9 in their prom + * but they actually implement 3.2, so let's fix it here. + */ + sal_revision = SAL_VERSION_CODE(3, 2); } static void __init @@ -134,7 +141,7 @@ set_smp_redirect (int flag) * interrupt redirection. The reason is this would require that * All interrupts be stopped and hard bind the irq to a cpu. * Later when the interrupt is fired we need to set the redir hint - * on again in the vector. This is combersome for something that the + * on again in the vector. This is cumbersome for something that the * user mode irq balancer will solve anyways. */ no_int_routing=1; @@ -194,9 +201,8 @@ static void __init chk_nointroute_opt(void) { char *cp; - extern char saved_command_line[]; - for (cp = saved_command_line; *cp; ) { + for (cp = boot_command_line; *cp; ) { if (memcmp(cp, "nointroute", 10) == 0) { no_int_routing = 1; printk ("no_int_routing on\n"); @@ -214,6 +220,89 @@ chk_nointroute_opt(void) static void __init sal_desc_ap_wakeup(void *p) { } #endif +/* + * HP rx5670 firmware polls for interrupts during SAL_CACHE_FLUSH by reading + * cr.ivr, but it never writes cr.eoi. This leaves any interrupt marked as + * "in-service" and masks other interrupts of equal or lower priority. + * + * HP internal defect reports: F1859, F2775, F3031. + */ +static int sal_cache_flush_drops_interrupts; + +static int __init +force_pal_cache_flush(char *str) +{ + sal_cache_flush_drops_interrupts = 1; + return 0; +} +early_param("force_pal_cache_flush", force_pal_cache_flush); + +void __init +check_sal_cache_flush (void) +{ + unsigned long flags; + int cpu; + u64 vector, cache_type = 3; + struct ia64_sal_retval isrv; + + if (sal_cache_flush_drops_interrupts) + return; + + cpu = get_cpu(); + local_irq_save(flags); + + /* + * Send ourselves a timer interrupt, wait until it's reported, and see + * if SAL_CACHE_FLUSH drops it. + */ + platform_send_ipi(cpu, IA64_TIMER_VECTOR, IA64_IPI_DM_INT, 0); + + while (!ia64_get_irr(IA64_TIMER_VECTOR)) + cpu_relax(); + + SAL_CALL(isrv, SAL_CACHE_FLUSH, cache_type, 0, 0, 0, 0, 0, 0); + + if (isrv.status) + printk(KERN_ERR "SAL_CAL_FLUSH failed with %ld\n", isrv.status); + + if (ia64_get_irr(IA64_TIMER_VECTOR)) { + vector = ia64_get_ivr(); + ia64_eoi(); + WARN_ON(vector != IA64_TIMER_VECTOR); + } else { + sal_cache_flush_drops_interrupts = 1; + printk(KERN_ERR "SAL: SAL_CACHE_FLUSH drops interrupts; " + "PAL_CACHE_FLUSH will be used instead\n"); + ia64_eoi(); + } + + local_irq_restore(flags); + put_cpu(); +} + +s64 +ia64_sal_cache_flush (u64 cache_type) +{ + struct ia64_sal_retval isrv; + + if (sal_cache_flush_drops_interrupts) { + unsigned long flags; + u64 progress; + s64 rc; + + progress = 0; + local_irq_save(flags); + rc = ia64_pal_cache_flush(cache_type, + PAL_CACHE_FLUSH_INVALIDATE, &progress, NULL); + local_irq_restore(flags); + return rc; + } + + SAL_CALL(isrv, SAL_CACHE_FLUSH, cache_type, 0, 0, 0, 0, 0, 0); + return isrv.status; +} +EXPORT_SYMBOL_GPL(ia64_sal_cache_flush); + void __init ia64_sal_init (struct ia64_sal_systab *systab) { @@ -262,6 +351,7 @@ ia64_sal_init (struct ia64_sal_systab *systab) } p += SAL_DESC_SIZE(*p); } + } int @@ -300,3 +390,16 @@ ia64_sal_oemcall_reentrant(struct ia64_sal_retval *isrvp, u64 oemfunc, return 0; } EXPORT_SYMBOL(ia64_sal_oemcall_reentrant); + +long +ia64_sal_freq_base (unsigned long which, unsigned long *ticks_per_second, + unsigned long *drift_info) +{ + struct ia64_sal_retval isrv; + + SAL_CALL(isrv, SAL_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); + *ticks_per_second = isrv.v0; + *drift_info = isrv.v1; + return isrv.status; +} +EXPORT_SYMBOL_GPL(ia64_sal_freq_base); diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index ca68e6e44a7..ee9719eebb1 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -3,7 +3,7 @@ * * Creates entries in /proc/sal for various system features. * - * Copyright (c) 2003 Silicon Graphics, Inc. All rights reserved. + * Copyright (c) 2003, 2006 Silicon Graphics, Inc. All rights reserved. * Copyright (c) 2003 Hewlett-Packard Co * Bjorn Helgaas <bjorn.helgaas@hp.com> * @@ -27,17 +27,26 @@ * mca.c may not pass a buffer, a NULL buffer just indicates that a new * record is available in SAL. * Replace some NR_CPUS by cpus_online, for hotplug cpu. + * + * Jan 5 2006 kaos@sgi.com + * Handle hotplug cpus coming online. + * Handle hotplug cpus going offline while they still have outstanding records. + * Use the cpu_* macros consistently. + * Replace the counting semaphore with a mutex and a test if the cpumask is non-empty. + * Modify the locking to make the test for "work to do" an atomic operation. */ +#include <linux/capability.h> +#include <linux/cpu.h> #include <linux/types.h> #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <linux/module.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/timer.h> #include <linux/vmalloc.h> +#include <linux/semaphore.h> -#include <asm/semaphore.h> #include <asm/sal.h> #include <asm/uaccess.h> @@ -45,7 +54,7 @@ MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>"); MODULE_DESCRIPTION("/proc interface to IA-64 SAL features"); MODULE_LICENSE("GPL"); -static int salinfo_read(char *page, char **start, off_t off, int count, int *eof, void *data); +static const struct file_operations proc_salinfo_fops; typedef struct { const char *name; /* name of the proc entry */ @@ -57,7 +66,7 @@ typedef struct { * List {name,feature} pairs for every entry in /proc/sal/<feature> * that this module exports */ -static salinfo_entry_t salinfo_entries[]={ +static const salinfo_entry_t salinfo_entries[]={ { "bus_lock", IA64_SAL_PLATFORM_FEATURE_BUS_LOCK, }, { "irq_redirection", IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT, }, { "ipi_redirection", IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT, }, @@ -131,8 +140,8 @@ enum salinfo_state { }; struct salinfo_data { - volatile cpumask_t cpu_event; /* which cpus have outstanding events */ - struct semaphore sem; /* count of cpus with outstanding events (bits set in cpu_event) */ + cpumask_t cpu_event; /* which cpus have outstanding events */ + struct semaphore mutex; u8 *log_buffer; u64 log_size; u8 *oemdata; /* decoded oem data */ @@ -154,7 +163,7 @@ static DEFINE_SPINLOCK(data_saved_lock); /** salinfo_platform_oemdata - optional callback to decode oemdata from an error * record. * @sect_header: pointer to the start of the section to decode. - * @oemdata: returns vmalloc area containing the decded output. + * @oemdata: returns vmalloc area containing the decoded output. * @oemdata_size: returns length of decoded output (strlen). * * Description: If user space asks for oem data to be decoded by the kernel @@ -173,6 +182,21 @@ struct salinfo_platform_oemdata_parms { int ret; }; +/* Kick the mutex that tells user space that there is work to do. Instead of + * trying to track the state of the mutex across multiple cpus, in user + * context, interrupt context, non-maskable interrupt context and hotplug cpu, + * it is far easier just to grab the mutex if it is free then release it. + * + * This routine must be called with data_saved_lock held, to make the down/up + * operation atomic. + */ +static void +salinfo_work_to_do(struct salinfo_data *data) +{ + (void)(down_trylock(&data->mutex) ?: 0); + up(&data->mutex); +} + static void salinfo_platform_oemdata_cpu(void *context) { @@ -211,9 +235,9 @@ salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe) BUG_ON(type >= ARRAY_SIZE(salinfo_log_name)); + if (irqsafe) + spin_lock_irqsave(&data_saved_lock, flags); if (buffer) { - if (irqsafe) - spin_lock_irqsave(&data_saved_lock, flags); for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) { if (!data_saved->buffer) break; @@ -231,39 +255,36 @@ salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe) data_saved->size = size; data_saved->buffer = buffer; } - if (irqsafe) - spin_unlock_irqrestore(&data_saved_lock, flags); } - - if (!test_and_set_bit(smp_processor_id(), &data->cpu_event)) { - if (irqsafe) - up(&data->sem); + cpu_set(smp_processor_id(), data->cpu_event); + if (irqsafe) { + salinfo_work_to_do(data); + spin_unlock_irqrestore(&data_saved_lock, flags); } } /* Check for outstanding MCA/INIT records every minute (arbitrary) */ #define SALINFO_TIMER_DELAY (60*HZ) static struct timer_list salinfo_timer; +extern void ia64_mlogbuf_dump(void); static void salinfo_timeout_check(struct salinfo_data *data) { - int i; + unsigned long flags; if (!data->open) return; - for_each_online_cpu(i) { - if (test_bit(i, &data->cpu_event)) { - /* double up() is not a problem, user space will see no - * records for the additional "events". - */ - up(&data->sem); - } + if (!cpus_empty(data->cpu_event)) { + spin_lock_irqsave(&data_saved_lock, flags); + salinfo_work_to_do(data); + spin_unlock_irqrestore(&data_saved_lock, flags); } } -static void +static void salinfo_timeout (unsigned long arg) { + ia64_mlogbuf_dump(); salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA); salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_INIT); salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY; @@ -281,40 +302,41 @@ salinfo_event_open(struct inode *inode, struct file *file) static ssize_t salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { - struct inode *inode = file->f_dentry->d_inode; - struct proc_dir_entry *entry = PDE(inode); - struct salinfo_data *data = entry->data; + struct salinfo_data *data = PDE_DATA(file_inode(file)); char cmd[32]; size_t size; int i, n, cpu = -1; retry: - if (down_trylock(&data->sem)) { + if (cpus_empty(data->cpu_event) && down_trylock(&data->mutex)) { if (file->f_flags & O_NONBLOCK) return -EAGAIN; - if (down_interruptible(&data->sem)) - return -ERESTARTSYS; + if (down_interruptible(&data->mutex)) + return -EINTR; } n = data->cpu_check; - for (i = 0; i < NR_CPUS; i++) { - if (test_bit(n, &data->cpu_event) && cpu_online(n)) { + for (i = 0; i < nr_cpu_ids; i++) { + if (cpu_isset(n, data->cpu_event)) { + if (!cpu_online(n)) { + cpu_clear(n, data->cpu_event); + continue; + } cpu = n; break; } - if (++n == NR_CPUS) + if (++n == nr_cpu_ids) n = 0; } if (cpu == -1) goto retry; - /* events are sticky until the user says "clear" */ - up(&data->sem); + ia64_mlogbuf_dump(); /* for next read, start checking at next CPU */ data->cpu_check = cpu; - if (++data->cpu_check == NR_CPUS) + if (++data->cpu_check == nr_cpu_ids) data->cpu_check = 0; snprintf(cmd, sizeof(cmd), "read %d\n", cpu); @@ -328,16 +350,16 @@ retry: return size; } -static struct file_operations salinfo_event_fops = { +static const struct file_operations salinfo_event_fops = { .open = salinfo_event_open, .read = salinfo_event_read, + .llseek = noop_llseek, }; static int salinfo_log_open(struct inode *inode, struct file *file) { - struct proc_dir_entry *entry = PDE(inode); - struct salinfo_data *data = entry->data; + struct salinfo_data *data = PDE_DATA(inode); if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -362,8 +384,7 @@ salinfo_log_open(struct inode *inode, struct file *file) static int salinfo_log_release(struct inode *inode, struct file *file) { - struct proc_dir_entry *entry = PDE(inode); - struct salinfo_data *data = entry->data; + struct salinfo_data *data = PDE_DATA(inode); if (data->state == STATE_NO_DATA) { vfree(data->log_buffer); @@ -380,13 +401,10 @@ salinfo_log_release(struct inode *inode, struct file *file) static void call_on_cpu(int cpu, void (*fn)(void *), void *arg) { - cpumask_t save_cpus_allowed, new_cpus_allowed; - memcpy(&save_cpus_allowed, ¤t->cpus_allowed, sizeof(save_cpus_allowed)); - memset(&new_cpus_allowed, 0, sizeof(new_cpus_allowed)); - set_bit(cpu, &new_cpus_allowed); - set_cpus_allowed(current, new_cpus_allowed); + cpumask_t save_cpus_allowed = current->cpus_allowed; + set_cpus_allowed_ptr(current, cpumask_of(cpu)); (*fn)(arg); - set_cpus_allowed(current, save_cpus_allowed); + set_cpus_allowed_ptr(current, &save_cpus_allowed); } static void @@ -432,19 +450,17 @@ retry: if (!data->saved_num) call_on_cpu(cpu, salinfo_log_read_cpu, data); if (!data->log_size) { - data->state = STATE_NO_DATA; - clear_bit(cpu, &data->cpu_event); + data->state = STATE_NO_DATA; + cpu_clear(cpu, data->cpu_event); } else { - data->state = STATE_LOG_RECORD; + data->state = STATE_LOG_RECORD; } } static ssize_t salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { - struct inode *inode = file->f_dentry->d_inode; - struct proc_dir_entry *entry = PDE(inode); - struct salinfo_data *data = entry->data; + struct salinfo_data *data = PDE_DATA(file_inode(file)); u8 *buf; u64 bufsize; @@ -472,36 +488,38 @@ static int salinfo_log_clear(struct salinfo_data *data, int cpu) { sal_log_record_header_t *rh; + unsigned long flags; + spin_lock_irqsave(&data_saved_lock, flags); data->state = STATE_NO_DATA; - if (!test_bit(cpu, &data->cpu_event)) + if (!cpu_isset(cpu, data->cpu_event)) { + spin_unlock_irqrestore(&data_saved_lock, flags); return 0; - down(&data->sem); - clear_bit(cpu, &data->cpu_event); + } + cpu_clear(cpu, data->cpu_event); if (data->saved_num) { - unsigned long flags; - spin_lock_irqsave(&data_saved_lock, flags); - shift1_data_saved(data, data->saved_num - 1 ); + shift1_data_saved(data, data->saved_num - 1); data->saved_num = 0; - spin_unlock_irqrestore(&data_saved_lock, flags); } + spin_unlock_irqrestore(&data_saved_lock, flags); rh = (sal_log_record_header_t *)(data->log_buffer); /* Corrected errors have already been cleared from SAL */ if (rh->severity != sal_log_severity_corrected) call_on_cpu(cpu, salinfo_log_clear_cpu, data); /* clearing a record may make a new record visible */ salinfo_log_new_read(cpu, data); - if (data->state == STATE_LOG_RECORD && - !test_and_set_bit(cpu, &data->cpu_event)) - up(&data->sem); + if (data->state == STATE_LOG_RECORD) { + spin_lock_irqsave(&data_saved_lock, flags); + cpu_set(cpu, data->cpu_event); + salinfo_work_to_do(data); + spin_unlock_irqrestore(&data_saved_lock, flags); + } return 0; } static ssize_t salinfo_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - struct inode *inode = file->f_dentry->d_inode; - struct proc_dir_entry *entry = PDE(inode); - struct salinfo_data *data = entry->data; + struct salinfo_data *data = PDE_DATA(file_inode(file)); char cmd[32]; size_t size; u32 offset; @@ -542,11 +560,59 @@ salinfo_log_write(struct file *file, const char __user *buffer, size_t count, lo return count; } -static struct file_operations salinfo_data_fops = { +static const struct file_operations salinfo_data_fops = { .open = salinfo_log_open, .release = salinfo_log_release, .read = salinfo_log_read, .write = salinfo_log_write, + .llseek = default_llseek, +}; + +static int +salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned int i, cpu = (unsigned long)hcpu; + unsigned long flags; + struct salinfo_data *data; + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + spin_lock_irqsave(&data_saved_lock, flags); + for (i = 0, data = salinfo_data; + i < ARRAY_SIZE(salinfo_data); + ++i, ++data) { + cpu_set(cpu, data->cpu_event); + salinfo_work_to_do(data); + } + spin_unlock_irqrestore(&data_saved_lock, flags); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + spin_lock_irqsave(&data_saved_lock, flags); + for (i = 0, data = salinfo_data; + i < ARRAY_SIZE(salinfo_data); + ++i, ++data) { + struct salinfo_data_saved *data_saved; + int j; + for (j = ARRAY_SIZE(data->data_saved) - 1, data_saved = data->data_saved + j; + j >= 0; + --j, --data_saved) { + if (data_saved->buffer && data_saved->cpu == cpu) { + shift1_data_saved(data, j); + } + } + cpu_clear(cpu, data->cpu_event); + } + spin_unlock_irqrestore(&data_saved_lock, flags); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block salinfo_cpu_notifier = +{ + .notifier_call = salinfo_cpu_callback, + .priority = 0, }; static int __init @@ -556,7 +622,7 @@ salinfo_init(void) struct proc_dir_entry **sdir = salinfo_proc_entries; /* keeps track of every entry */ struct proc_dir_entry *dir, *entry; struct salinfo_data *data; - int i, j, online; + int i, j; salinfo_dir = proc_mkdir("sal", NULL); if (!salinfo_dir) @@ -564,39 +630,36 @@ salinfo_init(void) for (i=0; i < NR_SALINFO_ENTRIES; i++) { /* pass the feature bit in question as misc data */ - *sdir++ = create_proc_read_entry (salinfo_entries[i].name, 0, salinfo_dir, - salinfo_read, (void *)salinfo_entries[i].feature); + *sdir++ = proc_create_data(salinfo_entries[i].name, 0, salinfo_dir, + &proc_salinfo_fops, + (void *)salinfo_entries[i].feature); } + cpu_notifier_register_begin(); + for (i = 0; i < ARRAY_SIZE(salinfo_log_name); i++) { data = salinfo_data + i; data->type = i; - sema_init(&data->sem, 0); + sema_init(&data->mutex, 1); dir = proc_mkdir(salinfo_log_name[i], salinfo_dir); if (!dir) continue; - entry = create_proc_entry("event", S_IRUSR, dir); + entry = proc_create_data("event", S_IRUSR, dir, + &salinfo_event_fops, data); if (!entry) continue; - entry->data = data; - entry->proc_fops = &salinfo_event_fops; *sdir++ = entry; - entry = create_proc_entry("data", S_IRUSR | S_IWUSR, dir); + entry = proc_create_data("data", S_IRUSR | S_IWUSR, dir, + &salinfo_data_fops, data); if (!entry) continue; - entry->data = data; - entry->proc_fops = &salinfo_data_fops; *sdir++ = entry; /* we missed any events before now */ - online = 0; - for_each_online_cpu(j) { - set_bit(j, &data->cpu_event); - ++online; - } - sema_init(&data->sem, online); + for_each_online_cpu(j) + cpu_set(j, data->cpu_event); *sdir++ = dir; } @@ -608,6 +671,10 @@ salinfo_init(void) salinfo_timer.function = &salinfo_timeout; add_timer(&salinfo_timer); + __register_hotcpu_notifier(&salinfo_cpu_notifier); + + cpu_notifier_register_done(); + return 0; } @@ -615,22 +682,23 @@ salinfo_init(void) * 'data' contains an integer that corresponds to the feature we're * testing */ -static int -salinfo_read(char *page, char **start, off_t off, int count, int *eof, void *data) +static int proc_salinfo_show(struct seq_file *m, void *v) { - int len = 0; - - len = sprintf(page, (sal_platform_features & (unsigned long)data) ? "1\n" : "0\n"); - - if (len <= off+count) *eof = 1; - - *start = page + off; - len -= off; - - if (len>count) len = count; - if (len<0) len = 0; + unsigned long data = (unsigned long)v; + seq_puts(m, (sal_platform_features & data) ? "1\n" : "0\n"); + return 0; +} - return len; +static int proc_salinfo_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_salinfo_show, PDE_DATA(inode)); } +static const struct file_operations proc_salinfo_fops = { + .open = proc_salinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + module_init(salinfo_init); diff --git a/arch/ia64/kernel/semaphore.c b/arch/ia64/kernel/semaphore.c deleted file mode 100644 index 2724ef3fbae..00000000000 --- a/arch/ia64/kernel/semaphore.c +++ /dev/null @@ -1,165 +0,0 @@ -/* - * IA-64 semaphore implementation (derived from x86 version). - * - * Copyright (C) 1999-2000, 2002 Hewlett-Packard Co - * David Mosberger-Tang <davidm@hpl.hp.com> - */ - -/* - * Semaphores are implemented using a two-way counter: The "count" - * variable is decremented for each process that tries to acquire the - * semaphore, while the "sleepers" variable is a count of such - * acquires. - * - * Notably, the inline "up()" and "down()" functions can efficiently - * test if they need to do any extra work (up needs to do something - * only if count was negative before the increment operation. - * - * "sleeping" and the contention routine ordering is protected - * by the spinlock in the semaphore's waitqueue head. - * - * Note that these functions are only called when there is contention - * on the lock, and as such all this is the "non-critical" part of the - * whole semaphore business. The critical part is the inline stuff in - * <asm/semaphore.h> where we want to avoid any extra jumps and calls. - */ -#include <linux/sched.h> -#include <linux/init.h> - -#include <asm/errno.h> -#include <asm/semaphore.h> - -/* - * Logic: - * - Only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - When we go from a non-negative count to a negative do we - * (a) synchronize with the "sleepers" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. - */ - -void -__up (struct semaphore *sem) -{ - wake_up(&sem->wait); -} - -void __sched __down (struct semaphore *sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * the wait_queue_head. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_UNINTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - tsk->state = TASK_RUNNING; -} - -int __sched __down_interruptible (struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers ++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into - * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as - * it has contention. Just correct the count - * and exit. - */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * wait_queue_head. The "-1" is because we're - * still hoping to get the semaphore. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_INTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - - tsk->state = TASK_RUNNING; - return retval; -} - -/* - * Trylock failed - make sure we correct for having decremented the - * count. - */ -int -__down_trylock (struct semaphore *sem) -{ - unsigned long flags; - int sleepers; - - spin_lock_irqsave(&sem->wait.lock, flags); - sleepers = sem->sleepers + 1; - sem->sleepers = 0; - - /* - * Add "everybody else" and us into it. They aren't - * playing, because we own the spinlock in the - * wait_queue_head. - */ - if (!atomic_add_negative(sleepers, &sem->count)) { - wake_up_locked(&sem->wait); - } - - spin_unlock_irqrestore(&sem->wait.lock, flags); - return 1; -} diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 3af6de36a48..d86669bcdfb 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -22,7 +22,6 @@ * 06/24/99 W.Drummond added boot_cpu_data. * 05/28/05 Z. Menyhart Dynamic stride size for "flush_icache_range()" */ -#include <linux/config.h> #include <linux/module.h> #include <linux/init.h> @@ -36,29 +35,33 @@ #include <linux/seq_file.h> #include <linux/string.h> #include <linux/threads.h> -#include <linux/tty.h> +#include <linux/screen_info.h> +#include <linux/dmi.h> #include <linux/serial.h> #include <linux/serial_core.h> #include <linux/efi.h> #include <linux/initrd.h> -#include <linux/platform.h> #include <linux/pm.h> +#include <linux/cpufreq.h> +#include <linux/kexec.h> +#include <linux/crash_dump.h> -#include <asm/ia32.h> #include <asm/machvec.h> #include <asm/mca.h> #include <asm/meminit.h> #include <asm/page.h> +#include <asm/paravirt.h> +#include <asm/paravirt_patch.h> #include <asm/patch.h> #include <asm/pgtable.h> #include <asm/processor.h> #include <asm/sal.h> #include <asm/sections.h> -#include <asm/serial.h> #include <asm/setup.h> #include <asm/smp.h> -#include <asm/system.h> +#include <asm/tlbflush.h> #include <asm/unistd.h> +#include <asm/hpsim.h> #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE) # error "struct cpuinfo_ia64 too big!" @@ -69,9 +72,8 @@ unsigned long __per_cpu_offset[NR_CPUS]; EXPORT_SYMBOL(__per_cpu_offset); #endif -DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info); +DEFINE_PER_CPU(struct cpuinfo_ia64, ia64_cpu_info); DEFINE_PER_CPU(unsigned long, local_per_cpu_offset); -DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8); unsigned long ia64_cycles_per_usec; struct ia64_boot_param *ia64_boot_param; struct screen_info screen_info; @@ -87,17 +89,13 @@ static struct resource code_resource = { .name = "Kernel code", .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; -extern void efi_initialize_iomem_resources(struct resource *, - struct resource *); -extern char _text[], _end[], _etext[]; -unsigned long ia64_max_cacheline_size; +static struct resource bss_resource = { + .name = "Kernel bss", + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; -int dma_get_cache_alignment(void) -{ - return ia64_max_cacheline_size; -} -EXPORT_SYMBOL(dma_get_cache_alignment); +unsigned long ia64_max_cacheline_size; unsigned long ia64_iobase; /* virtual address for I/O accesses */ EXPORT_SYMBOL(ia64_iobase); @@ -111,6 +109,13 @@ unsigned int num_io_spaces; */ #define I_CACHE_STRIDE_SHIFT 5 /* Safest way to go: 32 bytes by 32 bytes */ unsigned long ia64_i_cache_stride_shift = ~0; +/* + * "clflush_cache_range()" needs to know what processor dependent stride size to + * use when it flushes cache lines including both d-cache and i-cache. + */ +/* Safest way to go: 32 bytes by 32 bytes */ +#define CACHE_STRIDE_SHIFT 5 +unsigned long ia64_cache_stride_shift = ~0; /* * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1). This @@ -127,8 +132,8 @@ EXPORT_SYMBOL(ia64_max_iommu_merge_mask); /* * We use a special marker for the end of memory and it uses the extra (+1) slot */ -struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1]; -int num_rsvd_regions; +struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1] __initdata; +int num_rsvd_regions __initdata; /* @@ -137,10 +142,10 @@ int num_rsvd_regions; * caller-specified function is called with the memory ranges that remain after filtering. * This routine does not assume the incoming segments are sorted. */ -int -filter_rsvd_memory (unsigned long start, unsigned long end, void *arg) +int __init +filter_rsvd_memory (u64 start, u64 end, void *arg) { - unsigned long range_start, range_end, prev_start; + u64 range_start, range_end, prev_start; void (*func)(unsigned long, unsigned long, int); int i; @@ -173,7 +178,30 @@ filter_rsvd_memory (unsigned long start, unsigned long end, void *arg) return 0; } -static void +/* + * Similar to "filter_rsvd_memory()", but the reserved memory ranges + * are not filtered out. + */ +int __init +filter_memory(u64 start, u64 end, void *arg) +{ + void (*func)(unsigned long, unsigned long, int); + +#if IGNORE_PFN0 + if (start == PAGE_OFFSET) { + printk(KERN_WARNING "warning: skipping physical page 0\n"); + start += PAGE_SIZE; + if (start >= end) + return 0; + } +#endif + func = arg; + if (start < end) + call_pernode_memory(__pa(start), end - start, func); + return 0; +} + +static void __init sort_regions (struct rsvd_region *rsvd_region, int max) { int j; @@ -191,6 +219,23 @@ sort_regions (struct rsvd_region *rsvd_region, int max) } } +/* merge overlaps */ +static int __init +merge_regions (struct rsvd_region *rsvd_region, int max) +{ + int i; + for (i = 1; i < max; ++i) { + if (rsvd_region[i].start >= rsvd_region[i-1].end) + continue; + if (rsvd_region[i].end > rsvd_region[i-1].end) + rsvd_region[i-1].end = rsvd_region[i].end; + --max; + memmove(&rsvd_region[i], &rsvd_region[i+1], + (max - i) * sizeof(struct rsvd_region)); + } + return max; +} + /* * Request address space for all standard resources */ @@ -199,25 +244,101 @@ static int __init register_memory(void) code_resource.start = ia64_tpa(_text); code_resource.end = ia64_tpa(_etext) - 1; data_resource.start = ia64_tpa(_etext); - data_resource.end = ia64_tpa(_end) - 1; - efi_initialize_iomem_resources(&code_resource, &data_resource); + data_resource.end = ia64_tpa(_edata) - 1; + bss_resource.start = ia64_tpa(__bss_start); + bss_resource.end = ia64_tpa(_end) - 1; + efi_initialize_iomem_resources(&code_resource, &data_resource, + &bss_resource); return 0; } __initcall(register_memory); + +#ifdef CONFIG_KEXEC + +/* + * This function checks if the reserved crashkernel is allowed on the specific + * IA64 machine flavour. Machines without an IO TLB use swiotlb and require + * some memory below 4 GB (i.e. in 32 bit area), see the implementation of + * lib/swiotlb.c. The hpzx1 architecture has an IO TLB but cannot use that + * in kdump case. See the comment in sba_init() in sba_iommu.c. + * + * So, the only machvec that really supports loading the kdump kernel + * over 4 GB is "sn2". + */ +static int __init check_crashkernel_memory(unsigned long pbase, size_t size) +{ + if (ia64_platform_is("sn2") || ia64_platform_is("uv")) + return 1; + else + return pbase < (1UL << 32); +} + +static void __init setup_crashkernel(unsigned long total, int *n) +{ + unsigned long long base = 0, size = 0; + int ret; + + ret = parse_crashkernel(boot_command_line, total, + &size, &base); + if (ret == 0 && size > 0) { + if (!base) { + sort_regions(rsvd_region, *n); + *n = merge_regions(rsvd_region, *n); + base = kdump_find_rsvd_region(size, + rsvd_region, *n); + } + + if (!check_crashkernel_memory(base, size)) { + pr_warning("crashkernel: There would be kdump memory " + "at %ld GB but this is unusable because it " + "must\nbe below 4 GB. Change the memory " + "configuration of the machine.\n", + (unsigned long)(base >> 30)); + return; + } + + if (base != ~0UL) { + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(size >> 20), + (unsigned long)(base >> 20), + (unsigned long)(total >> 20)); + rsvd_region[*n].start = + (unsigned long)__va(base); + rsvd_region[*n].end = + (unsigned long)__va(base + size); + (*n)++; + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + } + efi_memmap_res.start = ia64_boot_param->efi_memmap; + efi_memmap_res.end = efi_memmap_res.start + + ia64_boot_param->efi_memmap_size; + boot_param_res.start = __pa(ia64_boot_param); + boot_param_res.end = boot_param_res.start + + sizeof(*ia64_boot_param); +} +#else +static inline void __init setup_crashkernel(unsigned long total, int *n) +{} +#endif + /** * reserve_memory - setup reserved memory areas * * Setup the reserved memory areas set aside for the boot parameters, * initrd, etc. There are currently %IA64_MAX_RSVD_REGIONS defined, - * see include/asm-ia64/meminit.h if you need to define more. + * see arch/ia64/include/asm/meminit.h if you need to define more. */ -void +void __init reserve_memory (void) { int n = 0; + unsigned long total_memory; /* * none of the entries in this table overlap @@ -239,6 +360,8 @@ reserve_memory (void) rsvd_region[n].end = (unsigned long) ia64_imva(_end); n++; + n += paravirt_reserve_memory(&rsvd_region[n]); + #ifdef CONFIG_BLK_DEV_INITRD if (ia64_boot_param->initrd_start) { rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start); @@ -247,26 +370,37 @@ reserve_memory (void) } #endif - efi_memmap_init(&rsvd_region[n].start, &rsvd_region[n].end); +#ifdef CONFIG_CRASH_DUMP + if (reserve_elfcorehdr(&rsvd_region[n].start, + &rsvd_region[n].end) == 0) + n++; +#endif + + total_memory = efi_memmap_init(&rsvd_region[n].start, &rsvd_region[n].end); n++; + setup_crashkernel(total_memory, &n); + /* end of memory marker */ rsvd_region[n].start = ~0UL; rsvd_region[n].end = ~0UL; n++; num_rsvd_regions = n; + BUG_ON(IA64_MAX_RSVD_REGIONS + 1 < n); sort_regions(rsvd_region, num_rsvd_regions); + num_rsvd_regions = merge_regions(rsvd_region, num_rsvd_regions); } + /** * find_initrd - get initrd parameters from the boot parameter structure * * Grab the initrd start and end from the boot parameter struct given us by * the boot loader. */ -void +void __init find_initrd (void) { #ifdef CONFIG_BLK_DEV_INITRD @@ -274,7 +408,7 @@ find_initrd (void) initrd_start = (unsigned long)__va(ia64_boot_param->initrd_start); initrd_end = initrd_start+ia64_boot_param->initrd_size; - printk(KERN_INFO "Initial ramdisk at: 0x%lx (%lu bytes)\n", + printk(KERN_INFO "Initial ramdisk at: 0x%lx (%llu bytes)\n", initrd_start, ia64_boot_param->initrd_size); } #endif @@ -340,10 +474,8 @@ early_console_setup (char *cmdline) if (!efi_setup_pcdp_console(cmdline)) earlycons++; #endif -#ifdef CONFIG_SERIAL_8250_CONSOLE - if (!early_serial_console_init(cmdline)) + if (!simcons_register()) earlycons++; -#endif return (earlycons) ? 0 : -1; } @@ -353,118 +485,117 @@ mark_bsp_online (void) { #ifdef CONFIG_SMP /* If we register an early console, allow CPU 0 to printk */ - cpu_set(smp_processor_id(), cpu_online_map); + set_cpu_online(smp_processor_id(), true); #endif } -#ifdef CONFIG_SMP -static void -check_for_logical_procs (void) +static __initdata int nomca; +static __init int setup_nomca(char *s) { - pal_logical_to_physical_t info; - s64 status; - - status = ia64_pal_logical_to_phys(0, &info); - if (status == -1) { - printk(KERN_INFO "No logical to physical processor mapping " - "available\n"); - return; - } - if (status) { - printk(KERN_ERR "ia64_pal_logical_to_phys failed with %ld\n", - status); - return; - } - /* - * Total number of siblings that BSP has. Though not all of them - * may have booted successfully. The correct number of siblings - * booted is in info.overview_num_log. + nomca = 1; + return 0; +} +early_param("nomca", setup_nomca); + +#ifdef CONFIG_CRASH_DUMP +int __init reserve_elfcorehdr(u64 *start, u64 *end) +{ + u64 length; + + /* We get the address using the kernel command line, + * but the size is extracted from the EFI tables. + * Both address and size are required for reservation + * to work properly. */ - smp_num_siblings = info.overview_tpc; - smp_num_cpucores = info.overview_cpp; + + if (!is_vmcore_usable()) + return -EINVAL; + + if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) { + vmcore_unusable(); + return -EINVAL; + } + + *start = (unsigned long)__va(elfcorehdr_addr); + *end = *start + length; + return 0; } -#endif + +#endif /* CONFIG_PROC_VMCORE */ void __init setup_arch (char **cmdline_p) { unw_init(); + paravirt_arch_setup_early(); + ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist); + paravirt_patch_apply(); *cmdline_p = __va(ia64_boot_param->command_line); - strlcpy(saved_command_line, *cmdline_p, COMMAND_LINE_SIZE); + strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE); efi_init(); io_port_init(); #ifdef CONFIG_IA64_GENERIC - { - const char *mvec_name = strstr (*cmdline_p, "machvec="); - char str[64]; - - if (mvec_name) { - const char *end; - size_t len; - - mvec_name += 8; - end = strchr (mvec_name, ' '); - if (end) - len = end - mvec_name; - else - len = strlen (mvec_name); - len = min(len, sizeof (str) - 1); - strncpy (str, mvec_name, len); - str[len] = '\0'; - mvec_name = str; - } else - mvec_name = acpi_get_sysname(); - machvec_init(mvec_name); - } + /* machvec needs to be parsed from the command line + * before parse_early_param() is called to ensure + * that ia64_mv is initialised before any command line + * settings may cause console setup to occur + */ + machvec_init_from_cmdline(*cmdline_p); #endif + parse_early_param(); + if (early_console_setup(*cmdline_p) == 0) mark_bsp_online(); #ifdef CONFIG_ACPI /* Initialize the ACPI boot-time table parser */ acpi_table_init(); + early_acpi_boot_init(); # ifdef CONFIG_ACPI_NUMA acpi_numa_init(); -# endif -#else -# ifdef CONFIG_SMP - smp_build_cpu_map(); /* happens, e.g., with the Ski simulator */ +# ifdef CONFIG_ACPI_HOTPLUG_CPU + prefill_possible_map(); +# endif + per_cpu_scan_finalize((cpus_weight(early_cpu_possible_map) == 0 ? + 32 : cpus_weight(early_cpu_possible_map)), + additional_cpus > 0 ? additional_cpus : 0); # endif #endif /* CONFIG_APCI_BOOT */ +#ifdef CONFIG_SMP + smp_build_cpu_map(); +#endif find_memory(); /* process SAL system table: */ - ia64_sal_init(efi.sal_systab); + ia64_sal_init(__va(efi.sal_systab)); + +#ifdef CONFIG_ITANIUM + ia64_patch_rse((u64) __start___rse_patchlist, (u64) __end___rse_patchlist); +#else + { + unsigned long num_phys_stacked; + + if (ia64_pal_rse_info(&num_phys_stacked, 0) == 0 && num_phys_stacked > 96) + ia64_patch_rse((u64) __start___rse_patchlist, (u64) __end___rse_patchlist); + } +#endif #ifdef CONFIG_SMP cpu_physical_id(0) = hard_smp_processor_id(); - - cpu_set(0, cpu_sibling_map[0]); - cpu_set(0, cpu_core_map[0]); - - check_for_logical_procs(); - if (smp_num_cpucores > 1) - printk(KERN_INFO - "cpu package is Multi-Core capable: number of cores=%d\n", - smp_num_cpucores); - if (smp_num_siblings > 1) - printk(KERN_INFO - "cpu package is Multi-Threading capable: number of siblings=%d\n", - smp_num_siblings); #endif cpu_init(); /* initialize the bootstrap CPU */ + mmu_context_init(); /* initialize context_id bitmap */ -#ifdef CONFIG_ACPI - acpi_boot_init(); -#endif + paravirt_banner(); + paravirt_arch_setup_console(cmdline_p); #ifdef CONFIG_VT if (!conswitchp) { @@ -485,15 +616,20 @@ setup_arch (char **cmdline_p) #endif /* enable IA-64 Machine Check Abort Handling unless disabled */ - if (!strstr(saved_command_line, "nomca")) + if (paravirt_arch_setup_nomca()) + nomca = 1; + if (!nomca) ia64_mca_init(); platform_setup(cmdline_p); +#ifndef CONFIG_IA64_HP_SIM + check_sal_cache_flush(); +#endif paging_init(); } /* - * Display cpu info for all cpu's. + * Display cpu info for all CPUs. */ static int show_cpuinfo (struct seq_file *m, void *v) @@ -513,68 +649,67 @@ show_cpuinfo (struct seq_file *m, void *v) { 1UL << 1, "spontaneous deferral"}, { 1UL << 2, "16-byte atomic ops" } }; - char family[32], features[128], *cp, sep; + char features[128], *cp, *sep; struct cpuinfo_ia64 *c = v; unsigned long mask; - int i; + unsigned long proc_freq; + int i, size; mask = c->features; - switch (c->family) { - case 0x07: memcpy(family, "Itanium", 8); break; - case 0x1f: memcpy(family, "Itanium 2", 10); break; - default: sprintf(family, "%u", c->family); break; - } - /* build the feature string: */ - memcpy(features, " standard", 10); + memcpy(features, "standard", 9); cp = features; - sep = 0; - for (i = 0; i < (int) ARRAY_SIZE(feature_bits); ++i) { + size = sizeof(features); + sep = ""; + for (i = 0; i < ARRAY_SIZE(feature_bits) && size > 1; ++i) { if (mask & feature_bits[i].mask) { - if (sep) - *cp++ = sep; - sep = ','; - *cp++ = ' '; - strcpy(cp, feature_bits[i].feature_name); - cp += strlen(feature_bits[i].feature_name); + cp += snprintf(cp, size, "%s%s", sep, + feature_bits[i].feature_name), + sep = ", "; mask &= ~feature_bits[i].mask; + size = sizeof(features) - (cp - features); } } - if (mask) { - /* print unknown features as a hex value: */ - if (sep) - *cp++ = sep; - sprintf(cp, " 0x%lx", mask); + if (mask && size > 1) { + /* print unknown features as a hex value */ + snprintf(cp, size, "%s0x%lx", sep, mask); } + proc_freq = cpufreq_quick_get(cpunum); + if (!proc_freq) + proc_freq = c->proc_freq / 1000; + seq_printf(m, "processor : %d\n" "vendor : %s\n" "arch : IA-64\n" - "family : %s\n" + "family : %u\n" "model : %u\n" + "model name : %s\n" "revision : %u\n" "archrev : %u\n" - "features :%s\n" /* don't change this---it _is_ right! */ + "features : %s\n" "cpu number : %lu\n" "cpu regs : %u\n" - "cpu MHz : %lu.%06lu\n" + "cpu MHz : %lu.%03lu\n" "itc MHz : %lu.%06lu\n" "BogoMIPS : %lu.%02lu\n", - cpunum, c->vendor, family, c->model, c->revision, c->archrev, + cpunum, c->vendor, c->family, c->model, + c->model_name, c->revision, c->archrev, features, c->ppn, c->number, - c->proc_freq / 1000000, c->proc_freq % 1000000, + proc_freq / 1000, proc_freq % 1000, c->itc_freq / 1000000, c->itc_freq % 1000000, lpj*HZ/500000, (lpj*HZ/5000) % 100); #ifdef CONFIG_SMP seq_printf(m, "siblings : %u\n", cpus_weight(cpu_core_map[cpunum])); + if (c->socket_id != -1) + seq_printf(m, "physical id: %u\n", c->socket_id); if (c->threads_per_core > 1 || c->cores_per_socket > 1) seq_printf(m, - "physical id: %u\n" - "core id : %u\n" - "thread id : %u\n", - c->socket_id, c->core_id, c->thread_id); + "core id : %u\n" + "thread id : %u\n", + c->core_id, c->thread_id); #endif seq_printf(m,"\n"); @@ -585,10 +720,10 @@ static void * c_start (struct seq_file *m, loff_t *pos) { #ifdef CONFIG_SMP - while (*pos < NR_CPUS && !cpu_isset(*pos, cpu_online_map)) + while (*pos < nr_cpu_ids && !cpu_online(*pos)) ++*pos; #endif - return *pos < NR_CPUS ? cpu_data(*pos) : NULL; + return *pos < nr_cpu_ids ? cpu_data(*pos) : NULL; } static void * @@ -603,14 +738,47 @@ c_stop (struct seq_file *m, void *v) { } -struct seq_operations cpuinfo_op = { +const struct seq_operations cpuinfo_op = { .start = c_start, .next = c_next, .stop = c_stop, .show = show_cpuinfo }; -void +#define MAX_BRANDS 8 +static char brandname[MAX_BRANDS][128]; + +static char * +get_model_name(__u8 family, __u8 model) +{ + static int overflow; + char brand[128]; + int i; + + memcpy(brand, "Unknown", 8); + if (ia64_pal_get_brand_info(brand)) { + if (family == 0x7) + memcpy(brand, "Merced", 7); + else if (family == 0x1f) switch (model) { + case 0: memcpy(brand, "McKinley", 9); break; + case 1: memcpy(brand, "Madison", 8); break; + case 2: memcpy(brand, "Madison up to 9M cache", 23); break; + } + } + for (i = 0; i < MAX_BRANDS; i++) + if (strcmp(brandname[i], brand) == 0) + return brandname[i]; + for (i = 0; i < MAX_BRANDS; i++) + if (brandname[i][0] == '\0') + return strcpy(brandname[i], brand); + if (overflow++ == 0) + printk(KERN_ERR + "%s: Table overflow. Some processor model information will be missing\n", + __func__); + return "Unknown"; +} + +static void identify_cpu (struct cpuinfo_ia64 *c) { union { @@ -639,7 +807,6 @@ identify_cpu (struct cpuinfo_ia64 *c) pal_status_t status; unsigned long impl_va_msb = 50, phys_addr_size = 44; /* Itanium defaults */ int i; - for (i = 0; i < 5; ++i) cpuid.bits[i] = ia64_get_cpuid(i); @@ -648,12 +815,15 @@ identify_cpu (struct cpuinfo_ia64 *c) c->cpu = smp_processor_id(); /* below default values will be overwritten by identify_siblings() - * for Multi-Threading/Multi-Core capable cpu's + * for Multi-Threading/Multi-Core capable CPUs */ c->threads_per_core = c->cores_per_socket = c->num_log = 1; c->socket_id = -1; identify_siblings(c); + + if (c->threads_per_core > smp_num_siblings) + smp_num_siblings = c->threads_per_core; #endif c->ppn = cpuid.field.ppn; c->number = cpuid.field.number; @@ -662,6 +832,7 @@ identify_cpu (struct cpuinfo_ia64 *c) c->family = cpuid.field.family; c->archrev = cpuid.field.archrev; c->features = cpuid.field.features; + c->model_name = get_model_name(c->family, c->model); status = ia64_pal_vm_summary(&vm1, &vm2); if (status == PAL_STATUS_SUCCESS) { @@ -672,60 +843,63 @@ identify_cpu (struct cpuinfo_ia64 *c) c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1)); } -void -setup_per_cpu_areas (void) -{ - /* start_kernel() requires this... */ -} - /* - * Calculate the max. cache line size. + * Do the following calculations: * - * In addition, the minimum of the i-cache stride sizes is calculated for - * "flush_icache_range()". + * 1. the max. cache line size. + * 2. the minimum of the i-cache stride sizes for "flush_icache_range()". + * 3. the minimum of the cache stride sizes for "clflush_cache_range()". */ static void -get_max_cacheline_size (void) +get_cache_info(void) { unsigned long line_size, max = 1; - u64 l, levels, unique_caches; - pal_cache_config_info_t cci; - s64 status; + unsigned long l, levels, unique_caches; + pal_cache_config_info_t cci; + long status; status = ia64_pal_cache_summary(&levels, &unique_caches); if (status != 0) { printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n", - __FUNCTION__, status); + __func__, status); max = SMP_CACHE_BYTES; /* Safest setup for "flush_icache_range()" */ ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT; + /* Safest setup for "clflush_cache_range()" */ + ia64_cache_stride_shift = CACHE_STRIDE_SHIFT; goto out; } for (l = 0; l < levels; ++l) { - status = ia64_pal_cache_config_info(l, /* cache_type (data_or_unified)= */ 2, - &cci); + /* cache_type (data_or_unified)=2 */ + status = ia64_pal_cache_config_info(l, 2, &cci); if (status != 0) { - printk(KERN_ERR - "%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n", - __FUNCTION__, l, status); + printk(KERN_ERR "%s: ia64_pal_cache_config_info" + "(l=%lu, 2) failed (status=%ld)\n", + __func__, l, status); max = SMP_CACHE_BYTES; /* The safest setup for "flush_icache_range()" */ cci.pcci_stride = I_CACHE_STRIDE_SHIFT; + /* The safest setup for "clflush_cache_range()" */ + ia64_cache_stride_shift = CACHE_STRIDE_SHIFT; cci.pcci_unified = 1; + } else { + if (cci.pcci_stride < ia64_cache_stride_shift) + ia64_cache_stride_shift = cci.pcci_stride; + + line_size = 1 << cci.pcci_line_size; + if (line_size > max) + max = line_size; } - line_size = 1 << cci.pcci_line_size; - if (line_size > max) - max = line_size; + if (!cci.pcci_unified) { - status = ia64_pal_cache_config_info(l, - /* cache_type (instruction)= */ 1, - &cci); + /* cache_type (instruction)=1*/ + status = ia64_pal_cache_config_info(l, 1, &cci); if (status != 0) { - printk(KERN_ERR - "%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n", - __FUNCTION__, l, status); - /* The safest setup for "flush_icache_range()" */ + printk(KERN_ERR "%s: ia64_pal_cache_config_info" + "(l=%lu, 1) failed (status=%ld)\n", + __func__, l, status); + /* The safest setup for flush_icache_range() */ cci.pcci_stride = I_CACHE_STRIDE_SHIFT; } } @@ -744,7 +918,8 @@ get_max_cacheline_size (void) void cpu_init (void) { - extern void __devinit ia64_mmu_init (void *); + extern void ia64_mmu_init(void *); + static unsigned long max_num_phys_stacked = IA64_NUM_PHYS_STACK_REG; unsigned long num_phys_stacked; pal_vm_info_2_u_t vmi; unsigned int max_ctx; @@ -752,16 +927,28 @@ cpu_init (void) void *cpu_data; cpu_data = per_cpu_init(); - +#ifdef CONFIG_SMP /* - * We set ar.k3 so that assembly code in MCA handler can compute - * physical addresses of per cpu variables with a simple: - * phys = ar.k3 + &per_cpu_var + * insert boot cpu into sibling and core mapes + * (must be done after per_cpu area is setup) */ - ia64_set_kr(IA64_KR_PER_CPU_DATA, - ia64_tpa(cpu_data) - (long) __per_cpu_start); + if (smp_processor_id() == 0) { + cpu_set(0, per_cpu(cpu_sibling_map, 0)); + cpu_set(0, cpu_core_map[0]); + } else { + /* + * Set ar.k3 so that assembly code in MCA handler can compute + * physical addresses of per cpu variables with a simple: + * phys = ar.k3 + &per_cpu_var + * and the alt-dtlb-miss handler can set per-cpu mapping into + * the TLB when needed. head.S already did this for cpu0. + */ + ia64_set_kr(IA64_KR_PER_CPU_DATA, + ia64_tpa(cpu_data) - (long) __per_cpu_start); + } +#endif - get_max_cacheline_size(); + get_cache_info(); /* * We can't pass "local_cpu_data" to identify_cpu() because we haven't called @@ -769,7 +956,7 @@ cpu_init (void) * depends on the data returned by identify_cpu(). We break the dependency by * accessing cpu_data() through the canonical per-CPU address. */ - cpu_info = cpu_data + ((char *) &__ia64_per_cpu_var(cpu_info) - __per_cpu_start); + cpu_info = cpu_data + ((char *) &__ia64_per_cpu_var(ia64_cpu_info) - __per_cpu_start); identify_cpu(cpu_info); #ifdef CONFIG_MCKINLEY @@ -787,7 +974,7 @@ cpu_init (void) #endif /* Clear the stack memory reserved for pt_regs: */ - memset(ia64_task_regs(current), 0, sizeof(struct pt_regs)); + memset(task_pt_regs(current), 0, sizeof(struct pt_regs)); ia64_set_kr(IA64_KR_FPU_OWNER, 0); @@ -813,17 +1000,12 @@ cpu_init (void) | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC)); atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; - if (current->mm) - BUG(); + BUG_ON(current->mm); ia64_mmu_init(ia64_imva(cpu_data)); ia64_mca_cpu_init(ia64_imva(cpu_data)); -#ifdef CONFIG_IA32_SUPPORT - ia32_cpu_init(); -#endif - - /* Clear ITC to eliminiate sched_clock() overflows in human time. */ + /* Clear ITC to eliminate sched_clock() overflows in human time. */ ia64_set_itc(0); /* disable all local interrupt sources: */ @@ -835,14 +1017,20 @@ cpu_init (void) /* clear TPR & XTP to enable all interrupt classes: */ ia64_setreg(_IA64_REG_CR_TPR, 0); + + /* Clear any pending interrupts left by SAL/EFI */ + while (ia64_get_ivr() != IA64_SPURIOUS_INT_VECTOR) + ia64_eoi(); + #ifdef CONFIG_SMP normal_xtp(); #endif /* set ia64_ctx.max_rid to the maximum RID that is supported by all CPUs: */ - if (ia64_pal_vm_summary(NULL, &vmi) == 0) + if (ia64_pal_vm_summary(NULL, &vmi) == 0) { max_ctx = (1U << (vmi.pal_vm_info_2_s.rid_size - 3)) - 1; - else { + setup_ptcg_sem(vmi.pal_vm_info_2_s.max_purges, NPTCG_FROM_PAL); + } else { printk(KERN_WARNING "cpu_init: PAL VM summary failed, assuming 18 RID bits\n"); max_ctx = (1U << 15) - 1; /* use architected minimum */ } @@ -858,14 +1046,25 @@ cpu_init (void) num_phys_stacked = 96; } /* size of physical stacked register partition plus 8 bytes: */ - __get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8; + if (num_phys_stacked > max_num_phys_stacked) { + ia64_patch_phys_stack_reg(num_phys_stacked*8 + 8); + max_num_phys_stacked = num_phys_stacked; + } platform_cpu_init(); - pm_idle = default_idle; } -void +void __init check_bugs (void) { ia64_patch_mckinley_e9((unsigned long) __start___mckinley_e9_bundles, (unsigned long) __end___mckinley_e9_bundles); } + +static int __init run_dmi_scan(void) +{ + dmi_scan_machine(); + dmi_memdev_walk(); + dmi_set_dump_stack_arch_desc(); + return 0; +} +core_initcall(run_dmi_scan); diff --git a/arch/ia64/kernel/sigframe.h b/arch/ia64/kernel/sigframe.h index 37b986cb86e..9fd9a1933b3 100644 --- a/arch/ia64/kernel/sigframe.h +++ b/arch/ia64/kernel/sigframe.h @@ -22,4 +22,4 @@ struct sigframe { struct sigcontext sc; }; -extern long ia64_do_signal (sigset_t *, struct sigscratch *, long); +extern void ia64_do_signal (struct sigscratch *, long); diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 774f34b675c..33cab9a8adf 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -7,22 +7,20 @@ * Derived from i386 and Alpha versions. */ -#include <linux/config.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/ptrace.h> +#include <linux/tracehook.h> #include <linux/sched.h> #include <linux/signal.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/stddef.h> #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/unistd.h> #include <linux/wait.h> -#include <asm/ia32.h> #include <asm/intrinsics.h> #include <asm/uaccess.h> #include <asm/rse.h> @@ -32,7 +30,6 @@ #define DEBUG_SIG 0 #define STACK_ALIGN 16 /* minimal alignment for stack pointer */ -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) #if _NSIG_WORDS > 1 # define PUT_SIGSET(k,u) __copy_to_user((u)->sig, (k)->sig, sizeof(sigset_t)) @@ -42,55 +39,6 @@ # define GET_SIGSET(k,u) __get_user((k)->sig[0], &(u)->sig[0]) #endif -long -ia64_rt_sigsuspend (sigset_t __user *uset, size_t sigsetsize, struct sigscratch *scr) -{ - sigset_t oldset, set; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (!access_ok(VERIFY_READ, uset, sigsetsize)) - return -EFAULT; - - if (GET_SIGSET(&set, uset)) - return -EFAULT; - - sigdelsetmask(&set, ~_BLOCKABLE); - - spin_lock_irq(¤t->sighand->siglock); - { - oldset = current->blocked; - current->blocked = set; - recalc_sigpending(); - } - spin_unlock_irq(¤t->sighand->siglock); - - /* - * The return below usually returns to the signal handler. We need to - * pre-set the correct error code here to ensure that the right values - * get saved in sigcontext by ia64_do_signal. - */ - scr->pt.r8 = EINTR; - scr->pt.r10 = -1; - - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (ia64_do_signal(&oldset, scr, 1)) - return -EINTR; - } -} - -asmlinkage long -sys_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, long arg2, - long arg3, long arg4, long arg5, long arg6, long arg7, - struct pt_regs regs) -{ - return do_sigaltstack(uss, uoss, regs.r12); -} - static long restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) { @@ -141,7 +89,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) if ((flags & IA64_SC_FLAG_FPH_VALID) != 0) { struct ia64_psr *psr = ia64_psr(&scr->pt); - __copy_from_user(current->thread.fph, &sc->sc_fr[32], 96*16); + err |= __copy_from_user(current->thread.fph, &sc->sc_fr[32], 96*16); psr->mfh = 0; /* drop signal handler's fph contents... */ preempt_disable(); if (psr->dfh) @@ -157,7 +105,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) } int -copy_siginfo_to_user (siginfo_t __user *to, siginfo_t *from) +copy_siginfo_to_user (siginfo_t __user *to, const siginfo_t *from) { if (!access_ok(VERIFY_WRITE, to, sizeof(siginfo_t))) return -EFAULT; @@ -243,14 +191,7 @@ ia64_rt_sigreturn (struct sigscratch *scr) if (GET_SIGSET(&set, &sc->sc_mask)) goto give_sigsegv; - sigdelsetmask(&set, ~_BLOCKABLE); - - spin_lock_irq(¤t->sighand->siglock); - { - current->blocked = set; - recalc_sigpending(); - } - spin_unlock_irq(¤t->sighand->siglock); + set_current_blocked(&set); if (restore_sigcontext(sc, scr)) goto give_sigsegv; @@ -259,19 +200,16 @@ ia64_rt_sigreturn (struct sigscratch *scr) printk("SIG return (%s:%d): sp=%lx ip=%lx\n", current->comm, current->pid, scr->pt.r12, scr->pt.cr_iip); #endif - /* - * It is more difficult to avoid calling this function than to - * call it and ignore errors. - */ - do_sigaltstack(&sc->sc_stack, NULL, scr->pt.r12); + if (restore_altstack(&sc->sc_stack)) + goto give_sigsegv; return retval; give_sigsegv: si.si_signo = SIGSEGV; si.si_errno = 0; si.si_code = SI_KERNEL; - si.si_pid = current->pid; - si.si_uid = current->uid; + si.si_pid = task_pid_vnr(current); + si.si_uid = from_kuid_munged(current_user_ns(), current_uid()); si.si_addr = sc; force_sig_info(SIGSEGV, &si, current); return retval; @@ -287,7 +225,7 @@ static long setup_sigcontext (struct sigcontext __user *sc, sigset_t *mask, struct sigscratch *scr) { unsigned long flags = 0, ifs, cfm, nat; - long err; + long err = 0; ifs = scr->pt.cr_ifs; @@ -300,12 +238,12 @@ setup_sigcontext (struct sigcontext __user *sc, sigset_t *mask, struct sigscratc ia64_flush_fph(current); if ((current->thread.flags & IA64_THREAD_FPH_VALID)) { flags |= IA64_SC_FLAG_FPH_VALID; - __copy_to_user(&sc->sc_fr[32], current->thread.fph, 96*16); + err = __copy_to_user(&sc->sc_fr[32], current->thread.fph, 96*16); } nat = ia64_get_scratch_nat_bits(&scr->pt, scr->scratch_unat); - err = __put_user(flags, &sc->sc_flags); + err |= __put_user(flags, &sc->sc_flags); err |= __put_user(nat, &sc->sc_nat); err |= PUT_SIGSET(mask, &sc->sc_mask); err |= __put_user(cfm, &sc->sc_cfm); @@ -323,15 +261,7 @@ setup_sigcontext (struct sigcontext __user *sc, sigset_t *mask, struct sigscratc err |= __copy_to_user(&sc->sc_gr[15], &scr->pt.r15, 8); /* r15 */ err |= __put_user(scr->pt.cr_iip + ia64_psr(&scr->pt)->ri, &sc->sc_ip); - if (flags & IA64_SC_FLAG_IN_SYSCALL) { - /* Clear scratch registers if the signal interrupted a system call. */ - err |= __put_user(0, &sc->sc_ar_ccv); /* ar.ccv */ - err |= __put_user(0, &sc->sc_br[7]); /* b7 */ - err |= __put_user(0, &sc->sc_gr[14]); /* r14 */ - err |= __clear_user(&sc->sc_ar25, 2*8); /* ar.csd & ar.ssd */ - err |= __clear_user(&sc->sc_gr[2], 2*8); /* r2-r3 */ - err |= __clear_user(&sc->sc_gr[16], 16*8); /* r16-r31 */ - } else { + if (!(flags & IA64_SC_FLAG_IN_SYSCALL)) { /* Copy scratch regs to sigcontext if the signal didn't interrupt a syscall. */ err |= __put_user(scr->pt.ar_ccv, &sc->sc_ar_ccv); /* ar.ccv */ err |= __put_user(scr->pt.b7, &sc->sc_br[7]); /* b7 */ @@ -375,8 +305,8 @@ force_sigsegv_info (int sig, void __user *addr) si.si_signo = SIGSEGV; si.si_errno = 0; si.si_code = SI_KERNEL; - si.si_pid = current->pid; - si.si_uid = current->uid; + si.si_pid = task_pid_vnr(current); + si.si_uid = from_kuid_munged(current_user_ns(), current_uid()); si.si_addr = addr; force_sig_info(SIGSEGV, &si, current); return 0; @@ -387,24 +317,41 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct sigscratch *scr) { extern char __kernel_sigtramp[]; - unsigned long tramp_addr, new_rbs = 0; + unsigned long tramp_addr, new_rbs = 0, new_sp; struct sigframe __user *frame; long err; - frame = (void __user *) scr->pt.r12; + new_sp = scr->pt.r12; tramp_addr = (unsigned long) __kernel_sigtramp; - if ((ka->sa.sa_flags & SA_ONSTACK) && sas_ss_flags((unsigned long) frame) == 0) { - frame = (void __user *) ((current->sas_ss_sp + current->sas_ss_size) - & ~(STACK_ALIGN - 1)); - /* - * We need to check for the register stack being on the signal stack - * separately, because it's switched separately (memory stack is switched - * in the kernel, register stack is switched in the signal trampoline). - */ - if (!rbs_on_sig_stack(scr->pt.ar_bspstore)) - new_rbs = (current->sas_ss_sp + sizeof(long) - 1) & ~(sizeof(long) - 1); + if (ka->sa.sa_flags & SA_ONSTACK) { + int onstack = sas_ss_flags(new_sp); + + if (onstack == 0) { + new_sp = current->sas_ss_sp + current->sas_ss_size; + /* + * We need to check for the register stack being on the + * signal stack separately, because it's switched + * separately (memory stack is switched in the kernel, + * register stack is switched in the signal trampoline). + */ + if (!rbs_on_sig_stack(scr->pt.ar_bspstore)) + new_rbs = ALIGN(current->sas_ss_sp, + sizeof(long)); + } else if (onstack == SS_ONSTACK) { + unsigned long check_sp; + + /* + * If we are on the alternate signal stack and would + * overflow it, don't. Return an always-bogus address + * instead so we will die with SIGSEGV. + */ + check_sp = (new_sp - sizeof(*frame)) & -STACK_ALIGN; + if (!likely(on_sig_stack(check_sp))) + return force_sigsegv_info(sig, (void __user *) + check_sp); + } } - frame = (void __user *) frame - ((sizeof(*frame) + STACK_ALIGN - 1) & ~(STACK_ALIGN - 1)); + frame = (void __user *) ((new_sp - sizeof(*frame)) & -STACK_ALIGN); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return force_sigsegv_info(sig, frame); @@ -418,9 +365,7 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, err |= copy_siginfo_to_user(&frame->info, info); - err |= __put_user(current->sas_ss_sp, &frame->sc.sc_stack.ss_sp); - err |= __put_user(current->sas_ss_size, &frame->sc.sc_stack.ss_size); - err |= __put_user(sas_ss_flags(scr->pt.r12), &frame->sc.sc_stack.ss_flags); + err |= __save_altstack(&frame->sc.sc_stack, scr->pt.r12); err |= setup_sigcontext(&frame->sc, set, scr); if (unlikely(err)) @@ -455,24 +400,15 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, } static long -handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, +handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, struct sigscratch *scr) { - if (IS_IA32_PROCESS(&scr->pt)) { - /* send signal to IA-32 process */ - if (!ia32_setup_frame1(sig, ka, info, oldset, &scr->pt)) - return 0; - } else - /* send signal to IA-64 process */ - if (!setup_frame(sig, ka, info, oldset, scr)) - return 0; - - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(¤t->blocked, sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + if (!setup_frame(sig, ka, info, sigmask_to_save(), scr)) + return 0; + + signal_delivered(sig, info, ka, &scr->pt, + test_thread_flag(TIF_SINGLESTEP)); + return 1; } @@ -480,25 +416,13 @@ handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigse * Note that `init' is a special process: it doesn't get signals it doesn't want to * handle. Thus you cannot kill init even with a SIGKILL even by mistake. */ -long -ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) +void +ia64_do_signal (struct sigscratch *scr, long in_syscall) { struct k_sigaction ka; siginfo_t info; long restart = in_syscall; long errno = scr->pt.r8; -# define ERR_CODE(c) (IS_IA32_PROCESS(&scr->pt) ? -(c) : (c)) - - /* - * In the ia64_leave_kernel code path, we want the common case to go fast, which - * is why we may in certain cases get here from kernel mode. Just return without - * doing anything if so. - */ - if (!user_mode(&scr->pt)) - return 0; - - if (!oldset) - oldset = ¤t->blocked; /* * This only loops in the rare cases of handle_signal() failing, in which case we @@ -513,14 +437,7 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) * inferior call), thus it's important to check for restarting _after_ * get_signal_to_deliver(). */ - if (IS_IA32_PROCESS(&scr->pt)) { - if (in_syscall) { - if (errno >= 0) - restart = 0; - else - errno = -errno; - } - } else if ((long) scr->pt.r10 != -1) + if ((long) scr->pt.r10 != -1) /* * A system calls has to be restarted only if one of the error codes * ERESTARTNOHAND, ERESTARTSYS, or ERESTARTNOINTR is returned. If r10 @@ -536,22 +453,18 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) switch (errno) { case ERESTART_RESTARTBLOCK: case ERESTARTNOHAND: - scr->pt.r8 = ERR_CODE(EINTR); + scr->pt.r8 = EINTR; /* note: scr->pt.r10 is already -1 */ break; case ERESTARTSYS: if ((ka.sa.sa_flags & SA_RESTART) == 0) { - scr->pt.r8 = ERR_CODE(EINTR); + scr->pt.r8 = EINTR; /* note: scr->pt.r10 is already -1 */ break; } case ERESTARTNOINTR: - if (IS_IA32_PROCESS(&scr->pt)) { - scr->pt.r8 = scr->pt.r1; - scr->pt.cr_iip -= 2; - } else - ia64_decrement_ip(&scr->pt); + ia64_decrement_ip(&scr->pt); restart = 0; /* don't restart twice if handle_signal() fails... */ } } @@ -560,8 +473,8 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) * Whee! Actually deliver the signal. If the delivery failed, we need to * continue to iterate in this loop so we can deliver the SIGSEGV... */ - if (handle_signal(signr, &ka, &info, oldset, scr)) - return 1; + if (handle_signal(signr, &ka, &info, scr)) + return; } /* Did we come from a system call? */ @@ -570,123 +483,18 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) if (errno == ERESTARTNOHAND || errno == ERESTARTSYS || errno == ERESTARTNOINTR || errno == ERESTART_RESTARTBLOCK) { - if (IS_IA32_PROCESS(&scr->pt)) { - scr->pt.r8 = scr->pt.r1; - scr->pt.cr_iip -= 2; - if (errno == ERESTART_RESTARTBLOCK) - scr->pt.r8 = 0; /* x86 version of __NR_restart_syscall */ - } else { - /* - * Note: the syscall number is in r15 which is saved in - * pt_regs so all we need to do here is adjust ip so that - * the "break" instruction gets re-executed. - */ - ia64_decrement_ip(&scr->pt); - if (errno == ERESTART_RESTARTBLOCK) - scr->pt.r15 = __NR_restart_syscall; - } - } - } - return 0; -} - -/* Set a delayed signal that was detected in MCA/INIT/NMI/PMI context where it - * could not be delivered. It is important that the target process is not - * allowed to do any more work in user space. Possible cases for the target - * process: - * - * - It is sleeping and will wake up soon. Store the data in the current task, - * the signal will be sent when the current task returns from the next - * interrupt. - * - * - It is running in user context. Store the data in the current task, the - * signal will be sent when the current task returns from the next interrupt. - * - * - It is running in kernel context on this or another cpu and will return to - * user context. Store the data in the target task, the signal will be sent - * to itself when the target task returns to user space. - * - * - It is running in kernel context on this cpu and will sleep before - * returning to user context. Because this is also the current task, the - * signal will not get delivered and the task could sleep indefinitely. - * Store the data in the idle task for this cpu, the signal will be sent - * after the idle task processes its next interrupt. - * - * To cover all cases, store the data in the target task, the current task and - * the idle task on this cpu. Whatever happens, the signal will be delivered - * to the target task before it can do any useful user space work. Multiple - * deliveries have no unwanted side effects. - * - * Note: This code is executed in MCA/INIT/NMI/PMI context, with interrupts - * disabled. It must not take any locks nor use kernel structures or services - * that require locks. - */ - -/* To ensure that we get the right pid, check its start time. To avoid extra - * include files in thread_info.h, convert the task start_time to unsigned long, - * giving us a cycle time of > 580 years. - */ -static inline unsigned long -start_time_ul(const struct task_struct *t) -{ - return t->start_time.tv_sec * NSEC_PER_SEC + t->start_time.tv_nsec; -} - -void -set_sigdelayed(pid_t pid, int signo, int code, void __user *addr) -{ - struct task_struct *t; - unsigned long start_time = 0; - int i; - - for (i = 1; i <= 3; ++i) { - switch (i) { - case 1: - t = find_task_by_pid(pid); - if (t) - start_time = start_time_ul(t); - break; - case 2: - t = current; - break; - default: - t = idle_task(smp_processor_id()); - break; + /* + * Note: the syscall number is in r15 which is saved in + * pt_regs so all we need to do here is adjust ip so that + * the "break" instruction gets re-executed. + */ + ia64_decrement_ip(&scr->pt); + if (errno == ERESTART_RESTARTBLOCK) + scr->pt.r15 = __NR_restart_syscall; } - - if (!t) - return; - t->thread_info->sigdelayed.signo = signo; - t->thread_info->sigdelayed.code = code; - t->thread_info->sigdelayed.addr = addr; - t->thread_info->sigdelayed.start_time = start_time; - t->thread_info->sigdelayed.pid = pid; - wmb(); - set_tsk_thread_flag(t, TIF_SIGDELAYED); } -} - -/* Called from entry.S when it detects TIF_SIGDELAYED, a delayed signal that - * was detected in MCA/INIT/NMI/PMI context where it could not be delivered. - */ -void -do_sigdelayed(void) -{ - struct siginfo siginfo; - pid_t pid; - struct task_struct *t; - - clear_thread_flag(TIF_SIGDELAYED); - memset(&siginfo, 0, sizeof(siginfo)); - siginfo.si_signo = current_thread_info()->sigdelayed.signo; - siginfo.si_code = current_thread_info()->sigdelayed.code; - siginfo.si_addr = current_thread_info()->sigdelayed.addr; - pid = current_thread_info()->sigdelayed.pid; - t = find_task_by_pid(pid); - if (!t) - return; - if (current_thread_info()->sigdelayed.start_time != start_time_ul(t)) - return; - force_sig_info(siginfo.si_signo, &siginfo, t); + /* if there's no signal to deliver, we just put the saved sigmask + * back */ + restore_saved_sigmask(); } diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 657ac99a451..9fcd4e63048 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -30,8 +30,9 @@ #include <linux/delay.h> #include <linux/efi.h> #include <linux/bitops.h> +#include <linux/kexec.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/current.h> #include <asm/delay.h> #include <asm/machvec.h> @@ -43,54 +44,39 @@ #include <asm/processor.h> #include <asm/ptrace.h> #include <asm/sal.h> -#include <asm/system.h> #include <asm/tlbflush.h> #include <asm/unistd.h> #include <asm/mca.h> /* - * Structure and data for smp_call_function(). This is designed to minimise static memory - * requirements. It also looks cleaner. + * Note: alignment of 4 entries/cacheline was empirically determined + * to be a good tradeoff between hot cachelines & spreading the array + * across too many cacheline. */ -static __cacheline_aligned DEFINE_SPINLOCK(call_lock); +static struct local_tlb_flush_counts { + unsigned int count; +} __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS]; -struct call_data_struct { - void (*func) (void *info); - void *info; - long wait; - atomic_t started; - atomic_t finished; -}; - -static volatile struct call_data_struct *call_data; +static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned short [NR_CPUS], + shadow_flush_counts); #define IPI_CALL_FUNC 0 #define IPI_CPU_STOP 1 +#define IPI_CALL_FUNC_SINGLE 2 +#define IPI_KDUMP_CPU_STOP 3 /* This needs to be cacheline aligned because it is written to by *other* CPUs. */ -static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned; +static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, ipi_operation); extern void cpu_halt (void); -void -lock_ipi_calllock(void) -{ - spin_lock_irq(&call_lock); -} - -void -unlock_ipi_calllock(void) -{ - spin_unlock_irq(&call_lock); -} - static void -stop_this_cpu (void) +stop_this_cpu(void) { /* * Remove this CPU: */ - cpu_clear(smp_processor_id(), cpu_online_map); + set_cpu_online(smp_processor_id(), false); max_xtp(); local_irq_disable(); cpu_halt(); @@ -108,7 +94,7 @@ cpu_die(void) } irqreturn_t -handle_IPI (int irq, void *dev_id, struct pt_regs *regs) +handle_IPI (int irq, void *dev_id) { int this_cpu = get_cpu(); unsigned long *pending_ipis = &__ia64_per_cpu_var(ipi_operation); @@ -124,40 +110,23 @@ handle_IPI (int irq, void *dev_id, struct pt_regs *regs) ops &= ~(1 << which); switch (which) { - case IPI_CALL_FUNC: - { - struct call_data_struct *data; - void (*func)(void *info); - void *info; - int wait; - - /* release the 'pointer lock' */ - data = (struct call_data_struct *) call_data; - func = data->func; - info = data->info; - wait = data->wait; - - mb(); - atomic_inc(&data->started); - /* - * At this point the structure may be gone unless - * wait is true. - */ - (*func)(info); - - /* Notify the sending CPU that the task is done. */ - mb(); - if (wait) - atomic_inc(&data->finished); - } - break; - - case IPI_CPU_STOP: + case IPI_CPU_STOP: stop_this_cpu(); break; - - default: - printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n", this_cpu, which); + case IPI_CALL_FUNC: + generic_smp_call_function_interrupt(); + break; + case IPI_CALL_FUNC_SINGLE: + generic_smp_call_function_single_interrupt(); + break; +#ifdef CONFIG_KEXEC + case IPI_KDUMP_CPU_STOP: + unw_init_running(kdump_cpu_freeze, NULL); + break; +#endif + default: + printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n", + this_cpu, which); break; } } while (ops); @@ -167,8 +136,10 @@ handle_IPI (int irq, void *dev_id, struct pt_regs *regs) return IRQ_HANDLED; } + + /* - * Called with preeemption disabled. + * Called with preemption disabled. */ static inline void send_IPI_single (int dest_cpu, int op) @@ -178,7 +149,7 @@ send_IPI_single (int dest_cpu, int op) } /* - * Called with preeemption disabled. + * Called with preemption disabled. */ static inline void send_IPI_allbutself (int op) @@ -192,7 +163,20 @@ send_IPI_allbutself (int op) } /* - * Called with preeemption disabled. + * Called with preemption disabled. + */ +static inline void +send_IPI_mask(const struct cpumask *mask, int op) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) { + send_IPI_single(cpu, op); + } +} + +/* + * Called with preemption disabled. */ static inline void send_IPI_all (int op) @@ -205,7 +189,7 @@ send_IPI_all (int op) } /* - * Called with preeemption disabled. + * Called with preemption disabled. */ static inline void send_IPI_self (int op) @@ -213,155 +197,134 @@ send_IPI_self (int op) send_IPI_single(smp_processor_id(), op); } +#ifdef CONFIG_KEXEC +void +kdump_smp_send_stop(void) +{ + send_IPI_allbutself(IPI_KDUMP_CPU_STOP); +} + +void +kdump_smp_send_init(void) +{ + unsigned int cpu, self_cpu; + self_cpu = smp_processor_id(); + for_each_online_cpu(cpu) { + if (cpu != self_cpu) { + if(kdump_status[cpu] == 0) + platform_send_ipi(cpu, 0, IA64_IPI_DM_INIT, 0); + } + } +} +#endif /* - * Called with preeemption disabled. + * Called with preemption disabled. */ void smp_send_reschedule (int cpu) { platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); } +EXPORT_SYMBOL_GPL(smp_send_reschedule); -void -smp_flush_tlb_all (void) +/* + * Called with preemption disabled. + */ +static void +smp_send_local_flush_tlb (int cpu) { - on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1, 1); + platform_send_ipi(cpu, IA64_IPI_LOCAL_TLB_FLUSH, IA64_IPI_DM_INT, 0); } void -smp_flush_tlb_mm (struct mm_struct *mm) +smp_local_flush_tlb(void) { - preempt_disable(); - /* this happens for the common case of a single-threaded fork(): */ - if (likely(mm == current->active_mm && atomic_read(&mm->mm_users) == 1)) - { - local_finish_flush_tlb_mm(mm); - preempt_enable(); - return; - } - - preempt_enable(); /* - * We could optimize this further by using mm->cpu_vm_mask to track which CPUs - * have been running in the address space. It's not clear that this is worth the - * trouble though: to avoid races, we have to raise the IPI on the target CPU - * anyhow, and once a CPU is interrupted, the cost of local_flush_tlb_all() is - * rather trivial. + * Use atomic ops. Otherwise, the load/increment/store sequence from + * a "++" operation can have the line stolen between the load & store. + * The overhead of the atomic op in negligible in this case & offers + * significant benefit for the brief periods where lots of cpus + * are simultaneously flushing TLBs. */ - on_each_cpu((void (*)(void *))local_finish_flush_tlb_mm, mm, 1, 1); + ia64_fetchadd(1, &local_tlb_flush_counts[smp_processor_id()].count, acq); + local_flush_tlb_all(); } -/* - * Run a function on another CPU - * <func> The function to run. This must be fast and non-blocking. - * <info> An arbitrary pointer to pass to the function. - * <nonatomic> Currently unused. - * <wait> If true, wait until function has completed on other CPUs. - * [RETURNS] 0 on success, else a negative status code. - * - * Does not return until the remote CPU is nearly ready to execute <func> - * or is or has executed. - */ +#define FLUSH_DELAY 5 /* Usec backoff to eliminate excessive cacheline bouncing */ -int -smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int nonatomic, - int wait) +void +smp_flush_tlb_cpumask(cpumask_t xcpumask) { - struct call_data_struct data; - int cpus = 1; - int me = get_cpu(); /* prevent preemption and reschedule on another processor */ - - if (cpuid == me) { - printk(KERN_INFO "%s: trying to call self\n", __FUNCTION__); - put_cpu(); - return -EBUSY; - } + unsigned short *counts = __ia64_per_cpu_var(shadow_flush_counts); + cpumask_t cpumask = xcpumask; + int mycpu, cpu, flush_mycpu = 0; - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); + preempt_disable(); + mycpu = smp_processor_id(); - spin_lock_bh(&call_lock); + for_each_cpu_mask(cpu, cpumask) + counts[cpu] = local_tlb_flush_counts[cpu].count & 0xffff; - call_data = &data; - mb(); /* ensure store to call_data precedes setting of IPI_CALL_FUNC */ - send_IPI_single(cpuid, IPI_CALL_FUNC); + mb(); + for_each_cpu_mask(cpu, cpumask) { + if (cpu == mycpu) + flush_mycpu = 1; + else + smp_send_local_flush_tlb(cpu); + } - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); + if (flush_mycpu) + smp_local_flush_tlb(); - if (wait) - while (atomic_read(&data.finished) != cpus) - cpu_relax(); - call_data = NULL; + for_each_cpu_mask(cpu, cpumask) + while(counts[cpu] == (local_tlb_flush_counts[cpu].count & 0xffff)) + udelay(FLUSH_DELAY); - spin_unlock_bh(&call_lock); - put_cpu(); - return 0; + preempt_enable(); } -EXPORT_SYMBOL(smp_call_function_single); -/* - * this function sends a 'generic call function' IPI to all other CPUs - * in the system. - */ - -/* - * [SUMMARY] Run a function on all other CPUs. - * <func> The function to run. This must be fast and non-blocking. - * <info> An arbitrary pointer to pass to the function. - * <nonatomic> currently unused. - * <wait> If true, wait (atomically) until function has completed on other CPUs. - * [RETURNS] 0 on success, else a negative status code. - * - * Does not return until remote CPUs are nearly ready to execute <func> or are or have - * executed. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -int -smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wait) +void +smp_flush_tlb_all (void) { - struct call_data_struct data; - int cpus = num_online_cpus()-1; - - if (!cpus) - return 0; - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - spin_lock(&call_lock); - - call_data = &data; - mb(); /* ensure store to call_data precedes setting of IPI_CALL_FUNC */ - send_IPI_allbutself(IPI_CALL_FUNC); + on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1); +} - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); +void +smp_flush_tlb_mm (struct mm_struct *mm) +{ + cpumask_var_t cpus; + preempt_disable(); + /* this happens for the common case of a single-threaded fork(): */ + if (likely(mm == current->active_mm && atomic_read(&mm->mm_users) == 1)) + { + local_finish_flush_tlb_mm(mm); + preempt_enable(); + return; + } + if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) { + smp_call_function((void (*)(void *))local_finish_flush_tlb_mm, + mm, 1); + } else { + cpumask_copy(cpus, mm_cpumask(mm)); + smp_call_function_many(cpus, + (void (*)(void *))local_finish_flush_tlb_mm, mm, 1); + free_cpumask_var(cpus); + } + local_irq_disable(); + local_finish_flush_tlb_mm(mm); + local_irq_enable(); + preempt_enable(); +} - if (wait) - while (atomic_read(&data.finished) != cpus) - cpu_relax(); - call_data = NULL; +void arch_send_call_function_single_ipi(int cpu) +{ + send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE); +} - spin_unlock(&call_lock); - return 0; +void arch_send_call_function_ipi_mask(const struct cpumask *mask) +{ + send_IPI_mask(mask, IPI_CALL_FUNC); } -EXPORT_SYMBOL(smp_call_function); /* * this function calls the 'stop' function on all other CPUs in the system. @@ -372,7 +335,7 @@ smp_send_stop (void) send_IPI_allbutself(IPI_CPU_STOP); } -int __init +int setup_profiling_timer (unsigned int multiplier) { return -EINVAL; diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 400a4898712..547a48d78bd 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -21,7 +21,6 @@ * 05/01/30 Suresh Siddha <suresh.b.siddha@intel.com> * Setup cpu_sibling_map and cpu_core_map */ -#include <linux/config.h> #include <linux/module.h> #include <linux/acpi.h> @@ -36,30 +35,29 @@ #include <linux/mm.h> #include <linux/notifier.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/spinlock.h> #include <linux/efi.h> #include <linux/percpu.h> #include <linux/bitops.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/cache.h> #include <asm/current.h> #include <asm/delay.h> -#include <asm/ia32.h> #include <asm/io.h> #include <asm/irq.h> #include <asm/machvec.h> #include <asm/mca.h> #include <asm/page.h> +#include <asm/paravirt.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/processor.h> #include <asm/ptrace.h> #include <asm/sal.h> -#include <asm/system.h> #include <asm/tlbflush.h> #include <asm/unistd.h> +#include <asm/sn/arch.h> #define SMP_DEBUG 0 @@ -70,12 +68,11 @@ #endif #ifdef CONFIG_HOTPLUG_CPU -/* - * Store all idle threads, this can be reused instead of creating - * a new thread. Also avoids complicated thread destroy functionality - * for idle threads. - */ -struct task_struct *idle_thread_array[NR_CPUS]; +#ifdef CONFIG_PERMIT_BSP_REMOVE +#define bsp_remove_ok 1 +#else +#define bsp_remove_ok 0 +#endif /* * Global array allocated for NR_CPUS at boot time @@ -90,13 +87,7 @@ struct sal_to_os_boot *sal_state_for_booting_cpu = &sal_boot_rendez_state[0]; #define set_brendez_area(x) (sal_state_for_booting_cpu = &sal_boot_rendez_state[(x)]); -#define get_idle_for_cpu(x) (idle_thread_array[(x)]) -#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) - #else - -#define get_idle_for_cpu(x) (NULL) -#define set_idle_for_cpu(x,p) #define set_brendez_area(x) #endif @@ -104,7 +95,7 @@ struct sal_to_os_boot *sal_state_for_booting_cpu = &sal_boot_rendez_state[0]; /* * ITC synchronization related stuff: */ -#define MASTER 0 +#define MASTER (0) #define SLAVE (SMP_CACHE_BYTES/8) #define NUM_ROUNDS 64 /* magic value */ @@ -115,27 +106,22 @@ static volatile unsigned long go[SLAVE + 1]; #define DEBUG_ITC_SYNC 0 -extern void __devinit calibrate_delay (void); extern void start_ap (void); extern unsigned long ia64_iobase; -task_t *task_for_booting_cpu; +struct task_struct *task_for_booting_cpu; /* * State for each CPU */ DEFINE_PER_CPU(int, cpu_state); -/* Bitmasks of currently online, and possible CPUs */ -cpumask_t cpu_online_map; -EXPORT_SYMBOL(cpu_online_map); -cpumask_t cpu_possible_map; -EXPORT_SYMBOL(cpu_possible_map); - cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; -cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_core_map); +DEFINE_PER_CPU_SHARED_ALIGNED(cpumask_t, cpu_sibling_map); +EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); + int smp_num_siblings = 1; -int smp_num_cpucores = 1; /* which logical CPU number maps to which CPU (physical APIC ID) */ volatile int ia64_cpu_to_sapicid[NR_CPUS]; @@ -151,6 +137,27 @@ char __initdata no_int_routing; unsigned char smp_int_redirect; /* are INT and IPI redirectable by the chipset? */ +#ifdef CONFIG_FORCE_CPEI_RETARGET +#define CPEI_OVERRIDE_DEFAULT (1) +#else +#define CPEI_OVERRIDE_DEFAULT (0) +#endif + +unsigned int force_cpei_retarget = CPEI_OVERRIDE_DEFAULT; + +static int __init +cmdl_force_cpei(char *str) +{ + int value=0; + + get_option (&str, &value); + force_cpei_retarget = value; + + return 1; +} + +__setup("force_cpei=", cmdl_force_cpei); + static int __init nointroute (char *str) { @@ -161,6 +168,27 @@ nointroute (char *str) __setup("nointroute", nointroute); +static void fix_b0_for_bsp(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + int cpuid; + static int fix_bsp_b0 = 1; + + cpuid = smp_processor_id(); + + /* + * Cache the b0 value on the first AP that comes up + */ + if (!(fix_bsp_b0 && cpuid)) + return; + + sal_boot_rendez_state[0].br[0] = sal_boot_rendez_state[cpuid].br[0]; + printk ("Fixed BSP b0 value from CPU %d\n", cpuid); + + fix_bsp_b0 = 0; +#endif +} + void sync_master (void *arg) { @@ -270,7 +298,7 @@ ia64_sync_itc (unsigned int master) go[MASTER] = 1; - if (smp_call_function_single(master, sync_master, NULL, 1, 0) < 0) { + if (smp_call_function_single(master, sync_master, NULL, 0) < 0) { printk(KERN_ERR "sync_itc: failed to get attention of CPU %u!\n", master); return; } @@ -319,16 +347,17 @@ ia64_sync_itc (unsigned int master) /* * Ideally sets up per-cpu profiling hooks. Doesn't do much now... */ -static inline void __devinit -smp_setup_percpu_timer (void) +static inline void smp_setup_percpu_timer(void) { } -static void __devinit +static void smp_callin (void) { - int cpuid, phys_id; + int cpuid, phys_id, itc_master; + struct cpuinfo_ia64 *last_cpuinfo, *this_cpuinfo; extern void ia64_init_itm(void); + extern volatile int time_keeper_id; #ifdef CONFIG_PERFMON extern void pfm_init_percpu(void); @@ -336,6 +365,7 @@ smp_callin (void) cpuid = smp_processor_id(); phys_id = hard_smp_processor_id(); + itc_master = time_keeper_id; if (cpu_online(cpuid)) { printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n", @@ -343,10 +373,21 @@ smp_callin (void) BUG(); } - lock_ipi_calllock(); - cpu_set(cpuid, cpu_online_map); - unlock_ipi_calllock(); + fix_b0_for_bsp(); + + /* + * numa_node_id() works after this. + */ + set_numa_node(cpu_to_node_map[cpuid]); + set_numa_mem(local_memory_node(cpu_to_node_map[cpuid])); + + spin_lock(&vector_lock); + /* Setup the per cpu irq handling data structures */ + __setup_vector_irq(cpuid); + notify_cpu_starting(cpuid); + set_cpu_online(cpuid, true); per_cpu(cpu_state, cpuid) = CPU_ONLINE; + spin_unlock(&vector_lock); smp_setup_percpu_timer(); @@ -365,20 +406,30 @@ smp_callin (void) * calls spin_unlock_bh(), which calls spin_unlock_bh(), which calls * local_bh_enable(), which bugs out if irqs are not enabled... */ - Dprintk("Going to syncup ITC with BP.\n"); - ia64_sync_itc(0); + Dprintk("Going to syncup ITC with ITC Master.\n"); + ia64_sync_itc(itc_master); } /* * Get our bogomips. */ ia64_init_itm(); - calibrate_delay(); - local_cpu_data->loops_per_jiffy = loops_per_jiffy; -#ifdef CONFIG_IA32_SUPPORT - ia32_gdt_init(); -#endif + /* + * Delay calibration can be skipped if new processor is identical to the + * previous processor. + */ + last_cpuinfo = cpu_data(cpuid - 1); + this_cpuinfo = local_cpu_data; + if (last_cpuinfo->itc_freq != this_cpuinfo->itc_freq || + last_cpuinfo->proc_freq != this_cpuinfo->proc_freq || + last_cpuinfo->features != this_cpuinfo->features || + last_cpuinfo->revision != this_cpuinfo->revision || + last_cpuinfo->family != this_cpuinfo->family || + last_cpuinfo->archrev != this_cpuinfo->archrev || + last_cpuinfo->model != this_cpuinfo->model) + calibrate_delay(); + local_cpu_data->loops_per_jiffy = loops_per_jiffy; /* * Allow the master to continue. @@ -391,74 +442,29 @@ smp_callin (void) /* * Activate a secondary processor. head.S calls this. */ -int __devinit +int start_secondary (void *unused) { /* Early console may use I/O ports */ ia64_set_kr(IA64_KR_IO_BASE, __pa(ia64_iobase)); +#ifndef CONFIG_PRINTK_TIME Dprintk("start_secondary: starting CPU 0x%x\n", hard_smp_processor_id()); +#endif efi_map_pal_code(); cpu_init(); + preempt_disable(); smp_callin(); - cpu_idle(); + cpu_startup_entry(CPUHP_ONLINE); return 0; } -struct pt_regs * __devinit idle_regs(struct pt_regs *regs) -{ - return NULL; -} - -struct create_idle { - struct task_struct *idle; - struct completion done; - int cpu; -}; - -void -do_fork_idle(void *_c_idle) -{ - struct create_idle *c_idle = _c_idle; - - c_idle->idle = fork_idle(c_idle->cpu); - complete(&c_idle->done); -} - -static int __devinit -do_boot_cpu (int sapicid, int cpu) +static int +do_boot_cpu (int sapicid, int cpu, struct task_struct *idle) { int timeout; - struct create_idle c_idle = { - .cpu = cpu, - .done = COMPLETION_INITIALIZER(c_idle.done), - }; - DECLARE_WORK(work, do_fork_idle, &c_idle); - - c_idle.idle = get_idle_for_cpu(cpu); - if (c_idle.idle) { - init_idle(c_idle.idle, cpu); - goto do_rest; - } - - /* - * We can't use kernel_thread since we must avoid to reschedule the child. - */ - if (!keventd_up() || current_is_keventd()) - work.func(work.data); - else { - schedule_work(&work); - wait_for_completion(&c_idle.done); - } - - if (IS_ERR(c_idle.idle)) - panic("failed fork for CPU %d", cpu); - - set_idle_for_cpu(cpu, c_idle.idle); - -do_rest: - task_for_booting_cpu = c_idle.idle; + task_for_booting_cpu = idle; Dprintk("Sending wakeup vector %lu to AP 0x%x/0x%x.\n", ap_wakeup_vector, cpu, sapicid); set_brendez_area(cpu); @@ -478,7 +484,7 @@ do_rest: if (!cpu_isset(cpu, cpu_callin_map)) { printk(KERN_ERR "Processor 0x%x/0x%x is stuck.\n", cpu, sapicid); ia64_cpu_to_sapicid[cpu] = -1; - cpu_clear(cpu, cpu_online_map); /* was set in smp_callin() */ + set_cpu_online(cpu, false); /* was set in smp_callin() */ return -EINVAL; } return 0; @@ -505,21 +511,17 @@ smp_build_cpu_map (void) for (cpu = 0; cpu < NR_CPUS; cpu++) { ia64_cpu_to_sapicid[cpu] = -1; -#ifdef CONFIG_HOTPLUG_CPU - cpu_set(cpu, cpu_possible_map); -#endif } ia64_cpu_to_sapicid[0] = boot_cpu_id; - cpus_clear(cpu_present_map); - cpu_set(0, cpu_present_map); - cpu_set(0, cpu_possible_map); + init_cpu_present(cpumask_of(0)); + set_cpu_possible(0, true); for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) { sapicid = smp_boot_data.cpu_phys_id[i]; if (sapicid == boot_cpu_id) continue; - cpu_set(cpu, cpu_present_map); - cpu_set(cpu, cpu_possible_map); + set_cpu_present(cpu, true); + set_cpu_possible(cpu, true); ia64_cpu_to_sapicid[cpu] = sapicid; cpu++; } @@ -539,10 +541,6 @@ smp_prepare_cpus (unsigned int max_cpus) smp_setup_percpu_timer(); - /* - * We have the boot CPU online for sure. - */ - cpu_set(0, cpu_online_map); cpu_set(0, cpu_callin_map); local_cpu_data->loops_per_jiffy = loops_per_jiffy; @@ -557,59 +555,34 @@ smp_prepare_cpus (unsigned int max_cpus) */ if (!max_cpus) { printk(KERN_INFO "SMP mode deactivated.\n"); - cpus_clear(cpu_online_map); - cpus_clear(cpu_present_map); - cpus_clear(cpu_possible_map); - cpu_set(0, cpu_online_map); - cpu_set(0, cpu_present_map); - cpu_set(0, cpu_possible_map); + init_cpu_online(cpumask_of(0)); + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); return; } } -void __devinit smp_prepare_boot_cpu(void) +void smp_prepare_boot_cpu(void) { - cpu_set(smp_processor_id(), cpu_online_map); + set_cpu_online(smp_processor_id(), true); cpu_set(smp_processor_id(), cpu_callin_map); + set_numa_node(cpu_to_node_map[smp_processor_id()]); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + paravirt_post_smp_prepare_boot_cpu(); } -/* - * mt_info[] is a temporary store for all info returned by - * PAL_LOGICAL_TO_PHYSICAL, to be copied into cpuinfo_ia64 when the - * specific cpu comes. - */ -static struct { - __u32 socket_id; - __u16 core_id; - __u16 thread_id; - __u16 proc_fixed_addr; - __u8 valid; -} mt_info[NR_CPUS] __devinitdata; - #ifdef CONFIG_HOTPLUG_CPU static inline void -remove_from_mtinfo(int cpu) -{ - int i; - - for_each_cpu(i) - if (mt_info[i].valid && mt_info[i].socket_id == - cpu_data(cpu)->socket_id) - mt_info[i].valid = 0; -} - -static inline void clear_cpu_sibling_map(int cpu) { int i; - for_each_cpu_mask(i, cpu_sibling_map[cpu]) - cpu_clear(cpu, cpu_sibling_map[i]); + for_each_cpu_mask(i, per_cpu(cpu_sibling_map, cpu)) + cpu_clear(cpu, per_cpu(cpu_sibling_map, i)); for_each_cpu_mask(i, cpu_core_map[cpu]) cpu_clear(cpu, cpu_core_map[i]); - cpu_sibling_map[cpu] = cpu_core_map[cpu] = CPU_MASK_NONE; + per_cpu(cpu_sibling_map, cpu) = cpu_core_map[cpu] = CPU_MASK_NONE; } static void @@ -620,7 +593,7 @@ remove_siblinginfo(int cpu) if (cpu_data(cpu)->threads_per_core == 1 && cpu_data(cpu)->cores_per_socket == 1) { cpu_clear(cpu, cpu_core_map[cpu]); - cpu_clear(cpu, cpu_sibling_map[cpu]); + cpu_clear(cpu, per_cpu(cpu_sibling_map, cpu)); return; } @@ -628,15 +601,50 @@ remove_siblinginfo(int cpu) /* remove it from all sibling map's */ clear_cpu_sibling_map(cpu); +} + +extern void fixup_irqs(void); - /* if this cpu is the last in the core group, remove all its info - * from mt_info structure +int migrate_platform_irqs(unsigned int cpu) +{ + int new_cpei_cpu; + struct irq_data *data = NULL; + const struct cpumask *mask; + int retval = 0; + + /* + * dont permit CPEI target to removed. */ - if (last) - remove_from_mtinfo(cpu); + if (cpe_vector > 0 && is_cpu_cpei_target(cpu)) { + printk ("CPU (%d) is CPEI Target\n", cpu); + if (can_cpei_retarget()) { + /* + * Now re-target the CPEI to a different processor + */ + new_cpei_cpu = cpumask_any(cpu_online_mask); + mask = cpumask_of(new_cpei_cpu); + set_cpei_target_cpu(new_cpei_cpu); + data = irq_get_irq_data(ia64_cpe_irq); + /* + * Switch for now, immediately, we need to do fake intr + * as other interrupts, but need to study CPEI behaviour with + * polling before making changes. + */ + if (data && data->chip) { + data->chip->irq_disable(data); + data->chip->irq_set_affinity(data, mask, false); + data->chip->irq_enable(data); + printk ("Re-targeting CPEI to cpu %d\n", new_cpei_cpu); + } + } + if (!data) { + printk ("Unable to retarget CPEI, offline cpu [%d] failed\n", cpu); + retval = -EBUSY; + } + } + return retval; } -extern void fixup_irqs(void); /* must be called with cpucontrol mutex held */ int __cpu_disable(void) { @@ -645,11 +653,24 @@ int __cpu_disable(void) /* * dont permit boot processor for now */ - if (cpu == 0) + if (cpu == 0 && !bsp_remove_ok) { + printk ("Your platform does not support removal of BSP\n"); + return (-EBUSY); + } + + if (ia64_platform_is("sn2")) { + if (!sn_cpu_disable_allowed(cpu)) + return -EBUSY; + } + + set_cpu_online(cpu, false); + + if (migrate_platform_irqs(cpu)) { + set_cpu_online(cpu, true); return -EBUSY; + } remove_siblinginfo(cpu); - cpu_clear(cpu, cpu_online_map); fixup_irqs(); local_flush_tlb_all(); cpu_clear(cpu, cpu_callin_map); @@ -671,17 +692,6 @@ void __cpu_die(unsigned int cpu) } printk(KERN_ERR "CPU %u didn't die...\n", cpu); } -#else /* !CONFIG_HOTPLUG_CPU */ -int __cpu_disable(void) -{ - return -ENOSYS; -} - -void __cpu_die(unsigned int cpu) -{ - /* We said "no" in __cpu_disable */ - BUG(); -} #endif /* CONFIG_HOTPLUG_CPU */ void @@ -702,8 +712,7 @@ smp_cpus_done (unsigned int dummy) (int)num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100); } -static inline void __devinit -set_cpu_sibling_map(int cpu) +static inline void set_cpu_sibling_map(int cpu) { int i; @@ -712,15 +721,15 @@ set_cpu_sibling_map(int cpu) cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); if (cpu_data(cpu)->core_id == cpu_data(i)->core_id) { - cpu_set(i, cpu_sibling_map[cpu]); - cpu_set(cpu, cpu_sibling_map[i]); + cpu_set(i, per_cpu(cpu_sibling_map, cpu)); + cpu_set(cpu, per_cpu(cpu_sibling_map, i)); } } } } -int __devinit -__cpu_up (unsigned int cpu) +int +__cpu_up(unsigned int cpu, struct task_struct *tidle) { int ret; int sapicid; @@ -738,13 +747,13 @@ __cpu_up (unsigned int cpu) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Processor goes to start_secondary(), sets online flag */ - ret = do_boot_cpu(sapicid, cpu); + ret = do_boot_cpu(sapicid, cpu, tidle); if (ret < 0) return ret; if (cpu_data(cpu)->threads_per_core == 1 && cpu_data(cpu)->cores_per_socket == 1) { - cpu_set(cpu, cpu_sibling_map[cpu]); + cpu_set(cpu, per_cpu(cpu_sibling_map, cpu)); cpu_set(cpu, cpu_core_map[cpu]); return 0; } @@ -755,7 +764,7 @@ __cpu_up (unsigned int cpu) } /* - * Assume that CPU's have been discovered by some platform-dependent interface. For + * Assume that CPUs have been discovered by some platform-dependent interface. For * SoftSDV/Lion, that would be ACPI. * * Setup of the IPI irq handler is done in irq.c:init_IRQ_SMP(). @@ -769,7 +778,7 @@ init_smp_config(void) } *ap_startup; long sal_ret; - /* Tell SAL where to drop the AP's. */ + /* Tell SAL where to drop the APs. */ ap_startup = (struct fptr *) start_ap; sal_ret = ia64_sal_set_vectors(SAL_VECTOR_OS_BOOT_RENDEZ, ia64_tpa(ap_startup->fp), ia64_tpa(ap_startup->gp), 0, 0, 0, 0); @@ -778,106 +787,72 @@ init_smp_config(void) ia64_sal_strerror(sal_ret)); } -static inline int __devinit -check_for_mtinfo_index(void) -{ - int i; - - for_each_cpu(i) - if (!mt_info[i].valid) - return i; - - return -1; -} - -/* - * Search the mt_info to find out if this socket's cid/tid information is - * cached or not. If the socket exists, fill in the core_id and thread_id - * in cpuinfo - */ -static int __devinit -check_for_new_socket(__u16 logical_address, struct cpuinfo_ia64 *c) -{ - int i; - __u32 sid = c->socket_id; - - for_each_cpu(i) { - if (mt_info[i].valid && mt_info[i].proc_fixed_addr == logical_address - && mt_info[i].socket_id == sid) { - c->core_id = mt_info[i].core_id; - c->thread_id = mt_info[i].thread_id; - return 1; /* not a new socket */ - } - } - return 0; -} - /* * identify_siblings(cpu) gets called from identify_cpu. This populates the * information related to logical execution units in per_cpu_data structure. */ -void __devinit -identify_siblings(struct cpuinfo_ia64 *c) +void identify_siblings(struct cpuinfo_ia64 *c) { - s64 status; + long status; u16 pltid; - u64 proc_fixed_addr; - int count, i; pal_logical_to_physical_t info; - if (smp_num_cpucores == 1 && smp_num_siblings == 1) - return; + status = ia64_pal_logical_to_phys(-1, &info); + if (status != PAL_STATUS_SUCCESS) { + if (status != PAL_STATUS_UNIMPLEMENTED) { + printk(KERN_ERR + "ia64_pal_logical_to_phys failed with %ld\n", + status); + return; + } - if ((status = ia64_pal_logical_to_phys(0, &info)) != PAL_STATUS_SUCCESS) { - printk(KERN_ERR "ia64_pal_logical_to_phys failed with %ld\n", - status); - return; - } - if ((status = ia64_sal_physical_id_info(&pltid)) != PAL_STATUS_SUCCESS) { - printk(KERN_ERR "ia64_sal_pltid failed with %ld\n", status); - return; + info.overview_ppid = 0; + info.overview_cpp = 1; + info.overview_tpc = 1; } - if ((status = ia64_pal_fixed_addr(&proc_fixed_addr)) != PAL_STATUS_SUCCESS) { - printk(KERN_ERR "ia64_pal_fixed_addr failed with %ld\n", status); + + status = ia64_sal_physical_id_info(&pltid); + if (status != PAL_STATUS_SUCCESS) { + if (status != PAL_STATUS_UNIMPLEMENTED) + printk(KERN_ERR + "ia64_sal_pltid failed with %ld\n", + status); return; } c->socket_id = (pltid << 8) | info.overview_ppid; - c->cores_per_socket = info.overview_cpp; - c->threads_per_core = info.overview_tpc; - count = c->num_log = info.overview_num_log; - /* If the thread and core id information is already cached, then - * we will simply update cpu_info and return. Otherwise, we will - * do the PAL calls and cache core and thread id's of all the siblings. - */ - if (check_for_new_socket(proc_fixed_addr, c)) + if (info.overview_cpp == 1 && info.overview_tpc == 1) return; - for (i = 0; i < count; i++) { - int index; - - if (i && (status = ia64_pal_logical_to_phys(i, &info)) - != PAL_STATUS_SUCCESS) { - printk(KERN_ERR "ia64_pal_logical_to_phys failed" - " with %ld\n", status); - return; - } - if (info.log2_la == proc_fixed_addr) { - c->core_id = info.log1_cid; - c->thread_id = info.log1_tid; - } + c->cores_per_socket = info.overview_cpp; + c->threads_per_core = info.overview_tpc; + c->num_log = info.overview_num_log; - index = check_for_mtinfo_index(); - /* We will not do the mt_info caching optimization in this case. - */ - if (index < 0) - continue; + c->core_id = info.log1_cid; + c->thread_id = info.log1_tid; +} - mt_info[index].valid = 1; - mt_info[index].socket_id = c->socket_id; - mt_info[index].core_id = info.log1_cid; - mt_info[index].thread_id = info.log1_tid; - mt_info[index].proc_fixed_addr = info.log2_la; +/* + * returns non zero, if multi-threading is enabled + * on at least one physical package. Due to hotplug cpu + * and (maxcpus=), all threads may not necessarily be enabled + * even though the processor supports multi-threading. + */ +int is_multithreading_enabled(void) +{ + int i, j; + + for_each_present_cpu(i) { + for_each_present_cpu(j) { + if (j == i) + continue; + if ((cpu_data(j)->socket_id == cpu_data(i)->socket_id)) { + if (cpu_data(j)->core_id == cpu_data(i)->core_id) + return 1; + } + } } + return 0; } +EXPORT_SYMBOL_GPL(is_multithreading_enabled); diff --git a/arch/ia64/kernel/stacktrace.c b/arch/ia64/kernel/stacktrace.c new file mode 100644 index 00000000000..5af2783a87f --- /dev/null +++ b/arch/ia64/kernel/stacktrace.c @@ -0,0 +1,39 @@ +/* + * arch/ia64/kernel/stacktrace.c + * + * Stack trace management functions + * + */ +#include <linux/sched.h> +#include <linux/stacktrace.h> +#include <linux/module.h> + +static void +ia64_do_save_stack(struct unw_frame_info *info, void *arg) +{ + struct stack_trace *trace = arg; + unsigned long ip; + int skip = trace->skip; + + trace->nr_entries = 0; + do { + unw_get_ip(info, &ip); + if (ip == 0) + break; + if (skip == 0) { + trace->entries[trace->nr_entries++] = ip; + if (trace->nr_entries == trace->max_entries) + break; + } else + skip--; + } while (unw_unwind(info) >= 0); +} + +/* + * Save stack-backtrace addresses into a stack_trace buffer. + */ +void save_stack_trace(struct stack_trace *trace) +{ + unw_init_running(ia64_do_save_stack, trace); +} +EXPORT_SYMBOL(save_stack_trace); diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index f2dbcd1db0d..41e33f84c18 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -5,7 +5,6 @@ * Copyright (C) 1999-2000, 2002-2003, 2005 Hewlett-Packard Co * David Mosberger-Tang <davidm@hpl.hp.com> */ -#include <linux/config.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/mm.h> @@ -14,7 +13,6 @@ #include <linux/shm.h> #include <linux/file.h> /* doh, must come after sched.h... */ #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/syscalls.h> #include <linux/highuid.h> #include <linux/hugetlb.h> @@ -27,19 +25,26 @@ arch_get_unmapped_area (struct file *filp, unsigned long addr, unsigned long len unsigned long pgoff, unsigned long flags) { long map_shared = (flags & MAP_SHARED); - unsigned long start_addr, align_mask = PAGE_SIZE - 1; + unsigned long align_mask = 0; struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_unmapped_area_info info; if (len > RGN_MAP_LIMIT) return -ENOMEM; + /* handle fixed mapping: prevent overlap with huge pages */ + if (flags & MAP_FIXED) { + if (is_hugepage_only_range(mm, addr, len)) + return -EINVAL; + return addr; + } + #ifdef CONFIG_HUGETLB_PAGE if (REGION_NUMBER(addr) == RGN_HPAGE) addr = 0; #endif if (!addr) - addr = mm->free_area_cache; + addr = TASK_UNMAPPED_BASE; if (map_shared && (TASK_SIZE > 0xfffffffful)) /* @@ -48,28 +53,15 @@ arch_get_unmapped_area (struct file *filp, unsigned long addr, unsigned long len * tasks, we prefer to avoid exhausting the address space too quickly by * limiting alignment to a single page. */ - align_mask = SHMLBA - 1; - - full_search: - start_addr = addr = (addr + align_mask) & ~align_mask; - - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr || RGN_MAP_LIMIT - len < REGION_OFFSET(addr)) { - if (start_addr != TASK_UNMAPPED_BASE) { - /* Start a new search --- just in case we missed some holes. */ - addr = TASK_UNMAPPED_BASE; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - /* Remember the address where we stopped this search: */ - mm->free_area_cache = addr + len; - return addr; - } - addr = (vma->vm_end + align_mask) & ~align_mask; - } + align_mask = PAGE_MASK & (SHMLBA - 1); + + info.flags = 0; + info.length = len; + info.low_limit = addr; + info.high_limit = TASK_SIZE; + info.align_mask = align_mask; + info.align_offset = 0; + return vm_unmapped_area(&info); } asmlinkage long @@ -95,51 +87,7 @@ sys_getpagesize (void) asmlinkage unsigned long ia64_brk (unsigned long brk) { - unsigned long rlim, retval, newbrk, oldbrk; - struct mm_struct *mm = current->mm; - - /* - * Most of this replicates the code in sys_brk() except for an additional safety - * check and the clearing of r8. However, we can't call sys_brk() because we need - * to acquire the mmap_sem before we can do the test... - */ - down_write(&mm->mmap_sem); - - if (brk < mm->end_code) - goto out; - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(mm->brk); - if (oldbrk == newbrk) - goto set_brk; - - /* Always allow shrinking brk. */ - if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk)) - goto set_brk; - goto out; - } - - /* Check against unimplemented/unmapped addresses: */ - if ((newbrk - oldbrk) > RGN_MAP_LIMIT || REGION_OFFSET(newbrk) > RGN_MAP_LIMIT) - goto out; - - /* Check against rlimit.. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; - if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) - goto out; - - /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) - goto out; - - /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) - goto out; -set_brk: - mm->brk = brk; -out: - retval = mm->brk; - up_write(&mm->mmap_sem); + unsigned long retval = sys_brk(brk); force_successful_syscall_return(); return retval; } @@ -149,13 +97,13 @@ out: * and r9) as this is faster than doing a copy_to_user(). */ asmlinkage long -sys_pipe (void) +sys_ia64_pipe (void) { - struct pt_regs *regs = ia64_task_regs(current); + struct pt_regs *regs = task_pt_regs(current); int fd[2]; int retval; - retval = do_pipe(fd); + retval = do_pipe_flags(fd, 0); if (retval) goto out; retval = fd[0]; @@ -164,49 +112,20 @@ sys_pipe (void) return retval; } -static inline unsigned long -do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, unsigned long pgoff) +int ia64_mmap_check(unsigned long addr, unsigned long len, + unsigned long flags) { unsigned long roff; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return -EBADF; - - if (!file->f_op || !file->f_op->mmap) { - addr = -ENODEV; - goto out; - } - } - - /* Careful about overflows.. */ - len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) { - addr = -EINVAL; - goto out; - } /* - * Don't permit mappings into unmapped space, the virtual page table of a region, - * or across a region boundary. Note: RGN_MAP_LIMIT is equal to 2^n-PAGE_SIZE - * (for some integer n <= 61) and len > 0. + * Don't permit mappings into unmapped space, the virtual page table + * of a region, or across a region boundary. Note: RGN_MAP_LIMIT is + * equal to 2^n-PAGE_SIZE (for some integer n <= 61) and len > 0. */ roff = REGION_OFFSET(addr); - if ((len > RGN_MAP_LIMIT) || (roff > (RGN_MAP_LIMIT - len))) { - addr = -EINVAL; - goto out; - } - - down_write(¤t->mm->mmap_sem); - addr = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - -out: if (file) - fput(file); - return addr; + if ((len > RGN_MAP_LIMIT) || (roff > (RGN_MAP_LIMIT - len))) + return -EINVAL; + return 0; } /* @@ -217,7 +136,7 @@ out: if (file) asmlinkage unsigned long sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff) { - addr = do_mmap2(addr, len, prot, flags, fd, pgoff); + addr = sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr; @@ -229,7 +148,7 @@ sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, lo if (offset_in_page(off) != 0) return -EINVAL; - addr = do_mmap2(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + addr = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr; @@ -239,22 +158,9 @@ asmlinkage unsigned long ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { - extern unsigned long do_mremap (unsigned long addr, - unsigned long old_len, - unsigned long new_len, - unsigned long flags, - unsigned long new_addr); - - down_write(¤t->mm->mmap_sem); - { - addr = do_mremap(addr, old_len, new_len, flags, new_addr); - } - up_write(¤t->mm->mmap_sem); - - if (IS_ERR((void *) addr)) - return addr; - - force_successful_syscall_return(); + addr = sys_mremap(addr, old_len, new_len, flags, new_addr); + if (!IS_ERR((void *) addr)) + force_successful_syscall_return(); return addr; } diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 5b7e736f3b4..71c52bc7c28 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -8,7 +8,6 @@ * Copyright (C) 1999-2000 VA Linux Systems * Copyright (C) 1999-2000 Walt Drummond <drummond@valinux.com> */ -#include <linux/config.h> #include <linux/cpu.h> #include <linux/init.h> @@ -19,20 +18,27 @@ #include <linux/time.h> #include <linux/interrupt.h> #include <linux/efi.h> -#include <linux/profile.h> #include <linux/timex.h> +#include <linux/timekeeper_internal.h> +#include <linux/platform_device.h> #include <asm/machvec.h> #include <asm/delay.h> #include <asm/hw_irq.h> +#include <asm/paravirt.h> #include <asm/ptrace.h> #include <asm/sal.h> #include <asm/sections.h> -#include <asm/system.h> -extern unsigned long wall_jiffies; +#include "fsyscall_gtod_data.h" + +static cycle_t itc_get_cycles(struct clocksource *cs); + +struct fsyscall_gtod_data_t fsyscall_gtod_data; + +struct itc_jitter_data_t itc_jitter_data; -#define TIME_KEEPER_ID 0 /* smp_processor_id() of time-keeper */ +volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */ #ifdef CONFIG_IA64_DEBUG_IRQ @@ -41,22 +47,114 @@ EXPORT_SYMBOL(last_cli_ip); #endif -static struct time_interpolator itc_interpolator = { - .shift = 16, - .mask = 0xffffffffffffffffLL, - .source = TIME_SOURCE_CPU +#ifdef CONFIG_PARAVIRT +/* We need to define a real function for sched_clock, to override the + weak default version */ +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#endif + +#ifdef CONFIG_PARAVIRT +static void +paravirt_clocksource_resume(struct clocksource *cs) +{ + if (pv_time_ops.clocksource_resume) + pv_time_ops.clocksource_resume(); +} +#endif + +static struct clocksource clocksource_itc = { + .name = "itc", + .rating = 350, + .read = itc_get_cycles, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +#ifdef CONFIG_PARAVIRT + .resume = paravirt_clocksource_resume, +#endif }; +static struct clocksource *itc_clocksource; + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + +#include <linux/kernel_stat.h> + +extern cputime_t cycle_to_cputime(u64 cyc); + +void vtime_account_user(struct task_struct *tsk) +{ + cputime_t delta_utime; + struct thread_info *ti = task_thread_info(tsk); + + if (ti->ac_utime) { + delta_utime = cycle_to_cputime(ti->ac_utime); + account_user_time(tsk, delta_utime, delta_utime); + ti->ac_utime = 0; + } +} + +/* + * Called from the context switch with interrupts disabled, to charge all + * accumulated times to the current process, and to prepare accounting on + * the next process. + */ +void arch_vtime_task_switch(struct task_struct *prev) +{ + struct thread_info *pi = task_thread_info(prev); + struct thread_info *ni = task_thread_info(current); + + pi->ac_stamp = ni->ac_stamp; + ni->ac_stime = ni->ac_utime = 0; +} + +/* + * Account time for a transition between system, hard irq or soft irq state. + * Note that this function is called with interrupts enabled. + */ +static cputime_t vtime_delta(struct task_struct *tsk) +{ + struct thread_info *ti = task_thread_info(tsk); + cputime_t delta_stime; + __u64 now; + + WARN_ON_ONCE(!irqs_disabled()); + + now = ia64_get_itc(); + + delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); + ti->ac_stime = 0; + ti->ac_stamp = now; + + return delta_stime; +} + +void vtime_account_system(struct task_struct *tsk) +{ + cputime_t delta = vtime_delta(tsk); + + account_system_time(tsk, 0, delta, delta); +} +EXPORT_SYMBOL_GPL(vtime_account_system); + +void vtime_account_idle(struct task_struct *tsk) +{ + account_idle_time(vtime_delta(tsk)); +} + +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static irqreturn_t -timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) +timer_interrupt (int irq, void *dev_id) { unsigned long new_itm; - if (unlikely(cpu_is_offline(smp_processor_id()))) { + if (cpu_is_offline(smp_processor_id())) { return IRQ_HANDLED; } - platform_timer_interrupt(irq, dev_id, regs); + platform_timer_interrupt(irq, dev_id); new_itm = local_cpu_data->itm_next; @@ -64,38 +162,40 @@ timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) printk(KERN_ERR "Oops: timer tick before it's due (itc=%lx,itm=%lx)\n", ia64_get_itc(), new_itm); - profile_tick(CPU_PROFILING, regs); + profile_tick(CPU_PROFILING); + + if (paravirt_do_steal_accounting(&new_itm)) + goto skip_process_time_accounting; while (1) { - update_process_times(user_mode(regs)); + update_process_times(user_mode(get_irq_regs())); new_itm += local_cpu_data->itm_delta; - if (smp_processor_id() == TIME_KEEPER_ID) { - /* - * Here we are in the timer irq handler. We have irqs locally - * disabled, but we don't know if the timer_bh is running on - * another CPU. We need to avoid to SMP race by acquiring the - * xtime_lock. - */ - write_seqlock(&xtime_lock); - do_timer(regs); - local_cpu_data->itm_next = new_itm; - write_sequnlock(&xtime_lock); - } else - local_cpu_data->itm_next = new_itm; + if (smp_processor_id() == time_keeper_id) + xtime_update(1); + + local_cpu_data->itm_next = new_itm; if (time_after(new_itm, ia64_get_itc())) break; + + /* + * Allow IPIs to interrupt the timer loop. + */ + local_irq_enable(); + local_irq_disable(); } +skip_process_time_accounting: + do { /* * If we're too close to the next clock tick for * comfort, we increase the safety margin by * intentionally dropping the next tick(s). We do NOT * update itm.next because that would force us to call - * do_timer() which in turn would let our clock run + * xtime_update() which in turn would let our clock run * too fast (with the potentially devastating effect * of losing monotony of time). */ @@ -144,8 +244,7 @@ static int __init nojitter_setup(char *str) __setup("nojitter", nojitter_setup); -void __devinit -ia64_init_itm (void) +void ia64_init_itm(void) { unsigned long platform_base_freq, itc_freq; struct pal_freq_ratio itc_ratio, proc_ratio; @@ -188,7 +287,7 @@ ia64_init_itm (void) itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den; local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ; - printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%lu/%lu, " + printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%u/%u, " "ITC freq=%lu.%03luMHz", smp_processor_id(), platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000, itc_ratio.num, itc_ratio.den, itc_freq / 1000000, (itc_freq / 1000) % 1000); @@ -208,8 +307,6 @@ ia64_init_itm (void) + itc_freq/2)/itc_freq; if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) { - itc_interpolator.frequency = local_cpu_data->itc_freq; - itc_interpolator.drift = itc_drift; #ifdef CONFIG_SMP /* On IA64 in an SMP configuration ITCs are never accurately synchronized. * Jitter compensation requires a cmpxchg which may limit @@ -221,31 +318,154 @@ ia64_init_itm (void) * even going backward) if the ITC offsets between the individual CPUs * are too large. */ - if (!nojitter) itc_interpolator.jitter = 1; + if (!nojitter) + itc_jitter_data.itc_jitter = 1; #endif - register_time_interpolator(&itc_interpolator); - } + } else + /* + * ITC is drifty and we have not synchronized the ITCs in smpboot.c. + * ITC values may fluctuate significantly between processors. + * Clock should not be used for hrtimers. Mark itc as only + * useful for boot and testing. + * + * Note that jitter compensation is off! There is no point of + * synchronizing ITCs since they may be large differentials + * that change over time. + * + * The only way to fix this would be to repeatedly sync the + * ITCs. Until that time we have to avoid ITC. + */ + clocksource_itc.rating = 50; + + paravirt_init_missing_ticks_accounting(smp_processor_id()); + + /* avoid softlock up message when cpu is unplug and plugged again. */ + touch_softlockup_watchdog(); /* Setup the CPU local timer tick */ ia64_cpu_local_tick(); + + if (!itc_clocksource) { + clocksource_register_hz(&clocksource_itc, + local_cpu_data->itc_freq); + itc_clocksource = &clocksource_itc; + } +} + +static cycle_t itc_get_cycles(struct clocksource *cs) +{ + unsigned long lcycle, now, ret; + + if (!itc_jitter_data.itc_jitter) + return get_cycles(); + + lcycle = itc_jitter_data.itc_lastcycle; + now = get_cycles(); + if (lcycle && time_after(lcycle, now)) + return lcycle; + + /* + * Keep track of the last timer value returned. + * In an SMP environment, you could lose out in contention of + * cmpxchg. If so, your cmpxchg returns new value which the + * winner of contention updated to. Use the new value instead. + */ + ret = cmpxchg(&itc_jitter_data.itc_lastcycle, lcycle, now); + if (unlikely(ret != lcycle)) + return ret; + + return now; } + static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = SA_INTERRUPT, + .flags = IRQF_IRQPOLL, .name = "timer" }; +static struct platform_device rtc_efi_dev = { + .name = "rtc-efi", + .id = -1, +}; + +static int __init rtc_init(void) +{ + if (platform_device_register(&rtc_efi_dev) < 0) + printk(KERN_ERR "unable to register rtc device...\n"); + + /* not necessarily an error */ + return 0; +} +module_init(rtc_init); + +void read_persistent_clock(struct timespec *ts) +{ + efi_gettimeofday(ts); +} + void __init time_init (void) { register_percpu_irq(IA64_TIMER_VECTOR, &timer_irqaction); - efi_gettimeofday(&xtime); ia64_init_itm(); +} - /* - * Initialize wall_to_monotonic such that adding it to xtime will yield zero, the - * tv_nsec field must be normalized (i.e., 0 <= nsec < NSEC_PER_SEC). - */ - set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); +/* + * Generic udelay assumes that if preemption is allowed and the thread + * migrates to another CPU, that the ITC values are synchronized across + * all CPUs. + */ +static void +ia64_itc_udelay (unsigned long usecs) +{ + unsigned long start = ia64_get_itc(); + unsigned long end = start + usecs*local_cpu_data->cyc_per_usec; + + while (time_before(ia64_get_itc(), end)) + cpu_relax(); +} + +void (*ia64_udelay)(unsigned long usecs) = &ia64_itc_udelay; + +void +udelay (unsigned long usecs) +{ + (*ia64_udelay)(usecs); +} +EXPORT_SYMBOL(udelay); + +/* IA64 doesn't cache the timezone */ +void update_vsyscall_tz(void) +{ } + +void update_vsyscall_old(struct timespec *wall, struct timespec *wtm, + struct clocksource *c, u32 mult) +{ + write_seqcount_begin(&fsyscall_gtod_data.seq); + + /* copy fsyscall clock data */ + fsyscall_gtod_data.clk_mask = c->mask; + fsyscall_gtod_data.clk_mult = mult; + fsyscall_gtod_data.clk_shift = c->shift; + fsyscall_gtod_data.clk_fsys_mmio = c->archdata.fsys_mmio; + fsyscall_gtod_data.clk_cycle_last = c->cycle_last; + + /* copy kernel time structures */ + fsyscall_gtod_data.wall_time.tv_sec = wall->tv_sec; + fsyscall_gtod_data.wall_time.tv_nsec = wall->tv_nsec; + fsyscall_gtod_data.monotonic_time.tv_sec = wtm->tv_sec + + wall->tv_sec; + fsyscall_gtod_data.monotonic_time.tv_nsec = wtm->tv_nsec + + wall->tv_nsec; + + /* normalize */ + while (fsyscall_gtod_data.monotonic_time.tv_nsec >= NSEC_PER_SEC) { + fsyscall_gtod_data.monotonic_time.tv_nsec -= NSEC_PER_SEC; + fsyscall_gtod_data.monotonic_time.tv_sec++; + } + + write_seqcount_end(&fsyscall_gtod_data.seq); +} + diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 706b7734e19..f295f9abba4 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -9,60 +9,65 @@ * 2002/08/07 Erich Focht <efocht@ess.nec.de> * Populate cpu entries in sysfs for non-numa systems as well * Intel Corporation - Ashok Raj + * 02/27/2006 Zhang, Yanmin + * Populate cpu cache entries in sysfs for cpu cache info */ -#include <linux/config.h> #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/node.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/bootmem.h> #include <linux/nodemask.h> +#include <linux/notifier.h> +#include <linux/export.h> #include <asm/mmzone.h> #include <asm/numa.h> #include <asm/cpu.h> -#ifdef CONFIG_NUMA -static struct node *sysfs_nodes; -#endif static struct ia64_cpu *sysfs_cpus; -int arch_register_cpu(int num) +void arch_fix_phys_package_id(int num, u32 slot) { - struct node *parent = NULL; - -#ifdef CONFIG_NUMA - parent = &sysfs_nodes[cpu_to_node(num)]; -#endif /* CONFIG_NUMA */ +#ifdef CONFIG_SMP + if (cpu_data(num)->socket_id == -1) + cpu_data(num)->socket_id = slot; +#endif +} +EXPORT_SYMBOL_GPL(arch_fix_phys_package_id); + +#ifdef CONFIG_HOTPLUG_CPU +int __ref arch_register_cpu(int num) +{ #ifdef CONFIG_ACPI /* - * If CPEI cannot be re-targetted, and this is - * CPEI target, then dont create the control file + * If CPEI can be re-targeted or if this is not + * CPEI target, then it is hotpluggable */ - if (!can_cpei_retarget() && is_cpu_cpei_target(num)) - sysfs_cpus[num].cpu.no_control = 1; + if (can_cpei_retarget() || !is_cpu_cpei_target(num)) + sysfs_cpus[num].cpu.hotpluggable = 1; + map_cpu_to_node(num, node_cpuid[num].nid); #endif - - return register_cpu(&sysfs_cpus[num].cpu, num, parent); + return register_cpu(&sysfs_cpus[num].cpu, num); } +EXPORT_SYMBOL(arch_register_cpu); -#ifdef CONFIG_HOTPLUG_CPU - -void arch_unregister_cpu(int num) +void __ref arch_unregister_cpu(int num) { - struct node *parent = NULL; - -#ifdef CONFIG_NUMA - int node = cpu_to_node(num); - parent = &sysfs_nodes[node]; -#endif /* CONFIG_NUMA */ - - return unregister_cpu(&sysfs_cpus[num].cpu, parent); + unregister_cpu(&sysfs_cpus[num].cpu); +#ifdef CONFIG_ACPI + unmap_cpu_from_node(num, cpu_to_node(num)); +#endif } -EXPORT_SYMBOL(arch_register_cpu); EXPORT_SYMBOL(arch_unregister_cpu); +#else +static int __init arch_register_cpu(int num) +{ + return register_cpu(&sysfs_cpus[num].cpu, num); +} #endif /*CONFIG_HOTPLUG_CPU*/ @@ -71,31 +76,397 @@ static int __init topology_init(void) int i, err = 0; #ifdef CONFIG_NUMA - sysfs_nodes = kmalloc(sizeof(struct node) * MAX_NUMNODES, GFP_KERNEL); - if (!sysfs_nodes) { - err = -ENOMEM; - goto out; - } - memset(sysfs_nodes, 0, sizeof(struct node) * MAX_NUMNODES); - - /* MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes? */ - for_each_online_node(i) - if ((err = register_node(&sysfs_nodes[i], i, 0))) + /* + * MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes? + */ + for_each_online_node(i) { + if ((err = register_one_node(i))) goto out; + } #endif - sysfs_cpus = kmalloc(sizeof(struct ia64_cpu) * NR_CPUS, GFP_KERNEL); - if (!sysfs_cpus) { - err = -ENOMEM; - goto out; - } - memset(sysfs_cpus, 0, sizeof(struct ia64_cpu) * NR_CPUS); + sysfs_cpus = kzalloc(sizeof(struct ia64_cpu) * NR_CPUS, GFP_KERNEL); + if (!sysfs_cpus) + panic("kzalloc in topology_init failed - NR_CPUS too big?"); - for_each_present_cpu(i) + for_each_present_cpu(i) { if((err = arch_register_cpu(i))) goto out; + } out: return err; } -__initcall(topology_init); +subsys_initcall(topology_init); + + +/* + * Export cpu cache information through sysfs + */ + +/* + * A bunch of string array to get pretty printing + */ +static const char *cache_types[] = { + "", /* not used */ + "Instruction", + "Data", + "Unified" /* unified */ +}; + +static const char *cache_mattrib[]={ + "WriteThrough", + "WriteBack", + "", /* reserved */ + "" /* reserved */ +}; + +struct cache_info { + pal_cache_config_info_t cci; + cpumask_t shared_cpu_map; + int level; + int type; + struct kobject kobj; +}; + +struct cpu_cache_info { + struct cache_info *cache_leaves; + int num_cache_leaves; + struct kobject kobj; +}; + +static struct cpu_cache_info all_cpu_cache_info[NR_CPUS]; +#define LEAF_KOBJECT_PTR(x,y) (&all_cpu_cache_info[x].cache_leaves[y]) + +#ifdef CONFIG_SMP +static void cache_shared_cpu_map_setup(unsigned int cpu, + struct cache_info * this_leaf) +{ + pal_cache_shared_info_t csi; + int num_shared, i = 0; + unsigned int j; + + if (cpu_data(cpu)->threads_per_core <= 1 && + cpu_data(cpu)->cores_per_socket <= 1) { + cpu_set(cpu, this_leaf->shared_cpu_map); + return; + } + + if (ia64_pal_cache_shared_info(this_leaf->level, + this_leaf->type, + 0, + &csi) != PAL_STATUS_SUCCESS) + return; + + num_shared = (int) csi.num_shared; + do { + for_each_possible_cpu(j) + if (cpu_data(cpu)->socket_id == cpu_data(j)->socket_id + && cpu_data(j)->core_id == csi.log1_cid + && cpu_data(j)->thread_id == csi.log1_tid) + cpu_set(j, this_leaf->shared_cpu_map); + + i++; + } while (i < num_shared && + ia64_pal_cache_shared_info(this_leaf->level, + this_leaf->type, + i, + &csi) == PAL_STATUS_SUCCESS); +} +#else +static void cache_shared_cpu_map_setup(unsigned int cpu, + struct cache_info * this_leaf) +{ + cpu_set(cpu, this_leaf->shared_cpu_map); + return; +} +#endif + +static ssize_t show_coherency_line_size(struct cache_info *this_leaf, + char *buf) +{ + return sprintf(buf, "%u\n", 1 << this_leaf->cci.pcci_line_size); +} + +static ssize_t show_ways_of_associativity(struct cache_info *this_leaf, + char *buf) +{ + return sprintf(buf, "%u\n", this_leaf->cci.pcci_assoc); +} + +static ssize_t show_attributes(struct cache_info *this_leaf, char *buf) +{ + return sprintf(buf, + "%s\n", + cache_mattrib[this_leaf->cci.pcci_cache_attr]); +} + +static ssize_t show_size(struct cache_info *this_leaf, char *buf) +{ + return sprintf(buf, "%uK\n", this_leaf->cci.pcci_cache_size / 1024); +} + +static ssize_t show_number_of_sets(struct cache_info *this_leaf, char *buf) +{ + unsigned number_of_sets = this_leaf->cci.pcci_cache_size; + number_of_sets /= this_leaf->cci.pcci_assoc; + number_of_sets /= 1 << this_leaf->cci.pcci_line_size; + + return sprintf(buf, "%u\n", number_of_sets); +} + +static ssize_t show_shared_cpu_map(struct cache_info *this_leaf, char *buf) +{ + ssize_t len; + cpumask_t shared_cpu_map; + + cpumask_and(&shared_cpu_map, + &this_leaf->shared_cpu_map, cpu_online_mask); + len = cpumask_scnprintf(buf, NR_CPUS+1, &shared_cpu_map); + len += sprintf(buf+len, "\n"); + return len; +} + +static ssize_t show_type(struct cache_info *this_leaf, char *buf) +{ + int type = this_leaf->type + this_leaf->cci.pcci_unified; + return sprintf(buf, "%s\n", cache_types[type]); +} + +static ssize_t show_level(struct cache_info *this_leaf, char *buf) +{ + return sprintf(buf, "%u\n", this_leaf->level); +} + +struct cache_attr { + struct attribute attr; + ssize_t (*show)(struct cache_info *, char *); + ssize_t (*store)(struct cache_info *, const char *, size_t count); +}; + +#ifdef define_one_ro + #undef define_one_ro +#endif +#define define_one_ro(_name) \ + static struct cache_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +define_one_ro(level); +define_one_ro(type); +define_one_ro(coherency_line_size); +define_one_ro(ways_of_associativity); +define_one_ro(size); +define_one_ro(number_of_sets); +define_one_ro(shared_cpu_map); +define_one_ro(attributes); + +static struct attribute * cache_default_attrs[] = { + &type.attr, + &level.attr, + &coherency_line_size.attr, + &ways_of_associativity.attr, + &attributes.attr, + &size.attr, + &number_of_sets.attr, + &shared_cpu_map.attr, + NULL +}; + +#define to_object(k) container_of(k, struct cache_info, kobj) +#define to_attr(a) container_of(a, struct cache_attr, attr) + +static ssize_t ia64_cache_show(struct kobject * kobj, struct attribute * attr, char * buf) +{ + struct cache_attr *fattr = to_attr(attr); + struct cache_info *this_leaf = to_object(kobj); + ssize_t ret; + + ret = fattr->show ? fattr->show(this_leaf, buf) : 0; + return ret; +} + +static const struct sysfs_ops cache_sysfs_ops = { + .show = ia64_cache_show +}; + +static struct kobj_type cache_ktype = { + .sysfs_ops = &cache_sysfs_ops, + .default_attrs = cache_default_attrs, +}; + +static struct kobj_type cache_ktype_percpu_entry = { + .sysfs_ops = &cache_sysfs_ops, +}; + +static void cpu_cache_sysfs_exit(unsigned int cpu) +{ + kfree(all_cpu_cache_info[cpu].cache_leaves); + all_cpu_cache_info[cpu].cache_leaves = NULL; + all_cpu_cache_info[cpu].num_cache_leaves = 0; + memset(&all_cpu_cache_info[cpu].kobj, 0, sizeof(struct kobject)); + return; +} + +static int cpu_cache_sysfs_init(unsigned int cpu) +{ + unsigned long i, levels, unique_caches; + pal_cache_config_info_t cci; + int j; + long status; + struct cache_info *this_cache; + int num_cache_leaves = 0; + + if ((status = ia64_pal_cache_summary(&levels, &unique_caches)) != 0) { + printk(KERN_ERR "ia64_pal_cache_summary=%ld\n", status); + return -1; + } + + this_cache=kzalloc(sizeof(struct cache_info)*unique_caches, + GFP_KERNEL); + if (this_cache == NULL) + return -ENOMEM; + + for (i=0; i < levels; i++) { + for (j=2; j >0 ; j--) { + if ((status=ia64_pal_cache_config_info(i,j, &cci)) != + PAL_STATUS_SUCCESS) + continue; + + this_cache[num_cache_leaves].cci = cci; + this_cache[num_cache_leaves].level = i + 1; + this_cache[num_cache_leaves].type = j; + + cache_shared_cpu_map_setup(cpu, + &this_cache[num_cache_leaves]); + num_cache_leaves ++; + } + } + + all_cpu_cache_info[cpu].cache_leaves = this_cache; + all_cpu_cache_info[cpu].num_cache_leaves = num_cache_leaves; + + memset(&all_cpu_cache_info[cpu].kobj, 0, sizeof(struct kobject)); + + return 0; +} + +/* Add cache interface for CPU device */ +static int cache_add_dev(struct device *sys_dev) +{ + unsigned int cpu = sys_dev->id; + unsigned long i, j; + struct cache_info *this_object; + int retval = 0; + cpumask_t oldmask; + + if (all_cpu_cache_info[cpu].kobj.parent) + return 0; + + oldmask = current->cpus_allowed; + retval = set_cpus_allowed_ptr(current, cpumask_of(cpu)); + if (unlikely(retval)) + return retval; + + retval = cpu_cache_sysfs_init(cpu); + set_cpus_allowed_ptr(current, &oldmask); + if (unlikely(retval < 0)) + return retval; + + retval = kobject_init_and_add(&all_cpu_cache_info[cpu].kobj, + &cache_ktype_percpu_entry, &sys_dev->kobj, + "%s", "cache"); + if (unlikely(retval < 0)) { + cpu_cache_sysfs_exit(cpu); + return retval; + } + + for (i = 0; i < all_cpu_cache_info[cpu].num_cache_leaves; i++) { + this_object = LEAF_KOBJECT_PTR(cpu,i); + retval = kobject_init_and_add(&(this_object->kobj), + &cache_ktype, + &all_cpu_cache_info[cpu].kobj, + "index%1lu", i); + if (unlikely(retval)) { + for (j = 0; j < i; j++) { + kobject_put(&(LEAF_KOBJECT_PTR(cpu,j)->kobj)); + } + kobject_put(&all_cpu_cache_info[cpu].kobj); + cpu_cache_sysfs_exit(cpu); + return retval; + } + kobject_uevent(&(this_object->kobj), KOBJ_ADD); + } + kobject_uevent(&all_cpu_cache_info[cpu].kobj, KOBJ_ADD); + return retval; +} + +/* Remove cache interface for CPU device */ +static int cache_remove_dev(struct device *sys_dev) +{ + unsigned int cpu = sys_dev->id; + unsigned long i; + + for (i = 0; i < all_cpu_cache_info[cpu].num_cache_leaves; i++) + kobject_put(&(LEAF_KOBJECT_PTR(cpu,i)->kobj)); + + if (all_cpu_cache_info[cpu].kobj.parent) { + kobject_put(&all_cpu_cache_info[cpu].kobj); + memset(&all_cpu_cache_info[cpu].kobj, + 0, + sizeof(struct kobject)); + } + + cpu_cache_sysfs_exit(cpu); + + return 0; +} + +/* + * When a cpu is hot-plugged, do a check and initiate + * cache kobject if necessary + */ +static int cache_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct device *sys_dev; + + sys_dev = get_cpu_device(cpu); + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + cache_add_dev(sys_dev); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + cache_remove_dev(sys_dev); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block cache_cpu_notifier = +{ + .notifier_call = cache_cpu_callback +}; + +static int __init cache_sysfs_init(void) +{ + int i; + + cpu_notifier_register_begin(); + + for_each_online_cpu(i) { + struct device *sys_dev = get_cpu_device((unsigned int)i); + cache_add_dev(sys_dev); + } + + __register_hotcpu_notifier(&cache_cpu_notifier); + + cpu_notifier_register_done(); + + return 0; +} + +device_initcall(cache_sysfs_init); + diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index f970359e7ed..d3636e67a98 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -7,7 +7,6 @@ * 05/12/00 grao <goutham.rao@intel.com> : added isr in siginfo for SIGFPE */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/sched.h> @@ -16,32 +15,18 @@ #include <linux/module.h> /* for EXPORT_SYMBOL */ #include <linux/hardirq.h> #include <linux/kprobes.h> +#include <linux/delay.h> /* for ssleep() */ +#include <linux/kdebug.h> #include <asm/fpswa.h> -#include <asm/ia32.h> #include <asm/intrinsics.h> #include <asm/processor.h> #include <asm/uaccess.h> -#include <asm/kdebug.h> - -extern spinlock_t timerlist_lock; +#include <asm/setup.h> fpswa_interface_t *fpswa_interface; EXPORT_SYMBOL(fpswa_interface); -struct notifier_block *ia64die_chain; -static DEFINE_SPINLOCK(die_notifier_lock); - -int register_die_notifier(struct notifier_block *nb) -{ - int err = 0; - unsigned long flags; - spin_lock_irqsave(&die_notifier_lock, flags); - err = notifier_chain_register(&ia64die_chain, nb); - spin_unlock_irqrestore(&die_notifier_lock, flags); - return err; -} - void __init trap_init (void) { @@ -50,35 +35,7 @@ trap_init (void) fpswa_interface = __va(ia64_boot_param->fpswa); } -/* - * Unlock any spinlocks which will prevent us from getting the message out (timerlist_lock - * is acquired through the console unblank code) - */ -void -bust_spinlocks (int yes) -{ - int loglevel_save = console_loglevel; - - if (yes) { - oops_in_progress = 1; - return; - } - -#ifdef CONFIG_VT - unblank_screen(); -#endif - oops_in_progress = 0; - /* - * OK, the message is on the console. Now we call printk() without - * oops_in_progress set so that printk will give klogd a poke. Hold onto - * your hats... - */ - console_loglevel = 15; /* NMI oopser may have shut the console up */ - printk(" "); - console_loglevel = loglevel_save; -} - -void +int die (const char *str, struct pt_regs *regs, long err) { static struct { @@ -86,9 +43,9 @@ die (const char *str, struct pt_regs *regs, long err) u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, - .lock_owner = -1, - .lock_owner_depth = 0 + .lock = __SPIN_LOCK_UNLOCKED(die.lock), + .lock_owner = -1, + .lock_owner_depth = 0 }; static int die_counter; int cpu = get_cpu(); @@ -104,22 +61,36 @@ die (const char *str, struct pt_regs *regs, long err) if (++die.lock_owner_depth < 3) { printk("%s[%d]: %s %ld [%d]\n", - current->comm, current->pid, str, err, ++die_counter); - show_regs(regs); + current->comm, task_pid_nr(current), str, err, ++die_counter); + if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) + != NOTIFY_STOP) + show_regs(regs); + else + regs = NULL; } else printk(KERN_ERR "Recursive die() failure, output suppressed\n"); bust_spinlocks(0); die.lock_owner = -1; + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irq(&die.lock); + + if (!regs) + return 1; + + if (panic_on_oops) + panic("Fatal exception"); + do_exit(SIGSEGV); + return 0; } -void +int die_if_kernel (char *str, struct pt_regs *regs, long err) { if (!user_mode(regs)) - die(str, regs, err); + return die(str, regs, err); + return 0; } void @@ -128,24 +99,6 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs) siginfo_t siginfo; int sig, code; - /* break.b always sets cr.iim to 0, which causes problems for - * debuggers. Get the real break number from the original instruction, - * but only for kernel code. User space break.b is left alone, to - * preserve the existing behaviour. All break codings have the same - * format, so there is no need to check the slot type. - */ - if (break_num == 0 && !user_mode(regs)) { - struct ia64_psr *ipsr = ia64_psr(regs); - unsigned long *bundle = (unsigned long *)regs->cr_iip; - unsigned long slot; - switch (ipsr->ri) { - case 0: slot = (bundle[0] >> 5); break; - case 1: slot = (bundle[0] >> 46) | (bundle[1] << 18); break; - default: slot = (bundle[1] >> 23); break; - } - break_num = ((slot >> 36 & 1) << 20) | (slot >> 6 & 0xfffff); - } - /* SIGILL, SIGFPE, SIGSEGV, and SIGBUS want these field initialized: */ siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri); siginfo.si_imm = break_num; @@ -155,10 +108,10 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs) switch (break_num) { case 0: /* unknown error (used by GCC for __builtin_abort()) */ if (notify_die(DIE_BREAK, "break 0", regs, break_num, TRAP_BRKPT, SIGTRAP) - == NOTIFY_STOP) { + == NOTIFY_STOP) + return; + if (die_if_kernel("bugcheck!", regs, break_num)) return; - } - die_if_kernel("bugcheck!", regs, break_num); sig = SIGILL; code = ILL_ILLOPC; break; @@ -210,22 +163,17 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs) sig = SIGILL; code = __ILL_BNDMOD; break; - case 0x80200: - case 0x80300: - if (notify_die(DIE_BREAK, "kprobe", regs, break_num, TRAP_BRKPT, SIGTRAP) - == NOTIFY_STOP) { - return; - } - sig = SIGTRAP; code = TRAP_BRKPT; - break; - default: - if (break_num < 0x40000 || break_num > 0x100000) - die_if_kernel("Bad break", regs, break_num); + if ((break_num < 0x40000 || break_num > 0x100000) + && die_if_kernel("Bad break", regs, break_num)) + return; if (break_num < 0x80000) { sig = SIGILL; code = __ILL_BREAK; } else { + if (notify_die(DIE_BREAK, "bad break", regs, break_num, TRAP_BRKPT, SIGTRAP) + == NOTIFY_STOP) + return; sig = SIGTRAP; code = TRAP_BRKPT; } } @@ -324,6 +272,15 @@ fp_emulate (int fp_fault, void *bundle, long *ipsr, long *fpsr, long *isr, long return ret.status; } +struct fpu_swa_msg { + unsigned long count; + unsigned long time; +}; +static DEFINE_PER_CPU(struct fpu_swa_msg, cpulast); +DECLARE_PER_CPU(struct fpu_swa_msg, cpulast); +static struct fpu_swa_msg last __cacheline_aligned; + + /* * Handle floating-point assist faults and traps. */ @@ -333,8 +290,6 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr) long exception, bundle[2]; unsigned long fault_ip; struct siginfo siginfo; - static int fpu_swa_count = 0; - static unsigned long last_time; fault_ip = regs->cr_iip; if (!fp_fault && (ia64_psr(regs)->ri == 0)) @@ -342,14 +297,37 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr) if (copy_from_user(bundle, (void __user *) fault_ip, sizeof(bundle))) return -1; - if (jiffies - last_time > 5*HZ) - fpu_swa_count = 0; - if ((fpu_swa_count < 4) && !(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT)) { - last_time = jiffies; - ++fpu_swa_count; - printk(KERN_WARNING - "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n", - current->comm, current->pid, regs->cr_iip + ia64_psr(regs)->ri, isr); + if (!(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT)) { + unsigned long count, current_jiffies = jiffies; + struct fpu_swa_msg *cp = &__get_cpu_var(cpulast); + + if (unlikely(current_jiffies > cp->time)) + cp->count = 0; + if (unlikely(cp->count < 5)) { + cp->count++; + cp->time = current_jiffies + 5 * HZ; + + /* minimize races by grabbing a copy of count BEFORE checking last.time. */ + count = last.count; + barrier(); + + /* + * Lower 4 bits are used as a count. Upper bits are a sequence + * number that is updated when count is reset. The cmpxchg will + * fail is seqno has changed. This minimizes mutiple cpus + * resetting the count. + */ + if (current_jiffies > last.time) + (void) cmpxchg_acq(&last.count, count, 16 + (count & ~15)); + + /* used fetchadd to atomically update the count */ + if ((last.count & 15) < 5 && (ia64_fetchadd(1, &last.count, acq) & 15) < 5) { + last.time = current_jiffies + 5 * HZ; + printk(KERN_WARNING + "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n", + current->comm, task_pid_nr(current), regs->cr_iip + ia64_psr(regs)->ri, isr); + } + } } exception = fp_emulate(fp_fault, bundle, ®s->cr_ipsr, ®s->ar_fpsr, &isr, ®s->pr, @@ -434,14 +412,15 @@ ia64_illegal_op_fault (unsigned long ec, long arg1, long arg2, long arg3, #endif sprintf(buf, "IA-64 Illegal operation fault"); - die_if_kernel(buf, ®s, 0); + rv.fkt = 0; + if (die_if_kernel(buf, ®s, 0)) + return rv; memset(&si, 0, sizeof(si)); si.si_signo = SIGILL; si.si_code = ILL_ILLOPC; si.si_addr = (void __user *) (regs.cr_iip + ia64_psr(®s)->ri); force_sig_info(SIGILL, &si, current); - rv.fkt = 0; return rv; } @@ -485,7 +464,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, if (code == 8) { # ifdef CONFIG_IA64_PRINT_HAZARDS printk("%s[%d]: possible hazard @ ip=%016lx (pr = %016lx)\n", - current->comm, current->pid, + current->comm, task_pid_nr(current), regs.cr_iip + ia64_psr(®s)->ri, regs.pr); # endif return; @@ -551,12 +530,15 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, if (fsys_mode(current, ®s)) { extern char __kernel_syscall_via_break[]; /* - * Got a trap in fsys-mode: Taken Branch Trap and Single Step trap - * need special handling; Debug trap is not supposed to happen. + * Got a trap in fsys-mode: Taken Branch Trap + * and Single Step trap need special handling; + * Debug trap is ignored (we disable it here + * and re-enable it in the lower-privilege trap). */ if (unlikely(vector == 29)) { - die("Got debug trap in fsys-mode---not supposed to happen!", - ®s, 0); + set_thread_flag(TIF_DB_DISABLED); + ia64_psr(®s)->db = 0; + ia64_psr(®s)->lp = 1; return; } /* re-do the system call via break 0x100000: */ @@ -578,12 +560,11 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, #endif break; case 35: siginfo.si_code = TRAP_BRANCH; ifa = 0; break; - case 36: - if (notify_die(DIE_SS, "ss", ®s, vector, - vector, SIGTRAP) == NOTIFY_STOP) - return; - siginfo.si_code = TRAP_TRACE; ifa = 0; break; + case 36: siginfo.si_code = TRAP_TRACE; ifa = 0; break; } + if (notify_die(DIE_FAULT, "ia64_fault", ®s, vector, siginfo.si_code, SIGTRAP) + == NOTIFY_STOP) + return; siginfo.si_signo = SIGTRAP; siginfo.si_errno = 0; siginfo.si_addr = (void __user *) ifa; @@ -611,10 +592,19 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, case 34: if (isr & 0x2) { /* Lower-Privilege Transfer Trap */ + + /* If we disabled debug traps during an fsyscall, + * re-enable them here. + */ + if (test_thread_flag(TIF_DB_DISABLED)) { + clear_thread_flag(TIF_DB_DISABLED); + ia64_psr(®s)->db = 1; + } + /* - * Just clear PSR.lp and then return immediately: all the - * interesting work (e.g., signal delivery is done in the kernel - * exit path). + * Just clear PSR.lp and then return immediately: + * all the interesting work (e.g., signal delivery) + * is done in the kernel exit path. */ ia64_psr(®s)->lp = 0; return; @@ -636,21 +626,13 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, break; case 45: -#ifdef CONFIG_IA32_SUPPORT - if (ia32_exception(®s, isr) == 0) - return; -#endif printk(KERN_ERR "Unexpected IA-32 exception (Trap 45)\n"); printk(KERN_ERR " iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx\n", iip, ifa, isr); force_sig(SIGSEGV, current); - break; + return; case 46: -#ifdef CONFIG_IA32_SUPPORT - if (ia32_intercept(®s, isr) == 0) - return; -#endif printk(KERN_ERR "Unexpected IA-32 intercept trap (Trap 46)\n"); printk(KERN_ERR " iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx, iim - 0x%lx\n", iip, ifa, isr, iim); @@ -665,6 +647,6 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, sprintf(buf, "Fault %lu", vector); break; } - die_if_kernel(buf, ®s, error); - force_sig(SIGILL, current); + if (!die_if_kernel(buf, ®s, error)) + force_sig(SIGILL, current); } diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c index 43b45b65ee5..622772b7fb6 100644 --- a/arch/ia64/kernel/unaligned.c +++ b/arch/ia64/kernel/unaligned.c @@ -13,10 +13,11 @@ * 2001/08/13 Correct size of extended floats (float_fsz) from 16 to 10 bytes. * 2001/01/17 Add support emulation of unaligned kernel accesses. */ +#include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/tty.h> +#include <linux/ratelimit.h> #include <asm/intrinsics.h> #include <asm/processor.h> @@ -24,12 +25,12 @@ #include <asm/uaccess.h> #include <asm/unaligned.h> -extern void die_if_kernel(char *str, struct pt_regs *regs, long err) __attribute__ ((noreturn)); +extern int die_if_kernel(char *str, struct pt_regs *regs, long err); #undef DEBUG_UNALIGNED_TRAP #ifdef DEBUG_UNALIGNED_TRAP -# define DPRINT(a...) do { printk("%s %u: ", __FUNCTION__, __LINE__); printk (a); } while (0) +# define DPRINT(a...) do { printk("%s %u: ", __func__, __LINE__); printk (a); } while (0) # define DDUMP(str,vp,len) dump(str, vp, len) static void @@ -53,6 +54,15 @@ dump (const char *str, void *vp, size_t len) #define SIGN_EXT9 0xffffffffffffff00ul /* + * sysctl settable hook which tells the kernel whether to honor the + * IA64_THREAD_UAC_NOPRINT prctl. Because this is user settable, we want + * to allow the super user to enable/disable this for security reasons + * (i.e. don't allow attacker to fill up logs with unaligned accesses). + */ +int no_unaligned_warning; +int unaligned_dump_stack; + +/* * For M-unit: * * opcode | m | x6 | @@ -666,9 +676,10 @@ emulate_load_updates (update_t type, load_store_t ld, struct pt_regs *regs, unsi * just in case. */ if (ld.x6_op == 1 || ld.x6_op == 3) { - printk(KERN_ERR "%s: register update on speculative load, error\n", __FUNCTION__); - die_if_kernel("unaligned reference on speculative load with register update\n", - regs, 30); + printk(KERN_ERR "%s: register update on speculative load, error\n", __func__); + if (die_if_kernel("unaligned reference on speculative load with register update\n", + regs, 30)) + return; } @@ -1095,7 +1106,7 @@ emulate_load_floatpair (unsigned long ifa, load_store_t ld, struct pt_regs *regs */ if (ld.x6_op == 1 || ld.x6_op == 3) printk(KERN_ERR "%s: register update on speculative load pair, error\n", - __FUNCTION__); + __func__); setreg(ld.r3, ifa, 0, regs); } @@ -1273,23 +1284,9 @@ emulate_store_float (unsigned long ifa, load_store_t ld, struct pt_regs *regs) /* * Make sure we log the unaligned access, so that user/sysadmin can notice it and * eventually fix the program. However, we don't want to do that for every access so we - * pace it with jiffies. This isn't really MP-safe, but it doesn't really have to be - * either... + * pace it with jiffies. */ -static int -within_logging_rate_limit (void) -{ - static unsigned long count, last_time; - - if (jiffies - last_time > 5*HZ) - count = 0; - if (++count < 5) { - last_time = jiffies; - return 1; - } - return 0; - -} +static DEFINE_RATELIMIT_STATE(logging_rate_limit, 5 * HZ, 5); void ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) @@ -1308,7 +1305,8 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) if (ia64_psr(regs)->be) { /* we don't support big-endian accesses */ - die_if_kernel("big-endian unaligned accesses are not supported", regs, 0); + if (die_if_kernel("big-endian unaligned accesses are not supported", regs, 0)) + return; goto force_sigbus; } @@ -1323,14 +1321,16 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) if ((current->thread.flags & IA64_THREAD_UAC_SIGBUS) != 0) goto force_sigbus; - if (!(current->thread.flags & IA64_THREAD_UAC_NOPRINT) - && within_logging_rate_limit()) + if (!no_unaligned_warning && + !(current->thread.flags & IA64_THREAD_UAC_NOPRINT) && + __ratelimit(&logging_rate_limit)) { char buf[200]; /* comm[] is at most 16 bytes... */ size_t len; len = sprintf(buf, "%s(%d): unaligned access to 0x%016lx, " - "ip=0x%016lx\n\r", current->comm, current->pid, + "ip=0x%016lx\n\r", current->comm, + task_pid_nr(current), ifa, regs->cr_iip + ipsr->ri); /* * Don't call tty_write_message() if we're in the kernel; we might @@ -1339,12 +1339,29 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) if (user_mode(regs)) tty_write_message(current->signal->tty, buf); buf[len-1] = '\0'; /* drop '\r' */ - printk(KERN_WARNING "%s", buf); /* watch for command names containing %s */ + /* watch for command names containing %s */ + printk(KERN_WARNING "%s", buf); + } else { + if (no_unaligned_warning) { + printk_once(KERN_WARNING "%s(%d) encountered an " + "unaligned exception which required\n" + "kernel assistance, which degrades " + "the performance of the application.\n" + "Unaligned exception warnings have " + "been disabled by the system " + "administrator\n" + "echo 0 > /proc/sys/kernel/ignore-" + "unaligned-usertrap to re-enable\n", + current->comm, task_pid_nr(current)); + } } } else { - if (within_logging_rate_limit()) + if (__ratelimit(&logging_rate_limit)) { printk(KERN_WARNING "kernel unaligned access to 0x%016lx, ip=0x%016lx\n", ifa, regs->cr_iip + ipsr->ri); + if (unaligned_dump_stack) + dump_stack(); + } set_fs(KERNEL_DS); } @@ -1462,16 +1479,19 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) case LDFA_OP: case LDFCCLR_OP: case LDFCNC_OP: - case LDF_IMM_OP: - case LDFA_IMM_OP: - case LDFCCLR_IMM_OP: - case LDFCNC_IMM_OP: if (u.insn.x) ret = emulate_load_floatpair(ifa, u.insn, regs); else ret = emulate_load_float(ifa, u.insn, regs); break; + case LDF_IMM_OP: + case LDFA_IMM_OP: + case LDFCCLR_IMM_OP: + case LDFCNC_IMM_OP: + ret = emulate_load_float(ifa, u.insn, regs); + break; + case STF_OP: case STF_IMM_OP: ret = emulate_store_float(ifa, u.insn, regs); @@ -1505,7 +1525,8 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) ia64_handle_exception(regs, eh); goto done; } - die_if_kernel("error during unaligned kernel access\n", regs, ret); + if (die_if_kernel("error during unaligned kernel access\n", regs, ret)) + return; /* NOT_REACHED */ } force_sigbus: diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index c6d40446c2c..20e8a9b21d7 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2001-2005 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001-2008 Silicon Graphics, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License @@ -18,79 +18,97 @@ #include <linux/init.h> #include <linux/errno.h> #include <linux/string.h> -#include <linux/slab.h> #include <linux/efi.h> #include <linux/genalloc.h> +#include <linux/gfp.h> #include <asm/page.h> #include <asm/pal.h> -#include <asm/system.h> #include <asm/pgtable.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/tlbflush.h> #include <asm/sn/arch.h> -#define DEBUG 0 -#if DEBUG -#define dprintk printk -#else -#define dprintk(x...) do { } while (0) -#endif +extern void __init efi_memmap_walk_uc(efi_freemem_callback_t, void *); -void __init efi_memmap_walk_uc (efi_freemem_callback_t callback); +struct uncached_pool { + struct gen_pool *pool; + struct mutex add_chunk_mutex; /* serialize adding a converted chunk */ + int nchunks_added; /* #of converted chunks added to pool */ + atomic_t status; /* smp called function's return status*/ +}; -#define MAX_UNCACHED_GRANULES 5 -static int allocated_granules; +#define MAX_CONVERTED_CHUNKS_PER_NODE 2 -struct gen_pool *uncached_pool[MAX_NUMNODES]; +struct uncached_pool uncached_pools[MAX_NUMNODES]; static void uncached_ipi_visibility(void *data) { int status; + struct uncached_pool *uc_pool = (struct uncached_pool *)data; status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL); if ((status != PAL_VISIBILITY_OK) && (status != PAL_VISIBILITY_OK_REMOTE_NEEDED)) - printk(KERN_DEBUG "pal_prefetch_visibility() returns %i on " - "CPU %i\n", status, get_cpu()); + atomic_inc(&uc_pool->status); } static void uncached_ipi_mc_drain(void *data) { int status; + struct uncached_pool *uc_pool = (struct uncached_pool *)data; + status = ia64_pal_mc_drain(); - if (status) - printk(KERN_WARNING "ia64_pal_mc_drain() failed with %i on " - "CPU %i\n", status, get_cpu()); + if (status != PAL_STATUS_SUCCESS) + atomic_inc(&uc_pool->status); } -static unsigned long -uncached_get_new_chunk(struct gen_pool *poolp) +/* + * Add a new chunk of uncached memory pages to the specified pool. + * + * @pool: pool to add new chunk of uncached memory to + * @nid: node id of node to allocate memory from, or -1 + * + * This is accomplished by first allocating a granule of cached memory pages + * and then converting them to uncached memory pages. + */ +static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) { struct page *page; - void *tmp; - int status, i; - unsigned long addr, node; + int status, i, nchunks_added = uc_pool->nchunks_added; + unsigned long c_addr, uc_addr; - if (allocated_granules >= MAX_UNCACHED_GRANULES) + if (mutex_lock_interruptible(&uc_pool->add_chunk_mutex) != 0) + return -1; /* interrupted by a signal */ + + if (uc_pool->nchunks_added > nchunks_added) { + /* someone added a new chunk while we were waiting */ + mutex_unlock(&uc_pool->add_chunk_mutex); return 0; + } + + if (uc_pool->nchunks_added >= MAX_CONVERTED_CHUNKS_PER_NODE) { + mutex_unlock(&uc_pool->add_chunk_mutex); + return -1; + } + + /* attempt to allocate a granule's worth of cached memory pages */ - node = poolp->private; - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, + page = alloc_pages_exact_node(nid, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, IA64_GRANULE_SHIFT-PAGE_SHIFT); + if (!page) { + mutex_unlock(&uc_pool->add_chunk_mutex); + return -1; + } - dprintk(KERN_INFO "get_new_chunk page %p, addr %lx\n", - page, (unsigned long)(page-vmem_map) << PAGE_SHIFT); + /* convert the memory pages from cached to uncached */ - /* - * Do magic if no mem on local node! XXX - */ - if (!page) - return 0; - tmp = page_address(page); + c_addr = (unsigned long)page_address(page); + uc_addr = c_addr - PAGE_OFFSET + __IA64_UNCACHED_OFFSET; /* * There's a small race here where it's possible for someone to @@ -100,76 +118,99 @@ uncached_get_new_chunk(struct gen_pool *poolp) for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++) SetPageUncached(&page[i]); - flush_tlb_kernel_range(tmp, tmp + IA64_GRANULE_SIZE); + flush_tlb_kernel_range(uc_addr, uc_addr + IA64_GRANULE_SIZE); status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL); + if (status == PAL_VISIBILITY_OK_REMOTE_NEEDED) { + atomic_set(&uc_pool->status, 0); + status = smp_call_function(uncached_ipi_visibility, uc_pool, 1); + if (status || atomic_read(&uc_pool->status)) + goto failed; + } else if (status != PAL_VISIBILITY_OK) + goto failed; - dprintk(KERN_INFO "pal_prefetch_visibility() returns %i on cpu %i\n", - status, get_cpu()); - - if (!status) { - status = smp_call_function(uncached_ipi_visibility, NULL, 0, 1); - if (status) - printk(KERN_WARNING "smp_call_function failed for " - "uncached_ipi_visibility! (%i)\n", status); - } + preempt_disable(); if (ia64_platform_is("sn2")) - sn_flush_all_caches((unsigned long)tmp, IA64_GRANULE_SIZE); + sn_flush_all_caches(uc_addr, IA64_GRANULE_SIZE); else - flush_icache_range((unsigned long)tmp, - (unsigned long)tmp+IA64_GRANULE_SIZE); + flush_icache_range(uc_addr, uc_addr + IA64_GRANULE_SIZE); - ia64_pal_mc_drain(); - status = smp_call_function(uncached_ipi_mc_drain, NULL, 0, 1); + /* flush the just introduced uncached translation from the TLB */ + local_flush_tlb_all(); + + preempt_enable(); + + status = ia64_pal_mc_drain(); + if (status != PAL_STATUS_SUCCESS) + goto failed; + atomic_set(&uc_pool->status, 0); + status = smp_call_function(uncached_ipi_mc_drain, uc_pool, 1); + if (status || atomic_read(&uc_pool->status)) + goto failed; + + /* + * The chunk of memory pages has been converted to uncached so now we + * can add it to the pool. + */ + status = gen_pool_add(uc_pool->pool, uc_addr, IA64_GRANULE_SIZE, nid); if (status) - printk(KERN_WARNING "smp_call_function failed for " - "uncached_ipi_mc_drain! (%i)\n", status); + goto failed; + + uc_pool->nchunks_added++; + mutex_unlock(&uc_pool->add_chunk_mutex); + return 0; - addr = (unsigned long)tmp - PAGE_OFFSET + __IA64_UNCACHED_OFFSET; + /* failed to convert or add the chunk so give it back to the kernel */ +failed: + for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++) + ClearPageUncached(&page[i]); - allocated_granules++; - return addr; + free_pages(c_addr, IA64_GRANULE_SHIFT-PAGE_SHIFT); + mutex_unlock(&uc_pool->add_chunk_mutex); + return -1; } /* * uncached_alloc_page * - * Allocate 1 uncached page. Allocates on the requested node. If no - * uncached pages are available on the requested node, roundrobin starting - * with higher nodes. + * @starting_nid: node id of node to start with, or -1 + * @n_pages: number of contiguous pages to allocate + * + * Allocate the specified number of contiguous uncached pages on the + * the requested node. If not enough contiguous uncached pages are available + * on the requested node, roundrobin starting with the next higher node. */ -unsigned long -uncached_alloc_page(int nid) +unsigned long uncached_alloc_page(int starting_nid, int n_pages) { - unsigned long maddr; + unsigned long uc_addr; + struct uncached_pool *uc_pool; + int nid; - maddr = gen_pool_alloc(uncached_pool[nid], PAGE_SIZE); + if (unlikely(starting_nid >= MAX_NUMNODES)) + return 0; - dprintk(KERN_DEBUG "uncached_alloc_page returns %lx on node %i\n", - maddr, nid); + if (starting_nid < 0) + starting_nid = numa_node_id(); + nid = starting_nid; - /* - * If no memory is availble on our local node, try the - * remaining nodes in the system. - */ - if (!maddr) { - int i; - - for (i = MAX_NUMNODES - 1; i >= 0; i--) { - if (i == nid || !node_online(i)) - continue; - maddr = gen_pool_alloc(uncached_pool[i], PAGE_SIZE); - dprintk(KERN_DEBUG "uncached_alloc_page alternate search " - "returns %lx on node %i\n", maddr, i); - if (maddr) { - break; - } - } - } + do { + if (!node_state(nid, N_HIGH_MEMORY)) + continue; + uc_pool = &uncached_pools[nid]; + if (uc_pool->pool == NULL) + continue; + do { + uc_addr = gen_pool_alloc(uc_pool->pool, + n_pages * PAGE_SIZE); + if (uc_addr != 0) + return uc_addr; + } while (uncached_add_chunk(uc_pool, nid) == 0); - return maddr; + } while ((nid = (nid + 1) % MAX_NUMNODES) != starting_nid); + + return 0; } EXPORT_SYMBOL(uncached_alloc_page); @@ -177,21 +218,23 @@ EXPORT_SYMBOL(uncached_alloc_page); /* * uncached_free_page * - * Free a single uncached page. + * @uc_addr: uncached address of first page to free + * @n_pages: number of contiguous pages to free + * + * Free the specified number of uncached pages. */ -void -uncached_free_page(unsigned long maddr) +void uncached_free_page(unsigned long uc_addr, int n_pages) { - int node; + int nid = paddr_to_nid(uc_addr - __IA64_UNCACHED_OFFSET); + struct gen_pool *pool = uncached_pools[nid].pool; - node = paddr_to_nid(maddr - __IA64_UNCACHED_OFFSET); + if (unlikely(pool == NULL)) + return; - dprintk(KERN_DEBUG "uncached_free_page(%lx) on node %i\n", maddr, node); + if ((uc_addr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET) + panic("uncached_free_page invalid address %lx\n", uc_addr); - if ((maddr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET) - panic("uncached_free_page invalid address %lx\n", maddr); - - gen_pool_free(uncached_pool[node], maddr, PAGE_SIZE); + gen_pool_free(pool, uc_addr, n_pages * PAGE_SIZE); } EXPORT_SYMBOL(uncached_free_page); @@ -199,42 +242,39 @@ EXPORT_SYMBOL(uncached_free_page); /* * uncached_build_memmap, * + * @uc_start: uncached starting address of a chunk of uncached memory + * @uc_end: uncached ending address of a chunk of uncached memory + * @arg: ignored, (NULL argument passed in on call to efi_memmap_walk_uc()) + * * Called at boot time to build a map of pages that can be used for * memory special operations. */ -static int __init -uncached_build_memmap(unsigned long start, unsigned long end, void *arg) +static int __init uncached_build_memmap(u64 uc_start, u64 uc_end, void *arg) { - long length = end - start; - int node; - - dprintk(KERN_ERR "uncached_build_memmap(%lx %lx)\n", start, end); + int nid = paddr_to_nid(uc_start - __IA64_UNCACHED_OFFSET); + struct gen_pool *pool = uncached_pools[nid].pool; + size_t size = uc_end - uc_start; - memset((char *)start, 0, length); + touch_softlockup_watchdog(); - node = paddr_to_nid(start - __IA64_UNCACHED_OFFSET); - - for (; start < end ; start += PAGE_SIZE) { - dprintk(KERN_INFO "sticking %lx into the pool!\n", start); - gen_pool_free(uncached_pool[node], start, PAGE_SIZE); + if (pool != NULL) { + memset((char *)uc_start, 0, size); + (void) gen_pool_add(pool, uc_start, size, nid); } - return 0; } -static int __init uncached_init(void) { - int i; +static int __init uncached_init(void) +{ + int nid; - for (i = 0; i < MAX_NUMNODES; i++) { - if (!node_online(i)) - continue; - uncached_pool[i] = gen_pool_create(0, IA64_GRANULE_SHIFT, - &uncached_get_new_chunk, i); + for_each_node_state(nid, N_ONLINE) { + uncached_pools[nid].pool = gen_pool_create(PAGE_SHIFT, nid); + mutex_init(&uncached_pools[nid].add_chunk_mutex); } - efi_memmap_walk_uc(uncached_build_memmap); - + efi_memmap_walk_uc(uncached_build_memmap, NULL); return 0; } diff --git a/arch/ia64/kernel/unwind.c b/arch/ia64/kernel/unwind.c index 93d5a3b41f6..8f66195999e 100644 --- a/arch/ia64/kernel/unwind.c +++ b/arch/ia64/kernel/unwind.c @@ -2,7 +2,7 @@ * Copyright (C) 1999-2004 Hewlett-Packard Co * David Mosberger-Tang <davidm@hpl.hp.com> * Copyright (C) 2003 Fenghua Yu <fenghua.yu@intel.com> - * - Change pt_regs_off() to make it less dependant on pt_regs structure. + * - Change pt_regs_off() to make it less dependent on pt_regs structure. */ /* * This file implements call frame unwind support for the Linux @@ -41,7 +41,6 @@ #include <asm/ptrace_offsets.h> #include <asm/rse.h> #include <asm/sections.h> -#include <asm/system.h> #include <asm/uaccess.h> #include "entry.h" @@ -60,6 +59,7 @@ # define UNW_DEBUG_ON(n) unw_debug_level >= n /* Do not code a printk level, not all debug lines end in newline */ # define UNW_DPRINT(n, ...) if (UNW_DEBUG_ON(n)) printk(__VA_ARGS__) +# undef inline # define inline #else /* !UNW_DEBUG */ # define UNW_DEBUG_ON(n) 0 @@ -145,7 +145,7 @@ static struct { # endif } unw = { .tables = &unw.kernel_table, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(unw.lock), .save_order = { UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR, UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR @@ -256,7 +256,7 @@ pt_regs_off (unsigned long reg) off = unw.pt_regs_offsets[reg]; if (off < 0) { - UNW_DPRINT(0, "unwind.%s: bad scratch reg r%lu\n", __FUNCTION__, reg); + UNW_DPRINT(0, "unwind.%s: bad scratch reg r%lu\n", __func__, reg); off = 0; } return (unsigned long) off; @@ -267,13 +267,13 @@ get_scratch_regs (struct unw_frame_info *info) { if (!info->pt) { /* This should not happen with valid unwind info. */ - UNW_DPRINT(0, "unwind.%s: bad unwind info: resetting info->pt\n", __FUNCTION__); + UNW_DPRINT(0, "unwind.%s: bad unwind info: resetting info->pt\n", __func__); if (info->flags & UNW_FLAG_INTERRUPT_FRAME) info->pt = (unsigned long) ((struct pt_regs *) info->psp - 1); else info->pt = info->sp - 16; } - UNW_DPRINT(3, "unwind.%s: sp 0x%lx pt 0x%lx\n", __FUNCTION__, info->sp, info->pt); + UNW_DPRINT(3, "unwind.%s: sp 0x%lx pt 0x%lx\n", __func__, info->sp, info->pt); return (struct pt_regs *) info->pt; } @@ -293,7 +293,7 @@ unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char return 0; } UNW_DPRINT(0, "unwind.%s: trying to access non-existent r%u\n", - __FUNCTION__, regnum); + __func__, regnum); return -1; } @@ -340,7 +340,7 @@ unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char { UNW_DPRINT(0, "unwind.%s: %p outside of regstk " "[0x%lx-0x%lx)\n", - __FUNCTION__, (void *) addr, + __func__, (void *) addr, info->regstk.limit, info->regstk.top); return -1; @@ -373,7 +373,7 @@ unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char || (unsigned long) addr >= info->regstk.top) { UNW_DPRINT(0, "unwind.%s: ignoring attempt to access register outside " - "of rbs\n", __FUNCTION__); + "of rbs\n", __func__); return -1; } if ((unsigned long) nat_addr >= info->regstk.top) @@ -384,7 +384,7 @@ unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char if (write) { if (read_only(addr)) { UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", - __FUNCTION__); + __func__); } else { *addr = *val; if (*nat) @@ -426,13 +426,13 @@ unw_access_br (struct unw_frame_info *info, int regnum, unsigned long *val, int default: UNW_DPRINT(0, "unwind.%s: trying to access non-existent b%u\n", - __FUNCTION__, regnum); + __func__, regnum); return -1; } if (write) if (read_only(addr)) { UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", - __FUNCTION__); + __func__); } else *addr = *val; else @@ -449,7 +449,7 @@ unw_access_fr (struct unw_frame_info *info, int regnum, struct ia64_fpreg *val, if ((unsigned) (regnum - 2) >= 126) { UNW_DPRINT(0, "unwind.%s: trying to access non-existent f%u\n", - __FUNCTION__, regnum); + __func__, regnum); return -1; } @@ -481,7 +481,7 @@ unw_access_fr (struct unw_frame_info *info, int regnum, struct ia64_fpreg *val, if (write) if (read_only(addr)) { UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", - __FUNCTION__); + __func__); } else *addr = *val; else @@ -571,14 +571,14 @@ unw_access_ar (struct unw_frame_info *info, int regnum, unsigned long *val, int default: UNW_DPRINT(0, "unwind.%s: trying to access non-existent ar%u\n", - __FUNCTION__, regnum); + __func__, regnum); return -1; } if (write) { if (read_only(addr)) { UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", - __FUNCTION__); + __func__); } else *addr = *val; } else @@ -599,7 +599,7 @@ unw_access_pr (struct unw_frame_info *info, unsigned long *val, int write) if (write) { if (read_only(addr)) { UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", - __FUNCTION__); + __func__); } else *addr = *val; } else @@ -698,7 +698,7 @@ decode_abreg (unsigned char abreg, int memory) default: break; } - UNW_DPRINT(0, "unwind.%s: bad abreg=0x%x\n", __FUNCTION__, abreg); + UNW_DPRINT(0, "unwind.%s: bad abreg=0x%x\n", __func__, abreg); return UNW_REG_LC; } @@ -738,7 +738,7 @@ spill_next_when (struct unw_reg_info **regp, struct unw_reg_info *lim, unw_word return; } } - UNW_DPRINT(0, "unwind.%s: excess spill!\n", __FUNCTION__); + UNW_DPRINT(0, "unwind.%s: excess spill!\n", __func__); } static inline void @@ -854,11 +854,11 @@ desc_abi (unsigned char abi, unsigned char context, struct unw_state_record *sr) { if (abi == 3 && context == 'i') { sr->flags |= UNW_FLAG_INTERRUPT_FRAME; - UNW_DPRINT(3, "unwind.%s: interrupt frame\n", __FUNCTION__); + UNW_DPRINT(3, "unwind.%s: interrupt frame\n", __func__); } else UNW_DPRINT(0, "unwind%s: ignoring unwabi(abi=0x%x,context=0x%x)\n", - __FUNCTION__, abi, context); + __func__, abi, context); } static inline void @@ -1203,10 +1203,10 @@ desc_spill_sprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word static inline unw_hash_index_t hash (unsigned long ip) { -# define hashmagic 0x9e3779b97f4a7c16UL /* based on (sqrt(5)/2-1)*2^64 */ + /* magic number = ((sqrt(5)-1)/2)*2^64 */ + static const unsigned long hashmagic = 0x9e3779b97f4a7c16UL; - return (ip >> 4)*hashmagic >> (64 - UNW_LOG_HASH_SIZE); -#undef hashmagic + return (ip >> 4) * hashmagic >> (64 - UNW_LOG_HASH_SIZE); } static inline long @@ -1346,7 +1346,7 @@ script_emit (struct unw_script *script, struct unw_insn insn) { if (script->count >= UNW_MAX_SCRIPT_LEN) { UNW_DPRINT(0, "unwind.%s: script exceeds maximum size of %u instructions!\n", - __FUNCTION__, UNW_MAX_SCRIPT_LEN); + __func__, UNW_MAX_SCRIPT_LEN); return; } script->insn[script->count++] = insn; @@ -1388,7 +1388,7 @@ emit_nat_info (struct unw_state_record *sr, int i, struct unw_script *script) default: UNW_DPRINT(0, "unwind.%s: don't know how to emit nat info for where = %u\n", - __FUNCTION__, r->where); + __func__, r->where); return; } insn.opc = opc; @@ -1445,7 +1445,7 @@ compile_reg (struct unw_state_record *sr, int i, struct unw_script *script) val = offsetof(struct pt_regs, f6) + 16*(rval - 6); else UNW_DPRINT(0, "unwind.%s: kernel may not touch f%lu\n", - __FUNCTION__, rval); + __func__, rval); } break; @@ -1473,7 +1473,7 @@ compile_reg (struct unw_state_record *sr, int i, struct unw_script *script) default: UNW_DPRINT(0, "unwind%s: register %u has unexpected `where' value of %u\n", - __FUNCTION__, i, r->where); + __func__, i, r->where); break; } insn.opc = opc; @@ -1530,7 +1530,7 @@ build_script (struct unw_frame_info *info) struct unw_labeled_state *ls, *next; unsigned long ip = info->ip; struct unw_state_record sr; - struct unw_table *table; + struct unw_table *table, *prev; struct unw_reg_info *r; struct unw_insn insn; u8 *dp, *desc_end; @@ -1546,10 +1546,10 @@ build_script (struct unw_frame_info *info) r->when = UNW_WHEN_NEVER; sr.pr_val = info->pr; - UNW_DPRINT(3, "unwind.%s: ip 0x%lx\n", __FUNCTION__, ip); + UNW_DPRINT(3, "unwind.%s: ip 0x%lx\n", __func__, ip); script = script_new(ip); if (!script) { - UNW_DPRINT(0, "unwind.%s: failed to create unwind script\n", __FUNCTION__); + UNW_DPRINT(0, "unwind.%s: failed to create unwind script\n", __func__); STAT(unw.stat.script.build_time += ia64_get_itc() - start); return NULL; } @@ -1559,16 +1559,31 @@ build_script (struct unw_frame_info *info) STAT(parse_start = ia64_get_itc()); + prev = NULL; for (table = unw.tables; table; table = table->next) { if (ip >= table->start && ip < table->end) { + /* + * Leave the kernel unwind table at the very front, + * lest moving it breaks some assumption elsewhere. + * Otherwise, move the matching table to the second + * position in the list so that traversals can benefit + * from commonality in backtrace paths. + */ + if (prev && prev != unw.tables) { + /* unw is safe - we're already spinlocked */ + prev->next = table->next; + table->next = unw.tables->next; + unw.tables->next = table; + } e = lookup(table, ip - table->segment_base); break; } + prev = table; } if (!e) { /* no info, return default unwinder (leaf proc, no mem stack, no saved regs) */ UNW_DPRINT(1, "unwind.%s: no unwind info for ip=0x%lx (prev ip=0x%lx)\n", - __FUNCTION__, ip, unw.cache[info->prev_script].ip); + __func__, ip, unw.cache[info->prev_script].ip); sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR; sr.curr.reg[UNW_REG_RP].when = -1; sr.curr.reg[UNW_REG_RP].val = 0; @@ -1617,13 +1632,13 @@ build_script (struct unw_frame_info *info) sr.curr.reg[UNW_REG_RP].when = -1; sr.curr.reg[UNW_REG_RP].val = sr.return_link_reg; UNW_DPRINT(1, "unwind.%s: using default for rp at ip=0x%lx where=%d val=0x%lx\n", - __FUNCTION__, ip, sr.curr.reg[UNW_REG_RP].where, + __func__, ip, sr.curr.reg[UNW_REG_RP].where, sr.curr.reg[UNW_REG_RP].val); } #ifdef UNW_DEBUG UNW_DPRINT(1, "unwind.%s: state record for func 0x%lx, t=%u:\n", - __FUNCTION__, table->segment_base + e->start_offset, sr.when_target); + __func__, table->segment_base + e->start_offset, sr.when_target); for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r) { if (r->where != UNW_WHERE_NONE || r->when != UNW_WHEN_NEVER) { UNW_DPRINT(1, " %s <- ", unw.preg_name[r - sr.curr.reg]); @@ -1745,7 +1760,7 @@ run_script (struct unw_script *script, struct unw_frame_info *state) } else { s[dst] = 0; UNW_DPRINT(0, "unwind.%s: no state->pt, dst=%ld, val=%ld\n", - __FUNCTION__, dst, val); + __func__, dst, val); } break; @@ -1755,7 +1770,7 @@ run_script (struct unw_script *script, struct unw_frame_info *state) else { s[dst] = 0; UNW_DPRINT(0, "unwind.%s: UNW_INSN_MOVE_CONST bad val=%ld\n", - __FUNCTION__, val); + __func__, val); } break; @@ -1790,7 +1805,7 @@ run_script (struct unw_script *script, struct unw_frame_info *state) || s[val] < TASK_SIZE) { UNW_DPRINT(0, "unwind.%s: rejecting bad psp=0x%lx\n", - __FUNCTION__, s[val]); + __func__, s[val]); break; } #endif @@ -1824,7 +1839,7 @@ find_save_locs (struct unw_frame_info *info) if ((info->ip & (local_cpu_data->unimpl_va_mask | 0xf)) || info->ip < TASK_SIZE) { /* don't let obviously bad addresses pollute the cache */ /* FIXME: should really be level 0 but it occurs too often. KAO */ - UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __FUNCTION__, info->ip); + UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __func__, info->ip); info->rp_loc = NULL; return -1; } @@ -1837,7 +1852,7 @@ find_save_locs (struct unw_frame_info *info) spin_unlock_irqrestore(&unw.lock, flags); UNW_DPRINT(0, "unwind.%s: failed to locate/build unwind script for ip %lx\n", - __FUNCTION__, info->ip); + __func__, info->ip); return -1; } have_write_lock = 1; @@ -1855,6 +1870,14 @@ find_save_locs (struct unw_frame_info *info) return 0; } +static int +unw_valid(const struct unw_frame_info *info, unsigned long* p) +{ + unsigned long loc = (unsigned long)p; + return (loc >= info->regstk.limit && loc < info->regstk.top) || + (loc >= info->memstk.top && loc < info->memstk.limit); +} + int unw_unwind (struct unw_frame_info *info) { @@ -1869,27 +1892,29 @@ unw_unwind (struct unw_frame_info *info) prev_sp = info->sp; prev_bsp = info->bsp; - /* restore the ip */ - if (!info->rp_loc) { + /* validate the return IP pointer */ + if (!unw_valid(info, info->rp_loc)) { /* FIXME: should really be level 0 but it occurs too often. KAO */ UNW_DPRINT(1, "unwind.%s: failed to locate return link (ip=0x%lx)!\n", - __FUNCTION__, info->ip); + __func__, info->ip); STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); return -1; } + /* restore the ip */ ip = info->ip = *info->rp_loc; if (ip < GATE_ADDR) { - UNW_DPRINT(2, "unwind.%s: reached user-space (ip=0x%lx)\n", __FUNCTION__, ip); + UNW_DPRINT(2, "unwind.%s: reached user-space (ip=0x%lx)\n", __func__, ip); STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); return -1; } - /* restore the cfm: */ - if (!info->pfs_loc) { - UNW_DPRINT(0, "unwind.%s: failed to locate ar.pfs!\n", __FUNCTION__); + /* validate the previous stack frame pointer */ + if (!unw_valid(info, info->pfs_loc)) { + UNW_DPRINT(0, "unwind.%s: failed to locate ar.pfs!\n", __func__); STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); return -1; } + /* restore the cfm: */ info->cfm_loc = info->pfs_loc; /* restore the bsp: */ @@ -1901,13 +1926,13 @@ unw_unwind (struct unw_frame_info *info) num_regs = *info->cfm_loc & 0x7f; /* size of frame */ info->pfs_loc = (unsigned long *) (info->pt + offsetof(struct pt_regs, ar_pfs)); - UNW_DPRINT(3, "unwind.%s: interrupt_frame pt 0x%lx\n", __FUNCTION__, info->pt); + UNW_DPRINT(3, "unwind.%s: interrupt_frame pt 0x%lx\n", __func__, info->pt); } else num_regs = (*info->cfm_loc >> 7) & 0x7f; /* size of locals */ info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->bsp, -num_regs); if (info->bsp < info->regstk.limit || info->bsp > info->regstk.top) { UNW_DPRINT(0, "unwind.%s: bsp (0x%lx) out of range [0x%lx-0x%lx]\n", - __FUNCTION__, info->bsp, info->regstk.limit, info->regstk.top); + __func__, info->bsp, info->regstk.limit, info->regstk.top); STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); return -1; } @@ -1916,14 +1941,14 @@ unw_unwind (struct unw_frame_info *info) info->sp = info->psp; if (info->sp < info->memstk.top || info->sp > info->memstk.limit) { UNW_DPRINT(0, "unwind.%s: sp (0x%lx) out of range [0x%lx-0x%lx]\n", - __FUNCTION__, info->sp, info->memstk.top, info->memstk.limit); + __func__, info->sp, info->memstk.top, info->memstk.limit); STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); return -1; } if (info->ip == prev_ip && info->sp == prev_sp && info->bsp == prev_bsp) { UNW_DPRINT(0, "unwind.%s: ip, sp, bsp unchanged; stopping here (ip=0x%lx)\n", - __FUNCTION__, ip); + __func__, ip); STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); return -1; } @@ -1943,14 +1968,14 @@ EXPORT_SYMBOL(unw_unwind); int unw_unwind_to_user (struct unw_frame_info *info) { - unsigned long ip, sp, pr = 0; + unsigned long ip, sp, pr = info->pr; - while (unw_unwind(info) >= 0) { + do { unw_get_sp(info, &sp); if ((long)((unsigned long)info->task + IA64_STK_OFFSET - sp) < IA64_PT_REGS_SIZE) { UNW_DPRINT(0, "unwind.%s: ran off the top of the kernel stack\n", - __FUNCTION__); + __func__); break; } if (unw_is_intr_frame(info) && @@ -1960,13 +1985,13 @@ unw_unwind_to_user (struct unw_frame_info *info) unw_get_rp(info, &ip); UNW_DPRINT(0, "unwind.%s: failed to read " "predicate register (ip=0x%lx)\n", - __FUNCTION__, ip); + __func__, ip); return -1; } - } + } while (unw_unwind(info) >= 0); unw_get_ip(info, &ip); UNW_DPRINT(0, "unwind.%s: failed to unwind to user-level (ip=0x%lx)\n", - __FUNCTION__, ip); + __func__, ip); return -1; } EXPORT_SYMBOL(unw_unwind_to_user); @@ -1991,13 +2016,16 @@ init_frame_info (struct unw_frame_info *info, struct task_struct *t, memset(info, 0, sizeof(*info)); rbslimit = (unsigned long) t + IA64_RBS_OFFSET; + stklimit = (unsigned long) t + IA64_STK_OFFSET; + rbstop = sw->ar_bspstore; - if (rbstop - (unsigned long) t >= IA64_STK_OFFSET) + if (rbstop > stklimit || rbstop < rbslimit) rbstop = rbslimit; - stklimit = (unsigned long) t + IA64_STK_OFFSET; if (stktop <= rbstop) stktop = rbstop; + if (stktop > stklimit) + stktop = stklimit; info->regstk.limit = rbslimit; info->regstk.top = rbstop; @@ -2014,7 +2042,7 @@ init_frame_info (struct unw_frame_info *info, struct task_struct *t, " pr 0x%lx\n" " sw 0x%lx\n" " sp 0x%lx\n", - __FUNCTION__, (unsigned long) t, rbslimit, rbstop, stktop, stklimit, + __func__, (unsigned long) t, rbslimit, rbstop, stktop, stklimit, info->pr, (unsigned long) info->sw, info->sp); STAT(unw.stat.api.init_time += ia64_get_itc() - start; local_irq_restore(flags)); } @@ -2033,7 +2061,7 @@ unw_init_frame_info (struct unw_frame_info *info, struct task_struct *t, struct " bsp 0x%lx\n" " sol 0x%lx\n" " ip 0x%lx\n", - __FUNCTION__, info->bsp, sol, info->ip); + __func__, info->bsp, sol, info->ip); find_save_locs(info); } @@ -2044,7 +2072,7 @@ unw_init_from_blocked_task (struct unw_frame_info *info, struct task_struct *t) { struct switch_stack *sw = (struct switch_stack *) (t->thread.ksp + 16); - UNW_DPRINT(1, "unwind.%s\n", __FUNCTION__); + UNW_DPRINT(1, "unwind.%s\n", __func__); unw_init_frame_info(info, t, sw); } EXPORT_SYMBOL(unw_init_from_blocked_task); @@ -2074,7 +2102,7 @@ unw_add_unwind_table (const char *name, unsigned long segment_base, unsigned lon if (end - start <= 0) { UNW_DPRINT(0, "unwind.%s: ignoring attempt to insert empty unwind table\n", - __FUNCTION__); + __func__); return NULL; } @@ -2105,14 +2133,14 @@ unw_remove_unwind_table (void *handle) if (!handle) { UNW_DPRINT(0, "unwind.%s: ignoring attempt to remove non-existent unwind table\n", - __FUNCTION__); + __func__); return; } table = handle; if (table == &unw.kernel_table) { UNW_DPRINT(0, "unwind.%s: sorry, freeing the kernel's unwind table is a " - "no-can-do!\n", __FUNCTION__); + "no-can-do!\n", __func__); return; } @@ -2125,7 +2153,7 @@ unw_remove_unwind_table (void *handle) break; if (!prev) { UNW_DPRINT(0, "unwind.%s: failed to find unwind table %p\n", - __FUNCTION__, (void *) table); + __func__, (void *) table); spin_unlock_irqrestore(&unw.lock, flags); return; } @@ -2135,7 +2163,7 @@ unw_remove_unwind_table (void *handle) /* next, remove hash table entries for this table */ - for (index = 0; index <= UNW_HASH_SIZE; ++index) { + for (index = 0; index < UNW_HASH_SIZE; ++index) { tmp = unw.cache + unw.hash[index]; if (unw.hash[index] >= UNW_CACHE_SIZE || tmp->ip < table->start || tmp->ip >= table->end) @@ -2171,7 +2199,7 @@ create_gate_table (void) } if (!punw) { - printk("%s: failed to find gate DSO's unwind table!\n", __FUNCTION__); + printk("%s: failed to find gate DSO's unwind table!\n", __func__); return 0; } @@ -2188,7 +2216,7 @@ create_gate_table (void) unw.gate_table = kmalloc(size, GFP_KERNEL); if (!unw.gate_table) { unw.gate_table_size = 0; - printk(KERN_ERR "%s: unable to create unwind data for gate page!\n", __FUNCTION__); + printk(KERN_ERR "%s: unable to create unwind data for gate page!\n", __func__); return 0; } unw.gate_table_size = size; diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 30d8564e960..84f8a52ac5a 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -1,257 +1,248 @@ -#include <linux/config.h> #include <asm/cache.h> #include <asm/ptrace.h> -#include <asm/system.h> #include <asm/pgtable.h> -#define LOAD_OFFSET (KERNEL_START - KERNEL_TR_PAGE_SIZE) #include <asm-generic/vmlinux.lds.h> -#define IVT_TEXT \ - VMLINUX_SYMBOL(__start_ivt_text) = .; \ - *(.text.ivt) \ - VMLINUX_SYMBOL(__end_ivt_text) = .; - OUTPUT_FORMAT("elf64-ia64-little") OUTPUT_ARCH(ia64) ENTRY(phys_start) jiffies = jiffies_64; + PHDRS { - code PT_LOAD; - percpu PT_LOAD; - data PT_LOAD; + code PT_LOAD; + percpu PT_LOAD; + data PT_LOAD; + note PT_NOTE; + unwind 0x70000001; /* PT_IA_64_UNWIND, but ld doesn't match the name */ } -SECTIONS -{ - /* Sections to be discarded */ - /DISCARD/ : { - *(.exit.text) - *(.exit.data) - *(.exitcall.exit) - *(.IA_64.unwind.exit.text) - *(.IA_64.unwind_info.exit.text) - } - v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */ - phys_start = _start - LOAD_OFFSET; - - code : { } :code - . = KERNEL_START; - - _text = .; - _stext = .; - - .text : AT(ADDR(.text) - LOAD_OFFSET) - { - IVT_TEXT - *(.text) - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - *(.gnu.linkonce.t*) - } - .text2 : AT(ADDR(.text2) - LOAD_OFFSET) - { *(.text2) } -#ifdef CONFIG_SMP - .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) - { *(.text.lock) } -#endif - _etext = .; +SECTIONS { + /* + * unwind exit sections must be discarded before + * the rest of the sections get included. + */ + /DISCARD/ : { + *(.IA_64.unwind.exit.text) + *(.IA_64.unwind_info.exit.text) + *(.comment) + *(.note) + } - /* Read-only data */ + v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */ + phys_start = _start - LOAD_OFFSET; + + code : { + } :code + . = KERNEL_START; + + _text = .; + _stext = .; + + .text : AT(ADDR(.text) - LOAD_OFFSET) { + __start_ivt_text = .; + *(.text..ivt) + __end_ivt_text = .; + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + *(.gnu.linkonce.t*) + } - /* Exception table */ - . = ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) - { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; + .text2 : AT(ADDR(.text2) - LOAD_OFFSET) { + *(.text2) } - .data.patch.vtop : AT(ADDR(.data.patch.vtop) - LOAD_OFFSET) - { - __start___vtop_patchlist = .; - *(.data.patch.vtop) - __end___vtop_patchlist = .; +#ifdef CONFIG_SMP + .text..lock : AT(ADDR(.text..lock) - LOAD_OFFSET) { + *(.text..lock) + } +#endif + _etext = .; + + /* + * Read-only data + */ + NOTES :code :note /* put .notes in text and mark in PT_NOTE */ + code_continues : { + } : code /* switch back to regular program... */ + + EXCEPTION_TABLE(16) + + /* MCA table */ + . = ALIGN(16); + __mca_table : AT(ADDR(__mca_table) - LOAD_OFFSET) { + __start___mca_table = .; + *(__mca_table) + __stop___mca_table = .; } - .data.patch.mckinley_e9 : AT(ADDR(.data.patch.mckinley_e9) - LOAD_OFFSET) - { - __start___mckinley_e9_bundles = .; - *(.data.patch.mckinley_e9) - __end___mckinley_e9_bundles = .; + .data..patch.phys_stack_reg : AT(ADDR(.data..patch.phys_stack_reg) - LOAD_OFFSET) { + __start___phys_stack_reg_patchlist = .; + *(.data..patch.phys_stack_reg) + __end___phys_stack_reg_patchlist = .; } - /* Global data */ - _data = .; + /* + * Global data + */ + _data = .; -#if defined(CONFIG_IA64_GENERIC) - /* Machine Vector */ - . = ALIGN(16); - .machvec : AT(ADDR(.machvec) - LOAD_OFFSET) - { - machvec_start = .; - *(.machvec) - machvec_end = .; + /* Unwind info & table: */ + . = ALIGN(8); + .IA_64.unwind_info : AT(ADDR(.IA_64.unwind_info) - LOAD_OFFSET) { + *(.IA_64.unwind_info*) } -#endif - - /* Unwind info & table: */ - . = ALIGN(8); - .IA_64.unwind_info : AT(ADDR(.IA_64.unwind_info) - LOAD_OFFSET) - { *(.IA_64.unwind_info*) } - .IA_64.unwind : AT(ADDR(.IA_64.unwind) - LOAD_OFFSET) - { - __start_unwind = .; - *(.IA_64.unwind*) - __end_unwind = .; + .IA_64.unwind : AT(ADDR(.IA_64.unwind) - LOAD_OFFSET) { + __start_unwind = .; + *(.IA_64.unwind*) + __end_unwind = .; + } :code :unwind + code_continues2 : { + } : code + + RODATA + + .opd : AT(ADDR(.opd) - LOAD_OFFSET) { + *(.opd) } - RODATA + /* + * Initialization code and data: + */ + . = ALIGN(PAGE_SIZE); + __init_begin = .; - .opd : AT(ADDR(.opd) - LOAD_OFFSET) - { *(.opd) } + INIT_TEXT_SECTION(PAGE_SIZE) + INIT_DATA_SECTION(16) - /* Initialization code and data: */ + .data..patch.vtop : AT(ADDR(.data..patch.vtop) - LOAD_OFFSET) { + __start___vtop_patchlist = .; + *(.data..patch.vtop) + __end___vtop_patchlist = .; + } - . = ALIGN(PAGE_SIZE); - __init_begin = .; - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) - { - _sinittext = .; - *(.init.text) - _einittext = .; + .data..patch.rse : AT(ADDR(.data..patch.rse) - LOAD_OFFSET) { + __start___rse_patchlist = .; + *(.data..patch.rse) + __end___rse_patchlist = .; } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) - { *(.init.data) } + .data..patch.mckinley_e9 : AT(ADDR(.data..patch.mckinley_e9) - LOAD_OFFSET) { + __start___mckinley_e9_bundles = .; + *(.data..patch.mckinley_e9) + __end___mckinley_e9_bundles = .; + } - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) - { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; +#if defined(CONFIG_PARAVIRT) + . = ALIGN(16); + .paravirt_bundles : AT(ADDR(.paravirt_bundles) - LOAD_OFFSET) { + __start_paravirt_bundles = .; + *(.paravirt_bundles) + __stop_paravirt_bundles = .; + } + . = ALIGN(16); + .paravirt_insts : AT(ADDR(.paravirt_insts) - LOAD_OFFSET) { + __start_paravirt_insts = .; + *(.paravirt_insts) + __stop_paravirt_insts = .; } + . = ALIGN(16); + .paravirt_branches : AT(ADDR(.paravirt_branches) - LOAD_OFFSET) { + __start_paravirt_branches = .; + *(.paravirt_branches) + __stop_paravirt_branches = .; + } +#endif - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) - { - __setup_start = .; - *(.init.setup) - __setup_end = .; +#if defined(CONFIG_IA64_GENERIC) + /* Machine Vector */ + . = ALIGN(16); + .machvec : AT(ADDR(.machvec) - LOAD_OFFSET) { + machvec_start = .; + *(.machvec) + machvec_end = .; } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) - { - __initcall_start = .; - *(.initcall1.init) - *(.initcall2.init) - *(.initcall3.init) - *(.initcall4.init) - *(.initcall5.init) - *(.initcall6.init) - *(.initcall7.init) - __initcall_end = .; +#endif + +#ifdef CONFIG_SMP + . = ALIGN(PERCPU_PAGE_SIZE); + __cpu0_per_cpu = .; + . = . + PERCPU_PAGE_SIZE; /* cpu0 per-cpu space */ +#endif + + . = ALIGN(PAGE_SIZE); + __init_end = .; + + .data..page_aligned : AT(ADDR(.data..page_aligned) - LOAD_OFFSET) { + PAGE_ALIGNED_DATA(PAGE_SIZE) + . = ALIGN(PAGE_SIZE); + __start_gate_section = .; + *(.data..gate) + __stop_gate_section = .; } - __con_initcall_start = .; - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) - { *(.con_initcall.init) } - __con_initcall_end = .; - __security_initcall_start = .; - .security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET) - { *(.security_initcall.init) } - __security_initcall_end = .; - . = ALIGN(PAGE_SIZE); - __init_end = .; - - /* The initial task and kernel stack */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) - { *(.data.init_task) } - - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) - { *(__special_page_section) - __start_gate_section = .; - *(.data.gate) - __stop_gate_section = .; + /* + * make sure the gate page doesn't expose + * kernel data + */ + . = ALIGN(PAGE_SIZE); + + /* Per-cpu data: */ + . = ALIGN(PERCPU_PAGE_SIZE); + PERCPU_VADDR(SMP_CACHE_BYTES, PERCPU_ADDR, :percpu) + __phys_per_cpu_start = __per_cpu_load; + /* + * ensure percpu data fits + * into percpu page size + */ + . = __phys_per_cpu_start + PERCPU_PAGE_SIZE; + + data : { + } :data + .data : AT(ADDR(.data) - LOAD_OFFSET) { + _sdata = .; + INIT_TASK_DATA(PAGE_SIZE) + CACHELINE_ALIGNED_DATA(SMP_CACHE_BYTES) + READ_MOSTLY_DATA(SMP_CACHE_BYTES) + DATA_DATA + *(.data1) + *(.gnu.linkonce.d*) + CONSTRUCTORS } - . = ALIGN(PAGE_SIZE); /* make sure the gate page doesn't expose kernel data */ - - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) - { *(.data.cacheline_aligned) } - - /* Per-cpu data: */ - percpu : { } :percpu - . = ALIGN(PERCPU_PAGE_SIZE); - __phys_per_cpu_start = .; - .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET) - { - __per_cpu_start = .; - *(.data.percpu) - __per_cpu_end = .; + + . = ALIGN(16); /* gp must be 16-byte aligned for exc. table */ + .got : AT(ADDR(.got) - LOAD_OFFSET) { + *(.got.plt) + *(.got) + } + __gp = ADDR(.got) + 0x200000; + + /* + * We want the small data sections together, + * so single-instruction offsets can access + * them all, and initialized data all before + * uninitialized, so we can shorten the + * on-disk segment size. + */ + .sdata : AT(ADDR(.sdata) - LOAD_OFFSET) { + *(.sdata) + *(.sdata1) + *(.srdata) } - . = __phys_per_cpu_start + PERCPU_PAGE_SIZE; /* ensure percpu data fits into percpu page size */ - - data : { } :data - .data : AT(ADDR(.data) - LOAD_OFFSET) - { *(.data) *(.data1) *(.gnu.linkonce.d*) CONSTRUCTORS } - - . = ALIGN(16); /* gp must be 16-byte aligned for exc. table */ - .got : AT(ADDR(.got) - LOAD_OFFSET) - { *(.got.plt) *(.got) } - __gp = ADDR(.got) + 0x200000; - /* We want the small data sections together, so single-instruction offsets - can access them all, and initialized data all before uninitialized, so - we can shorten the on-disk segment size. */ - .sdata : AT(ADDR(.sdata) - LOAD_OFFSET) - { *(.sdata) *(.sdata1) *(.srdata) } - _edata = .; - _bss = .; - .sbss : AT(ADDR(.sbss) - LOAD_OFFSET) - { *(.sbss) *(.scommon) } - .bss : AT(ADDR(.bss) - LOAD_OFFSET) - { *(.bss) *(COMMON) } - - _end = .; - - code : { } :code - /* Stabs debugging sections. */ - .stab 0 : { *(.stab) } - .stabstr 0 : { *(.stabstr) } - .stab.excl 0 : { *(.stab.excl) } - .stab.exclstr 0 : { *(.stab.exclstr) } - .stab.index 0 : { *(.stab.index) } - .stab.indexstr 0 : { *(.stab.indexstr) } - /* DWARF debug sections. - Symbols in the DWARF debugging sections are relative to the beginning - of the section so we begin them at 0. */ - /* DWARF 1 */ - .debug 0 : { *(.debug) } - .line 0 : { *(.line) } - /* GNU DWARF 1 extensions */ - .debug_srcinfo 0 : { *(.debug_srcinfo) } - .debug_sfnames 0 : { *(.debug_sfnames) } - /* DWARF 1.1 and DWARF 2 */ - .debug_aranges 0 : { *(.debug_aranges) } - .debug_pubnames 0 : { *(.debug_pubnames) } - /* DWARF 2 */ - .debug_info 0 : { *(.debug_info) } - .debug_abbrev 0 : { *(.debug_abbrev) } - .debug_line 0 : { *(.debug_line) } - .debug_frame 0 : { *(.debug_frame) } - .debug_str 0 : { *(.debug_str) } - .debug_loc 0 : { *(.debug_loc) } - .debug_macinfo 0 : { *(.debug_macinfo) } - /* SGI/MIPS DWARF 2 extensions */ - .debug_weaknames 0 : { *(.debug_weaknames) } - .debug_funcnames 0 : { *(.debug_funcnames) } - .debug_typenames 0 : { *(.debug_typenames) } - .debug_varnames 0 : { *(.debug_varnames) } - /* These must appear regardless of . */ - /* Discard them for now since Intel SoftSDV cannot handle them. - .comment 0 : { *(.comment) } - .note 0 : { *(.note) } - */ - /DISCARD/ : { *(.comment) } - /DISCARD/ : { *(.note) } + _edata = .; + + BSS_SECTION(0, 0, 0) + + _end = .; + + code : { + } :code + + STABS_DEBUG + DWARF_DEBUG + + /* Default discards */ + DISCARDS } |
