85 files changed, 14515 insertions, 7752 deletions
diff --git a/arch/ia64/kernel/.gitignore b/arch/ia64/kernel/.gitignore
new file mode 100644
index 00000000000..21cb0da5ded
--- /dev/null
+++ b/arch/ia64/kernel/.gitignore
@@ -0,0 +1,2 @@
+gate.lds
+vmlinux.lds
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index 307514f7a28..20678a9ed11 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -2,17 +2,20 @@
 # Makefile for the linux kernel.
 #
 
+ifdef CONFIG_DYNAMIC_FTRACE
+CFLAGS_REMOVE_ftrace.o = -pg
+endif
+
 extra-y	:= head.o init_task.o vmlinux.lds
 
-obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o	\
-	 irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o		\
-	 salinfo.o semaphore.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \
-	 unwind.o mca.o mca_asm.o topology.o
+obj-y := entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o	\
+	 irq_lsapic.o ivt.o machvec.o pal.o paravirt_patchlist.o patch.o process.o perfmon.o ptrace.o sal.o		\
+	 salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \
+	 unwind.o mca.o mca_asm.o topology.o dma-mapping.o
 
+obj-$(CONFIG_ACPI)		+= acpi.o acpi-ext.o
 obj-$(CONFIG_IA64_BRL_EMU)	+= brl_emu.o
-obj-$(CONFIG_IA64_GENERIC)	+= acpi-ext.o
-obj-$(CONFIG_IA64_HP_ZX1)	+= acpi-ext.o
-obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acpi-ext.o
+
 obj-$(CONFIG_IA64_PALINFO)	+= palinfo.o
 obj-$(CONFIG_IOSAPIC)		+= iosapic.o
 obj-$(CONFIG_MODULES)		+= module.o
@@ -20,37 +23,91 @@ obj-$(CONFIG_SMP)		+= smp.o smpboot.o
 obj-$(CONFIG_NUMA)		+= numa.o
 obj-$(CONFIG_PERFMON)		+= perfmon_default_smpl.o
 obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
-obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_IA64_MCA_RECOVERY)	+= mca_recovery.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o jprobes.o
+obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
+obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR)	+= uncached.o
+obj-$(CONFIG_AUDIT)		+= audit.o
+obj-$(CONFIG_PCI_MSI)		+= msi_ia64.o
 mca_recovery-y			+= mca_drv.o mca_drv_asm.o
+obj-$(CONFIG_IA64_MC_ERR_INJECT)+= err_inject.o
+obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 
-# The gate DSO image is built using a special linker script.
-targets += gate.so gate-syms.o
+obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirtentry.o \
+				   paravirt_patch.o
 
-extra-y += gate.so gate-syms.o gate.lds gate.o
+obj-$(CONFIG_IA64_ESI)		+= esi.o
+ifneq ($(CONFIG_IA64_ESI),)
+obj-y				+= esi_stub.o	# must be in kernel proper
+endif
+obj-$(CONFIG_INTEL_IOMMU)	+= pci-dma.o
+obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb.o
+
+obj-$(CONFIG_BINFMT_ELF)	+= elfcore.o
 
 # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
 CFLAGS_traps.o  += -mfixed-range=f2-f5,f16-f31
 
-CPPFLAGS_gate.lds := -P -C -U$(ARCH)
+# The gate DSO image is built using a special linker script.
+include $(srctree)/arch/ia64/kernel/Makefile.gate
+# tell compiled for native
+CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_NATIVE
+
+# Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config
+define sed-y
+	"/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}"
+endef
+quiet_cmd_nr_irqs = GEN     $@
+define cmd_nr_irqs
+	(set -e; \
+	 echo "#ifndef __ASM_NR_IRQS_H__"; \
+	 echo "#define __ASM_NR_IRQS_H__"; \
+	 echo "/*"; \
+	 echo " * DO NOT MODIFY."; \
+	 echo " *"; \
+	 echo " * This file was generated by Kbuild"; \
+	 echo " *"; \
+	 echo " */"; \
+	 echo ""; \
+	 sed -ne $(sed-y) $<; \
+	 echo ""; \
+	 echo "#endif" ) > $@
+endef
 
-quiet_cmd_gate = GATE $@
-      cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
+# We use internal kbuild rules to avoid the "is up to date" message from make
+arch/$(SRCARCH)/kernel/nr-irqs.s: arch/$(SRCARCH)/kernel/nr-irqs.c
+	$(Q)mkdir -p $(dir $@)
+	$(call if_changed_dep,cc_s_c)
 
-GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1
-$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
-	$(call if_changed,gate)
+include/generated/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s
+	$(Q)mkdir -p $(dir $@)
+	$(call cmd,nr_irqs)
 
-$(obj)/built-in.o: $(obj)/gate-syms.o
-$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o
+#
+# native ivt.S, entry.S and fsys.S
+#
+ASM_PARAVIRT_OBJS = ivt.o entry.o fsys.o
+define paravirtualized_native
+AFLAGS_$(1) += -D__IA64_ASM_PARAVIRTUALIZED_NATIVE
+AFLAGS_pvchk-sed-$(1) += -D__IA64_ASM_PARAVIRTUALIZED_PVCHECK
+extra-y += pvchk-$(1)
+endef
+$(foreach obj,$(ASM_PARAVIRT_OBJS),$(eval $(call paravirtualized_native,$(obj))))
 
-GATECFLAGS_gate-syms.o = -r
-$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE
-	$(call if_changed,gate)
+#
+# Checker for paravirtualizations of privileged operations.
+#
+quiet_cmd_pv_check_sed = PVCHK   $@
+define cmd_pv_check_sed
+	sed -f $(srctree)/arch/$(SRCARCH)/scripts/pvcheck.sed $< > $@
+endef
 
-# gate-data.o contains the gate DSO image as data in section .data.gate.
-# We must build gate.so before we can assemble it.
-# Note: kbuild does not track this dependency due to usage of .incbin
-$(obj)/gate-data.o: $(obj)/gate.so
+$(obj)/pvchk-sed-%.s: $(src)/%.S $(srctree)/arch/$(SRCARCH)/scripts/pvcheck.sed FORCE
+	$(call if_changed_dep,as_s_S)
+$(obj)/pvchk-%.s: $(obj)/pvchk-sed-%.s FORCE
+	$(call if_changed,pv_check_sed)
+$(obj)/pvchk-%.o: $(obj)/pvchk-%.s FORCE
+	$(call if_changed,as_o_S)
+.PRECIOUS: $(obj)/pvchk-sed-%.s $(obj)/pvchk-%.s $(obj)/pvchk-%.o
diff --git a/arch/ia64/kernel/Makefile.gate b/arch/ia64/kernel/Makefile.gate
new file mode 100644
index 00000000000..ceeffc50976
--- /dev/null
+++ b/arch/ia64/kernel/Makefile.gate
@@ -0,0 +1,27 @@
+# The gate DSO image is built using a special linker script.
+
+targets += gate.so gate-syms.o
+
+extra-y += gate.so gate-syms.o gate.lds gate.o
+
+CPPFLAGS_gate.lds := -P -C -U$(ARCH)
+
+quiet_cmd_gate = GATE $@
+      cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
+
+GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \
+		     $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
+	$(call if_changed,gate)
+
+$(obj)/built-in.o: $(obj)/gate-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o
+
+GATECFLAGS_gate-syms.o = -r
+$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE
+	$(call if_changed,gate)
+
+# gate-data.o contains the gate DSO image as data in section .data..gate.
+# We must build gate.so before we can assemble it.
+# Note: kbuild does not track this dependency due to usage of .incbin
+$(obj)/gate-data.o: $(obj)/gate.so
diff --git a/arch/ia64/kernel/acpi-ext.c b/arch/ia64/kernel/acpi-ext.c
index 13a5b3b49bf..8b9318d311a 100644
--- a/arch/ia64/kernel/acpi-ext.c
+++ b/arch/ia64/kernel/acpi-ext.c
@@ -1,105 +1,104 @@
 /*
- * arch/ia64/kernel/acpi-ext.c
+ * (c) Copyright 2003, 2006 Hewlett-Packard Development Company, L.P.
+ *	Alex Williamson <alex.williamson@hp.com>
+ *	Bjorn Helgaas <bjorn.helgaas@hp.com>
  *
- * Copyright (C) 2003 Hewlett-Packard
- * Copyright (C) Alex Williamson
- * Copyright (C) Bjorn Helgaas
- *
- * Vendor specific extensions to ACPI.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
+#include <linux/slab.h>
 #include <linux/acpi.h>
-#include <linux/efi.h>
 
 #include <asm/acpi-ext.h>
 
-struct acpi_vendor_descriptor {
-	u8 guid_id;
-	efi_guid_t guid;
-};
+/*
+ * Device CSRs that do not appear in PCI config space should be described
+ * via ACPI.  This would normally be done with Address Space Descriptors
+ * marked as "consumer-only," but old versions of Windows and Linux ignore
+ * the producer/consumer flag, so HP invented a vendor-defined resource to
+ * describe the location and size of CSR space.
+ */
 
-struct acpi_vendor_info {
-	struct acpi_vendor_descriptor *descriptor;
-	u8 *data;
-	u32 length;
+struct acpi_vendor_uuid hp_ccsr_uuid = {
+	.subtype = 2,
+	.data = { 0xf9, 0xad, 0xe9, 0x69, 0x4f, 0x92, 0x5f, 0xab, 0xf6, 0x4a,
+	    0x24, 0xd2, 0x01, 0x37, 0x0e, 0xad },
 };
 
-acpi_status
-acpi_vendor_resource_match(struct acpi_resource *resource, void *context)
+static acpi_status hp_ccsr_locate(acpi_handle obj, u64 *base, u64 *length)
 {
-	struct acpi_vendor_info *info = (struct acpi_vendor_info *)context;
-	struct acpi_resource_vendor *vendor;
-	struct acpi_vendor_descriptor *descriptor;
-	u32 length;
-
-	if (resource->id != ACPI_RSTYPE_VENDOR)
-		return AE_OK;
-
-	vendor = (struct acpi_resource_vendor *)&resource->data;
-	descriptor = (struct acpi_vendor_descriptor *)vendor->reserved;
-	if (vendor->length <= sizeof(*info->descriptor) ||
-	    descriptor->guid_id != info->descriptor->guid_id ||
-	    efi_guidcmp(descriptor->guid, info->descriptor->guid))
-		return AE_OK;
-
-	length = vendor->length - sizeof(struct acpi_vendor_descriptor);
-	info->data = acpi_os_allocate(length);
-	if (!info->data)
-		return AE_NO_MEMORY;
-
-	memcpy(info->data,
-	       vendor->reserved + sizeof(struct acpi_vendor_descriptor),
-	       length);
-	info->length = length;
-	return AE_CTRL_TERMINATE;
-}
+	acpi_status status;
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	struct acpi_resource *resource;
+	struct acpi_resource_vendor_typed *vendor;
 
-acpi_status
-acpi_find_vendor_resource(acpi_handle obj, struct acpi_vendor_descriptor * id,
-			  u8 ** data, u32 * length)
-{
-	struct acpi_vendor_info info;
+	status = acpi_get_vendor_resource(obj, METHOD_NAME__CRS, &hp_ccsr_uuid,
+		&buffer);
 
-	info.descriptor = id;
-	info.data = NULL;
+	resource = buffer.pointer;
+	vendor = &resource->data.vendor_typed;
 
-	acpi_walk_resources(obj, METHOD_NAME__CRS, acpi_vendor_resource_match,
-			    &info);
-	if (!info.data)
-		return AE_NOT_FOUND;
+	if (ACPI_FAILURE(status) || vendor->byte_length < 16) {
+		status = AE_NOT_FOUND;
+		goto exit;
+	}
 
-	*data = info.data;
-	*length = info.length;
-	return AE_OK;
+	memcpy(base, vendor->byte_data, sizeof(*base));
+	memcpy(length, vendor->byte_data + 8, sizeof(*length));
+
+  exit:
+	kfree(buffer.pointer);
+	return status;
 }
 
-struct acpi_vendor_descriptor hp_ccsr_descriptor = {
-	.guid_id = 2,
-	.guid =
-	    EFI_GUID(0x69e9adf9, 0x924f, 0xab5f, 0xf6, 0x4a, 0x24, 0xd2, 0x01,
-		     0x37, 0x0e, 0xad)
+struct csr_space {
+	u64	base;
+	u64	length;
 };
 
-acpi_status hp_acpi_csr_space(acpi_handle obj, u64 * csr_base, u64 * csr_length)
+static acpi_status find_csr_space(struct acpi_resource *resource, void *data)
 {
+	struct csr_space *space = data;
+	struct acpi_resource_address64 addr;
 	acpi_status status;
-	u8 *data;
-	u32 length;
 
-	status =
-	    acpi_find_vendor_resource(obj, &hp_ccsr_descriptor, &data, &length);
+	status = acpi_resource_to_address64(resource, &addr);
+	if (ACPI_SUCCESS(status) &&
+	    addr.resource_type == ACPI_MEMORY_RANGE &&
+	    addr.address_length &&
+	    addr.producer_consumer == ACPI_CONSUMER) {
+		space->base = addr.minimum;
+		space->length = addr.address_length;
+		return AE_CTRL_TERMINATE;
+	}
+	return AE_OK;		/* keep looking */
+}
 
-	if (ACPI_FAILURE(status) || length != 16)
-		return AE_NOT_FOUND;
+static acpi_status hp_crs_locate(acpi_handle obj, u64 *base, u64 *length)
+{
+	struct csr_space space = { 0, 0 };
 
-	memcpy(csr_base, data, sizeof(*csr_base));
-	memcpy(csr_length, data + 8, sizeof(*csr_length));
-	acpi_os_free(data);
+	acpi_walk_resources(obj, METHOD_NAME__CRS, find_csr_space, &space);
+	if (!space.length)
+		return AE_NOT_FOUND;
 
+	*base = space.base;
+	*length = space.length;
 	return AE_OK;
 }
 
+acpi_status hp_acpi_csr_space(acpi_handle obj, u64 *csr_base, u64 *csr_length)
+{
+	acpi_status status;
+
+	status = hp_ccsr_locate(obj, csr_base, csr_length);
+	if (ACPI_SUCCESS(status))
+		return status;
+
+	return hp_crs_locate(obj, csr_base, csr_length);
+}
 EXPORT_SYMBOL(hp_acpi_csr_space);
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 9ad94ddf668..615ef81def4 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -32,7 +32,6 @@
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -45,46 +44,47 @@
 #include <linux/efi.h>
 #include <linux/mmzone.h>
 #include <linux/nodemask.h>
+#include <linux/slab.h>
+#include <acpi/processor.h>
 #include <asm/io.h>
 #include <asm/iosapic.h>
 #include <asm/machvec.h>
 #include <asm/page.h>
-#include <asm/system.h>
 #include <asm/numa.h>
 #include <asm/sal.h>
 #include <asm/cyclone.h>
 
-#define BAD_MADT_ENTRY(entry, end) (                                        \
-		(!entry) || (unsigned long)entry + sizeof(*entry) > end ||  \
-		((acpi_table_entry_header *)entry)->length != sizeof(*entry))
-
 #define PREFIX			"ACPI: "
 
-void (*pm_idle) (void);
-EXPORT_SYMBOL(pm_idle);
-void (*pm_power_off) (void);
-EXPORT_SYMBOL(pm_power_off);
-
-unsigned char acpi_kbd_controller_present = 1;
-unsigned char acpi_legacy_devices;
-
-static unsigned int __initdata acpi_madt_rev;
-
+int acpi_lapic;
 unsigned int acpi_cpei_override;
 unsigned int acpi_cpei_phys_cpuid;
 
-#define MAX_SAPICS 256
-u16 ia64_acpiid_to_sapicid[MAX_SAPICS] = {[0 ... MAX_SAPICS - 1] = -1 };
+unsigned long acpi_wakeup_address = 0;
 
-EXPORT_SYMBOL(ia64_acpiid_to_sapicid);
+#ifdef CONFIG_IA64_GENERIC
+static unsigned long __init acpi_find_rsdp(void)
+{
+	unsigned long rsdp_phys = 0;
+
+	if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
+		rsdp_phys = efi.acpi20;
+	else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
+		printk(KERN_WARNING PREFIX
+		       "v1.0/r0.71 tables no longer supported\n");
+	return rsdp_phys;
+}
 
-const char *acpi_get_sysname(void)
+const char __init *
+acpi_get_sysname(void)
 {
-#ifdef CONFIG_IA64_GENERIC
 	unsigned long rsdp_phys;
-	struct acpi20_table_rsdp *rsdp;
+	struct acpi_table_rsdp *rsdp;
 	struct acpi_table_xsdt *xsdt;
 	struct acpi_table_header *hdr;
+#ifdef CONFIG_INTEL_IOMMU
+	u64 i, nentries;
+#endif
 
 	rsdp_phys = acpi_find_rsdp();
 	if (!rsdp_phys) {
@@ -93,16 +93,16 @@ const char *acpi_get_sysname(void)
 		return "dig";
 	}
 
-	rsdp = (struct acpi20_table_rsdp *)__va(rsdp_phys);
-	if (strncmp(rsdp->signature, RSDP_SIG, sizeof(RSDP_SIG) - 1)) {
+	rsdp = (struct acpi_table_rsdp *)__va(rsdp_phys);
+	if (strncmp(rsdp->signature, ACPI_SIG_RSDP, sizeof(ACPI_SIG_RSDP) - 1)) {
 		printk(KERN_ERR
 		       "ACPI 2.0 RSDP signature incorrect, default to \"dig\"\n");
 		return "dig";
 	}
 
-	xsdt = (struct acpi_table_xsdt *)__va(rsdp->xsdt_address);
+	xsdt = (struct acpi_table_xsdt *)__va(rsdp->xsdt_physical_address);
 	hdr = &xsdt->header;
-	if (strncmp(hdr->signature, XSDT_SIG, sizeof(XSDT_SIG) - 1)) {
+	if (strncmp(hdr->signature, ACPI_SIG_XSDT, sizeof(ACPI_SIG_XSDT) - 1)) {
 		printk(KERN_ERR
 		       "ACPI 2.0 XSDT signature incorrect, default to \"dig\"\n");
 		return "dig";
@@ -111,28 +111,27 @@ const char *acpi_get_sysname(void)
 	if (!strcmp(hdr->oem_id, "HP")) {
 		return "hpzx1";
 	} else if (!strcmp(hdr->oem_id, "SGI")) {
-		return "sn2";
+		if (!strcmp(hdr->oem_table_id + 4, "UV"))
+			return "uv";
+		else
+			return "sn2";
 	}
 
-	return "dig";
-#else
-# if defined (CONFIG_IA64_HP_SIM)
-	return "hpsim";
-# elif defined (CONFIG_IA64_HP_ZX1)
-	return "hpzx1";
-# elif defined (CONFIG_IA64_HP_ZX1_SWIOTLB)
-	return "hpzx1_swiotlb";
-# elif defined (CONFIG_IA64_SGI_SN2)
-	return "sn2";
-# elif defined (CONFIG_IA64_DIG)
-	return "dig";
-# else
-#	error Unknown platform.  Fix acpi.c.
-# endif
+#ifdef CONFIG_INTEL_IOMMU
+	/* Look for Intel IOMMU */
+	nentries = (hdr->length - sizeof(*hdr)) /
+			 sizeof(xsdt->table_offset_entry[0]);
+	for (i = 0; i < nentries; i++) {
+		hdr = __va(xsdt->table_offset_entry[i]);
+		if (strncmp(hdr->signature, ACPI_SIG_DMAR,
+			sizeof(ACPI_SIG_DMAR) - 1) == 0)
+			return "dig_vtd";
+	}
 #endif
-}
 
-#ifdef CONFIG_ACPI
+	return "dig";
+}
+#endif /* CONFIG_IA64_GENERIC */
 
 #define ACPI_MAX_PLATFORM_INTERRUPTS	256
 
@@ -160,27 +159,30 @@ int acpi_request_vector(u32 int_type)
 	return vector;
 }
 
-char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
+char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size)
 {
 	return __va(phys_addr);
 }
 
+void __init __acpi_unmap_table(char *map, unsigned long size)
+{
+}
+
 /* --------------------------------------------------------------------------
                             Boot-time Table Parsing
    -------------------------------------------------------------------------- */
 
-static int total_cpus __initdata;
 static int available_cpus __initdata;
 struct acpi_table_madt *acpi_madt __initdata;
 static u8 has_8259;
 
 static int __init
-acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header,
+acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
 			  const unsigned long end)
 {
-	struct acpi_table_lapic_addr_ovr *lapic;
+	struct acpi_madt_local_apic_override *lapic;
 
-	lapic = (struct acpi_table_lapic_addr_ovr *)header;
+	lapic = (struct acpi_madt_local_apic_override *)header;
 
 	if (BAD_MADT_ENTRY(lapic, end))
 		return -EINVAL;
@@ -193,22 +195,19 @@ acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header,
 }
 
 static int __init
-acpi_parse_lsapic(acpi_table_entry_header * header, const unsigned long end)
+acpi_parse_lsapic(struct acpi_subtable_header * header, const unsigned long end)
 {
-	struct acpi_table_lsapic *lsapic;
+	struct acpi_madt_local_sapic *lsapic;
 
-	lsapic = (struct acpi_table_lsapic *)header;
+	lsapic = (struct acpi_madt_local_sapic *)header;
 
-	if (BAD_MADT_ENTRY(lsapic, end))
-		return -EINVAL;
+	/*Skip BAD_MADT_ENTRY check, as lsapic size could vary */
 
-	if (lsapic->flags.enabled) {
+	if (lsapic->lapic_flags & ACPI_MADT_ENABLED) {
 #ifdef CONFIG_SMP
 		smp_boot_data.cpu_phys_id[available_cpus] =
 		    (lsapic->id << 8) | lsapic->eid;
 #endif
-		ia64_acpiid_to_sapicid[lsapic->acpi_id] =
-		    (lsapic->id << 8) | lsapic->eid;
 		++available_cpus;
 	}
 
@@ -217,11 +216,11 @@ acpi_parse_lsapic(acpi_table_entry_header * header, const unsigned long end)
 }
 
 static int __init
-acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end)
+acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end)
 {
-	struct acpi_table_lapic_nmi *lacpi_nmi;
+	struct acpi_madt_local_apic_nmi *lacpi_nmi;
 
-	lacpi_nmi = (struct acpi_table_lapic_nmi *)header;
+	lacpi_nmi = (struct acpi_madt_local_apic_nmi *)header;
 
 	if (BAD_MADT_ENTRY(lacpi_nmi, end))
 		return -EINVAL;
@@ -231,11 +230,11 @@ acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end)
 }
 
 static int __init
-acpi_parse_iosapic(acpi_table_entry_header * header, const unsigned long end)
+acpi_parse_iosapic(struct acpi_subtable_header * header, const unsigned long end)
 {
-	struct acpi_table_iosapic *iosapic;
+	struct acpi_madt_io_sapic *iosapic;
 
-	iosapic = (struct acpi_table_iosapic *)header;
+	iosapic = (struct acpi_madt_io_sapic *)header;
 
 	if (BAD_MADT_ENTRY(iosapic, end))
 		return -EINVAL;
@@ -243,14 +242,16 @@ acpi_parse_iosapic(acpi_table_entry_header * header, const unsigned long end)
 	return iosapic_init(iosapic->address, iosapic->global_irq_base);
 }
 
+static unsigned int __initdata acpi_madt_rev;
+
 static int __init
-acpi_parse_plat_int_src(acpi_table_entry_header * header,
+acpi_parse_plat_int_src(struct acpi_subtable_header * header,
 			const unsigned long end)
 {
-	struct acpi_table_plat_int_src *plintsrc;
+	struct acpi_madt_interrupt_source *plintsrc;
 	int vector;
 
-	plintsrc = (struct acpi_table_plat_int_src *)header;
+	plintsrc = (struct acpi_madt_interrupt_source *)header;
 
 	if (BAD_MADT_ENTRY(plintsrc, end))
 		return -EINVAL;
@@ -261,19 +262,19 @@ acpi_parse_plat_int_src(acpi_table_entry_header * header,
 	 */
 	vector = iosapic_register_platform_intr(plintsrc->type,
 						plintsrc->global_irq,
-						plintsrc->iosapic_vector,
+						plintsrc->io_sapic_vector,
 						plintsrc->eid,
 						plintsrc->id,
-						(plintsrc->flags.polarity ==
-						 1) ? IOSAPIC_POL_HIGH :
-						IOSAPIC_POL_LOW,
-						(plintsrc->flags.trigger ==
-						 1) ? IOSAPIC_EDGE :
-						IOSAPIC_LEVEL);
+						((plintsrc->inti_flags & ACPI_MADT_POLARITY_MASK) ==
+						 ACPI_MADT_POLARITY_ACTIVE_HIGH) ?
+						IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW,
+						((plintsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) ==
+						 ACPI_MADT_TRIGGER_EDGE) ?
+						IOSAPIC_EDGE : IOSAPIC_LEVEL);
 
 	platform_intr_list[plintsrc->type] = vector;
 	if (acpi_madt_rev > 1) {
-		acpi_cpei_override = plintsrc->plint_flags.cpei_override_flag;
+		acpi_cpei_override = plintsrc->flags & ACPI_MADT_CPEI_OVERRIDE;
 	}
 
 	/*
@@ -284,19 +285,24 @@ acpi_parse_plat_int_src(acpi_table_entry_header * header,
 	return 0;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
 unsigned int can_cpei_retarget(void)
 {
 	extern int cpe_vector;
+	extern unsigned int force_cpei_retarget;
 
 	/*
 	 * Only if CPEI is supported and the override flag
 	 * is present, otherwise return that its re-targettable
 	 * if we are in polling mode.
 	 */
-	if (cpe_vector > 0 && !acpi_cpei_override)
-		return 0;
-	else
-		return 1;
+	if (cpe_vector > 0) {
+		if (acpi_cpei_override || force_cpei_retarget)
+			return 1;
+		else
+			return 0;
+	}
+	return 1;
 }
 
 unsigned int is_cpu_cpei_target(unsigned int cpu)
@@ -315,6 +321,7 @@ void set_cpei_target_cpu(unsigned int cpu)
 {
 	acpi_cpei_phys_cpuid = cpu_physical_id(cpu);
 }
+#endif
 
 unsigned int get_cpei_target_cpu(void)
 {
@@ -322,30 +329,32 @@ unsigned int get_cpei_target_cpu(void)
 }
 
 static int __init
-acpi_parse_int_src_ovr(acpi_table_entry_header * header,
+acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
 		       const unsigned long end)
 {
-	struct acpi_table_int_src_ovr *p;
+	struct acpi_madt_interrupt_override *p;
 
-	p = (struct acpi_table_int_src_ovr *)header;
+	p = (struct acpi_madt_interrupt_override *)header;
 
 	if (BAD_MADT_ENTRY(p, end))
 		return -EINVAL;
 
-	iosapic_override_isa_irq(p->bus_irq, p->global_irq,
-				 (p->flags.polarity ==
-				  1) ? IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW,
-				 (p->flags.trigger ==
-				  1) ? IOSAPIC_EDGE : IOSAPIC_LEVEL);
+	iosapic_override_isa_irq(p->source_irq, p->global_irq,
+				 ((p->inti_flags & ACPI_MADT_POLARITY_MASK) ==
+				  ACPI_MADT_POLARITY_ACTIVE_LOW) ?
+				 IOSAPIC_POL_LOW : IOSAPIC_POL_HIGH,
+				 ((p->inti_flags & ACPI_MADT_TRIGGER_MASK) ==
+				 ACPI_MADT_TRIGGER_LEVEL) ?
+				 IOSAPIC_LEVEL : IOSAPIC_EDGE);
 	return 0;
 }
 
 static int __init
-acpi_parse_nmi_src(acpi_table_entry_header * header, const unsigned long end)
+acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end)
 {
-	struct acpi_table_nmi_src *nmi_src;
+	struct acpi_madt_nmi_source *nmi_src;
 
-	nmi_src = (struct acpi_table_nmi_src *)header;
+	nmi_src = (struct acpi_madt_nmi_source *)header;
 
 	if (BAD_MADT_ENTRY(nmi_src, end))
 		return -EINVAL;
@@ -369,12 +378,12 @@ static void __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 	}
 }
 
-static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
+static int __init acpi_parse_madt(struct acpi_table_header *table)
 {
-	if (!phys_addr || !size)
+	if (!table)
 		return -EINVAL;
 
-	acpi_madt = (struct acpi_table_madt *)__va(phys_addr);
+	acpi_madt = (struct acpi_table_madt *)table;
 
 	acpi_madt_rev = acpi_madt->header.revision;
 
@@ -382,14 +391,14 @@ static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
 #ifdef CONFIG_ITANIUM
 	has_8259 = 1;		/* Firmware on old Itanium systems is broken */
 #else
-	has_8259 = acpi_madt->flags.pcat_compat;
+	has_8259 = acpi_madt->flags & ACPI_MADT_PCAT_COMPAT;
 #endif
 	iosapic_system_init(has_8259);
 
 	/* Get base address of IPI Message Block */
 
-	if (acpi_madt->lapic_address)
-		ipi_base_addr = ioremap(acpi_madt->lapic_address, 0);
+	if (acpi_madt->address)
+		ipi_base_addr = ioremap(acpi_madt->address, 0);
 
 	printk(KERN_INFO PREFIX "Local APIC address %p\n", ipi_base_addr);
 
@@ -406,13 +415,34 @@ static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
 #define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
 
 static int __initdata srat_num_cpus;	/* number of cpus */
-static u32 __devinitdata pxm_flag[PXM_FLAG_LEN];
+static u32 pxm_flag[PXM_FLAG_LEN];
 #define pxm_bit_set(bit)	(set_bit(bit,(void *)pxm_flag))
 #define pxm_bit_test(bit)	(test_bit(bit,(void *)pxm_flag))
-/* maps to convert between proximity domain and logical node ID */
-int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS];
-int __initdata nid_to_pxm_map[MAX_NUMNODES];
 static struct acpi_table_slit __initdata *slit_table;
+cpumask_t early_cpu_possible_map = CPU_MASK_NONE;
+
+static int __init
+get_processor_proximity_domain(struct acpi_srat_cpu_affinity *pa)
+{
+	int pxm;
+
+	pxm = pa->proximity_domain_lo;
+	if (ia64_platform_is("sn2") || acpi_srat_revision >= 2)
+		pxm += pa->proximity_domain_hi[0] << 8;
+	return pxm;
+}
+
+static int __init
+get_memory_proximity_domain(struct acpi_srat_mem_affinity *ma)
+{
+	int pxm;
+
+	pxm = ma->proximity_domain;
+	if (!ia64_platform_is("sn2") && acpi_srat_revision <= 1)
+		pxm &= 0xff;
+
+	return pxm;
+}
 
 /*
  * ACPI 2.0 SLIT (System Locality Information Table)
@@ -423,48 +453,59 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 	u32 len;
 
 	len = sizeof(struct acpi_table_header) + 8
-	    + slit->localities * slit->localities;
+	    + slit->locality_count * slit->locality_count;
 	if (slit->header.length != len) {
 		printk(KERN_ERR
 		       "ACPI 2.0 SLIT: size mismatch: %d expected, %d actual\n",
 		       len, slit->header.length);
-		memset(numa_slit, 10, sizeof(numa_slit));
 		return;
 	}
 	slit_table = slit;
 }
 
 void __init
-acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
+acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 {
+	int pxm;
+
+	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
+		return;
+
+	if (srat_num_cpus >= ARRAY_SIZE(node_cpuid)) {
+		printk_once(KERN_WARNING
+			    "node_cpuid[%ld] is too small, may not be able to use all cpus\n",
+			    ARRAY_SIZE(node_cpuid));
+		return;
+	}
+	pxm = get_processor_proximity_domain(pa);
+
 	/* record this node in proximity bitmap */
-	pxm_bit_set(pa->proximity_domain);
+	pxm_bit_set(pxm);
 
 	node_cpuid[srat_num_cpus].phys_id =
-	    (pa->apic_id << 8) | (pa->lsapic_eid);
+	    (pa->apic_id << 8) | (pa->local_sapic_eid);
 	/* nid should be overridden as logical node id later */
-	node_cpuid[srat_num_cpus].nid = pa->proximity_domain;
+	node_cpuid[srat_num_cpus].nid = pxm;
+	cpu_set(srat_num_cpus, early_cpu_possible_map);
 	srat_num_cpus++;
 }
 
-void __init
-acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
+int __init
+acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
 	unsigned long paddr, size;
-	u8 pxm;
+	int pxm;
 	struct node_memblk_s *p, *q, *pend;
 
-	pxm = ma->proximity_domain;
+	pxm = get_memory_proximity_domain(ma);
 
 	/* fill node memory chunk structure */
-	paddr = ma->base_addr_hi;
-	paddr = (paddr << 32) | ma->base_addr_lo;
-	size = ma->length_hi;
-	size = (size << 32) | ma->length_lo;
+	paddr = ma->base_address;
+	size = ma->length;
 
 	/* Ignore disabled entries */
-	if (!ma->flags.enabled)
-		return;
+	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
+		return -1;
 
 	/* record this node in proximity bitmap */
 	pxm_bit_set(pxm);
@@ -483,6 +524,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 	p->size = size;
 	p->nid = pxm;
 	num_node_memblks++;
+	return 0;
 }
 
 void __init acpi_numa_arch_fixup(void)
@@ -500,22 +542,17 @@ void __init acpi_numa_arch_fixup(void)
 	 * MCD - This can probably be dropped now.  No need for pxm ID to node ID
 	 * mapping with sparse node numbering iff MAX_PXM_DOMAINS <= MAX_NUMNODES.
 	 */
-	/* calculate total number of nodes in system from PXM bitmap */
-	memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
-	memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
 	nodes_clear(node_online_map);
 	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
 		if (pxm_bit_test(i)) {
-			int nid = num_online_nodes();
-			pxm_to_nid_map[i] = nid;
-			nid_to_pxm_map[nid] = i;
+			int nid = acpi_map_pxm_to_node(i);
 			node_set_online(nid);
 		}
 	}
 
 	/* set logical node id in memory chunk structure */
 	for (i = 0; i < num_node_memblks; i++)
-		node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid];
+		node_memblk[i].nid = pxm_to_node(node_memblk[i].nid);
 
 	/* assign memory bank numbers for each chunk on each node */
 	for_each_online_node(i) {
@@ -528,27 +565,33 @@ void __init acpi_numa_arch_fixup(void)
 	}
 
 	/* set logical node id in cpu structure */
-	for (i = 0; i < srat_num_cpus; i++)
-		node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].nid];
+	for_each_possible_early_cpu(i)
+		node_cpuid[i].nid = pxm_to_node(node_cpuid[i].nid);
 
 	printk(KERN_INFO "Number of logical nodes in system = %d\n",
 	       num_online_nodes());
 	printk(KERN_INFO "Number of memory chunks in system = %d\n",
 	       num_node_memblks);
 
-	if (!slit_table)
+	if (!slit_table) {
+		for (i = 0; i < MAX_NUMNODES; i++)
+			for (j = 0; j < MAX_NUMNODES; j++)
+				node_distance(i, j) = i == j ? LOCAL_DISTANCE :
+							REMOTE_DISTANCE;
 		return;
+	}
+
 	memset(numa_slit, -1, sizeof(numa_slit));
-	for (i = 0; i < slit_table->localities; i++) {
+	for (i = 0; i < slit_table->locality_count; i++) {
 		if (!pxm_bit_test(i))
 			continue;
-		node_from = pxm_to_nid_map[i];
-		for (j = 0; j < slit_table->localities; j++) {
+		node_from = pxm_to_node(i);
+		for (j = 0; j < slit_table->locality_count; j++) {
 			if (!pxm_bit_test(j))
 				continue;
-			node_to = pxm_to_nid_map[j];
+			node_to = pxm_to_node(j);
 			node_distance(node_from, node_to) =
-			    slit_table->entry[i * slit_table->localities + j];
+			    slit_table->entry[i * slit_table->locality_count + j];
 		}
 	}
 
@@ -567,63 +610,91 @@ void __init acpi_numa_arch_fixup(void)
  * success: return IRQ number (>=0)
  * failure: return < 0
  */
-int acpi_register_gsi(u32 gsi, int edge_level, int active_high_low)
+int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 {
+	if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
+		return gsi;
+
 	if (has_8259 && gsi < 16)
 		return isa_irq_to_vector(gsi);
 
 	return iosapic_register_intr(gsi,
-				     (active_high_low ==
+				     (polarity ==
 				      ACPI_ACTIVE_HIGH) ? IOSAPIC_POL_HIGH :
 				     IOSAPIC_POL_LOW,
-				     (edge_level ==
+				     (triggering ==
 				      ACPI_EDGE_SENSITIVE) ? IOSAPIC_EDGE :
 				     IOSAPIC_LEVEL);
 }
-
-EXPORT_SYMBOL(acpi_register_gsi);
+EXPORT_SYMBOL_GPL(acpi_register_gsi);
 
 void acpi_unregister_gsi(u32 gsi)
 {
+	if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
+		return;
+
+	if (has_8259 && gsi < 16)
+		return;
+
 	iosapic_unregister_intr(gsi);
 }
+EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
 
-EXPORT_SYMBOL(acpi_unregister_gsi);
-
-static int __init acpi_parse_fadt(unsigned long phys_addr, unsigned long size)
+static int __init acpi_parse_fadt(struct acpi_table_header *table)
 {
 	struct acpi_table_header *fadt_header;
-	struct fadt_descriptor_rev2 *fadt;
+	struct acpi_table_fadt *fadt;
 
-	if (!phys_addr || !size)
+	if (!table)
 		return -EINVAL;
 
-	fadt_header = (struct acpi_table_header *)__va(phys_addr);
+	fadt_header = (struct acpi_table_header *)table;
 	if (fadt_header->revision != 3)
 		return -ENODEV;	/* Only deal with ACPI 2.0 FADT */
 
-	fadt = (struct fadt_descriptor_rev2 *)fadt_header;
-
-	if (!(fadt->iapc_boot_arch & BAF_8042_KEYBOARD_CONTROLLER))
-		acpi_kbd_controller_present = 0;
-
-	if (fadt->iapc_boot_arch & BAF_LEGACY_DEVICES)
-		acpi_legacy_devices = 1;
+	fadt = (struct acpi_table_fadt *)fadt_header;
 
-	acpi_register_gsi(fadt->sci_int, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW);
+	acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE,
+				 ACPI_ACTIVE_LOW);
 	return 0;
 }
 
-unsigned long __init acpi_find_rsdp(void)
+int __init early_acpi_boot_init(void)
 {
-	unsigned long rsdp_phys = 0;
+	int ret;
 
-	if (efi.acpi20)
-		rsdp_phys = __pa(efi.acpi20);
-	else if (efi.acpi)
-		printk(KERN_WARNING PREFIX
-		       "v1.0/r0.71 tables no longer supported\n");
-	return rsdp_phys;
+	/*
+	 * do a partial walk of MADT to determine how many CPUs
+	 * we have including offline CPUs
+	 */
+	if (acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
+		printk(KERN_ERR PREFIX "Can't find MADT\n");
+		return 0;
+	}
+
+	ret = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
+		acpi_parse_lsapic, NR_CPUS);
+	if (ret < 1)
+		printk(KERN_ERR PREFIX
+		       "Error parsing MADT - no LAPIC entries\n");
+	else
+		acpi_lapic = 1;
+
+#ifdef CONFIG_SMP
+	if (available_cpus == 0) {
+		printk(KERN_INFO "ACPI: Found 0 CPUS; assuming 1\n");
+		printk(KERN_INFO "CPU 0 (0x%04x)", hard_smp_processor_id());
+		smp_boot_data.cpu_phys_id[available_cpus] =
+		    hard_smp_processor_id();
+		available_cpus = 1;	/* We've got at least one of these, no? */
+	}
+	smp_boot_data.cpu_count = available_cpus;
+#endif
+	/* Make boot-up look pretty */
+	printk(KERN_INFO "%d CPUs available, %d CPUs total\n", available_cpus,
+	       total_cpus);
+
+	return 0;
 }
 
 int __init acpi_boot_init(void)
@@ -637,7 +708,7 @@ int __init acpi_boot_init(void)
 	 * information -- the successor to MPS tables.
 	 */
 
-	if (acpi_table_parse(ACPI_APIC, acpi_parse_madt) < 1) {
+	if (acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
 		printk(KERN_ERR PREFIX "Can't find MADT\n");
 		goto skip_madt;
 	}
@@ -645,40 +716,37 @@ int __init acpi_boot_init(void)
 	/* Local APIC */
 
 	if (acpi_table_parse_madt
-	    (ACPI_MADT_LAPIC_ADDR_OVR, acpi_parse_lapic_addr_ovr, 0) < 0)
+	    (ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, acpi_parse_lapic_addr_ovr, 0) < 0)
 		printk(KERN_ERR PREFIX
 		       "Error parsing LAPIC address override entry\n");
 
-	if (acpi_table_parse_madt(ACPI_MADT_LSAPIC, acpi_parse_lsapic, NR_CPUS)
-	    < 1)
-		printk(KERN_ERR PREFIX
-		       "Error parsing MADT - no LAPIC entries\n");
-
-	if (acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0)
+	if (acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0)
 	    < 0)
 		printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
 
 	/* I/O APIC */
 
 	if (acpi_table_parse_madt
-	    (ACPI_MADT_IOSAPIC, acpi_parse_iosapic, NR_IOSAPICS) < 1)
-		printk(KERN_ERR PREFIX
-		       "Error parsing MADT - no IOSAPIC entries\n");
+	    (ACPI_MADT_TYPE_IO_SAPIC, acpi_parse_iosapic, NR_IOSAPICS) < 1) {
+		if (!ia64_platform_is("sn2"))
+			printk(KERN_ERR PREFIX
+			       "Error parsing MADT - no IOSAPIC entries\n");
+	}
 
 	/* System-Level Interrupt Routing */
 
 	if (acpi_table_parse_madt
-	    (ACPI_MADT_PLAT_INT_SRC, acpi_parse_plat_int_src,
+	    (ACPI_MADT_TYPE_INTERRUPT_SOURCE, acpi_parse_plat_int_src,
 	     ACPI_MAX_PLATFORM_INTERRUPTS) < 0)
 		printk(KERN_ERR PREFIX
 		       "Error parsing platform interrupt source entry\n");
 
 	if (acpi_table_parse_madt
-	    (ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, 0) < 0)
+	    (ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, 0) < 0)
 		printk(KERN_ERR PREFIX
 		       "Error parsing interrupt source overrides entry\n");
 
-	if (acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, 0) < 0)
+	if (acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, 0) < 0)
 		printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
       skip_madt:
 
@@ -688,21 +756,11 @@ int __init acpi_boot_init(void)
 	 * gets interrupts such as power and sleep buttons.  If it's not
 	 * on a Legacy interrupt, it needs to be setup.
 	 */
-	if (acpi_table_parse(ACPI_FADT, acpi_parse_fadt) < 1)
+	if (acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt))
 		printk(KERN_ERR PREFIX "Can't find FADT\n");
 
+#ifdef CONFIG_ACPI_NUMA
 #ifdef CONFIG_SMP
-	if (available_cpus == 0) {
-		printk(KERN_INFO "ACPI: Found 0 CPUS; assuming 1\n");
-		printk(KERN_INFO "CPU 0 (0x%04x)", hard_smp_processor_id());
-		smp_boot_data.cpu_phys_id[available_cpus] =
-		    hard_smp_processor_id();
-		available_cpus = 1;	/* We've got at least one of these, no? */
-	}
-	smp_boot_data.cpu_count = available_cpus;
-
-	smp_build_cpu_map();
-# ifdef CONFIG_ACPI_NUMA
 	if (srat_num_cpus == 0) {
 		int cpu, i = 1;
 		for (cpu = 0; cpu < smp_boot_data.cpu_count; cpu++)
@@ -711,121 +769,140 @@ int __init acpi_boot_init(void)
 				node_cpuid[i++].phys_id =
 				    smp_boot_data.cpu_phys_id[cpu];
 	}
-# endif
 #endif
-#ifdef CONFIG_ACPI_NUMA
 	build_cpu_to_node_map();
 #endif
-	/* Make boot-up look pretty */
-	printk(KERN_INFO "%d CPUs available, %d CPUs total\n", available_cpus,
-	       total_cpus);
 	return 0;
 }
 
 int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 {
-	int vector;
+	int tmp;
 
 	if (has_8259 && gsi < 16)
 		*irq = isa_irq_to_vector(gsi);
 	else {
-		vector = gsi_to_vector(gsi);
-		if (vector == -1)
+		tmp = gsi_to_irq(gsi);
+		if (tmp == -1)
 			return -1;
-
-		*irq = vector;
+		*irq = tmp;
 	}
 	return 0;
 }
 
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+	if (isa_irq >= 16)
+		return -1;
+	*gsi = isa_irq;
+	return 0;
+}
+
 /*
  *  ACPI based hotplug CPU support
  */
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
-static
-int acpi_map_cpu2node(acpi_handle handle, int cpu, long physid)
+static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 {
 #ifdef CONFIG_ACPI_NUMA
-	int pxm_id;
-
-	pxm_id = acpi_get_pxm(handle);
-
 	/*
-	 * Assuming that the container driver would have set the proximity
-	 * domain and would have initialized pxm_to_nid_map[pxm_id] && pxm_flag
+	 * We don't have cpu-only-node hotadd. But if the system equips
+	 * SRAT table, pxm is already found and node is ready.
+  	 * So, just pxm_to_nid(pxm) is OK.
+	 * This code here is for the system which doesn't have full SRAT
+  	 * table for possible cpus.
 	 */
-	node_cpuid[cpu].nid = (pxm_id < 0) ? 0 : pxm_to_nid_map[pxm_id];
-
 	node_cpuid[cpu].phys_id = physid;
+	node_cpuid[cpu].nid = acpi_get_node(handle);
 #endif
-	return (0);
+	return 0;
 }
 
-int acpi_map_lsapic(acpi_handle handle, int *pcpu)
+int additional_cpus __initdata = -1;
+
+static __init int setup_additional_cpus(char *s)
 {
-	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
-	union acpi_object *obj;
-	struct acpi_table_lsapic *lsapic;
-	cpumask_t tmp_map;
-	long physid;
-	int cpu;
+	if (s)
+		additional_cpus = simple_strtol(s, NULL, 0);
 
-	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
-		return -EINVAL;
+	return 0;
+}
 
-	if (!buffer.length || !buffer.pointer)
-		return -EINVAL;
+early_param("additional_cpus", setup_additional_cpus);
 
-	obj = buffer.pointer;
-	if (obj->type != ACPI_TYPE_BUFFER ||
-	    obj->buffer.length < sizeof(*lsapic)) {
-		acpi_os_free(buffer.pointer);
-		return -EINVAL;
-	}
+/*
+ * cpu_possible_mask should be static, it cannot change as CPUs
+ * are onlined, or offlined. The reason is per-cpu data-structures
+ * are allocated by some modules at init time, and dont expect to
+ * do this dynamically on cpu arrival/departure.
+ * cpu_present_mask on the other hand can change dynamically.
+ * In case when cpu_hotplug is not compiled, then we resort to current
+ * behaviour, which is cpu_possible == cpu_present.
+ * - Ashok Raj
+ *
+ * Three ways to find out the number of additional hotplug CPUs:
+ * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+ * - The user can overwrite it with additional_cpus=NUM
+ * - Otherwise don't reserve additional CPUs.
+ */
+__init void prefill_possible_map(void)
+{
+	int i;
+	int possible, disabled_cpus;
 
-	lsapic = (struct acpi_table_lsapic *)obj->buffer.pointer;
+	disabled_cpus = total_cpus - available_cpus;
 
-	if ((lsapic->header.type != ACPI_MADT_LSAPIC) ||
-	    (!lsapic->flags.enabled)) {
-		acpi_os_free(buffer.pointer);
-		return -EINVAL;
-	}
+ 	if (additional_cpus == -1) {
+ 		if (disabled_cpus > 0)
+			additional_cpus = disabled_cpus;
+ 		else
+			additional_cpus = 0;
+ 	}
 
-	physid = ((lsapic->id << 8) | (lsapic->eid));
+	possible = available_cpus + additional_cpus;
 
-	acpi_os_free(buffer.pointer);
-	buffer.length = ACPI_ALLOCATE_BUFFER;
-	buffer.pointer = NULL;
+	if (possible > nr_cpu_ids)
+		possible = nr_cpu_ids;
 
-	cpus_complement(tmp_map, cpu_present_map);
-	cpu = first_cpu(tmp_map);
-	if (cpu >= NR_CPUS)
+	printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
+		possible, max((possible - available_cpus), 0));
+
+	for (i = 0; i < possible; i++)
+		set_cpu_possible(i, true);
+}
+
+static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
+{
+	cpumask_t tmp_map;
+	int cpu;
+
+	cpumask_complement(&tmp_map, cpu_present_mask);
+	cpu = cpumask_first(&tmp_map);
+	if (cpu >= nr_cpu_ids)
 		return -EINVAL;
 
 	acpi_map_cpu2node(handle, cpu, physid);
 
-	cpu_set(cpu, cpu_present_map);
+	set_cpu_present(cpu, true);
 	ia64_cpu_to_sapicid[cpu] = physid;
-	ia64_acpiid_to_sapicid[lsapic->acpi_id] = ia64_cpu_to_sapicid[cpu];
+
+	acpi_processor_set_pdc(handle);
 
 	*pcpu = cpu;
 	return (0);
 }
 
+/* wrapper to silence section mismatch warning */
+int __ref acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
+{
+	return _acpi_map_lsapic(handle, physid, pcpu);
+}
 EXPORT_SYMBOL(acpi_map_lsapic);
 
 int acpi_unmap_lsapic(int cpu)
 {
-	int i;
-
-	for (i = 0; i < MAX_SAPICS; i++) {
-		if (ia64_acpiid_to_sapicid[i] == ia64_cpu_to_sapicid[cpu]) {
-			ia64_acpiid_to_sapicid[i] = -1;
-			break;
-		}
-	}
 	ia64_cpu_to_sapicid[cpu] = -1;
-	cpu_clear(cpu, cpu_present_map);
+	set_cpu_present(cpu, false);
 
 #ifdef CONFIG_ACPI_NUMA
 	/* NUMA specific cleanup's */
@@ -838,14 +915,14 @@ EXPORT_SYMBOL(acpi_unmap_lsapic);
 #endif				/* CONFIG_ACPI_HOTPLUG_CPU */
 
 #ifdef CONFIG_ACPI_NUMA
-static acpi_status __devinit
-acpi_map_iosapic(acpi_handle handle, u32 depth, void *context, void **ret)
+static acpi_status acpi_map_iosapic(acpi_handle handle, u32 depth,
+				    void *context, void **ret)
 {
 	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
 	union acpi_object *obj;
-	struct acpi_table_iosapic *iosapic;
+	struct acpi_madt_io_sapic *iosapic;
 	unsigned int gsi_base;
-	int pxm, node;
+	int node;
 
 	/* Only care about objects w/ a method that returns the MADT */
 	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
@@ -857,33 +934,25 @@ acpi_map_iosapic(acpi_handle handle, u32 depth, void *context, void **ret)
 	obj = buffer.pointer;
 	if (obj->type != ACPI_TYPE_BUFFER ||
 	    obj->buffer.length < sizeof(*iosapic)) {
-		acpi_os_free(buffer.pointer);
+		kfree(buffer.pointer);
 		return AE_OK;
 	}
 
-	iosapic = (struct acpi_table_iosapic *)obj->buffer.pointer;
+	iosapic = (struct acpi_madt_io_sapic *)obj->buffer.pointer;
 
-	if (iosapic->header.type != ACPI_MADT_IOSAPIC) {
-		acpi_os_free(buffer.pointer);
+	if (iosapic->header.type != ACPI_MADT_TYPE_IO_SAPIC) {
+		kfree(buffer.pointer);
 		return AE_OK;
 	}
 
 	gsi_base = iosapic->global_irq_base;
 
-	acpi_os_free(buffer.pointer);
-
-	/*
-	 * OK, it's an IOSAPIC MADT entry, look for a _PXM value to tell
-	 * us which node to associate this with.
-	 */
-	pxm = acpi_get_pxm(handle);
-	if (pxm < 0)
-		return AE_OK;
-
-	node = pxm_to_nid_map[pxm];
+	kfree(buffer.pointer);
 
-	if (node >= MAX_NUMNODES || !node_online(node) ||
-	    cpus_empty(node_to_cpumask(node)))
+	/* OK, it's an IOSAPIC MADT entry; associate it with a node */
+	node = acpi_get_node(handle);
+	if (node == NUMA_NO_NODE || !node_online(node) ||
+	    cpumask_empty(cpumask_of_node(node)))
 		return AE_OK;
 
 	/* We know a gsi to node mapping! */
@@ -901,7 +970,7 @@ acpi_map_iosapics (void)
 fs_initcall(acpi_map_iosapics);
 #endif				/* CONFIG_ACPI_NUMA */
 
-int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
+int __ref acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
 {
 	int err;
 
@@ -924,4 +993,9 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
 
 EXPORT_SYMBOL(acpi_unregister_ioapic);
 
-#endif				/* CONFIG_ACPI */
+/*
+ * acpi_suspend_lowlevel() - save kernel state and suspend.
+ *
+ * TBD when when IA64 starts to support suspend...
+ */
+int acpi_suspend_lowlevel(void) { return 0; }
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
index 77225659e96..60ef83e6db7 100644
--- a/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -5,22 +5,19 @@
  */
 
 #define ASM_OFFSETS_C 1
-#include <linux/config.h>
 
 #include <linux/sched.h>
-
-#include <asm-ia64/processor.h>
-#include <asm-ia64/ptrace.h>
-#include <asm-ia64/siginfo.h>
-#include <asm-ia64/sigcontext.h>
-#include <asm-ia64/mca.h>
+#include <linux/pid.h>
+#include <linux/clocksource.h>
+#include <linux/kbuild.h>
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/siginfo.h>
+#include <asm/sigcontext.h>
+#include <asm/mca.h>
 
 #include "../kernel/sigframe.h"
-
-#define DEFINE(sym, val) \
-        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
-
-#define BLANK() asm volatile("\n->" : : )
+#include "../kernel/fsyscall_gtod_data.h"
 
 void foo(void)
 {
@@ -33,16 +30,29 @@ void foo(void)
 	DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe));
 	DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info));
 
+	BUILD_BUG_ON(sizeof(struct upid) != 32);
+	DEFINE(IA64_UPID_SHIFT, 5);
+
 	BLANK();
 
 	DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
+	DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
 	DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count));
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	DEFINE(TI_AC_STAMP, offsetof(struct thread_info, ac_stamp));
+	DEFINE(TI_AC_LEAVE, offsetof(struct thread_info, ac_leave));
+	DEFINE(TI_AC_STIME, offsetof(struct thread_info, ac_stime));
+	DEFINE(TI_AC_UTIME, offsetof(struct thread_info, ac_utime));
+#endif
 
 	BLANK();
 
 	DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked));
 	DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid));
 	DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader));
+	DEFINE(IA64_TASK_TGIDLINK_OFFSET, offsetof (struct task_struct, pids[PIDTYPE_PID].pid));
+	DEFINE(IA64_PID_LEVEL_OFFSET, offsetof (struct pid, level));
+	DEFINE(IA64_PID_UPID_OFFSET, offsetof (struct pid, numbers[0]));
 	DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending));
 	DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid));
 	DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent));
@@ -217,16 +227,24 @@ void foo(void)
 	DEFINE(IA64_MCA_CPU_INIT_STACK_OFFSET,
 	       offsetof (struct ia64_mca_cpu, init_stack));
 	BLANK();
-	DEFINE(IA64_SAL_OS_STATE_COMMON_OFFSET,
-	       offsetof (struct ia64_sal_os_state, sal_ra));
 	DEFINE(IA64_SAL_OS_STATE_OS_GP_OFFSET,
 	       offsetof (struct ia64_sal_os_state, os_gp));
-	DEFINE(IA64_SAL_OS_STATE_PAL_MIN_STATE_OFFSET,
-	       offsetof (struct ia64_sal_os_state, pal_min_state));
 	DEFINE(IA64_SAL_OS_STATE_PROC_STATE_PARAM_OFFSET,
 	       offsetof (struct ia64_sal_os_state, proc_state_param));
+	DEFINE(IA64_SAL_OS_STATE_SAL_RA_OFFSET,
+	       offsetof (struct ia64_sal_os_state, sal_ra));
+	DEFINE(IA64_SAL_OS_STATE_SAL_GP_OFFSET,
+	       offsetof (struct ia64_sal_os_state, sal_gp));
+	DEFINE(IA64_SAL_OS_STATE_PAL_MIN_STATE_OFFSET,
+	       offsetof (struct ia64_sal_os_state, pal_min_state));
+	DEFINE(IA64_SAL_OS_STATE_OS_STATUS_OFFSET,
+	       offsetof (struct ia64_sal_os_state, os_status));
+	DEFINE(IA64_SAL_OS_STATE_CONTEXT_OFFSET,
+	       offsetof (struct ia64_sal_os_state, context));
 	DEFINE(IA64_SAL_OS_STATE_SIZE,
 	       sizeof (struct ia64_sal_os_state));
+	BLANK();
+
 	DEFINE(IA64_PMSA_GR_OFFSET,
 	       offsetof (struct pal_min_state_area_s, pmsa_gr));
 	DEFINE(IA64_PMSA_BANK1_GR_OFFSET,
@@ -248,17 +266,25 @@ void foo(void)
 	BLANK();
 
 	/* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */
-	DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr));
-	DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source));
-	DEFINE(IA64_TIME_INTERPOLATOR_SHIFT_OFFSET, offsetof (struct time_interpolator, shift));
-	DEFINE(IA64_TIME_INTERPOLATOR_NSEC_OFFSET, offsetof (struct time_interpolator, nsec_per_cyc));
-	DEFINE(IA64_TIME_INTERPOLATOR_OFFSET_OFFSET, offsetof (struct time_interpolator, offset));
-	DEFINE(IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET, offsetof (struct time_interpolator, last_cycle));
-	DEFINE(IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET, offsetof (struct time_interpolator, last_counter));
-	DEFINE(IA64_TIME_INTERPOLATOR_JITTER_OFFSET, offsetof (struct time_interpolator, jitter));
-	DEFINE(IA64_TIME_INTERPOLATOR_MASK_OFFSET, offsetof (struct time_interpolator, mask));
-	DEFINE(IA64_TIME_SOURCE_CPU, TIME_SOURCE_CPU);
-	DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64);
-	DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32);
-	DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec));
+	DEFINE(IA64_GTOD_SEQ_OFFSET,
+	       offsetof (struct fsyscall_gtod_data_t, seq));
+	DEFINE(IA64_GTOD_WALL_TIME_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, wall_time));
+	DEFINE(IA64_GTOD_MONO_TIME_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, monotonic_time));
+	DEFINE(IA64_CLKSRC_MASK_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, clk_mask));
+	DEFINE(IA64_CLKSRC_MULT_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, clk_mult));
+	DEFINE(IA64_CLKSRC_SHIFT_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, clk_shift));
+	DEFINE(IA64_CLKSRC_MMIO_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, clk_fsys_mmio));
+	DEFINE(IA64_CLKSRC_CYCLE_LAST_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, clk_cycle_last));
+	DEFINE(IA64_ITC_JITTER_OFFSET,
+		offsetof (struct itc_jitter_data_t, itc_jitter));
+	DEFINE(IA64_ITC_LASTCYCLE_OFFSET,
+		offsetof (struct itc_jitter_data_t, itc_lastcycle));
+
 }
diff --git a/arch/ia64/kernel/audit.c b/arch/ia64/kernel/audit.c
new file mode 100644
index 00000000000..96a9d18ff4c
--- /dev/null
+++ b/arch/ia64/kernel/audit.c
@@ -0,0 +1,60 @@
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/audit.h>
+#include <asm/unistd.h>
+
+static unsigned dir_class[] = {
+#include <asm-generic/audit_dir_write.h>
+~0U
+};
+
+static unsigned read_class[] = {
+#include <asm-generic/audit_read.h>
+~0U
+};
+
+static unsigned write_class[] = {
+#include <asm-generic/audit_write.h>
+~0U
+};
+
+static unsigned chattr_class[] = {
+#include <asm-generic/audit_change_attr.h>
+~0U
+};
+
+static unsigned signal_class[] = {
+#include <asm-generic/audit_signal.h>
+~0U
+};
+
+int audit_classify_arch(int arch)
+{
+	return 0;
+}
+
+int audit_classify_syscall(int abi, unsigned syscall)
+{
+	switch(syscall) {
+	case __NR_open:
+		return 2;
+	case __NR_openat:
+		return 3;
+	case __NR_execve:
+		return 5;
+	default:
+		return 0;
+	}
+}
+
+static int __init audit_classes_init(void)
+{
+	audit_register_class(AUDIT_CLASS_WRITE, write_class);
+	audit_register_class(AUDIT_CLASS_READ, read_class);
+	audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
+	audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
+	audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
+	return 0;
+}
+
+__initcall(audit_classes_init);
diff --git a/arch/ia64/kernel/cpufreq/Kconfig b/arch/ia64/kernel/cpufreq/Kconfig
deleted file mode 100644
index 2d9d5279b98..00000000000
--- a/arch/ia64/kernel/cpufreq/Kconfig
+++ /dev/null
@@ -1,29 +0,0 @@
-
-#
-# CPU Frequency scaling
-#
-
-menu "CPU Frequency scaling"
-
-source "drivers/cpufreq/Kconfig"
-
-if CPU_FREQ
-
-comment "CPUFreq processor drivers"
-
-config IA64_ACPI_CPUFREQ
-	tristate "ACPI Processor P-States driver"
-	select CPU_FREQ_TABLE
-	depends on ACPI_PROCESSOR
-	help
-	This driver adds a CPUFreq driver which utilizes the ACPI
-	Processor Performance States.
-
-	For details, take a look at <file:Documentation/cpu-freq/>.
-
-	If in doubt, say N.
-
-endif   # CPU_FREQ
-
-endmenu
-
diff --git a/arch/ia64/kernel/cpufreq/Makefile b/arch/ia64/kernel/cpufreq/Makefile
deleted file mode 100644
index f748d34c02f..00000000000
--- a/arch/ia64/kernel/cpufreq/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-$(CONFIG_IA64_ACPI_CPUFREQ)		+= acpi-cpufreq.o
diff --git a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c
deleted file mode 100644
index da4d5cf80a4..00000000000
--- a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c
+++ /dev/null
@@ -1,499 +0,0 @@
-/*
- * arch/ia64/kernel/cpufreq/acpi-cpufreq.c
- * This file provides the ACPI based P-state support. This
- * module works with generic cpufreq infrastructure. Most of
- * the code is based on i386 version
- * (arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c)
- *
- * Copyright (C) 2005 Intel Corp
- *      Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- */
-
-#include <linux/config.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <asm/pal.h>
-
-#include <linux/acpi.h>
-#include <acpi/processor.h>
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg)
-
-MODULE_AUTHOR("Venkatesh Pallipadi");
-MODULE_DESCRIPTION("ACPI Processor P-States Driver");
-MODULE_LICENSE("GPL");
-
-
-struct cpufreq_acpi_io {
-	struct acpi_processor_performance	acpi_data;
-	struct cpufreq_frequency_table		*freq_table;
-	unsigned int				resume;
-};
-
-static struct cpufreq_acpi_io	*acpi_io_data[NR_CPUS];
-
-static struct cpufreq_driver acpi_cpufreq_driver;
-
-
-static int
-processor_set_pstate (
-	u32	value)
-{
-	s64 retval;
-
-	dprintk("processor_set_pstate\n");
-
-	retval = ia64_pal_set_pstate((u64)value);
-
-	if (retval) {
-		dprintk("Failed to set freq to 0x%x, with error 0x%x\n",
-		        value, retval);
-		return -ENODEV;
-	}
-	return (int)retval;
-}
-
-
-static int
-processor_get_pstate (
-	u32	*value)
-{
-	u64	pstate_index = 0;
-	s64 	retval;
-
-	dprintk("processor_get_pstate\n");
-
-	retval = ia64_pal_get_pstate(&pstate_index);
-	*value = (u32) pstate_index;
-
-	if (retval)
-		dprintk("Failed to get current freq with "
-		        "error 0x%x, idx 0x%x\n", retval, *value);
-
-	return (int)retval;
-}
-
-
-/* To be used only after data->acpi_data is initialized */
-static unsigned
-extract_clock (
-	struct cpufreq_acpi_io *data,
-	unsigned value,
-	unsigned int cpu)
-{
-	unsigned long i;
-
-	dprintk("extract_clock\n");
-
-	for (i = 0; i < data->acpi_data.state_count; i++) {
-		if (value >= data->acpi_data.states[i].control)
-			return data->acpi_data.states[i].core_frequency;
-	}
-	return data->acpi_data.states[i-1].core_frequency;
-}
-
-
-static unsigned int
-processor_get_freq (
-	struct cpufreq_acpi_io	*data,
-	unsigned int		cpu)
-{
-	int			ret = 0;
-	u32			value = 0;
-	cpumask_t		saved_mask;
-	unsigned long 		clock_freq;
-
-	dprintk("processor_get_freq\n");
-
-	saved_mask = current->cpus_allowed;
-	set_cpus_allowed(current, cpumask_of_cpu(cpu));
-	if (smp_processor_id() != cpu) {
-		ret = -EAGAIN;
-		goto migrate_end;
-	}
-
-	/*
-	 * processor_get_pstate gets the average frequency since the
-	 * last get. So, do two PAL_get_freq()...
-	 */
-	ret = processor_get_pstate(&value);
-	ret = processor_get_pstate(&value);
-
-	if (ret) {
-		set_cpus_allowed(current, saved_mask);
-		printk(KERN_WARNING "get performance failed with error %d\n",
-		       ret);
-		ret = -EAGAIN;
-		goto migrate_end;
-	}
-	clock_freq = extract_clock(data, value, cpu);
-	ret = (clock_freq*1000);
-
-migrate_end:
-	set_cpus_allowed(current, saved_mask);
-	return ret;
-}
-
-
-static int
-processor_set_freq (
-	struct cpufreq_acpi_io	*data,
-	unsigned int		cpu,
-	int			state)
-{
-	int			ret = 0;
-	u32			value = 0;
-	struct cpufreq_freqs    cpufreq_freqs;
-	cpumask_t		saved_mask;
-	int			retval;
-
-	dprintk("processor_set_freq\n");
-
-	saved_mask = current->cpus_allowed;
-	set_cpus_allowed(current, cpumask_of_cpu(cpu));
-	if (smp_processor_id() != cpu) {
-		retval = -EAGAIN;
-		goto migrate_end;
-	}
-
-	if (state == data->acpi_data.state) {
-		if (unlikely(data->resume)) {
-			dprintk("Called after resume, resetting to P%d\n", state);
-			data->resume = 0;
-		} else {
-			dprintk("Already at target state (P%d)\n", state);
-			retval = 0;
-			goto migrate_end;
-		}
-	}
-
-	dprintk("Transitioning from P%d to P%d\n",
-		data->acpi_data.state, state);
-
-	/* cpufreq frequency struct */
-	cpufreq_freqs.cpu = cpu;
-	cpufreq_freqs.old = data->freq_table[data->acpi_data.state].frequency;
-	cpufreq_freqs.new = data->freq_table[state].frequency;
-
-	/* notify cpufreq */
-	cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE);
-
-	/*
-	 * First we write the target state's 'control' value to the
-	 * control_register.
-	 */
-
-	value = (u32) data->acpi_data.states[state].control;
-
-	dprintk("Transitioning to state: 0x%08x\n", value);
-
-	ret = processor_set_pstate(value);
-	if (ret) {
-		unsigned int tmp = cpufreq_freqs.new;
-		cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE);
-		cpufreq_freqs.new = cpufreq_freqs.old;
-		cpufreq_freqs.old = tmp;
-		cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE);
-		cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE);
-		printk(KERN_WARNING "Transition failed with error %d\n", ret);
-		retval = -ENODEV;
-		goto migrate_end;
-	}
-
-	cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE);
-
-	data->acpi_data.state = state;
-
-	retval = 0;
-
-migrate_end:
-	set_cpus_allowed(current, saved_mask);
-	return (retval);
-}
-
-
-static unsigned int
-acpi_cpufreq_get (
-	unsigned int		cpu)
-{
-	struct cpufreq_acpi_io *data = acpi_io_data[cpu];
-
-	dprintk("acpi_cpufreq_get\n");
-
-	return processor_get_freq(data, cpu);
-}
-
-
-static int
-acpi_cpufreq_target (
-	struct cpufreq_policy   *policy,
-	unsigned int target_freq,
-	unsigned int relation)
-{
-	struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
-	unsigned int next_state = 0;
-	unsigned int result = 0;
-
-	dprintk("acpi_cpufreq_setpolicy\n");
-
-	result = cpufreq_frequency_table_target(policy,
-			data->freq_table, target_freq, relation, &next_state);
-	if (result)
-		return (result);
-
-	result = processor_set_freq(data, policy->cpu, next_state);
-
-	return (result);
-}
-
-
-static int
-acpi_cpufreq_verify (
-	struct cpufreq_policy   *policy)
-{
-	unsigned int result = 0;
-	struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
-
-	dprintk("acpi_cpufreq_verify\n");
-
-	result = cpufreq_frequency_table_verify(policy,
-			data->freq_table);
-
-	return (result);
-}
-
-
-/*
- * processor_init_pdc - let BIOS know about the SMP capabilities
- * of this driver
- * @perf: processor-specific acpi_io_data struct
- * @cpu: CPU being initialized
- *
- * To avoid issues with legacy OSes, some BIOSes require to be informed of
- * the SMP capabilities of OS P-state driver. Here we set the bits in _PDC
- * accordingly. Actual call to _PDC is done in driver/acpi/processor.c
- */
-static void
-processor_init_pdc (
-		struct acpi_processor_performance *perf,
-		unsigned int cpu,
-		struct acpi_object_list *obj_list
-		)
-{
-	union acpi_object *obj;
-	u32 *buf;
-
-	dprintk("processor_init_pdc\n");
-
-	perf->pdc = NULL;
-	/* Initialize pdc. It will be used later. */
-	if (!obj_list)
-		return;
-
-	if (!(obj_list->count && obj_list->pointer))
-		return;
-
-	obj = obj_list->pointer;
-	if ((obj->buffer.length == 12) && obj->buffer.pointer) {
-		buf = (u32 *)obj->buffer.pointer;
-       		buf[0] = ACPI_PDC_REVISION_ID;
-       		buf[1] = 1;
-       		buf[2] = ACPI_PDC_EST_CAPABILITY_SMP;
-		perf->pdc = obj_list;
-	}
-	return;
-}
-
-
-static int
-acpi_cpufreq_cpu_init (
-	struct cpufreq_policy   *policy)
-{
-	unsigned int		i;
-	unsigned int		cpu = policy->cpu;
-	struct cpufreq_acpi_io	*data;
-	unsigned int		result = 0;
-
-	union acpi_object		arg0 = {ACPI_TYPE_BUFFER};
-	u32				arg0_buf[3];
-	struct acpi_object_list 	arg_list = {1, &arg0};
-
-	dprintk("acpi_cpufreq_cpu_init\n");
-	/* setup arg_list for _PDC settings */
-        arg0.buffer.length = 12;
-        arg0.buffer.pointer = (u8 *) arg0_buf;
-
-	data = kmalloc(sizeof(struct cpufreq_acpi_io), GFP_KERNEL);
-	if (!data)
-		return (-ENOMEM);
-
-	memset(data, 0, sizeof(struct cpufreq_acpi_io));
-
-	acpi_io_data[cpu] = data;
-
-	processor_init_pdc(&data->acpi_data, cpu, &arg_list);
-	result = acpi_processor_register_performance(&data->acpi_data, cpu);
-	data->acpi_data.pdc = NULL;
-
-	if (result)
-		goto err_free;
-
-	/* capability check */
-	if (data->acpi_data.state_count <= 1) {
-		dprintk("No P-States\n");
-		result = -ENODEV;
-		goto err_unreg;
-	}
-
-	if ((data->acpi_data.control_register.space_id !=
-					ACPI_ADR_SPACE_FIXED_HARDWARE) ||
-	    (data->acpi_data.status_register.space_id !=
-					ACPI_ADR_SPACE_FIXED_HARDWARE)) {
-		dprintk("Unsupported address space [%d, %d]\n",
-			(u32) (data->acpi_data.control_register.space_id),
-			(u32) (data->acpi_data.status_register.space_id));
-		result = -ENODEV;
-		goto err_unreg;
-	}
-
-	/* alloc freq_table */
-	data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
-	                           (data->acpi_data.state_count + 1),
-	                           GFP_KERNEL);
-	if (!data->freq_table) {
-		result = -ENOMEM;
-		goto err_unreg;
-	}
-
-	/* detect transition latency */
-	policy->cpuinfo.transition_latency = 0;
-	for (i=0; i<data->acpi_data.state_count; i++) {
-		if ((data->acpi_data.states[i].transition_latency * 1000) >
-		    policy->cpuinfo.transition_latency) {
-			policy->cpuinfo.transition_latency =
-			    data->acpi_data.states[i].transition_latency * 1000;
-		}
-	}
-	policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
-
-	policy->cur = processor_get_freq(data, policy->cpu);
-
-	/* table init */
-	for (i = 0; i <= data->acpi_data.state_count; i++)
-	{
-		data->freq_table[i].index = i;
-		if (i < data->acpi_data.state_count) {
-			data->freq_table[i].frequency =
-			      data->acpi_data.states[i].core_frequency * 1000;
-		} else {
-			data->freq_table[i].frequency = CPUFREQ_TABLE_END;
-		}
-	}
-
-	result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
-	if (result) {
-		goto err_freqfree;
-	}
-
-	/* notify BIOS that we exist */
-	acpi_processor_notify_smm(THIS_MODULE);
-
-	printk(KERN_INFO "acpi-cpufreq: CPU%u - ACPI performance management "
-	       "activated.\n", cpu);
-
-	for (i = 0; i < data->acpi_data.state_count; i++)
-		dprintk("     %cP%d: %d MHz, %d mW, %d uS, %d uS, 0x%x 0x%x\n",
-			(i == data->acpi_data.state?'*':' '), i,
-			(u32) data->acpi_data.states[i].core_frequency,
-			(u32) data->acpi_data.states[i].power,
-			(u32) data->acpi_data.states[i].transition_latency,
-			(u32) data->acpi_data.states[i].bus_master_latency,
-			(u32) data->acpi_data.states[i].status,
-			(u32) data->acpi_data.states[i].control);
-
-	cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
-
-	/* the first call to ->target() should result in us actually
-	 * writing something to the appropriate registers. */
-	data->resume = 1;
-
-	return (result);
-
- err_freqfree:
-	kfree(data->freq_table);
- err_unreg:
-	acpi_processor_unregister_performance(&data->acpi_data, cpu);
- err_free:
-	kfree(data);
-	acpi_io_data[cpu] = NULL;
-
-	return (result);
-}
-
-
-static int
-acpi_cpufreq_cpu_exit (
-	struct cpufreq_policy   *policy)
-{
-	struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
-
-	dprintk("acpi_cpufreq_cpu_exit\n");
-
-	if (data) {
-		cpufreq_frequency_table_put_attr(policy->cpu);
-		acpi_io_data[policy->cpu] = NULL;
-		acpi_processor_unregister_performance(&data->acpi_data,
-		                                      policy->cpu);
-		kfree(data);
-	}
-
-	return (0);
-}
-
-
-static struct freq_attr* acpi_cpufreq_attr[] = {
-	&cpufreq_freq_attr_scaling_available_freqs,
-	NULL,
-};
-
-
-static struct cpufreq_driver acpi_cpufreq_driver = {
-	.verify 	= acpi_cpufreq_verify,
-	.target 	= acpi_cpufreq_target,
-	.get 		= acpi_cpufreq_get,
-	.init		= acpi_cpufreq_cpu_init,
-	.exit		= acpi_cpufreq_cpu_exit,
-	.name		= "acpi-cpufreq",
-	.owner		= THIS_MODULE,
-	.attr           = acpi_cpufreq_attr,
-};
-
-
-static int __init
-acpi_cpufreq_init (void)
-{
-	dprintk("acpi_cpufreq_init\n");
-
- 	return cpufreq_register_driver(&acpi_cpufreq_driver);
-}
-
-
-static void __exit
-acpi_cpufreq_exit (void)
-{
-	dprintk("acpi_cpufreq_exit\n");
-
-	cpufreq_unregister_driver(&acpi_cpufreq_driver);
-	return;
-}
-
-
-late_initcall(acpi_cpufreq_init);
-module_exit(acpi_cpufreq_exit);
-
diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c
new file mode 100644
index 00000000000..2955f359e2a
--- /dev/null
+++ b/arch/ia64/kernel/crash.c
@@ -0,0 +1,286 @@
+/*
+ * arch/ia64/kernel/crash.c
+ *
+ * Architecture specific (ia64) functions for kexec based crash dumps.
+ *
+ * Created by: Khalid Aziz <khalid.aziz@hp.com>
+ * Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright (C) 2005 Intel Corp	Zou Nan hai <nanhai.zou@intel.com>
+ *
+ */
+#include <linux/smp.h>
+#include <linux/delay.h>
+#include <linux/crash_dump.h>
+#include <linux/bootmem.h>
+#include <linux/kexec.h>
+#include <linux/elfcore.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+#include <linux/kdebug.h>
+
+#include <asm/mca.h>
+
+int kdump_status[NR_CPUS];
+static atomic_t kdump_cpu_frozen;
+atomic_t kdump_in_progress;
+static int kdump_freeze_monarch;
+static int kdump_on_init = 1;
+static int kdump_on_fatal_mca = 1;
+
+static inline Elf64_Word
+*append_elf_note(Elf64_Word *buf, char *name, unsigned type, void *data,
+		size_t data_len)
+{
+	struct elf_note *note = (struct elf_note *)buf;
+	note->n_namesz = strlen(name) + 1;
+	note->n_descsz = data_len;
+	note->n_type   = type;
+	buf += (sizeof(*note) + 3)/4;
+	memcpy(buf, name, note->n_namesz);
+	buf += (note->n_namesz + 3)/4;
+	memcpy(buf, data, data_len);
+	buf += (data_len + 3)/4;
+	return buf;
+}
+
+static void
+final_note(void *buf)
+{
+	memset(buf, 0, sizeof(struct elf_note));
+}
+
+extern void ia64_dump_cpu_regs(void *);
+
+static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus);
+
+void
+crash_save_this_cpu(void)
+{
+	void *buf;
+	unsigned long cfm, sof, sol;
+
+	int cpu = smp_processor_id();
+	struct elf_prstatus *prstatus = &per_cpu(elf_prstatus, cpu);
+
+	elf_greg_t *dst = (elf_greg_t *)&(prstatus->pr_reg);
+	memset(prstatus, 0, sizeof(*prstatus));
+	prstatus->pr_pid = current->pid;
+
+	ia64_dump_cpu_regs(dst);
+	cfm = dst[43];
+	sol = (cfm >> 7) & 0x7f;
+	sof = cfm & 0x7f;
+	dst[46] = (unsigned long)ia64_rse_skip_regs((unsigned long *)dst[46],
+			sof - sol);
+
+	buf = (u64 *) per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, prstatus,
+			sizeof(*prstatus));
+	final_note(buf);
+}
+
+#ifdef CONFIG_SMP
+static int
+kdump_wait_cpu_freeze(void)
+{
+	int cpu_num = num_online_cpus() - 1;
+	int timeout = 1000;
+	while(timeout-- > 0) {
+		if (atomic_read(&kdump_cpu_frozen) == cpu_num)
+			return 0;
+		udelay(1000);
+	}
+	return 1;
+}
+#endif
+
+void
+machine_crash_shutdown(struct pt_regs *pt)
+{
+	/* This function is only called after the system
+	 * has paniced or is otherwise in a critical state.
+	 * The minimum amount of code to allow a kexec'd kernel
+	 * to run successfully needs to happen here.
+	 *
+	 * In practice this means shooting down the other cpus in
+	 * an SMP system.
+	 */
+	kexec_disable_iosapic();
+#ifdef CONFIG_SMP
+	/*
+	 * If kdump_on_init is set and an INIT is asserted here, kdump will
+	 * be started again via INIT monarch.
+	 */
+	local_irq_disable();
+	ia64_set_psr_mc();	/* mask MCA/INIT */
+	if (atomic_inc_return(&kdump_in_progress) != 1)
+		unw_init_running(kdump_cpu_freeze, NULL);
+
+	/*
+	 * Now this cpu is ready for kdump.
+	 * Stop all others by IPI or INIT.  They could receive INIT from
+	 * outside and might be INIT monarch, but only thing they have to
+	 * do is falling into kdump_cpu_freeze().
+	 *
+	 * If an INIT is asserted here:
+	 * - All receivers might be slaves, since some of cpus could already
+	 *   be frozen and INIT might be masked on monarch.  In this case,
+	 *   all slaves will be frozen soon since kdump_in_progress will let
+	 *   them into DIE_INIT_SLAVE_LEAVE.
+	 * - One might be a monarch, but INIT rendezvous will fail since
+	 *   at least this cpu already have INIT masked so it never join
+	 *   to the rendezvous.  In this case, all slaves and monarch will
+	 *   be frozen soon with no wait since the INIT rendezvous is skipped
+	 *   by kdump_in_progress.
+	 */
+	kdump_smp_send_stop();
+	/* not all cpu response to IPI, send INIT to freeze them */
+	if (kdump_wait_cpu_freeze()) {
+		kdump_smp_send_init();
+		/* wait again, don't go ahead if possible */
+		kdump_wait_cpu_freeze();
+	}
+#endif
+}
+
+static void
+machine_kdump_on_init(void)
+{
+	crash_save_vmcoreinfo();
+	local_irq_disable();
+	kexec_disable_iosapic();
+	machine_kexec(ia64_kimage);
+}
+
+void
+kdump_cpu_freeze(struct unw_frame_info *info, void *arg)
+{
+	int cpuid;
+
+	local_irq_disable();
+	cpuid = smp_processor_id();
+	crash_save_this_cpu();
+	current->thread.ksp = (__u64)info->sw - 16;
+
+	ia64_set_psr_mc();	/* mask MCA/INIT and stop reentrance */
+
+	atomic_inc(&kdump_cpu_frozen);
+	kdump_status[cpuid] = 1;
+	mb();
+	for (;;)
+		cpu_relax();
+}
+
+static int
+kdump_init_notifier(struct notifier_block *self, unsigned long val, void *data)
+{
+	struct ia64_mca_notify_die *nd;
+	struct die_args *args = data;
+
+	if (atomic_read(&kdump_in_progress)) {
+		switch (val) {
+		case DIE_INIT_MONARCH_LEAVE:
+			if (!kdump_freeze_monarch)
+				break;
+			/* fall through */
+		case DIE_INIT_SLAVE_LEAVE:
+		case DIE_INIT_MONARCH_ENTER:
+		case DIE_MCA_RENDZVOUS_LEAVE:
+			unw_init_running(kdump_cpu_freeze, NULL);
+			break;
+		}
+	}
+
+	if (!kdump_on_init && !kdump_on_fatal_mca)
+		return NOTIFY_DONE;
+
+	if (!ia64_kimage) {
+		if (val == DIE_INIT_MONARCH_LEAVE)
+			ia64_mca_printk(KERN_NOTICE
+					"%s: kdump not configured\n",
+					__func__);
+		return NOTIFY_DONE;
+	}
+
+	if (val != DIE_INIT_MONARCH_LEAVE &&
+	    val != DIE_INIT_MONARCH_PROCESS &&
+	    val != DIE_MCA_MONARCH_LEAVE)
+		return NOTIFY_DONE;
+
+	nd = (struct ia64_mca_notify_die *)args->err;
+
+	switch (val) {
+	case DIE_INIT_MONARCH_PROCESS:
+		/* Reason code 1 means machine check rendezvous*/
+		if (kdump_on_init && (nd->sos->rv_rc != 1)) {
+			if (atomic_inc_return(&kdump_in_progress) != 1)
+				kdump_freeze_monarch = 1;
+		}
+		break;
+	case DIE_INIT_MONARCH_LEAVE:
+		/* Reason code 1 means machine check rendezvous*/
+		if (kdump_on_init && (nd->sos->rv_rc != 1))
+			machine_kdump_on_init();
+		break;
+	case DIE_MCA_MONARCH_LEAVE:
+		/* *(nd->data) indicate if MCA is recoverable */
+		if (kdump_on_fatal_mca && !(*(nd->data))) {
+			if (atomic_inc_return(&kdump_in_progress) == 1)
+				machine_kdump_on_init();
+			/* We got fatal MCA while kdump!? No way!! */
+		}
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kdump_ctl_table[] = {
+	{
+		.procname = "kdump_on_init",
+		.data = &kdump_on_init,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec,
+	},
+	{
+		.procname = "kdump_on_fatal_mca",
+		.data = &kdump_on_fatal_mca,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec,
+	},
+	{ }
+};
+
+static struct ctl_table sys_table[] = {
+	{
+	  .procname = "kernel",
+	  .mode = 0555,
+	  .child = kdump_ctl_table,
+	},
+	{ }
+};
+#endif
+
+static int
+machine_crash_setup(void)
+{
+	/* be notified before default_monarch_init_process */
+	static struct notifier_block kdump_init_notifier_nb = {
+		.notifier_call = kdump_init_notifier,
+		.priority = 1,
+	};
+	int ret;
+	if((ret = register_die_notifier(&kdump_init_notifier_nb)) != 0)
+		return ret;
+#ifdef CONFIG_SYSCTL
+	register_sysctl_table(sys_table);
+#endif
+	return 0;
+}
+
+__initcall(machine_crash_setup);
+
diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c
new file mode 100644
index 00000000000..c8c9298666f
--- /dev/null
+++ b/arch/ia64/kernel/crash_dump.c
@@ -0,0 +1,50 @@
+/*
+ *	kernel/crash_dump.c - Memory preserving reboot related code.
+ *
+ *	Created by: Simon Horman <horms@verge.net.au>
+ *	Original code moved from kernel/crash.c
+ *	Original code comment copied from the i386 version of this file
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/crash_dump.h>
+
+#include <asm/page.h>
+#include <asm/uaccess.h>
+
+/**
+ * copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ *	space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ *	otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ *
+ * Calling copy_to_user() in atomic context is not desirable. Hence first
+ * copying the data to a pre-allocated kernel page and then copying to user
+ * space in non-atomic context.
+ */
+ssize_t
+copy_oldmem_page(unsigned long pfn, char *buf,
+		size_t csize, unsigned long offset, int userbuf)
+{
+	void  *vaddr;
+
+	if (!csize)
+		return 0;
+	vaddr = __va(pfn<<PAGE_SHIFT);
+	if (userbuf) {
+		if (copy_to_user(buf, (vaddr + offset), csize)) {
+			return -EFAULT;
+		}
+	} else
+		memcpy(buf, (vaddr + offset), csize);
+	return csize;
+}
+
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
index 6ade3790ce0..4826ff957a3 100644
--- a/arch/ia64/kernel/cyclone.c
+++ b/arch/ia64/kernel/cyclone.c
@@ -3,6 +3,7 @@
 #include <linux/time.h>
 #include <linux/errno.h>
 #include <linux/timex.h>
+#include <linux/clocksource.h>
 #include <asm/io.h>
 
 /* IBM Summit (EXA) Cyclone counter code*/
@@ -18,49 +19,58 @@ void __init cyclone_setup(void)
 	use_cyclone = 1;
 }
 
+static void __iomem *cyclone_mc;
 
-struct time_interpolator cyclone_interpolator = {
-	.source =	TIME_SOURCE_MMIO64,
-	.shift =	16,
-	.frequency =	CYCLONE_TIMER_FREQ,
-	.drift =	-100,
-	.mask =		(1LL << 40) - 1
+static cycle_t read_cyclone(struct clocksource *cs)
+{
+	return (cycle_t)readq((void __iomem *)cyclone_mc);
+}
+
+static struct clocksource clocksource_cyclone = {
+        .name           = "cyclone",
+        .rating         = 300,
+        .read           = read_cyclone,
+        .mask           = (1LL << 40) - 1,
+        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
 int __init init_cyclone_clock(void)
 {
-	u64* reg;
+	u64 __iomem *reg;
 	u64 base;	/* saved cyclone base address */
 	u64 offset;	/* offset from pageaddr to cyclone_timer register */
 	int i;
-	u32* volatile cyclone_timer;	/* Cyclone MPMC0 register */
+	u32 __iomem *cyclone_timer;	/* Cyclone MPMC0 register */
 
 	if (!use_cyclone)
-		return -ENODEV;
+		return 0;
 
 	printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n");
 
 	/* find base address */
 	offset = (CYCLONE_CBAR_ADDR);
-	reg = (u64*)ioremap_nocache(offset, sizeof(u64));
+	reg = ioremap_nocache(offset, sizeof(u64));
 	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n");
+		printk(KERN_ERR "Summit chipset: Could not find valid CBAR"
+				" register.\n");
 		use_cyclone = 0;
 		return -ENODEV;
 	}
 	base = readq(reg);
+	iounmap(reg);
 	if(!base){
-		printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n");
+		printk(KERN_ERR "Summit chipset: Could not find valid CBAR"
+				" value.\n");
 		use_cyclone = 0;
 		return -ENODEV;
 	}
-	iounmap(reg);
 
 	/* setup PMCC */
 	offset = (base + CYCLONE_PMCC_OFFSET);
-	reg = (u64*)ioremap_nocache(offset, sizeof(u64));
+	reg = ioremap_nocache(offset, sizeof(u64));
 	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n");
+		printk(KERN_ERR "Summit chipset: Could not find valid PMCC"
+				" register.\n");
 		use_cyclone = 0;
 		return -ENODEV;
 	}
@@ -69,9 +79,10 @@ int __init init_cyclone_clock(void)
 
 	/* setup MPCS */
 	offset = (base + CYCLONE_MPCS_OFFSET);
-	reg = (u64*)ioremap_nocache(offset, sizeof(u64));
+	reg = ioremap_nocache(offset, sizeof(u64));
 	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n");
+		printk(KERN_ERR "Summit chipset: Could not find valid MPCS"
+				" register.\n");
 		use_cyclone = 0;
 		return -ENODEV;
 	}
@@ -80,9 +91,10 @@ int __init init_cyclone_clock(void)
 
 	/* map in cyclone_timer */
 	offset = (base + CYCLONE_MPMC_OFFSET);
-	cyclone_timer = (u32*)ioremap_nocache(offset, sizeof(u32));
+	cyclone_timer = ioremap_nocache(offset, sizeof(u32));
 	if(!cyclone_timer){
-		printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n");
+		printk(KERN_ERR "Summit chipset: Could not find valid MPMC"
+				" register.\n");
 		use_cyclone = 0;
 		return -ENODEV;
 	}
@@ -93,16 +105,18 @@ int __init init_cyclone_clock(void)
 		int stall = 100;
 		while(stall--) barrier();
 		if(readl(cyclone_timer) == old){
-			printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n");
+			printk(KERN_ERR "Summit chipset: Counter not counting!"
+					" DISABLED\n");
 			iounmap(cyclone_timer);
-			cyclone_timer = 0;
+			cyclone_timer = NULL;
 			use_cyclone = 0;
 			return -ENODEV;
 		}
 	}
 	/* initialize last tick */
-	cyclone_interpolator.addr = cyclone_timer;
-	register_time_interpolator(&cyclone_interpolator);
+	cyclone_mc = cyclone_timer;
+	clocksource_cyclone.archdata.fsys_mmio = cyclone_timer;
+	clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ);
 
 	return 0;
 }
diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c
new file mode 100644
index 00000000000..7f791623820
--- /dev/null
+++ b/arch/ia64/kernel/dma-mapping.c
@@ -0,0 +1,24 @@
+#include <linux/dma-mapping.h>
+#include <linux/export.h>
+
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly;
+
+struct dma_map_ops *dma_ops;
+EXPORT_SYMBOL(dma_ops);
+
+#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
+
+static int __init dma_init(void)
+{
+	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
+
+	return 0;
+}
+fs_initcall(dma_init);
+
+struct dma_map_ops *dma_get_ops(struct device *dev)
+{
+	return dma_ops;
+}
+EXPORT_SYMBOL(dma_get_ops);
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index f72ea6aebcb..741b99c1a0b 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -1,13 +1,16 @@
 /*
  * Extensible Firmware Interface
  *
- * Based on Extensible Firmware Interface Specification version 0.9 April 30, 1999
+ * Based on Extensible Firmware Interface Specification version 0.9
+ * April 30, 1999
  *
  * Copyright (C) 1999 VA Linux Systems
  * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
  * Copyright (C) 1999-2003 Hewlett-Packard Co.
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  *	Stephane Eranian <eranian@hpl.hp.com>
+ * (c) Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *	Bjorn Helgaas <bjorn.helgaas@hp.com>
  *
  * All EFI Runtime Services are not implemented yet as EFI only
  * supports physical mode addressing on SoftSDV. This is to be fixed
@@ -18,13 +21,17 @@
  * Goutham Rao: <goutham.rao@intel.com>
  *	Skip non-WB memory and ignore empty memory ranges.
  */
-#include <linux/config.h>
 #include <linux/module.h>
+#include <linux/bootmem.h>
+#include <linux/crash_dump.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/efi.h>
+#include <linux/kexec.h>
+#include <linux/mm.h>
 
 #include <asm/io.h>
 #include <asm/kregs.h>
@@ -32,157 +39,176 @@
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/mca.h>
+#include <asm/setup.h>
+#include <asm/tlbflush.h>
 
 #define EFI_DEBUG	0
 
+static __initdata unsigned long palo_phys;
+
+static __initdata efi_config_table_type_t arch_tables[] = {
+	{PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID, "PALO", &palo_phys},
+	{NULL_GUID, NULL, 0},
+};
+
 extern efi_status_t efi_call_phys (void *, ...);
 
-struct efi efi;
-EXPORT_SYMBOL(efi);
 static efi_runtime_services_t *runtime;
-static unsigned long mem_limit = ~0UL, max_addr = ~0UL;
+static u64 mem_limit = ~0UL, max_addr = ~0UL, min_addr = 0UL;
 
 #define efi_call_virt(f, args...)	(*(f))(args)
 
-#define STUB_GET_TIME(prefix, adjust_arg)							  \
-static efi_status_t										  \
-prefix##_get_time (efi_time_t *tm, efi_time_cap_t *tc)						  \
-{												  \
-	struct ia64_fpreg fr[6];								  \
-	efi_time_cap_t *atc = NULL;								  \
-	efi_status_t ret;									  \
-												  \
-	if (tc)											  \
-		atc = adjust_arg(tc);								  \
-	ia64_save_scratch_fpregs(fr);								  \
-	ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time), adjust_arg(tm), atc); \
-	ia64_load_scratch_fpregs(fr);								  \
-	return ret;										  \
-}
-
-#define STUB_SET_TIME(prefix, adjust_arg)							\
-static efi_status_t										\
-prefix##_set_time (efi_time_t *tm)								\
-{												\
-	struct ia64_fpreg fr[6];								\
-	efi_status_t ret;									\
-												\
-	ia64_save_scratch_fpregs(fr);								\
-	ret = efi_call_##prefix((efi_set_time_t *) __va(runtime->set_time), adjust_arg(tm));	\
-	ia64_load_scratch_fpregs(fr);								\
-	return ret;										\
-}
-
-#define STUB_GET_WAKEUP_TIME(prefix, adjust_arg)						\
-static efi_status_t										\
-prefix##_get_wakeup_time (efi_bool_t *enabled, efi_bool_t *pending, efi_time_t *tm)		\
-{												\
-	struct ia64_fpreg fr[6];								\
-	efi_status_t ret;									\
-												\
-	ia64_save_scratch_fpregs(fr);								\
-	ret = efi_call_##prefix((efi_get_wakeup_time_t *) __va(runtime->get_wakeup_time),	\
-				adjust_arg(enabled), adjust_arg(pending), adjust_arg(tm));	\
-	ia64_load_scratch_fpregs(fr);								\
-	return ret;										\
-}
-
-#define STUB_SET_WAKEUP_TIME(prefix, adjust_arg)						\
-static efi_status_t										\
-prefix##_set_wakeup_time (efi_bool_t enabled, efi_time_t *tm)					\
-{												\
-	struct ia64_fpreg fr[6];								\
-	efi_time_t *atm = NULL;									\
-	efi_status_t ret;									\
-												\
-	if (tm)											\
-		atm = adjust_arg(tm);								\
-	ia64_save_scratch_fpregs(fr);								\
-	ret = efi_call_##prefix((efi_set_wakeup_time_t *) __va(runtime->set_wakeup_time),	\
-				enabled, atm);							\
-	ia64_load_scratch_fpregs(fr);								\
-	return ret;										\
-}
-
-#define STUB_GET_VARIABLE(prefix, adjust_arg)						\
-static efi_status_t									\
-prefix##_get_variable (efi_char16_t *name, efi_guid_t *vendor, u32 *attr,		\
-		       unsigned long *data_size, void *data)				\
-{											\
-	struct ia64_fpreg fr[6];							\
-	u32 *aattr = NULL;									\
-	efi_status_t ret;								\
-											\
-	if (attr)									\
-		aattr = adjust_arg(attr);						\
-	ia64_save_scratch_fpregs(fr);							\
-	ret = efi_call_##prefix((efi_get_variable_t *) __va(runtime->get_variable),	\
-				adjust_arg(name), adjust_arg(vendor), aattr,		\
-				adjust_arg(data_size), adjust_arg(data));		\
-	ia64_load_scratch_fpregs(fr);							\
-	return ret;									\
-}
-
-#define STUB_GET_NEXT_VARIABLE(prefix, adjust_arg)						\
-static efi_status_t										\
-prefix##_get_next_variable (unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor)	\
-{												\
-	struct ia64_fpreg fr[6];								\
-	efi_status_t ret;									\
-												\
-	ia64_save_scratch_fpregs(fr);								\
-	ret = efi_call_##prefix((efi_get_next_variable_t *) __va(runtime->get_next_variable),	\
-				adjust_arg(name_size), adjust_arg(name), adjust_arg(vendor));	\
-	ia64_load_scratch_fpregs(fr);								\
-	return ret;										\
-}
-
-#define STUB_SET_VARIABLE(prefix, adjust_arg)						\
-static efi_status_t									\
-prefix##_set_variable (efi_char16_t *name, efi_guid_t *vendor, unsigned long attr,	\
-		       unsigned long data_size, void *data)				\
-{											\
-	struct ia64_fpreg fr[6];							\
-	efi_status_t ret;								\
-											\
-	ia64_save_scratch_fpregs(fr);							\
-	ret = efi_call_##prefix((efi_set_variable_t *) __va(runtime->set_variable),	\
-				adjust_arg(name), adjust_arg(vendor), attr, data_size,	\
-				adjust_arg(data));					\
-	ia64_load_scratch_fpregs(fr);							\
-	return ret;									\
-}
-
-#define STUB_GET_NEXT_HIGH_MONO_COUNT(prefix, adjust_arg)					\
-static efi_status_t										\
-prefix##_get_next_high_mono_count (u32 *count)							\
-{												\
-	struct ia64_fpreg fr[6];								\
-	efi_status_t ret;									\
-												\
-	ia64_save_scratch_fpregs(fr);								\
-	ret = efi_call_##prefix((efi_get_next_high_mono_count_t *)				\
-				__va(runtime->get_next_high_mono_count), adjust_arg(count));	\
-	ia64_load_scratch_fpregs(fr);								\
-	return ret;										\
-}
-
-#define STUB_RESET_SYSTEM(prefix, adjust_arg)					\
-static void									\
-prefix##_reset_system (int reset_type, efi_status_t status,			\
-		       unsigned long data_size, efi_char16_t *data)		\
-{										\
-	struct ia64_fpreg fr[6];						\
-	efi_char16_t *adata = NULL;						\
-										\
-	if (data)								\
-		adata = adjust_arg(data);					\
-										\
-	ia64_save_scratch_fpregs(fr);						\
-	efi_call_##prefix((efi_reset_system_t *) __va(runtime->reset_system),	\
-			  reset_type, status, data_size, adata);		\
-	/* should not return, but just in case... */				\
-	ia64_load_scratch_fpregs(fr);						\
+#define STUB_GET_TIME(prefix, adjust_arg)				       \
+static efi_status_t							       \
+prefix##_get_time (efi_time_t *tm, efi_time_cap_t *tc)			       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_time_cap_t *atc = NULL;					       \
+	efi_status_t ret;						       \
+									       \
+	if (tc)								       \
+		atc = adjust_arg(tc);					       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time),    \
+				adjust_arg(tm), atc);			       \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
+}
+
+#define STUB_SET_TIME(prefix, adjust_arg)				       \
+static efi_status_t							       \
+prefix##_set_time (efi_time_t *tm)					       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_status_t ret;						       \
+									       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix((efi_set_time_t *) __va(runtime->set_time),    \
+				adjust_arg(tm));			       \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
+}
+
+#define STUB_GET_WAKEUP_TIME(prefix, adjust_arg)			       \
+static efi_status_t							       \
+prefix##_get_wakeup_time (efi_bool_t *enabled, efi_bool_t *pending,	       \
+			  efi_time_t *tm)				       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_status_t ret;						       \
+									       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix(					       \
+		(efi_get_wakeup_time_t *) __va(runtime->get_wakeup_time),      \
+		adjust_arg(enabled), adjust_arg(pending), adjust_arg(tm));     \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
+}
+
+#define STUB_SET_WAKEUP_TIME(prefix, adjust_arg)			       \
+static efi_status_t							       \
+prefix##_set_wakeup_time (efi_bool_t enabled, efi_time_t *tm)		       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_time_t *atm = NULL;						       \
+	efi_status_t ret;						       \
+									       \
+	if (tm)								       \
+		atm = adjust_arg(tm);					       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix(					       \
+		(efi_set_wakeup_time_t *) __va(runtime->set_wakeup_time),      \
+		enabled, atm);						       \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
+}
+
+#define STUB_GET_VARIABLE(prefix, adjust_arg)				       \
+static efi_status_t							       \
+prefix##_get_variable (efi_char16_t *name, efi_guid_t *vendor, u32 *attr,      \
+		       unsigned long *data_size, void *data)		       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	u32 *aattr = NULL;						       \
+	efi_status_t ret;						       \
+									       \
+	if (attr)							       \
+		aattr = adjust_arg(attr);				       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix(					       \
+		(efi_get_variable_t *) __va(runtime->get_variable),	       \
+		adjust_arg(name), adjust_arg(vendor), aattr,		       \
+		adjust_arg(data_size), adjust_arg(data));		       \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
+}
+
+#define STUB_GET_NEXT_VARIABLE(prefix, adjust_arg)			       \
+static efi_status_t							       \
+prefix##_get_next_variable (unsigned long *name_size, efi_char16_t *name,      \
+			    efi_guid_t *vendor)				       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_status_t ret;						       \
+									       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix(					       \
+		(efi_get_next_variable_t *) __va(runtime->get_next_variable),  \
+		adjust_arg(name_size), adjust_arg(name), adjust_arg(vendor));  \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
+}
+
+#define STUB_SET_VARIABLE(prefix, adjust_arg)				       \
+static efi_status_t							       \
+prefix##_set_variable (efi_char16_t *name, efi_guid_t *vendor,		       \
+		       u32 attr, unsigned long data_size,		       \
+		       void *data)					       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_status_t ret;						       \
+									       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix(					       \
+		(efi_set_variable_t *) __va(runtime->set_variable),	       \
+		adjust_arg(name), adjust_arg(vendor), attr, data_size,	       \
+		adjust_arg(data));					       \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
+}
+
+#define STUB_GET_NEXT_HIGH_MONO_COUNT(prefix, adjust_arg)		       \
+static efi_status_t							       \
+prefix##_get_next_high_mono_count (u32 *count)				       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_status_t ret;						       \
+									       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix((efi_get_next_high_mono_count_t *)	       \
+				__va(runtime->get_next_high_mono_count),       \
+				adjust_arg(count));			       \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
+}
+
+#define STUB_RESET_SYSTEM(prefix, adjust_arg)				       \
+static void								       \
+prefix##_reset_system (int reset_type, efi_status_t status,		       \
+		       unsigned long data_size, efi_char16_t *data)	       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_char16_t *adata = NULL;					       \
+									       \
+	if (data)							       \
+		adata = adjust_arg(data);				       \
+									       \
+	ia64_save_scratch_fpregs(fr);					       \
+	efi_call_##prefix(						       \
+		(efi_reset_system_t *) __va(runtime->reset_system),	       \
+		reset_type, status, data_size, adata);			       \
+	/* should not return, but just in case... */			       \
+	ia64_load_scratch_fpregs(fr);					       \
 }
 
 #define phys_ptr(arg)	((__typeof__(arg)) ia64_tpa(arg))
@@ -214,16 +240,18 @@ efi_gettimeofday (struct timespec *ts)
 {
 	efi_time_t tm;
 
-	memset(ts, 0, sizeof(ts));
-	if ((*efi.get_time)(&tm, NULL) != EFI_SUCCESS)
+	if ((*efi.get_time)(&tm, NULL) != EFI_SUCCESS) {
+		memset(ts, 0, sizeof(*ts));
 		return;
+	}
 
-	ts->tv_sec = mktime(tm.year, tm.month, tm.day, tm.hour, tm.minute, tm.second);
+	ts->tv_sec = mktime(tm.year, tm.month, tm.day,
+			    tm.hour, tm.minute, tm.second);
 	ts->tv_nsec = tm.nanosecond;
 }
 
 static int
-is_available_memory (efi_memory_desc_t *md)
+is_memory_available (efi_memory_desc_t *md)
 {
 	if (!(md->attribute & EFI_MEMORY_WB))
 		return 0;
@@ -247,6 +275,32 @@ typedef struct kern_memdesc {
 
 static kern_memdesc_t *kern_memmap;
 
+#define efi_md_size(md)	(md->num_pages << EFI_PAGE_SHIFT)
+
+static inline u64
+kmd_end(kern_memdesc_t *kmd)
+{
+	return (kmd->start + (kmd->num_pages << EFI_PAGE_SHIFT));
+}
+
+static inline u64
+efi_md_end(efi_memory_desc_t *md)
+{
+	return (md->phys_addr + efi_md_size(md));
+}
+
+static inline int
+efi_wb(efi_memory_desc_t *md)
+{
+	return (md->attribute & EFI_MEMORY_WB);
+}
+
+static inline int
+efi_uc(efi_memory_desc_t *md)
+{
+	return (md->attribute & EFI_MEMORY_UC);
+}
+
 static void
 walk (efi_freemem_callback_t callback, void *arg, u64 attr)
 {
@@ -266,8 +320,8 @@ walk (efi_freemem_callback_t callback, void *arg, u64 attr)
 }
 
 /*
- * Walks the EFI memory map and calls CALLBACK once for each EFI memory descriptor that
- * has memory that is available for OS use.
+ * Walk the EFI memory map and call CALLBACK once for each EFI memory
+ * descriptor that has memory that is available for OS use.
  */
 void
 efi_memmap_walk (efi_freemem_callback_t callback, void *arg)
@@ -276,8 +330,8 @@ efi_memmap_walk (efi_freemem_callback_t callback, void *arg)
 }
 
 /*
- * Walks the EFI memory map and calls CALLBACK once for each EFI memory descriptor that
- * has memory that is available for uncached allocator.
+ * Walk the EFI memory map and call CALLBACK once for each EFI memory
+ * descriptor that has memory that is available for uncached allocator.
  */
 void
 efi_memmap_walk_uc (efi_freemem_callback_t callback, void *arg)
@@ -286,11 +340,10 @@ efi_memmap_walk_uc (efi_freemem_callback_t callback, void *arg)
 }
 
 /*
- * Look for the PAL_CODE region reported by EFI and maps it using an
+ * Look for the PAL_CODE region reported by EFI and map it using an
  * ITR to enable safe PAL calls in virtual mode.  See IA-64 Processor
  * Abstraction Layer chapter 11 in ADAG
  */
-
 void *
 efi_get_pal_addr (void)
 {
@@ -310,53 +363,90 @@ efi_get_pal_addr (void)
 			continue;
 
 		if (++pal_code_count > 1) {
-			printk(KERN_ERR "Too many EFI Pal Code memory ranges, dropped @ %lx\n",
-			       md->phys_addr);
+			printk(KERN_ERR "Too many EFI Pal Code memory ranges, "
+			       "dropped @ %llx\n", md->phys_addr);
 			continue;
 		}
 		/*
-		 * The only ITLB entry in region 7 that is used is the one installed by
-		 * __start().  That entry covers a 64MB range.
+		 * The only ITLB entry in region 7 that is used is the one
+		 * installed by __start().  That entry covers a 64MB range.
 		 */
 		mask  = ~((1 << KERNEL_TR_PAGE_SHIFT) - 1);
 		vaddr = PAGE_OFFSET + md->phys_addr;
 
 		/*
-		 * We must check that the PAL mapping won't overlap with the kernel
-		 * mapping.
+		 * We must check that the PAL mapping won't overlap with the
+		 * kernel mapping.
 		 *
-		 * PAL code is guaranteed to be aligned on a power of 2 between 4k and
-		 * 256KB and that only one ITR is needed to map it. This implies that the
-		 * PAL code is always aligned on its size, i.e., the closest matching page
-		 * size supported by the TLB. Therefore PAL code is guaranteed never to
-		 * cross a 64MB unless it is bigger than 64MB (very unlikely!).  So for
-		 * now the following test is enough to determine whether or not we need a
-		 * dedicated ITR for the PAL code.
+		 * PAL code is guaranteed to be aligned on a power of 2 between
+		 * 4k and 256KB and that only one ITR is needed to map it. This
+		 * implies that the PAL code is always aligned on its size,
+		 * i.e., the closest matching page size supported by the TLB.
+		 * Therefore PAL code is guaranteed never to cross a 64MB unless
+		 * it is bigger than 64MB (very unlikely!).  So for now the
+		 * following test is enough to determine whether or not we need
+		 * a dedicated ITR for the PAL code.
 		 */
 		if ((vaddr & mask) == (KERNEL_START & mask)) {
 			printk(KERN_INFO "%s: no need to install ITR for PAL code\n",
-			       __FUNCTION__);
+			       __func__);
 			continue;
 		}
 
-		if (md->num_pages << EFI_PAGE_SHIFT > IA64_GRANULE_SIZE)
-			panic("Woah!  PAL code size bigger than a granule!");
+		if (efi_md_size(md) > IA64_GRANULE_SIZE)
+			panic("Whoa!  PAL code size bigger than a granule!");
 
 #if EFI_DEBUG
 		mask  = ~((1 << IA64_GRANULE_SHIFT) - 1);
 
-		printk(KERN_INFO "CPU %d: mapping PAL code [0x%lx-0x%lx) into [0x%lx-0x%lx)\n",
-			smp_processor_id(), md->phys_addr,
-			md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
-			vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE);
+		printk(KERN_INFO "CPU %d: mapping PAL code "
+                       "[0x%lx-0x%lx) into [0x%lx-0x%lx)\n",
+                       smp_processor_id(), md->phys_addr,
+                       md->phys_addr + efi_md_size(md),
+                       vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE);
 #endif
 		return __va(md->phys_addr);
 	}
-	printk(KERN_WARNING "%s: no PAL-code memory-descriptor found",
-	       __FUNCTION__);
+	printk(KERN_WARNING "%s: no PAL-code memory-descriptor found\n",
+	       __func__);
 	return NULL;
 }
 
+
+static u8 __init palo_checksum(u8 *buffer, u32 length)
+{
+	u8 sum = 0;
+	u8 *end = buffer + length;
+
+	while (buffer < end)
+		sum = (u8) (sum + *(buffer++));
+
+	return sum;
+}
+
+/*
+ * Parse and handle PALO table which is published at:
+ * http://www.dig64.org/home/DIG64_PALO_R1_0.pdf
+ */
+static void __init handle_palo(unsigned long phys_addr)
+{
+	struct palo_table *palo = __va(phys_addr);
+	u8  checksum;
+
+	if (strncmp(palo->signature, PALO_SIG, sizeof(PALO_SIG) - 1)) {
+		printk(KERN_INFO "PALO signature incorrect.\n");
+		return;
+	}
+
+	checksum = palo_checksum((u8 *)palo, palo->length);
+	if (checksum) {
+		printk(KERN_INFO "PALO checksum incorrect.\n");
+		return;
+	}
+
+	setup_ptcg_sem(palo->max_tlb_purges, NPTCG_FROM_PALO);
+}
+
 void
 efi_map_pal_code (void)
 {
@@ -370,38 +460,37 @@ efi_map_pal_code (void)
 	 * Cannot write to CRx with PSR.ic=1
 	 */
 	psr = ia64_clear_ic();
-	ia64_itr(0x1, IA64_TR_PALCODE, GRANULEROUNDDOWN((unsigned long) pal_vaddr),
+	ia64_itr(0x1, IA64_TR_PALCODE,
+		 GRANULEROUNDDOWN((unsigned long) pal_vaddr),
 		 pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)),
 		 IA64_GRANULE_SHIFT);
+	paravirt_dv_serialize_data();
 	ia64_set_psr(psr);		/* restore psr */
-	ia64_srlz_i();
 }
 
 void __init
 efi_init (void)
 {
 	void *efi_map_start, *efi_map_end;
-	efi_config_table_t *config_tables;
 	efi_char16_t *c16;
 	u64 efi_desc_size;
-	char *cp, *end, vendor[100] = "unknown";
-	extern char saved_command_line[];
+	char *cp, vendor[100] = "unknown";
 	int i;
 
-	/* it's too early to be able to use the standard kernel command line support... */
-	for (cp = saved_command_line; *cp; ) {
+	set_bit(EFI_BOOT, &efi.flags);
+	set_bit(EFI_64BIT, &efi.flags);
+
+	/*
+	 * It's too early to be able to use the standard kernel command line
+	 * support...
+	 */
+	for (cp = boot_command_line; *cp; ) {
 		if (memcmp(cp, "mem=", 4) == 0) {
-			cp += 4;
-			mem_limit = memparse(cp, &end);
-			if (end != cp)
-				break;
-			cp = end;
+			mem_limit = memparse(cp + 4, &cp);
 		} else if (memcmp(cp, "max_addr=", 9) == 0) {
-			cp += 9;
-			max_addr = GRANULEROUNDDOWN(memparse(cp, &end));
-			if (end != cp)
-				break;
-			cp = end;
+			max_addr = GRANULEROUNDDOWN(memparse(cp + 9, &cp));
+		} else if (memcmp(cp, "min_addr=", 9) == 0) {
+			min_addr = GRANULEROUNDDOWN(memparse(cp + 9, &cp));
 		} else {
 			while (*cp != ' ' && *cp)
 				++cp;
@@ -409,8 +498,12 @@ efi_init (void)
 				++cp;
 		}
 	}
+	if (min_addr != 0UL)
+		printk(KERN_INFO "Ignoring memory below %lluMB\n",
+		       min_addr >> 20);
 	if (max_addr != ~0UL)
-		printk(KERN_INFO "Ignoring memory above %luMB\n", max_addr >> 20);
+		printk(KERN_INFO "Ignoring memory above %lluMB\n",
+		       max_addr >> 20);
 
 	efi.systab = __va(ia64_boot_param->efi_systab);
 
@@ -418,50 +511,36 @@ efi_init (void)
 	 * Verify the EFI Table
 	 */
 	if (efi.systab == NULL)
-		panic("Woah! Can't find EFI system table.\n");
+		panic("Whoa! Can't find EFI system table.\n");
 	if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
-		panic("Woah! EFI system table signature incorrect\n");
-	if ((efi.systab->hdr.revision ^ EFI_SYSTEM_TABLE_REVISION) >> 16 != 0)
-		printk(KERN_WARNING "Warning: EFI system table major version mismatch: "
-		       "got %d.%02d, expected %d.%02d\n",
-		       efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff,
-		       EFI_SYSTEM_TABLE_REVISION >> 16, EFI_SYSTEM_TABLE_REVISION & 0xffff);
-
-	config_tables = __va(efi.systab->tables);
+		panic("Whoa! EFI system table signature incorrect\n");
+	if ((efi.systab->hdr.revision >> 16) == 0)
+		printk(KERN_WARNING "Warning: EFI system table version "
+		       "%d.%02d, expected 1.00 or greater\n",
+		       efi.systab->hdr.revision >> 16,
+		       efi.systab->hdr.revision & 0xffff);
 
 	/* Show what we know for posterity */
 	c16 = __va(efi.systab->fw_vendor);
 	if (c16) {
-		for (i = 0;i < (int) sizeof(vendor) && *c16; ++i)
+		for (i = 0;i < (int) sizeof(vendor) - 1 && *c16; ++i)
 			vendor[i] = *c16++;
 		vendor[i] = '\0';
 	}
 
 	printk(KERN_INFO "EFI v%u.%.02u by %s:",
-	       efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor);
-
-	for (i = 0; i < (int) efi.systab->nr_tables; i++) {
-		if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
-			efi.mps = __va(config_tables[i].table);
-			printk(" MPS=0x%lx", config_tables[i].table);
-		} else if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
-			efi.acpi20 = __va(config_tables[i].table);
-			printk(" ACPI 2.0=0x%lx", config_tables[i].table);
-		} else if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
-			efi.acpi = __va(config_tables[i].table);
-			printk(" ACPI=0x%lx", config_tables[i].table);
-		} else if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
-			efi.smbios = __va(config_tables[i].table);
-			printk(" SMBIOS=0x%lx", config_tables[i].table);
-		} else if (efi_guidcmp(config_tables[i].guid, SAL_SYSTEM_TABLE_GUID) == 0) {
-			efi.sal_systab = __va(config_tables[i].table);
-			printk(" SALsystab=0x%lx", config_tables[i].table);
-		} else if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
-			efi.hcdp = __va(config_tables[i].table);
-			printk(" HCDP=0x%lx", config_tables[i].table);
-		}
-	}
-	printk("\n");
+	       efi.systab->hdr.revision >> 16,
+	       efi.systab->hdr.revision & 0xffff, vendor);
+
+	set_bit(EFI_SYSTEM_TABLES, &efi.flags);
+
+	palo_phys      = EFI_INVALID_TABLE_ADDR;
+
+	if (efi_config_init(arch_tables) != 0)
+		return;
+
+	if (palo_phys != EFI_INVALID_TABLE_ADDR)
+		handle_palo(palo_phys);
 
 	runtime = __va(efi.systab->runtime);
 	efi.get_time = phys_get_time;
@@ -484,12 +563,33 @@ efi_init (void)
 		efi_memory_desc_t *md;
 		void *p;
 
-		for (i = 0, p = efi_map_start; p < efi_map_end; ++i, p += efi_desc_size) {
+		for (i = 0, p = efi_map_start; p < efi_map_end;
+		     ++i, p += efi_desc_size)
+		{
+			const char *unit;
+			unsigned long size;
+
 			md = p;
-			printk("mem%02u: type=%u, attr=0x%lx, range=[0x%016lx-0x%016lx) (%luMB)\n",
+			size = md->num_pages << EFI_PAGE_SHIFT;
+
+			if ((size >> 40) > 0) {
+				size >>= 40;
+				unit = "TB";
+			} else if ((size >> 30) > 0) {
+				size >>= 30;
+				unit = "GB";
+			} else if ((size >> 20) > 0) {
+				size >>= 20;
+				unit = "MB";
+			} else {
+				size >>= 10;
+				unit = "KB";
+			}
+
+			printk("mem%02d: type=%2u, attr=0x%016lx, "
+			       "range=[0x%016lx-0x%016lx) (%4lu%s)\n",
 			       i, md->type, md->attribute, md->phys_addr,
-			       md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
-			       md->num_pages >> (20 - EFI_PAGE_SHIFT));
+			       md->phys_addr + efi_md_size(md), size, unit);
 		}
 	}
 #endif
@@ -514,8 +614,8 @@ efi_enter_virtual_mode (void)
 		md = p;
 		if (md->attribute & EFI_MEMORY_RUNTIME) {
 			/*
-			 * Some descriptors have multiple bits set, so the order of
-			 * the tests is relevant.
+			 * Some descriptors have multiple bits set, so the
+			 * order of the tests is relevant.
 			 */
 			if (md->attribute & EFI_MEMORY_WB) {
 				md->virt_addr = (u64) __va(md->phys_addr);
@@ -523,21 +623,26 @@ efi_enter_virtual_mode (void)
 				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
 			} else if (md->attribute & EFI_MEMORY_WC) {
 #if 0
-				md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P
-									   | _PAGE_D
-									   | _PAGE_MA_WC
-									   | _PAGE_PL_0
-									   | _PAGE_AR_RW));
+				md->virt_addr = ia64_remap(md->phys_addr,
+							   (_PAGE_A |
+							    _PAGE_P |
+							    _PAGE_D |
+							    _PAGE_MA_WC |
+							    _PAGE_PL_0 |
+							    _PAGE_AR_RW));
 #else
 				printk(KERN_INFO "EFI_MEMORY_WC mapping\n");
 				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
 #endif
 			} else if (md->attribute & EFI_MEMORY_WT) {
 #if 0
-				md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P
-									   | _PAGE_D | _PAGE_MA_WT
-									   | _PAGE_PL_0
-									   | _PAGE_AR_RW));
+				md->virt_addr = ia64_remap(md->phys_addr,
+							   (_PAGE_A |
+							    _PAGE_P |
+							    _PAGE_D |
+							    _PAGE_MA_WT |
+							    _PAGE_PL_0 |
+							    _PAGE_AR_RW));
 #else
 				printk(KERN_INFO "EFI_MEMORY_WT mapping\n");
 				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
@@ -548,16 +653,20 @@ efi_enter_virtual_mode (void)
 
 	status = efi_call_phys(__va(runtime->set_virtual_address_map),
 			       ia64_boot_param->efi_memmap_size,
-			       efi_desc_size, ia64_boot_param->efi_memdesc_version,
+			       efi_desc_size,
+			       ia64_boot_param->efi_memdesc_version,
 			       ia64_boot_param->efi_memmap);
 	if (status != EFI_SUCCESS) {
-		printk(KERN_WARNING "warning: unable to switch EFI into virtual mode "
-		       "(status=%lu)\n", status);
+		printk(KERN_WARNING "warning: unable to switch EFI into "
+		       "virtual mode (status=%lu)\n", status);
 		return;
 	}
 
+	set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+
 	/*
-	 * Now that EFI is in virtual mode, we call the EFI functions more efficiently:
+	 * Now that EFI is in virtual mode, we call the EFI functions more
+	 * efficiently:
 	 */
 	efi.get_time = virt_get_time;
 	efi.set_time = virt_set_time;
@@ -571,8 +680,8 @@ efi_enter_virtual_mode (void)
 }
 
 /*
- * Walk the EFI memory map looking for the I/O port range.  There can only be one entry of
- * this type, other I/O port ranges should be described via ACPI.
+ * Walk the EFI memory map looking for the I/O port range.  There can only be
+ * one entry of this type, other I/O port ranges should be described via ACPI.
  */
 u64
 efi_get_iobase (void)
@@ -595,8 +704,20 @@ efi_get_iobase (void)
 	return 0;
 }
 
-u32
-efi_mem_type (unsigned long phys_addr)
+static struct kern_memdesc *
+kern_memory_descriptor (unsigned long phys_addr)
+{
+	struct kern_memdesc *md;
+
+	for (md = kern_memmap; md->start != ~0UL; md++) {
+		if (phys_addr - md->start < (md->num_pages << EFI_PAGE_SHIFT))
+			 return md;
+	}
+	return NULL;
+}
+
+static efi_memory_desc_t *
+efi_memory_descriptor (unsigned long phys_addr)
 {
 	void *efi_map_start, *efi_map_end, *p;
 	efi_memory_desc_t *md;
@@ -609,59 +730,193 @@ efi_mem_type (unsigned long phys_addr)
 	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
 		md = p;
 
-		if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT))
-			 return md->type;
+		if (phys_addr - md->phys_addr < efi_md_size(md))
+			 return md;
 	}
-	return 0;
+	return NULL;
 }
 
-u64
-efi_mem_attributes (unsigned long phys_addr)
+static int
+efi_memmap_intersects (unsigned long phys_addr, unsigned long size)
 {
 	void *efi_map_start, *efi_map_end, *p;
 	efi_memory_desc_t *md;
 	u64 efi_desc_size;
+	unsigned long end;
 
 	efi_map_start = __va(ia64_boot_param->efi_memmap);
 	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
 	efi_desc_size = ia64_boot_param->efi_memdesc_size;
 
+	end = phys_addr + size;
+
 	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
 		md = p;
-
-		if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT))
-			return md->attribute;
+		if (md->phys_addr < end && efi_md_end(md) > phys_addr)
+			return 1;
 	}
 	return 0;
 }
+
+u32
+efi_mem_type (unsigned long phys_addr)
+{
+	efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
+
+	if (md)
+		return md->type;
+	return 0;
+}
+
+u64
+efi_mem_attributes (unsigned long phys_addr)
+{
+	efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
+
+	if (md)
+		return md->attribute;
+	return 0;
+}
 EXPORT_SYMBOL(efi_mem_attributes);
 
-int
-valid_phys_addr_range (unsigned long phys_addr, unsigned long *size)
+u64
+efi_mem_attribute (unsigned long phys_addr, unsigned long size)
 {
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
+	unsigned long end = phys_addr + size;
+	efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
+	u64 attr;
 
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
+	if (!md)
+		return 0;
 
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
+	/*
+	 * EFI_MEMORY_RUNTIME is not a memory attribute; it just tells
+	 * the kernel that firmware needs this region mapped.
+	 */
+	attr = md->attribute & ~EFI_MEMORY_RUNTIME;
+	do {
+		unsigned long md_end = efi_md_end(md);
+
+		if (end <= md_end)
+			return attr;
+
+		md = efi_memory_descriptor(md_end);
+		if (!md || (md->attribute & ~EFI_MEMORY_RUNTIME) != attr)
+			return 0;
+	} while (md);
+	return 0;	/* never reached */
+}
 
-		if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) {
-			if (!(md->attribute & EFI_MEMORY_WB))
-				return 0;
+u64
+kern_mem_attribute (unsigned long phys_addr, unsigned long size)
+{
+	unsigned long end = phys_addr + size;
+	struct kern_memdesc *md;
+	u64 attr;
 
-			if (*size > md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr)
-				*size = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr;
-			return 1;
-		}
+	/*
+	 * This is a hack for ioremap calls before we set up kern_memmap.
+	 * Maybe we should do efi_memmap_init() earlier instead.
+	 */
+	if (!kern_memmap) {
+		attr = efi_mem_attribute(phys_addr, size);
+		if (attr & EFI_MEMORY_WB)
+			return EFI_MEMORY_WB;
+		return 0;
 	}
+
+	md = kern_memory_descriptor(phys_addr);
+	if (!md)
+		return 0;
+
+	attr = md->attribute;
+	do {
+		unsigned long md_end = kmd_end(md);
+
+		if (end <= md_end)
+			return attr;
+
+		md = kern_memory_descriptor(md_end);
+		if (!md || md->attribute != attr)
+			return 0;
+	} while (md);
+	return 0;	/* never reached */
+}
+EXPORT_SYMBOL(kern_mem_attribute);
+
+int
+valid_phys_addr_range (phys_addr_t phys_addr, unsigned long size)
+{
+	u64 attr;
+
+	/*
+	 * /dev/mem reads and writes use copy_to_user(), which implicitly
+	 * uses a granule-sized kernel identity mapping.  It's really
+	 * only safe to do this for regions in kern_memmap.  For more
+	 * details, see Documentation/ia64/aliasing.txt.
+	 */
+	attr = kern_mem_attribute(phys_addr, size);
+	if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC)
+		return 1;
 	return 0;
 }
 
+int
+valid_mmap_phys_addr_range (unsigned long pfn, unsigned long size)
+{
+	unsigned long phys_addr = pfn << PAGE_SHIFT;
+	u64 attr;
+
+	attr = efi_mem_attribute(phys_addr, size);
+
+	/*
+	 * /dev/mem mmap uses normal user pages, so we don't need the entire
+	 * granule, but the entire region we're mapping must support the same
+	 * attribute.
+	 */
+	if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC)
+		return 1;
+
+	/*
+	 * Intel firmware doesn't tell us about all the MMIO regions, so
+	 * in general we have to allow mmap requests.  But if EFI *does*
+	 * tell us about anything inside this region, we should deny it.
+	 * The user can always map a smaller region to avoid the overlap.
+	 */
+	if (efi_memmap_intersects(phys_addr, size))
+		return 0;
+
+	return 1;
+}
+
+pgprot_t
+phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size,
+		     pgprot_t vma_prot)
+{
+	unsigned long phys_addr = pfn << PAGE_SHIFT;
+	u64 attr;
+
+	/*
+	 * For /dev/mem mmap, we use user mappings, but if the region is
+	 * in kern_memmap (and hence may be covered by a kernel mapping),
+	 * we must use the same attribute as the kernel mapping.
+	 */
+	attr = kern_mem_attribute(phys_addr, size);
+	if (attr & EFI_MEMORY_WB)
+		return pgprot_cacheable(vma_prot);
+	else if (attr & EFI_MEMORY_UC)
+		return pgprot_noncached(vma_prot);
+
+	/*
+	 * Some chipsets don't support UC access to memory.  If
+	 * WB is supported, we prefer that.
+	 */
+	if (efi_mem_attribute(phys_addr, size) & EFI_MEMORY_WB)
+		return pgprot_cacheable(vma_prot);
+
+	return pgprot_noncached(vma_prot);
+}
+
 int __init
 efi_uart_console_only(void)
 {
@@ -701,38 +956,12 @@ efi_uart_console_only(void)
 				return 1;
 			uart = 0;
 		}
-		hdr = (struct efi_generic_dev_path *) ((u8 *) hdr + hdr->length);
+		hdr = (struct efi_generic_dev_path *)((u8 *) hdr + hdr->length);
 	}
 	printk(KERN_ERR "Malformed %s value\n", name);
 	return 0;
 }
 
-#define efi_md_size(md)	(md->num_pages << EFI_PAGE_SHIFT)
-
-static inline u64
-kmd_end(kern_memdesc_t *kmd)
-{
-	return (kmd->start + (kmd->num_pages << EFI_PAGE_SHIFT));
-}
-
-static inline u64
-efi_md_end(efi_memory_desc_t *md)
-{
-	return (md->phys_addr + efi_md_size(md));
-}
-
-static inline int
-efi_wb(efi_memory_desc_t *md)
-{
-	return (md->attribute & EFI_MEMORY_WB);
-}
-
-static inline int
-efi_uc(efi_memory_desc_t *md)
-{
-	return (md->attribute & EFI_MEMORY_UC);
-}
-
 /*
  * Look for the first granule aligned memory descriptor memory
  * that is big enough to hold EFI memory map. Make sure this
@@ -765,10 +994,12 @@ find_memmap_space (void)
 		if (!efi_wb(md)) {
 			continue;
 		}
-		if (pmd == NULL || !efi_wb(pmd) || efi_md_end(pmd) != md->phys_addr) {
+		if (pmd == NULL || !efi_wb(pmd) ||
+		    efi_md_end(pmd) != md->phys_addr) {
 			contig_low = GRANULEROUNDUP(md->phys_addr);
 			contig_high = efi_md_end(md);
-			for (q = p + efi_desc_size; q < efi_map_end; q += efi_desc_size) {
+			for (q = p + efi_desc_size; q < efi_map_end;
+			     q += efi_desc_size) {
 				check_md = q;
 				if (!efi_wb(check_md))
 					break;
@@ -778,14 +1009,15 @@ find_memmap_space (void)
 			}
 			contig_high = GRANULEROUNDDOWN(contig_high);
 		}
-		if (!is_available_memory(md) || md->type == EFI_LOADER_DATA)
+		if (!is_memory_available(md) || md->type == EFI_LOADER_DATA)
 			continue;
 
 		/* Round ends inward to granule boundaries */
 		as = max(contig_low, md->phys_addr);
 		ae = min(contig_high, efi_md_end(md));
 
-		/* keep within max_addr= command line arg */
+		/* keep within max_addr= and min_addr= command line arg */
+		as = max(as, min_addr);
 		ae = min(ae, max_addr);
 		if (ae <= as)
 			continue;
@@ -811,10 +1043,10 @@ find_memmap_space (void)
  * to use.  We can allocate partial granules only if the unavailable
  * parts exist, and are WB.
  */
-void
-efi_memmap_init(unsigned long *s, unsigned long *e)
+unsigned long
+efi_memmap_init(u64 *s, u64 *e)
 {
-	struct kern_memdesc *k, *prev = 0;
+	struct kern_memdesc *k, *prev = NULL;
 	u64	contig_low=0, contig_high=0;
 	u64	as, ae, lim;
 	void *efi_map_start, *efi_map_end, *p, *q;
@@ -831,8 +1063,9 @@ efi_memmap_init(unsigned long *s, unsigned long *e)
 	for (p = efi_map_start; p < efi_map_end; pmd = md, p += efi_desc_size) {
 		md = p;
 		if (!efi_wb(md)) {
-			if (efi_uc(md) && (md->type == EFI_CONVENTIONAL_MEMORY ||
-				    	   md->type == EFI_BOOT_SERVICES_DATA)) {
+			if (efi_uc(md) &&
+			    (md->type == EFI_CONVENTIONAL_MEMORY ||
+			     md->type == EFI_BOOT_SERVICES_DATA)) {
 				k->attribute = EFI_MEMORY_UC;
 				k->start = md->phys_addr;
 				k->num_pages = md->num_pages;
@@ -840,10 +1073,12 @@ efi_memmap_init(unsigned long *s, unsigned long *e)
 			}
 			continue;
 		}
-		if (pmd == NULL || !efi_wb(pmd) || efi_md_end(pmd) != md->phys_addr) {
+		if (pmd == NULL || !efi_wb(pmd) ||
+		    efi_md_end(pmd) != md->phys_addr) {
 			contig_low = GRANULEROUNDUP(md->phys_addr);
 			contig_high = efi_md_end(md);
-			for (q = p + efi_desc_size; q < efi_map_end; q += efi_desc_size) {
+			for (q = p + efi_desc_size; q < efi_map_end;
+			     q += efi_desc_size) {
 				check_md = q;
 				if (!efi_wb(check_md))
 					break;
@@ -853,7 +1088,7 @@ efi_memmap_init(unsigned long *s, unsigned long *e)
 			}
 			contig_high = GRANULEROUNDDOWN(contig_high);
 		}
-		if (!is_available_memory(md))
+		if (!is_memory_available(md))
 			continue;
 
 		/*
@@ -863,13 +1098,17 @@ efi_memmap_init(unsigned long *s, unsigned long *e)
 		if (md->phys_addr < contig_low) {
 			lim = min(efi_md_end(md), contig_low);
 			if (efi_uc(md)) {
-				if (k > kern_memmap && (k-1)->attribute == EFI_MEMORY_UC &&
+				if (k > kern_memmap &&
+				    (k-1)->attribute == EFI_MEMORY_UC &&
 				    kmd_end(k-1) == md->phys_addr) {
-					(k-1)->num_pages += (lim - md->phys_addr) >> EFI_PAGE_SHIFT;
+					(k-1)->num_pages +=
+						(lim - md->phys_addr)
+						>> EFI_PAGE_SHIFT;
 				} else {
 					k->attribute = EFI_MEMORY_UC;
 					k->start = md->phys_addr;
-					k->num_pages = (lim - md->phys_addr) >> EFI_PAGE_SHIFT;
+					k->num_pages = (lim - md->phys_addr)
+						>> EFI_PAGE_SHIFT;
 					k++;
 				}
 			}
@@ -887,7 +1126,8 @@ efi_memmap_init(unsigned long *s, unsigned long *e)
 				} else {
 					k->attribute = EFI_MEMORY_UC;
 					k->start = lim;
-					k->num_pages = (efi_md_end(md) - lim) >> EFI_PAGE_SHIFT;
+					k->num_pages = (efi_md_end(md) - lim)
+						>> EFI_PAGE_SHIFT;
 					k++;
 				}
 			}
@@ -895,7 +1135,8 @@ efi_memmap_init(unsigned long *s, unsigned long *e)
 		} else
 			ae = efi_md_end(md);
 
-		/* keep within max_addr= command line arg */
+		/* keep within max_addr= and min_addr= command line arg */
+		as = max(as, min_addr);
 		ae = min(ae, max_addr);
 		if (ae <= as)
 			continue;
@@ -922,11 +1163,14 @@ efi_memmap_init(unsigned long *s, unsigned long *e)
 	/* reserve the memory we are using for kern_memmap */
 	*s = (u64)kern_memmap;
 	*e = (u64)++k;
+
+	return total_mem;
 }
 
 void
 efi_initialize_iomem_resources(struct resource *code_resource,
-			       struct resource *data_resource)
+			       struct resource *data_resource,
+			       struct resource *bss_resource)
 {
 	struct resource *res;
 	void *efi_map_start, *efi_map_end, *p;
@@ -947,7 +1191,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 		if (md->num_pages == 0) /* should not happen */
 			continue;
 
-		flags = IORESOURCE_MEM;
+		flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		switch (md->type) {
 
 			case EFI_MEMORY_MAPPED_IO:
@@ -962,19 +1206,19 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 				if (md->attribute & EFI_MEMORY_WP) {
 					name = "System ROM";
 					flags |= IORESOURCE_READONLY;
-				} else {
+				} else if (md->attribute == EFI_MEMORY_UC)
+					name = "Uncached RAM";
+				else
 					name = "System RAM";
-				}
 				break;
 
 			case EFI_ACPI_MEMORY_NVS:
 				name = "ACPI Non-volatile Storage";
-				flags |= IORESOURCE_BUSY;
 				break;
 
 			case EFI_UNUSABLE_MEMORY:
 				name = "reserved";
-				flags |= IORESOURCE_BUSY | IORESOURCE_DISABLED;
+				flags |= IORESOURCE_DISABLED;
 				break;
 
 			case EFI_RESERVED_TYPE:
@@ -983,18 +1227,19 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 			case EFI_ACPI_RECLAIM_MEMORY:
 			default:
 				name = "reserved";
-				flags |= IORESOURCE_BUSY;
 				break;
 		}
 
-		if ((res = kcalloc(1, sizeof(struct resource), GFP_KERNEL)) == NULL) {
-			printk(KERN_ERR "failed to alocate resource for iomem\n");
+		if ((res = kzalloc(sizeof(struct resource),
+				   GFP_KERNEL)) == NULL) {
+			printk(KERN_ERR
+			       "failed to allocate resource for iomem\n");
 			return;
 		}
 
 		res->name = name;
 		res->start = md->phys_addr;
-		res->end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1;
+		res->end = md->phys_addr + efi_md_size(md) - 1;
 		res->flags = flags;
 
 		if (insert_resource(&iomem_resource, res) < 0)
@@ -1007,6 +1252,89 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 			 */
 			insert_resource(res, code_resource);
 			insert_resource(res, data_resource);
+			insert_resource(res, bss_resource);
+#ifdef CONFIG_KEXEC
+                        insert_resource(res, &efi_memmap_res);
+                        insert_resource(res, &boot_param_res);
+			if (crashk_res.end > crashk_res.start)
+				insert_resource(res, &crashk_res);
+#endif
+		}
+	}
+}
+
+#ifdef CONFIG_KEXEC
+/* find a block of memory aligned to 64M exclude reserved regions
+   rsvd_regions are sorted
+ */
+unsigned long __init
+kdump_find_rsvd_region (unsigned long size, struct rsvd_region *r, int n)
+{
+	int i;
+	u64 start, end;
+	u64 alignment = 1UL << _PAGE_SIZE_64M;
+	void *efi_map_start, *efi_map_end, *p;
+	efi_memory_desc_t *md;
+	u64 efi_desc_size;
+
+	efi_map_start = __va(ia64_boot_param->efi_memmap);
+	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
+	efi_desc_size = ia64_boot_param->efi_memdesc_size;
+
+	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
+		md = p;
+		if (!efi_wb(md))
+			continue;
+		start = ALIGN(md->phys_addr, alignment);
+		end = efi_md_end(md);
+		for (i = 0; i < n; i++) {
+			if (__pa(r[i].start) >= start && __pa(r[i].end) < end) {
+				if (__pa(r[i].start) > start + size)
+					return start;
+				start = ALIGN(__pa(r[i].end), alignment);
+				if (i < n-1 &&
+				    __pa(r[i+1].start) < start + size)
+					continue;
+				else
+					break;
+			}
+		}
+		if (end > start + size)
+			return start;
+	}
+
+	printk(KERN_WARNING
+	       "Cannot reserve 0x%lx byte of memory for crashdump\n", size);
+	return ~0UL;
+}
+#endif
+
+#ifdef CONFIG_CRASH_DUMP
+/* locate the size find a the descriptor at a certain address */
+unsigned long __init
+vmcore_find_descriptor_size (unsigned long address)
+{
+	void *efi_map_start, *efi_map_end, *p;
+	efi_memory_desc_t *md;
+	u64 efi_desc_size;
+	unsigned long ret = 0;
+
+	efi_map_start = __va(ia64_boot_param->efi_memmap);
+	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
+	efi_desc_size = ia64_boot_param->efi_memdesc_size;
+
+	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
+		md = p;
+		if (efi_wb(md) && md->type == EFI_LOADER_DATA
+		    && md->phys_addr == address) {
+			ret = efi_md_size(md);
+			break;
 		}
 	}
+
+	if (ret == 0)
+		printk(KERN_WARNING "Cannot locate EFI vmcore descriptor\n");
+
+	return ret;
 }
+#endif
diff --git a/arch/ia64/kernel/efi_stub.S b/arch/ia64/kernel/efi_stub.S
index 5a7fe70212a..a56e161d751 100644
--- a/arch/ia64/kernel/efi_stub.S
+++ b/arch/ia64/kernel/efi_stub.S
@@ -61,7 +61,7 @@ GLOBAL_ENTRY(efi_call_phys)
 	or loc3=loc3,r17
 	mov b6=r2
 	;;
-	andcm r16=loc3,r16		// get psr with IT, DT, and RT bits cleared
+	andcm r16=loc3,r16	// get psr with IT, DT, and RT bits cleared
 	br.call.sptk.many rp=ia64_switch_mode_phys
 .ret0:	mov out4=in5
 	mov out0=in1
diff --git a/arch/ia64/kernel/elfcore.c b/arch/ia64/kernel/elfcore.c
new file mode 100644
index 00000000000..04bc8fd5f89
--- /dev/null
+++ b/arch/ia64/kernel/elfcore.c
@@ -0,0 +1,76 @@
+#include <linux/elf.h>
+#include <linux/coredump.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#include <asm/elf.h>
+
+
+Elf64_Half elf_core_extra_phdrs(void)
+{
+	return GATE_EHDR->e_phnum;
+}
+
+int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
+{
+	const struct elf_phdr *const gate_phdrs =
+		(const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
+	int i;
+	Elf64_Off ofs = 0;
+
+	for (i = 0; i < GATE_EHDR->e_phnum; ++i) {
+		struct elf_phdr phdr = gate_phdrs[i];
+
+		if (phdr.p_type == PT_LOAD) {
+			phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz);
+			phdr.p_filesz = phdr.p_memsz;
+			if (ofs == 0) {
+				ofs = phdr.p_offset = offset;
+				offset += phdr.p_filesz;
+			} else {
+				phdr.p_offset = ofs;
+			}
+		} else {
+			phdr.p_offset += ofs;
+		}
+		phdr.p_paddr = 0; /* match other core phdrs */
+		if (!dump_emit(cprm, &phdr, sizeof(phdr)))
+			return 0;
+	}
+	return 1;
+}
+
+int elf_core_write_extra_data(struct coredump_params *cprm)
+{
+	const struct elf_phdr *const gate_phdrs =
+		(const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
+	int i;
+
+	for (i = 0; i < GATE_EHDR->e_phnum; ++i) {
+		if (gate_phdrs[i].p_type == PT_LOAD) {
+			void *addr = (void *)gate_phdrs[i].p_vaddr;
+			size_t memsz = PAGE_ALIGN(gate_phdrs[i].p_memsz);
+
+			if (!dump_emit(cprm, addr, memsz))
+				return 0;
+			break;
+		}
+	}
+	return 1;
+}
+
+size_t elf_core_extra_data_size(void)
+{
+	const struct elf_phdr *const gate_phdrs =
+		(const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
+	int i;
+	size_t size = 0;
+
+	for (i = 0; i < GATE_EHDR->e_phnum; ++i) {
+		if (gate_phdrs[i].p_type == PT_LOAD) {
+			size += PAGE_ALIGN(gate_phdrs[i].p_memsz);
+			break;
+		}
+	}
+	return size;
+}
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 0741b066b98..ba3d03503e8 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1,5 +1,5 @@
 /*
- * ia64/kernel/entry.S
+ * arch/ia64/kernel/entry.S
  *
  * Kernel entry points.
  *
@@ -23,6 +23,11 @@
  * 11/07/2000
  */
 /*
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *                    pv_ops.
+ */
+/*
  * Global (preserved) predicate usage on syscall entry/exit path:
  *
  *	pKStk:		See entry.h.
@@ -31,7 +36,6 @@
  *	pNonSys:	!pSys
  */
 
-#include <linux/config.h>
 
 #include <asm/asmmacro.h>
 #include <asm/cache.h>
@@ -43,9 +47,11 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 #include <asm/unistd.h>
+#include <asm/ftrace.h>
 
 #include "minstate.h"
 
+#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE
 	/*
 	 * execve() is special because in case of success, we need to
 	 * setup a null register window frame.
@@ -55,31 +61,20 @@ ENTRY(ia64_execve)
 	 * Allocate 8 input registers since ptrace() may clobber them
 	 */
 	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc loc1=ar.pfs,8,2,4,0
+	alloc loc1=ar.pfs,8,2,3,0
 	mov loc0=rp
 	.body
 	mov out0=in0			// filename
 	;;				// stop bit between alloc and call
 	mov out1=in1			// argv
 	mov out2=in2			// envp
-	add out3=16,sp			// regs
 	br.call.sptk.many rp=sys_execve
 .ret0:
-#ifdef CONFIG_IA32_SUPPORT
-	/*
-	 * Check if we're returning to ia32 mode. If so, we need to restore ia32 registers
-	 * from pt_regs.
-	 */
-	adds r16=PT(CR_IPSR)+16,sp
-	;;
-	ld8 r16=[r16]
-#endif
 	cmp4.ge p6,p7=r8,r0
 	mov ar.pfs=loc1			// restore ar.pfs
 	sxt4 r8=r8			// return 64-bit result
 	;;
 	stf.spill [sp]=f0
-(p6)	cmp.ne pKStk,pUStk=r0,r0	// a successful execve() lands us in user-mode...
 	mov rp=loc0
 (p6)	mov ar.pfs=r0			// clear ar.pfs on success
 (p7)	br.ret.sptk.many rp
@@ -102,12 +97,6 @@ ENTRY(ia64_execve)
 	ldf.fill f23=[sp];	ldf.fill f24=[sp];	mov f25=f0
 	ldf.fill f26=[sp];	ldf.fill f27=[sp];	mov f28=f0
 	ldf.fill f29=[sp];	ldf.fill f30=[sp];	mov f31=f0
-#ifdef CONFIG_IA32_SUPPORT
-	tbit.nz p6,p0=r16, IA64_PSR_IS_BIT
-	movl loc0=ia64_ret_from_ia32_execve
-	;;
-(p6)	mov rp=loc0
-#endif
 	br.ret.sptk.many rp
 END(ia64_execve)
 
@@ -127,13 +116,12 @@ GLOBAL_ENTRY(sys_clone2)
 	mov loc1=r16				// save ar.pfs across do_fork
 	.body
 	mov out1=in1
-	mov out3=in2
+	mov out2=in2
 	tbit.nz p6,p0=in0,CLONE_SETTLS_BIT
-	mov out4=in3	// parent_tidptr: valid only w/CLONE_PARENT_SETTID
+	mov out3=in3	// parent_tidptr: valid only w/CLONE_PARENT_SETTID
 	;;
 (p6)	st8 [r2]=in5				// store TLS in r16 for copy_thread()
-	mov out5=in4	// child_tidptr:  valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID
-	adds out2=IA64_SWITCH_STACK_SIZE+16,sp	// out2 = &regs
+	mov out4=in4	// child_tidptr:  valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID
 	mov out0=in0				// out0 = clone_flags
 	br.call.sptk.many rp=do_fork
 .ret1:	.restore sp
@@ -159,13 +147,12 @@ GLOBAL_ENTRY(sys_clone)
 	mov loc1=r16				// save ar.pfs across do_fork
 	.body
 	mov out1=in1
-	mov out3=16				// stacksize (compensates for 16-byte scratch area)
+	mov out2=16				// stacksize (compensates for 16-byte scratch area)
 	tbit.nz p6,p0=in0,CLONE_SETTLS_BIT
-	mov out4=in2	// parent_tidptr: valid only w/CLONE_PARENT_SETTID
+	mov out3=in2	// parent_tidptr: valid only w/CLONE_PARENT_SETTID
 	;;
 (p6)	st8 [r2]=in4				// store TLS in r13 (tp)
-	mov out5=in3	// child_tidptr:  valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID
-	adds out2=IA64_SWITCH_STACK_SIZE+16,sp	// out2 = &regs
+	mov out4=in3	// child_tidptr:  valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID
 	mov out0=in0				// out0 = clone_flags
 	br.call.sptk.many rp=do_fork
 .ret2:	.restore sp
@@ -174,6 +161,7 @@ GLOBAL_ENTRY(sys_clone)
 	mov rp=loc0
 	br.ret.sptk.many rp
 END(sys_clone)
+#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
 
 /*
  * prev_task <- ia64_switch_to(struct task_struct *next)
@@ -181,7 +169,7 @@ END(sys_clone)
  *	called.  The code starting at .map relies on this.  The rest of the code
  *	doesn't care about the interrupt masking status.
  */
-GLOBAL_ENTRY(ia64_switch_to)
+GLOBAL_ENTRY(__paravirt_switch_to)
 	.prologue
 	alloc r16=ar.pfs,1,0,0,0
 	DO_SAVE_SWITCH_STACK
@@ -205,7 +193,7 @@ GLOBAL_ENTRY(ia64_switch_to)
 	;;
 .done:
 	ld8 sp=[r21]			// load kernel stack pointer of new task
-	mov IA64_KR(CURRENT)=in0	// update "current" application register
+	MOV_TO_KR(CURRENT, in0, r8, r9)		// update "current" application register
 	mov r8=r13			// return pointer to previously running task
 	mov r13=in0			// set "current" pointer
 	;;
@@ -217,26 +205,25 @@ GLOBAL_ENTRY(ia64_switch_to)
 	br.ret.sptk.many rp		// boogie on out in new context
 
 .map:
-	rsm psr.ic			// interrupts (psr.i) are already disabled here
+	RSM_PSR_IC(r25)			// interrupts (psr.i) are already disabled here
 	movl r25=PAGE_KERNEL
 	;;
 	srlz.d
 	or r23=r25,r20			// construct PA | page properties
 	mov r25=IA64_GRANULE_SHIFT<<2
 	;;
-	mov cr.itir=r25
-	mov cr.ifa=in0			// VA of next task...
+	MOV_TO_ITIR(p0, r25, r8)
+	MOV_TO_IFA(in0, r8)		// VA of next task...
 	;;
 	mov r25=IA64_TR_CURRENT_STACK
-	mov IA64_KR(CURRENT_STACK)=r26	// remember last page we mapped...
+	MOV_TO_KR(CURRENT_STACK, r26, r8, r9)	// remember last page we mapped...
 	;;
 	itr.d dtr[r25]=r23		// wire in new mapping...
-	ssm psr.ic			// reenable the psr.ic bit
-	;;
-	srlz.d
+	SSM_PSR_IC_AND_SRLZ_D(r8, r9)	// reenable the psr.ic bit
 	br.cond.sptk .done
-END(ia64_switch_to)
+END(__paravirt_switch_to)
 
+#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE
 /*
  * Note that interrupts are enabled during save_switch_stack and load_switch_stack.  This
  * means that we may get an interrupt with "sp" pointing to the new kernel stack while
@@ -376,7 +363,7 @@ END(save_switch_stack)
  *	- b7 holds address to return to
  *	- must not touch r8-r11
  */
-ENTRY(load_switch_stack)
+GLOBAL_ENTRY(load_switch_stack)
 	.prologue
 	.altrp b7
 
@@ -493,18 +480,6 @@ GLOBAL_ENTRY(prefetch_stack)
 	br.ret.sptk.many rp
 END(prefetch_stack)
 
-GLOBAL_ENTRY(execve)
-	mov r15=__NR_execve			// put syscall number in place
-	break __BREAK_SYSCALL
-	br.ret.sptk.many rp
-END(execve)
-
-GLOBAL_ENTRY(clone)
-	mov r15=__NR_clone			// put syscall number in place
-	break __BREAK_SYSCALL
-	br.ret.sptk.many rp
-END(clone)
-
 	/*
 	 * Invoke a system call, but do some tracing before and after the call.
 	 * We MUST preserve the current register frame throughout this routine
@@ -529,6 +504,11 @@ GLOBAL_ENTRY(ia64_trace_syscall)
  	stf.spill [r16]=f10
  	stf.spill [r17]=f11
 	br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args
+	cmp.lt p6,p0=r8,r0			// check tracehook
+	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
+	adds r3=PT(R10)+16,sp			// r3 = &pt_regs.r10
+	mov r10=0
+(p6)	br.cond.sptk strace_error		// syscall failed ->
 	adds r16=PT(F6)+16,sp
 	adds r17=PT(F7)+16,sp
 	;;
@@ -569,7 +549,10 @@ GLOBAL_ENTRY(ia64_trace_syscall)
 .mem.offset 0,0; st8.spill [r2]=r8		// store return value in slot for r8
 .mem.offset 8,0; st8.spill [r3]=r10		// clear error indication in slot for r10
 	br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
-.ret3:	br.cond.sptk .work_pending_syscall_end
+.ret3:
+(pUStk)	cmp.eq.unc p6,p0=r0,r0			// p6 <- pUStk
+(pUStk)	rsm psr.i				// disable interrupts
+	br.cond.sptk ia64_work_pending_syscall_end
 
 strace_error:
 	ld8 r3=[r2]				// load pt_regs.r8
@@ -600,6 +583,27 @@ GLOBAL_ENTRY(ia64_strace_leave_kernel)
 .ret4:	br.cond.sptk ia64_leave_kernel
 END(ia64_strace_leave_kernel)
 
+ENTRY(call_payload)
+	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(0)
+	/* call the kernel_thread payload; fn is in r4, arg - in r5 */
+	alloc loc1=ar.pfs,0,3,1,0
+	mov loc0=rp
+	mov loc2=gp
+	mov out0=r5		// arg
+	ld8 r14 = [r4], 8	// fn.address
+	;;
+	mov b6 = r14
+	ld8 gp = [r4]		// fn.gp
+	;;
+	br.call.sptk.many rp=b6	// fn(arg)
+.ret12:	mov gp=loc2
+	mov rp=loc0
+	mov ar.pfs=loc1
+	/* ... and if it has returned, we are going to userland */
+	cmp.ne pKStk,pUStk=r0,r0
+	br.ret.sptk.many rp
+END(call_payload)
+
 GLOBAL_ENTRY(ia64_ret_from_clone)
 	PT_REGS_UNWIND_INFO(0)
 {	/*
@@ -616,6 +620,7 @@ GLOBAL_ENTRY(ia64_ret_from_clone)
 	br.call.sptk.many rp=ia64_invoke_schedule_tail
 }
 .ret8:
+(pKStk)	br.call.sptk.many rp=call_payload
 	adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
 	;;
 	ld4 r2=[r2]
@@ -634,8 +639,17 @@ GLOBAL_ENTRY(ia64_ret_from_syscall)
 	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
 	mov r10=r0				// clear error indication in r10
 (p7)	br.cond.spnt handle_syscall_error	// handle potential syscall failure
+#ifdef CONFIG_PARAVIRT
+	;;
+	br.cond.sptk.few ia64_leave_syscall
+	;;
+#endif /* CONFIG_PARAVIRT */
 END(ia64_ret_from_syscall)
+#ifndef CONFIG_PARAVIRT
 	// fall through
+#endif
+#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
+
 /*
  * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
  *	need to switch to bank 0 and doesn't restore the scratch registers.
@@ -680,7 +694,7 @@ END(ia64_ret_from_syscall)
  *	      ar.csd: cleared
  *	      ar.ssd: cleared
  */
-ENTRY(ia64_leave_syscall)
+GLOBAL_ENTRY(__paravirt_leave_syscall)
 	PT_REGS_UNWIND_INFO(0)
 	/*
 	 * work.need_resched etc. mustn't get changed by this CPU before it returns to
@@ -690,11 +704,11 @@ ENTRY(ia64_leave_syscall)
 	 * extra work.  We always check for extra work when returning to user-level.
 	 * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
 	 * is 0.  After extra work processing has been completed, execution
-	 * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check
+	 * resumes at ia64_work_processed_syscall with p6 set to 1 if the extra-work-check
 	 * needs to be redone.
 	 */
 #ifdef CONFIG_PREEMPT
-	rsm psr.i				// disable interrupts
+	RSM_PSR_I(p0, r2, r18)			// disable interrupts
 	cmp.eq pLvSys,p0=r0,r0			// pLvSys=1: leave from syscall
 (pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
 	;;
@@ -704,11 +718,22 @@ ENTRY(ia64_leave_syscall)
 	;;
 	cmp.eq p6,p0=r21,r0		// p6 <- pUStk || (preempt_count == 0)
 #else /* !CONFIG_PREEMPT */
-(pUStk)	rsm psr.i
+	RSM_PSR_I(pUStk, r2, r18)
 	cmp.eq pLvSys,p0=r0,r0		// pLvSys=1: leave from syscall
 (pUStk)	cmp.eq.unc p6,p0=r0,r0		// p6 <- pUStk
 #endif
-.work_processed_syscall:
+.global __paravirt_work_processed_syscall;
+__paravirt_work_processed_syscall:
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	adds r2=PT(LOADRS)+16,r12
+	MOV_FROM_ITC(pUStk, p9, r22, r19)	// fetch time at leave
+	adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
+	;;
+(p6)	ld4 r31=[r18]				// load current_thread_info()->flags
+	ld8 r19=[r2],PT(B6)-PT(LOADRS)		// load ar.rsc value for "loadrs"
+	adds r3=PT(AR_BSPSTORE)+16,r12		// deferred
+	;;
+#else
 	adds r2=PT(LOADRS)+16,r12
 	adds r3=PT(AR_BSPSTORE)+16,r12
 	adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
@@ -717,6 +742,7 @@ ENTRY(ia64_leave_syscall)
 	ld8 r19=[r2],PT(B6)-PT(LOADRS)		// load ar.rsc value for "loadrs"
 	nop.i 0
 	;;
+#endif
 	mov r16=ar.bsp				// M2  get existing backing store pointer
 	ld8 r18=[r2],PT(R9)-PT(B6)		// load b6
 (p6)	and r15=TIF_WORK_MASK,r31		// any work other than TIF_SYSCALL_TRACE?
@@ -731,19 +757,28 @@ ENTRY(ia64_leave_syscall)
 (pNonSys) break 0		//      bug check: we shouldn't be here if pNonSys is TRUE!
 	;;
 	invala			// M0|1 invalidate ALAT
-	rsm psr.i | psr.ic	// M2   turn off interrupts and interruption collection
+	RSM_PSR_I_IC(r28, r29, r30)	// M2   turn off interrupts and interruption collection
 	cmp.eq p9,p0=r0,r0	// A    set p9 to indicate that we should restore cr.ifs
 
 	ld8 r29=[r2],16		// M0|1 load cr.ipsr
 	ld8 r28=[r3],16		// M0|1 load cr.iip
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+(pUStk) add r14=TI_AC_LEAVE+IA64_TASK_SIZE,r13
+	;;
+	ld8 r30=[r2],16		// M0|1 load cr.ifs
+	ld8 r25=[r3],16		// M0|1 load ar.unat
+(pUStk) add r15=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
+	;;
+#else
 	mov r22=r0		// A    clear r22
 	;;
 	ld8 r30=[r2],16		// M0|1 load cr.ifs
 	ld8 r25=[r3],16		// M0|1 load ar.unat
 (pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
 	;;
+#endif
 	ld8 r26=[r2],PT(B0)-PT(AR_PFS)	// M0|1 load ar.pfs
-(pKStk)	mov r22=psr			// M2   read PSR now that interrupts are disabled
+	MOV_FROM_PSR(pKStk, r22, r21)	// M2   read PSR now that interrupts are disabled
 	nop 0
 	;;
 	ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // M0|1 load b0
@@ -758,7 +793,11 @@ ENTRY(ia64_leave_syscall)
 	ld8.fill r1=[r3],16			// M0|1 load r1
 (pUStk) mov r17=1				// A
 	;;
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+(pUStk) st1 [r15]=r17				// M2|3
+#else
 (pUStk) st1 [r14]=r17				// M2|3
+#endif
 	ld8.fill r13=[r3],16			// M0|1
 	mov f8=f0				// F    clear f8
 	;;
@@ -766,21 +805,30 @@ ENTRY(ia64_leave_syscall)
 	ld8.fill r15=[r3]			// M0|1 restore r15
 	mov b6=r18				// I0   restore b6
 
-	addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 // A
+	LOAD_PHYS_STACK_REG_SIZE(r17)
 	mov f9=f0					// F    clear f9
 (pKStk) br.cond.dpnt.many skip_rbs_switch		// B
 
 	srlz.d				// M0   ensure interruption collection is off (for cover)
 	shr.u r18=r19,16		// I0|1 get byte size of existing "dirty" partition
-	cover				// B    add current frame into dirty partition & set cr.ifs
+	COVER				// B    add current frame into dirty partition & set cr.ifs
+	;;
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	mov r19=ar.bsp			// M2   get new backing store pointer
+	st8 [r14]=r22			// M	save time at leave
+	mov f10=f0			// F    clear f10
+
+	mov r22=r0			// A	clear r22
+	movl r14=__kernel_syscall_via_epc // X
 	;;
-(pUStk) ld4 r17=[r17]			// M0|1 r17 = cpu_data->phys_stacked_size_p8
+#else
 	mov r19=ar.bsp			// M2   get new backing store pointer
 	mov f10=f0			// F    clear f10
 
 	nop.m 0
 	movl r14=__kernel_syscall_via_epc // X
 	;;
+#endif
 	mov.m ar.csd=r0			// M2   clear ar.csd
 	mov.m ar.ccv=r0			// M2   clear ar.ccv
 	mov b7=r14			// I0   clear b7 (hint with __kernel_syscall_via_epc)
@@ -788,22 +836,9 @@ ENTRY(ia64_leave_syscall)
 	mov.m ar.ssd=r0			// M2   clear ar.ssd
 	mov f11=f0			// F    clear f11
 	br.cond.sptk.many rbs_switch	// B
-END(ia64_leave_syscall)
+END(__paravirt_leave_syscall)
 
-#ifdef CONFIG_IA32_SUPPORT
-GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
-	PT_REGS_UNWIND_INFO(0)
-	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
-	adds r3=PT(R10)+16,sp			// r3 = &pt_regs.r10
-	;;
-	.mem.offset 0,0
-	st8.spill [r2]=r8	// store return value in slot for r8 and set unat bit
-	.mem.offset 8,0
-	st8.spill [r3]=r0	// clear error indication in slot for r10 and set unat bit
-END(ia64_ret_from_ia32_execve)
-	// fall through
-#endif /* CONFIG_IA32_SUPPORT */
-GLOBAL_ENTRY(ia64_leave_kernel)
+GLOBAL_ENTRY(__paravirt_leave_kernel)
 	PT_REGS_UNWIND_INFO(0)
 	/*
 	 * work.need_resched etc. mustn't get changed by this CPU before it returns to
@@ -817,7 +852,7 @@ GLOBAL_ENTRY(ia64_leave_kernel)
 	 * needs to be redone.
 	 */
 #ifdef CONFIG_PREEMPT
-	rsm psr.i				// disable interrupts
+	RSM_PSR_I(p0, r17, r31)			// disable interrupts
 	cmp.eq p0,pLvSys=r0,r0			// pLvSys=0: leave from kernel
 (pKStk)	adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
 	;;
@@ -827,7 +862,7 @@ GLOBAL_ENTRY(ia64_leave_kernel)
 	;;
 	cmp.eq p6,p0=r21,r0		// p6 <- pUStk || (preempt_count == 0)
 #else
-(pUStk)	rsm psr.i
+	RSM_PSR_I(pUStk, r17, r31)
 	cmp.eq p0,pLvSys=r0,r0		// pLvSys=0: leave from kernel
 (pUStk)	cmp.eq.unc p6,p0=r0,r0		// p6 <- pUStk
 #endif
@@ -875,7 +910,7 @@ GLOBAL_ENTRY(ia64_leave_kernel)
 	mov ar.csd=r30
 	mov ar.ssd=r31
 	;;
-	rsm psr.i | psr.ic	// initiate turning off of interrupt and interruption collection
+	RSM_PSR_I_IC(r23, r22, r25)	// initiate turning off of interrupt and interruption collection
 	invala			// invalidate ALAT
 	;;
 	ld8.fill r22=[r2],24
@@ -907,16 +942,24 @@ GLOBAL_ENTRY(ia64_leave_kernel)
 	mov ar.ccv=r15
 	;;
 	ldf.fill f11=[r2]
-	bsw.0			// switch back to bank 0 (no stop bit required beforehand...)
+	BSW_0(r2, r3, r15)	// switch back to bank 0 (no stop bit required beforehand...)
 	;;
 (pUStk)	mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency)
 	adds r16=PT(CR_IPSR)+16,r12
 	adds r17=PT(CR_IIP)+16,r12
 
-(pKStk)	mov r22=psr		// M2 read PSR now that interrupts are disabled
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	.pred.rel.mutex pUStk,pKStk
+	MOV_FROM_PSR(pKStk, r22, r29)	// M2 read PSR now that interrupts are disabled
+	MOV_FROM_ITC(pUStk, p9, r22, r29)	// M  fetch time at leave
+	nop.i 0
+	;;
+#else
+	MOV_FROM_PSR(pKStk, r22, r29)	// M2 read PSR now that interrupts are disabled
 	nop.i 0
 	nop.i 0
 	;;
+#endif
 	ld8 r29=[r16],16	// load cr.ipsr
 	ld8 r28=[r17],16	// load cr.iip
 	;;
@@ -938,24 +981,45 @@ GLOBAL_ENTRY(ia64_leave_kernel)
 	;;
 	ld8.fill r12=[r16],16
 	ld8.fill r13=[r17],16
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+(pUStk)	adds r3=TI_AC_LEAVE+IA64_TASK_SIZE,r18
+#else
 (pUStk)	adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
+#endif
 	;;
 	ld8 r20=[r16],16	// ar.fpsr
 	ld8.fill r15=[r17],16
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+(pUStk)	adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18	// deferred
+#endif
 	;;
 	ld8.fill r14=[r16],16
 	ld8.fill r2=[r17]
 (pUStk)	mov r17=1
 	;;
-	ld8.fill r3=[r16]
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	//  mmi_ :  ld8 st1 shr;;         mmi_ : st8 st1 shr;;
+	//  mib  :  mov add br        ->  mib  : ld8 add br
+	//  bbb_ :  br  nop cover;;       mbb_ : mov br  cover;;
+	//
+	//  no one require bsp in r16 if (pKStk) branch is selected.
+(pUStk)	st8 [r3]=r22		// save time at leave
 (pUStk)	st1 [r18]=r17		// restore current->thread.on_ustack
 	shr.u r18=r19,16	// get byte size of existing "dirty" partition
 	;;
+	ld8.fill r3=[r16]	// deferred
+	LOAD_PHYS_STACK_REG_SIZE(r17)
+(pKStk)	br.cond.dpnt skip_rbs_switch
 	mov r16=ar.bsp		// get existing backing store pointer
-	addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
+#else
+	ld8.fill r3=[r16]
+(pUStk)	st1 [r18]=r17		// restore current->thread.on_ustack
+	shr.u r18=r19,16	// get byte size of existing "dirty" partition
 	;;
-	ld4 r17=[r17]		// r17 = cpu_data->phys_stacked_size_p8
+	mov r16=ar.bsp		// get existing backing store pointer
+	LOAD_PHYS_STACK_REG_SIZE(r17)
 (pKStk)	br.cond.dpnt skip_rbs_switch
+#endif
 
 	/*
 	 * Restore user backing store.
@@ -963,7 +1027,7 @@ GLOBAL_ENTRY(ia64_leave_kernel)
 	 * NOTE: alloc, loadrs, and cover can't be predicated.
 	 */
 (pNonSys) br.cond.dpnt dont_preserve_current_frame
-	cover				// add current frame into dirty partition and set cr.ifs
+	COVER				// add current frame into dirty partition and set cr.ifs
 	;;
 	mov r19=ar.bsp			// get new backing store pointer
 rbs_switch:
@@ -1066,16 +1130,16 @@ skip_rbs_switch:
 (pKStk)	dep r29=r22,r29,21,1	// I0 update ipsr.pp with psr.pp
 (pLvSys)mov r16=r0		// A  clear r16 for leave_syscall, no-op otherwise
 	;;
-	mov cr.ipsr=r29		// M2
+	MOV_TO_IPSR(p0, r29, r25)	// M2
 	mov ar.pfs=r26		// I0
 (pLvSys)mov r17=r0		// A  clear r17 for leave_syscall, no-op otherwise
 
-(p9)	mov cr.ifs=r30		// M2
+	MOV_TO_IFS(p9, r30, r25)// M2
 	mov b0=r21		// I0
 (pLvSys)mov r18=r0		// A  clear r18 for leave_syscall, no-op otherwise
 
 	mov ar.fpsr=r20		// M2
-	mov cr.iip=r28		// M2
+	MOV_TO_IIP(r28, r25)	// M2
 	nop 0
 	;;
 (pUStk)	mov ar.rnat=r24		// M2 must happen with RSE in lazy mode
@@ -1084,7 +1148,7 @@ skip_rbs_switch:
 
 	mov ar.rsc=r27		// M2
 	mov pr=r31,-1		// I0
-	rfi			// B
+	RFI			// B
 
 	/*
 	 * On entry:
@@ -1092,6 +1156,9 @@ skip_rbs_switch:
 	 *	r31 = current->thread_info->flags
 	 * On exit:
 	 *	p6 = TRUE if work-pending-check needs to be redone
+	 *
+	 * Interrupts are disabled on entry, reenabled depend on work, and
+	 * disabled on exit.
 	 */
 .work_pending_syscall:
 	add r2=-8,r2
@@ -1100,56 +1167,30 @@ skip_rbs_switch:
 	st8 [r2]=r8
 	st8 [r3]=r10
 .work_pending:
-	tbit.nz p6,p0=r31,TIF_SIGDELAYED		// signal delayed from  MCA/INIT/NMI/PMI context?
-(p6)	br.cond.sptk.few .sigdelayed
-	;;
-	tbit.z p6,p0=r31,TIF_NEED_RESCHED		// current_thread_info()->need_resched==0?
+	tbit.z p6,p0=r31,TIF_NEED_RESCHED	// is resched not needed?
 (p6)	br.cond.sptk.few .notify
-#ifdef CONFIG_PREEMPT
-(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1
-	;;
-(pKStk) st4 [r20]=r21
-	ssm psr.i		// enable interrupts
-#endif
-	br.call.spnt.many rp=schedule
-.ret9:	cmp.eq p6,p0=r0,r0				// p6 <- 1
-	rsm psr.i		// disable interrupts
-	;;
-#ifdef CONFIG_PREEMPT
-(pKStk)	adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
-	;;
-(pKStk)	st4 [r20]=r0		// preempt_count() <- 0
-#endif
-(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
-	br.cond.sptk.many .work_processed_kernel	// re-check
+	br.call.spnt.many rp=preempt_schedule_irq
+.ret9:	cmp.eq p6,p0=r0,r0	// p6 <- 1 (re-check)
+(pLvSys)br.cond.sptk.few  __paravirt_pending_syscall_end
+	br.cond.sptk.many .work_processed_kernel
 
 .notify:
 (pUStk)	br.call.spnt.many rp=notify_resume_user
-.ret10:	cmp.ne p6,p0=r0,r0				// p6 <- 0
-(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
-	br.cond.sptk.many .work_processed_kernel	// don't re-check
-
-// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where
-// it could not be delivered.  Deliver it now.  The signal might be for us and
-// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed
-// signal.
-
-.sigdelayed:
-	br.call.sptk.many rp=do_sigdelayed
-	cmp.eq p6,p0=r0,r0				// p6 <- 1, always re-check
-(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
-	br.cond.sptk.many .work_processed_kernel	// re-check
-
-.work_pending_syscall_end:
+.ret10:	cmp.ne p6,p0=r0,r0	// p6 <- 0 (don't re-check)
+(pLvSys)br.cond.sptk.few  __paravirt_pending_syscall_end
+	br.cond.sptk.many .work_processed_kernel
+
+.global __paravirt_pending_syscall_end;
+__paravirt_pending_syscall_end:
 	adds r2=PT(R8)+16,r12
 	adds r3=PT(R10)+16,r12
 	;;
 	ld8 r8=[r2]
 	ld8 r10=[r3]
-	br.cond.sptk.many .work_processed_syscall	// re-check
-
-END(ia64_leave_kernel)
+	br.cond.sptk.many __paravirt_work_processed_syscall_target
+END(__paravirt_leave_kernel)
 
+#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE
 ENTRY(handle_syscall_error)
 	/*
 	 * Some system calls (e.g., ptrace, mmap) can return arbitrary values which could
@@ -1184,11 +1225,14 @@ GLOBAL_ENTRY(ia64_invoke_schedule_tail)
 END(ia64_invoke_schedule_tail)
 
 	/*
-	 * Setup stack and call do_notify_resume_user().  Note that pSys and pNonSys need to
-	 * be set up by the caller.  We declare 8 input registers so the system call
-	 * args get preserved, in case we need to restart a system call.
+	 * Setup stack and call do_notify_resume_user(), keeping interrupts
+	 * disabled.
+	 *
+	 * Note that pSys and pNonSys need to be set up by the caller.
+	 * We declare 8 input registers so the system call args get preserved,
+	 * in case we need to restart a system call.
 	 */
-ENTRY(notify_resume_user)
+GLOBAL_ENTRY(notify_resume_user)
 	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
 	alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
 	mov r9=ar.unat
@@ -1215,32 +1259,6 @@ ENTRY(notify_resume_user)
 	br.ret.sptk.many rp
 END(notify_resume_user)
 
-GLOBAL_ENTRY(sys_rt_sigsuspend)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
-	mov r9=ar.unat
-	mov loc0=rp				// save return address
-	mov out0=in0				// mask
-	mov out1=in1				// sigsetsize
-	adds out2=8,sp				// out2=&sigscratch->ar_pfs
-	;;
-	.fframe 16
-	.spillsp ar.unat, 16
-	st8 [sp]=r9,-16				// allocate space for ar.unat and save it
-	st8 [out2]=loc1,-8			// save ar.pfs, out2=&sigscratch
-	.body
-	br.call.sptk.many rp=ia64_rt_sigsuspend
-.ret17:	.restore sp
-	adds sp=16,sp				// pop scratch stack space
-	;;
-	ld8 r9=[sp]				// load new unat from sw->caller_unat
-	mov rp=loc0
-	;;
-	mov ar.unat=r9
-	mov ar.pfs=loc1
-	br.ret.sptk.many rp
-END(sys_rt_sigsuspend)
-
 ENTRY(sys_rt_sigreturn)
 	PT_REGS_UNWIND_INFO(0)
 	/*
@@ -1276,7 +1294,7 @@ ENTRY(sys_rt_sigreturn)
 	adds sp=16,sp
 	;;
 	ld8 r9=[sp]				// load new ar.unat
-	mov.sptk b7=r8,ia64_leave_kernel
+	mov.sptk b7=r8,ia64_native_leave_kernel
 	;;
 	mov ar.unat=r9
 	br.many b7
@@ -1340,6 +1358,105 @@ GLOBAL_ENTRY(unw_init_running)
 	br.ret.sptk.many rp
 END(unw_init_running)
 
+#ifdef CONFIG_FUNCTION_TRACER
+#ifdef CONFIG_DYNAMIC_FTRACE
+GLOBAL_ENTRY(_mcount)
+	br ftrace_stub
+END(_mcount)
+
+.here:
+	br.ret.sptk.many b0
+
+GLOBAL_ENTRY(ftrace_caller)
+	alloc out0 = ar.pfs, 8, 0, 4, 0
+	mov out3 = r0
+	;;
+	mov out2 = b0
+	add r3 = 0x20, r3
+	mov out1 = r1;
+	br.call.sptk.many b0 = ftrace_patch_gp
+	//this might be called from module, so we must patch gp
+ftrace_patch_gp:
+	movl gp=__gp
+	mov b0 = r3
+	;;
+.global ftrace_call;
+ftrace_call:
+{
+	.mlx
+	nop.m 0x0
+	movl r3 = .here;;
+}
+	alloc loc0 = ar.pfs, 4, 4, 2, 0
+	;;
+	mov loc1 = b0
+	mov out0 = b0
+	mov loc2 = r8
+	mov loc3 = r15
+	;;
+	adds out0 = -MCOUNT_INSN_SIZE, out0
+	mov out1 = in2
+	mov b6 = r3
+
+	br.call.sptk.many b0 = b6
+	;;
+	mov ar.pfs = loc0
+	mov b0 = loc1
+	mov r8 = loc2
+	mov r15 = loc3
+	br ftrace_stub
+	;;
+END(ftrace_caller)
+
+#else
+GLOBAL_ENTRY(_mcount)
+	movl r2 = ftrace_stub
+	movl r3 = ftrace_trace_function;;
+	ld8 r3 = [r3];;
+	ld8 r3 = [r3];;
+	cmp.eq p7,p0 = r2, r3
+(p7)	br.sptk.many ftrace_stub
+	;;
+
+	alloc loc0 = ar.pfs, 4, 4, 2, 0
+	;;
+	mov loc1 = b0
+	mov out0 = b0
+	mov loc2 = r8
+	mov loc3 = r15
+	;;
+	adds out0 = -MCOUNT_INSN_SIZE, out0
+	mov out1 = in2
+	mov b6 = r3
+
+	br.call.sptk.many b0 = b6
+	;;
+	mov ar.pfs = loc0
+	mov b0 = loc1
+	mov r8 = loc2
+	mov r15 = loc3
+	br ftrace_stub
+	;;
+END(_mcount)
+#endif
+
+GLOBAL_ENTRY(ftrace_stub)
+	mov r3 = b0
+	movl r2 = _mcount_ret_helper
+	;;
+	mov b6 = r2
+	mov b7 = r3
+	br.ret.sptk.many b6
+
+_mcount_ret_helper:
+	mov b0 = r42
+	mov r1 = r41
+	mov ar.pfs = r40
+	br b7
+END(ftrace_stub)
+
+#endif /* CONFIG_FUNCTION_TRACER */
+
 	.rodata
 	.align 8
 	.globl sys_call_table
@@ -1378,7 +1495,7 @@ sys_call_table:
 	data8 sys_mkdir				// 1055
 	data8 sys_rmdir
 	data8 sys_dup
-	data8 sys_pipe
+	data8 sys_ia64_pipe
 	data8 sys_times
 	data8 ia64_brk				// 1060
 	data8 sys_setgid
@@ -1489,7 +1606,7 @@ sys_call_table:
 	data8 sys_sched_get_priority_min
 	data8 sys_sched_rr_get_interval
 	data8 sys_nanosleep
-	data8 sys_nfsservctl
+	data8 sys_ni_syscall			// old nfsservctl
 	data8 sys_prctl				// 1170
 	data8 sys_getpagesize
 	data8 sys_mmap2
@@ -1588,7 +1705,7 @@ sys_call_table:
 	data8 sys_mq_timedreceive		// 1265
 	data8 sys_mq_notify
 	data8 sys_mq_getsetattr
-	data8 sys_ni_syscall			// reserved for kexec_load
+	data8 sys_kexec_load
 	data8 sys_ni_syscall			// reserved for vserver
 	data8 sys_waitid			// 1270
 	data8 sys_add_key
@@ -1596,9 +1713,69 @@ sys_call_table:
 	data8 sys_keyctl
 	data8 sys_ioprio_set
 	data8 sys_ioprio_get			// 1275
-	data8 sys_ni_syscall
+	data8 sys_move_pages
 	data8 sys_inotify_init
 	data8 sys_inotify_add_watch
 	data8 sys_inotify_rm_watch
+	data8 sys_migrate_pages			// 1280
+	data8 sys_openat
+	data8 sys_mkdirat
+	data8 sys_mknodat
+	data8 sys_fchownat
+	data8 sys_futimesat			// 1285
+	data8 sys_newfstatat
+	data8 sys_unlinkat
+	data8 sys_renameat
+	data8 sys_linkat
+	data8 sys_symlinkat			// 1290
+	data8 sys_readlinkat
+	data8 sys_fchmodat
+	data8 sys_faccessat
+	data8 sys_pselect6
+	data8 sys_ppoll				// 1295
+	data8 sys_unshare
+	data8 sys_splice
+	data8 sys_set_robust_list
+	data8 sys_get_robust_list
+	data8 sys_sync_file_range		// 1300
+	data8 sys_tee
+	data8 sys_vmsplice
+	data8 sys_fallocate
+	data8 sys_getcpu
+	data8 sys_epoll_pwait			// 1305
+	data8 sys_utimensat
+	data8 sys_signalfd
+	data8 sys_ni_syscall
+	data8 sys_eventfd
+	data8 sys_timerfd_create		// 1310
+	data8 sys_timerfd_settime
+	data8 sys_timerfd_gettime
+	data8 sys_signalfd4
+	data8 sys_eventfd2
+	data8 sys_epoll_create1			// 1315
+	data8 sys_dup3
+	data8 sys_pipe2
+	data8 sys_inotify_init1
+	data8 sys_preadv
+	data8 sys_pwritev			// 1320
+	data8 sys_rt_tgsigqueueinfo
+	data8 sys_recvmmsg
+	data8 sys_fanotify_init
+	data8 sys_fanotify_mark
+	data8 sys_prlimit64			// 1325
+	data8 sys_name_to_handle_at
+	data8 sys_open_by_handle_at
+	data8 sys_clock_adjtime
+	data8 sys_syncfs
+	data8 sys_setns				// 1330
+	data8 sys_sendmmsg
+	data8 sys_process_vm_readv
+	data8 sys_process_vm_writev
+	data8 sys_accept4
+	data8 sys_finit_module			// 1335
+	data8 sys_sched_setattr
+	data8 sys_sched_getattr
+	data8 sys_renameat2
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
+#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
diff --git a/arch/ia64/kernel/entry.h b/arch/ia64/kernel/entry.h
index 78eeb079341..b83edac0296 100644
--- a/arch/ia64/kernel/entry.h
+++ b/arch/ia64/kernel/entry.h
@@ -1,4 +1,3 @@
-#include <linux/config.h>
 
 /*
  * Preserved registers that are shared between code in ivt.S and
@@ -23,6 +22,7 @@
 
 #define PT(f)		(IA64_PT_REGS_##f##_OFFSET)
 #define SW(f)		(IA64_SWITCH_STACK_##f##_OFFSET)
+#define SOS(f)		(IA64_SAL_OS_STATE_##f##_OFFSET)
 
 #define PT_REGS_SAVES(off)			\
 	.unwabi 3, 'i';				\
diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
new file mode 100644
index 00000000000..0c161ed6d18
--- /dev/null
+++ b/arch/ia64/kernel/err_inject.c
@@ -0,0 +1,314 @@
+/*
+ * err_inject.c -
+ *	1.) Inject errors to a processor.
+ *	2.) Query error injection capabilities.
+ * This driver along with user space code can be acting as an error
+ * injection tool.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Written by: Fenghua Yu <fenghua.yu@intel.com>, Intel Corporation
+ * Copyright (C) 2006, Intel Corp.  All rights reserved.
+ *
+ */
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+
+#define ERR_INJ_DEBUG
+
+#define ERR_DATA_BUFFER_SIZE 3 		// Three 8-byte;
+
+#define define_one_ro(name) 						\
+static DEVICE_ATTR(name, 0444, show_##name, NULL)
+
+#define define_one_rw(name) 						\
+static DEVICE_ATTR(name, 0644, show_##name, store_##name)
+
+static u64 call_start[NR_CPUS];
+static u64 phys_addr[NR_CPUS];
+static u64 err_type_info[NR_CPUS];
+static u64 err_struct_info[NR_CPUS];
+static struct {
+	u64 data1;
+	u64 data2;
+	u64 data3;
+} __attribute__((__aligned__(16))) err_data_buffer[NR_CPUS];
+static s64 status[NR_CPUS];
+static u64 capabilities[NR_CPUS];
+static u64 resources[NR_CPUS];
+
+#define show(name) 							\
+static ssize_t 								\
+show_##name(struct device *dev, struct device_attribute *attr,	\
+		char *buf)						\
+{									\
+	u32 cpu=dev->id;						\
+	return sprintf(buf, "%lx\n", name[cpu]);			\
+}
+
+#define store(name)							\
+static ssize_t 								\
+store_##name(struct device *dev, struct device_attribute *attr,	\
+					const char *buf, size_t size)	\
+{									\
+	unsigned int cpu=dev->id;					\
+	name[cpu] = simple_strtoull(buf, NULL, 16);			\
+	return size;							\
+}
+
+show(call_start)
+
+/* It's user's responsibility to call the PAL procedure on a specific
+ * processor. The cpu number in driver is only used for storing data.
+ */
+static ssize_t
+store_call_start(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t size)
+{
+	unsigned int cpu=dev->id;
+	unsigned long call_start = simple_strtoull(buf, NULL, 16);
+
+#ifdef ERR_INJ_DEBUG
+	printk(KERN_DEBUG "pal_mc_err_inject for cpu%d:\n", cpu);
+	printk(KERN_DEBUG "err_type_info=%lx,\n", err_type_info[cpu]);
+	printk(KERN_DEBUG "err_struct_info=%lx,\n", err_struct_info[cpu]);
+	printk(KERN_DEBUG "err_data_buffer=%lx, %lx, %lx.\n",
+			  err_data_buffer[cpu].data1,
+			  err_data_buffer[cpu].data2,
+			  err_data_buffer[cpu].data3);
+#endif
+	switch (call_start) {
+	    case 0: /* Do nothing. */
+		break;
+	    case 1: /* Call pal_mc_error_inject in physical mode. */
+		status[cpu]=ia64_pal_mc_error_inject_phys(err_type_info[cpu],
+					err_struct_info[cpu],
+					ia64_tpa(&err_data_buffer[cpu]),
+					&capabilities[cpu],
+			 		&resources[cpu]);
+		break;
+	    case 2: /* Call pal_mc_error_inject in virtual mode. */
+		status[cpu]=ia64_pal_mc_error_inject_virt(err_type_info[cpu],
+					err_struct_info[cpu],
+					ia64_tpa(&err_data_buffer[cpu]),
+					&capabilities[cpu],
+			 		&resources[cpu]);
+		break;
+	    default:
+		status[cpu] = -EINVAL;
+		break;
+	}
+
+#ifdef ERR_INJ_DEBUG
+	printk(KERN_DEBUG "Returns: status=%d,\n", (int)status[cpu]);
+	printk(KERN_DEBUG "capapbilities=%lx,\n", capabilities[cpu]);
+	printk(KERN_DEBUG "resources=%lx\n", resources[cpu]);
+#endif
+	return size;
+}
+
+show(err_type_info)
+store(err_type_info)
+
+static ssize_t
+show_virtual_to_phys(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	unsigned int cpu=dev->id;
+	return sprintf(buf, "%lx\n", phys_addr[cpu]);
+}
+
+static ssize_t
+store_virtual_to_phys(struct device *dev, struct device_attribute *attr,
+			const char *buf, size_t size)
+{
+	unsigned int cpu=dev->id;
+	u64 virt_addr=simple_strtoull(buf, NULL, 16);
+	int ret;
+
+        ret = get_user_pages(current, current->mm, virt_addr,
+                        1, VM_READ, 0, NULL, NULL);
+	if (ret<=0) {
+#ifdef ERR_INJ_DEBUG
+		printk("Virtual address %lx is not existing.\n",virt_addr);
+#endif
+		return -EINVAL;
+	}
+
+	phys_addr[cpu] = ia64_tpa(virt_addr);
+	return size;
+}
+
+show(err_struct_info)
+store(err_struct_info)
+
+static ssize_t
+show_err_data_buffer(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	unsigned int cpu=dev->id;
+
+	return sprintf(buf, "%lx, %lx, %lx\n",
+			err_data_buffer[cpu].data1,
+			err_data_buffer[cpu].data2,
+			err_data_buffer[cpu].data3);
+}
+
+static ssize_t
+store_err_data_buffer(struct device *dev,
+			struct device_attribute *attr,
+			const char *buf, size_t size)
+{
+	unsigned int cpu=dev->id;
+	int ret;
+
+#ifdef ERR_INJ_DEBUG
+	printk("write err_data_buffer=[%lx,%lx,%lx] on cpu%d\n",
+		 err_data_buffer[cpu].data1,
+		 err_data_buffer[cpu].data2,
+		 err_data_buffer[cpu].data3,
+		 cpu);
+#endif
+	ret=sscanf(buf, "%lx, %lx, %lx",
+			&err_data_buffer[cpu].data1,
+			&err_data_buffer[cpu].data2,
+			&err_data_buffer[cpu].data3);
+	if (ret!=ERR_DATA_BUFFER_SIZE)
+		return -EINVAL;
+
+	return size;
+}
+
+show(status)
+show(capabilities)
+show(resources)
+
+define_one_rw(call_start);
+define_one_rw(err_type_info);
+define_one_rw(err_struct_info);
+define_one_rw(err_data_buffer);
+define_one_rw(virtual_to_phys);
+define_one_ro(status);
+define_one_ro(capabilities);
+define_one_ro(resources);
+
+static struct attribute *default_attrs[] = {
+	&dev_attr_call_start.attr,
+	&dev_attr_virtual_to_phys.attr,
+	&dev_attr_err_type_info.attr,
+	&dev_attr_err_struct_info.attr,
+	&dev_attr_err_data_buffer.attr,
+	&dev_attr_status.attr,
+	&dev_attr_capabilities.attr,
+	&dev_attr_resources.attr,
+	NULL
+};
+
+static struct attribute_group err_inject_attr_group = {
+	.attrs = default_attrs,
+	.name = "err_inject"
+};
+/* Add/Remove err_inject interface for CPU device */
+static int err_inject_add_dev(struct device *sys_dev)
+{
+	return sysfs_create_group(&sys_dev->kobj, &err_inject_attr_group);
+}
+
+static int err_inject_remove_dev(struct device *sys_dev)
+{
+	sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group);
+	return 0;
+}
+static int err_inject_cpu_callback(struct notifier_block *nfb,
+		unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	struct device *sys_dev;
+
+	sys_dev = get_cpu_device(cpu);
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		err_inject_add_dev(sys_dev);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		err_inject_remove_dev(sys_dev);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block err_inject_cpu_notifier =
+{
+	.notifier_call = err_inject_cpu_callback,
+};
+
+static int __init
+err_inject_init(void)
+{
+	int i;
+
+#ifdef ERR_INJ_DEBUG
+	printk(KERN_INFO "Enter error injection driver.\n");
+#endif
+
+	cpu_notifier_register_begin();
+
+	for_each_online_cpu(i) {
+		err_inject_cpu_callback(&err_inject_cpu_notifier, CPU_ONLINE,
+				(void *)(long)i);
+	}
+
+	__register_hotcpu_notifier(&err_inject_cpu_notifier);
+
+	cpu_notifier_register_done();
+
+	return 0;
+}
+
+static void __exit
+err_inject_exit(void)
+{
+	int i;
+	struct device *sys_dev;
+
+#ifdef ERR_INJ_DEBUG
+	printk(KERN_INFO "Exit error injection driver.\n");
+#endif
+
+	cpu_notifier_register_begin();
+
+	for_each_online_cpu(i) {
+		sys_dev = get_cpu_device(i);
+		sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group);
+	}
+
+	__unregister_hotcpu_notifier(&err_inject_cpu_notifier);
+
+	cpu_notifier_register_done();
+}
+
+module_init(err_inject_init);
+module_exit(err_inject_exit);
+
+MODULE_AUTHOR("Fenghua Yu <fenghua.yu@intel.com>");
+MODULE_DESCRIPTION("MC error injection kernel sysfs interface");
+MODULE_LICENSE("GPL");
diff --git a/arch/ia64/kernel/esi.c b/arch/ia64/kernel/esi.c
new file mode 100644
index 00000000000..b091111270c
--- /dev/null
+++ b/arch/ia64/kernel/esi.c
@@ -0,0 +1,205 @@
+/*
+ * Extensible SAL Interface (ESI) support routines.
+ *
+ * Copyright (C) 2006 Hewlett-Packard Co
+ * 	Alex Williamson <alex.williamson@hp.com>
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <asm/esi.h>
+#include <asm/sal.h>
+
+MODULE_AUTHOR("Alex Williamson <alex.williamson@hp.com>");
+MODULE_DESCRIPTION("Extensible SAL Interface (ESI) support");
+MODULE_LICENSE("GPL");
+
+#define MODULE_NAME	"esi"
+
+#define ESI_TABLE_GUID					\
+    EFI_GUID(0x43EA58DC, 0xCF28, 0x4b06, 0xB3,		\
+	     0x91, 0xB7, 0x50, 0x59, 0x34, 0x2B, 0xD4)
+
+enum esi_systab_entry_type {
+	ESI_DESC_ENTRY_POINT = 0
+};
+
+/*
+ * Entry type:	Size:
+ *	0	48
+ */
+#define ESI_DESC_SIZE(type)	"\060"[(unsigned) (type)]
+
+typedef struct ia64_esi_desc_entry_point {
+	u8 type;
+	u8 reserved1[15];
+	u64 esi_proc;
+	u64 gp;
+	efi_guid_t guid;
+} ia64_esi_desc_entry_point_t;
+
+struct pdesc {
+	void *addr;
+	void *gp;
+};
+
+static struct ia64_sal_systab *esi_systab;
+
+static int __init esi_init (void)
+{
+	efi_config_table_t *config_tables;
+	struct ia64_sal_systab *systab;
+	unsigned long esi = 0;
+	char *p;
+	int i;
+
+	config_tables = __va(efi.systab->tables);
+
+	for (i = 0; i < (int) efi.systab->nr_tables; ++i) {
+		if (efi_guidcmp(config_tables[i].guid, ESI_TABLE_GUID) == 0) {
+			esi = config_tables[i].table;
+			break;
+		}
+	}
+
+	if (!esi)
+		return -ENODEV;
+
+	systab = __va(esi);
+
+	if (strncmp(systab->signature, "ESIT", 4) != 0) {
+		printk(KERN_ERR "bad signature in ESI system table!");
+		return -ENODEV;
+	}
+
+	p = (char *) (systab + 1);
+	for (i = 0; i < systab->entry_count; i++) {
+		/*
+		 * The first byte of each entry type contains the type
+		 * descriptor.
+		 */
+		switch (*p) {
+		      case ESI_DESC_ENTRY_POINT:
+			break;
+		      default:
+			printk(KERN_WARNING "Unknown table type %d found in "
+			       "ESI table, ignoring rest of table\n", *p);
+			return -ENODEV;
+		}
+
+		p += ESI_DESC_SIZE(*p);
+	}
+
+	esi_systab = systab;
+	return 0;
+}
+
+
+int ia64_esi_call (efi_guid_t guid, struct ia64_sal_retval *isrvp,
+		   enum esi_proc_type proc_type, u64 func,
+		   u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6,
+		   u64 arg7)
+{
+	struct ia64_fpreg fr[6];
+	unsigned long flags = 0;
+	int i;
+	char *p;
+
+	if (!esi_systab)
+		return -1;
+
+	p = (char *) (esi_systab + 1);
+	for (i = 0; i < esi_systab->entry_count; i++) {
+		if (*p == ESI_DESC_ENTRY_POINT) {
+			ia64_esi_desc_entry_point_t *esi = (void *)p;
+			if (!efi_guidcmp(guid, esi->guid)) {
+				ia64_sal_handler esi_proc;
+				struct pdesc pdesc;
+
+				pdesc.addr = __va(esi->esi_proc);
+				pdesc.gp = __va(esi->gp);
+
+				esi_proc = (ia64_sal_handler) &pdesc;
+
+				ia64_save_scratch_fpregs(fr);
+				if (proc_type == ESI_PROC_SERIALIZED)
+					spin_lock_irqsave(&sal_lock, flags);
+				else if (proc_type == ESI_PROC_MP_SAFE)
+					local_irq_save(flags);
+				else
+					preempt_disable();
+				*isrvp = (*esi_proc)(func, arg1, arg2, arg3,
+						     arg4, arg5, arg6, arg7);
+				if (proc_type == ESI_PROC_SERIALIZED)
+					spin_unlock_irqrestore(&sal_lock,
+							       flags);
+				else if (proc_type == ESI_PROC_MP_SAFE)
+					local_irq_restore(flags);
+				else
+					preempt_enable();
+				ia64_load_scratch_fpregs(fr);
+				return 0;
+			}
+		}
+		p += ESI_DESC_SIZE(*p);
+	}
+	return -1;
+}
+EXPORT_SYMBOL_GPL(ia64_esi_call);
+
+int ia64_esi_call_phys (efi_guid_t guid, struct ia64_sal_retval *isrvp,
+			u64 func, u64 arg1, u64 arg2, u64 arg3, u64 arg4,
+			u64 arg5, u64 arg6, u64 arg7)
+{
+	struct ia64_fpreg fr[6];
+	unsigned long flags;
+	u64 esi_params[8];
+	char *p;
+	int i;
+
+	if (!esi_systab)
+		return -1;
+
+	p = (char *) (esi_systab + 1);
+	for (i = 0; i < esi_systab->entry_count; i++) {
+		if (*p == ESI_DESC_ENTRY_POINT) {
+			ia64_esi_desc_entry_point_t *esi = (void *)p;
+			if (!efi_guidcmp(guid, esi->guid)) {
+				ia64_sal_handler esi_proc;
+				struct pdesc pdesc;
+
+				pdesc.addr = (void *)esi->esi_proc;
+				pdesc.gp = (void *)esi->gp;
+
+				esi_proc = (ia64_sal_handler) &pdesc;
+
+				esi_params[0] = func;
+				esi_params[1] = arg1;
+				esi_params[2] = arg2;
+				esi_params[3] = arg3;
+				esi_params[4] = arg4;
+				esi_params[5] = arg5;
+				esi_params[6] = arg6;
+				esi_params[7] = arg7;
+				ia64_save_scratch_fpregs(fr);
+				spin_lock_irqsave(&sal_lock, flags);
+				*isrvp = esi_call_phys(esi_proc, esi_params);
+				spin_unlock_irqrestore(&sal_lock, flags);
+				ia64_load_scratch_fpregs(fr);
+				return 0;
+			}
+		}
+		p += ESI_DESC_SIZE(*p);
+	}
+	return -1;
+}
+EXPORT_SYMBOL_GPL(ia64_esi_call_phys);
+
+static void __exit esi_exit (void)
+{
+}
+
+module_init(esi_init);
+module_exit(esi_exit);	/* makes module removable... */
diff --git a/arch/ia64/kernel/esi_stub.S b/arch/ia64/kernel/esi_stub.S
new file mode 100644
index 00000000000..6b3d6c1f99b
--- /dev/null
+++ b/arch/ia64/kernel/esi_stub.S
@@ -0,0 +1,96 @@
+/*
+ * ESI call stub.
+ *
+ * Copyright (C) 2005 Hewlett-Packard Co
+ *	Alex Williamson <alex.williamson@hp.com>
+ *
+ * Based on EFI call stub by David Mosberger.  The stub is virtually
+ * identical to the one for EFI phys-mode calls, except that ESI
+ * calls may have up to 8 arguments, so they get passed to this routine
+ * through memory.
+ *
+ * This stub allows us to make ESI calls in physical mode with interrupts
+ * turned off.  ESI calls may not support calling from virtual mode.
+ *
+ * Google for "Extensible SAL specification" for a document describing the
+ * ESI standard.
+ */
+
+/*
+ * PSR settings as per SAL spec (Chapter 8 in the "IA-64 System
+ * Abstraction Layer Specification", revision 2.6e).  Note that
+ * psr.dfl and psr.dfh MUST be cleared, despite what this manual says.
+ * Otherwise, SAL dies whenever it's trying to do an IA-32 BIOS call
+ * (the br.ia instruction fails unless psr.dfl and psr.dfh are
+ * cleared).  Fortunately, SAL promises not to touch the floating
+ * point regs, so at least we don't have to save f2-f127.
+ */
+#define PSR_BITS_TO_CLEAR						\
+	(IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT |		\
+	 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED |	\
+	 IA64_PSR_DFL | IA64_PSR_DFH)
+
+#define PSR_BITS_TO_SET							\
+	(IA64_PSR_BN)
+
+#include <asm/processor.h>
+#include <asm/asmmacro.h>
+
+/*
+ * Inputs:
+ *	in0 = address of function descriptor of ESI routine to call
+ *	in1 = address of array of ESI parameters
+ *
+ * Outputs:
+ *	r8 = result returned by called function
+ */
+GLOBAL_ENTRY(esi_call_phys)
+	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2)
+	alloc loc1=ar.pfs,2,7,8,0
+	ld8 r2=[in0],8			// load ESI function's entry point
+	mov loc0=rp
+	.body
+	;;
+	ld8 out0=[in1],8		// ESI params loaded from array
+	;;				// passing all as inputs doesn't work
+	ld8 out1=[in1],8
+	;;
+	ld8 out2=[in1],8
+	;;
+	ld8 out3=[in1],8
+	;;
+	ld8 out4=[in1],8
+	;;
+	ld8 out5=[in1],8
+	;;
+	ld8 out6=[in1],8
+	;;
+	ld8 out7=[in1]
+	mov loc2=gp			// save global pointer
+	mov loc4=ar.rsc			// save RSE configuration
+	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
+	;;
+	ld8 gp=[in0]			// load ESI function's global pointer
+	movl r16=PSR_BITS_TO_CLEAR
+	mov loc3=psr			// save processor status word
+	movl r17=PSR_BITS_TO_SET
+	;;
+	or loc3=loc3,r17
+	mov b6=r2
+	;;
+	andcm r16=loc3,r16	// get psr with IT, DT, and RT bits cleared
+	br.call.sptk.many rp=ia64_switch_mode_phys
+.ret0:	mov loc5=r19			// old ar.bsp
+	mov loc6=r20			// old sp
+	br.call.sptk.many rp=b6		// call the ESI function
+.ret1:	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
+	mov r16=loc3			// save virtual mode psr
+	mov r19=loc5			// save virtual mode bspstore
+	mov r20=loc6			// save virtual mode sp
+	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
+.ret2:	mov ar.rsc=loc4			// restore RSE configuration
+	mov ar.pfs=loc1
+	mov rp=loc0
+	mov gp=loc2
+	br.ret.sptk.many rp
+END(esi_call_phys)
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index 2ddbac6f499..abc6dee3799 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -10,6 +10,8 @@
  *			probably broke it along the way... ;-)
  * 13-Jul-04 clameter   Implement fsys_clock_gettime and revise fsys_gettimeofday to make
  *                      it capable of using memory based clocks without falling back to C code.
+ * 08-Feb-07 Fenghua Yu Implement fsys_getcpu.
+ *
  */
 
 #include <asm/asmmacro.h>
@@ -19,10 +21,10 @@
 #include <asm/thread_info.h>
 #include <asm/sal.h>
 #include <asm/signal.h>
-#include <asm/system.h>
 #include <asm/unistd.h>
 
 #include "entry.h"
+#include "paravirt_inst.h"
 
 /*
  * See Documentation/ia64/fsys.txt for details on fsyscalls.
@@ -59,80 +61,59 @@ ENTRY(fsys_getpid)
 	.prologue
 	.altrp b6
 	.body
-	add r9=TI_FLAGS+IA64_TASK_SIZE,r16
-	;;
-	ld4 r9=[r9]
-	add r8=IA64_TASK_TGID_OFFSET,r16
-	;;
-	and r9=TIF_ALLWORK_MASK,r9
-	ld4 r8=[r8]				// r8 = current->tgid
-	;;
-	cmp.ne p8,p0=0,r9
-(p8)	br.spnt.many fsys_fallback_syscall
-	FSYS_RETURN
-END(fsys_getpid)
-
-ENTRY(fsys_getppid)
-	.prologue
-	.altrp b6
-	.body
 	add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16
 	;;
 	ld8 r17=[r17]				// r17 = current->group_leader
 	add r9=TI_FLAGS+IA64_TASK_SIZE,r16
 	;;
-
 	ld4 r9=[r9]
-	add r17=IA64_TASK_REAL_PARENT_OFFSET,r17 // r17 = &current->group_leader->real_parent
+	add r17=IA64_TASK_TGIDLINK_OFFSET,r17
 	;;
 	and r9=TIF_ALLWORK_MASK,r9
-
-1:	ld8 r18=[r17]				// r18 = current->group_leader->real_parent
+	ld8 r17=[r17]				// r17 = current->group_leader->pids[PIDTYPE_PID].pid
 	;;
-	cmp.ne p8,p0=0,r9
-	add r8=IA64_TASK_TGID_OFFSET,r18	// r8 = &current->group_leader->real_parent->tgid
+	add r8=IA64_PID_LEVEL_OFFSET,r17
 	;;
-
-	/*
-	 * The .acq is needed to ensure that the read of tgid has returned its data before
-	 * we re-check "real_parent".
-	 */
-	ld4.acq r8=[r8]				// r8 = current->group_leader->real_parent->tgid
-#ifdef CONFIG_SMP
-	/*
-	 * Re-read current->group_leader->real_parent.
-	 */
-	ld8 r19=[r17]				// r19 = current->group_leader->real_parent
-(p8)	br.spnt.many fsys_fallback_syscall
+	ld4 r8=[r8]				// r8 = pid->level
+	add r17=IA64_PID_UPID_OFFSET,r17	// r17 = &pid->numbers[0]
 	;;
-	cmp.ne p6,p0=r18,r19			// did real_parent change?
-	mov r19=0			// i must not leak kernel bits...
-(p6)	br.cond.spnt.few 1b			// yes -> redo the read of tgid and the check
+	shl r8=r8,IA64_UPID_SHIFT
 	;;
-	mov r17=0			// i must not leak kernel bits...
-	mov r18=0			// i must not leak kernel bits...
-#else
-	mov r17=0			// i must not leak kernel bits...
-	mov r18=0			// i must not leak kernel bits...
-	mov r19=0			// i must not leak kernel bits...
-#endif
+	add r17=r17,r8				// r17 = &pid->numbers[pid->level]
+	;;
+	ld4 r8=[r17]				// r8 = pid->numbers[pid->level].nr
+	;;
+	mov r17=0
+	;;
+	cmp.ne p8,p0=0,r9
+(p8)	br.spnt.many fsys_fallback_syscall
 	FSYS_RETURN
-END(fsys_getppid)
+END(fsys_getpid)
 
 ENTRY(fsys_set_tid_address)
 	.prologue
 	.altrp b6
 	.body
 	add r9=TI_FLAGS+IA64_TASK_SIZE,r16
+	add r17=IA64_TASK_TGIDLINK_OFFSET,r16
 	;;
 	ld4 r9=[r9]
 	tnat.z p6,p7=r32		// check argument register for being NaT
+	ld8 r17=[r17]				// r17 = current->pids[PIDTYPE_PID].pid
 	;;
 	and r9=TIF_ALLWORK_MASK,r9
-	add r8=IA64_TASK_PID_OFFSET,r16
+	add r8=IA64_PID_LEVEL_OFFSET,r17
 	add r18=IA64_TASK_CLEAR_CHILD_TID_OFFSET,r16
 	;;
-	ld4 r8=[r8]
+	ld4 r8=[r8]				// r8 = pid->level
+	add r17=IA64_PID_UPID_OFFSET,r17	// r17 = &pid->numbers[0]
+	;;
+	shl r8=r8,IA64_UPID_SHIFT
+	;;
+	add r17=r17,r8				// r17 = &pid->numbers[pid->level]
+	;;
+	ld4 r8=[r17]				// r8 = pid->numbers[pid->level].nr
+	;;
 	cmp.ne p8,p0=0,r9
 	mov r17=-1
 	;;
@@ -145,12 +126,11 @@ ENTRY(fsys_set_tid_address)
 	FSYS_RETURN
 END(fsys_set_tid_address)
 
-/*
- * Ensure that the time interpolator structure is compatible with the asm code
- */
-#if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \
-	|| IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4
-#error fsys_gettimeofday incompatible with changes to struct time_interpolator
+#if IA64_GTOD_SEQ_OFFSET !=0
+#error fsys_gettimeofday incompatible with changes to struct fsyscall_gtod_data_t
+#endif
+#if IA64_ITC_JITTER_OFFSET !=0
+#error fsys_gettimeofday incompatible with changes to struct itc_jitter_data_t
 #endif
 #define CLOCK_REALTIME 0
 #define CLOCK_MONOTONIC 1
@@ -177,124 +157,120 @@ ENTRY(fsys_gettimeofday)
 	// r11 = preserved: saved ar.pfs
 	// r12 = preserved: memory stack
 	// r13 = preserved: thread pointer
-	// r14 = address of mask / mask
+	// r14 = address of mask / mask value
 	// r15 = preserved: system call number
 	// r16 = preserved: current task pointer
-	// r17 = wall to monotonic use
-	// r18 = time_interpolator->offset
-	// r19 = address of wall_to_monotonic
-	// r20 = pointer to struct time_interpolator / pointer to time_interpolator->address
-	// r21 = shift factor
-	// r22 = address of time interpolator->last_counter
-	// r23 = address of time_interpolator->last_cycle
-	// r24 = adress of time_interpolator->offset
-	// r25 = last_cycle value
-	// r26 = last_counter value
-	// r27 = pointer to xtime
+	// r17 = (not used)
+	// r18 = (not used)
+	// r19 = address of itc_lastcycle
+	// r20 = struct fsyscall_gtod_data (= address of gtod_lock.sequence)
+	// r21 = address of mmio_ptr
+	// r22 = address of wall_time or monotonic_time
+	// r23 = address of shift / value
+	// r24 = address mult factor / cycle_last value
+	// r25 = itc_lastcycle value
+	// r26 = address clocksource cycle_last
+	// r27 = (not used)
 	// r28 = sequence number at the beginning of critcal section
-	// r29 = address of seqlock
+	// r29 = address of itc_jitter
 	// r30 = time processing flags / memory address
 	// r31 = pointer to result
 	// Predicates
 	// p6,p7 short term use
 	// p8 = timesource ar.itc
 	// p9 = timesource mmio64
-	// p10 = timesource mmio32
+	// p10 = timesource mmio32 - not used
 	// p11 = timesource not to be handled by asm code
-	// p12 = memory time source ( = p9 | p10)
-	// p13 = do cmpxchg with time_interpolator_last_cycle
+	// p12 = memory time source ( = p9 | p10) - not used
+	// p13 = do cmpxchg with itc_lastcycle
 	// p14 = Divide by 1000
 	// p15 = Add monotonic
 	//
-	// Note that instructions are optimized for McKinley. McKinley can process two
-	// bundles simultaneously and therefore we continuously try to feed the CPU
-	// two bundles and then a stop.
-	tnat.nz p6,p0 = r31	// branch deferred since it does not fit into bundle structure
-	mov pr = r30,0xc000	// Set predicates according to function
+	// Note that instructions are optimized for McKinley. McKinley can
+	// process two bundles simultaneously and therefore we continuously
+	// try to feed the CPU two bundles and then a stop.
+
 	add r2 = TI_FLAGS+IA64_TASK_SIZE,r16
-	movl r20 = time_interpolator
-	;;
-	ld8 r20 = [r20]		// get pointer to time_interpolator structure
-	movl r29 = xtime_lock
-	ld4 r2 = [r2]		// process work pending flags
-	movl r27 = xtime
-	;;	// only one bundle here
-	ld8 r21 = [r20]		// first quad with control information
+	tnat.nz p6,p0 = r31		// guard against Nat argument
+(p6)	br.cond.spnt.few .fail_einval
+	movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address
+	;;
+	ld4 r2 = [r2]			// process work pending flags
+	movl r29 = itc_jitter_data	// itc_jitter
+	add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20	// wall_time
+	add r21 = IA64_CLKSRC_MMIO_OFFSET,r20
+	mov pr = r30,0xc000	// Set predicates according to function
+	;;
 	and r2 = TIF_ALLWORK_MASK,r2
-(p6)    br.cond.spnt.few .fail_einval	// deferred branch
+	add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29
+(p15)	add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20	// monotonic_time
 	;;
-	add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20
-	extr r3 = r21,32,32	// time_interpolator->nsec_per_cyc
-	extr r8 = r21,0,16	// time_interpolator->source
+	add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20	// clksrc_cycle_last
 	cmp.ne p6, p0 = 0, r2	// Fallback if work is scheduled
-(p6)    br.cond.spnt.many fsys_fallback_syscall
+(p6)	br.cond.spnt.many fsys_fallback_syscall
 	;;
-	cmp.eq p8,p12 = 0,r8	// Check for cpu timer
-	cmp.eq p9,p0 = 1,r8	// MMIO64 ?
-	extr r2 = r21,24,8	// time_interpolator->jitter
-	cmp.eq p10,p0 = 2,r8	// MMIO32 ?
-	cmp.ltu p11,p0 = 2,r8	// function or other clock
-(p11)	br.cond.spnt.many fsys_fallback_syscall
+	// Begin critical section
+.time_redo:
+	ld4.acq r28 = [r20]	// gtod_lock.sequence, Must take first
 	;;
-	setf.sig f7 = r3	// Setup for scaling of counter
-(p15)	movl r19 = wall_to_monotonic
-(p12)	ld8 r30 = [r10]
-	cmp.ne p13,p0 = r2,r0	// need jitter compensation?
-	extr r21 = r21,16,8	// shift factor
+	and r28 = ~1,r28	// And make sequence even to force retry if odd
 	;;
-.time_redo:
-	.pred.rel.mutex p8,p9,p10
-	ld4.acq r28 = [r29]	// xtime_lock.sequence. Must come first for locking purposes
-(p8)	mov r2 = ar.itc		// CPU_TIMER. 36 clocks latency!!!
-	add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20
-(p9)	ld8 r2 = [r30]		// readq(ti->address). Could also have latency issues..
-(p10)	ld4 r2 = [r30]		// readw(ti->address)
-(p13)	add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20
-	;;			// could be removed by moving the last add upward
-	ld8 r26 = [r22]		// time_interpolator->last_counter
-(p13)	ld8 r25 = [r23]		// time interpolator->last_cycle
-	add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20
-(p15)	ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET
- 	ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET
-	add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20
-	;;
-	ld8 r18 = [r24]		// time_interpolator->offset
-	ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET	// xtime.tv_nsec
-(p13)	sub r3 = r25,r2	// Diff needed before comparison (thanks davidm)
-	;;
-	ld8 r14 = [r14]		// time_interpolator->mask
-(p13)	cmp.gt.unc p6,p7 = r3,r0	// check if it is less than last. p6,p7 cleared
-	sub r10 = r2,r26	// current_counter - last_counter
-	;;
-(p6)	sub r10 = r25,r26	// time we got was less than last_cycle
+	ld8 r30 = [r21]		// clocksource->mmio_ptr
+	add r24 = IA64_CLKSRC_MULT_OFFSET,r20
+	ld4 r2 = [r29]		// itc_jitter value
+	add r23 = IA64_CLKSRC_SHIFT_OFFSET,r20
+	add r14 = IA64_CLKSRC_MASK_OFFSET,r20
+	;;
+	ld4 r3 = [r24]		// clocksource mult value
+	ld8 r14 = [r14]         // clocksource mask value
+	cmp.eq p8,p9 = 0,r30	// use cpu timer if no mmio_ptr
+	;;
+	setf.sig f7 = r3	// Setup for mult scaling of counter
+(p8)	cmp.ne p13,p0 = r2,r0	// need itc_jitter compensation, set p13
+	ld4 r23 = [r23]		// clocksource shift value
+	ld8 r24 = [r26]		// get clksrc_cycle_last value
+(p9)	cmp.eq p13,p0 = 0,r30	// if mmio_ptr, clear p13 jitter control
+	;;
+	.pred.rel.mutex p8,p9
+	MOV_FROM_ITC(p8, p6, r2, r10)	// CPU_TIMER. 36 clocks latency!!!
+(p9)	ld8 r2 = [r30]		// MMIO_TIMER. Could also have latency issues..
+(p13)	ld8 r25 = [r19]		// get itc_lastcycle value
+	ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET	// tv_sec
+	;;
+	ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET	// tv_nsec
+(p13)	sub r3 = r25,r2		// Diff needed before comparison (thanks davidm)
+	;;
+(p13)	cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared
+	sub r10 = r2,r24	// current_cycle - last_cycle
+	;;
+(p6)	sub r10 = r25,r24	// time we got was less than last_cycle
 (p7)	mov ar.ccv = r25	// more than last_cycle. Prep for cmpxchg
 	;;
+(p7)	cmpxchg8.rel r3 = [r19],r2,ar.ccv
+	;;
+(p7)	cmp.ne p7,p0 = r25,r3	// if cmpxchg not successful
+	;;
+(p7)	sub r10 = r3,r24	// then use new last_cycle instead
+	;;
 	and r10 = r10,r14	// Apply mask
 	;;
 	setf.sig f8 = r10
 	nop.i 123
 	;;
-(p7)	cmpxchg8.rel r3 = [r23],r2,ar.ccv
-EX(.fail_efault, probe.w.fault r31, 3)	// This takes 5 cycles and we have spare time
+	// fault check takes 5 cycles and we have spare time
+EX(.fail_efault, probe.w.fault r31, 3)
 	xmpy.l f8 = f8,f7	// nsec_per_cyc*(counter-last_counter)
-(p15)	add r9 = r9,r17		// Add wall to monotonic.secs to result secs
 	;;
-(p15)	ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET
-(p7)	cmp.ne p7,p0 = r25,r3	// if cmpxchg not successful redo
-	// simulate tbit.nz.or p7,p0 = r28,0
-	and r28 = ~1,r28	// Make sequence even to force retry if odd
 	getf.sig r2 = f8
 	mf
-	add r8 = r8,r18		// Add time interpolator offset
 	;;
-	ld4 r10 = [r29]		// xtime_lock.sequence
-(p15)	add r8 = r8, r17	// Add monotonic.nsecs to nsecs
-	shr.u r2 = r2,r21
-	;;		// overloaded 3 bundles!
-	// End critical section.
+	ld4 r10 = [r20]		// gtod_lock.sequence
+	shr.u r2 = r2,r23	// shift by factor
+	;;
 	add r8 = r8,r2		// Add xtime.nsecs
-	cmp4.ne.or p7,p0 = r28,r10
-(p7)	br.cond.dpnt.few .time_redo	// sequence number changed ?
+	cmp4.ne p7,p0 = r28,r10
+(p7)	br.cond.dpnt.few .time_redo	// sequence number changed, redo
+	// End critical section.
 	// Now r8=tv->tv_nsec and r9=tv->tv_sec
 	mov r10 = r0
 	movl r2 = 1000000000
@@ -304,23 +280,23 @@ EX(.fail_efault, probe.w.fault r31, 3)	// This takes 5 cycles and we have spare
 .time_normalize:
 	mov r21 = r8
 	cmp.ge p6,p0 = r8,r2
-(p14)	shr.u r20 = r8, 3		// We can repeat this if necessary just wasting some time
+(p14)	shr.u r20 = r8, 3 // We can repeat this if necessary just wasting time
 	;;
 (p14)	setf.sig f8 = r20
 (p6)	sub r8 = r8,r2
-(p6)	add r9 = 1,r9			// two nops before the branch.
-(p14)	setf.sig f7 = r3		// Chances for repeats are 1 in 10000 for gettod
+(p6)	add r9 = 1,r9		// two nops before the branch.
+(p14)	setf.sig f7 = r3	// Chances for repeats are 1 in 10000 for gettod
 (p6)	br.cond.dpnt.few .time_normalize
 	;;
 	// Divided by 8 though shift. Now divide by 125
 	// The compiler was able to do that with a multiply
 	// and a shift and we do the same
-EX(.fail_efault, probe.w.fault r23, 3)		// This also costs 5 cycles
-(p14)	xmpy.hu f8 = f8, f7			// xmpy has 5 cycles latency so use it...
+EX(.fail_efault, probe.w.fault r23, 3)	// This also costs 5 cycles
+(p14)	xmpy.hu f8 = f8, f7		// xmpy has 5 cycles latency so use it
 	;;
-	mov r8 = r0
 (p14)	getf.sig r2 = f8
 	;;
+	mov r8 = r0
 (p14)	shr.u r21 = r2, 4
 	;;
 EX(.fail_efault, st8 [r31] = r9)
@@ -349,161 +325,61 @@ ENTRY(fsys_clock_gettime)
 END(fsys_clock_gettime)
 
 /*
- * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize).
+ * fsys_getcpu doesn't use the third parameter in this implementation. It reads
+ * current_thread_info()->cpu and corresponding node in cpu_to_node_map.
  */
-#if _NSIG_WORDS != 1
-# error Sorry, fsys_rt_sigprocmask() needs to be updated for _NSIG_WORDS != 1.
-#endif
-ENTRY(fsys_rt_sigprocmask)
+ENTRY(fsys_getcpu)
 	.prologue
 	.altrp b6
 	.body
-
-	add r2=IA64_TASK_BLOCKED_OFFSET,r16
-	add r9=TI_FLAGS+IA64_TASK_SIZE,r16
-	cmp4.ltu p6,p0=SIG_SETMASK,r32
-
-	cmp.ne p15,p0=r0,r34			// oset != NULL?
-	tnat.nz p8,p0=r34
-	add r31=IA64_TASK_SIGHAND_OFFSET,r16
 	;;
-	ld8 r3=[r2]				// read/prefetch current->blocked
-	ld4 r9=[r9]
-	tnat.nz.or p6,p0=r35
-
-	cmp.ne.or p6,p0=_NSIG_WORDS*8,r35
-	tnat.nz.or p6,p0=r32
-(p6)	br.spnt.few .fail_einval		// fail with EINVAL
+	add r2=TI_FLAGS+IA64_TASK_SIZE,r16
+	tnat.nz p6,p0 = r32			// guard against NaT argument
+	add r3=TI_CPU+IA64_TASK_SIZE,r16
 	;;
-#ifdef CONFIG_SMP
-	ld8 r31=[r31]				// r31 <- current->sighand
-#endif
-	and r9=TIF_ALLWORK_MASK,r9
-	tnat.nz.or p8,p0=r33
-	;;
-	cmp.ne p7,p0=0,r9
-	cmp.eq p6,p0=r0,r33			// set == NULL?
-	add r31=IA64_SIGHAND_SIGLOCK_OFFSET,r31	// r31 <- current->sighand->siglock
-(p8)	br.spnt.few .fail_efault		// fail with EFAULT
-(p7)	br.spnt.many fsys_fallback_syscall	// got pending kernel work...
-(p6)	br.dpnt.many .store_mask		// -> short-circuit to just reading the signal mask
-
-	/* Argh, we actually have to do some work and _update_ the signal mask: */
-
-EX(.fail_efault, probe.r.fault r33, 3)		// verify user has read-access to *set
-EX(.fail_efault, ld8 r14=[r33])			// r14 <- *set
-	mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1))
+	ld4 r3=[r3]				// M r3 = thread_info->cpu
+	ld4 r2=[r2]				// M r2 = thread_info->flags
+(p6)    br.cond.spnt.few .fail_einval		// B
 	;;
-
-	rsm psr.i				// mask interrupt delivery
-	mov ar.ccv=0
-	andcm r14=r14,r17			// filter out SIGKILL & SIGSTOP
-
-#ifdef CONFIG_SMP
-	mov r17=1
+	tnat.nz p7,p0 = r33			// I guard against NaT argument
+(p7)    br.cond.spnt.few .fail_einval		// B
 	;;
-	cmpxchg4.acq r18=[r31],r17,ar.ccv	// try to acquire the lock
-	mov r8=EINVAL			// default to EINVAL
+	cmp.ne p6,p0=r32,r0
+	cmp.ne p7,p0=r33,r0
 	;;
-	ld8 r3=[r2]			// re-read current->blocked now that we hold the lock
-	cmp4.ne p6,p0=r18,r0
-(p6)	br.cond.spnt.many .lock_contention
+#ifdef CONFIG_NUMA
+	movl r17=cpu_to_node_map
 	;;
-#else
-	ld8 r3=[r2]			// re-read current->blocked now that we hold the lock
-	mov r8=EINVAL			// default to EINVAL
-#endif
-	add r18=IA64_TASK_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r16
-	add r19=IA64_TASK_SIGNAL_OFFSET,r16
-	cmp4.eq p6,p0=SIG_BLOCK,r32
-	;;
-	ld8 r19=[r19]			// r19 <- current->signal
-	cmp4.eq p7,p0=SIG_UNBLOCK,r32
-	cmp4.eq p8,p0=SIG_SETMASK,r32
-	;;
-	ld8 r18=[r18]			// r18 <- current->pending.signal
-	.pred.rel.mutex p6,p7,p8
-(p6)	or r14=r3,r14			// SIG_BLOCK
-(p7)	andcm r14=r3,r14		// SIG_UNBLOCK
-
-(p8)	mov r14=r14			// SIG_SETMASK
-(p6)	mov r8=0			// clear error code
-	// recalc_sigpending()
-	add r17=IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,r19
-
-	add r19=IA64_SIGNAL_SHARED_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r19
+EX(.fail_efault, (p6) probe.w.fault r32, 3)		// M This takes 5 cycles
+EX(.fail_efault, (p7) probe.w.fault r33, 3)		// M This takes 5 cycles
+	shladd r18=r3,1,r17
 	;;
-	ld4 r17=[r17]		// r17 <- current->signal->group_stop_count
-(p7)	mov r8=0		// clear error code
-
-	ld8 r19=[r19]		// r19 <- current->signal->shared_pending
+	ld2 r20=[r18]				// r20 = cpu_to_node_map[cpu]
+	and r2 = TIF_ALLWORK_MASK,r2
 	;;
-	cmp4.gt p6,p7=r17,r0	// p6/p7 <- (current->signal->group_stop_count > 0)?
-(p8)	mov r8=0		// clear error code
-
-	or r18=r18,r19		// r18 <- current->pending | current->signal->shared_pending
+	cmp.ne p8,p0=0,r2
+(p8)	br.spnt.many fsys_fallback_syscall
 	;;
-	// r18 <- (current->pending | current->signal->shared_pending) & ~current->blocked:
-	andcm r18=r18,r14
-	add r9=TI_FLAGS+IA64_TASK_SIZE,r16
 	;;
-
-(p7)	cmp.ne.or.andcm p6,p7=r18,r0		// p6/p7 <- signal pending
-	mov r19=0					// i must not leak kernel bits...
-(p6)	br.cond.dpnt.many .sig_pending
+EX(.fail_efault, (p6) st4 [r32] = r3)
+EX(.fail_efault, (p7) st2 [r33] = r20)
+	mov r8=0
 	;;
-
-1:	ld4 r17=[r9]				// r17 <- current->thread_info->flags
+#else
+EX(.fail_efault, (p6) probe.w.fault r32, 3)		// M This takes 5 cycles
+EX(.fail_efault, (p7) probe.w.fault r33, 3)		// M This takes 5 cycles
+	and r2 = TIF_ALLWORK_MASK,r2
 	;;
-	mov ar.ccv=r17
-	and r18=~_TIF_SIGPENDING,r17		// r18 <- r17 & ~(1 << TIF_SIGPENDING)
+	cmp.ne p8,p0=0,r2
+(p8)	br.spnt.many fsys_fallback_syscall
 	;;
-
-	st8 [r2]=r14				// update current->blocked with new mask
-	cmpxchg4.acq r8=[r9],r18,ar.ccv		// current->thread_info->flags <- r18
+EX(.fail_efault, (p6) st4 [r32] = r3)
+EX(.fail_efault, (p7) st2 [r33] = r0)
+	mov r8=0
 	;;
-	cmp.ne p6,p0=r17,r8			// update failed?
-(p6)	br.cond.spnt.few 1b			// yes -> retry
-
-#ifdef CONFIG_SMP
-	st4.rel [r31]=r0			// release the lock
 #endif
-	ssm psr.i
-	;;
-
-	srlz.d					// ensure psr.i is set again
-	mov r18=0					// i must not leak kernel bits...
-
-.store_mask:
-EX(.fail_efault, (p15) probe.w.fault r34, 3)	// verify user has write-access to *oset
-EX(.fail_efault, (p15) st8 [r34]=r3)
-	mov r2=0					// i must not leak kernel bits...
-	mov r3=0					// i must not leak kernel bits...
-	mov r8=0				// return 0
-	mov r9=0					// i must not leak kernel bits...
-	mov r14=0					// i must not leak kernel bits...
-	mov r17=0					// i must not leak kernel bits...
-	mov r31=0					// i must not leak kernel bits...
 	FSYS_RETURN
-
-.sig_pending:
-#ifdef CONFIG_SMP
-	st4.rel [r31]=r0			// release the lock
-#endif
-	ssm psr.i
-	;;
-	srlz.d
-	br.sptk.many fsys_fallback_syscall	// with signal pending, do the heavy-weight syscall
-
-#ifdef CONFIG_SMP
-.lock_contention:
-	/* Rather than spinning here, fall back on doing a heavy-weight syscall.  */
-	ssm psr.i
-	;;
-	srlz.d
-	br.sptk.many fsys_fallback_syscall
-#endif
-END(fsys_rt_sigprocmask)
+END(fsys_getcpu)
 
 ENTRY(fsys_fallback_syscall)
 	.prologue
@@ -516,17 +392,17 @@ ENTRY(fsys_fallback_syscall)
 	adds r17=-1024,r15
 	movl r14=sys_call_table
 	;;
-	rsm psr.i
+	RSM_PSR_I(p0, r26, r27)
 	shladd r18=r17,3,r14
 	;;
 	ld8 r18=[r18]				// load normal (heavy-weight) syscall entry-point
-	mov r29=psr				// read psr (12 cyc load latency)
+	MOV_FROM_PSR(p0, r29, r26)		// read psr (12 cyc load latency)
 	mov r27=ar.rsc
 	mov r21=ar.fpsr
 	mov r26=ar.pfs
 END(fsys_fallback_syscall)
 	/* FALL THROUGH */
-GLOBAL_ENTRY(fsys_bubble_down)
+GLOBAL_ENTRY(paravirt_fsys_bubble_down)
 	.prologue
 	.altrp b6
 	.body
@@ -564,7 +440,7 @@ GLOBAL_ENTRY(fsys_bubble_down)
 	 *
 	 * PSR.BE : already is turned off in __kernel_syscall_via_epc()
 	 * PSR.AC : don't care (kernel normally turns PSR.AC on)
-	 * PSR.I  : already turned off by the time fsys_bubble_down gets
+	 * PSR.I  : already turned off by the time paravirt_fsys_bubble_down gets
 	 *	    invoked
 	 * PSR.DFL: always 0 (kernel never turns it on)
 	 * PSR.DFH: don't care --- kernel never touches f32-f127 on its own
@@ -574,7 +450,7 @@ GLOBAL_ENTRY(fsys_bubble_down)
 	 * PSR.DB : don't care --- kernel never enables kernel-level
 	 *	    breakpoints
 	 * PSR.TB : must be 0 already; if it wasn't zero on entry to
-	 *          __kernel_syscall_via_epc, the branch to fsys_bubble_down
+	 *          __kernel_syscall_via_epc, the branch to paravirt_fsys_bubble_down
 	 *          will trigger a taken branch; the taken-trap-handler then
 	 *          converts the syscall into a break-based system-call.
 	 */
@@ -606,7 +482,11 @@ GLOBAL_ENTRY(fsys_bubble_down)
 	nop.i 0
 	;;
 	mov ar.rsc=0				// M2   set enforced lazy mode, pl 0, LE, loadrs=0
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	MOV_FROM_ITC(p0, p6, r30, r23)		// M    get cycle for accounting
+#else
 	nop.m 0
+#endif
 	nop.i 0
 	;;
 	mov r23=ar.bspstore			// M2 (12 cyc) save ar.bspstore
@@ -628,25 +508,47 @@ GLOBAL_ENTRY(fsys_bubble_down)
 	cmp.ne pKStk,pUStk=r0,r0		// A    set pKStk <- 0, pUStk <- 1
 	br.call.sptk.many b7=ia64_syscall_setup	// B
 	;;
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	// mov.m r30=ar.itc is called in advance
+	add r16=TI_AC_STAMP+IA64_TASK_SIZE,r2
+	add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r2
+	;;
+	ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP	// time at last check in kernel
+	ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE	// time at leave kernel
+	;;
+	ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME	// cumulated stime
+	ld8 r21=[r17]				// cumulated utime
+	sub r22=r19,r18				// stime before leave kernel
+	;;
+	st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP	// update stamp
+	sub r18=r30,r19				// elapsed time in user mode
+	;;
+	add r20=r20,r22				// sum stime
+	add r21=r21,r18				// sum utime
+	;;
+	st8 [r16]=r20				// update stime
+	st8 [r17]=r21				// update utime
+	;;
+#endif
 	mov ar.rsc=0x3				// M2   set eager mode, pl 0, LE, loadrs=0
 	mov rp=r14				// I0   set the real return addr
 	and r3=_TIF_SYSCALL_TRACEAUDIT,r3	// A
 	;;
-	ssm psr.i				// M2   we're on kernel stacks now, reenable irqs
+	SSM_PSR_I(p0, p6, r22)			// M2   we're on kernel stacks now, reenable irqs
 	cmp.eq p8,p0=r3,r0			// A
 (p10)	br.cond.spnt.many ia64_ret_from_syscall	// B    return if bad call-frame or r15 is a NaT
 
 	nop.m 0
 (p8)	br.call.sptk.many b6=b6			// B    (ignore return address)
 	br.cond.spnt ia64_trace_syscall		// B
-END(fsys_bubble_down)
+END(paravirt_fsys_bubble_down)
 
 	.rodata
 	.align 8
-	.globl fsyscall_table
+	.globl paravirt_fsyscall_table
 
-	data8 fsys_bubble_down
-fsyscall_table:
+	data8 paravirt_fsys_bubble_down
+paravirt_fsyscall_table:
 	data8 fsys_ni_syscall
 	data8 0				// exit			// 1025
 	data8 0				// read
@@ -665,7 +567,7 @@ fsyscall_table:
 	data8 0				// chown
 	data8 0				// lseek		// 1040
 	data8 fsys_getpid		// getpid
-	data8 fsys_getppid		// getppid
+	data8 0				// getppid
 	data8 0				// mount
 	data8 0				// umount
 	data8 0				// setuid		// 1045
@@ -802,7 +704,7 @@ fsyscall_table:
 	data8 0				// sigaltstack
 	data8 0				// rt_sigaction
 	data8 0				// rt_sigpending
-	data8 fsys_rt_sigprocmask	// rt_sigprocmask
+	data8 0				// rt_sigprocmask
 	data8 0				// rt_sigqueueinfo	// 1180
 	data8 0				// rt_sigreturn
 	data8 0				// rt_sigsuspend
@@ -882,9 +784,9 @@ fsyscall_table:
 	data8 0				// clock_nanosleep
 	data8 0				// fstatfs64
 	data8 0				// statfs64
-	data8 0
-	data8 0							// 1260
-	data8 0
+	data8 0				// mbind
+	data8 0				// get_mempolicy	// 1260
+	data8 0				// set_mempolicy
 	data8 0				// mq_open
 	data8 0				// mq_unlink
 	data8 0				// mq_timedsend
@@ -892,16 +794,43 @@ fsyscall_table:
 	data8 0				// mq_notify
 	data8 0				// mq_getsetattr
 	data8 0				// kexec_load
+	data8 0				// vserver
+	data8 0				// waitid		// 1270
+	data8 0				// add_key
+	data8 0				// request_key
+	data8 0				// keyctl
+	data8 0				// ioprio_set
+	data8 0				// ioprio_get		// 1275
+	data8 0				// move_pages
+	data8 0				// inotify_init
+	data8 0				// inotify_add_watch
+	data8 0				// inotify_rm_watch
+	data8 0				// migrate_pages	// 1280
+	data8 0				// openat
+	data8 0				// mkdirat
+	data8 0				// mknodat
+	data8 0				// fchownat
+	data8 0				// futimesat		// 1285
+	data8 0				// newfstatat
+	data8 0				// unlinkat
+	data8 0				// renameat
+	data8 0				// linkat
+	data8 0				// symlinkat		// 1290
+	data8 0				// readlinkat
+	data8 0				// fchmodat
+	data8 0				// faccessat
 	data8 0
-	data8 0							// 1270
-	data8 0
-	data8 0
-	data8 0
-	data8 0
-	data8 0							// 1275
-	data8 0
-	data8 0
-	data8 0
+	data8 0							// 1295
+	data8 0				// unshare
+	data8 0				// splice
+	data8 0				// set_robust_list
+	data8 0				// get_robust_list
+	data8 0				// sync_file_range	// 1300
+	data8 0				// tee
+	data8 0				// vmsplice
 	data8 0
+	data8 fsys_getcpu		// getcpu		// 1304
 
-	.org fsyscall_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
+	// fill in zeros for the remaining entries
+	.zero:
+	.space paravirt_fsyscall_table + 8*NR_syscalls - .zero, 0
diff --git a/arch/ia64/kernel/fsyscall_gtod_data.h b/arch/ia64/kernel/fsyscall_gtod_data.h
new file mode 100644
index 00000000000..146b15b5fec
--- /dev/null
+++ b/arch/ia64/kernel/fsyscall_gtod_data.h
@@ -0,0 +1,23 @@
+/*
+ * (c) Copyright 2007 Hewlett-Packard Development Company, L.P.
+ *        Contributed by Peter Keilty <peter.keilty@hp.com>
+ *
+ * fsyscall gettimeofday data
+ */
+
+struct fsyscall_gtod_data_t {
+	seqcount_t	seq;
+	struct timespec	wall_time;
+	struct timespec monotonic_time;
+	cycle_t		clk_mask;
+	u32		clk_mult;
+	u32		clk_shift;
+	void		*clk_fsys_mmio;
+	cycle_t		clk_cycle_last;
+} ____cacheline_aligned;
+
+struct itc_jitter_data_t {
+	int		itc_jitter;
+	cycle_t		itc_lastcycle;
+} ____cacheline_aligned;
+
diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c
new file mode 100644
index 00000000000..3b0c2aa0785
--- /dev/null
+++ b/arch/ia64/kernel/ftrace.c
@@ -0,0 +1,204 @@
+/*
+ * Dynamic function tracing support.
+ *
+ * Copyright (C) 2008 Shaohua Li <shaohua.li@intel.com>
+ *
+ * For licencing details, see COPYING.
+ *
+ * Defines low-level handling of mcount calls when the kernel
+ * is compiled with the -pg flag. When using dynamic ftrace, the
+ * mcount call-sites get patched lazily with NOP till they are
+ * enabled. All code mutation routines here take effect atomically.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+
+#include <asm/cacheflush.h>
+#include <asm/patch.h>
+
+/* In IA64, each function will be added below two bundles with -pg option */
+static unsigned char __attribute__((aligned(8)))
+ftrace_orig_code[MCOUNT_INSN_SIZE] = {
+	0x02, 0x40, 0x31, 0x10, 0x80, 0x05, /* alloc r40=ar.pfs,12,8,0 */
+	0xb0, 0x02, 0x00, 0x00, 0x42, 0x40, /* mov r43=r0;; */
+	0x05, 0x00, 0xc4, 0x00,             /* mov r42=b0 */
+	0x11, 0x48, 0x01, 0x02, 0x00, 0x21, /* mov r41=r1 */
+	0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* nop.i 0x0 */
+	0x08, 0x00, 0x00, 0x50              /* br.call.sptk.many b0 = _mcount;; */
+};
+
+struct ftrace_orig_insn {
+	u64 dummy1, dummy2, dummy3;
+	u64 dummy4:64-41+13;
+	u64 imm20:20;
+	u64 dummy5:3;
+	u64 sign:1;
+	u64 dummy6:4;
+};
+
+/* mcount stub will be converted below for nop */
+static unsigned char ftrace_nop_code[MCOUNT_INSN_SIZE] = {
+	0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */
+	0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */
+	0x00, 0x00, 0x04, 0x00,             /* nop.i 0x0 */
+	0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* nop.x 0x0;; */
+	0x00, 0x00, 0x04, 0x00
+};
+
+static unsigned char *ftrace_nop_replace(void)
+{
+	return ftrace_nop_code;
+}
+
+/*
+ * mcount stub will be converted below for call
+ * Note: Just the last instruction is changed against nop
+ * */
+static unsigned char __attribute__((aligned(8)))
+ftrace_call_code[MCOUNT_INSN_SIZE] = {
+	0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */
+	0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */
+	0x00, 0x00, 0x04, 0x00,             /* nop.i 0x0 */
+	0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */
+	0xff, 0xff, 0xff, 0xff, 0x7f, 0x00, /* brl.many .;;*/
+	0xf8, 0xff, 0xff, 0xc8
+};
+
+struct ftrace_call_insn {
+	u64 dummy1, dummy2;
+	u64 dummy3:48;
+	u64 imm39_l:16;
+	u64 imm39_h:23;
+	u64 dummy4:13;
+	u64 imm20:20;
+	u64 dummy5:3;
+	u64 i:1;
+	u64 dummy6:4;
+};
+
+static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+	struct ftrace_call_insn *code = (void *)ftrace_call_code;
+	unsigned long offset = addr - (ip + 0x10);
+
+	code->imm39_l = offset >> 24;
+	code->imm39_h = offset >> 40;
+	code->imm20 = offset >> 4;
+	code->i = offset >> 63;
+	return ftrace_call_code;
+}
+
+static int
+ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+		   unsigned char *new_code, int do_check)
+{
+	unsigned char replaced[MCOUNT_INSN_SIZE];
+
+	/*
+	 * Note: Due to modules and __init, code can
+	 *  disappear and change, we need to protect against faulting
+	 *  as well as code changing. We do this by using the
+	 *  probe_kernel_* functions.
+	 *
+	 * No real locking needed, this code is run through
+	 * kstop_machine, or before SMP starts.
+	 */
+
+	if (!do_check)
+		goto skip_check;
+
+	/* read the text we want to modify */
+	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* Make sure it is what we expect it to be */
+	if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+		return -EINVAL;
+
+skip_check:
+	/* replace the text with the new text */
+	if (probe_kernel_write(((void *)ip), new_code, MCOUNT_INSN_SIZE))
+		return -EPERM;
+	flush_icache_range(ip, ip + MCOUNT_INSN_SIZE);
+
+	return 0;
+}
+
+static int ftrace_make_nop_check(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned char __attribute__((aligned(8))) replaced[MCOUNT_INSN_SIZE];
+	unsigned long ip = rec->ip;
+
+	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+	if (rec->flags & FTRACE_FL_CONVERTED) {
+		struct ftrace_call_insn *call_insn, *tmp_call;
+
+		call_insn = (void *)ftrace_call_code;
+		tmp_call = (void *)replaced;
+		call_insn->imm39_l = tmp_call->imm39_l;
+		call_insn->imm39_h = tmp_call->imm39_h;
+		call_insn->imm20 = tmp_call->imm20;
+		call_insn->i = tmp_call->i;
+		if (memcmp(replaced, ftrace_call_code, MCOUNT_INSN_SIZE) != 0)
+			return -EINVAL;
+		return 0;
+	} else {
+		struct ftrace_orig_insn *call_insn, *tmp_call;
+
+		call_insn = (void *)ftrace_orig_code;
+		tmp_call = (void *)replaced;
+		call_insn->sign = tmp_call->sign;
+		call_insn->imm20 = tmp_call->imm20;
+		if (memcmp(replaced, ftrace_orig_code, MCOUNT_INSN_SIZE) != 0)
+			return -EINVAL;
+		return 0;
+	}
+}
+
+int ftrace_make_nop(struct module *mod,
+		    struct dyn_ftrace *rec, unsigned long addr)
+{
+	int ret;
+	char *new;
+
+	ret = ftrace_make_nop_check(rec, addr);
+	if (ret)
+		return ret;
+	new = ftrace_nop_replace();
+	return ftrace_modify_code(rec->ip, NULL, new, 0);
+}
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned char *old, *new;
+
+	old=  ftrace_nop_replace();
+	new = ftrace_call_replace(ip, addr);
+	return ftrace_modify_code(ip, old, new, 1);
+}
+
+/* in IA64, _mcount can't directly call ftrace_stub. Only jump is ok */
+int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	unsigned long ip;
+	unsigned long addr = ((struct fnptr *)ftrace_call)->ip;
+
+	if (func == ftrace_stub)
+		return 0;
+	ip = ((struct fnptr *)func)->ip;
+
+	ia64_patch_imm64(addr + 2, ip);
+
+	flush_icache_range(addr, addr + 16);
+	return 0;
+}
+
+/* run from kstop_machine */
+int __init ftrace_dyn_arch_init(void)
+{
+	return 0;
+}
diff --git a/arch/ia64/kernel/gate-data.S b/arch/ia64/kernel/gate-data.S
index 258c0a3238f..b3ef1c72e13 100644
--- a/arch/ia64/kernel/gate-data.S
+++ b/arch/ia64/kernel/gate-data.S
@@ -1,3 +1,3 @@
-	.section .data.gate, "aw"
+	.section .data..gate, "aw"
 
 	.incbin "arch/ia64/kernel/gate.so"
diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S
index 86064ca9895..b5f8bdd8618 100644
--- a/arch/ia64/kernel/gate.S
+++ b/arch/ia64/kernel/gate.S
@@ -6,14 +6,15 @@
  * 	David Mosberger-Tang <davidm@hpl.hp.com>
  */
 
-#include <linux/config.h>
 
 #include <asm/asmmacro.h>
 #include <asm/errno.h>
 #include <asm/asm-offsets.h>
 #include <asm/sigcontext.h>
-#include <asm/system.h>
 #include <asm/unistd.h>
+#include <asm/kregs.h>
+#include <asm/page.h>
+#include "paravirt_inst.h"
 
 /*
  * We can't easily refer to symbols inside the kernel.  To avoid full runtime relocation,
@@ -21,17 +22,18 @@
  * to targets outside the shared object) and to avoid multi-phase kernel builds, we
  * simply create minimalistic "patch lists" in special ELF sections.
  */
-	.section ".data.patch.fsyscall_table", "a"
+	.section ".data..patch.fsyscall_table", "a"
 	.previous
 #define LOAD_FSYSCALL_TABLE(reg)			\
 [1:]	movl reg=0;					\
-	.xdata4 ".data.patch.fsyscall_table", 1b-.
+	.xdata4 ".data..patch.fsyscall_table", 1b-.
 
-	.section ".data.patch.brl_fsys_bubble_down", "a"
+	.section ".data..patch.brl_fsys_bubble_down", "a"
 	.previous
 #define BRL_COND_FSYS_BUBBLE_DOWN(pr)			\
 [1:](pr)brl.cond.sptk 0;				\
-	.xdata4 ".data.patch.brl_fsys_bubble_down", 1b-.
+	;;						\
+	.xdata4 ".data..patch.brl_fsys_bubble_down", 1b-.
 
 GLOBAL_ENTRY(__kernel_syscall_via_break)
 	.prologue
@@ -48,87 +50,6 @@ GLOBAL_ENTRY(__kernel_syscall_via_break)
 }
 END(__kernel_syscall_via_break)
 
-/*
- * On entry:
- *	r11 = saved ar.pfs
- *	r15 = system call #
- *	b0  = saved return address
- *	b6  = return address
- * On exit:
- *	r11 = saved ar.pfs
- *	r15 = system call #
- *	b0  = saved return address
- *	all other "scratch" registers:	undefined
- *	all "preserved" registers:	same as on entry
- */
-
-GLOBAL_ENTRY(__kernel_syscall_via_epc)
-	.prologue
-	.altrp b6
-	.body
-{
-	/*
-	 * Note: the kernel cannot assume that the first two instructions in this
-	 * bundle get executed.  The remaining code must be safe even if
-	 * they do not get executed.
-	 */
-	adds r17=-1024,r15			// A
-	mov r10=0				// A    default to successful syscall execution
-	epc					// B	causes split-issue
-}
-	;;
-	rsm psr.be | psr.i			// M2 (5 cyc to srlz.d)
-	LOAD_FSYSCALL_TABLE(r14)		// X
-	;;
-	mov r16=IA64_KR(CURRENT)		// M2 (12 cyc)
-	shladd r18=r17,3,r14			// A
-	mov r19=NR_syscalls-1			// A
-	;;
-	lfetch [r18]				// M0|1
-	mov r29=psr				// M2 (12 cyc)
-	// If r17 is a NaT, p6 will be zero
-	cmp.geu p6,p7=r19,r17			// A    (sysnr > 0 && sysnr < 1024+NR_syscalls)?
-	;;
-	mov r21=ar.fpsr				// M2 (12 cyc)
-	tnat.nz p10,p9=r15			// I0
-	mov.i r26=ar.pfs			// I0 (would stall anyhow due to srlz.d...)
-	;;
-	srlz.d					// M0 (forces split-issue) ensure PSR.BE==0
-(p6)	ld8 r18=[r18]				// M0|1
-	nop.i 0
-	;;
-	nop.m 0
-(p6)	tbit.z.unc p8,p0=r18,0			// I0 (dual-issues with "mov b7=r18"!)
-	nop.i 0
-	;;
-(p8)	ssm psr.i
-(p6)	mov b7=r18				// I0
-(p8)	br.dptk.many b7				// B
-
-	mov r27=ar.rsc				// M2 (12 cyc)
-/*
- * brl.cond doesn't work as intended because the linker would convert this branch
- * into a branch to a PLT.  Perhaps there will be a way to avoid this with some
- * future version of the linker.  In the meantime, we just use an indirect branch
- * instead.
- */
-#ifdef CONFIG_ITANIUM
-(p6)	add r14=-8,r14				// r14 <- addr of fsys_bubble_down entry
-	;;
-(p6)	ld8 r14=[r14]				// r14 <- fsys_bubble_down
-	;;
-(p6)	mov b7=r14
-(p6)	br.sptk.many b7
-#else
-	BRL_COND_FSYS_BUBBLE_DOWN(p6)
-#endif
-	ssm psr.i
-	mov r10=-1
-(p10)	mov r8=EINVAL
-(p9)	mov r8=ENOSYS
-	FSYS_RETURN
-END(__kernel_syscall_via_epc)
-
 #	define ARG0_OFF		(16 + IA64_SIGFRAME_ARG0_OFFSET)
 #	define ARG1_OFF		(16 + IA64_SIGFRAME_ARG1_OFFSET)
 #	define ARG2_OFF		(16 + IA64_SIGFRAME_ARG2_OFFSET)
@@ -374,3 +295,92 @@ restore_rbs:
 	// invala not necessary as that will happen when returning to user-mode
 	br.cond.sptk back_from_restore_rbs
 END(__kernel_sigtramp)
+
+/*
+ * On entry:
+ *	r11 = saved ar.pfs
+ *	r15 = system call #
+ *	b0  = saved return address
+ *	b6  = return address
+ * On exit:
+ *	r11 = saved ar.pfs
+ *	r15 = system call #
+ *	b0  = saved return address
+ *	all other "scratch" registers:	undefined
+ *	all "preserved" registers:	same as on entry
+ */
+
+GLOBAL_ENTRY(__kernel_syscall_via_epc)
+	.prologue
+	.altrp b6
+	.body
+{
+	/*
+	 * Note: the kernel cannot assume that the first two instructions in this
+	 * bundle get executed.  The remaining code must be safe even if
+	 * they do not get executed.
+	 */
+	adds r17=-1024,r15			// A
+	mov r10=0				// A    default to successful syscall execution
+	epc					// B	causes split-issue
+}
+	;;
+	RSM_PSR_BE_I(r20, r22)			// M2 (5 cyc to srlz.d)
+	LOAD_FSYSCALL_TABLE(r14)		// X
+	;;
+	mov r16=IA64_KR(CURRENT)		// M2 (12 cyc)
+	shladd r18=r17,3,r14			// A
+	mov r19=NR_syscalls-1			// A
+	;;
+	lfetch [r18]				// M0|1
+	MOV_FROM_PSR(p0, r29, r8)		// M2 (12 cyc)
+	// If r17 is a NaT, p6 will be zero
+	cmp.geu p6,p7=r19,r17			// A    (sysnr > 0 && sysnr < 1024+NR_syscalls)?
+	;;
+	mov r21=ar.fpsr				// M2 (12 cyc)
+	tnat.nz p10,p9=r15			// I0
+	mov.i r26=ar.pfs			// I0 (would stall anyhow due to srlz.d...)
+	;;
+	srlz.d					// M0 (forces split-issue) ensure PSR.BE==0
+(p6)	ld8 r18=[r18]				// M0|1
+	nop.i 0
+	;;
+	nop.m 0
+(p6)	tbit.z.unc p8,p0=r18,0			// I0 (dual-issues with "mov b7=r18"!)
+	nop.i 0
+	;;
+	SSM_PSR_I(p8, p14, r25)
+(p6)	mov b7=r18				// I0
+(p8)	br.dptk.many b7				// B
+
+	mov r27=ar.rsc				// M2 (12 cyc)
+/*
+ * brl.cond doesn't work as intended because the linker would convert this branch
+ * into a branch to a PLT.  Perhaps there will be a way to avoid this with some
+ * future version of the linker.  In the meantime, we just use an indirect branch
+ * instead.
+ */
+#ifdef CONFIG_ITANIUM
+(p6)	add r14=-8,r14				// r14 <- addr of fsys_bubble_down entry
+	;;
+(p6)	ld8 r14=[r14]				// r14 <- fsys_bubble_down
+	;;
+(p6)	mov b7=r14
+(p6)	br.sptk.many b7
+#else
+	BRL_COND_FSYS_BUBBLE_DOWN(p6)
+#endif
+	SSM_PSR_I(p0, p14, r10)
+	mov r10=-1
+(p10)	mov r8=EINVAL
+(p9)	mov r8=ENOSYS
+	FSYS_RETURN
+
+#ifdef CONFIG_PARAVIRT
+	/*
+	 * padd to make the size of this symbol constant
+	 * independent of paravirtualization.
+	 */
+	.align PAGE_SIZE / 8
+#endif
+END(__kernel_syscall_via_epc)
diff --git a/arch/ia64/kernel/gate.lds.S b/arch/ia64/kernel/gate.lds.S
index e1e4aba9ecd..e518f7902af 100644
--- a/arch/ia64/kernel/gate.lds.S
+++ b/arch/ia64/kernel/gate.lds.S
@@ -1,79 +1,92 @@
 /*
- * Linker script for gate DSO.  The gate pages are an ELF shared object prelinked to its
- * virtual address, with only one read-only segment and one execute-only segment (both fit
- * in one page).  This script controls its layout.
+ * Linker script for gate DSO.  The gate pages are an ELF shared object
+ * prelinked to its virtual address, with only one read-only segment and
+ * one execute-only segment (both fit in one page).  This script controls
+ * its layout.
  */
 
-#include <linux/config.h>
-
-#include <asm/system.h>
+#include <asm/page.h>
+#include "paravirt_patchlist.h"
 
 SECTIONS
 {
-  . = GATE_ADDR + SIZEOF_HEADERS;
-
-  .hash				: { *(.hash) }				:readable
-  .dynsym			: { *(.dynsym) }
-  .dynstr			: { *(.dynstr) }
-  .gnu.version			: { *(.gnu.version) }
-  .gnu.version_d		: { *(.gnu.version_d) }
-  .gnu.version_r		: { *(.gnu.version_r) }
-  .dynamic			: { *(.dynamic) }			:readable :dynamic
-
-  /*
-   * This linker script is used both with -r and with -shared.  For the layouts to match,
-   * we need to skip more than enough space for the dynamic symbol table et al.  If this
-   * amount is insufficient, ld -shared will barf.  Just increase it here.
-   */
-  . = GATE_ADDR + 0x500;
-
-  .data.patch			: {
-				    __start_gate_mckinley_e9_patchlist = .;
-				    *(.data.patch.mckinley_e9)
-				    __end_gate_mckinley_e9_patchlist = .;
-
-				    __start_gate_vtop_patchlist = .;
-				    *(.data.patch.vtop)
-				    __end_gate_vtop_patchlist = .;
-
-				    __start_gate_fsyscall_patchlist = .;
-				    *(.data.patch.fsyscall_table)
-				    __end_gate_fsyscall_patchlist = .;
-
-				    __start_gate_brl_fsys_bubble_down_patchlist = .;
-				    *(.data.patch.brl_fsys_bubble_down)
-				    __end_gate_brl_fsys_bubble_down_patchlist = .;
-  }									:readable
-  .IA_64.unwind_info		: { *(.IA_64.unwind_info*) }
-  .IA_64.unwind			: { *(.IA_64.unwind*) }			:readable :unwind
+	. = GATE_ADDR + SIZEOF_HEADERS;
+
+	.hash			: { *(.hash) }		:readable
+	.gnu.hash		: { *(.gnu.hash) }
+	.dynsym			: { *(.dynsym) }
+	.dynstr			: { *(.dynstr) }
+	.gnu.version		: { *(.gnu.version) }
+	.gnu.version_d		: { *(.gnu.version_d) }
+	.gnu.version_r		: { *(.gnu.version_r) }
+
+	.note			: { *(.note*) }		:readable	:note
+
+	.dynamic		: { *(.dynamic) }	:readable	:dynamic
+
+	/*
+	 * This linker script is used both with -r and with -shared.  For
+	 * the layouts to match, we need to skip more than enough space for
+	 * the dynamic symbol table et al.  If this amount is insufficient,
+	 * ld -shared will barf.  Just increase it here.
+	 */
+	. = GATE_ADDR + 0x600;
+
+	.data..patch		: {
+		__paravirt_start_gate_mckinley_e9_patchlist = .;
+		*(.data..patch.mckinley_e9)
+		__paravirt_end_gate_mckinley_e9_patchlist = .;
+
+		__paravirt_start_gate_vtop_patchlist = .;
+		*(.data..patch.vtop)
+		__paravirt_end_gate_vtop_patchlist = .;
+
+		__paravirt_start_gate_fsyscall_patchlist = .;
+		*(.data..patch.fsyscall_table)
+		__paravirt_end_gate_fsyscall_patchlist = .;
+
+		__paravirt_start_gate_brl_fsys_bubble_down_patchlist = .;
+		*(.data..patch.brl_fsys_bubble_down)
+		__paravirt_end_gate_brl_fsys_bubble_down_patchlist = .;
+	}						:readable
+
+	.IA_64.unwind_info	: { *(.IA_64.unwind_info*) }
+	.IA_64.unwind		: { *(.IA_64.unwind*) }	:readable	:unwind
 #ifdef HAVE_BUGGY_SEGREL
-  .text (GATE_ADDR + PAGE_SIZE)	: { *(.text) *(.text.*) }		:readable
+	.text (GATE_ADDR + PAGE_SIZE) : { *(.text) *(.text.*) }	:readable
 #else
-  . = ALIGN (PERCPU_PAGE_SIZE) + (. & (PERCPU_PAGE_SIZE - 1));
-  .text				: { *(.text) *(.text.*) }		:epc
+	. = ALIGN(PERCPU_PAGE_SIZE) + (. & (PERCPU_PAGE_SIZE - 1));
+	.text			: { *(.text) *(.text.*) }	:epc
 #endif
 
-  /DISCARD/			: {
-  	*(.got.plt) *(.got)
-	*(.data .data.* .gnu.linkonce.d.*)
-	*(.dynbss)
-	*(.bss .bss.* .gnu.linkonce.b.*)
-	*(__ex_table)
-  }
+	/DISCARD/		: {
+		*(.got.plt) *(.got)
+		*(.data .data.* .gnu.linkonce.d.*)
+		*(.dynbss)
+		*(.bss .bss.* .gnu.linkonce.b.*)
+		*(__ex_table)
+		*(__mca_table)
+	}
 }
 
 /*
+ * ld does not recognize this name token; use the constant.
+ */
+#define	PT_IA_64_UNWIND	0x70000001
+
+/*
  * We must supply the ELF program headers explicitly to get just one
  * PT_LOAD segment, and set the flags explicitly to make segments read-only.
  */
 PHDRS
 {
-  readable  PT_LOAD	FILEHDR	PHDRS	FLAGS(4);	/* PF_R */
+	readable	PT_LOAD	FILEHDR	PHDRS	FLAGS(4);	/* PF_R */
 #ifndef HAVE_BUGGY_SEGREL
-  epc	    PT_LOAD	FILEHDR PHDRS	FLAGS(1);	/* PF_X */
+	epc		PT_LOAD	FILEHDR PHDRS	FLAGS(1);	/* PF_X */
 #endif
-  dynamic   PT_DYNAMIC			FLAGS(4);	/* PF_R */
-  unwind    0x70000001; /* PT_IA_64_UNWIND, but ld doesn't match the name */
+	dynamic		PT_DYNAMIC		FLAGS(4);	/* PF_R */
+	note		PT_NOTE			FLAGS(4);	/* PF_R */
+	unwind		PT_IA_64_UNWIND;
 }
 
 /*
@@ -81,14 +94,14 @@ PHDRS
  */
 VERSION
 {
-  LINUX_2.5 {
-    global:
-	__kernel_syscall_via_break;
-	__kernel_syscall_via_epc;
-	__kernel_sigtramp;
-
-    local: *;
-  };
+	LINUX_2.5 {
+	global:
+		__kernel_syscall_via_break;
+		__kernel_syscall_via_epc;
+		__kernel_sigtramp;
+
+	local: *;
+	};
 }
 
 /* The ELF entry point can be used to set the AT_SYSINFO value.  */
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
index bfe65b2e862..a4acddad0c7 100644
--- a/arch/ia64/kernel/head.S
+++ b/arch/ia64/kernel/head.S
@@ -19,7 +19,6 @@
  *   Support for CPU Hotplug
  */
 
-#include <linux/config.h>
 
 #include <asm/asmmacro.h>
 #include <asm/fpu.h>
@@ -27,11 +26,13 @@
 #include <asm/mmu_context.h>
 #include <asm/asm-offsets.h>
 #include <asm/pal.h>
+#include <asm/paravirt.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/ptrace.h>
-#include <asm/system.h>
 #include <asm/mca_asm.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
 
 #ifdef CONFIG_HOTPLUG_CPU
 #define SAL_PSR_BITS_TO_SET				\
@@ -165,7 +166,7 @@ RestRR:											\
 	mov _tmp2=((ia64_rid(IA64_REGION_ID_KERNEL, (num<<61)) << 8) | (pgsize << 2) | vhpt);; \
 	mov rr[_tmp1]=_tmp2
 
-	.section __special_page_section,"ax"
+	__PAGE_ALIGNED_DATA
 
 	.global empty_zero_page
 empty_zero_page:
@@ -179,7 +180,7 @@ swapper_pg_dir:
 halt_msg:
 	stringz "Halting kernel\n"
 
-	.text
+	__REF
 
 	.global start_ap
 
@@ -198,6 +199,11 @@ start_ap:
 	;;
 	srlz.i
 	;;
+ {
+	flushrs				// must be first insn in group
+	srlz.i
+ }
+	;;
 	/*
 	 * Save the region registers, predicate before they get clobbered
 	 */
@@ -352,6 +358,32 @@ start_ap:
 	mov ar.rsc=0		// place RSE in enforced lazy mode
 	;;
 	loadrs			// clear the dirty partition
+	movl r19=__phys_per_cpu_start
+	mov r18=PERCPU_PAGE_SIZE
+	;;
+#ifndef CONFIG_SMP
+	add r19=r19,r18
+	;;
+#else
+(isAP)	br.few 2f
+	movl r20=__cpu0_per_cpu
+	;;
+	shr.u r18=r18,3
+1:
+	ld8 r21=[r19],8;;
+	st8[r20]=r21,8
+	adds r18=-1,r18;;
+	cmp4.lt p7,p6=0,r18
+(p7)	br.cond.dptk.few 1b
+	mov r19=r20
+	;;
+2:
+#endif
+	tpa r19=r19
+	;;
+	.pred.rel.mutex isBP,isAP
+(isBP)	mov IA64_KR(PER_CPU_DATA)=r19	// per-CPU base for cpu0
+(isAP)	mov IA64_KR(PER_CPU_DATA)=r0	// clear physical per-CPU base
 	;;
 	mov ar.bspstore=r2	// establish the new RSE stack
 	;;
@@ -362,6 +394,41 @@ start_ap:
 	;;
 (isBP)	st8 [r2]=r28		// save the address of the boot param area passed by the bootloader
 
+#ifdef CONFIG_PARAVIRT
+
+	movl r14=hypervisor_setup_hooks
+	movl r15=hypervisor_type
+	mov r16=num_hypervisor_hooks
+	;;
+	ld8 r2=[r15]
+	;;
+	cmp.ltu p7,p0=r2,r16	// array size check
+	shladd r8=r2,3,r14
+	;;
+(p7)	ld8 r9=[r8]
+	;;
+(p7)	mov b1=r9
+(p7)	cmp.ne.unc p7,p0=r9,r0	// no actual branch to NULL
+	;;
+(p7)	br.call.sptk.many rp=b1
+
+	__INITDATA
+
+default_setup_hook = 0		// Currently nothing needs to be done.
+
+	.global hypervisor_type
+hypervisor_type:
+	data8		PARAVIRT_HYPERVISOR_TYPE_DEFAULT
+
+	// must have the same order with PARAVIRT_HYPERVISOR_TYPE_xxx
+
+hypervisor_setup_hooks:
+	data8		default_setup_hook
+num_hypervisor_hooks = (. - hypervisor_setup_hooks) / 8
+	.previous
+
+#endif
+
 #ifdef CONFIG_SMP
 (isAP)	br.call.sptk.many rp=start_secondary
 .ret0:
@@ -387,6 +454,8 @@ self:	hint @pause
 	br.sptk.many self		// endless loop
 END(_start)
 
+	.text
+
 GLOBAL_ENTRY(ia64_save_debug_regs)
 	alloc r16=ar.pfs,1,0,0,0
 	mov r20=ar.lc			// preserve ar.lc
@@ -853,7 +922,6 @@ END(__ia64_init_fpu)
  */
 GLOBAL_ENTRY(ia64_switch_mode_phys)
  {
-	alloc r2=ar.pfs,0,0,0,0
 	rsm psr.i | psr.ic		// disable interrupts and interrupt collection
 	mov r15=ip
  }
@@ -902,7 +970,6 @@ END(ia64_switch_mode_phys)
  */
 GLOBAL_ENTRY(ia64_switch_mode_virt)
  {
-	alloc r2=ar.pfs,0,0,0,0
 	rsm psr.i | psr.ic		// disable interrupts and interrupt collection
 	mov r15=ip
  }
@@ -965,7 +1032,7 @@ END(ia64_delay_loop)
  * Return a CPU-local timestamp in nano-seconds.  This timestamp is
  * NOT synchronized across CPUs its return value must never be
  * compared against the values returned on another CPU.  The usage in
- * kernel/sched.c ensures that.
+ * kernel/sched/core.c ensures that.
  *
  * The return-value of sched_clock() is NOT supposed to wrap-around.
  * If it did, it would cause some scheduling hiccups (at the worst).
@@ -979,8 +1046,8 @@ END(ia64_delay_loop)
  * except that the multiplication and the shift are done with 128-bit
  * intermediate precision so that we can produce a full 64-bit result.
  */
-GLOBAL_ENTRY(sched_clock)
-	addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
+GLOBAL_ENTRY(ia64_native_sched_clock)
+	addl r8=THIS_CPU(ia64_cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
 	mov.m r9=ar.itc		// fetch cycle-counter				(35 cyc)
 	;;
 	ldf8 f8=[r8]
@@ -995,20 +1062,33 @@ GLOBAL_ENTRY(sched_clock)
 	;;
 	shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
 	br.ret.sptk.many rp
-END(sched_clock)
+END(ia64_native_sched_clock)
+#ifndef CONFIG_PARAVIRT
+	//unsigned long long
+	//sched_clock(void) __attribute__((alias("ia64_native_sched_clock")));
+	.global sched_clock
+sched_clock = ia64_native_sched_clock
+#endif
 
-GLOBAL_ENTRY(start_kernel_thread)
-	.prologue
-	.save rp, r0				// this is the end of the call-chain
-	.body
-	alloc r2 = ar.pfs, 0, 0, 2, 0
-	mov out0 = r9
-	mov out1 = r11;;
-	br.call.sptk.many rp = kernel_thread_helper;;
-	mov out0 = r8
-	br.call.sptk.many rp = sys_exit;;
-1:	br.sptk.few 1b				// not reached
-END(start_kernel_thread)
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+GLOBAL_ENTRY(cycle_to_cputime)
+	alloc r16=ar.pfs,1,0,0,0
+	addl r8=THIS_CPU(ia64_cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
+	;;
+	ldf8 f8=[r8]
+	;;
+	setf.sig f9=r32
+	;;
+	xmpy.lu f10=f9,f8	// calculate low 64 bits of 128-bit product	(4 cyc)
+	xmpy.hu f11=f9,f8	// calculate high 64 bits of 128-bit product
+	;;
+	getf.sig r8=f10		//						(5 cyc)
+	getf.sig r9=f11
+	;;
+	shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
+	br.ret.sptk.many rp
+END(cycle_to_cputime)
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_IA64_BRL_EMU
 
@@ -1033,95 +1113,6 @@ SET_REG(b5);
 #endif /* CONFIG_IA64_BRL_EMU */
 
 #ifdef CONFIG_SMP
-	/*
-	 * This routine handles spinlock contention.  It uses a non-standard calling
-	 * convention to avoid converting leaf routines into interior routines.  Because
-	 * of this special convention, there are several restrictions:
-	 *
-	 * - do not use gp relative variables, this code is called from the kernel
-	 *   and from modules, r1 is undefined.
-	 * - do not use stacked registers, the caller owns them.
-	 * - do not use the scratch stack space, the caller owns it.
-	 * - do not use any registers other than the ones listed below
-	 *
-	 * Inputs:
-	 *   ar.pfs - saved CFM of caller
-	 *   ar.ccv - 0 (and available for use)
-	 *   r27    - flags from spin_lock_irqsave or 0.  Must be preserved.
-	 *   r28    - available for use.
-	 *   r29    - available for use.
-	 *   r30    - available for use.
-	 *   r31    - address of lock, available for use.
-	 *   b6     - return address
-	 *   p14    - available for use.
-	 *   p15    - used to track flag status.
-	 *
-	 * If you patch this code to use more registers, do not forget to update
-	 * the clobber lists for spin_lock() in include/asm-ia64/spinlock.h.
-	 */
-
-#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
-
-GLOBAL_ENTRY(ia64_spinlock_contention_pre3_4)
-	.prologue
-	.save ar.pfs, r0	// this code effectively has a zero frame size
-	.save rp, r28
-	.body
-	nop 0
-	tbit.nz p15,p0=r27,IA64_PSR_I_BIT
-	.restore sp		// pop existing prologue after next insn
-	mov b6 = r28
-	.prologue
-	.save ar.pfs, r0
-	.altrp b6
-	.body
-	;;
-(p15)	ssm psr.i		// reenable interrupts if they were on
-				// DavidM says that srlz.d is slow and is not required in this case
-.wait:
-	// exponential backoff, kdb, lockmeter etc. go in here
-	hint @pause
-	ld4 r30=[r31]		// don't use ld4.bias; if it's contended, we won't write the word
-	nop 0
-	;;
-	cmp4.ne p14,p0=r30,r0
-(p14)	br.cond.sptk.few .wait
-(p15)	rsm psr.i		// disable interrupts if we reenabled them
-	br.cond.sptk.few b6	// lock is now free, try to acquire
-	.global ia64_spinlock_contention_pre3_4_end	// for kernprof
-ia64_spinlock_contention_pre3_4_end:
-END(ia64_spinlock_contention_pre3_4)
-
-#else
-
-GLOBAL_ENTRY(ia64_spinlock_contention)
-	.prologue
-	.altrp b6
-	.body
-	tbit.nz p15,p0=r27,IA64_PSR_I_BIT
-	;;
-.wait:
-(p15)	ssm psr.i		// reenable interrupts if they were on
-				// DavidM says that srlz.d is slow and is not required in this case
-.wait2:
-	// exponential backoff, kdb, lockmeter etc. go in here
-	hint @pause
-	ld4 r30=[r31]		// don't use ld4.bias; if it's contended, we won't write the word
-	;;
-	cmp4.ne p14,p0=r30,r0
-	mov r30 = 1
-(p14)	br.cond.sptk.few .wait2
-(p15)	rsm psr.i		// disable interrupts if we reenabled them
-	;;
-	cmpxchg4.acq r30=[r31], r30, ar.ccv
-	;;
-	cmp4.ne p14,p0=r0,r30
-(p14)	br.cond.sptk.few .wait
-
-	br.ret.sptk.many b6	// lock is now taken
-END(ia64_spinlock_contention)
-
-#endif
 
 #ifdef CONFIG_HOTPLUG_CPU
 GLOBAL_ENTRY(ia64_jump_to_sal)
@@ -1145,7 +1136,7 @@ GLOBAL_ENTRY(ia64_jump_to_sal)
 	movl r16=SAL_PSR_BITS_TO_SET;;
 	mov cr.ipsr=r16
 	mov cr.ifs=r0;;
-	rfi;;
+	rfi;;			// note: this unmask MCA/INIT (psr.mc)
 1:
 	/*
 	 * Invalidate all TLB data/inst
@@ -1171,6 +1162,7 @@ tlb_purge_done:
 	RESTORE_REG(cr.dcr, r25, r17);;
 	RESTORE_REG(cr.iva, r25, r17);;
 	RESTORE_REG(cr.pta, r25, r17);;
+	srlz.d;;	// required not to violate RAW dependency
 	RESTORE_REG(cr.itv, r25, r17);;
 	RESTORE_REG(cr.pmv, r25, r17);;
 	RESTORE_REG(cr.cmcv, r25, r17);;
diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c
index 01572814abe..5b7791dd396 100644
--- a/arch/ia64/kernel/ia64_ksyms.c
+++ b/arch/ia64/kernel/ia64_ksyms.c
@@ -5,50 +5,34 @@
  * All other exports should be put directly after the definition.
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 
 #include <linux/string.h>
 EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memchr);
-EXPORT_SYMBOL(memcmp);
 EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(memmove);
-EXPORT_SYMBOL(memscan);
-EXPORT_SYMBOL(strcat);
-EXPORT_SYMBOL(strchr);
-EXPORT_SYMBOL(strcmp);
-EXPORT_SYMBOL(strcpy);
 EXPORT_SYMBOL(strlen);
-EXPORT_SYMBOL(strncat);
-EXPORT_SYMBOL(strncmp);
-EXPORT_SYMBOL(strncpy);
-EXPORT_SYMBOL(strnlen);
-EXPORT_SYMBOL(strrchr);
-EXPORT_SYMBOL(strstr);
-EXPORT_SYMBOL(strpbrk);
+
+#include <asm/pgtable.h>
+EXPORT_SYMBOL_GPL(empty_zero_page);
 
 #include <asm/checksum.h>
 EXPORT_SYMBOL(ip_fast_csum);		/* hand-coded assembly */
-
-#include <asm/semaphore.h>
-EXPORT_SYMBOL(__down);
-EXPORT_SYMBOL(__down_interruptible);
-EXPORT_SYMBOL(__down_trylock);
-EXPORT_SYMBOL(__up);
+EXPORT_SYMBOL(csum_ipv6_magic);
 
 #include <asm/page.h>
 EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(copy_page);
 
 #ifdef CONFIG_VIRTUAL_MEM_MAP
 #include <linux/bootmem.h>
+EXPORT_SYMBOL(min_low_pfn);	/* defined by bootmem.c, but not exported by generic code */
 EXPORT_SYMBOL(max_low_pfn);	/* defined by bootmem.c, but not exported by generic code */
 #endif
 
 #include <asm/processor.h>
-EXPORT_SYMBOL(per_cpu__cpu_info);
+EXPORT_SYMBOL(ia64_cpu_info);
 #ifdef CONFIG_SMP
-EXPORT_SYMBOL(per_cpu__local_per_cpu_offset);
+EXPORT_SYMBOL(local_per_cpu_offset);
 #endif
 
 #include <asm/uaccess.h>
@@ -77,7 +61,7 @@ EXPORT_SYMBOL(__udivdi3);
 EXPORT_SYMBOL(__moddi3);
 EXPORT_SYMBOL(__umoddi3);
 
-#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE)
+#if defined(CONFIG_MD_RAID456) || defined(CONFIG_MD_RAID456_MODULE)
 extern void xor_ia64_2(void);
 extern void xor_ia64_3(void);
 extern void xor_ia64_4(void);
@@ -100,25 +84,15 @@ EXPORT_SYMBOL(ia64_save_scratch_fpregs);
 #include <asm/unwind.h>
 EXPORT_SYMBOL(unw_init_running);
 
-#ifdef ASM_SUPPORTED
-# ifdef CONFIG_SMP
-#  if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
-/*
- * This is not a normal routine and we don't want a function descriptor for it, so we use
- * a fake declaration here.
- */
-extern char ia64_spinlock_contention_pre3_4;
-EXPORT_SYMBOL(ia64_spinlock_contention_pre3_4);
-#  else
-/*
- * This is not a normal routine and we don't want a function descriptor for it, so we use
- * a fake declaration here.
- */
-extern char ia64_spinlock_contention;
-EXPORT_SYMBOL(ia64_spinlock_contention);
-#  endif
-# endif
+#if defined(CONFIG_IA64_ESI) || defined(CONFIG_IA64_ESI_MODULE)
+extern void esi_call_phys (void);
+EXPORT_SYMBOL_GPL(esi_call_phys);
 #endif
-
 extern char ia64_ivt[];
 EXPORT_SYMBOL(ia64_ivt);
+
+#include <asm/ftrace.h>
+#ifdef CONFIG_FUNCTION_TRACER
+/* mcount is defined in assembly */
+EXPORT_SYMBOL(_mcount);
+#endif
diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c
index b69c397ed1b..f9efe9739d3 100644
--- a/arch/ia64/kernel/init_task.c
+++ b/arch/ia64/kernel/init_task.c
@@ -8,6 +8,7 @@
 
 #include <linux/init.h>
 #include <linux/mm.h>
+#include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/init_task.h>
@@ -16,19 +17,13 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
-static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-struct mm_struct init_mm = INIT_MM(init_mm);
-
-EXPORT_SYMBOL(init_mm);
-
 /*
  * Initial task structure.
  *
  * We need to make sure that this is properly aligned due to the way process stacks are
- * handled. This is done by having a special ".data.init_task" section...
+ * handled. This is done by having a special ".data..init_task" section...
  */
 #define init_thread_info	init_task_mem.s.thread_info
 
@@ -38,7 +33,8 @@ union {
 		struct thread_info thread_info;
 	} s;
 	unsigned long stack[KERNEL_STACK_SIZE/sizeof (unsigned long)];
-} init_task_mem asm ("init_task") __attribute__((section(".data.init_task"))) = {{
+} init_task_mem asm ("init_task") __init_task_data =
+	{{
 	.task =		INIT_TASK(init_task_mem.s.task),
 	.thread_info =	INIT_THREAD_INFO(init_task_mem.s.task)
 }};
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 574084f343f..cd44a57c73b 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -9,66 +9,76 @@
  * Copyright (C) 1999 VA Linux Systems
  * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com>
  *
- * 00/04/19	D. Mosberger	Rewritten to mirror more closely the x86 I/O APIC code.
- *				In particular, we now have separate handlers for edge
- *				and level triggered interrupts.
- * 00/10/27	Asit Mallick, Goutham Rao <goutham.rao@intel.com> IRQ vector allocation
- *				PCI to vector mapping, shared PCI interrupts.
- * 00/10/27	D. Mosberger	Document things a bit more to make them more understandable.
- *				Clean up much of the old IOSAPIC cruft.
- * 01/07/27	J.I. Lee	PCI irq routing, Platform/Legacy interrupts and fixes for
- *				ACPI S5(SoftOff) support.
+ * 00/04/19	D. Mosberger	Rewritten to mirror more closely the x86 I/O
+ *				APIC code.  In particular, we now have separate
+ *				handlers for edge and level triggered
+ *				interrupts.
+ * 00/10/27	Asit Mallick, Goutham Rao <goutham.rao@intel.com> IRQ vector
+ *				allocation PCI to vector mapping, shared PCI
+ *				interrupts.
+ * 00/10/27	D. Mosberger	Document things a bit more to make them more
+ *				understandable.  Clean up much of the old
+ *				IOSAPIC cruft.
+ * 01/07/27	J.I. Lee	PCI irq routing, Platform/Legacy interrupts
+ *				and fixes for ACPI S5(SoftOff) support.
  * 02/01/23	J.I. Lee	iosapic pgm fixes for PCI irq routing from _PRT
- * 02/01/07     E. Focht        <efocht@ess.nec.de> Redirectable interrupt vectors in
- *                              iosapic_set_affinity(), initializations for
- *                              /proc/irq/#/smp_affinity
+ * 02/01/07     E. Focht        <efocht@ess.nec.de> Redirectable interrupt
+ *				vectors in iosapic_set_affinity(),
+ *				initializations for /proc/irq/#/smp_affinity
  * 02/04/02	P. Diefenbaugh	Cleaned up ACPI PCI IRQ routing.
  * 02/04/18	J.I. Lee	bug fix in iosapic_init_pci_irq
- * 02/04/30	J.I. Lee	bug fix in find_iosapic to fix ACPI PCI IRQ to IOSAPIC mapping
- *				error
+ * 02/04/30	J.I. Lee	bug fix in find_iosapic to fix ACPI PCI IRQ to
+ *				IOSAPIC mapping error
  * 02/07/29	T. Kochi	Allocate interrupt vectors dynamically
- * 02/08/04	T. Kochi	Cleaned up terminology (irq, global system interrupt, vector, etc.)
- * 02/09/20	D. Mosberger	Simplified by taking advantage of ACPI's pci_irq code.
+ * 02/08/04	T. Kochi	Cleaned up terminology (irq, global system
+ *				interrupt, vector, etc.)
+ * 02/09/20	D. Mosberger	Simplified by taking advantage of ACPI's
+ *				pci_irq code.
  * 03/02/19	B. Helgaas	Make pcat_compat system-wide, not per-IOSAPIC.
- *				Remove iosapic_address & gsi_base from external interfaces.
- *				Rationalize __init/__devinit attributes.
+ *				Remove iosapic_address & gsi_base from
+ *				external interfaces.  Rationalize
+ *				__init/__devinit attributes.
  * 04/12/04 Ashok Raj	<ashok.raj@intel.com> Intel Corporation 2004
- *				Updated to work with irq migration necessary for CPU Hotplug
+ *				Updated to work with irq migration necessary
+ *				for CPU Hotplug
  */
 /*
- * Here is what the interrupt logic between a PCI device and the kernel looks like:
+ * Here is what the interrupt logic between a PCI device and the kernel looks
+ * like:
  *
- * (1) A PCI device raises one of the four interrupt pins (INTA, INTB, INTC, INTD).  The
- *     device is uniquely identified by its bus--, and slot-number (the function
- *     number does not matter here because all functions share the same interrupt
- *     lines).
+ * (1) A PCI device raises one of the four interrupt pins (INTA, INTB, INTC,
+ *     INTD).  The device is uniquely identified by its bus-, and slot-number
+ *     (the function number does not matter here because all functions share
+ *     the same interrupt lines).
  *
- * (2) The motherboard routes the interrupt line to a pin on a IOSAPIC controller.
- *     Multiple interrupt lines may have to share the same IOSAPIC pin (if they're level
- *     triggered and use the same polarity).  Each interrupt line has a unique Global
- *     System Interrupt (GSI) number which can be calculated as the sum of the controller's
- *     base GSI number and the IOSAPIC pin number to which the line connects.
+ * (2) The motherboard routes the interrupt line to a pin on a IOSAPIC
+ *     controller.  Multiple interrupt lines may have to share the same
+ *     IOSAPIC pin (if they're level triggered and use the same polarity).
+ *     Each interrupt line has a unique Global System Interrupt (GSI) number
+ *     which can be calculated as the sum of the controller's base GSI number
+ *     and the IOSAPIC pin number to which the line connects.
  *
- * (3) The IOSAPIC uses an internal routing table entries (RTEs) to map the IOSAPIC pin
- *     into the IA-64 interrupt vector.  This interrupt vector is then sent to the CPU.
+ * (3) The IOSAPIC uses an internal routing table entries (RTEs) to map the
+ * IOSAPIC pin into the IA-64 interrupt vector.  This interrupt vector is then
+ * sent to the CPU.
  *
- * (4) The kernel recognizes an interrupt as an IRQ.  The IRQ interface is used as
- *     architecture-independent interrupt handling mechanism in Linux.  As an
- *     IRQ is a number, we have to have IA-64 interrupt vector number <-> IRQ number
- *     mapping.  On smaller systems, we use one-to-one mapping between IA-64 vector and
- *     IRQ.  A platform can implement platform_irq_to_vector(irq) and
+ * (4) The kernel recognizes an interrupt as an IRQ.  The IRQ interface is
+ *     used as architecture-independent interrupt handling mechanism in Linux.
+ *     As an IRQ is a number, we have to have
+ *     IA-64 interrupt vector number <-> IRQ number mapping.  On smaller
+ *     systems, we use one-to-one mapping between IA-64 vector and IRQ.  A
+ *     platform can implement platform_irq_to_vector(irq) and
  *     platform_local_vector_to_irq(vector) APIs to differentiate the mapping.
- *     Please see also include/asm-ia64/hw_irq.h for those APIs.
+ *     Please see also arch/ia64/include/asm/hw_irq.h for those APIs.
  *
  * To sum up, there are three levels of mappings involved:
  *
  *	PCI pin -> global system interrupt (GSI) -> IA-64 vector <-> IRQ
  *
- * Note: The term "IRQ" is loosely used everywhere in Linux kernel to describe interrupts.
- * Now we use "IRQ" only for Linux IRQ's.  ISA IRQ (isa_irq) is the only exception in this
- * source code.
+ * Note: The term "IRQ" is loosely used everywhere in Linux kernel to
+ * describe interrupts.  Now we use "IRQ" only for Linux IRQ's.  ISA IRQ
+ * (isa_irq) is the only exception in this source code.
  */
-#include <linux/config.h>
 
 #include <linux/acpi.h>
 #include <linux/init.h>
@@ -76,8 +86,8 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/pci.h>
+#include <linux/slab.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/string.h>
 #include <linux/bootmem.h>
 
@@ -88,8 +98,6 @@
 #include <asm/machvec.h>
 #include <asm/processor.h>
 #include <asm/ptrace.h>
-#include <asm/system.h>
-
 
 #undef DEBUG_INTERRUPT_ROUTING
 
@@ -99,46 +107,57 @@
 #define DBG(fmt...)
 #endif
 
-#define NR_PREALLOCATE_RTE_ENTRIES	(PAGE_SIZE / sizeof(struct iosapic_rte_info))
-#define RTE_PREALLOCATED	(1)
-
 static DEFINE_SPINLOCK(iosapic_lock);
 
-/* These tables map IA-64 vectors to the IOSAPIC pin that generates this vector. */
+/*
+ * These tables map IA-64 vectors to the IOSAPIC pin that generates this
+ * vector.
+ */
+
+#define NO_REF_RTE	0
 
-struct iosapic_rte_info {
-	struct list_head rte_list;	/* node in list of RTEs sharing the same vector */
+static struct iosapic {
 	char __iomem	*addr;		/* base address of IOSAPIC */
-	unsigned int	gsi_base;	/* first GSI assigned to this IOSAPIC */
+	unsigned int	gsi_base;	/* GSI base */
+	unsigned short	num_rte;	/* # of RTEs on this IOSAPIC */
+	int		rtes_inuse;	/* # of RTEs in use on this IOSAPIC */
+#ifdef CONFIG_NUMA
+	unsigned short	node;		/* numa node association via pxm */
+#endif
+	spinlock_t	lock;		/* lock for indirect reg access */
+} iosapic_lists[NR_IOSAPICS];
+
+struct iosapic_rte_info {
+	struct list_head rte_list;	/* RTEs sharing the same vector */
 	char		rte_index;	/* IOSAPIC RTE index */
 	int		refcnt;		/* reference counter */
-	unsigned int	flags;		/* flags */
+	struct iosapic	*iosapic;
 } ____cacheline_aligned;
 
 static struct iosapic_intr_info {
-	struct list_head rtes;		/* RTEs using this vector (empty => not an IOSAPIC interrupt) */
-	int		count;		/* # of RTEs that shares this vector */
-	u32		low32;		/* current value of low word of Redirection table entry */
+	struct list_head rtes;		/* RTEs using this vector (empty =>
+					 * not an IOSAPIC interrupt) */
+	int		count;		/* # of registered RTEs */
+	u32		low32;		/* current value of low word of
+					 * Redirection table entry */
 	unsigned int	dest;		/* destination CPU physical ID */
 	unsigned char	dmode	: 3;	/* delivery mode (see iosapic.h) */
-	unsigned char 	polarity: 1;	/* interrupt polarity (see iosapic.h) */
+	unsigned char 	polarity: 1;	/* interrupt polarity
+					 * (see iosapic.h) */
 	unsigned char	trigger	: 1;	/* trigger mode (see iosapic.h) */
-} iosapic_intr_info[IA64_NUM_VECTORS];
+} iosapic_intr_info[NR_IRQS];
 
-static struct iosapic {
-	char __iomem	*addr;		/* base address of IOSAPIC */
-	unsigned int 	gsi_base;	/* first GSI assigned to this IOSAPIC */
-	unsigned short 	num_rte;	/* number of RTE in this IOSAPIC */
-	int		rtes_inuse;	/* # of RTEs in use on this IOSAPIC */
-#ifdef CONFIG_NUMA
-	unsigned short	node;		/* numa node association via pxm */
-#endif
-} iosapic_lists[NR_IOSAPICS];
+static unsigned char pcat_compat;	/* 8259 compatibility flag */
 
-static unsigned char pcat_compat __devinitdata;	/* 8259 compatibility flag */
+static inline void
+iosapic_write(struct iosapic *iosapic, unsigned int reg, u32 val)
+{
+	unsigned long flags;
 
-static int iosapic_kmalloc_ok;
-static LIST_HEAD(free_rte_list);
+	spin_lock_irqsave(&iosapic->lock, flags);
+	__iosapic_write(iosapic->addr, reg, val);
+	spin_unlock_irqrestore(&iosapic->lock, flags);
+}
 
 /*
  * Find an IOSAPIC associated with a GSI
@@ -149,98 +168,76 @@ find_iosapic (unsigned int gsi)
 	int i;
 
 	for (i = 0; i < NR_IOSAPICS; i++) {
-		if ((unsigned) (gsi - iosapic_lists[i].gsi_base) < iosapic_lists[i].num_rte)
+		if ((unsigned) (gsi - iosapic_lists[i].gsi_base) <
+		    iosapic_lists[i].num_rte)
 			return i;
 	}
 
 	return -1;
 }
 
-static inline int
-_gsi_to_vector (unsigned int gsi)
+static inline int __gsi_to_irq(unsigned int gsi)
 {
+	int irq;
 	struct iosapic_intr_info *info;
 	struct iosapic_rte_info *rte;
 
-	for (info = iosapic_intr_info; info < iosapic_intr_info + IA64_NUM_VECTORS; ++info)
+	for (irq = 0; irq < NR_IRQS; irq++) {
+		info = &iosapic_intr_info[irq];
 		list_for_each_entry(rte, &info->rtes, rte_list)
-			if (rte->gsi_base + rte->rte_index == gsi)
-				return info - iosapic_intr_info;
+			if (rte->iosapic->gsi_base + rte->rte_index == gsi)
+				return irq;
+	}
 	return -1;
 }
 
-/*
- * Translate GSI number to the corresponding IA-64 interrupt vector.  If no
- * entry exists, return -1.
- */
-inline int
-gsi_to_vector (unsigned int gsi)
-{
-	return _gsi_to_vector(gsi);
-}
-
 int
 gsi_to_irq (unsigned int gsi)
 {
 	unsigned long flags;
 	int irq;
-	/*
-	 * XXX fix me: this assumes an identity mapping vetween IA-64 vector and Linux irq
-	 * numbers...
-	 */
+
 	spin_lock_irqsave(&iosapic_lock, flags);
-	{
-		irq = _gsi_to_vector(gsi);
-	}
+	irq = __gsi_to_irq(gsi);
 	spin_unlock_irqrestore(&iosapic_lock, flags);
-
 	return irq;
 }
 
-static struct iosapic_rte_info *gsi_vector_to_rte(unsigned int gsi, unsigned int vec)
+static struct iosapic_rte_info *find_rte(unsigned int irq, unsigned int gsi)
 {
 	struct iosapic_rte_info *rte;
 
-	list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list)
-		if (rte->gsi_base + rte->rte_index == gsi)
+	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list)
+		if (rte->iosapic->gsi_base + rte->rte_index == gsi)
 			return rte;
 	return NULL;
 }
 
 static void
-set_rte (unsigned int gsi, unsigned int vector, unsigned int dest, int mask)
+set_rte (unsigned int gsi, unsigned int irq, unsigned int dest, int mask)
 {
 	unsigned long pol, trigger, dmode;
 	u32 low32, high32;
-	char __iomem *addr;
 	int rte_index;
 	char redir;
 	struct iosapic_rte_info *rte;
+	ia64_vector vector = irq_to_vector(irq);
 
 	DBG(KERN_DEBUG"IOSAPIC: routing vector %d to 0x%x\n", vector, dest);
 
-	rte = gsi_vector_to_rte(gsi, vector);
+	rte = find_rte(irq, gsi);
 	if (!rte)
 		return;		/* not an IOSAPIC interrupt */
 
 	rte_index = rte->rte_index;
-	addr	= rte->addr;
-	pol     = iosapic_intr_info[vector].polarity;
-	trigger = iosapic_intr_info[vector].trigger;
-	dmode   = iosapic_intr_info[vector].dmode;
+	pol     = iosapic_intr_info[irq].polarity;
+	trigger = iosapic_intr_info[irq].trigger;
+	dmode   = iosapic_intr_info[irq].dmode;
 
 	redir = (dmode == IOSAPIC_LOWEST_PRIORITY) ? 1 : 0;
 
 #ifdef CONFIG_SMP
-	{
-		unsigned int irq;
-
-		for (irq = 0; irq < NR_IRQS; ++irq)
-			if (irq_to_vector(irq) == vector) {
-				set_irq_affinity_info(irq, (int)(dest & 0xffff), redir);
-				break;
-			}
-	}
+	set_irq_affinity_info(irq, (int)(dest & 0xffff), redir);
 #endif
 
 	low32 = ((pol << IOSAPIC_POLARITY_SHIFT) |
@@ -252,120 +249,132 @@ set_rte (unsigned int gsi, unsigned int vector, unsigned int dest, int mask)
 	/* dest contains both id and eid */
 	high32 = (dest << IOSAPIC_DEST_SHIFT);
 
-	iosapic_write(addr, IOSAPIC_RTE_HIGH(rte_index), high32);
-	iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32);
-	iosapic_intr_info[vector].low32 = low32;
-	iosapic_intr_info[vector].dest = dest;
+	iosapic_write(rte->iosapic, IOSAPIC_RTE_HIGH(rte_index), high32);
+	iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
+	iosapic_intr_info[irq].low32 = low32;
+	iosapic_intr_info[irq].dest = dest;
 }
 
 static void
-nop (unsigned int vector)
+nop (struct irq_data *data)
 {
 	/* do nothing... */
 }
 
+
+#ifdef CONFIG_KEXEC
+void
+kexec_disable_iosapic(void)
+{
+	struct iosapic_intr_info *info;
+	struct iosapic_rte_info *rte;
+	ia64_vector vec;
+	int irq;
+
+	for (irq = 0; irq < NR_IRQS; irq++) {
+		info = &iosapic_intr_info[irq];
+		vec = irq_to_vector(irq);
+		list_for_each_entry(rte, &info->rtes,
+				rte_list) {
+			iosapic_write(rte->iosapic,
+					IOSAPIC_RTE_LOW(rte->rte_index),
+					IOSAPIC_MASK|vec);
+			iosapic_eoi(rte->iosapic->addr, vec);
+		}
+	}
+}
+#endif
+
 static void
-mask_irq (unsigned int irq)
+mask_irq (struct irq_data *data)
 {
-	unsigned long flags;
-	char __iomem *addr;
+	unsigned int irq = data->irq;
 	u32 low32;
 	int rte_index;
-	ia64_vector vec = irq_to_vector(irq);
 	struct iosapic_rte_info *rte;
 
-	if (list_empty(&iosapic_intr_info[vec].rtes))
+	if (!iosapic_intr_info[irq].count)
 		return;			/* not an IOSAPIC interrupt! */
 
-	spin_lock_irqsave(&iosapic_lock, flags);
-	{
-		/* set only the mask bit */
-		low32 = iosapic_intr_info[vec].low32 |= IOSAPIC_MASK;
-		list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) {
-			addr = rte->addr;
-			rte_index = rte->rte_index;
-			iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32);
-		}
+	/* set only the mask bit */
+	low32 = iosapic_intr_info[irq].low32 |= IOSAPIC_MASK;
+	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) {
+		rte_index = rte->rte_index;
+		iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
 	}
-	spin_unlock_irqrestore(&iosapic_lock, flags);
 }
 
 static void
-unmask_irq (unsigned int irq)
+unmask_irq (struct irq_data *data)
 {
-	unsigned long flags;
-	char __iomem *addr;
+	unsigned int irq = data->irq;
 	u32 low32;
 	int rte_index;
-	ia64_vector vec = irq_to_vector(irq);
 	struct iosapic_rte_info *rte;
 
-	if (list_empty(&iosapic_intr_info[vec].rtes))
+	if (!iosapic_intr_info[irq].count)
 		return;			/* not an IOSAPIC interrupt! */
 
-	spin_lock_irqsave(&iosapic_lock, flags);
-	{
-		low32 = iosapic_intr_info[vec].low32 &= ~IOSAPIC_MASK;
-		list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) {
-			addr = rte->addr;
-			rte_index = rte->rte_index;
-			iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32);
-		}
+	low32 = iosapic_intr_info[irq].low32 &= ~IOSAPIC_MASK;
+	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) {
+		rte_index = rte->rte_index;
+		iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
 	}
-	spin_unlock_irqrestore(&iosapic_lock, flags);
 }
 
 
-static void
-iosapic_set_affinity (unsigned int irq, cpumask_t mask)
+static int
+iosapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+		     bool force)
 {
 #ifdef CONFIG_SMP
-	unsigned long flags;
+	unsigned int irq = data->irq;
 	u32 high32, low32;
-	int dest, rte_index;
-	char __iomem *addr;
+	int cpu, dest, rte_index;
 	int redir = (irq & IA64_IRQ_REDIRECTED) ? 1 : 0;
-	ia64_vector vec;
 	struct iosapic_rte_info *rte;
+	struct iosapic *iosapic;
 
 	irq &= (~IA64_IRQ_REDIRECTED);
-	vec = irq_to_vector(irq);
 
-	if (cpus_empty(mask))
-		return;
+	cpu = cpumask_first_and(cpu_online_mask, mask);
+	if (cpu >= nr_cpu_ids)
+		return -1;
 
-	dest = cpu_physical_id(first_cpu(mask));
+	if (irq_prepare_move(irq, cpu))
+		return -1;
+
+	dest = cpu_physical_id(cpu);
 
-	if (list_empty(&iosapic_intr_info[vec].rtes))
-		return;			/* not an IOSAPIC interrupt */
+	if (!iosapic_intr_info[irq].count)
+		return -1;			/* not an IOSAPIC interrupt */
 
 	set_irq_affinity_info(irq, dest, redir);
 
 	/* dest contains both id and eid */
 	high32 = dest << IOSAPIC_DEST_SHIFT;
 
-	spin_lock_irqsave(&iosapic_lock, flags);
-	{
-		low32 = iosapic_intr_info[vec].low32 & ~(7 << IOSAPIC_DELIVERY_SHIFT);
-
-		if (redir)
-		        /* change delivery mode to lowest priority */
-			low32 |= (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT);
-		else
-		        /* change delivery mode to fixed */
-			low32 |= (IOSAPIC_FIXED << IOSAPIC_DELIVERY_SHIFT);
-
-		iosapic_intr_info[vec].low32 = low32;
-		iosapic_intr_info[vec].dest = dest;
-		list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) {
-			addr = rte->addr;
-			rte_index = rte->rte_index;
-			iosapic_write(addr, IOSAPIC_RTE_HIGH(rte_index), high32);
-			iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32);
-		}
+	low32 = iosapic_intr_info[irq].low32 & ~(7 << IOSAPIC_DELIVERY_SHIFT);
+	if (redir)
+		/* change delivery mode to lowest priority */
+		low32 |= (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT);
+	else
+		/* change delivery mode to fixed */
+		low32 |= (IOSAPIC_FIXED << IOSAPIC_DELIVERY_SHIFT);
+	low32 &= IOSAPIC_VECTOR_MASK;
+	low32 |= irq_to_vector(irq);
+
+	iosapic_intr_info[irq].low32 = low32;
+	iosapic_intr_info[irq].dest = dest;
+	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) {
+		iosapic = rte->iosapic;
+		rte_index = rte->rte_index;
+		iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32);
+		iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
 	}
-	spin_unlock_irqrestore(&iosapic_lock, flags);
+
 #endif
+	return 0;
 }
 
 /*
@@ -373,21 +382,34 @@ iosapic_set_affinity (unsigned int irq, cpumask_t mask)
  */
 
 static unsigned int
-iosapic_startup_level_irq (unsigned int irq)
+iosapic_startup_level_irq (struct irq_data *data)
 {
-	unmask_irq(irq);
+	unmask_irq(data);
 	return 0;
 }
 
 static void
-iosapic_end_level_irq (unsigned int irq)
+iosapic_unmask_level_irq (struct irq_data *data)
 {
+	unsigned int irq = data->irq;
 	ia64_vector vec = irq_to_vector(irq);
 	struct iosapic_rte_info *rte;
+	int do_unmask_irq = 0;
+
+	irq_complete_move(irq);
+	if (unlikely(irqd_is_setaffinity_pending(data))) {
+		do_unmask_irq = 1;
+		mask_irq(data);
+	} else
+		unmask_irq(data);
+
+	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list)
+		iosapic_eoi(rte->iosapic->addr, vec);
 
-	move_irq(irq);
-	list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list)
-		iosapic_eoi(rte->addr, vec);
+	if (unlikely(do_unmask_irq)) {
+		irq_move_masked_irq(data);
+		unmask_irq(data);
+	}
 }
 
 #define iosapic_shutdown_level_irq	mask_irq
@@ -395,15 +417,16 @@ iosapic_end_level_irq (unsigned int irq)
 #define iosapic_disable_level_irq	mask_irq
 #define iosapic_ack_level_irq		nop
 
-struct hw_interrupt_type irq_type_iosapic_level = {
-	.typename =	"IO-SAPIC-level",
-	.startup =	iosapic_startup_level_irq,
-	.shutdown =	iosapic_shutdown_level_irq,
-	.enable =	iosapic_enable_level_irq,
-	.disable =	iosapic_disable_level_irq,
-	.ack =		iosapic_ack_level_irq,
-	.end =		iosapic_end_level_irq,
-	.set_affinity =	iosapic_set_affinity
+static struct irq_chip irq_type_iosapic_level = {
+	.name =			"IO-SAPIC-level",
+	.irq_startup =		iosapic_startup_level_irq,
+	.irq_shutdown =		iosapic_shutdown_level_irq,
+	.irq_enable =		iosapic_enable_level_irq,
+	.irq_disable =		iosapic_disable_level_irq,
+	.irq_ack =		iosapic_ack_level_irq,
+	.irq_mask =		mask_irq,
+	.irq_unmask =		iosapic_unmask_level_irq,
+	.irq_set_affinity =	iosapic_set_affinity
 };
 
 /*
@@ -411,9 +434,9 @@ struct hw_interrupt_type irq_type_iosapic_level = {
  */
 
 static unsigned int
-iosapic_startup_edge_irq (unsigned int irq)
+iosapic_startup_edge_irq (struct irq_data *data)
 {
-	unmask_irq(irq);
+	unmask_irq(data);
 	/*
 	 * IOSAPIC simply drops interrupts pended while the
 	 * corresponding pin was masked, so we can't know if an
@@ -423,36 +446,28 @@ iosapic_startup_edge_irq (unsigned int irq)
 }
 
 static void
-iosapic_ack_edge_irq (unsigned int irq)
+iosapic_ack_edge_irq (struct irq_data *data)
 {
-	irq_desc_t *idesc = irq_descp(irq);
-
-	move_irq(irq);
-	/*
-	 * Once we have recorded IRQ_PENDING already, we can mask the
-	 * interrupt for real. This prevents IRQ storms from unhandled
-	 * devices.
-	 */
-	if ((idesc->status & (IRQ_PENDING|IRQ_DISABLED)) == (IRQ_PENDING|IRQ_DISABLED))
-		mask_irq(irq);
+	irq_complete_move(data->irq);
+	irq_move_irq(data);
 }
 
 #define iosapic_enable_edge_irq		unmask_irq
 #define iosapic_disable_edge_irq	nop
-#define iosapic_end_edge_irq		nop
-
-struct hw_interrupt_type irq_type_iosapic_edge = {
-	.typename =	"IO-SAPIC-edge",
-	.startup =	iosapic_startup_edge_irq,
-	.shutdown =	iosapic_disable_edge_irq,
-	.enable =	iosapic_enable_edge_irq,
-	.disable =	iosapic_disable_edge_irq,
-	.ack =		iosapic_ack_edge_irq,
-	.end =		iosapic_end_edge_irq,
-	.set_affinity =	iosapic_set_affinity
+
+static struct irq_chip irq_type_iosapic_edge = {
+	.name =			"IO-SAPIC-edge",
+	.irq_startup =		iosapic_startup_edge_irq,
+	.irq_shutdown =		iosapic_disable_edge_irq,
+	.irq_enable =		iosapic_enable_edge_irq,
+	.irq_disable =		iosapic_disable_edge_irq,
+	.irq_ack =		iosapic_ack_edge_irq,
+	.irq_mask =		mask_irq,
+	.irq_unmask =		unmask_irq,
+	.irq_set_affinity =	iosapic_set_affinity
 };
 
-unsigned int
+static unsigned int
 iosapic_version (char __iomem *addr)
 {
 	/*
@@ -464,12 +479,12 @@ iosapic_version (char __iomem *addr)
 	 *	unsigned int reserved2 : 8;
 	 * }
 	 */
-	return iosapic_read(addr, IOSAPIC_VERSION);
+	return __iosapic_read(addr, IOSAPIC_VERSION);
 }
 
-static int iosapic_find_sharable_vector (unsigned long trigger, unsigned long pol)
+static int iosapic_find_sharable_irq(unsigned long trigger, unsigned long pol)
 {
-	int i, vector = -1, min_count = -1;
+	int i, irq = -ENOSPC, min_count = -1;
 	struct iosapic_intr_info *info;
 
 	/*
@@ -477,20 +492,21 @@ static int iosapic_find_sharable_vector (unsigned long trigger, unsigned long po
 	 * supported yet
 	 */
 	if (trigger == IOSAPIC_EDGE)
-		return -1;
+		return -EINVAL;
 
-	for (i = IA64_FIRST_DEVICE_VECTOR; i <= IA64_LAST_DEVICE_VECTOR; i++) {
+	for (i = 0; i < NR_IRQS; i++) {
 		info = &iosapic_intr_info[i];
 		if (info->trigger == trigger && info->polarity == pol &&
-		    (info->dmode == IOSAPIC_FIXED || info->dmode == IOSAPIC_LOWEST_PRIORITY)) {
+		    (info->dmode == IOSAPIC_FIXED ||
+		     info->dmode == IOSAPIC_LOWEST_PRIORITY) &&
+		    can_request_irq(i, IRQF_SHARED)) {
 			if (min_count == -1 || info->count < min_count) {
-				vector = i;
+				irq = i;
 				min_count = info->count;
 			}
 		}
 	}
-
-	return vector;
+	return irq;
 }
 
 /*
@@ -498,146 +514,122 @@ static int iosapic_find_sharable_vector (unsigned long trigger, unsigned long po
  *  assign a new vector for the other and make the vector available
  */
 static void __init
-iosapic_reassign_vector (int vector)
+iosapic_reassign_vector (int irq)
 {
-	int new_vector;
-
-	if (!list_empty(&iosapic_intr_info[vector].rtes)) {
-		new_vector = assign_irq_vector(AUTO_ASSIGN);
-		if (new_vector < 0)
-			panic("%s: out of interrupt vectors!\n", __FUNCTION__);
-		printk(KERN_INFO "Reassigning vector %d to %d\n", vector, new_vector);
-		memcpy(&iosapic_intr_info[new_vector], &iosapic_intr_info[vector],
+	int new_irq;
+
+	if (iosapic_intr_info[irq].count) {
+		new_irq = create_irq();
+		if (new_irq < 0)
+			panic("%s: out of interrupt vectors!\n", __func__);
+		printk(KERN_INFO "Reassigning vector %d to %d\n",
+		       irq_to_vector(irq), irq_to_vector(new_irq));
+		memcpy(&iosapic_intr_info[new_irq], &iosapic_intr_info[irq],
 		       sizeof(struct iosapic_intr_info));
-		INIT_LIST_HEAD(&iosapic_intr_info[new_vector].rtes);
-		list_move(iosapic_intr_info[vector].rtes.next, &iosapic_intr_info[new_vector].rtes);
-		memset(&iosapic_intr_info[vector], 0, sizeof(struct iosapic_intr_info));
-		iosapic_intr_info[vector].low32 = IOSAPIC_MASK;
-		INIT_LIST_HEAD(&iosapic_intr_info[vector].rtes);
+		INIT_LIST_HEAD(&iosapic_intr_info[new_irq].rtes);
+		list_move(iosapic_intr_info[irq].rtes.next,
+			  &iosapic_intr_info[new_irq].rtes);
+		memset(&iosapic_intr_info[irq], 0,
+		       sizeof(struct iosapic_intr_info));
+		iosapic_intr_info[irq].low32 = IOSAPIC_MASK;
+		INIT_LIST_HEAD(&iosapic_intr_info[irq].rtes);
 	}
 }
 
-static struct iosapic_rte_info *iosapic_alloc_rte (void)
+static inline int irq_is_shared (int irq)
 {
-	int i;
-	struct iosapic_rte_info *rte;
-	int preallocated = 0;
-
-	if (!iosapic_kmalloc_ok && list_empty(&free_rte_list)) {
-		rte = alloc_bootmem(sizeof(struct iosapic_rte_info) * NR_PREALLOCATE_RTE_ENTRIES);
-		if (!rte)
-			return NULL;
-		for (i = 0; i < NR_PREALLOCATE_RTE_ENTRIES; i++, rte++)
-			list_add(&rte->rte_list, &free_rte_list);
-	}
-
-	if (!list_empty(&free_rte_list)) {
-		rte = list_entry(free_rte_list.next, struct iosapic_rte_info, rte_list);
-		list_del(&rte->rte_list);
-		preallocated++;
-	} else {
-		rte = kmalloc(sizeof(struct iosapic_rte_info), GFP_ATOMIC);
-		if (!rte)
-			return NULL;
-	}
-
-	memset(rte, 0, sizeof(struct iosapic_rte_info));
-	if (preallocated)
-		rte->flags |= RTE_PREALLOCATED;
-
-	return rte;
+	return (iosapic_intr_info[irq].count > 1);
 }
 
-static void iosapic_free_rte (struct iosapic_rte_info *rte)
+struct irq_chip*
+ia64_native_iosapic_get_irq_chip(unsigned long trigger)
 {
-	if (rte->flags & RTE_PREALLOCATED)
-		list_add_tail(&rte->rte_list, &free_rte_list);
+	if (trigger == IOSAPIC_EDGE)
+		return &irq_type_iosapic_edge;
 	else
-		kfree(rte);
-}
-
-static inline int vector_is_shared (int vector)
-{
-	return (iosapic_intr_info[vector].count > 1);
+		return &irq_type_iosapic_level;
 }
 
 static int
-register_intr (unsigned int gsi, int vector, unsigned char delivery,
+register_intr (unsigned int gsi, int irq, unsigned char delivery,
 	       unsigned long polarity, unsigned long trigger)
 {
-	irq_desc_t *idesc;
-	struct hw_interrupt_type *irq_type;
-	int rte_index;
+	struct irq_chip *chip, *irq_type;
 	int index;
-	unsigned long gsi_base;
-	void __iomem *iosapic_address;
 	struct iosapic_rte_info *rte;
 
 	index = find_iosapic(gsi);
 	if (index < 0) {
-		printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n", __FUNCTION__, gsi);
+		printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n",
+		       __func__, gsi);
 		return -ENODEV;
 	}
 
-	iosapic_address = iosapic_lists[index].addr;
-	gsi_base = iosapic_lists[index].gsi_base;
-
-	rte = gsi_vector_to_rte(gsi, vector);
+	rte = find_rte(irq, gsi);
 	if (!rte) {
-		rte = iosapic_alloc_rte();
+		rte = kzalloc(sizeof (*rte), GFP_ATOMIC);
 		if (!rte) {
-			printk(KERN_WARNING "%s: cannot allocate memory\n", __FUNCTION__);
+			printk(KERN_WARNING "%s: cannot allocate memory\n",
+			       __func__);
 			return -ENOMEM;
 		}
 
-		rte_index = gsi - gsi_base;
-		rte->rte_index	= rte_index;
-		rte->addr	= iosapic_address;
-		rte->gsi_base	= gsi_base;
+		rte->iosapic	= &iosapic_lists[index];
+		rte->rte_index	= gsi - rte->iosapic->gsi_base;
 		rte->refcnt++;
-		list_add_tail(&rte->rte_list, &iosapic_intr_info[vector].rtes);
-		iosapic_intr_info[vector].count++;
+		list_add_tail(&rte->rte_list, &iosapic_intr_info[irq].rtes);
+		iosapic_intr_info[irq].count++;
 		iosapic_lists[index].rtes_inuse++;
 	}
-	else if (vector_is_shared(vector)) {
-		struct iosapic_intr_info *info = &iosapic_intr_info[vector];
-		if (info->trigger != trigger || info->polarity != polarity) {
-			printk (KERN_WARNING "%s: cannot override the interrupt\n", __FUNCTION__);
+	else if (rte->refcnt == NO_REF_RTE) {
+		struct iosapic_intr_info *info = &iosapic_intr_info[irq];
+		if (info->count > 0 &&
+		    (info->trigger != trigger || info->polarity != polarity)){
+			printk (KERN_WARNING
+				"%s: cannot override the interrupt\n",
+				__func__);
 			return -EINVAL;
 		}
+		rte->refcnt++;
+		iosapic_intr_info[irq].count++;
+		iosapic_lists[index].rtes_inuse++;
 	}
 
-	iosapic_intr_info[vector].polarity = polarity;
-	iosapic_intr_info[vector].dmode    = delivery;
-	iosapic_intr_info[vector].trigger  = trigger;
+	iosapic_intr_info[irq].polarity = polarity;
+	iosapic_intr_info[irq].dmode    = delivery;
+	iosapic_intr_info[irq].trigger  = trigger;
 
-	if (trigger == IOSAPIC_EDGE)
-		irq_type = &irq_type_iosapic_edge;
-	else
-		irq_type = &irq_type_iosapic_level;
-
-	idesc = irq_descp(vector);
-	if (idesc->handler != irq_type) {
-		if (idesc->handler != &no_irq_type)
-			printk(KERN_WARNING "%s: changing vector %d from %s to %s\n",
-			       __FUNCTION__, vector, idesc->handler->typename, irq_type->typename);
-		idesc->handler = irq_type;
+	irq_type = iosapic_get_irq_chip(trigger);
+
+	chip = irq_get_chip(irq);
+	if (irq_type != NULL && chip != irq_type) {
+		if (chip != &no_irq_chip)
+			printk(KERN_WARNING
+			       "%s: changing vector %d from %s to %s\n",
+			       __func__, irq_to_vector(irq),
+			       chip->name, irq_type->name);
+		chip = irq_type;
 	}
+	__irq_set_chip_handler_name_locked(irq, chip, trigger == IOSAPIC_EDGE ?
+					   handle_edge_irq : handle_level_irq,
+					   NULL);
 	return 0;
 }
 
 static unsigned int
-get_target_cpu (unsigned int gsi, int vector)
+get_target_cpu (unsigned int gsi, int irq)
 {
 #ifdef CONFIG_SMP
 	static int cpu = -1;
+	extern int cpe_vector;
+	cpumask_t domain = irq_to_domain(irq);
 
 	/*
 	 * In case of vector shared by multiple RTEs, all RTEs that
 	 * share the vector need to use the same destination CPU.
 	 */
-	if (!list_empty(&iosapic_intr_info[vector].rtes))
-		return iosapic_intr_info[vector].dest;
+	if (iosapic_intr_info[irq].count)
+		return iosapic_intr_info[irq].dest;
 
 	/*
 	 * If the platform supports redirection via XTP, let it
@@ -653,35 +645,39 @@ get_target_cpu (unsigned int gsi, int vector)
 	if (!cpu_online(smp_processor_id()))
 		return cpu_physical_id(smp_processor_id());
 
+#ifdef CONFIG_ACPI
+	if (cpe_vector > 0 && irq_to_vector(irq) == IA64_CPEP_VECTOR)
+		return get_cpei_target_cpu();
+#endif
+
 #ifdef CONFIG_NUMA
 	{
 		int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0;
-		cpumask_t cpu_mask;
+		const struct cpumask *cpu_mask;
 
 		iosapic_index = find_iosapic(gsi);
 		if (iosapic_index < 0 ||
 		    iosapic_lists[iosapic_index].node == MAX_NUMNODES)
 			goto skip_numa_setup;
 
-		cpu_mask = node_to_cpumask(iosapic_lists[iosapic_index].node);
-
-		for_each_cpu_mask(numa_cpu, cpu_mask) {
-			if (!cpu_online(numa_cpu))
-				cpu_clear(numa_cpu, cpu_mask);
+		cpu_mask = cpumask_of_node(iosapic_lists[iosapic_index].node);
+		num_cpus = 0;
+		for_each_cpu_and(numa_cpu, cpu_mask, &domain) {
+			if (cpu_online(numa_cpu))
+				num_cpus++;
 		}
 
-		num_cpus = cpus_weight(cpu_mask);
-
 		if (!num_cpus)
 			goto skip_numa_setup;
 
-		/* Use vector assigment to distribute across cpus in node */
-		cpu_index = vector % num_cpus;
+		/* Use irq assignment to distribute across cpus in node */
+		cpu_index = irq % num_cpus;
 
-		for (numa_cpu = first_cpu(cpu_mask) ; i < cpu_index ; i++)
-			numa_cpu = next_cpu(numa_cpu, cpu_mask);
+		for_each_cpu_and(numa_cpu, cpu_mask, &domain)
+			if (cpu_online(numa_cpu) && i++ >= cpu_index)
+				break;
 
-		if (numa_cpu != NR_CPUS)
+		if (numa_cpu < nr_cpu_ids)
 			return cpu_physical_id(numa_cpu);
 	}
 skip_numa_setup:
@@ -692,16 +688,25 @@ skip_numa_setup:
 	 * case of NUMA.)
 	 */
 	do {
-		if (++cpu >= NR_CPUS)
+		if (++cpu >= nr_cpu_ids)
 			cpu = 0;
-	} while (!cpu_online(cpu));
+	} while (!cpu_online(cpu) || !cpu_isset(cpu, domain));
 
 	return cpu_physical_id(cpu);
-#else
+#else  /* CONFIG_SMP */
 	return cpu_physical_id(smp_processor_id());
 #endif
 }
 
+static inline unsigned char choose_dmode(void)
+{
+#ifdef CONFIG_SMP
+	if (smp_int_redirect & SMP_IRQ_REDIRECTION)
+		return IOSAPIC_LOWEST_PRIORITY;
+#endif
+	return IOSAPIC_FIXED;
+}
+
 /*
  * ACPI can describe IOSAPIC interrupts via static tables and namespace
  * methods.  This provides an interface to register those interrupts and
@@ -711,83 +716,76 @@ int
 iosapic_register_intr (unsigned int gsi,
 		       unsigned long polarity, unsigned long trigger)
 {
-	int vector, mask = 1, err;
+	int irq, mask = 1, err;
 	unsigned int dest;
 	unsigned long flags;
 	struct iosapic_rte_info *rte;
 	u32 low32;
-again:
+	unsigned char dmode;
+	struct irq_desc *desc;
+
 	/*
 	 * If this GSI has already been registered (i.e., it's a
 	 * shared interrupt, or we lost a race to register it),
 	 * don't touch the RTE.
 	 */
 	spin_lock_irqsave(&iosapic_lock, flags);
-	{
-		vector = gsi_to_vector(gsi);
-		if (vector > 0) {
-			rte = gsi_vector_to_rte(gsi, vector);
+	irq = __gsi_to_irq(gsi);
+	if (irq > 0) {
+		rte = find_rte(irq, gsi);
+		if(iosapic_intr_info[irq].count == 0) {
+			assign_irq_vector(irq);
+			irq_init_desc(irq);
+		} else if (rte->refcnt != NO_REF_RTE) {
 			rte->refcnt++;
-			spin_unlock_irqrestore(&iosapic_lock, flags);
-			return vector;
+			goto unlock_iosapic_lock;
 		}
-	}
-	spin_unlock_irqrestore(&iosapic_lock, flags);
+	} else
+		irq = create_irq();
 
 	/* If vector is running out, we try to find a sharable vector */
-	vector = assign_irq_vector(AUTO_ASSIGN);
-	if (vector < 0) {
-		vector = iosapic_find_sharable_vector(trigger, polarity);
-  		if (vector < 0)
-			return -ENOSPC;
+	if (irq < 0) {
+		irq = iosapic_find_sharable_irq(trigger, polarity);
+		if (irq < 0)
+			goto unlock_iosapic_lock;
 	}
 
-	spin_lock_irqsave(&irq_descp(vector)->lock, flags);
-	spin_lock(&iosapic_lock);
-	{
-		if (gsi_to_vector(gsi) > 0) {
-			if (list_empty(&iosapic_intr_info[vector].rtes))
-				free_irq_vector(vector);
-			spin_unlock(&iosapic_lock);
-			spin_unlock_irqrestore(&irq_descp(vector)->lock, flags);
-			goto again;
-		}
-
-		dest = get_target_cpu(gsi, vector);
-		err = register_intr(gsi, vector, IOSAPIC_LOWEST_PRIORITY,
-			      polarity, trigger);
-		if (err < 0) {
-			spin_unlock(&iosapic_lock);
-			spin_unlock_irqrestore(&irq_descp(vector)->lock, flags);
-			return err;
-		}
-
-		/*
-		 * If the vector is shared and already unmasked for
-		 * other interrupt sources, don't mask it.
-		 */
-		low32 = iosapic_intr_info[vector].low32;
-		if (vector_is_shared(vector) && !(low32 & IOSAPIC_MASK))
-			mask = 0;
-		set_rte(gsi, vector, dest, mask);
+	desc = irq_to_desc(irq);
+	raw_spin_lock(&desc->lock);
+	dest = get_target_cpu(gsi, irq);
+	dmode = choose_dmode();
+	err = register_intr(gsi, irq, dmode, polarity, trigger);
+	if (err < 0) {
+		raw_spin_unlock(&desc->lock);
+		irq = err;
+		goto unlock_iosapic_lock;
 	}
-	spin_unlock(&iosapic_lock);
-	spin_unlock_irqrestore(&irq_descp(vector)->lock, flags);
+
+	/*
+	 * If the vector is shared and already unmasked for other
+	 * interrupt sources, don't mask it.
+	 */
+	low32 = iosapic_intr_info[irq].low32;
+	if (irq_is_shared(irq) && !(low32 & IOSAPIC_MASK))
+		mask = 0;
+	set_rte(gsi, irq, dest, mask);
 
 	printk(KERN_INFO "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n",
 	       gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
 	       (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
-	       cpu_logical_id(dest), dest, vector);
+	       cpu_logical_id(dest), dest, irq_to_vector(irq));
 
-	return vector;
+	raw_spin_unlock(&desc->lock);
+ unlock_iosapic_lock:
+	spin_unlock_irqrestore(&iosapic_lock, flags);
+	return irq;
 }
 
 void
 iosapic_unregister_intr (unsigned int gsi)
 {
 	unsigned long flags;
-	int irq, vector, index;
-	irq_desc_t *idesc;
+	int irq, index;
 	u32 low32;
 	unsigned long trigger, polarity;
 	unsigned int dest;
@@ -800,74 +798,64 @@ iosapic_unregister_intr (unsigned int gsi)
 	 */
 	irq = gsi_to_irq(gsi);
 	if (irq < 0) {
-		printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", gsi);
+		printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n",
+		       gsi);
 		WARN_ON(1);
 		return;
 	}
-	vector = irq_to_vector(irq);
 
-	idesc = irq_descp(irq);
-	spin_lock_irqsave(&idesc->lock, flags);
-	spin_lock(&iosapic_lock);
-	{
-		if ((rte = gsi_vector_to_rte(gsi, vector)) == NULL) {
-			printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", gsi);
-			WARN_ON(1);
-			goto out;
-		}
+	spin_lock_irqsave(&iosapic_lock, flags);
+	if ((rte = find_rte(irq, gsi)) == NULL) {
+		printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n",
+		       gsi);
+		WARN_ON(1);
+		goto out;
+	}
 
-		if (--rte->refcnt > 0)
-			goto out;
+	if (--rte->refcnt > 0)
+		goto out;
 
-		/* Mask the interrupt */
-		low32 = iosapic_intr_info[vector].low32 | IOSAPIC_MASK;
-		iosapic_write(rte->addr, IOSAPIC_RTE_LOW(rte->rte_index), low32);
-
-		/* Remove the rte entry from the list */
-		list_del(&rte->rte_list);
-		iosapic_intr_info[vector].count--;
-		iosapic_free_rte(rte);
-		index = find_iosapic(gsi);
-		iosapic_lists[index].rtes_inuse--;
-		WARN_ON(iosapic_lists[index].rtes_inuse < 0);
-
-		trigger	 = iosapic_intr_info[vector].trigger;
-		polarity = iosapic_intr_info[vector].polarity;
-		dest     = iosapic_intr_info[vector].dest;
-		printk(KERN_INFO "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d unregistered\n",
-		       gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
-		       (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
-		       cpu_logical_id(dest), dest, vector);
-
-		if (list_empty(&iosapic_intr_info[vector].rtes)) {
-			/* Sanity check */
-			BUG_ON(iosapic_intr_info[vector].count);
-
-			/* Clear the interrupt controller descriptor */
-			idesc->handler = &no_irq_type;
-
-			/* Clear the interrupt information */
-			memset(&iosapic_intr_info[vector], 0, sizeof(struct iosapic_intr_info));
-			iosapic_intr_info[vector].low32 |= IOSAPIC_MASK;
-			INIT_LIST_HEAD(&iosapic_intr_info[vector].rtes);
-
-			if (idesc->action) {
-				printk(KERN_ERR "interrupt handlers still exist on IRQ %u\n", irq);
-				WARN_ON(1);
-			}
+	rte->refcnt = NO_REF_RTE;
 
-			/* Free the interrupt vector */
-			free_irq_vector(vector);
-		}
+	/* Mask the interrupt */
+	low32 = iosapic_intr_info[irq].low32 | IOSAPIC_MASK;
+	iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte->rte_index), low32);
+
+	iosapic_intr_info[irq].count--;
+	index = find_iosapic(gsi);
+	iosapic_lists[index].rtes_inuse--;
+	WARN_ON(iosapic_lists[index].rtes_inuse < 0);
+
+	trigger  = iosapic_intr_info[irq].trigger;
+	polarity = iosapic_intr_info[irq].polarity;
+	dest     = iosapic_intr_info[irq].dest;
+	printk(KERN_INFO
+	       "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d unregistered\n",
+	       gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
+	       (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
+	       cpu_logical_id(dest), dest, irq_to_vector(irq));
+
+	if (iosapic_intr_info[irq].count == 0) {
+#ifdef CONFIG_SMP
+		/* Clear affinity */
+		cpumask_setall(irq_get_irq_data(irq)->affinity);
+#endif
+		/* Clear the interrupt information */
+		iosapic_intr_info[irq].dest = 0;
+		iosapic_intr_info[irq].dmode = 0;
+		iosapic_intr_info[irq].polarity = 0;
+		iosapic_intr_info[irq].trigger = 0;
+		iosapic_intr_info[irq].low32 |= IOSAPIC_MASK;
+
+		/* Destroy and reserve IRQ */
+		destroy_and_reserve_irq(irq);
 	}
  out:
-	spin_unlock(&iosapic_lock);
-	spin_unlock_irqrestore(&idesc->lock, flags);
+	spin_unlock_irqrestore(&iosapic_lock, flags);
 }
 
 /*
  * ACPI calls this when it finds an entry for a platform interrupt.
- * Note that the irq_base and IOSAPIC address must be set in iosapic_init().
  */
 int __init
 iosapic_register_platform_intr (u32 int_type, unsigned int gsi,
@@ -876,94 +864,110 @@ iosapic_register_platform_intr (u32 int_type, unsigned int gsi,
 {
 	static const char * const name[] = {"unknown", "PMI", "INIT", "CPEI"};
 	unsigned char delivery;
-	int vector, mask = 0;
+	int irq, vector, mask = 0;
 	unsigned int dest = ((id << 8) | eid) & 0xffff;
 
 	switch (int_type) {
 	      case ACPI_INTERRUPT_PMI:
-		vector = iosapic_vector;
+		irq = vector = iosapic_vector;
+		bind_irq_vector(irq, vector, CPU_MASK_ALL);
 		/*
 		 * since PMI vector is alloc'd by FW(ACPI) not by kernel,
 		 * we need to make sure the vector is available
 		 */
-		iosapic_reassign_vector(vector);
+		iosapic_reassign_vector(irq);
 		delivery = IOSAPIC_PMI;
 		break;
 	      case ACPI_INTERRUPT_INIT:
-		vector = assign_irq_vector(AUTO_ASSIGN);
-		if (vector < 0)
-			panic("%s: out of interrupt vectors!\n", __FUNCTION__);
+		irq = create_irq();
+		if (irq < 0)
+			panic("%s: out of interrupt vectors!\n", __func__);
+		vector = irq_to_vector(irq);
 		delivery = IOSAPIC_INIT;
 		break;
 	      case ACPI_INTERRUPT_CPEI:
-		vector = IA64_CPE_VECTOR;
-		delivery = IOSAPIC_LOWEST_PRIORITY;
+		irq = vector = IA64_CPE_VECTOR;
+		BUG_ON(bind_irq_vector(irq, vector, CPU_MASK_ALL));
+		delivery = IOSAPIC_FIXED;
 		mask = 1;
 		break;
 	      default:
-		printk(KERN_ERR "iosapic_register_platform_irq(): invalid int type 0x%x\n", int_type);
+		printk(KERN_ERR "%s: invalid int type 0x%x\n", __func__,
+		       int_type);
 		return -1;
 	}
 
-	register_intr(gsi, vector, delivery, polarity, trigger);
+	register_intr(gsi, irq, delivery, polarity, trigger);
 
-	printk(KERN_INFO "PLATFORM int %s (0x%x): GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n",
+	printk(KERN_INFO
+	       "PLATFORM int %s (0x%x): GSI %u (%s, %s) -> CPU %d (0x%04x)"
+	       " vector %d\n",
 	       int_type < ARRAY_SIZE(name) ? name[int_type] : "unknown",
 	       int_type, gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
 	       (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
 	       cpu_logical_id(dest), dest, vector);
 
-	set_rte(gsi, vector, dest, mask);
+	set_rte(gsi, irq, dest, mask);
 	return vector;
 }
 
-
 /*
  * ACPI calls this when it finds an entry for a legacy ISA IRQ override.
- * Note that the gsi_base and IOSAPIC address must be set in iosapic_init().
  */
-void __init
-iosapic_override_isa_irq (unsigned int isa_irq, unsigned int gsi,
-			  unsigned long polarity,
-			  unsigned long trigger)
+void iosapic_override_isa_irq(unsigned int isa_irq, unsigned int gsi,
+			      unsigned long polarity, unsigned long trigger)
 {
-	int vector;
+	int vector, irq;
 	unsigned int dest = cpu_physical_id(smp_processor_id());
+	unsigned char dmode;
 
-	vector = isa_irq_to_vector(isa_irq);
-
-	register_intr(gsi, vector, IOSAPIC_LOWEST_PRIORITY, polarity, trigger);
+	irq = vector = isa_irq_to_vector(isa_irq);
+	BUG_ON(bind_irq_vector(irq, vector, CPU_MASK_ALL));
+	dmode = choose_dmode();
+	register_intr(gsi, irq, dmode, polarity, trigger);
 
 	DBG("ISA: IRQ %u -> GSI %u (%s,%s) -> CPU %d (0x%04x) vector %d\n",
 	    isa_irq, gsi, trigger == IOSAPIC_EDGE ? "edge" : "level",
 	    polarity == IOSAPIC_POL_HIGH ? "high" : "low",
 	    cpu_logical_id(dest), dest, vector);
 
-	set_rte(gsi, vector, dest, 1);
+	set_rte(gsi, irq, dest, 1);
 }
 
 void __init
-iosapic_system_init (int system_pcat_compat)
+ia64_native_iosapic_pcat_compat_init(void)
 {
-	int vector;
-
-	for (vector = 0; vector < IA64_NUM_VECTORS; ++vector) {
-		iosapic_intr_info[vector].low32 = IOSAPIC_MASK;
-		INIT_LIST_HEAD(&iosapic_intr_info[vector].rtes);	/* mark as unused */
-	}
-
-	pcat_compat = system_pcat_compat;
 	if (pcat_compat) {
 		/*
-		 * Disable the compatibility mode interrupts (8259 style), needs IN/OUT support
-		 * enabled.
+		 * Disable the compatibility mode interrupts (8259 style),
+		 * needs IN/OUT support enabled.
 		 */
-		printk(KERN_INFO "%s: Disabling PC-AT compatible 8259 interrupts\n", __FUNCTION__);
+		printk(KERN_INFO
+		       "%s: Disabling PC-AT compatible 8259 interrupts\n",
+		       __func__);
 		outb(0xff, 0xA1);
 		outb(0xff, 0x21);
 	}
 }
 
+void __init
+iosapic_system_init (int system_pcat_compat)
+{
+	int irq;
+
+	for (irq = 0; irq < NR_IRQS; ++irq) {
+		iosapic_intr_info[irq].low32 = IOSAPIC_MASK;
+		/* mark as unused */
+		INIT_LIST_HEAD(&iosapic_intr_info[irq].rtes);
+
+		iosapic_intr_info[irq].count = 0;
+	}
+
+	pcat_compat = system_pcat_compat;
+	if (pcat_compat)
+		iosapic_pcat_compat_init();
+}
+
 static inline int
 iosapic_alloc (void)
 {
@@ -973,7 +977,7 @@ iosapic_alloc (void)
 		if (!iosapic_lists[index].addr)
 			return index;
 
-	printk(KERN_WARNING "%s: failed to allocate iosapic\n", __FUNCTION__);
+	printk(KERN_WARNING "%s: failed to allocate iosapic\n", __func__);
 	return -1;
 }
 
@@ -998,10 +1002,7 @@ iosapic_check_gsi_range (unsigned int gsi_base, unsigned int ver)
 		base = iosapic_lists[index].gsi_base;
 		end  = base + iosapic_lists[index].num_rte - 1;
 
-		if (gsi_base < base && gsi_end < base)
-			continue;/* OK */
-
-		if (gsi_base > end && gsi_end > end)
+		if (gsi_end < base || end < gsi_base)
 			continue; /* OK */
 
 		return -EBUSY;
@@ -1009,8 +1010,27 @@ iosapic_check_gsi_range (unsigned int gsi_base, unsigned int ver)
 	return 0;
 }
 
-int __devinit
-iosapic_init (unsigned long phys_addr, unsigned int gsi_base)
+static int
+iosapic_delete_rte(unsigned int irq, unsigned int gsi)
+{
+	struct iosapic_rte_info *rte, *temp;
+
+	list_for_each_entry_safe(rte, temp, &iosapic_intr_info[irq].rtes,
+								rte_list) {
+		if (rte->iosapic->gsi_base + rte->rte_index == gsi) {
+			if (rte->refcnt)
+				return -EBUSY;
+
+			list_del(&rte->rte_list);
+			kfree(rte);
+			return 0;
+		}
+	}
+
+	return -EINVAL;
+}
+
+int iosapic_init(unsigned long phys_addr, unsigned int gsi_base)
 {
 	int num_rte, err, index;
 	unsigned int isa_irq, ver;
@@ -1018,97 +1038,104 @@ iosapic_init (unsigned long phys_addr, unsigned int gsi_base)
 	unsigned long flags;
 
 	spin_lock_irqsave(&iosapic_lock, flags);
-	{
-		addr = ioremap(phys_addr, 0);
-		ver = iosapic_version(addr);
+	index = find_iosapic(gsi_base);
+	if (index >= 0) {
+		spin_unlock_irqrestore(&iosapic_lock, flags);
+		return -EBUSY;
+	}
 
-		if ((err = iosapic_check_gsi_range(gsi_base, ver))) {
-			iounmap(addr);
-			spin_unlock_irqrestore(&iosapic_lock, flags);
-			return err;
-		}
+	addr = ioremap(phys_addr, 0);
+	if (addr == NULL) {
+		spin_unlock_irqrestore(&iosapic_lock, flags);
+		return -ENOMEM;
+	}
+	ver = iosapic_version(addr);
+	if ((err = iosapic_check_gsi_range(gsi_base, ver))) {
+		iounmap(addr);
+		spin_unlock_irqrestore(&iosapic_lock, flags);
+		return err;
+	}
 
-		/*
-		 * The MAX_REDIR register holds the highest input pin
-		 * number (starting from 0).
-		 * We add 1 so that we can use it for number of pins (= RTEs)
-		 */
-		num_rte = ((ver >> 16) & 0xff) + 1;
+	/*
+	 * The MAX_REDIR register holds the highest input pin number
+	 * (starting from 0).  We add 1 so that we can use it for
+	 * number of pins (= RTEs)
+	 */
+	num_rte = ((ver >> 16) & 0xff) + 1;
 
-		index = iosapic_alloc();
-		iosapic_lists[index].addr = addr;
-		iosapic_lists[index].gsi_base = gsi_base;
-		iosapic_lists[index].num_rte = num_rte;
+	index = iosapic_alloc();
+	iosapic_lists[index].addr = addr;
+	iosapic_lists[index].gsi_base = gsi_base;
+	iosapic_lists[index].num_rte = num_rte;
 #ifdef CONFIG_NUMA
-		iosapic_lists[index].node = MAX_NUMNODES;
+	iosapic_lists[index].node = MAX_NUMNODES;
 #endif
-	}
+	spin_lock_init(&iosapic_lists[index].lock);
 	spin_unlock_irqrestore(&iosapic_lock, flags);
 
 	if ((gsi_base == 0) && pcat_compat) {
 		/*
-		 * Map the legacy ISA devices into the IOSAPIC data.  Some of these may
-		 * get reprogrammed later on with data from the ACPI Interrupt Source
-		 * Override table.
+		 * Map the legacy ISA devices into the IOSAPIC data.  Some of
+		 * these may get reprogrammed later on with data from the ACPI
+		 * Interrupt Source Override table.
 		 */
 		for (isa_irq = 0; isa_irq < 16; ++isa_irq)
-			iosapic_override_isa_irq(isa_irq, isa_irq, IOSAPIC_POL_HIGH, IOSAPIC_EDGE);
+			iosapic_override_isa_irq(isa_irq, isa_irq,
+						 IOSAPIC_POL_HIGH,
+						 IOSAPIC_EDGE);
 	}
 	return 0;
 }
 
-#ifdef CONFIG_HOTPLUG
-int
-iosapic_remove (unsigned int gsi_base)
+int iosapic_remove(unsigned int gsi_base)
 {
-	int index, err = 0;
+	int i, irq, index, err = 0;
 	unsigned long flags;
 
 	spin_lock_irqsave(&iosapic_lock, flags);
-	{
-		index = find_iosapic(gsi_base);
-		if (index < 0) {
-			printk(KERN_WARNING "%s: No IOSAPIC for GSI base %u\n",
-			       __FUNCTION__, gsi_base);
-			goto out;
-		}
+	index = find_iosapic(gsi_base);
+	if (index < 0) {
+		printk(KERN_WARNING "%s: No IOSAPIC for GSI base %u\n",
+		       __func__, gsi_base);
+		goto out;
+	}
 
-		if (iosapic_lists[index].rtes_inuse) {
-			err = -EBUSY;
-			printk(KERN_WARNING "%s: IOSAPIC for GSI base %u is busy\n",
-			       __FUNCTION__, gsi_base);
-			goto out;
-		}
+	if (iosapic_lists[index].rtes_inuse) {
+		err = -EBUSY;
+		printk(KERN_WARNING "%s: IOSAPIC for GSI base %u is busy\n",
+		       __func__, gsi_base);
+		goto out;
+	}
+
+	for (i = gsi_base; i < gsi_base + iosapic_lists[index].num_rte; i++) {
+		irq = __gsi_to_irq(i);
+		if (irq < 0)
+			continue;
 
-		iounmap(iosapic_lists[index].addr);
-		iosapic_free(index);
+		err = iosapic_delete_rte(irq, i);
+		if (err)
+			goto out;
 	}
+
+	iounmap(iosapic_lists[index].addr);
+	iosapic_free(index);
  out:
 	spin_unlock_irqrestore(&iosapic_lock, flags);
 	return err;
 }
-#endif /* CONFIG_HOTPLUG */
 
 #ifdef CONFIG_NUMA
-void __devinit
-map_iosapic_to_node(unsigned int gsi_base, int node)
+void map_iosapic_to_node(unsigned int gsi_base, int node)
 {
 	int index;
 
 	index = find_iosapic(gsi_base);
 	if (index < 0) {
 		printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n",
-		       __FUNCTION__, gsi_base);
+		       __func__, gsi_base);
 		return;
 	}
 	iosapic_lists[index].node = node;
 	return;
 }
 #endif
-
-static int __init iosapic_enable_kmalloc (void)
-{
-	iosapic_kmalloc_ok = 1;
-	return 0;
-}
-core_initcall (iosapic_enable_kmalloc);
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index d33244c3275..f2c41828113 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -4,7 +4,7 @@
  *	Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
  *
  * This file contains the code used by various IRQ handling routines:
- * asking for different IRQ's should be done through these routines
+ * asking for different IRQs should be done through these routines
  * instead of just grabbing them. Thus setups with different IRQ numbers
  * shouldn't result in any weird surprises, and installing new handlers
  * should be easier.
@@ -12,7 +12,7 @@
  * Copyright (C) Ashok Raj<ashok.raj@intel.com>, Intel Corporation 2004
  *
  * 4/14/2004: Added code to handle cpu migration and do safe irq
- *			migration without lossing interrupts for iosapic
+ *			migration without losing interrupts for iosapic
  *			architecture.
  */
 
@@ -23,6 +23,8 @@
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
 
+#include <asm/mca.h>
+
 /*
  * 'what should we do if we get a hw irq event on an illegal vector'.
  * each architecture has to answer this themselves.
@@ -33,9 +35,14 @@ void ack_bad_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_IA64_GENERIC
+ia64_vector __ia64_irq_to_vector(int irq)
+{
+	return irq_cfg[irq].vector;
+}
+
 unsigned int __ia64_local_vector_to_irq (ia64_vector vec)
 {
-	return (unsigned int) vec;
+	return __get_cpu_var(vector_irq)[vec];
 }
 #endif
 
@@ -48,45 +55,9 @@ atomic_t irq_err_count;
 /*
  * /proc/interrupts printing:
  */
-
-int show_interrupts(struct seq_file *p, void *v)
+int arch_show_interrupts(struct seq_file *p, int prec)
 {
-	int i = *(loff_t *) v, j;
-	struct irqaction * action;
-	unsigned long flags;
-
-	if (i == 0) {
-		seq_printf(p, "           ");
-		for_each_online_cpu(j) {
-			seq_printf(p, "CPU%d       ",j);
-		}
-		seq_putc(p, '\n');
-	}
-
-	if (i < NR_IRQS) {
-		spin_lock_irqsave(&irq_desc[i].lock, flags);
-		action = irq_desc[i].action;
-		if (!action)
-			goto skip;
-		seq_printf(p, "%3d: ",i);
-#ifndef CONFIG_SMP
-		seq_printf(p, "%10u ", kstat_irqs(i));
-#else
-		for_each_online_cpu(j) {
-			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
-		}
-#endif
-		seq_printf(p, " %14s", irq_desc[i].handler->typename);
-		seq_printf(p, "  %s", action->name);
-
-		for (action=action->next; action; action = action->next)
-			seq_printf(p, ", %s", action->name);
-
-		seq_putc(p, '\n');
-skip:
-		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
-	} else if (i == NR_IRQS)
-		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+	seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
 	return 0;
 }
 
@@ -95,33 +66,49 @@ static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
 
 void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 {
-	cpumask_t mask = CPU_MASK_NONE;
-
-	cpu_set(cpu_logical_id(hwid), mask);
-
 	if (irq < NR_IRQS) {
-		irq_affinity[irq] = mask;
-		set_irq_info(irq, mask);
+		cpumask_copy(irq_get_irq_data(irq)->affinity,
+			     cpumask_of(cpu_logical_id(hwid)));
 		irq_redir[irq] = (char) (redir & 0xff);
 	}
 }
+
+bool is_affinity_mask_valid(const struct cpumask *cpumask)
+{
+	if (ia64_platform_is("sn2")) {
+		/* Only allow one CPU to be specified in the smp_affinity mask */
+		if (cpumask_weight(cpumask) != 1)
+			return false;
+	}
+	return true;
+}
+
 #endif /* CONFIG_SMP */
 
+int __init arch_early_irq_init(void)
+{
+	ia64_mca_irq_init();
+	return 0;
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 unsigned int vectors_in_migration[NR_IRQS];
 
 /*
- * Since cpu_online_map is already updated, we just need to check for
+ * Since cpu_online_mask is already updated, we just need to check for
  * affinity that has zeros
  */
 static void migrate_irqs(void)
 {
-	cpumask_t	mask;
-	irq_desc_t *desc;
 	int 		irq, new_cpu;
 
 	for (irq=0; irq < NR_IRQS; irq++) {
-		desc = irq_descp(irq);
+		struct irq_desc *desc = irq_to_desc(irq);
+		struct irq_data *data = irq_desc_get_irq_data(desc);
+		struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+		if (irqd_irq_disabled(data))
+			continue;
 
 		/*
 		 * No handling for now.
@@ -129,31 +116,31 @@ static void migrate_irqs(void)
 		 * tell CPU not to respond to these local intr sources.
 		 * such as ITV,CPEI,MCA etc.
 		 */
-		if (desc->status == IRQ_PER_CPU)
+		if (irqd_is_per_cpu(data))
 			continue;
 
-		cpus_and(mask, irq_affinity[irq], cpu_online_map);
-		if (any_online_cpu(mask) == NR_CPUS) {
+		if (cpumask_any_and(data->affinity, cpu_online_mask)
+		    >= nr_cpu_ids) {
 			/*
 			 * Save it for phase 2 processing
 			 */
 			vectors_in_migration[irq] = irq;
 
-			new_cpu = any_online_cpu(cpu_online_map);
-			mask = cpumask_of_cpu(new_cpu);
+			new_cpu = cpumask_any(cpu_online_mask);
 
 			/*
 			 * Al three are essential, currently WARN_ON.. maybe panic?
 			 */
-			if (desc->handler && desc->handler->disable &&
-				desc->handler->enable && desc->handler->set_affinity) {
-				desc->handler->disable(irq);
-				desc->handler->set_affinity(irq, mask);
-				desc->handler->enable(irq);
+			if (chip && chip->irq_disable &&
+				chip->irq_enable && chip->irq_set_affinity) {
+				chip->irq_disable(data);
+				chip->irq_set_affinity(data,
+						       cpumask_of(new_cpu), false);
+				chip->irq_enable(data);
 			} else {
-				WARN_ON((!(desc->handler) || !(desc->handler->disable) ||
-						!(desc->handler->enable) ||
-						!(desc->handler->set_affinity)));
+				WARN_ON((!chip || !chip->irq_disable ||
+					 !chip->irq_enable ||
+					 !chip->irq_set_affinity));
 			}
 		}
 	}
@@ -163,10 +150,21 @@ void fixup_irqs(void)
 {
 	unsigned int irq;
 	extern void ia64_process_pending_intr(void);
+	extern volatile int time_keeper_id;
+
+	/* Mask ITV to disable timer */
+	ia64_set_itv(1 << 16);
 
-	ia64_set_itv(1<<16);
 	/*
-	 * Phase 1: Locate irq's bound to this cpu and
+	 * Find a new timesync master
+	 */
+	if (smp_processor_id() == time_keeper_id) {
+		time_keeper_id = cpumask_first(cpu_online_mask);
+		printk ("CPU %d is now promoted to time-keeper master\n", time_keeper_id);
+	}
+
+	/*
+	 * Phase 1: Locate IRQs bound to this cpu and
 	 * relocate them for cpu removal.
 	 */
 	migrate_irqs();
@@ -184,8 +182,11 @@ void fixup_irqs(void)
 	 */
 	for (irq=0; irq < NR_IRQS; irq++) {
 		if (vectors_in_migration[irq]) {
+			struct pt_regs *old_regs = set_irq_regs(NULL);
+
 			vectors_in_migration[irq]=0;
-			__do_IRQ(irq, NULL);
+			generic_handle_irq(irq);
+			set_irq_regs(old_regs);
 		}
 	}
 
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 6c4d59fd036..03ea78ed64a 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -1,5 +1,5 @@
 /*
- * linux/arch/ia64/kernel/irq.c
+ * linux/arch/ia64/kernel/irq_ia64.c
  *
  * Copyright (C) 1998-2001 Hewlett-Packard Co
  *	Stephane Eranian <eranian@hpl.hp.com>
@@ -14,7 +14,6 @@
  *						Added CPU Hotplug handling for IPF.
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 
 #include <linux/jiffies.h>
@@ -23,14 +22,15 @@
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/kernel_stat.h>
-#include <linux/slab.h>
 #include <linux/ptrace.h>
-#include <linux/random.h>	/* for rand_initialize_irq() */
 #include <linux/signal.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/threads.h>
 #include <linux/bitops.h>
+#include <linux/irq.h>
+#include <linux/ratelimit.h>
+#include <linux/acpi.h>
+#include <linux/sched.h>
 
 #include <asm/delay.h>
 #include <asm/intrinsics.h>
@@ -38,7 +38,7 @@
 #include <asm/hw_irq.h>
 #include <asm/machvec.h>
 #include <asm/pgtable.h>
-#include <asm/system.h>
+#include <asm/tlbflush.h>
 
 #ifdef CONFIG_PERFMON
 # include <asm/perfmon.h>
@@ -46,10 +46,22 @@
 
 #define IRQ_DEBUG	0
 
+#define IRQ_VECTOR_UNASSIGNED	(0)
+
+#define IRQ_UNUSED		(0)
+#define IRQ_USED		(1)
+#define IRQ_RSVD		(2)
+
+/* These can be overridden in platform_irq_init */
+int ia64_first_device_vector = IA64_DEF_FIRST_DEVICE_VECTOR;
+int ia64_last_device_vector = IA64_DEF_LAST_DEVICE_VECTOR;
+
 /* default base addr of IPI table */
 void __iomem *ipi_base_addr = ((void __iomem *)
 			       (__IA64_UNCACHED_OFFSET | IA64_IPI_DEFAULT_BASE_ADDR));
 
+static cpumask_t vector_allocation_domain(int cpu);
+
 /*
  * Legacy IRQ to IA-64 vector translation table.
  */
@@ -60,39 +72,365 @@ __u8 isa_irq_to_vector_map[16] = {
 };
 EXPORT_SYMBOL(isa_irq_to_vector_map);
 
-static unsigned long ia64_vector_mask[BITS_TO_LONGS(IA64_NUM_DEVICE_VECTORS)];
+DEFINE_SPINLOCK(vector_lock);
 
-int
-assign_irq_vector (int irq)
+struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
+	[0 ... NR_IRQS - 1] = {
+		.vector = IRQ_VECTOR_UNASSIGNED,
+		.domain = CPU_MASK_NONE
+	}
+};
+
+DEFINE_PER_CPU(int[IA64_NUM_VECTORS], vector_irq) = {
+	[0 ... IA64_NUM_VECTORS - 1] = -1
+};
+
+static cpumask_t vector_table[IA64_NUM_VECTORS] = {
+	[0 ... IA64_NUM_VECTORS - 1] = CPU_MASK_NONE
+};
+
+static int irq_status[NR_IRQS] = {
+	[0 ... NR_IRQS -1] = IRQ_UNUSED
+};
+
+static inline int find_unassigned_irq(void)
 {
+	int irq;
+
+	for (irq = IA64_FIRST_DEVICE_VECTOR; irq < NR_IRQS; irq++)
+		if (irq_status[irq] == IRQ_UNUSED)
+			return irq;
+	return -ENOSPC;
+}
+
+static inline int find_unassigned_vector(cpumask_t domain)
+{
+	cpumask_t mask;
 	int pos, vector;
- again:
-	pos = find_first_zero_bit(ia64_vector_mask, IA64_NUM_DEVICE_VECTORS);
-	vector = IA64_FIRST_DEVICE_VECTOR + pos;
-	if (vector > IA64_LAST_DEVICE_VECTOR)
-		return -ENOSPC;
-	if (test_and_set_bit(pos, ia64_vector_mask))
-		goto again;
+
+	cpumask_and(&mask, &domain, cpu_online_mask);
+	if (cpus_empty(mask))
+		return -EINVAL;
+
+	for (pos = 0; pos < IA64_NUM_DEVICE_VECTORS; pos++) {
+		vector = IA64_FIRST_DEVICE_VECTOR + pos;
+		cpus_and(mask, domain, vector_table[vector]);
+		if (!cpus_empty(mask))
+			continue;
+		return vector;
+	}
+	return -ENOSPC;
+}
+
+static int __bind_irq_vector(int irq, int vector, cpumask_t domain)
+{
+	cpumask_t mask;
+	int cpu;
+	struct irq_cfg *cfg = &irq_cfg[irq];
+
+	BUG_ON((unsigned)irq >= NR_IRQS);
+	BUG_ON((unsigned)vector >= IA64_NUM_VECTORS);
+
+	cpumask_and(&mask, &domain, cpu_online_mask);
+	if (cpus_empty(mask))
+		return -EINVAL;
+	if ((cfg->vector == vector) && cpus_equal(cfg->domain, domain))
+		return 0;
+	if (cfg->vector != IRQ_VECTOR_UNASSIGNED)
+		return -EBUSY;
+	for_each_cpu_mask(cpu, mask)
+		per_cpu(vector_irq, cpu)[vector] = irq;
+	cfg->vector = vector;
+	cfg->domain = domain;
+	irq_status[irq] = IRQ_USED;
+	cpus_or(vector_table[vector], vector_table[vector], domain);
+	return 0;
+}
+
+int bind_irq_vector(int irq, int vector, cpumask_t domain)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&vector_lock, flags);
+	ret = __bind_irq_vector(irq, vector, domain);
+	spin_unlock_irqrestore(&vector_lock, flags);
+	return ret;
+}
+
+static void __clear_irq_vector(int irq)
+{
+	int vector, cpu;
+	cpumask_t mask;
+	cpumask_t domain;
+	struct irq_cfg *cfg = &irq_cfg[irq];
+
+	BUG_ON((unsigned)irq >= NR_IRQS);
+	BUG_ON(cfg->vector == IRQ_VECTOR_UNASSIGNED);
+	vector = cfg->vector;
+	domain = cfg->domain;
+	cpumask_and(&mask, &cfg->domain, cpu_online_mask);
+	for_each_cpu_mask(cpu, mask)
+		per_cpu(vector_irq, cpu)[vector] = -1;
+	cfg->vector = IRQ_VECTOR_UNASSIGNED;
+	cfg->domain = CPU_MASK_NONE;
+	irq_status[irq] = IRQ_UNUSED;
+	cpus_andnot(vector_table[vector], vector_table[vector], domain);
+}
+
+static void clear_irq_vector(int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&vector_lock, flags);
+	__clear_irq_vector(irq);
+	spin_unlock_irqrestore(&vector_lock, flags);
+}
+
+int
+ia64_native_assign_irq_vector (int irq)
+{
+	unsigned long flags;
+	int vector, cpu;
+	cpumask_t domain = CPU_MASK_NONE;
+
+	vector = -ENOSPC;
+
+	spin_lock_irqsave(&vector_lock, flags);
+	for_each_online_cpu(cpu) {
+		domain = vector_allocation_domain(cpu);
+		vector = find_unassigned_vector(domain);
+		if (vector >= 0)
+			break;
+	}
+	if (vector < 0)
+		goto out;
+	if (irq == AUTO_ASSIGN)
+		irq = vector;
+	BUG_ON(__bind_irq_vector(irq, vector, domain));
+ out:
+	spin_unlock_irqrestore(&vector_lock, flags);
 	return vector;
 }
 
 void
-free_irq_vector (int vector)
+ia64_native_free_irq_vector (int vector)
+{
+	if (vector < IA64_FIRST_DEVICE_VECTOR ||
+	    vector > IA64_LAST_DEVICE_VECTOR)
+		return;
+	clear_irq_vector(vector);
+}
+
+int
+reserve_irq_vector (int vector)
+{
+	if (vector < IA64_FIRST_DEVICE_VECTOR ||
+	    vector > IA64_LAST_DEVICE_VECTOR)
+		return -EINVAL;
+	return !!bind_irq_vector(vector, vector, CPU_MASK_ALL);
+}
+
+/*
+ * Initialize vector_irq on a new cpu. This function must be called
+ * with vector_lock held.
+ */
+void __setup_vector_irq(int cpu)
+{
+	int irq, vector;
+
+	/* Clear vector_irq */
+	for (vector = 0; vector < IA64_NUM_VECTORS; ++vector)
+		per_cpu(vector_irq, cpu)[vector] = -1;
+	/* Mark the inuse vectors */
+	for (irq = 0; irq < NR_IRQS; ++irq) {
+		if (!cpu_isset(cpu, irq_cfg[irq].domain))
+			continue;
+		vector = irq_to_vector(irq);
+		per_cpu(vector_irq, cpu)[vector] = irq;
+	}
+}
+
+#if defined(CONFIG_SMP) && (defined(CONFIG_IA64_GENERIC) || defined(CONFIG_IA64_DIG))
+
+static enum vector_domain_type {
+	VECTOR_DOMAIN_NONE,
+	VECTOR_DOMAIN_PERCPU
+} vector_domain_type = VECTOR_DOMAIN_NONE;
+
+static cpumask_t vector_allocation_domain(int cpu)
 {
-	int pos;
+	if (vector_domain_type == VECTOR_DOMAIN_PERCPU)
+		return cpumask_of_cpu(cpu);
+	return CPU_MASK_ALL;
+}
+
+static int __irq_prepare_move(int irq, int cpu)
+{
+	struct irq_cfg *cfg = &irq_cfg[irq];
+	int vector;
+	cpumask_t domain;
+
+	if (cfg->move_in_progress || cfg->move_cleanup_count)
+		return -EBUSY;
+	if (cfg->vector == IRQ_VECTOR_UNASSIGNED || !cpu_online(cpu))
+		return -EINVAL;
+	if (cpu_isset(cpu, cfg->domain))
+		return 0;
+	domain = vector_allocation_domain(cpu);
+	vector = find_unassigned_vector(domain);
+	if (vector < 0)
+		return -ENOSPC;
+	cfg->move_in_progress = 1;
+	cfg->old_domain = cfg->domain;
+	cfg->vector = IRQ_VECTOR_UNASSIGNED;
+	cfg->domain = CPU_MASK_NONE;
+	BUG_ON(__bind_irq_vector(irq, vector, domain));
+	return 0;
+}
+
+int irq_prepare_move(int irq, int cpu)
+{
+	unsigned long flags;
+	int ret;
 
-	if (vector < IA64_FIRST_DEVICE_VECTOR || vector > IA64_LAST_DEVICE_VECTOR)
+	spin_lock_irqsave(&vector_lock, flags);
+	ret = __irq_prepare_move(irq, cpu);
+	spin_unlock_irqrestore(&vector_lock, flags);
+	return ret;
+}
+
+void irq_complete_move(unsigned irq)
+{
+	struct irq_cfg *cfg = &irq_cfg[irq];
+	cpumask_t cleanup_mask;
+	int i;
+
+	if (likely(!cfg->move_in_progress))
+		return;
+
+	if (unlikely(cpu_isset(smp_processor_id(), cfg->old_domain)))
 		return;
 
-	pos = vector - IA64_FIRST_DEVICE_VECTOR;
-	if (!test_and_clear_bit(pos, ia64_vector_mask))
-		printk(KERN_WARNING "%s: double free!\n", __FUNCTION__);
+	cpumask_and(&cleanup_mask, &cfg->old_domain, cpu_online_mask);
+	cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+	for_each_cpu_mask(i, cleanup_mask)
+		platform_send_ipi(i, IA64_IRQ_MOVE_VECTOR, IA64_IPI_DM_INT, 0);
+	cfg->move_in_progress = 0;
+}
+
+static irqreturn_t smp_irq_move_cleanup_interrupt(int irq, void *dev_id)
+{
+	int me = smp_processor_id();
+	ia64_vector vector;
+	unsigned long flags;
+
+	for (vector = IA64_FIRST_DEVICE_VECTOR;
+	     vector < IA64_LAST_DEVICE_VECTOR; vector++) {
+		int irq;
+		struct irq_desc *desc;
+		struct irq_cfg *cfg;
+		irq = __get_cpu_var(vector_irq)[vector];
+		if (irq < 0)
+			continue;
+
+		desc = irq_to_desc(irq);
+		cfg = irq_cfg + irq;
+		raw_spin_lock(&desc->lock);
+		if (!cfg->move_cleanup_count)
+			goto unlock;
+
+		if (!cpu_isset(me, cfg->old_domain))
+			goto unlock;
+
+		spin_lock_irqsave(&vector_lock, flags);
+		__get_cpu_var(vector_irq)[vector] = -1;
+		cpu_clear(me, vector_table[vector]);
+		spin_unlock_irqrestore(&vector_lock, flags);
+		cfg->move_cleanup_count--;
+	unlock:
+		raw_spin_unlock(&desc->lock);
+	}
+	return IRQ_HANDLED;
+}
+
+static struct irqaction irq_move_irqaction = {
+	.handler =	smp_irq_move_cleanup_interrupt,
+	.name =		"irq_move"
+};
+
+static int __init parse_vector_domain(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+	if (!strcmp(arg, "percpu")) {
+		vector_domain_type = VECTOR_DOMAIN_PERCPU;
+		no_int_routing = 1;
+	}
+	return 0;
+}
+early_param("vector", parse_vector_domain);
+#else
+static cpumask_t vector_allocation_domain(int cpu)
+{
+	return CPU_MASK_ALL;
+}
+#endif
+
+
+void destroy_and_reserve_irq(unsigned int irq)
+{
+	unsigned long flags;
+
+	irq_init_desc(irq);
+	spin_lock_irqsave(&vector_lock, flags);
+	__clear_irq_vector(irq);
+	irq_status[irq] = IRQ_RSVD;
+	spin_unlock_irqrestore(&vector_lock, flags);
+}
+
+/*
+ * Dynamic irq allocate and deallocation for MSI
+ */
+int create_irq(void)
+{
+	unsigned long flags;
+	int irq, vector, cpu;
+	cpumask_t domain = CPU_MASK_NONE;
+
+	irq = vector = -ENOSPC;
+	spin_lock_irqsave(&vector_lock, flags);
+	for_each_online_cpu(cpu) {
+		domain = vector_allocation_domain(cpu);
+		vector = find_unassigned_vector(domain);
+		if (vector >= 0)
+			break;
+	}
+	if (vector < 0)
+		goto out;
+	irq = find_unassigned_irq();
+	if (irq < 0)
+		goto out;
+	BUG_ON(__bind_irq_vector(irq, vector, domain));
+ out:
+	spin_unlock_irqrestore(&vector_lock, flags);
+	if (irq >= 0)
+		irq_init_desc(irq);
+	return irq;
+}
+
+void destroy_irq(unsigned int irq)
+{
+	irq_init_desc(irq);
+	clear_irq_vector(irq);
 }
 
 #ifdef CONFIG_SMP
 #	define IS_RESCHEDULE(vec)	(vec == IA64_IPI_RESCHEDULE)
+#	define IS_LOCAL_TLB_FLUSH(vec)	(vec == IA64_IPI_LOCAL_TLB_FLUSH)
 #else
 #	define IS_RESCHEDULE(vec)	(0)
+#	define IS_LOCAL_TLB_FLUSH(vec)	(0)
 #endif
 /*
  * That's where the IVT branches when we get an external
@@ -102,6 +440,7 @@ free_irq_vector (int vector)
 void
 ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
 {
+	struct pt_regs *old_regs = set_irq_regs(regs);
 	unsigned long saved_tpr;
 
 #if IRQ_DEBUG
@@ -119,13 +458,9 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
 		sp = ia64_getreg(_IA64_REG_SP);
 
 		if ((sp - bsp) < 1024) {
-			static unsigned char count;
-			static long last_time;
+			static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);
 
-			if (jiffies - last_time > 5*HZ)
-				count = 0;
-			if (++count < 5) {
-				last_time = jiffies;
+			if (__ratelimit(&ratelimit)) {
 				printk("ia64_handle_irq: DANGER: less than "
 				       "1KB of free stack space!!\n"
 				       "(bsp=0x%lx, sp=%lx)\n", bsp, sp);
@@ -143,11 +478,25 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
 	saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
 	ia64_srlz_d();
 	while (vector != IA64_SPURIOUS_INT_VECTOR) {
-		if (!IS_RESCHEDULE(vector)) {
+		int irq = local_vector_to_irq(vector);
+
+		if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
+			smp_local_flush_tlb();
+			kstat_incr_irq_this_cpu(irq);
+		} else if (unlikely(IS_RESCHEDULE(vector))) {
+			scheduler_ipi();
+			kstat_incr_irq_this_cpu(irq);
+		} else {
 			ia64_setreg(_IA64_REG_CR_TPR, vector);
 			ia64_srlz_d();
 
-			__do_IRQ(local_vector_to_irq(vector), regs);
+			if (unlikely(irq < 0)) {
+				printk(KERN_ERR "%s: Unexpected interrupt "
+				       "vector %d on CPU %d is not mapped "
+				       "to any IRQ!\n", __func__, vector,
+				       smp_processor_id());
+			} else
+				generic_handle_irq(irq);
 
 			/*
 			 * Disable interrupts and send EOI:
@@ -164,6 +513,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
 	 * come through until ia64_eoi() has been done.
 	 */
 	irq_exit();
+	set_irq_regs(old_regs);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -179,15 +529,24 @@ void ia64_process_pending_intr(void)
 
 	vector = ia64_get_ivr();
 
-	 irq_enter();
-	 saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
-	 ia64_srlz_d();
+	irq_enter();
+	saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
+	ia64_srlz_d();
 
 	 /*
 	  * Perform normal interrupt style processing
 	  */
 	while (vector != IA64_SPURIOUS_INT_VECTOR) {
-		if (!IS_RESCHEDULE(vector)) {
+		int irq = local_vector_to_irq(vector);
+
+		if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
+			smp_local_flush_tlb();
+			kstat_incr_irq_this_cpu(irq);
+		} else if (unlikely(IS_RESCHEDULE(vector))) {
+			kstat_incr_irq_this_cpu(irq);
+		} else {
+			struct pt_regs *old_regs = set_irq_regs(NULL);
+
 			ia64_setreg(_IA64_REG_CR_TPR, vector);
 			ia64_srlz_d();
 
@@ -197,8 +556,16 @@ void ia64_process_pending_intr(void)
 			 * it will work. I hope it works!.
 			 * Probably could shared code.
 			 */
-			vectors_in_migration[local_vector_to_irq(vector)]=0;
-			__do_IRQ(local_vector_to_irq(vector), NULL);
+			if (unlikely(irq < 0)) {
+				printk(KERN_ERR "%s: Unexpected interrupt "
+				       "vector %d on CPU %d not being mapped "
+				       "to any IRQ!!\n", __func__, vector,
+				       smp_processor_id());
+			} else {
+				vectors_in_migration[irq]=0;
+				generic_handle_irq(irq);
+			}
+			set_irq_regs(old_regs);
 
 			/*
 			 * Disable interrupts and send EOI
@@ -215,37 +582,69 @@ void ia64_process_pending_intr(void)
 
 
 #ifdef CONFIG_SMP
-extern irqreturn_t handle_IPI (int irq, void *dev_id, struct pt_regs *regs);
+
+static irqreturn_t dummy_handler (int irq, void *dev_id)
+{
+	BUG();
+}
 
 static struct irqaction ipi_irqaction = {
 	.handler =	handle_IPI,
-	.flags =	SA_INTERRUPT,
 	.name =		"IPI"
 };
+
+/*
+ * KVM uses this interrupt to force a cpu out of guest mode
+ */
+static struct irqaction resched_irqaction = {
+	.handler =	dummy_handler,
+	.name =		"resched"
+};
+
+static struct irqaction tlb_irqaction = {
+	.handler =	dummy_handler,
+	.name =		"tlb_flush"
+};
+
 #endif
 
 void
-register_percpu_irq (ia64_vector vec, struct irqaction *action)
+ia64_native_register_percpu_irq (ia64_vector vec, struct irqaction *action)
 {
-	irq_desc_t *desc;
 	unsigned int irq;
 
-	for (irq = 0; irq < NR_IRQS; ++irq)
-		if (irq_to_vector(irq) == vec) {
-			desc = irq_descp(irq);
-			desc->status |= IRQ_PER_CPU;
-			desc->handler = &irq_type_ia64_lsapic;
-			if (action)
-				setup_irq(irq, action);
-		}
+	irq = vec;
+	BUG_ON(bind_irq_vector(irq, vec, CPU_MASK_ALL));
+	irq_set_status_flags(irq, IRQ_PER_CPU);
+	irq_set_chip(irq, &irq_type_ia64_lsapic);
+	if (action)
+		setup_irq(irq, action);
+	irq_set_handler(irq, handle_percpu_irq);
+}
+
+void __init
+ia64_native_register_ipi(void)
+{
+#ifdef CONFIG_SMP
+	register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction);
+	register_percpu_irq(IA64_IPI_RESCHEDULE, &resched_irqaction);
+	register_percpu_irq(IA64_IPI_LOCAL_TLB_FLUSH, &tlb_irqaction);
+#endif
 }
 
 void __init
 init_IRQ (void)
 {
+#ifdef CONFIG_ACPI
+	acpi_boot_init();
+#endif
+	ia64_register_ipi();
 	register_percpu_irq(IA64_SPURIOUS_INT_VECTOR, NULL);
 #ifdef CONFIG_SMP
-	register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction);
+#if defined(CONFIG_IA64_GENERIC) || defined(CONFIG_IA64_DIG)
+	if (vector_domain_type != VECTOR_DOMAIN_NONE)
+		register_percpu_irq(IA64_IRQ_MOVE_VECTOR, &irq_move_irqaction);
+#endif
 #endif
 #ifdef CONFIG_PERFMON
 	pfm_init_percpu();
@@ -260,11 +659,7 @@ ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect)
 	unsigned long ipi_data;
 	unsigned long phys_cpu_id;
 
-#ifdef CONFIG_SMP
 	phys_cpu_id = cpu_physical_id(cpu);
-#else
-	phys_cpu_id = (ia64_getreg(_IA64_REG_CR_LID) >> 16) & 0xffff;
-#endif
 
 	/*
 	 * cpu number is in 8bit ID and 8bit EID
diff --git a/arch/ia64/kernel/irq_lsapic.c b/arch/ia64/kernel/irq_lsapic.c
index ea14e6a0440..1b3a776e516 100644
--- a/arch/ia64/kernel/irq_lsapic.c
+++ b/arch/ia64/kernel/irq_lsapic.c
@@ -15,23 +15,30 @@
 #include <linux/irq.h>
 
 static unsigned int
-lsapic_noop_startup (unsigned int irq)
+lsapic_noop_startup (struct irq_data *data)
 {
 	return 0;
 }
 
 static void
-lsapic_noop (unsigned int irq)
+lsapic_noop (struct irq_data *data)
 {
-	/* nuthing to do... */
+	/* nothing to do... */
 }
 
-struct hw_interrupt_type irq_type_ia64_lsapic = {
-	.typename =	"LSAPIC",
-	.startup =	lsapic_noop_startup,
-	.shutdown =	lsapic_noop,
-	.enable =	lsapic_noop,
-	.disable =	lsapic_noop,
-	.ack =		lsapic_noop,
-	.end =		lsapic_noop
+static int lsapic_retrigger(struct irq_data *data)
+{
+	ia64_resend_irq(data->irq);
+
+	return 1;
+}
+
+struct irq_chip irq_type_ia64_lsapic = {
+	.name =			"LSAPIC",
+	.irq_startup =		lsapic_noop_startup,
+	.irq_shutdown =		lsapic_noop,
+	.irq_enable =		lsapic_noop,
+	.irq_disable =		lsapic_noop,
+	.irq_ack =		lsapic_noop,
+	.irq_retrigger =	lsapic_retrigger,
 };
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
index c13ca0d49c4..18e794a5724 100644
--- a/arch/ia64/kernel/ivt.S
+++ b/arch/ia64/kernel/ivt.S
@@ -12,6 +12,14 @@
  *
  * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> TLB handling for SMP
  * 00/12/20 David Mosberger-Tang <davidm@hpl.hp.com> DTLB/ITLB handler now uses virtual PT.
+ *
+ * Copyright (C) 2005 Hewlett-Packard Co
+ *	Dan Magenheimer <dan.magenheimer@hp.com>
+ *      Xen paravirtualization
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *                    pv_ops.
+ *      Yaozu (Eddie) Dong <eddie.dong@intel.com>
  */
 /*
  * This file defines the interruption vector table used by the CPU.
@@ -38,22 +46,19 @@
  * Table is based upon EAS2.6 (Oct 1999)
  */
 
-#include <linux/config.h>
 
 #include <asm/asmmacro.h>
 #include <asm/break.h>
-#include <asm/ia32.h>
 #include <asm/kregs.h>
 #include <asm/asm-offsets.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/ptrace.h>
-#include <asm/system.h>
 #include <asm/thread_info.h>
 #include <asm/unistd.h>
 #include <asm/errno.h>
 
-#if 1
+#if 0
 # define PSR_DEFAULT_BITS	psr.ac
 #else
 # define PSR_DEFAULT_BITS	0
@@ -76,7 +81,7 @@
 	mov r19=n;;			/* prepare to save predicates */		\
 	br.sptk.many dispatch_to_fault_handler
 
-	.section .text.ivt,"ax"
+	.section .text..ivt,"ax"
 
 	.align 32768	// align on 32KB boundary
 	.global ia64_ivt
@@ -91,30 +96,31 @@ ENTRY(vhpt_miss)
 	 * (the "original") TLB miss, which may either be caused by an instruction
 	 * fetch or a data access (or non-access).
 	 *
-	 * What we do here is normal TLB miss handing for the _original_ miss, followed
-	 * by inserting the TLB entry for the virtual page table page that the VHPT
-	 * walker was attempting to access.  The latter gets inserted as long
-	 * as both L1 and L2 have valid mappings for the faulting address.
-	 * The TLB entry for the original miss gets inserted only if
-	 * the L3 entry indicates that the page is present.
+	 * What we do here is normal TLB miss handing for the _original_ miss,
+	 * followed by inserting the TLB entry for the virtual page table page
+	 * that the VHPT walker was attempting to access.  The latter gets
+	 * inserted as long as page table entry above pte level have valid
+	 * mappings for the faulting address.  The TLB entry for the original
+	 * miss gets inserted only if the pte entry indicates that the page is
+	 * present.
 	 *
 	 * do_page_fault gets invoked in the following cases:
 	 *	- the faulting virtual address uses unimplemented address bits
-	 *	- the faulting virtual address has no L1, L2, or L3 mapping
+	 *	- the faulting virtual address has no valid page table mapping
 	 */
-	mov r16=cr.ifa				// get address that caused the TLB miss
+	MOV_FROM_IFA(r16)			// get address that caused the TLB miss
 #ifdef CONFIG_HUGETLB_PAGE
 	movl r18=PAGE_SHIFT
-	mov r25=cr.itir
+	MOV_FROM_ITIR(r25)
 #endif
 	;;
-	rsm psr.dt				// use physical addressing for data
+	RSM_PSR_DT				// use physical addressing for data
 	mov r31=pr				// save the predicate registers
 	mov r19=IA64_KR(PT_BASE)		// get page table base address
 	shl r21=r16,3				// shift bit 60 into sign bit
 	shr.u r17=r16,61			// get the region number into r17
 	;;
-	shr r22=r21,3
+	shr.u r22=r21,3
 #ifdef CONFIG_HUGETLB_PAGE
 	extr.u r26=r25,2,6
 	;;
@@ -126,7 +132,7 @@ ENTRY(vhpt_miss)
 #endif
 	;;
 	cmp.eq p6,p7=5,r17			// is IFA pointing into to region 5?
-	shr.u r18=r22,PGDIR_SHIFT		// get bits 33-63 of the faulting address
+	shr.u r18=r22,PGDIR_SHIFT		// get bottom portion of pgd index bit
 	;;
 (p7)	dep r17=r17,r19,(PAGE_SHIFT-3),3	// put region number bits in place
 
@@ -137,38 +143,52 @@ ENTRY(vhpt_miss)
 (p6)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
 (p7)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
 	;;
-(p6)	dep r17=r18,r19,3,(PAGE_SHIFT-3)	// r17=PTA + IFA(33,42)*8
-(p7)	dep r17=r18,r17,3,(PAGE_SHIFT-6)	// r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
+(p6)	dep r17=r18,r19,3,(PAGE_SHIFT-3)	// r17=pgd_offset for region 5
+(p7)	dep r17=r18,r17,3,(PAGE_SHIFT-6)	// r17=pgd_offset for region[0-4]
 	cmp.eq p7,p6=0,r21			// unused address bits all zeroes?
-	shr.u r18=r22,PMD_SHIFT			// shift L2 index into position
+#ifdef CONFIG_PGTABLE_4
+	shr.u r28=r22,PUD_SHIFT			// shift pud index into position
+#else
+	shr.u r18=r22,PMD_SHIFT			// shift pmd index into position
+#endif
 	;;
-	ld8 r17=[r17]				// fetch the L1 entry (may be 0)
+	ld8 r17=[r17]				// get *pgd (may be 0)
 	;;
-(p7)	cmp.eq p6,p7=r17,r0			// was L1 entry NULL?
-	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// compute address of L2 page table entry
+(p7)	cmp.eq p6,p7=r17,r0			// was pgd_present(*pgd) == NULL?
+#ifdef CONFIG_PGTABLE_4
+	dep r28=r28,r17,3,(PAGE_SHIFT-3)	// r28=pud_offset(pgd,addr)
 	;;
-(p7)	ld8 r20=[r17]				// fetch the L2 entry (may be 0)
-	shr.u r19=r22,PAGE_SHIFT		// shift L3 index into position
+	shr.u r18=r22,PMD_SHIFT			// shift pmd index into position
+(p7)	ld8 r29=[r28]				// get *pud (may be 0)
 	;;
-(p7)	cmp.eq.or.andcm p6,p7=r20,r0		// was L2 entry NULL?
-	dep r21=r19,r20,3,(PAGE_SHIFT-3)	// compute address of L3 page table entry
+(p7)	cmp.eq.or.andcm p6,p7=r29,r0		// was pud_present(*pud) == NULL?
+	dep r17=r18,r29,3,(PAGE_SHIFT-3)	// r17=pmd_offset(pud,addr)
+#else
+	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// r17=pmd_offset(pgd,addr)
+#endif
 	;;
-(p7)	ld8 r18=[r21]				// read the L3 PTE
-	mov r19=cr.isr				// cr.isr bit 0 tells us if this is an insn miss
+(p7)	ld8 r20=[r17]				// get *pmd (may be 0)
+	shr.u r19=r22,PAGE_SHIFT		// shift pte index into position
+	;;
+(p7)	cmp.eq.or.andcm p6,p7=r20,r0		// was pmd_present(*pmd) == NULL?
+	dep r21=r19,r20,3,(PAGE_SHIFT-3)	// r21=pte_offset(pmd,addr)
+	;;
+(p7)	ld8 r18=[r21]				// read *pte
+	MOV_FROM_ISR(r19)			// cr.isr bit 32 tells us if this is an insn miss
 	;;
 (p7)	tbit.z p6,p7=r18,_PAGE_P_BIT		// page present bit cleared?
-	mov r22=cr.iha				// get the VHPT address that caused the TLB miss
+	MOV_FROM_IHA(r22)			// get the VHPT address that caused the TLB miss
 	;;					// avoid RAW on p7
 (p7)	tbit.nz.unc p10,p11=r19,32		// is it an instruction TLB miss?
 	dep r23=0,r20,0,PAGE_SHIFT		// clear low bits to get page address
 	;;
-(p10)	itc.i r18				// insert the instruction TLB entry
-(p11)	itc.d r18				// insert the data TLB entry
+	ITC_I_AND_D(p10, p11, r18, r24)		// insert the instruction TLB entry and
+						// insert the data TLB entry
 (p6)	br.cond.spnt.many page_fault		// handle bad address/page not present (page fault)
-	mov cr.ifa=r22
+	MOV_TO_IFA(r22, r24)
 
 #ifdef CONFIG_HUGETLB_PAGE
-(p8)	mov cr.itir=r25				// change to default page-size for VHPT
+	MOV_TO_ITIR(p8, r25, r24)		// change to default page-size for VHPT
 #endif
 
 	/*
@@ -178,7 +198,7 @@ ENTRY(vhpt_miss)
 	 */
 	adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23
 	;;
-(p7)	itc.d r24
+	ITC_D(p7, r24, r25)
 	;;
 #ifdef CONFIG_SMP
 	/*
@@ -188,24 +208,39 @@ ENTRY(vhpt_miss)
 	dv_serialize_data
 
 	/*
-	 * Re-check L2 and L3 pagetable.  If they changed, we may have received a ptc.g
+	 * Re-check pagetable entry.  If they changed, we may have received a ptc.g
 	 * between reading the pagetable and the "itc".  If so, flush the entry we
-	 * inserted and retry.
+	 * inserted and retry.  At this point, we have:
+	 *
+	 * r28 = equivalent of pud_offset(pgd, ifa)
+	 * r17 = equivalent of pmd_offset(pud, ifa)
+	 * r21 = equivalent of pte_offset(pmd, ifa)
+	 *
+	 * r29 = *pud
+	 * r20 = *pmd
+	 * r18 = *pte
 	 */
-	ld8 r25=[r21]				// read L3 PTE again
-	ld8 r26=[r17]				// read L2 entry again
+	ld8 r25=[r21]				// read *pte again
+	ld8 r26=[r17]				// read *pmd again
+#ifdef CONFIG_PGTABLE_4
+	ld8 r19=[r28]				// read *pud again
+#endif
+	cmp.ne p6,p7=r0,r0
 	;;
-	cmp.ne p6,p7=r26,r20			// did L2 entry change
+	cmp.ne.or.andcm p6,p7=r26,r20		// did *pmd change
+#ifdef CONFIG_PGTABLE_4
+	cmp.ne.or.andcm p6,p7=r19,r29		// did *pud change
+#endif
 	mov r27=PAGE_SHIFT<<2
 	;;
 (p6)	ptc.l r22,r27				// purge PTE page translation
-(p7)	cmp.ne.or.andcm p6,p7=r25,r18		// did L3 PTE change
+(p7)	cmp.ne.or.andcm p6,p7=r25,r18		// did *pte change
 	;;
 (p6)	ptc.l r16,r27				// purge translation
 #endif
 
 	mov pr=r31,-1				// restore predicate registers
-	rfi
+	RFI
 END(vhpt_miss)
 
 	.org ia64_ivt+0x400
@@ -214,25 +249,25 @@ END(vhpt_miss)
 ENTRY(itlb_miss)
 	DBG_FAULT(1)
 	/*
-	 * The ITLB handler accesses the L3 PTE via the virtually mapped linear
+	 * The ITLB handler accesses the PTE via the virtually mapped linear
 	 * page table.  If a nested TLB miss occurs, we switch into physical
-	 * mode, walk the page table, and then re-execute the L3 PTE read
-	 * and go on normally after that.
+	 * mode, walk the page table, and then re-execute the PTE read and
+	 * go on normally after that.
 	 */
-	mov r16=cr.ifa				// get virtual address
+	MOV_FROM_IFA(r16)			// get virtual address
 	mov r29=b0				// save b0
 	mov r31=pr				// save predicates
 .itlb_fault:
-	mov r17=cr.iha				// get virtual address of L3 PTE
+	MOV_FROM_IHA(r17)			// get virtual address of PTE
 	movl r30=1f				// load nested fault continuation point
 	;;
-1:	ld8 r18=[r17]				// read L3 PTE
+1:	ld8 r18=[r17]				// read *pte
 	;;
 	mov b0=r29
 	tbit.z p6,p0=r18,_PAGE_P_BIT		// page present bit cleared?
 (p6)	br.cond.spnt page_fault
 	;;
-	itc.i r18
+	ITC_I(p0, r18, r19)
 	;;
 #ifdef CONFIG_SMP
 	/*
@@ -241,7 +276,7 @@ ENTRY(itlb_miss)
 	 */
 	dv_serialize_data
 
-	ld8 r19=[r17]				// read L3 PTE again and see if same
+	ld8 r19=[r17]				// read *pte again and see if same
 	mov r20=PAGE_SHIFT<<2			// setup page size for purge
 	;;
 	cmp.ne p7,p0=r18,r19
@@ -249,7 +284,7 @@ ENTRY(itlb_miss)
 (p7)	ptc.l r16,r20
 #endif
 	mov pr=r31,-1
-	rfi
+	RFI
 END(itlb_miss)
 
 	.org ia64_ivt+0x0800
@@ -258,25 +293,25 @@ END(itlb_miss)
 ENTRY(dtlb_miss)
 	DBG_FAULT(2)
 	/*
-	 * The DTLB handler accesses the L3 PTE via the virtually mapped linear
+	 * The DTLB handler accesses the PTE via the virtually mapped linear
 	 * page table.  If a nested TLB miss occurs, we switch into physical
-	 * mode, walk the page table, and then re-execute the L3 PTE read
-	 * and go on normally after that.
+	 * mode, walk the page table, and then re-execute the PTE read and
+	 * go on normally after that.
 	 */
-	mov r16=cr.ifa				// get virtual address
+	MOV_FROM_IFA(r16)			// get virtual address
 	mov r29=b0				// save b0
 	mov r31=pr				// save predicates
 dtlb_fault:
-	mov r17=cr.iha				// get virtual address of L3 PTE
+	MOV_FROM_IHA(r17)			// get virtual address of PTE
 	movl r30=1f				// load nested fault continuation point
 	;;
-1:	ld8 r18=[r17]				// read L3 PTE
+1:	ld8 r18=[r17]				// read *pte
 	;;
 	mov b0=r29
 	tbit.z p6,p0=r18,_PAGE_P_BIT		// page present bit cleared?
 (p6)	br.cond.spnt page_fault
 	;;
-	itc.d r18
+	ITC_D(p0, r18, r19)
 	;;
 #ifdef CONFIG_SMP
 	/*
@@ -285,7 +320,7 @@ dtlb_fault:
 	 */
 	dv_serialize_data
 
-	ld8 r19=[r17]				// read L3 PTE again and see if same
+	ld8 r19=[r17]				// read *pte again and see if same
 	mov r20=PAGE_SHIFT<<2			// setup page size for purge
 	;;
 	cmp.ne p7,p0=r18,r19
@@ -293,7 +328,7 @@ dtlb_fault:
 (p7)	ptc.l r16,r20
 #endif
 	mov pr=r31,-1
-	rfi
+	RFI
 END(dtlb_miss)
 
 	.org ia64_ivt+0x0c00
@@ -301,9 +336,9 @@ END(dtlb_miss)
 // 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
 ENTRY(alt_itlb_miss)
 	DBG_FAULT(3)
-	mov r16=cr.ifa		// get address that caused the TLB miss
+	MOV_FROM_IFA(r16)	// get address that caused the TLB miss
 	movl r17=PAGE_KERNEL
-	mov r21=cr.ipsr
+	MOV_FROM_IPSR(p0, r21)
 	movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
 	mov r31=pr
 	;;
@@ -312,9 +347,9 @@ ENTRY(alt_itlb_miss)
 	;;
 	cmp.gt p8,p0=6,r22			// user mode
 	;;
-(p8)	thash r17=r16
+	THASH(p8, r17, r16, r23)
 	;;
-(p8)	mov cr.iha=r17
+	MOV_TO_IHA(p8, r17, r23)
 (p8)	mov r29=b0				// save b0
 (p8)	br.cond.dptk .itlb_fault
 #endif
@@ -329,9 +364,9 @@ ENTRY(alt_itlb_miss)
 	or r19=r19,r18		// set bit 4 (uncached) if the access was to region 6
 (p8)	br.cond.spnt page_fault
 	;;
-	itc.i r19		// insert the TLB entry
+	ITC_I(p0, r19, r18)	// insert the TLB entry
 	mov pr=r31,-1
-	rfi
+	RFI
 END(alt_itlb_miss)
 
 	.org ia64_ivt+0x1000
@@ -339,45 +374,54 @@ END(alt_itlb_miss)
 // 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
 ENTRY(alt_dtlb_miss)
 	DBG_FAULT(4)
-	mov r16=cr.ifa		// get address that caused the TLB miss
+	MOV_FROM_IFA(r16)	// get address that caused the TLB miss
 	movl r17=PAGE_KERNEL
-	mov r20=cr.isr
+	MOV_FROM_ISR(r20)
 	movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
-	mov r21=cr.ipsr
+	MOV_FROM_IPSR(p0, r21)
 	mov r31=pr
+	mov r24=PERCPU_ADDR
 	;;
 #ifdef CONFIG_DISABLE_VHPT
 	shr.u r22=r16,61			// get the region number into r21
 	;;
 	cmp.gt p8,p0=6,r22			// access to region 0-5
 	;;
-(p8)	thash r17=r16
+	THASH(p8, r17, r16, r25)
 	;;
-(p8)	mov cr.iha=r17
+	MOV_TO_IHA(p8, r17, r25)
 (p8)	mov r29=b0				// save b0
 (p8)	br.cond.dptk dtlb_fault
 #endif
+	cmp.ge p10,p11=r16,r24			// access to per_cpu_data?
+	tbit.z p12,p0=r16,61			// access to region 6?
+	mov r25=PERCPU_PAGE_SHIFT << 2
+	mov r26=PERCPU_PAGE_SIZE
+	nop.m 0
+	nop.b 0
+	;;
+(p10)	mov r19=IA64_KR(PER_CPU_DATA)
+(p11)	and r19=r19,r16				// clear non-ppn fields
 	extr.u r23=r21,IA64_PSR_CPL0_BIT,2	// extract psr.cpl
 	and r22=IA64_ISR_CODE_MASK,r20		// get the isr.code field
 	tbit.nz p6,p7=r20,IA64_ISR_SP_BIT	// is speculation bit on?
-	shr.u r18=r16,57			// move address bit 61 to bit 4
-	and r19=r19,r16				// clear ed, reserved bits, and PTE control bits
 	tbit.nz p9,p0=r20,IA64_ISR_NA_BIT	// is non-access bit on?
 	;;
-	andcm r18=0x10,r18	// bit 4=~address-bit(61)
+(p10)	sub r19=r19,r26
+	MOV_TO_ITIR(p10, r25, r24)
 	cmp.ne p8,p0=r0,r23
 (p9)	cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22	// check isr.code field
+(p12)	dep r17=-1,r17,4,1			// set ma=UC for region 6 addr
 (p8)	br.cond.spnt page_fault
 
 	dep r21=-1,r21,IA64_PSR_ED_BIT,1
-	or r19=r19,r17		// insert PTE control bits into r19
 	;;
-	or r19=r19,r18		// set bit 4 (uncached) if the access was to region 6
-(p6)	mov cr.ipsr=r21
+	or r19=r19,r17		// insert PTE control bits into r19
+	MOV_TO_IPSR(p6, r21, r24)
 	;;
-(p7)	itc.d r19		// insert the TLB entry
+	ITC_D(p7, r19, r18)	// insert the TLB entry
 	mov pr=r31,-1
-	rfi
+	RFI
 END(alt_dtlb_miss)
 
 	.org ia64_ivt+0x1400
@@ -399,17 +443,17 @@ ENTRY(nested_dtlb_miss)
 	 *		r30:	continuation address
 	 *		r31:	saved pr
 	 *
-	 * Output:	r17:	physical address of L3 PTE of faulting address
+	 * Output:	r17:	physical address of PTE of faulting address
 	 *		r29:	saved b0
 	 *		r30:	continuation address
 	 *		r31:	saved pr
 	 *
 	 * Clobbered:	b0, r18, r19, r21, r22, psr.dt (cleared)
 	 */
-	rsm psr.dt				// switch to using physical data addressing
+	RSM_PSR_DT				// switch to using physical data addressing
 	mov r19=IA64_KR(PT_BASE)		// get the page table base address
 	shl r21=r16,3				// shift bit 60 into sign bit
-	mov r18=cr.itir
+	MOV_FROM_ITIR(r18)
 	;;
 	shr.u r17=r16,61			// get the region number into r17
 	extr.u r18=r18,2,6			// get the faulting page size
@@ -429,21 +473,33 @@ ENTRY(nested_dtlb_miss)
 (p6)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
 (p7)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
 	;;
-(p6)	dep r17=r18,r19,3,(PAGE_SHIFT-3)	// r17=PTA + IFA(33,42)*8
-(p7)	dep r17=r18,r17,3,(PAGE_SHIFT-6)	// r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
+(p6)	dep r17=r18,r19,3,(PAGE_SHIFT-3)	// r17=pgd_offset for region 5
+(p7)	dep r17=r18,r17,3,(PAGE_SHIFT-6)	// r17=pgd_offset for region[0-4]
 	cmp.eq p7,p6=0,r21			// unused address bits all zeroes?
-	shr.u r18=r22,PMD_SHIFT			// shift L2 index into position
+#ifdef CONFIG_PGTABLE_4
+	shr.u r18=r22,PUD_SHIFT			// shift pud index into position
+#else
+	shr.u r18=r22,PMD_SHIFT			// shift pmd index into position
+#endif
+	;;
+	ld8 r17=[r17]				// get *pgd (may be 0)
 	;;
-	ld8 r17=[r17]				// fetch the L1 entry (may be 0)
+(p7)	cmp.eq p6,p7=r17,r0			// was pgd_present(*pgd) == NULL?
+	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// r17=p[u|m]d_offset(pgd,addr)
 	;;
-(p7)	cmp.eq p6,p7=r17,r0			// was L1 entry NULL?
-	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// compute address of L2 page table entry
+#ifdef CONFIG_PGTABLE_4
+(p7)	ld8 r17=[r17]				// get *pud (may be 0)
+	shr.u r18=r22,PMD_SHIFT			// shift pmd index into position
 	;;
-(p7)	ld8 r17=[r17]				// fetch the L2 entry (may be 0)
-	shr.u r19=r22,PAGE_SHIFT		// shift L3 index into position
+(p7)	cmp.eq.or.andcm p6,p7=r17,r0		// was pud_present(*pud) == NULL?
+	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// r17=pmd_offset(pud,addr)
 	;;
-(p7)	cmp.eq.or.andcm p6,p7=r17,r0		// was L2 entry NULL?
-	dep r17=r19,r17,3,(PAGE_SHIFT-3)	// compute address of L3 page table entry
+#endif
+(p7)	ld8 r17=[r17]				// get *pmd (may be 0)
+	shr.u r19=r22,PAGE_SHIFT		// shift pte index into position
+	;;
+(p7)	cmp.eq.or.andcm p6,p7=r17,r0		// was pmd_present(*pmd) == NULL?
+	dep r17=r19,r17,3,(PAGE_SHIFT-3)	// r17=pte_offset(pmd,addr);
 (p6)	br.cond.spnt page_fault
 	mov b0=r30
 	br.sptk.many b0				// return to continuation point
@@ -457,33 +513,6 @@ ENTRY(ikey_miss)
 	FAULT(6)
 END(ikey_miss)
 
-	//-----------------------------------------------------------------------------------
-	// call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
-ENTRY(page_fault)
-	ssm psr.dt
-	;;
-	srlz.i
-	;;
-	SAVE_MIN_WITH_COVER
-	alloc r15=ar.pfs,0,0,3,0
-	mov out0=cr.ifa
-	mov out1=cr.isr
-	adds r3=8,r2				// set up second base pointer
-	;;
-	ssm psr.ic | PSR_DEFAULT_BITS
-	;;
-	srlz.i					// guarantee that interruption collectin is on
-	;;
-(p15)	ssm psr.i				// restore psr.i
-	movl r14=ia64_leave_kernel
-	;;
-	SAVE_REST
-	mov rp=r14
-	;;
-	adds out2=16,r12			// out2 = pointer to pt_regs
-	br.call.sptk.many b6=ia64_do_page_fault	// ignore return address
-END(page_fault)
-
 	.org ia64_ivt+0x1c00
 /////////////////////////////////////////////////////////////////////////////////////////
 // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
@@ -506,10 +535,10 @@ ENTRY(dirty_bit)
 	 * page table TLB entry isn't present, we take a nested TLB miss hit where we look
 	 * up the physical address of the L3 PTE and then continue at label 1 below.
 	 */
-	mov r16=cr.ifa				// get the address that caused the fault
+	MOV_FROM_IFA(r16)			// get the address that caused the fault
 	movl r30=1f				// load continuation point in case of nested fault
 	;;
-	thash r17=r16				// compute virtual address of L3 PTE
+	THASH(p0, r17, r16, r18)		// compute virtual address of L3 PTE
 	mov r29=b0				// save b0 in case of nested fault
 	mov r31=pr				// save pr
 #ifdef CONFIG_SMP
@@ -519,13 +548,14 @@ ENTRY(dirty_bit)
 	;;					// avoid RAW on r18
 	mov ar.ccv=r18				// set compare value for cmpxchg
 	or r25=_PAGE_D|_PAGE_A,r18		// set the dirty and accessed bits
+	tbit.z p7,p6 = r18,_PAGE_P_BIT		// Check present bit
 	;;
-	cmpxchg8.acq r26=[r17],r25,ar.ccv
+(p6)	cmpxchg8.acq r26=[r17],r25,ar.ccv	// Only update if page is present
 	mov r24=PAGE_SHIFT<<2
 	;;
-	cmp.eq p6,p7=r26,r18
+(p6)	cmp.eq p6,p7=r26,r18			// Only compare if page is present
 	;;
-(p6)	itc.d r25				// install updated PTE
+	ITC_D(p6, r25, r18)			// install updated PTE
 	;;
 	/*
 	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
@@ -548,10 +578,10 @@ ENTRY(dirty_bit)
 	mov b0=r29				// restore b0
 	;;
 	st8 [r17]=r18				// store back updated PTE
-	itc.d r18				// install updated PTE
+	ITC_D(p0, r18, r16)			// install updated PTE
 #endif
 	mov pr=r31,-1				// restore pr
-	rfi
+	RFI
 END(dirty_bit)
 
 	.org ia64_ivt+0x2400
@@ -560,22 +590,22 @@ END(dirty_bit)
 ENTRY(iaccess_bit)
 	DBG_FAULT(9)
 	// Like Entry 8, except for instruction access
-	mov r16=cr.ifa				// get the address that caused the fault
+	MOV_FROM_IFA(r16)			// get the address that caused the fault
 	movl r30=1f				// load continuation point in case of nested fault
 	mov r31=pr				// save predicates
 #ifdef CONFIG_ITANIUM
 	/*
 	 * Erratum 10 (IFA may contain incorrect address) has "NoFix" status.
 	 */
-	mov r17=cr.ipsr
+	MOV_FROM_IPSR(p0, r17)
 	;;
-	mov r18=cr.iip
+	MOV_FROM_IIP(r18)
 	tbit.z p6,p0=r17,IA64_PSR_IS_BIT	// IA64 instruction set?
 	;;
 (p6)	mov r16=r18				// if so, use cr.iip instead of cr.ifa
 #endif /* CONFIG_ITANIUM */
 	;;
-	thash r17=r16				// compute virtual address of L3 PTE
+	THASH(p0, r17, r16, r18)		// compute virtual address of L3 PTE
 	mov r29=b0				// save b0 in case of nested fault)
 #ifdef CONFIG_SMP
 	mov r28=ar.ccv				// save ar.ccv
@@ -584,13 +614,14 @@ ENTRY(iaccess_bit)
 	;;
 	mov ar.ccv=r18				// set compare value for cmpxchg
 	or r25=_PAGE_A,r18			// set the accessed bit
+	tbit.z p7,p6 = r18,_PAGE_P_BIT	 	// Check present bit
 	;;
-	cmpxchg8.acq r26=[r17],r25,ar.ccv
+(p6)	cmpxchg8.acq r26=[r17],r25,ar.ccv	// Only if page present
 	mov r24=PAGE_SHIFT<<2
 	;;
-	cmp.eq p6,p7=r26,r18
+(p6)	cmp.eq p6,p7=r26,r18			// Only if page present
 	;;
-(p6)	itc.i r25				// install updated PTE
+	ITC_I(p6, r25, r26)			// install updated PTE
 	;;
 	/*
 	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
@@ -613,10 +644,10 @@ ENTRY(iaccess_bit)
 	mov b0=r29				// restore b0
 	;;
 	st8 [r17]=r18				// store back updated PTE
-	itc.i r18				// install updated PTE
+	ITC_I(p0, r18, r16)			// install updated PTE
 #endif /* !CONFIG_SMP */
 	mov pr=r31,-1
-	rfi
+	RFI
 END(iaccess_bit)
 
 	.org ia64_ivt+0x2800
@@ -625,10 +656,10 @@ END(iaccess_bit)
 ENTRY(daccess_bit)
 	DBG_FAULT(10)
 	// Like Entry 8, except for data access
-	mov r16=cr.ifa				// get the address that caused the fault
+	MOV_FROM_IFA(r16)			// get the address that caused the fault
 	movl r30=1f				// load continuation point in case of nested fault
 	;;
-	thash r17=r16				// compute virtual address of L3 PTE
+	THASH(p0, r17, r16, r18)		// compute virtual address of L3 PTE
 	mov r31=pr
 	mov r29=b0				// save b0 in case of nested fault)
 #ifdef CONFIG_SMP
@@ -638,13 +669,14 @@ ENTRY(daccess_bit)
 	;;					// avoid RAW on r18
 	mov ar.ccv=r18				// set compare value for cmpxchg
 	or r25=_PAGE_A,r18			// set the dirty bit
+	tbit.z p7,p6 = r18,_PAGE_P_BIT		// Check present bit
 	;;
-	cmpxchg8.acq r26=[r17],r25,ar.ccv
+(p6)	cmpxchg8.acq r26=[r17],r25,ar.ccv	// Only if page is present
 	mov r24=PAGE_SHIFT<<2
 	;;
-	cmp.eq p6,p7=r26,r18
+(p6)	cmp.eq p6,p7=r26,r18			// Only if page is present
 	;;
-(p6)	itc.d r25				// install updated PTE
+	ITC_D(p6, r25, r26)			// install updated PTE
 	/*
 	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
 	 * cannot possibly affect the following loads:
@@ -664,11 +696,11 @@ ENTRY(daccess_bit)
 	or r18=_PAGE_A,r18			// set the accessed bit
 	;;
 	st8 [r17]=r18				// store back updated PTE
-	itc.d r18				// install updated PTE
+	ITC_D(p0, r18, r16)			// install updated PTE
 #endif
 	mov b0=r29				// restore b0
 	mov pr=r31,-1
-	rfi
+	RFI
 END(daccess_bit)
 
 	.org ia64_ivt+0x2c00
@@ -692,10 +724,10 @@ ENTRY(break_fault)
 	 */
 	DBG_FAULT(11)
 	mov.m r16=IA64_KR(CURRENT)		// M2 r16 <- current task (12 cyc)
-	mov r29=cr.ipsr				// M2 (12 cyc)
+	MOV_FROM_IPSR(p0, r29)			// M2 (12 cyc)
 	mov r31=pr				// I0 (2 cyc)
 
-	mov r17=cr.iim				// M2 (2 cyc)
+	MOV_FROM_IIM(r17)			// M2 (2 cyc)
 	mov.m r27=ar.rsc			// M2 (12 cyc)
 	mov r18=__IA64_BREAK_SYSCALL		// A
 
@@ -714,7 +746,7 @@ ENTRY(break_fault)
 	nop.m 0
 	movl r30=sys_call_table			// X
 
-	mov r28=cr.iip				// M2 (2 cyc)
+	MOV_FROM_IIP(r28)			// M2 (2 cyc)
 	cmp.eq p0,p7=r18,r17			// I0 is this a system call?
 (p7)	br.cond.spnt non_syscall		// B  no ->
 	//
@@ -752,8 +784,13 @@ ENTRY(break_fault)
 
 (p8)	adds r28=16,r28				// A    switch cr.iip to next bundle
 (p9)	adds r8=1,r8				// A    increment ei to next slot
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	;;
+	mov b6=r30				// I0   setup syscall handler branch reg early
+#else
 	nop.i 0
 	;;
+#endif
 
 	mov.m r25=ar.unat			// M2 (5 cyc)
 	dep r29=r8,r29,41,2			// I0   insert new ei into cr.ipsr
@@ -764,7 +801,11 @@ ENTRY(break_fault)
 	//
 ///////////////////////////////////////////////////////////////////////
 	st1 [r16]=r0				// M2|3 clear current->thread.on_ustack flag
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	MOV_FROM_ITC(p0, p14, r30, r18)		// M    get cycle for accounting
+#else
 	mov b6=r30				// I0   setup syscall handler branch reg early
+#endif
 	cmp.eq pKStk,pUStk=r0,r17		// A    were we on kernel stacks already?
 
 	and r9=_TIF_SYSCALL_TRACEAUDIT,r9	// A    mask trace or audit
@@ -776,20 +817,43 @@ ENTRY(break_fault)
 	cmp.eq p14,p0=r9,r0			// A    are syscalls being traced/audited?
 	br.call.sptk.many b7=ia64_syscall_setup	// B
 1:
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	// mov.m r30=ar.itc is called in advance, and r13 is current
+	add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13	// A
+	add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13	// A
+(pKStk)	br.cond.spnt .skip_accounting		// B	unlikely skip
+	;;
+	ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP	// M  get last stamp
+	ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE	// M  time at leave
+	;;
+	ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME	// M  cumulated stime
+	ld8 r21=[r17]				// M  cumulated utime
+	sub r22=r19,r18				// A  stime before leave
+	;;
+	st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP	// M  update stamp
+	sub r18=r30,r19				// A  elapsed time in user
+	;;
+	add r20=r20,r22				// A  sum stime
+	add r21=r21,r18				// A  sum utime
+	;;
+	st8 [r16]=r20				// M  update stime
+	st8 [r17]=r21				// M  update utime
+	;;
+.skip_accounting:
+#endif
 	mov ar.rsc=0x3				// M2   set eager mode, pl 0, LE, loadrs=0
 	nop 0
-	bsw.1					// B (6 cyc) regs are saved, switch to bank 1
+	BSW_1(r2, r14)				// B (6 cyc) regs are saved, switch to bank 1
 	;;
 
-	ssm psr.ic | PSR_DEFAULT_BITS		// M2	now it's safe to re-enable intr.-collection
+	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r16)	// M2	now it's safe to re-enable intr.-collection
+						// M0   ensure interruption collection is on
 	movl r3=ia64_ret_from_syscall		// X
 	;;
-
-	srlz.i					// M0   ensure interruption collection is on
 	mov rp=r3				// I0   set the real return addr
 (p10)	br.cond.spnt.many ia64_ret_from_syscall	// B    return if bad call-frame or r15 is a NaT
 
-(p15)	ssm psr.i				// M2   restore psr.i
+	SSM_PSR_I(p15, p15, r16)		// M2   restore psr.i
 (p14)	br.call.sptk.many b6=b6			// B    invoke syscall-handker (ignore return addr)
 	br.cond.spnt.many ia64_trace_syscall	// B	do syscall-tracing thingamagic
 	// NOT REACHED
@@ -809,26 +873,8 @@ END(break_fault)
 /////////////////////////////////////////////////////////////////////////////////////////
 // 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
 ENTRY(interrupt)
-	DBG_FAULT(12)
-	mov r31=pr		// prepare to save predicates
-	;;
-	SAVE_MIN_WITH_COVER	// uses r31; defines r2 and r3
-	ssm psr.ic | PSR_DEFAULT_BITS
-	;;
-	adds r3=8,r2		// set up second base pointer for SAVE_REST
-	srlz.i			// ensure everybody knows psr.ic is back on
-	;;
-	SAVE_REST
-	;;
-	alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
-	mov out0=cr.ivr		// pass cr.ivr as first arg
-	add out1=16,sp		// pass pointer to pt_regs as second arg
-	;;
-	srlz.d			// make sure we see the effect of cr.ivr
-	movl r14=ia64_leave_kernel
-	;;
-	mov rp=r14
-	br.call.sptk.many b6=ia64_handle_irq
+	/* interrupt handler has become too big to fit this area. */
+	br.sptk.many __interrupt
 END(interrupt)
 
 	.org ia64_ivt+0x3400
@@ -874,6 +920,7 @@ END(interrupt)
 	 *	- r27: saved ar.rsc
 	 *	- r28: saved cr.iip
 	 *	- r29: saved cr.ipsr
+	 *	- r30: ar.itc for accounting (don't touch)
 	 *	- r31: saved pr
 	 *	-  b0: original contents (to be saved)
 	 * On exit:
@@ -890,6 +937,7 @@ END(interrupt)
 	 *	- ar.fpsr: set to kernel settings
 	 *	-  b6: preserved (same as on entry)
 	 */
+#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE
 GLOBAL_ENTRY(ia64_syscall_setup)
 #if PT(B6) != 0
 # error This code assumes that b6 is the first field in pt_regs.
@@ -981,6 +1029,7 @@ GLOBAL_ENTRY(ia64_syscall_setup)
 (p10)	mov r8=-EINVAL
 	br.ret.sptk.many b7
 END(ia64_syscall_setup)
+#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
 
 	.org ia64_ivt+0x3c00
 /////////////////////////////////////////////////////////////////////////////////////////
@@ -988,53 +1037,46 @@ END(ia64_syscall_setup)
 	DBG_FAULT(15)
 	FAULT(15)
 
+	.org ia64_ivt+0x4000
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x4000 Entry 16 (size 64 bundles) Reserved
+	DBG_FAULT(16)
+	FAULT(16)
+
+#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(__IA64_ASM_PARAVIRTUALIZED_NATIVE)
 	/*
-	 * Squatting in this space ...
+	 * There is no particular reason for this code to be here, other than
+	 * that there happens to be space here that would go unused otherwise.
+	 * If this fault ever gets "unreserved", simply moved the following
+	 * code to a more suitable spot...
 	 *
-	 * This special case dispatcher for illegal operation faults allows preserved
-	 * registers to be modified through a callback function (asm only) that is handed
-	 * back from the fault handler in r8. Up to three arguments can be passed to the
-	 * callback function by returning an aggregate with the callback as its first
-	 * element, followed by the arguments.
+	 * account_sys_enter is called from SAVE_MIN* macros if accounting is
+	 * enabled and if the macro is entered from user mode.
 	 */
-ENTRY(dispatch_illegal_op_fault)
-	.prologue
-	.body
-	SAVE_MIN_WITH_COVER
-	ssm psr.ic | PSR_DEFAULT_BITS
+GLOBAL_ENTRY(account_sys_enter)
+	// mov.m r20=ar.itc is called in advance, and r13 is current
+	add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13
+	add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13
 	;;
-	srlz.i		// guarantee that interruption collection is on
+	ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP	// time at last check in kernel
+	ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE	// time at left from kernel
+        ;;
+	ld8 r23=[r16],TI_AC_STAMP-TI_AC_STIME	// cumulated stime
+	ld8 r21=[r17]				// cumulated utime
+	sub r22=r19,r18				// stime before leave kernel
 	;;
-(p15)	ssm psr.i	// restore psr.i
-	adds r3=8,r2	// set up second base pointer for SAVE_REST
+	st8 [r16]=r20,TI_AC_STIME-TI_AC_STAMP	// update stamp
+	sub r18=r20,r19				// elapsed time in user mode
 	;;
-	alloc r14=ar.pfs,0,0,1,0	// must be first in insn group
-	mov out0=ar.ec
+	add r23=r23,r22				// sum stime
+	add r21=r21,r18				// sum utime
 	;;
-	SAVE_REST
-	PT_REGS_UNWIND_INFO(0)
-	;;
-	br.call.sptk.many rp=ia64_illegal_op_fault
-.ret0:	;;
-	alloc r14=ar.pfs,0,0,3,0	// must be first in insn group
-	mov out0=r9
-	mov out1=r10
-	mov out2=r11
-	movl r15=ia64_leave_kernel
-	;;
-	mov rp=r15
-	mov b6=r8
+	st8 [r16]=r23				// update stime
+	st8 [r17]=r21				// update utime
 	;;
-	cmp.ne p6,p0=0,r8
-(p6)	br.call.dpnt.many b6=b6		// call returns to ia64_leave_kernel
-	br.sptk.many ia64_leave_kernel
-END(dispatch_illegal_op_fault)
-
-	.org ia64_ivt+0x4000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x4000 Entry 16 (size 64 bundles) Reserved
-	DBG_FAULT(16)
-	FAULT(16)
+	br.ret.sptk.many rp
+END(account_sys_enter)
+#endif
 
 	.org ia64_ivt+0x4400
 /////////////////////////////////////////////////////////////////////////////////////////
@@ -1042,110 +1084,18 @@ END(dispatch_illegal_op_fault)
 	DBG_FAULT(17)
 	FAULT(17)
 
-ENTRY(non_syscall)
-	mov ar.rsc=r27			// restore ar.rsc before SAVE_MIN_WITH_COVER
-	;;
-	SAVE_MIN_WITH_COVER
-
-	// There is no particular reason for this code to be here, other than that
-	// there happens to be space here that would go unused otherwise.  If this
-	// fault ever gets "unreserved", simply moved the following code to a more
-	// suitable spot...
-
-	alloc r14=ar.pfs,0,0,2,0
-	mov out0=cr.iim
-	add out1=16,sp
-	adds r3=8,r2			// set up second base pointer for SAVE_REST
-
-	ssm psr.ic | PSR_DEFAULT_BITS
-	;;
-	srlz.i				// guarantee that interruption collection is on
-	;;
-(p15)	ssm psr.i			// restore psr.i
-	movl r15=ia64_leave_kernel
-	;;
-	SAVE_REST
-	mov rp=r15
-	;;
-	br.call.sptk.many b6=ia64_bad_break	// avoid WAW on CFM and ignore return addr
-END(non_syscall)
-
 	.org ia64_ivt+0x4800
 /////////////////////////////////////////////////////////////////////////////////////////
 // 0x4800 Entry 18 (size 64 bundles) Reserved
 	DBG_FAULT(18)
 	FAULT(18)
 
-	/*
-	 * There is no particular reason for this code to be here, other than that
-	 * there happens to be space here that would go unused otherwise.  If this
-	 * fault ever gets "unreserved", simply moved the following code to a more
-	 * suitable spot...
-	 */
-
-ENTRY(dispatch_unaligned_handler)
-	SAVE_MIN_WITH_COVER
-	;;
-	alloc r14=ar.pfs,0,0,2,0		// now it's safe (must be first in insn group!)
-	mov out0=cr.ifa
-	adds out1=16,sp
-
-	ssm psr.ic | PSR_DEFAULT_BITS
-	;;
-	srlz.i					// guarantee that interruption collection is on
-	;;
-(p15)	ssm psr.i				// restore psr.i
-	adds r3=8,r2				// set up second base pointer
-	;;
-	SAVE_REST
-	movl r14=ia64_leave_kernel
-	;;
-	mov rp=r14
-	br.sptk.many ia64_prepare_handle_unaligned
-END(dispatch_unaligned_handler)
-
 	.org ia64_ivt+0x4c00
 /////////////////////////////////////////////////////////////////////////////////////////
 // 0x4c00 Entry 19 (size 64 bundles) Reserved
 	DBG_FAULT(19)
 	FAULT(19)
 
-	/*
-	 * There is no particular reason for this code to be here, other than that
-	 * there happens to be space here that would go unused otherwise.  If this
-	 * fault ever gets "unreserved", simply moved the following code to a more
-	 * suitable spot...
-	 */
-
-ENTRY(dispatch_to_fault_handler)
-	/*
-	 * Input:
-	 *	psr.ic:	off
-	 *	r19:	fault vector number (e.g., 24 for General Exception)
-	 *	r31:	contains saved predicates (pr)
-	 */
-	SAVE_MIN_WITH_COVER_R19
-	alloc r14=ar.pfs,0,0,5,0
-	mov out0=r15
-	mov out1=cr.isr
-	mov out2=cr.ifa
-	mov out3=cr.iim
-	mov out4=cr.itir
-	;;
-	ssm psr.ic | PSR_DEFAULT_BITS
-	;;
-	srlz.i					// guarantee that interruption collection is on
-	;;
-(p15)	ssm psr.i				// restore psr.i
-	adds r3=8,r2				// set up second base pointer for SAVE_REST
-	;;
-	SAVE_REST
-	movl r14=ia64_leave_kernel
-	;;
-	mov rp=r14
-	br.call.sptk.many b6=ia64_fault
-END(dispatch_to_fault_handler)
-
 //
 // --- End of long entries, Beginning of short entries
 //
@@ -1155,8 +1105,8 @@ END(dispatch_to_fault_handler)
 // 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49)
 ENTRY(page_not_present)
 	DBG_FAULT(20)
-	mov r16=cr.ifa
-	rsm psr.dt
+	MOV_FROM_IFA(r16)
+	RSM_PSR_DT
 	/*
 	 * The Linux page fault handler doesn't expect non-present pages to be in
 	 * the TLB.  Flush the existing entry now, so we meet that expectation.
@@ -1175,8 +1125,8 @@ END(page_not_present)
 // 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52)
 ENTRY(key_permission)
 	DBG_FAULT(21)
-	mov r16=cr.ifa
-	rsm psr.dt
+	MOV_FROM_IFA(r16)
+	RSM_PSR_DT
 	mov r31=pr
 	;;
 	srlz.d
@@ -1188,8 +1138,8 @@ END(key_permission)
 // 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
 ENTRY(iaccess_rights)
 	DBG_FAULT(22)
-	mov r16=cr.ifa
-	rsm psr.dt
+	MOV_FROM_IFA(r16)
+	RSM_PSR_DT
 	mov r31=pr
 	;;
 	srlz.d
@@ -1201,8 +1151,8 @@ END(iaccess_rights)
 // 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
 ENTRY(daccess_rights)
 	DBG_FAULT(23)
-	mov r16=cr.ifa
-	rsm psr.dt
+	MOV_FROM_IFA(r16)
+	RSM_PSR_DT
 	mov r31=pr
 	;;
 	srlz.d
@@ -1214,7 +1164,7 @@ END(daccess_rights)
 // 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
 ENTRY(general_exception)
 	DBG_FAULT(24)
-	mov r16=cr.isr
+	MOV_FROM_ISR(r16)
 	mov r31=pr
 	;;
 	cmp4.eq p6,p0=0,r16
@@ -1243,8 +1193,8 @@ END(disabled_fp_reg)
 ENTRY(nat_consumption)
 	DBG_FAULT(26)
 
-	mov r16=cr.ipsr
-	mov r17=cr.isr
+	MOV_FROM_IPSR(p0, r16)
+	MOV_FROM_ISR(r17)
 	mov r31=pr				// save PR
 	;;
 	and r18=0xf,r17				// r18 = cr.ipsr.code{3:0}
@@ -1254,10 +1204,10 @@ ENTRY(nat_consumption)
 	dep r16=-1,r16,IA64_PSR_ED_BIT,1
 (p6)	br.cond.spnt 1f		// branch if (cr.ispr.na == 0 || cr.ipsr.code{3:0} != LFETCH)
 	;;
-	mov cr.ipsr=r16		// set cr.ipsr.na
+	MOV_TO_IPSR(p0, r16, r18)
 	mov pr=r31,-1
 	;;
-	rfi
+	RFI
 
 1:	mov pr=r31,-1
 	;;
@@ -1279,26 +1229,26 @@ ENTRY(speculation_vector)
 	 *
 	 * cr.imm contains zero_ext(imm21)
 	 */
-	mov r18=cr.iim
+	MOV_FROM_IIM(r18)
 	;;
-	mov r17=cr.iip
+	MOV_FROM_IIP(r17)
 	shl r18=r18,43			// put sign bit in position (43=64-21)
 	;;
 
-	mov r16=cr.ipsr
+	MOV_FROM_IPSR(p0, r16)
 	shr r18=r18,39			// sign extend (39=43-4)
 	;;
 
 	add r17=r17,r18			// now add the offset
 	;;
-	mov cr.iip=r17
+	MOV_TO_IIP(r17, r19)
 	dep r16=0,r16,41,2		// clear EI
 	;;
 
-	mov cr.ipsr=r16
+	MOV_TO_IPSR(p0, r16, r19)
 	;;
 
-	rfi				// and go back
+	RFI
 END(speculation_vector)
 
 	.org ia64_ivt+0x5800
@@ -1320,7 +1270,6 @@ END(debug_vector)
 // 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
 ENTRY(unaligned_access)
 	DBG_FAULT(30)
-	mov r16=cr.ipsr
 	mov r31=pr		// prepare to save predicates
 	;;
 	br.sptk.many dispatch_unaligned_handler
@@ -1435,28 +1384,6 @@ END(ia32_exception)
 // 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept  (30,31,59,70,71)
 ENTRY(ia32_intercept)
 	DBG_FAULT(46)
-#ifdef	CONFIG_IA32_SUPPORT
-	mov r31=pr
-	mov r16=cr.isr
-	;;
-	extr.u r17=r16,16,8	// get ISR.code
-	mov r18=ar.eflag
-	mov r19=cr.iim		// old eflag value
-	;;
-	cmp.ne p6,p0=2,r17
-(p6)	br.cond.spnt 1f		// not a system flag fault
-	xor r16=r18,r19
-	;;
-	extr.u r17=r16,18,1	// get the eflags.ac bit
-	;;
-	cmp.eq p6,p0=0,r17
-(p6)	br.cond.spnt 1f		// eflags.ac bit didn't change
-	;;
-	mov pr=r31,-1		// restore predicate registers
-	rfi
-
-1:
-#endif	// CONFIG_IA32_SUPPORT
 	FAULT(46)
 END(ia32_intercept)
 
@@ -1465,12 +1392,7 @@ END(ia32_intercept)
 // 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt  (74)
 ENTRY(ia32_interrupt)
 	DBG_FAULT(47)
-#ifdef CONFIG_IA32_SUPPORT
-	mov r31=pr
-	br.sptk.many dispatch_to_ia32_handler
-#else
 	FAULT(47)
-#endif
 END(ia32_interrupt)
 
 	.org ia64_ivt+0x6c00
@@ -1593,89 +1515,174 @@ END(ia32_interrupt)
 	DBG_FAULT(67)
 	FAULT(67)
 
-#ifdef CONFIG_IA32_SUPPORT
+	//-----------------------------------------------------------------------------------
+	// call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
+ENTRY(page_fault)
+	SSM_PSR_DT_AND_SRLZ_I
+	;;
+	SAVE_MIN_WITH_COVER
+	alloc r15=ar.pfs,0,0,3,0
+	MOV_FROM_IFA(out0)
+	MOV_FROM_ISR(out1)
+	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r14, r3)
+	adds r3=8,r2				// set up second base pointer
+	SSM_PSR_I(p15, p15, r14)		// restore psr.i
+	movl r14=ia64_leave_kernel
+	;;
+	SAVE_REST
+	mov rp=r14
+	;;
+	adds out2=16,r12			// out2 = pointer to pt_regs
+	br.call.sptk.many b6=ia64_do_page_fault	// ignore return address
+END(page_fault)
+
+ENTRY(non_syscall)
+	mov ar.rsc=r27			// restore ar.rsc before SAVE_MIN_WITH_COVER
+	;;
+	SAVE_MIN_WITH_COVER
 
-	/*
-	 * There is no particular reason for this code to be here, other than that
-	 * there happens to be space here that would go unused otherwise.  If this
-	 * fault ever gets "unreserved", simply moved the following code to a more
-	 * suitable spot...
-	 */
+	// There is no particular reason for this code to be here, other than that
+	// there happens to be space here that would go unused otherwise.  If this
+	// fault ever gets "unreserved", simply moved the following code to a more
+	// suitable spot...
 
-	// IA32 interrupt entry point
+	alloc r14=ar.pfs,0,0,2,0
+	MOV_FROM_IIM(out0)
+	add out1=16,sp
+	adds r3=8,r2			// set up second base pointer for SAVE_REST
 
-ENTRY(dispatch_to_ia32_handler)
-	SAVE_MIN
+	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r15, r24)
+					// guarantee that interruption collection is on
+	SSM_PSR_I(p15, p15, r15)	// restore psr.i
+	movl r15=ia64_leave_kernel
 	;;
-	mov r14=cr.isr
-	ssm psr.ic | PSR_DEFAULT_BITS
+	SAVE_REST
+	mov rp=r15
 	;;
-	srlz.i					// guarantee that interruption collection is on
+	br.call.sptk.many b6=ia64_bad_break	// avoid WAW on CFM and ignore return addr
+END(non_syscall)
+
+ENTRY(__interrupt)
+	DBG_FAULT(12)
+	mov r31=pr		// prepare to save predicates
 	;;
-(p15)	ssm psr.i
-	adds r3=8,r2		// Base pointer for SAVE_REST
+	SAVE_MIN_WITH_COVER	// uses r31; defines r2 and r3
+	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r14)
+				// ensure everybody knows psr.ic is back on
+	adds r3=8,r2		// set up second base pointer for SAVE_REST
 	;;
 	SAVE_REST
 	;;
-	mov r15=0x80
-	shr r14=r14,16		// Get interrupt number
-	;;
-	cmp.ne p6,p0=r14,r15
-(p6)	br.call.dpnt.many b6=non_ia32_syscall
-
-	adds r14=IA64_PT_REGS_R8_OFFSET + 16,sp	// 16 byte hole per SW conventions
-	adds r15=IA64_PT_REGS_R1_OFFSET + 16,sp
+	MCA_RECOVER_RANGE(interrupt)
+	alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
+	MOV_FROM_IVR(out0, r8)	// pass cr.ivr as first arg
+	add out1=16,sp		// pass pointer to pt_regs as second arg
 	;;
-	cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0
-	ld8 r8=[r14]		// get r8
+	srlz.d			// make sure we see the effect of cr.ivr
+	movl r14=ia64_leave_kernel
 	;;
-	st8 [r15]=r8		// save original EAX in r1 (IA32 procs don't use the GP)
+	mov rp=r14
+	br.call.sptk.many b6=ia64_handle_irq
+END(__interrupt)
+
+	/*
+	 * There is no particular reason for this code to be here, other than that
+	 * there happens to be space here that would go unused otherwise.  If this
+	 * fault ever gets "unreserved", simply moved the following code to a more
+	 * suitable spot...
+	 */
+
+ENTRY(dispatch_unaligned_handler)
+	SAVE_MIN_WITH_COVER
 	;;
-	alloc r15=ar.pfs,0,0,6,0	// must first in an insn group
+	alloc r14=ar.pfs,0,0,2,0		// now it's safe (must be first in insn group!)
+	MOV_FROM_IFA(out0)
+	adds out1=16,sp
+
+	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r24)
+						// guarantee that interruption collection is on
+	SSM_PSR_I(p15, p15, r3)			// restore psr.i
+	adds r3=8,r2				// set up second base pointer
 	;;
-	ld4 r8=[r14],8		// r8 == eax (syscall number)
-	mov r15=IA32_NR_syscalls
+	SAVE_REST
+	movl r14=ia64_leave_kernel
 	;;
-	cmp.ltu.unc p6,p7=r8,r15
-	ld4 out1=[r14],8	// r9 == ecx
+	mov rp=r14
+	br.sptk.many ia64_prepare_handle_unaligned
+END(dispatch_unaligned_handler)
+
+	/*
+	 * There is no particular reason for this code to be here, other than that
+	 * there happens to be space here that would go unused otherwise.  If this
+	 * fault ever gets "unreserved", simply moved the following code to a more
+	 * suitable spot...
+	 */
+
+ENTRY(dispatch_to_fault_handler)
+	/*
+	 * Input:
+	 *	psr.ic:	off
+	 *	r19:	fault vector number (e.g., 24 for General Exception)
+	 *	r31:	contains saved predicates (pr)
+	 */
+	SAVE_MIN_WITH_COVER_R19
+	alloc r14=ar.pfs,0,0,5,0
+	MOV_FROM_ISR(out1)
+	MOV_FROM_IFA(out2)
+	MOV_FROM_IIM(out3)
+	MOV_FROM_ITIR(out4)
 	;;
-	ld4 out2=[r14],8	// r10 == edx
+	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, out0)
+						// guarantee that interruption collection is on
+	mov out0=r15
 	;;
-	ld4 out0=[r14]		// r11 == ebx
-	adds r14=(IA64_PT_REGS_R13_OFFSET) + 16,sp
+	SSM_PSR_I(p15, p15, r3)			// restore psr.i
+	adds r3=8,r2				// set up second base pointer for SAVE_REST
 	;;
-	ld4 out5=[r14],PT(R14)-PT(R13)	// r13 == ebp
+	SAVE_REST
+	movl r14=ia64_leave_kernel
 	;;
-	ld4 out3=[r14],PT(R15)-PT(R14)	// r14 == esi
-	adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
+	mov rp=r14
+	br.call.sptk.many b6=ia64_fault
+END(dispatch_to_fault_handler)
+
+	/*
+	 * Squatting in this space ...
+	 *
+	 * This special case dispatcher for illegal operation faults allows preserved
+	 * registers to be modified through a callback function (asm only) that is handed
+	 * back from the fault handler in r8. Up to three arguments can be passed to the
+	 * callback function by returning an aggregate with the callback as its first
+	 * element, followed by the arguments.
+	 */
+ENTRY(dispatch_illegal_op_fault)
+	.prologue
+	.body
+	SAVE_MIN_WITH_COVER
+	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r24)
+				// guarantee that interruption collection is on
 	;;
-	ld4 out4=[r14]		// r15 == edi
-	movl r16=ia32_syscall_table
+	SSM_PSR_I(p15, p15, r3)	// restore psr.i
+	adds r3=8,r2	// set up second base pointer for SAVE_REST
 	;;
-(p6)	shladd r16=r8,3,r16	// force ni_syscall if not valid syscall number
-	ld4 r2=[r2]		// r2 = current_thread_info()->flags
+	alloc r14=ar.pfs,0,0,1,0	// must be first in insn group
+	mov out0=ar.ec
 	;;
-	ld8 r16=[r16]
-	and r2=_TIF_SYSCALL_TRACEAUDIT,r2	// mask trace or audit
+	SAVE_REST
+	PT_REGS_UNWIND_INFO(0)
 	;;
-	mov b6=r16
-	movl r15=ia32_ret_from_syscall
-	cmp.eq p8,p0=r2,r0
+	br.call.sptk.many rp=ia64_illegal_op_fault
+.ret0:	;;
+	alloc r14=ar.pfs,0,0,3,0	// must be first in insn group
+	mov out0=r9
+	mov out1=r10
+	mov out2=r11
+	movl r15=ia64_leave_kernel
 	;;
 	mov rp=r15
-(p8)	br.call.sptk.many b6=b6
-	br.cond.sptk ia32_trace_syscall
-
-non_ia32_syscall:
-	alloc r15=ar.pfs,0,0,2,0
-	mov out0=r14				// interrupt #
-	add out1=16,sp				// pointer to pt_regs
-	;;			// avoid WAW on CFM
-	br.call.sptk.many rp=ia32_bad_interrupt
-.ret1:	movl r15=ia64_leave_kernel
+	mov b6=r8
 	;;
-	mov rp=r15
-	br.ret.sptk.many rp
-END(dispatch_to_ia32_handler)
-
-#endif /* CONFIG_IA32_SUPPORT */
+	cmp.ne p6,p0=0,r8
+(p6)	br.call.dpnt.many b6=b6		// call returns to ia64_leave_kernel
+	br.sptk.many ia64_leave_kernel
+END(dispatch_illegal_op_fault)
diff --git a/arch/ia64/kernel/jprobes.S b/arch/ia64/kernel/jprobes.S
index 2323377e369..f69389c7be1 100644
--- a/arch/ia64/kernel/jprobes.S
+++ b/arch/ia64/kernel/jprobes.S
@@ -45,13 +45,14 @@
  * to the correct location.
  */
 #include <asm/asmmacro.h>
+#include <asm/break.h>
 
 	/*
 	 * void jprobe_break(void)
 	 */
 	.section .kprobes.text, "ax"
 ENTRY(jprobe_break)
-	break.m 0x80300
+	break.m __IA64_BREAK_JPROBE
 END(jprobe_break)
 
 	/*
@@ -60,3 +61,30 @@ END(jprobe_break)
 GLOBAL_ENTRY(jprobe_inst_return)
 	br.call.sptk.many b0=jprobe_break
 END(jprobe_inst_return)
+
+GLOBAL_ENTRY(invalidate_stacked_regs)
+	movl r16=invalidate_restore_cfm
+	;;
+	mov b6=r16
+	;;
+	br.ret.sptk.many b6
+	;;
+invalidate_restore_cfm:
+	mov r16=ar.rsc
+	;;
+	mov ar.rsc=r0
+	;;
+	loadrs
+	;;
+	mov ar.rsc=r16
+	;;
+	br.cond.sptk.many rp
+END(invalidate_stacked_regs)
+
+GLOBAL_ENTRY(flush_register_stack)
+	// flush dirty regs to backing store (must be first in insn group)
+	flushrs
+	;;
+	br.ret.sptk.many rp
+END(flush_register_stack)
+
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index 471086b808a..074fde49c9e 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -23,28 +23,24 @@
  *              <anil.s.keshavamurthy@intel.com> adapted from i386
  */
 
-#include <linux/config.h>
 #include <linux/kprobes.h>
 #include <linux/ptrace.h>
-#include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/preempt.h>
 #include <linux/moduleloader.h>
+#include <linux/kdebug.h>
 
 #include <asm/pgtable.h>
-#include <asm/kdebug.h>
 #include <asm/sections.h>
+#include <asm/uaccess.h>
 
 extern void jprobe_inst_return(void);
 
-/* kprobe_status settings */
-#define KPROBE_HIT_ACTIVE	0x00000001
-#define KPROBE_HIT_SS		0x00000002
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
-static struct kprobe *current_kprobe, *kprobe_prev;
-static unsigned long kprobe_status, kprobe_status_prev;
-static struct pt_regs jprobe_saved_regs;
+struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
 
 enum instruction_type {A, I, M, F, B, L, X, u};
 static enum instruction_type bundle_encoding[32][3] = {
@@ -82,6 +78,20 @@ static enum instruction_type bundle_encoding[32][3] = {
   { u, u, u },  			/* 1F */
 };
 
+/* Insert a long branch code */
+static void __kprobes set_brl_inst(void *from, void *to)
+{
+	s64 rel = ((s64) to - (s64) from) >> 4;
+	bundle_t *brl;
+	brl = (bundle_t *) ((u64) from & ~0xf);
+	brl->quad0.template = 0x05;	/* [MLX](stop) */
+	brl->quad0.slot0 = NOP_M_INST;	/* nop.m 0x0 */
+	brl->quad0.slot1_p0 = ((rel >> 20) & 0x7fffffffff) << 2;
+	brl->quad1.slot1_p1 = (((rel >> 20) & 0x7fffffffff) << 2) >> (64 - 46);
+	/* brl.cond.sptk.many.clr rel<<4 (qp=0) */
+	brl->quad1.slot2 = BRL_INST(rel >> 59, rel & 0xfffff);
+}
+
 /*
  * In this function we check to see if the instruction
  * is IP relative instruction and update the kprobe
@@ -94,9 +104,10 @@ static void __kprobes update_kprobe_inst_flag(uint template, uint  slot,
 {
 	p->ainsn.inst_flag = 0;
 	p->ainsn.target_br_reg = 0;
+	p->ainsn.slot = slot;
 
 	/* Check for Break instruction
- 	 * Bits 37:40 Major opcode to be zero
+	 * Bits 37:40 Major opcode to be zero
 	 * Bits 27:32 X6 to be zero
 	 * Bits 32:35 X3 to be zero
 	 */
@@ -110,19 +121,19 @@ static void __kprobes update_kprobe_inst_flag(uint template, uint  slot,
 		switch (major_opcode) {
 		  case INDIRECT_CALL_OPCODE:
 	 		p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG;
- 			p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7);
- 			break;
+			p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7);
+			break;
 		  case IP_RELATIVE_PREDICT_OPCODE:
 		  case IP_RELATIVE_BRANCH_OPCODE:
 			p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR;
- 			break;
+			break;
 		  case IP_RELATIVE_CALL_OPCODE:
- 			p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR;
- 			p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG;
- 			p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7);
- 			break;
+			p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR;
+			p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG;
+			p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7);
+			break;
 		}
- 	} else if (bundle_encoding[template][slot] == X) {
+	} else if (bundle_encoding[template][slot] == X) {
 		switch (major_opcode) {
 		  case LONG_CALL_OPCODE:
 			p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG;
@@ -135,20 +146,66 @@ static void __kprobes update_kprobe_inst_flag(uint template, uint  slot,
 
 /*
  * In this function we check to see if the instruction
+ * (qp) cmpx.crel.ctype p1,p2=r2,r3
+ * on which we are inserting kprobe is cmp instruction
+ * with ctype as unc.
+ */
+static uint __kprobes is_cmp_ctype_unc_inst(uint template, uint slot,
+					    uint major_opcode,
+					    unsigned long kprobe_inst)
+{
+	cmp_inst_t cmp_inst;
+	uint ctype_unc = 0;
+
+	if (!((bundle_encoding[template][slot] == I) ||
+		(bundle_encoding[template][slot] == M)))
+		goto out;
+
+	if (!((major_opcode == 0xC) || (major_opcode == 0xD) ||
+		(major_opcode == 0xE)))
+		goto out;
+
+	cmp_inst.l = kprobe_inst;
+	if ((cmp_inst.f.x2 == 0) || (cmp_inst.f.x2 == 1)) {
+		/* Integer compare - Register Register (A6 type)*/
+		if ((cmp_inst.f.tb == 0) && (cmp_inst.f.ta == 0)
+				&&(cmp_inst.f.c == 1))
+			ctype_unc = 1;
+	} else if ((cmp_inst.f.x2 == 2)||(cmp_inst.f.x2 == 3)) {
+		/* Integer compare - Immediate Register (A8 type)*/
+		if ((cmp_inst.f.ta == 0) &&(cmp_inst.f.c == 1))
+			ctype_unc = 1;
+	}
+out:
+	return ctype_unc;
+}
+
+/*
+ * In this function we check to see if the instruction
  * on which we are inserting kprobe is supported.
- * Returns 0 if supported
+ * Returns qp value if supported
  * Returns -EINVAL if unsupported
  */
 static int __kprobes unsupported_inst(uint template, uint  slot,
 				      uint major_opcode,
 				      unsigned long kprobe_inst,
-				      struct kprobe *p)
+				      unsigned long addr)
 {
-	unsigned long addr = (unsigned long)p->addr;
+	int qp;
+
+	qp = kprobe_inst & 0x3f;
+	if (is_cmp_ctype_unc_inst(template, slot, major_opcode, kprobe_inst)) {
+		if (slot == 1 && qp)  {
+			printk(KERN_WARNING "Kprobes on cmp unc "
+					"instruction on slot 1 at <0x%lx> "
+					"is not supported\n", addr);
+			return -EINVAL;
 
-	if (bundle_encoding[template][slot] == I) {
-		switch (major_opcode) {
-			case 0x0: //I_UNIT_MISC_OPCODE:
+		}
+		qp = 0;
+	}
+	else if (bundle_encoding[template][slot] == I) {
+		if (major_opcode == 0) {
 			/*
 			 * Check for Integer speculation instruction
 			 * - Bit 33-35 to be equal to 0x1
@@ -156,10 +213,9 @@ static int __kprobes unsupported_inst(uint template, uint  slot,
 			if (((kprobe_inst >> 33) & 0x7) == 1) {
 				printk(KERN_WARNING
 					"Kprobes on speculation inst at <0x%lx> not supported\n",
-					addr);
+						addr);
 				return -EINVAL;
 			}
-
 			/*
 			 * IP relative mov instruction
 			 *  - Bit 27-35 to be equal to 0x30
@@ -167,50 +223,74 @@ static int __kprobes unsupported_inst(uint template, uint  slot,
 			if (((kprobe_inst >> 27) & 0x1FF) == 0x30) {
 				printk(KERN_WARNING
 					"Kprobes on \"mov r1=ip\" at <0x%lx> not supported\n",
-					addr);
+						addr);
 				return -EINVAL;
 
 			}
 		}
+		else if ((major_opcode == 5) &&	!(kprobe_inst & (0xFUl << 33)) &&
+				(kprobe_inst & (0x1UL << 12))) {
+			/* test bit instructions, tbit,tnat,tf
+			 * bit 33-36 to be equal to 0
+			 * bit 12 to be equal to 1
+			 */
+			if (slot == 1 && qp) {
+				printk(KERN_WARNING "Kprobes on test bit "
+						"instruction on slot at <0x%lx> "
+						"is not supported\n", addr);
+				return -EINVAL;
+			}
+			qp = 0;
+		}
 	}
-	return 0;
-}
-
-
-/*
- * In this function we check to see if the instruction
- * (qp) cmpx.crel.ctype p1,p2=r2,r3
- * on which we are inserting kprobe is cmp instruction
- * with ctype as unc.
- */
-static uint __kprobes is_cmp_ctype_unc_inst(uint template, uint slot,
-					    uint major_opcode,
-					    unsigned long kprobe_inst)
-{
-	cmp_inst_t cmp_inst;
-	uint ctype_unc = 0;
-
-	if (!((bundle_encoding[template][slot] == I) ||
-		(bundle_encoding[template][slot] == M)))
-		goto out;
-
-	if (!((major_opcode == 0xC) || (major_opcode == 0xD) ||
-		(major_opcode == 0xE)))
-		goto out;
+	else if (bundle_encoding[template][slot] == B) {
+		if (major_opcode == 7) {
+			/* IP-Relative Predict major code is 7 */
+			printk(KERN_WARNING "Kprobes on IP-Relative"
+					"Predict is not supported\n");
+			return -EINVAL;
+		}
+		else if (major_opcode == 2) {
+			/* Indirect Predict, major code is 2
+			 * bit 27-32 to be equal to 10 or 11
+			 */
+			int x6=(kprobe_inst >> 27) & 0x3F;
+			if ((x6 == 0x10) || (x6 == 0x11)) {
+				printk(KERN_WARNING "Kprobes on "
+					"Indirect Predict is not supported\n");
+				return -EINVAL;
+			}
+		}
+	}
+	/* kernel does not use float instruction, here for safety kprobe
+	 * will judge whether it is fcmp/flass/float approximation instruction
+	 */
+	else if (unlikely(bundle_encoding[template][slot] == F)) {
+		if ((major_opcode == 4 || major_opcode == 5) &&
+				(kprobe_inst  & (0x1 << 12))) {
+			/* fcmp/fclass unc instruction */
+			if (slot == 1 && qp) {
+				printk(KERN_WARNING "Kprobes on fcmp/fclass "
+					"instruction on slot at <0x%lx> "
+					"is not supported\n", addr);
+				return -EINVAL;
 
-	cmp_inst.l = kprobe_inst;
-	if ((cmp_inst.f.x2 == 0) || (cmp_inst.f.x2 == 1)) {
-		/* Integere compare - Register Register (A6 type)*/
-		if ((cmp_inst.f.tb == 0) && (cmp_inst.f.ta == 0)
-				&&(cmp_inst.f.c == 1))
-			ctype_unc = 1;
-	} else if ((cmp_inst.f.x2 == 2)||(cmp_inst.f.x2 == 3)) {
-		/* Integere compare - Immediate Register (A8 type)*/
-		if ((cmp_inst.f.ta == 0) &&(cmp_inst.f.c == 1))
-			ctype_unc = 1;
+			}
+			qp = 0;
+		}
+		if ((major_opcode == 0 || major_opcode == 1) &&
+			(kprobe_inst & (0x1UL << 33))) {
+			/* float Approximation instruction */
+			if (slot == 1 && qp) {
+				printk(KERN_WARNING "Kprobes on float Approx "
+					"instr at <0x%lx> is not supported\n",
+						addr);
+				return -EINVAL;
+			}
+			qp = 0;
+		}
 	}
-out:
-	return ctype_unc;
+	return qp;
 }
 
 /*
@@ -220,20 +300,17 @@ out:
 static void __kprobes prepare_break_inst(uint template, uint  slot,
 					 uint major_opcode,
 					 unsigned long kprobe_inst,
-					 struct kprobe *p)
+					 struct kprobe *p,
+					 int qp)
 {
 	unsigned long break_inst = BREAK_INST;
-	bundle_t *bundle = &p->ainsn.insn.bundle;
+	bundle_t *bundle = &p->opcode.bundle;
 
 	/*
 	 * Copy the original kprobe_inst qualifying predicate(qp)
-	 * to the break instruction iff !is_cmp_ctype_unc_inst
-	 * because for cmp instruction with ctype equal to unc,
-	 * which is a special instruction always needs to be
-	 * executed regradless of qp
+	 * to the break instruction
 	 */
-	if (!is_cmp_ctype_unc_inst(template, slot, major_opcode, kprobe_inst))
-		break_inst |= (0x3f & kprobe_inst);
+	break_inst |= qp;
 
 	switch (slot) {
 	  case 0:
@@ -256,7 +333,7 @@ static void __kprobes prepare_break_inst(uint template, uint  slot,
 	update_kprobe_inst_flag(template, slot, major_opcode, kprobe_inst, p);
 }
 
-static inline void get_kprobe_inst(bundle_t *bundle, uint slot,
+static void __kprobes get_kprobe_inst(bundle_t *bundle, uint slot,
 	       	unsigned long *kprobe_inst, uint *major_opcode)
 {
 	unsigned long kprobe_inst_p0, kprobe_inst_p1;
@@ -266,24 +343,24 @@ static inline void get_kprobe_inst(bundle_t *bundle, uint slot,
 
 	switch (slot) {
 	  case 0:
- 		*major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT);
- 		*kprobe_inst = bundle->quad0.slot0;
-		break;
+		*major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT);
+		*kprobe_inst = bundle->quad0.slot0;
+		  break;
 	  case 1:
- 		*major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT);
-  		kprobe_inst_p0 = bundle->quad0.slot1_p0;
-  		kprobe_inst_p1 = bundle->quad1.slot1_p1;
-  		*kprobe_inst = kprobe_inst_p0 | (kprobe_inst_p1 << (64-46));
+		*major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT);
+		kprobe_inst_p0 = bundle->quad0.slot1_p0;
+		kprobe_inst_p1 = bundle->quad1.slot1_p1;
+		*kprobe_inst = kprobe_inst_p0 | (kprobe_inst_p1 << (64-46));
 		break;
 	  case 2:
- 		*major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT);
- 		*kprobe_inst = bundle->quad1.slot2;
+		*major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT);
+		*kprobe_inst = bundle->quad1.slot2;
 		break;
 	}
 }
 
 /* Returns non-zero if the addr is in the Interrupt Vector Table */
-static inline int in_ivt_functions(unsigned long addr)
+static int __kprobes in_ivt_functions(unsigned long addr)
 {
 	return (addr >= (unsigned long)__start_ivt_text
 		&& addr < (unsigned long)__end_ivt_text);
@@ -298,36 +375,36 @@ static int __kprobes valid_kprobe_addr(int template, int slot,
 		return -EINVAL;
 	}
 
- 	if (in_ivt_functions(addr)) {
- 		printk(KERN_WARNING "Kprobes can't be inserted inside "
+	if (in_ivt_functions(addr)) {
+		printk(KERN_WARNING "Kprobes can't be inserted inside "
 				"IVT functions at 0x%lx\n", addr);
- 		return -EINVAL;
- 	}
-
-	if (slot == 1 && bundle_encoding[template][1] != L) {
-		printk(KERN_WARNING "Inserting kprobes on slot #1 "
-		       "is not supported\n");
 		return -EINVAL;
 	}
 
 	return 0;
 }
 
-static inline void save_previous_kprobe(void)
+static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
-	kprobe_prev = current_kprobe;
-	kprobe_status_prev = kprobe_status;
+	unsigned int i;
+	i = atomic_add_return(1, &kcb->prev_kprobe_index);
+	kcb->prev_kprobe[i-1].kp = kprobe_running();
+	kcb->prev_kprobe[i-1].status = kcb->kprobe_status;
 }
 
-static inline void restore_previous_kprobe(void)
+static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
-	current_kprobe = kprobe_prev;
-	kprobe_status = kprobe_status_prev;
+	unsigned int i;
+	i = atomic_read(&kcb->prev_kprobe_index);
+	__get_cpu_var(current_kprobe) = kcb->prev_kprobe[i-1].kp;
+	kcb->kprobe_status = kcb->prev_kprobe[i-1].status;
+	atomic_sub(1, &kcb->prev_kprobe_index);
 }
 
-static inline void set_current_kprobe(struct kprobe *p)
+static void __kprobes set_current_kprobe(struct kprobe *p,
+			struct kprobe_ctlblk *kcb)
 {
-	current_kprobe = p;
+	__get_cpu_var(current_kprobe) = p;
 }
 
 static void kretprobe_trampoline(void)
@@ -345,18 +422,19 @@ static void kretprobe_trampoline(void)
 int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kretprobe_instance *ri = NULL;
-	struct hlist_head *head;
-	struct hlist_node *node, *tmp;
-	unsigned long orig_ret_address = 0;
+	struct hlist_head *head, empty_rp;
+	struct hlist_node *tmp;
+	unsigned long flags, orig_ret_address = 0;
 	unsigned long trampoline_address =
 		((struct fnptr *)kretprobe_trampoline)->ip;
 
-        head = kretprobe_inst_table_head(current);
+	INIT_HLIST_HEAD(&empty_rp);
+	kretprobe_hash_lock(current, &head, &flags);
 
 	/*
 	 * It is possible to have multiple instances associated with a given
 	 * task either because an multiple functions in the call path
-	 * have a return probe installed on them, and/or more then one return
+	 * have a return probe installed on them, and/or more than one return
 	 * return probe was registered for a target function.
 	 *
 	 * We can handle this because:
@@ -366,16 +444,33 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 	 *       real return address, and all the rest will point to
 	 *       kretprobe_trampoline
 	 */
-	hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
-                if (ri->task != current)
+	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+		if (ri->task != current)
+			/* another task is sharing our hash bucket */
+			continue;
+
+		orig_ret_address = (unsigned long)ri->ret_addr;
+		if (orig_ret_address != trampoline_address)
+			/*
+			 * This is the real return address. Any other
+			 * instances associated with this task are for
+			 * other calls deeper on the call stack
+			 */
+			break;
+	}
+
+	regs->cr_iip = orig_ret_address;
+
+	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+		if (ri->task != current)
 			/* another task is sharing our hash bucket */
-                        continue;
+			continue;
 
 		if (ri->rp && ri->rp->handler)
 			ri->rp->handler(ri, regs);
 
 		orig_ret_address = (unsigned long)ri->ret_addr;
-		recycle_rp_inst(ri);
+		recycle_rp_inst(ri, &empty_rp);
 
 		if (orig_ret_address != trampoline_address)
 			/*
@@ -386,36 +481,101 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 			break;
 	}
 
-	BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address));
-	regs->cr_iip = orig_ret_address;
+	kretprobe_assert(ri, orig_ret_address, trampoline_address);
 
-	unlock_kprobes();
+	reset_current_kprobe();
+	kretprobe_hash_unlock(current, &flags);
 	preempt_enable_no_resched();
 
-        /*
-         * By returning a non-zero value, we are telling
-         * kprobe_handler() that we have handled unlocking
-         * and re-enabling preemption.
-         */
-        return 1;
+	hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
+		hlist_del(&ri->hlist);
+		kfree(ri);
+	}
+	/*
+	 * By returning a non-zero value, we are telling
+	 * kprobe_handler() that we don't want the post_handler
+	 * to run (and have re-enabled preemption)
+	 */
+	return 1;
 }
 
-void __kprobes arch_prepare_kretprobe(struct kretprobe *rp,
+void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				      struct pt_regs *regs)
 {
-	struct kretprobe_instance *ri;
+	ri->ret_addr = (kprobe_opcode_t *)regs->b0;
+
+	/* Replace the return addr with trampoline addr */
+	regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip;
+}
 
-	if ((ri = get_free_rp_inst(rp)) != NULL) {
-		ri->rp = rp;
-		ri->task = current;
-		ri->ret_addr = (kprobe_opcode_t *)regs->b0;
+/* Check the instruction in the slot is break */
+static int __kprobes __is_ia64_break_inst(bundle_t *bundle, uint slot)
+{
+	unsigned int major_opcode;
+	unsigned int template = bundle->quad0.template;
+	unsigned long kprobe_inst;
 
-		/* Replace the return addr with trampoline addr */
-		regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip;
+	/* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */
+	if (slot == 1 && bundle_encoding[template][1] == L)
+		slot++;
 
-		add_rp_inst(ri);
-	} else {
-		rp->nmissed++;
+	/* Get Kprobe probe instruction at given slot*/
+	get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode);
+
+	/* For break instruction,
+	 * Bits 37:40 Major opcode to be zero
+	 * Bits 27:32 X6 to be zero
+	 * Bits 32:35 X3 to be zero
+	 */
+	if (major_opcode || ((kprobe_inst >> 27) & 0x1FF)) {
+		/* Not a break instruction */
+		return 0;
+	}
+
+	/* Is a break instruction */
+	return 1;
+}
+
+/*
+ * In this function, we check whether the target bundle modifies IP or
+ * it triggers an exception. If so, it cannot be boostable.
+ */
+static int __kprobes can_boost(bundle_t *bundle, uint slot,
+			       unsigned long bundle_addr)
+{
+	unsigned int template = bundle->quad0.template;
+
+	do {
+		if (search_exception_tables(bundle_addr + slot) ||
+		    __is_ia64_break_inst(bundle, slot))
+			return 0;	/* exception may occur in this bundle*/
+	} while ((++slot) < 3);
+	template &= 0x1e;
+	if (template >= 0x10 /* including B unit */ ||
+	    template == 0x04 /* including X unit */ ||
+	    template == 0x06) /* undefined */
+		return 0;
+
+	return 1;
+}
+
+/* Prepare long jump bundle and disables other boosters if need */
+static void __kprobes prepare_booster(struct kprobe *p)
+{
+	unsigned long addr = (unsigned long)p->addr & ~0xFULL;
+	unsigned int slot = (unsigned long)p->addr & 0xf;
+	struct kprobe *other_kp;
+
+	if (can_boost(&p->ainsn.insn[0].bundle, slot, addr)) {
+		set_brl_inst(&p->ainsn.insn[1].bundle, (bundle_t *)addr + 1);
+		p->ainsn.inst_flag |= INST_FLAG_BOOSTABLE;
+	}
+
+	/* disables boosters in previous slots */
+	for (; addr < (unsigned long)p->addr; addr++) {
+		other_kp = get_kprobe((void *)addr);
+		if (other_kp)
+			other_kp->ainsn.inst_flag &= ~INST_FLAG_BOOSTABLE;
 	}
 }
 
@@ -425,79 +585,123 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
 	unsigned long *kprobe_addr = (unsigned long *)(addr & ~0xFULL);
 	unsigned long kprobe_inst=0;
 	unsigned int slot = addr & 0xf, template, major_opcode = 0;
-	bundle_t *bundle = &p->ainsn.insn.bundle;
-
-	memcpy(&p->opcode.bundle, kprobe_addr, sizeof(bundle_t));
-	memcpy(&p->ainsn.insn.bundle, kprobe_addr, sizeof(bundle_t));
+	bundle_t *bundle;
+	int qp;
 
- 	template = bundle->quad0.template;
+	bundle = &((kprobe_opcode_t *)kprobe_addr)->bundle;
+	template = bundle->quad0.template;
 
 	if(valid_kprobe_addr(template, slot, addr))
 		return -EINVAL;
 
 	/* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */
- 	if (slot == 1 && bundle_encoding[template][1] == L)
-  		slot++;
+	if (slot == 1 && bundle_encoding[template][1] == L)
+		slot++;
 
 	/* Get kprobe_inst and major_opcode from the bundle */
 	get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode);
 
-	if (unsupported_inst(template, slot, major_opcode, kprobe_inst, p))
-			return -EINVAL;
+	qp = unsupported_inst(template, slot, major_opcode, kprobe_inst, addr);
+	if (qp < 0)
+		return -EINVAL;
 
-	prepare_break_inst(template, slot, major_opcode, kprobe_inst, p);
+	p->ainsn.insn = get_insn_slot();
+	if (!p->ainsn.insn)
+		return -ENOMEM;
+	memcpy(&p->opcode, kprobe_addr, sizeof(kprobe_opcode_t));
+	memcpy(p->ainsn.insn, kprobe_addr, sizeof(kprobe_opcode_t));
+
+	prepare_break_inst(template, slot, major_opcode, kprobe_inst, p, qp);
+
+	prepare_booster(p);
 
 	return 0;
 }
 
 void __kprobes arch_arm_kprobe(struct kprobe *p)
 {
-	unsigned long addr = (unsigned long)p->addr;
-	unsigned long arm_addr = addr & ~0xFULL;
+	unsigned long arm_addr;
+	bundle_t *src, *dest;
+
+	arm_addr = ((unsigned long)p->addr) & ~0xFUL;
+	dest = &((kprobe_opcode_t *)arm_addr)->bundle;
+	src = &p->opcode.bundle;
+
+	flush_icache_range((unsigned long)p->ainsn.insn,
+			   (unsigned long)p->ainsn.insn +
+			   sizeof(kprobe_opcode_t) * MAX_INSN_SIZE);
 
-	memcpy((char *)arm_addr, &p->ainsn.insn.bundle, sizeof(bundle_t));
-	flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t));
+	switch (p->ainsn.slot) {
+		case 0:
+			dest->quad0.slot0 = src->quad0.slot0;
+			break;
+		case 1:
+			dest->quad1.slot1_p1 = src->quad1.slot1_p1;
+			break;
+		case 2:
+			dest->quad1.slot2 = src->quad1.slot2;
+			break;
+	}
+	flush_icache_range(arm_addr, arm_addr + sizeof(kprobe_opcode_t));
 }
 
 void __kprobes arch_disarm_kprobe(struct kprobe *p)
 {
-	unsigned long addr = (unsigned long)p->addr;
-	unsigned long arm_addr = addr & ~0xFULL;
-
-	/* p->opcode contains the original unaltered bundle */
-	memcpy((char *) arm_addr, (char *) &p->opcode.bundle, sizeof(bundle_t));
-	flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t));
+	unsigned long arm_addr;
+	bundle_t *src, *dest;
+
+	arm_addr = ((unsigned long)p->addr) & ~0xFUL;
+	dest = &((kprobe_opcode_t *)arm_addr)->bundle;
+	/* p->ainsn.insn contains the original unaltered kprobe_opcode_t */
+	src = &p->ainsn.insn->bundle;
+	switch (p->ainsn.slot) {
+		case 0:
+			dest->quad0.slot0 = src->quad0.slot0;
+			break;
+		case 1:
+			dest->quad1.slot1_p1 = src->quad1.slot1_p1;
+			break;
+		case 2:
+			dest->quad1.slot2 = src->quad1.slot2;
+			break;
+	}
+	flush_icache_range(arm_addr, arm_addr + sizeof(kprobe_opcode_t));
 }
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
+	if (p->ainsn.insn) {
+		free_insn_slot(p->ainsn.insn,
+			       p->ainsn.inst_flag & INST_FLAG_BOOSTABLE);
+		p->ainsn.insn = NULL;
+	}
 }
-
 /*
  * We are resuming execution after a single step fault, so the pt_regs
  * structure reflects the register state after we executed the instruction
- * located in the kprobe (p->ainsn.insn.bundle).  We still need to adjust
+ * located in the kprobe (p->ainsn.insn->bundle).  We still need to adjust
  * the ip to point back to the original stack address. To set the IP address
  * to original stack address, handle the case where we need to fixup the
  * relative IP address and/or fixup branch register.
  */
 static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
 {
-  	unsigned long bundle_addr = ((unsigned long) (&p->opcode.bundle)) & ~0xFULL;
-  	unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL;
- 	unsigned long template;
- 	int slot = ((unsigned long)p->addr & 0xf);
+	unsigned long bundle_addr = (unsigned long) (&p->ainsn.insn->bundle);
+	unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL;
+	unsigned long template;
+	int slot = ((unsigned long)p->addr & 0xf);
 
-	template = p->opcode.bundle.quad0.template;
+	template = p->ainsn.insn->bundle.quad0.template;
 
- 	if (slot == 1 && bundle_encoding[template][1] == L)
- 		slot = 2;
+	if (slot == 1 && bundle_encoding[template][1] == L)
+		slot = 2;
 
-	if (p->ainsn.inst_flag) {
+	if (p->ainsn.inst_flag & ~INST_FLAG_BOOSTABLE) {
 
 		if (p->ainsn.inst_flag & INST_FLAG_FIX_RELATIVE_IP_ADDR) {
 			/* Fix relative IP address */
- 			regs->cr_iip = (regs->cr_iip - bundle_addr) + resume_addr;
+			regs->cr_iip = (regs->cr_iip - bundle_addr) +
+					resume_addr;
 		}
 
 		if (p->ainsn.inst_flag & INST_FLAG_FIX_BRANCH_REG) {
@@ -534,23 +738,23 @@ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
 	}
 
 	if (slot == 2) {
- 		if (regs->cr_iip == bundle_addr + 0x10) {
- 			regs->cr_iip = resume_addr + 0x10;
- 		}
- 	} else {
- 		if (regs->cr_iip == bundle_addr) {
- 			regs->cr_iip = resume_addr;
- 		}
+		if (regs->cr_iip == bundle_addr + 0x10) {
+			regs->cr_iip = resume_addr + 0x10;
+		}
+	} else {
+		if (regs->cr_iip == bundle_addr) {
+			regs->cr_iip = resume_addr;
+		}
 	}
 
 turn_ss_off:
-  	/* Turn off Single Step bit */
-  	ia64_psr(regs)->ss = 0;
+	/* Turn off Single Step bit */
+	ia64_psr(regs)->ss = 0;
 }
 
 static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs)
 {
-	unsigned long bundle_addr = (unsigned long) &p->opcode.bundle;
+	unsigned long bundle_addr = (unsigned long) &p->ainsn.insn->bundle;
 	unsigned long slot = (unsigned long)p->addr & 0xf;
 
 	/* single step inline if break instruction */
@@ -571,33 +775,12 @@ static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs)
 static int __kprobes is_ia64_break_inst(struct pt_regs *regs)
 {
 	unsigned int slot = ia64_psr(regs)->ri;
-	unsigned int template, major_opcode;
-	unsigned long kprobe_inst;
 	unsigned long *kprobe_addr = (unsigned long *)regs->cr_iip;
 	bundle_t bundle;
 
 	memcpy(&bundle, kprobe_addr, sizeof(bundle_t));
-	template = bundle.quad0.template;
-
-	/* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */
-	if (slot == 1 && bundle_encoding[template][1] == L)
-  		slot++;
-
-	/* Get Kprobe probe instruction at given slot*/
-	get_kprobe_inst(&bundle, slot, &kprobe_inst, &major_opcode);
-
-	/* For break instruction,
-	 * Bits 37:40 Major opcode to be zero
-	 * Bits 27:32 X6 to be zero
-	 * Bits 32:35 X3 to be zero
-	 */
-	if (major_opcode || ((kprobe_inst >> 27) & 0x1FF) ) {
-		/* Not a break instruction */
-		return 0;
-	}
 
-	/* Is a break instruction */
-	return 1;
+	return __is_ia64_break_inst(&bundle, slot);
 }
 
 static int __kprobes pre_kprobes_handler(struct die_args *args)
@@ -606,17 +789,22 @@ static int __kprobes pre_kprobes_handler(struct die_args *args)
 	int ret = 0;
 	struct pt_regs *regs = args->regs;
 	kprobe_opcode_t *addr = (kprobe_opcode_t *)instruction_pointer(regs);
+	struct kprobe_ctlblk *kcb;
 
+	/*
+	 * We don't want to be preempted for the entire
+	 * duration of kprobe processing
+	 */
 	preempt_disable();
+	kcb = get_kprobe_ctlblk();
 
 	/* Handle recursion cases */
 	if (kprobe_running()) {
 		p = get_kprobe(addr);
 		if (p) {
-			if ( (kprobe_status == KPROBE_HIT_SS) &&
+			if ((kcb->kprobe_status == KPROBE_HIT_SS) &&
 	 		     (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)) {
-  				ia64_psr(regs)->ss = 0;
-				unlock_kprobes();
+				ia64_psr(regs)->ss = 0;
 				goto no_kprobe;
 			}
 			/* We have reentered the pre_kprobe_handler(), since
@@ -625,30 +813,35 @@ static int __kprobes pre_kprobes_handler(struct die_args *args)
 			 * just single step on the instruction of the new probe
 			 * without calling any user handlers.
 			 */
-			save_previous_kprobe();
-			set_current_kprobe(p);
-			p->nmissed++;
+			save_previous_kprobe(kcb);
+			set_current_kprobe(p, kcb);
+			kprobes_inc_nmissed_count(p);
 			prepare_ss(p, regs);
-			kprobe_status = KPROBE_REENTER;
+			kcb->kprobe_status = KPROBE_REENTER;
 			return 1;
 		} else if (args->err == __IA64_BREAK_JPROBE) {
 			/*
 			 * jprobe instrumented function just completed
 			 */
-			p = current_kprobe;
+			p = __get_cpu_var(current_kprobe);
 			if (p->break_handler && p->break_handler(p, regs)) {
 				goto ss_probe;
 			}
+		} else if (!is_ia64_break_inst(regs)) {
+			/* The breakpoint instruction was removed by
+			 * another cpu right after we hit, no further
+			 * handling of this interrupt is appropriate
+			 */
+			ret = 1;
+			goto no_kprobe;
 		} else {
 			/* Not our break */
 			goto no_kprobe;
 		}
 	}
 
-	lock_kprobes();
 	p = get_kprobe(addr);
 	if (!p) {
-		unlock_kprobes();
 		if (!is_ia64_break_inst(regs)) {
 			/*
 			 * The breakpoint instruction was removed right
@@ -665,8 +858,8 @@ static int __kprobes pre_kprobes_handler(struct die_args *args)
 		goto no_kprobe;
 	}
 
-	kprobe_status = KPROBE_HIT_ACTIVE;
-	set_current_kprobe(p);
+	set_current_kprobe(p, kcb);
+	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 
 	if (p->pre_handler && p->pre_handler(p, regs))
 		/*
@@ -677,8 +870,21 @@ static int __kprobes pre_kprobes_handler(struct die_args *args)
 		return 1;
 
 ss_probe:
+#if !defined(CONFIG_PREEMPT)
+	if (p->ainsn.inst_flag == INST_FLAG_BOOSTABLE && !p->post_handler) {
+		/* Boost up -- we can execute copied instructions directly */
+		ia64_psr(regs)->ri = p->ainsn.slot;
+		regs->cr_iip = (unsigned long)&p->ainsn.insn->bundle & ~0xFULL;
+		/* turn single stepping off */
+		ia64_psr(regs)->ss = 0;
+
+		reset_current_kprobe();
+		preempt_enable_no_resched();
+		return 1;
+	}
+#endif
 	prepare_ss(p, regs);
-	kprobe_status = KPROBE_HIT_SS;
+	kcb->kprobe_status = KPROBE_HIT_SS;
 	return 1;
 
 no_kprobe:
@@ -688,42 +894,86 @@ no_kprobe:
 
 static int __kprobes post_kprobes_handler(struct pt_regs *regs)
 {
-	if (!kprobe_running())
+	struct kprobe *cur = kprobe_running();
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (!cur)
 		return 0;
 
-	if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) {
-		kprobe_status = KPROBE_HIT_SSDONE;
-		current_kprobe->post_handler(current_kprobe, regs, 0);
+	if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+		kcb->kprobe_status = KPROBE_HIT_SSDONE;
+		cur->post_handler(cur, regs, 0);
 	}
 
-	resume_execution(current_kprobe, regs);
+	resume_execution(cur, regs);
 
 	/*Restore back the original saved kprobes variables and continue. */
-	if (kprobe_status == KPROBE_REENTER) {
-		restore_previous_kprobe();
+	if (kcb->kprobe_status == KPROBE_REENTER) {
+		restore_previous_kprobe(kcb);
 		goto out;
 	}
-
-	unlock_kprobes();
+	reset_current_kprobe();
 
 out:
 	preempt_enable_no_resched();
 	return 1;
 }
 
-static int __kprobes kprobes_fault_handler(struct pt_regs *regs, int trapnr)
+int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 {
-	if (!kprobe_running())
-		return 0;
+	struct kprobe *cur = kprobe_running();
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
-	if (current_kprobe->fault_handler &&
-	    current_kprobe->fault_handler(current_kprobe, regs, trapnr))
-		return 1;
 
-	if (kprobe_status & KPROBE_HIT_SS) {
-		resume_execution(current_kprobe, regs);
-		unlock_kprobes();
+	switch(kcb->kprobe_status) {
+	case KPROBE_HIT_SS:
+	case KPROBE_REENTER:
+		/*
+		 * We are here because the instruction being single
+		 * stepped caused a page fault. We reset the current
+		 * kprobe and the instruction pointer points back to
+		 * the probe address and allow the page fault handler
+		 * to continue as a normal page fault.
+		 */
+		regs->cr_iip = ((unsigned long)cur->addr) & ~0xFULL;
+		ia64_psr(regs)->ri = ((unsigned long)cur->addr) & 0xf;
+		if (kcb->kprobe_status == KPROBE_REENTER)
+			restore_previous_kprobe(kcb);
+		else
+			reset_current_kprobe();
 		preempt_enable_no_resched();
+		break;
+	case KPROBE_HIT_ACTIVE:
+	case KPROBE_HIT_SSDONE:
+		/*
+		 * We increment the nmissed count for accounting,
+		 * we can also use npre/npostfault count for accounting
+		 * these specific fault cases.
+		 */
+		kprobes_inc_nmissed_count(cur);
+
+		/*
+		 * We come here because instructions in the pre/post
+		 * handler caused the page_fault, this could happen
+		 * if handler tries to access user space by
+		 * copy_from_user(), get_user() etc. Let the
+		 * user-specified handler try to fix it first.
+		 */
+		if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+			return 1;
+		/*
+		 * In case the user-specified fault handler returned
+		 * zero, try to fix up.
+		 */
+		if (ia64_done_with_exception(regs))
+			return 1;
+
+		/*
+		 * Let ia64_do_page_fault() fix it.
+		 */
+		break;
+	default:
+		break;
 	}
 
 	return 0;
@@ -733,31 +983,90 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 				       unsigned long val, void *data)
 {
 	struct die_args *args = (struct die_args *)data;
+	int ret = NOTIFY_DONE;
+
+	if (args->regs && user_mode(args->regs))
+		return ret;
+
 	switch(val) {
 	case DIE_BREAK:
-		if (pre_kprobes_handler(args))
-			return NOTIFY_STOP;
+		/* err is break number from ia64_bad_break() */
+		if ((args->err >> 12) == (__IA64_BREAK_KPROBE >> 12)
+			|| args->err == __IA64_BREAK_JPROBE
+			|| args->err == 0)
+			if (pre_kprobes_handler(args))
+				ret = NOTIFY_STOP;
 		break;
-	case DIE_SS:
-		if (post_kprobes_handler(args->regs))
-			return NOTIFY_STOP;
+	case DIE_FAULT:
+		/* err is vector number from ia64_fault() */
+		if (args->err == 36)
+			if (post_kprobes_handler(args->regs))
+				ret = NOTIFY_STOP;
 		break;
-	case DIE_PAGE_FAULT:
-		if (kprobes_fault_handler(args->regs, args->trapnr))
-			return NOTIFY_STOP;
 	default:
 		break;
 	}
-	return NOTIFY_DONE;
+	return ret;
+}
+
+struct param_bsp_cfm {
+	unsigned long ip;
+	unsigned long *bsp;
+	unsigned long cfm;
+};
+
+static void ia64_get_bsp_cfm(struct unw_frame_info *info, void *arg)
+{
+	unsigned long ip;
+	struct param_bsp_cfm *lp = arg;
+
+	do {
+		unw_get_ip(info, &ip);
+		if (ip == 0)
+			break;
+		if (ip == lp->ip) {
+			unw_get_bsp(info, (unsigned long*)&lp->bsp);
+			unw_get_cfm(info, (unsigned long*)&lp->cfm);
+			return;
+		}
+	} while (unw_unwind(info) >= 0);
+	lp->bsp = NULL;
+	lp->cfm = 0;
+	return;
+}
+
+unsigned long arch_deref_entry_point(void *entry)
+{
+	return ((struct fnptr *)entry)->ip;
 }
 
 int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct jprobe *jp = container_of(p, struct jprobe, kp);
-	unsigned long addr = ((struct fnptr *)(jp->entry))->ip;
+	unsigned long addr = arch_deref_entry_point(jp->entry);
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	struct param_bsp_cfm pa;
+	int bytes;
+
+	/*
+	 * Callee owns the argument space and could overwrite it, eg
+	 * tail call optimization. So to be absolutely safe
+	 * we save the argument space before transferring the control
+	 * to instrumented jprobe function which runs in
+	 * the process context
+	 */
+	pa.ip = regs->cr_iip;
+	unw_init_running(ia64_get_bsp_cfm, &pa);
+	bytes = (char *)ia64_rse_skip_regs(pa.bsp, pa.cfm & 0x3f)
+				- (char *)pa.bsp;
+	memcpy( kcb->jprobes_saved_stacked_regs,
+		pa.bsp,
+		bytes );
+	kcb->bsp = pa.bsp;
+	kcb->cfm = pa.cfm;
 
 	/* save architectural state */
-	jprobe_saved_regs = *regs;
+	kcb->jprobe_saved_regs = *regs;
 
 	/* after rfi, execute the jprobe instrumented function */
 	regs->cr_iip = addr & ~0xFULL;
@@ -768,14 +1077,34 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	 * fix the return address to our jprobe_inst_return() function
 	 * in the jprobes.S file
 	 */
- 	regs->b0 = ((struct fnptr *)(jprobe_inst_return))->ip;
+	regs->b0 = ((struct fnptr *)(jprobe_inst_return))->ip;
 
 	return 1;
 }
 
+/* ia64 does not need this */
+void __kprobes jprobe_return(void)
+{
+}
+
 int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
-	*regs = jprobe_saved_regs;
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	int bytes;
+
+	/* restoring architectural state */
+	*regs = kcb->jprobe_saved_regs;
+
+	/* restoring the original argument space */
+	flush_register_stack();
+	bytes = (char *)ia64_rse_skip_regs(kcb->bsp, kcb->cfm & 0x3f)
+				- (char *)kcb->bsp;
+	memcpy( kcb->bsp,
+		kcb->jprobes_saved_stacked_regs,
+		bytes );
+	invalidate_stacked_regs();
+
+	preempt_enable_no_resched();
 	return 1;
 }
 
@@ -789,3 +1118,12 @@ int __init arch_init_kprobes(void)
 		(kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip;
 	return register_kprobe(&trampoline_p);
 }
+
+int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+{
+	if (p->addr ==
+		(kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip)
+		return 1;
+
+	return 0;
+}
diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
new file mode 100644
index 00000000000..5151a649c96
--- /dev/null
+++ b/arch/ia64/kernel/machine_kexec.c
@@ -0,0 +1,170 @@
+/*
+ * arch/ia64/kernel/machine_kexec.c
+ *
+ * Handle transition of Linux booting another kernel
+ * Copyright (C) 2005 Hewlett-Packard Development Comapny, L.P.
+ * Copyright (C) 2005 Khalid Aziz <khalid.aziz@hp.com>
+ * Copyright (C) 2006 Intel Corp, Zou Nan hai <nanhai.zou@intel.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/cpu.h>
+#include <linux/irq.h>
+#include <linux/efi.h>
+#include <linux/numa.h>
+#include <linux/mmzone.h>
+
+#include <asm/numa.h>
+#include <asm/mmu_context.h>
+#include <asm/setup.h>
+#include <asm/delay.h>
+#include <asm/meminit.h>
+#include <asm/processor.h>
+#include <asm/sal.h>
+#include <asm/mca.h>
+
+typedef void (*relocate_new_kernel_t)(
+					unsigned long indirection_page,
+					unsigned long start_address,
+					struct ia64_boot_param *boot_param,
+					unsigned long pal_addr) __noreturn;
+
+struct kimage *ia64_kimage;
+
+struct resource efi_memmap_res = {
+        .name  = "EFI Memory Map",
+        .start = 0,
+        .end   = 0,
+        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+struct resource boot_param_res = {
+        .name  = "Boot parameter",
+        .start = 0,
+        .end   = 0,
+        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+
+/*
+ * Do what every setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+	void *control_code_buffer;
+	const unsigned long *func;
+
+	func = (unsigned long *)&relocate_new_kernel;
+	/* Pre-load control code buffer to minimize work in kexec path */
+	control_code_buffer = page_address(image->control_code_page);
+	memcpy((void *)control_code_buffer, (const void *)func[0],
+			relocate_new_kernel_size);
+	flush_icache_range((unsigned long)control_code_buffer,
+			(unsigned long)control_code_buffer + relocate_new_kernel_size);
+	ia64_kimage = image;
+
+	return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+static void ia64_machine_kexec(struct unw_frame_info *info, void *arg)
+{
+	struct kimage *image = arg;
+	relocate_new_kernel_t rnk;
+	void *pal_addr = efi_get_pal_addr();
+	unsigned long code_addr;
+	int ii;
+	u64 fp, gp;
+	ia64_fptr_t *init_handler = (ia64_fptr_t *)ia64_os_init_on_kdump;
+
+	BUG_ON(!image);
+	code_addr = (unsigned long)page_address(image->control_code_page);
+	if (image->type == KEXEC_TYPE_CRASH) {
+		crash_save_this_cpu();
+		current->thread.ksp = (__u64)info->sw - 16;
+
+		/* Register noop init handler */
+		fp = ia64_tpa(init_handler->fp);
+		gp = ia64_tpa(ia64_getreg(_IA64_REG_GP));
+		ia64_sal_set_vectors(SAL_VECTOR_OS_INIT, fp, gp, 0, fp, gp, 0);
+	} else {
+		/* Unregister init handlers of current kernel */
+		ia64_sal_set_vectors(SAL_VECTOR_OS_INIT, 0, 0, 0, 0, 0, 0);
+	}
+
+	/* Unregister mca handler - No more recovery on current kernel */
+	ia64_sal_set_vectors(SAL_VECTOR_OS_MCA, 0, 0, 0, 0, 0, 0);
+
+	/* Interrupts aren't acceptable while we reboot */
+	local_irq_disable();
+
+	/* Mask CMC and Performance Monitor interrupts */
+	ia64_setreg(_IA64_REG_CR_PMV, 1 << 16);
+	ia64_setreg(_IA64_REG_CR_CMCV, 1 << 16);
+
+	/* Mask ITV and Local Redirect Registers */
+	ia64_set_itv(1 << 16);
+	ia64_set_lrr0(1 << 16);
+	ia64_set_lrr1(1 << 16);
+
+	/* terminate possible nested in-service interrupts */
+	for (ii = 0; ii < 16; ii++)
+		ia64_eoi();
+
+	/* unmask TPR and clear any pending interrupts */
+	ia64_setreg(_IA64_REG_CR_TPR, 0);
+	ia64_srlz_d();
+	while (ia64_get_ivr() != IA64_SPURIOUS_INT_VECTOR)
+		ia64_eoi();
+	platform_kernel_launch_event();
+	rnk = (relocate_new_kernel_t)&code_addr;
+	(*rnk)(image->head, image->start, ia64_boot_param,
+		     GRANULEROUNDDOWN((unsigned long) pal_addr));
+	BUG();
+}
+
+void machine_kexec(struct kimage *image)
+{
+	BUG_ON(!image);
+	unw_init_running(ia64_machine_kexec, image);
+	for(;;);
+}
+
+void arch_crash_save_vmcoreinfo(void)
+{
+#if defined(CONFIG_DISCONTIGMEM) || defined(CONFIG_SPARSEMEM)
+	VMCOREINFO_SYMBOL(pgdat_list);
+	VMCOREINFO_LENGTH(pgdat_list, MAX_NUMNODES);
+#endif
+#ifdef CONFIG_NUMA
+	VMCOREINFO_SYMBOL(node_memblk);
+	VMCOREINFO_LENGTH(node_memblk, NR_NODE_MEMBLKS);
+	VMCOREINFO_STRUCT_SIZE(node_memblk_s);
+	VMCOREINFO_OFFSET(node_memblk_s, start_paddr);
+	VMCOREINFO_OFFSET(node_memblk_s, size);
+#endif
+#ifdef CONFIG_PGTABLE_3
+	VMCOREINFO_CONFIG(PGTABLE_3);
+#elif defined(CONFIG_PGTABLE_4)
+	VMCOREINFO_CONFIG(PGTABLE_4);
+#endif
+}
+
+unsigned long paddr_vmcoreinfo_note(void)
+{
+	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
diff --git a/arch/ia64/kernel/machvec.c b/arch/ia64/kernel/machvec.c
index c3a04ee7f4f..f5a1e5246b3 100644
--- a/arch/ia64/kernel/machvec.c
+++ b/arch/ia64/kernel/machvec.c
@@ -1,8 +1,6 @@
-#include <linux/config.h>
 #include <linux/module.h>
-
+#include <linux/dma-mapping.h>
 #include <asm/machvec.h>
-#include <asm/system.h>
 
 #ifdef CONFIG_IA64_GENERIC
 
@@ -14,7 +12,7 @@
 struct ia64_machine_vector ia64_mv;
 EXPORT_SYMBOL(ia64_mv);
 
-static struct ia64_machine_vector *
+static struct ia64_machine_vector * __init
 lookup_machvec (const char *name)
 {
 	extern struct ia64_machine_vector machvec_start[];
@@ -28,19 +26,39 @@ lookup_machvec (const char *name)
 	return 0;
 }
 
-void
+void __init
 machvec_init (const char *name)
 {
 	struct ia64_machine_vector *mv;
 
+	if (!name)
+		name = acpi_get_sysname();
 	mv = lookup_machvec(name);
-	if (!mv) {
-		panic("generic kernel failed to find machine vector for platform %s!", name);
-	}
+	if (!mv)
+		panic("generic kernel failed to find machine vector for"
+		      " platform %s!", name);
+
 	ia64_mv = *mv;
 	printk(KERN_INFO "booting generic kernel on platform %s\n", name);
 }
 
+void __init
+machvec_init_from_cmdline(const char *cmdline)
+{
+	char str[64];
+	const char *start;
+	char *end;
+
+	if (! (start = strstr(cmdline, "machvec=")) )
+		return machvec_init(NULL);
+
+	strlcpy(str, start + strlen("machvec="), sizeof(str));
+	if ( (end = strchr(str, ' ')) )
+		*end = '\0';
+
+	return machvec_init(str);
+}
+
 #endif /* CONFIG_IA64_GENERIC */
 
 void
@@ -50,20 +68,22 @@ machvec_setup (char **arg)
 EXPORT_SYMBOL(machvec_setup);
 
 void
-machvec_timer_interrupt (int irq, void *dev_id, struct pt_regs *regs)
+machvec_timer_interrupt (int irq, void *dev_id)
 {
 }
 EXPORT_SYMBOL(machvec_timer_interrupt);
 
 void
-machvec_dma_sync_single (struct device *hwdev, dma_addr_t dma_handle, size_t size, int dir)
+machvec_dma_sync_single(struct device *hwdev, dma_addr_t dma_handle, size_t size,
+			enum dma_data_direction dir)
 {
 	mb();
 }
 EXPORT_SYMBOL(machvec_dma_sync_single);
 
 void
-machvec_dma_sync_sg (struct device *hwdev, struct scatterlist *sg, int n, int dir)
+machvec_dma_sync_sg(struct device *hwdev, struct scatterlist *sg, int n,
+		    enum dma_data_direction dir)
 {
 	mb();
 }
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 52c47da1724..db7b36bb068 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -2,64 +2,79 @@
  * File:	mca.c
  * Purpose:	Generic MCA handling layer
  *
- * Updated for latest kernel
  * Copyright (C) 2003 Hewlett-Packard Co
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  *
  * Copyright (C) 2002 Dell Inc.
- * Copyright (C) Matt Domsch (Matt_Domsch@dell.com)
+ * Copyright (C) Matt Domsch <Matt_Domsch@dell.com>
  *
  * Copyright (C) 2002 Intel
- * Copyright (C) Jenna Hall (jenna.s.hall@intel.com)
+ * Copyright (C) Jenna Hall <jenna.s.hall@intel.com>
  *
  * Copyright (C) 2001 Intel
- * Copyright (C) Fred Lewis (frederick.v.lewis@intel.com)
+ * Copyright (C) Fred Lewis <frederick.v.lewis@intel.com>
  *
  * Copyright (C) 2000 Intel
- * Copyright (C) Chuck Fleckenstein (cfleck@co.intel.com)
+ * Copyright (C) Chuck Fleckenstein <cfleck@co.intel.com>
  *
- * Copyright (C) 1999, 2004 Silicon Graphics, Inc.
- * Copyright (C) Vijay Chander(vijay@engr.sgi.com)
+ * Copyright (C) 1999, 2004-2008 Silicon Graphics, Inc.
+ * Copyright (C) Vijay Chander <vijay@engr.sgi.com>
  *
- * 03/04/15 D. Mosberger Added INIT backtrace support.
- * 02/03/25 M. Domsch	GUID cleanups
+ * Copyright (C) 2006 FUJITSU LIMITED
+ * Copyright (C) Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
  *
- * 02/01/04 J. Hall	Aligned MCA stack to 16 bytes, added platform vs. CPU
- *			error flag, set SAL default return values, changed
- *			error record structure to linked list, added init call
- *			to sal_get_state_info_size().
+ * 2000-03-29 Chuck Fleckenstein <cfleck@co.intel.com>
+ *	      Fixed PAL/SAL update issues, began MCA bug fixes, logging issues,
+ *	      added min save state dump, added INIT handler.
  *
- * 01/01/03 F. Lewis    Added setup of CMCI and CPEI IRQs, logging of corrected
- *                      platform errors, completed code for logging of
- *                      corrected & uncorrected machine check errors, and
- *                      updated for conformance with Nov. 2000 revision of the
- *                      SAL 3.0 spec.
- * 00/03/29 C. Fleckenstein  Fixed PAL/SAL update issues, began MCA bug fixes, logging issues,
- *                           added min save state dump, added INIT handler.
+ * 2001-01-03 Fred Lewis <frederick.v.lewis@intel.com>
+ *	      Added setup of CMCI and CPEI IRQs, logging of corrected platform
+ *	      errors, completed code for logging of corrected & uncorrected
+ *	      machine check errors, and updated for conformance with Nov. 2000
+ *	      revision of the SAL 3.0 spec.
+ *
+ * 2002-01-04 Jenna Hall <jenna.s.hall@intel.com>
+ *	      Aligned MCA stack to 16 bytes, added platform vs. CPU error flag,
+ *	      set SAL default return values, changed error record structure to
+ *	      linked list, added init call to sal_get_state_info_size().
+ *
+ * 2002-03-25 Matt Domsch <Matt_Domsch@dell.com>
+ *	      GUID cleanups.
+ *
+ * 2003-04-15 David Mosberger-Tang <davidm@hpl.hp.com>
+ *	      Added INIT backtrace support.
  *
  * 2003-12-08 Keith Owens <kaos@sgi.com>
- *            smp_call_function() must not be called from interrupt context (can
- *            deadlock on tasklist_lock).  Use keventd to call smp_call_function().
+ *	      smp_call_function() must not be called from interrupt context
+ *	      (can deadlock on tasklist_lock).
+ *	      Use keventd to call smp_call_function().
  *
  * 2004-02-01 Keith Owens <kaos@sgi.com>
- *            Avoid deadlock when using printk() for MCA and INIT records.
- *            Delete all record printing code, moved to salinfo_decode in user space.
- *            Mark variables and functions static where possible.
- *            Delete dead variables and functions.
- *            Reorder to remove the need for forward declarations and to consolidate
- *            related code.
+ *	      Avoid deadlock when using printk() for MCA and INIT records.
+ *	      Delete all record printing code, moved to salinfo_decode in user
+ *	      space.  Mark variables and functions static where possible.
+ *	      Delete dead variables and functions.  Reorder to remove the need
+ *	      for forward declarations and to consolidate related code.
  *
  * 2005-08-12 Keith Owens <kaos@sgi.com>
- *	      Convert MCA/INIT handlers to use per event stacks and SAL/OS state.
+ *	      Convert MCA/INIT handlers to use per event stacks and SAL/OS
+ *	      state.
+ *
+ * 2005-10-07 Keith Owens <kaos@sgi.com>
+ *	      Add notify_die() hooks.
+ *
+ * 2006-09-15 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+ *	      Add printing support for MCA/INIT.
+ *
+ * 2007-04-27 Russ Anderson <rja@sgi.com>
+ *	      Support multiple cpus going through OS_MCA in the same event.
  */
-#include <linux/config.h>
+#include <linux/jiffies.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
-#include <linux/kallsyms.h>
-#include <linux/smp_lock.h>
 #include <linux/bootmem.h>
 #include <linux/acpi.h>
 #include <linux/timer.h>
@@ -67,19 +82,25 @@
 #include <linux/kernel.h>
 #include <linux/smp.h>
 #include <linux/workqueue.h>
+#include <linux/cpumask.h>
+#include <linux/kdebug.h>
+#include <linux/cpu.h>
+#include <linux/gfp.h>
 
 #include <asm/delay.h>
 #include <asm/machvec.h>
 #include <asm/meminit.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
-#include <asm/system.h>
 #include <asm/sal.h>
 #include <asm/mca.h>
+#include <asm/kexec.h>
 
 #include <asm/irq.h>
 #include <asm/hw_irq.h>
+#include <asm/tlb.h>
 
+#include "mca_drv.h"
 #include "entry.h"
 
 #if defined(IA64_MCA_DEBUG_INFO)
@@ -88,12 +109,26 @@
 # define IA64_MCA_DEBUG(fmt...)
 #endif
 
+#define NOTIFY_INIT(event, regs, arg, spin)				\
+do {									\
+	if ((notify_die((event), "INIT", (regs), (arg), 0, 0)		\
+			== NOTIFY_STOP) && ((spin) == 1))		\
+		ia64_mca_spin(__func__);				\
+} while (0)
+
+#define NOTIFY_MCA(event, regs, arg, spin)				\
+do {									\
+	if ((notify_die((event), "MCA", (regs), (arg), 0, 0)		\
+			== NOTIFY_STOP) && ((spin) == 1))		\
+		ia64_mca_spin(__func__);				\
+} while (0)
+
 /* Used by mca_asm.S */
-u32				ia64_mca_serialize;
 DEFINE_PER_CPU(u64, ia64_mca_data); /* == __per_cpu_mca[smp_processor_id()] */
 DEFINE_PER_CPU(u64, ia64_mca_per_cpu_pte); /* PTE to map per-CPU area */
 DEFINE_PER_CPU(u64, ia64_mca_pal_pte);	    /* PTE to map PAL code */
 DEFINE_PER_CPU(u64, ia64_mca_pal_base);    /* vaddr PAL code granule */
+DEFINE_PER_CPU(u64, ia64_mca_tr_reload);   /* Flag for TR reload */
 
 unsigned long __per_cpu_mca[NR_CPUS];
 
@@ -111,7 +146,9 @@ static ia64_mc_info_t		ia64_mc_info;
 #define CPE_HISTORY_LENGTH    5
 #define CMC_HISTORY_LENGTH    5
 
+#ifdef CONFIG_ACPI
 static struct timer_list cpe_poll_timer;
+#endif
 static struct timer_list cmc_poll_timer;
 /*
  * This variable tells whether we are currently in polling mode.
@@ -130,9 +167,181 @@ static int cpe_poll_enabled = 1;
 
 extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);
 
-static int mca_init;
+static int mca_init __initdata;
+
+/*
+ * limited & delayed printing support for MCA/INIT handler
+ */
+
+#define mprintk(fmt...) ia64_mca_printk(fmt)
+
+#define MLOGBUF_SIZE (512+256*NR_CPUS)
+#define MLOGBUF_MSGMAX 256
+static char mlogbuf[MLOGBUF_SIZE];
+static DEFINE_SPINLOCK(mlogbuf_wlock);	/* mca context only */
+static DEFINE_SPINLOCK(mlogbuf_rlock);	/* normal context only */
+static unsigned long mlogbuf_start;
+static unsigned long mlogbuf_end;
+static unsigned int mlogbuf_finished = 0;
+static unsigned long mlogbuf_timestamp = 0;
+
+static int loglevel_save = -1;
+#define BREAK_LOGLEVEL(__console_loglevel)		\
+	oops_in_progress = 1;				\
+	if (loglevel_save < 0)				\
+		loglevel_save = __console_loglevel;	\
+	__console_loglevel = 15;
+
+#define RESTORE_LOGLEVEL(__console_loglevel)		\
+	if (loglevel_save >= 0) {			\
+		__console_loglevel = loglevel_save;	\
+		loglevel_save = -1;			\
+	}						\
+	mlogbuf_finished = 0;				\
+	oops_in_progress = 0;
+
+/*
+ * Push messages into buffer, print them later if not urgent.
+ */
+void ia64_mca_printk(const char *fmt, ...)
+{
+	va_list args;
+	int printed_len;
+	char temp_buf[MLOGBUF_MSGMAX];
+	char *p;
+
+	va_start(args, fmt);
+	printed_len = vscnprintf(temp_buf, sizeof(temp_buf), fmt, args);
+	va_end(args);
+
+	/* Copy the output into mlogbuf */
+	if (oops_in_progress) {
+		/* mlogbuf was abandoned, use printk directly instead. */
+		printk("%s", temp_buf);
+	} else {
+		spin_lock(&mlogbuf_wlock);
+		for (p = temp_buf; *p; p++) {
+			unsigned long next = (mlogbuf_end + 1) % MLOGBUF_SIZE;
+			if (next != mlogbuf_start) {
+				mlogbuf[mlogbuf_end] = *p;
+				mlogbuf_end = next;
+			} else {
+				/* buffer full */
+				break;
+			}
+		}
+		mlogbuf[mlogbuf_end] = '\0';
+		spin_unlock(&mlogbuf_wlock);
+	}
+}
+EXPORT_SYMBOL(ia64_mca_printk);
 
 /*
+ * Print buffered messages.
+ *  NOTE: call this after returning normal context. (ex. from salinfod)
+ */
+void ia64_mlogbuf_dump(void)
+{
+	char temp_buf[MLOGBUF_MSGMAX];
+	char *p;
+	unsigned long index;
+	unsigned long flags;
+	unsigned int printed_len;
+
+	/* Get output from mlogbuf */
+	while (mlogbuf_start != mlogbuf_end) {
+		temp_buf[0] = '\0';
+		p = temp_buf;
+		printed_len = 0;
+
+		spin_lock_irqsave(&mlogbuf_rlock, flags);
+
+		index = mlogbuf_start;
+		while (index != mlogbuf_end) {
+			*p = mlogbuf[index];
+			index = (index + 1) % MLOGBUF_SIZE;
+			if (!*p)
+				break;
+			p++;
+			if (++printed_len >= MLOGBUF_MSGMAX - 1)
+				break;
+		}
+		*p = '\0';
+		if (temp_buf[0])
+			printk("%s", temp_buf);
+		mlogbuf_start = index;
+
+		mlogbuf_timestamp = 0;
+		spin_unlock_irqrestore(&mlogbuf_rlock, flags);
+	}
+}
+EXPORT_SYMBOL(ia64_mlogbuf_dump);
+
+/*
+ * Call this if system is going to down or if immediate flushing messages to
+ * console is required. (ex. recovery was failed, crash dump is going to be
+ * invoked, long-wait rendezvous etc.)
+ *  NOTE: this should be called from monarch.
+ */
+static void ia64_mlogbuf_finish(int wait)
+{
+	BREAK_LOGLEVEL(console_loglevel);
+
+	spin_lock_init(&mlogbuf_rlock);
+	ia64_mlogbuf_dump();
+	printk(KERN_EMERG "mlogbuf_finish: printing switched to urgent mode, "
+		"MCA/INIT might be dodgy or fail.\n");
+
+	if (!wait)
+		return;
+
+	/* wait for console */
+	printk("Delaying for 5 seconds...\n");
+	udelay(5*1000000);
+
+	mlogbuf_finished = 1;
+}
+
+/*
+ * Print buffered messages from INIT context.
+ */
+static void ia64_mlogbuf_dump_from_init(void)
+{
+	if (mlogbuf_finished)
+		return;
+
+	if (mlogbuf_timestamp &&
+			time_before(jiffies, mlogbuf_timestamp + 30 * HZ)) {
+		printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT "
+			" and the system seems to be messed up.\n");
+		ia64_mlogbuf_finish(0);
+		return;
+	}
+
+	if (!spin_trylock(&mlogbuf_rlock)) {
+		printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT. "
+			"Generated messages other than stack dump will be "
+			"buffered to mlogbuf and will be printed later.\n");
+		printk(KERN_ERR "INIT: If messages would not printed after "
+			"this INIT, wait 30sec and assert INIT again.\n");
+		if (!mlogbuf_timestamp)
+			mlogbuf_timestamp = jiffies;
+		return;
+	}
+	spin_unlock(&mlogbuf_rlock);
+	ia64_mlogbuf_dump();
+}
+
+static void inline
+ia64_mca_spin(const char *func)
+{
+	if (monarch_cpu == smp_processor_id())
+		ia64_mlogbuf_finish(0);
+	mprintk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func);
+	while (1)
+		cpu_relax();
+}
+/*
  * IA64_MCA log support
  */
 #define IA64_MAX_LOGS		2	/* Double-buffering for nested MCAs */
@@ -173,7 +382,7 @@ static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES];
  * Inputs   :   info_type   (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE})
  * Outputs	:	None
  */
-static void
+static void __init
 ia64_log_init(int sal_info_type)
 {
 	u64	max_size = 0;
@@ -209,7 +418,7 @@ ia64_log_get(int sal_info_type, u8 **buffer, int irq_safe)
 {
 	sal_log_record_header_t     *log_buffer;
 	u64                         total_len = 0;
-	int                         s;
+	unsigned long               s;
 
 	IA64_LOG_LOCK(sal_info_type);
 
@@ -222,8 +431,8 @@ ia64_log_get(int sal_info_type, u8 **buffer, int irq_safe)
 		IA64_LOG_INDEX_INC(sal_info_type);
 		IA64_LOG_UNLOCK(sal_info_type);
 		if (irq_safe) {
-			IA64_MCA_DEBUG("%s: SAL error record type %d retrieved. "
-				       "Record length = %ld\n", __FUNCTION__, sal_info_type, total_len);
+			IA64_MCA_DEBUG("%s: SAL error record type %d retrieved. Record length = %ld\n",
+				       __func__, sal_info_type, total_len);
 		}
 		*buffer = (u8 *) log_buffer;
 		return total_len;
@@ -271,30 +480,67 @@ ia64_mca_log_sal_error_record(int sal_info_type)
 }
 
 /*
- * platform dependent error handling
+ * search_mca_table
+ *  See if the MCA surfaced in an instruction range
+ *  that has been tagged as recoverable.
+ *
+ *  Inputs
+ *	first	First address range to check
+ *	last	Last address range to check
+ *	ip	Instruction pointer, address we are looking for
+ *
+ * Return value:
+ *      1 on Success (in the table)/ 0 on Failure (not in the  table)
  */
-#ifndef PLATFORM_MCA_HANDLERS
+int
+search_mca_table (const struct mca_table_entry *first,
+                const struct mca_table_entry *last,
+                unsigned long ip)
+{
+        const struct mca_table_entry *curr;
+        u64 curr_start, curr_end;
+
+        curr = first;
+        while (curr <= last) {
+                curr_start = (u64) &curr->start_addr + curr->start_addr;
+                curr_end = (u64) &curr->end_addr + curr->end_addr;
+
+                if ((ip >= curr_start) && (ip <= curr_end)) {
+                        return 1;
+                }
+                curr++;
+        }
+        return 0;
+}
+
+/* Given an address, look for it in the mca tables. */
+int mca_recover_range(unsigned long addr)
+{
+	extern struct mca_table_entry __start___mca_table[];
+	extern struct mca_table_entry __stop___mca_table[];
+
+	return search_mca_table(__start___mca_table, __stop___mca_table-1, addr);
+}
+EXPORT_SYMBOL_GPL(mca_recover_range);
 
 #ifdef CONFIG_ACPI
 
 int cpe_vector = -1;
+int ia64_cpe_irq = -1;
 
 static irqreturn_t
-ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs)
+ia64_mca_cpe_int_handler (int cpe_irq, void *arg)
 {
 	static unsigned long	cpe_history[CPE_HISTORY_LENGTH];
 	static int		index;
 	static DEFINE_SPINLOCK(cpe_history_lock);
 
 	IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
-		       __FUNCTION__, cpe_irq, smp_processor_id());
+		       __func__, cpe_irq, smp_processor_id());
 
 	/* SAL spec states this should run w/ interrupts enabled */
 	local_irq_enable();
 
-	/* Get the CPE error record and log it */
-	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
-
 	spin_lock(&cpe_history_lock);
 	if (!cpe_poll_enabled && cpe_vector >= 0) {
 
@@ -323,7 +569,7 @@ ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs)
 			mod_timer(&cpe_poll_timer, jiffies + MIN_CPE_POLL_INTERVAL);
 
 			/* lock already released, get out now */
-			return IRQ_HANDLED;
+			goto out;
 		} else {
 			cpe_history[index++] = now;
 			if (index == CPE_HISTORY_LENGTH)
@@ -331,6 +577,12 @@ ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs)
 		}
 	}
 	spin_unlock(&cpe_history_lock);
+out:
+	/* Get the CPE error record and log it */
+	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
+
+	local_irq_disable();
+
 	return IRQ_HANDLED;
 }
 
@@ -348,7 +600,7 @@ ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs)
  *  Outputs
  *      None
  */
-static void
+void
 ia64_mca_register_cpev (int cpev)
 {
 	/* Register the CPE interrupt vector with SAL */
@@ -362,12 +614,10 @@ ia64_mca_register_cpev (int cpev)
 	}
 
 	IA64_MCA_DEBUG("%s: corrected platform error "
-		       "vector %#x registered\n", __FUNCTION__, cpev);
+		       "vector %#x registered\n", __func__, cpev);
 }
 #endif /* CONFIG_ACPI */
 
-#endif /* PLATFORM_MCA_HANDLERS */
-
 /*
  * ia64_mca_cmc_vector_setup
  *
@@ -391,12 +641,11 @@ ia64_mca_cmc_vector_setup (void)
 	cmcv.cmcv_vector	= IA64_CMC_VECTOR;
 	ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval);
 
-	IA64_MCA_DEBUG("%s: CPU %d corrected "
-		       "machine check vector %#x registered.\n",
-		       __FUNCTION__, smp_processor_id(), IA64_CMC_VECTOR);
+	IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x registered.\n",
+		       __func__, smp_processor_id(), IA64_CMC_VECTOR);
 
 	IA64_MCA_DEBUG("%s: CPU %d CMCV = %#016lx\n",
-		       __FUNCTION__, smp_processor_id(), ia64_getreg(_IA64_REG_CR_CMCV));
+		       __func__, smp_processor_id(), ia64_getreg(_IA64_REG_CR_CMCV));
 }
 
 /*
@@ -421,9 +670,8 @@ ia64_mca_cmc_vector_disable (void *dummy)
 	cmcv.cmcv_mask = 1; /* Mask/disable interrupt */
 	ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval);
 
-	IA64_MCA_DEBUG("%s: CPU %d corrected "
-		       "machine check vector %#x disabled.\n",
-		       __FUNCTION__, smp_processor_id(), cmcv.cmcv_vector);
+	IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x disabled.\n",
+		       __func__, smp_processor_id(), cmcv.cmcv_vector);
 }
 
 /*
@@ -448,9 +696,8 @@ ia64_mca_cmc_vector_enable (void *dummy)
 	cmcv.cmcv_mask = 0; /* Unmask/enable interrupt */
 	ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval);
 
-	IA64_MCA_DEBUG("%s: CPU %d corrected "
-		       "machine check vector %#x enabled.\n",
-		       __FUNCTION__, smp_processor_id(), cmcv.cmcv_vector);
+	IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x enabled.\n",
+		       __func__, smp_processor_id(), cmcv.cmcv_vector);
 }
 
 /*
@@ -460,9 +707,9 @@ ia64_mca_cmc_vector_enable (void *dummy)
  * disable the cmc interrupt vector.
  */
 static void
-ia64_mca_cmc_vector_disable_keventd(void *unused)
+ia64_mca_cmc_vector_disable_keventd(struct work_struct *unused)
 {
-	on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 1, 0);
+	on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 0);
 }
 
 /*
@@ -472,16 +719,15 @@ ia64_mca_cmc_vector_disable_keventd(void *unused)
  * enable the cmc interrupt vector.
  */
 static void
-ia64_mca_cmc_vector_enable_keventd(void *unused)
+ia64_mca_cmc_vector_enable_keventd(struct work_struct *unused)
 {
-	on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 1, 0);
+	on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 0);
 }
 
 /*
  * ia64_mca_wakeup
  *
- *	Send an inter-cpu interrupt to wake-up a particular cpu
- *	and mark that cpu to be out of rendez.
+ *	Send an inter-cpu interrupt to wake-up a particular cpu.
  *
  *  Inputs  :   cpuid
  *  Outputs :   None
@@ -490,14 +736,12 @@ static void
 ia64_mca_wakeup(int cpu)
 {
 	platform_send_ipi(cpu, IA64_MCA_WAKEUP_VECTOR, IA64_IPI_DM_INT, 0);
-	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
-
 }
 
 /*
  * ia64_mca_wakeup_all
  *
- *	Wakeup all the cpus which have rendez'ed previously.
+ *	Wakeup all the slave cpus which have rendez'ed previously.
  *
  *  Inputs  :   None
  *  Outputs :   None
@@ -520,30 +764,42 @@ ia64_mca_wakeup_all(void)
  *
  *	This is handler used to put slave processors into spinloop
  *	while the monarch processor does the mca handling and later
- *	wake each slave up once the monarch is done.
+ *	wake each slave up once the monarch is done.  The state
+ *	IA64_MCA_RENDEZ_CHECKIN_DONE indicates the cpu is rendez'ed
+ *	in SAL.  The state IA64_MCA_RENDEZ_CHECKIN_NOTDONE indicates
+ *	the cpu has come out of OS rendezvous.
  *
  *  Inputs  :   None
  *  Outputs :   None
  */
 static irqreturn_t
-ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *ptregs)
+ia64_mca_rendez_int_handler(int rendez_irq, void *arg)
 {
 	unsigned long flags;
 	int cpu = smp_processor_id();
+	struct ia64_mca_notify_die nd =
+		{ .sos = NULL, .monarch_cpu = &monarch_cpu };
 
 	/* Mask all interrupts */
 	local_irq_save(flags);
 
+	NOTIFY_MCA(DIE_MCA_RENDZVOUS_ENTER, get_irq_regs(), (long)&nd, 1);
+
 	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_DONE;
 	/* Register with the SAL monarch that the slave has
 	 * reached SAL
 	 */
 	ia64_sal_mc_rendez();
 
+	NOTIFY_MCA(DIE_MCA_RENDZVOUS_PROCESS, get_irq_regs(), (long)&nd, 1);
+
 	/* Wait for the monarch cpu to exit. */
 	while (monarch_cpu != -1)
 	       cpu_relax();	/* spin until monarch leaves */
 
+	NOTIFY_MCA(DIE_MCA_RENDZVOUS_LEAVE, get_irq_regs(), (long)&nd, 1);
+
+	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
 	/* Enable all interrupts */
 	local_irq_restore(flags);
 	return IRQ_HANDLED;
@@ -560,12 +816,11 @@ ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *ptregs)
  *
  *  Inputs  :   wakeup_irq  (Wakeup-interrupt bit)
  *	arg		(Interrupt handler specific argument)
- *	ptregs		(Exception frame at the time of the interrupt)
  *  Outputs :   None
  *
  */
 static irqreturn_t
-ia64_mca_wakeup_int_handler(int wakeup_irq, void *arg, struct pt_regs *ptregs)
+ia64_mca_wakeup_int_handler(int wakeup_irq, void *arg)
 {
 	return IRQ_HANDLED;
 }
@@ -597,7 +852,7 @@ EXPORT_SYMBOL(ia64_unreg_MCA_extension);
 
 
 static inline void
-copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat)
+copy_reg(const u64 *fr, u64 fnat, unsigned long *tr, unsigned long *tnat)
 {
 	u64 fslot, tslot, nat;
 	*tr = *fr;
@@ -608,6 +863,91 @@ copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat)
 	*tnat |= (nat << tslot);
 }
 
+/* Change the comm field on the MCA/INT task to include the pid that
+ * was interrupted, it makes for easier debugging.  If that pid was 0
+ * (swapper or nested MCA/INIT) then use the start of the previous comm
+ * field suffixed with its cpu.
+ */
+
+static void
+ia64_mca_modify_comm(const struct task_struct *previous_current)
+{
+	char *p, comm[sizeof(current->comm)];
+	if (previous_current->pid)
+		snprintf(comm, sizeof(comm), "%s %d",
+			current->comm, previous_current->pid);
+	else {
+		int l;
+		if ((p = strchr(previous_current->comm, ' ')))
+			l = p - previous_current->comm;
+		else
+			l = strlen(previous_current->comm);
+		snprintf(comm, sizeof(comm), "%s %*s %d",
+			current->comm, l, previous_current->comm,
+			task_thread_info(previous_current)->cpu);
+	}
+	memcpy(current->comm, comm, sizeof(current->comm));
+}
+
+static void
+finish_pt_regs(struct pt_regs *regs, struct ia64_sal_os_state *sos,
+		unsigned long *nat)
+{
+	const pal_min_state_area_t *ms = sos->pal_min_state;
+	const u64 *bank;
+
+	/* If ipsr.ic then use pmsa_{iip,ipsr,ifs}, else use
+	 * pmsa_{xip,xpsr,xfs}
+	 */
+	if (ia64_psr(regs)->ic) {
+		regs->cr_iip = ms->pmsa_iip;
+		regs->cr_ipsr = ms->pmsa_ipsr;
+		regs->cr_ifs = ms->pmsa_ifs;
+	} else {
+		regs->cr_iip = ms->pmsa_xip;
+		regs->cr_ipsr = ms->pmsa_xpsr;
+		regs->cr_ifs = ms->pmsa_xfs;
+
+		sos->iip = ms->pmsa_iip;
+		sos->ipsr = ms->pmsa_ipsr;
+		sos->ifs = ms->pmsa_ifs;
+	}
+	regs->pr = ms->pmsa_pr;
+	regs->b0 = ms->pmsa_br0;
+	regs->ar_rsc = ms->pmsa_rsc;
+	copy_reg(&ms->pmsa_gr[1-1], ms->pmsa_nat_bits, &regs->r1, nat);
+	copy_reg(&ms->pmsa_gr[2-1], ms->pmsa_nat_bits, &regs->r2, nat);
+	copy_reg(&ms->pmsa_gr[3-1], ms->pmsa_nat_bits, &regs->r3, nat);
+	copy_reg(&ms->pmsa_gr[8-1], ms->pmsa_nat_bits, &regs->r8, nat);
+	copy_reg(&ms->pmsa_gr[9-1], ms->pmsa_nat_bits, &regs->r9, nat);
+	copy_reg(&ms->pmsa_gr[10-1], ms->pmsa_nat_bits, &regs->r10, nat);
+	copy_reg(&ms->pmsa_gr[11-1], ms->pmsa_nat_bits, &regs->r11, nat);
+	copy_reg(&ms->pmsa_gr[12-1], ms->pmsa_nat_bits, &regs->r12, nat);
+	copy_reg(&ms->pmsa_gr[13-1], ms->pmsa_nat_bits, &regs->r13, nat);
+	copy_reg(&ms->pmsa_gr[14-1], ms->pmsa_nat_bits, &regs->r14, nat);
+	copy_reg(&ms->pmsa_gr[15-1], ms->pmsa_nat_bits, &regs->r15, nat);
+	if (ia64_psr(regs)->bn)
+		bank = ms->pmsa_bank1_gr;
+	else
+		bank = ms->pmsa_bank0_gr;
+	copy_reg(&bank[16-16], ms->pmsa_nat_bits, &regs->r16, nat);
+	copy_reg(&bank[17-16], ms->pmsa_nat_bits, &regs->r17, nat);
+	copy_reg(&bank[18-16], ms->pmsa_nat_bits, &regs->r18, nat);
+	copy_reg(&bank[19-16], ms->pmsa_nat_bits, &regs->r19, nat);
+	copy_reg(&bank[20-16], ms->pmsa_nat_bits, &regs->r20, nat);
+	copy_reg(&bank[21-16], ms->pmsa_nat_bits, &regs->r21, nat);
+	copy_reg(&bank[22-16], ms->pmsa_nat_bits, &regs->r22, nat);
+	copy_reg(&bank[23-16], ms->pmsa_nat_bits, &regs->r23, nat);
+	copy_reg(&bank[24-16], ms->pmsa_nat_bits, &regs->r24, nat);
+	copy_reg(&bank[25-16], ms->pmsa_nat_bits, &regs->r25, nat);
+	copy_reg(&bank[26-16], ms->pmsa_nat_bits, &regs->r26, nat);
+	copy_reg(&bank[27-16], ms->pmsa_nat_bits, &regs->r27, nat);
+	copy_reg(&bank[28-16], ms->pmsa_nat_bits, &regs->r28, nat);
+	copy_reg(&bank[29-16], ms->pmsa_nat_bits, &regs->r29, nat);
+	copy_reg(&bank[30-16], ms->pmsa_nat_bits, &regs->r30, nat);
+	copy_reg(&bank[31-16], ms->pmsa_nat_bits, &regs->r31, nat);
+}
+
 /* On entry to this routine, we are running on the per cpu stack, see
  * mca_asm.h.  The original stack has not been touched by this event.  Some of
  * the original stack's registers will be in the RBS on this stack.  This stack
@@ -620,29 +960,28 @@ copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat)
  * that we can do backtrace on the MCA/INIT handler code itself.
  */
 
-static task_t *
+static struct task_struct *
 ia64_mca_modify_original_stack(struct pt_regs *regs,
 		const struct switch_stack *sw,
 		struct ia64_sal_os_state *sos,
 		const char *type)
 {
-	char *p, comm[sizeof(current->comm)];
+	char *p;
 	ia64_va va;
 	extern char ia64_leave_kernel[];	/* Need asm address, not function descriptor */
 	const pal_min_state_area_t *ms = sos->pal_min_state;
-	task_t *previous_current;
+	struct task_struct *previous_current;
 	struct pt_regs *old_regs;
 	struct switch_stack *old_sw;
 	unsigned size = sizeof(struct pt_regs) +
 			sizeof(struct switch_stack) + 16;
-	u64 *old_bspstore, *old_bsp;
-	u64 *new_bspstore, *new_bsp;
-	u64 old_unat, old_rnat, new_rnat, nat;
+	unsigned long *old_bspstore, *old_bsp;
+	unsigned long *new_bspstore, *new_bsp;
+	unsigned long old_unat, old_rnat, new_rnat, nat;
 	u64 slots, loadrs = regs->loadrs;
 	u64 r12 = ms->pmsa_gr[12-1], r13 = ms->pmsa_gr[13-1];
 	u64 ar_bspstore = regs->ar_bspstore;
 	u64 ar_bsp = regs->ar_bspstore + (loadrs >> 16);
-	const u64 *bank;
 	const char *msg;
 	int cpu = smp_processor_id();
 
@@ -689,64 +1028,54 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
 	 * loadrs for the new stack and save it in the new pt_regs, where
 	 * ia64_old_stack() can get it.
 	 */
-	old_bspstore = (u64 *)ar_bspstore;
-	old_bsp = (u64 *)ar_bsp;
+	old_bspstore = (unsigned long *)ar_bspstore;
+	old_bsp = (unsigned long *)ar_bsp;
 	slots = ia64_rse_num_regs(old_bspstore, old_bsp);
-	new_bspstore = (u64 *)((u64)current + IA64_RBS_OFFSET);
+	new_bspstore = (unsigned long *)((u64)current + IA64_RBS_OFFSET);
 	new_bsp = ia64_rse_skip_regs(new_bspstore, slots);
 	regs->loadrs = (new_bsp - new_bspstore) * 8 << 16;
 
 	/* Verify the previous stack state before we change it */
 	if (user_mode(regs)) {
 		msg = "occurred in user space";
+		/* previous_current is guaranteed to be valid when the task was
+		 * in user space, so ...
+		 */
+		ia64_mca_modify_comm(previous_current);
 		goto no_mod;
 	}
+
 	if (r13 != sos->prev_IA64_KR_CURRENT) {
 		msg = "inconsistent previous current and r13";
 		goto no_mod;
 	}
-	if ((r12 - r13) >= KERNEL_STACK_SIZE) {
-		msg = "inconsistent r12 and r13";
-		goto no_mod;
-	}
-	if ((ar_bspstore - r13) >= KERNEL_STACK_SIZE) {
-		msg = "inconsistent ar.bspstore and r13";
-		goto no_mod;
-	}
-	va.p = old_bspstore;
-	if (va.f.reg < 5) {
-		msg = "old_bspstore is in the wrong region";
-		goto no_mod;
-	}
-	if ((ar_bsp - r13) >= KERNEL_STACK_SIZE) {
-		msg = "inconsistent ar.bsp and r13";
-		goto no_mod;
-	}
-	size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8;
-	if (ar_bspstore + size > r12) {
-		msg = "no room for blocked state";
-		goto no_mod;
-	}
 
-	/* Change the comm field on the MCA/INT task to include the pid that
-	 * was interrupted, it makes for easier debugging.  If that pid was 0
-	 * (swapper or nested MCA/INIT) then use the start of the previous comm
-	 * field suffixed with its cpu.
-	 */
-	if (previous_current->pid)
-		snprintf(comm, sizeof(comm), "%s %d",
-			current->comm, previous_current->pid);
-	else {
-		int l;
-		if ((p = strchr(previous_current->comm, ' ')))
-			l = p - previous_current->comm;
-		else
-			l = strlen(previous_current->comm);
-		snprintf(comm, sizeof(comm), "%s %*s %d",
-			current->comm, l, previous_current->comm,
-			previous_current->thread_info->cpu);
+	if (!mca_recover_range(ms->pmsa_iip)) {
+		if ((r12 - r13) >= KERNEL_STACK_SIZE) {
+			msg = "inconsistent r12 and r13";
+			goto no_mod;
+		}
+		if ((ar_bspstore - r13) >= KERNEL_STACK_SIZE) {
+			msg = "inconsistent ar.bspstore and r13";
+			goto no_mod;
+		}
+		va.p = old_bspstore;
+		if (va.f.reg < 5) {
+			msg = "old_bspstore is in the wrong region";
+			goto no_mod;
+		}
+		if ((ar_bsp - r13) >= KERNEL_STACK_SIZE) {
+			msg = "inconsistent ar.bsp and r13";
+			goto no_mod;
+		}
+		size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8;
+		if (ar_bspstore + size > r12) {
+			msg = "no room for blocked state";
+			goto no_mod;
+		}
 	}
-	memcpy(current->comm, comm, sizeof(current->comm));
+
+	ia64_mca_modify_comm(previous_current);
 
 	/* Make the original task look blocked.  First stack a struct pt_regs,
 	 * describing the state at the time of interrupt.  mca_asm.S built a
@@ -755,54 +1084,9 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
 	p = (char *)r12 - sizeof(*regs);
 	old_regs = (struct pt_regs *)p;
 	memcpy(old_regs, regs, sizeof(*regs));
-	/* If ipsr.ic then use pmsa_{iip,ipsr,ifs}, else use
-	 * pmsa_{xip,xpsr,xfs}
-	 */
-	if (ia64_psr(regs)->ic) {
-		old_regs->cr_iip = ms->pmsa_iip;
-		old_regs->cr_ipsr = ms->pmsa_ipsr;
-		old_regs->cr_ifs = ms->pmsa_ifs;
-	} else {
-		old_regs->cr_iip = ms->pmsa_xip;
-		old_regs->cr_ipsr = ms->pmsa_xpsr;
-		old_regs->cr_ifs = ms->pmsa_xfs;
-	}
-	old_regs->pr = ms->pmsa_pr;
-	old_regs->b0 = ms->pmsa_br0;
 	old_regs->loadrs = loadrs;
-	old_regs->ar_rsc = ms->pmsa_rsc;
 	old_unat = old_regs->ar_unat;
-	copy_reg(&ms->pmsa_gr[1-1], ms->pmsa_nat_bits, &old_regs->r1, &old_unat);
-	copy_reg(&ms->pmsa_gr[2-1], ms->pmsa_nat_bits, &old_regs->r2, &old_unat);
-	copy_reg(&ms->pmsa_gr[3-1], ms->pmsa_nat_bits, &old_regs->r3, &old_unat);
-	copy_reg(&ms->pmsa_gr[8-1], ms->pmsa_nat_bits, &old_regs->r8, &old_unat);
-	copy_reg(&ms->pmsa_gr[9-1], ms->pmsa_nat_bits, &old_regs->r9, &old_unat);
-	copy_reg(&ms->pmsa_gr[10-1], ms->pmsa_nat_bits, &old_regs->r10, &old_unat);
-	copy_reg(&ms->pmsa_gr[11-1], ms->pmsa_nat_bits, &old_regs->r11, &old_unat);
-	copy_reg(&ms->pmsa_gr[12-1], ms->pmsa_nat_bits, &old_regs->r12, &old_unat);
-	copy_reg(&ms->pmsa_gr[13-1], ms->pmsa_nat_bits, &old_regs->r13, &old_unat);
-	copy_reg(&ms->pmsa_gr[14-1], ms->pmsa_nat_bits, &old_regs->r14, &old_unat);
-	copy_reg(&ms->pmsa_gr[15-1], ms->pmsa_nat_bits, &old_regs->r15, &old_unat);
-	if (ia64_psr(old_regs)->bn)
-		bank = ms->pmsa_bank1_gr;
-	else
-		bank = ms->pmsa_bank0_gr;
-	copy_reg(&bank[16-16], ms->pmsa_nat_bits, &old_regs->r16, &old_unat);
-	copy_reg(&bank[17-16], ms->pmsa_nat_bits, &old_regs->r17, &old_unat);
-	copy_reg(&bank[18-16], ms->pmsa_nat_bits, &old_regs->r18, &old_unat);
-	copy_reg(&bank[19-16], ms->pmsa_nat_bits, &old_regs->r19, &old_unat);
-	copy_reg(&bank[20-16], ms->pmsa_nat_bits, &old_regs->r20, &old_unat);
-	copy_reg(&bank[21-16], ms->pmsa_nat_bits, &old_regs->r21, &old_unat);
-	copy_reg(&bank[22-16], ms->pmsa_nat_bits, &old_regs->r22, &old_unat);
-	copy_reg(&bank[23-16], ms->pmsa_nat_bits, &old_regs->r23, &old_unat);
-	copy_reg(&bank[24-16], ms->pmsa_nat_bits, &old_regs->r24, &old_unat);
-	copy_reg(&bank[25-16], ms->pmsa_nat_bits, &old_regs->r25, &old_unat);
-	copy_reg(&bank[26-16], ms->pmsa_nat_bits, &old_regs->r26, &old_unat);
-	copy_reg(&bank[27-16], ms->pmsa_nat_bits, &old_regs->r27, &old_unat);
-	copy_reg(&bank[28-16], ms->pmsa_nat_bits, &old_regs->r28, &old_unat);
-	copy_reg(&bank[29-16], ms->pmsa_nat_bits, &old_regs->r29, &old_unat);
-	copy_reg(&bank[30-16], ms->pmsa_nat_bits, &old_regs->r30, &old_unat);
-	copy_reg(&bank[31-16], ms->pmsa_nat_bits, &old_regs->r31, &old_unat);
+	finish_pt_regs(old_regs, sos, &old_unat);
 
 	/* Next stack a struct switch_stack.  mca_asm.S built a partial
 	 * switch_stack, copy it and fill in the blanks using pt_regs and
@@ -870,8 +1154,10 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
 	return previous_current;
 
 no_mod:
-	printk(KERN_INFO "cpu %d, %s %s, original stack not modified\n",
+	mprintk(KERN_INFO "cpu %d, %s %s, original stack not modified\n",
 			smp_processor_id(), type, msg);
+	old_unat = regs->ar_unat;
+	finish_pt_regs(regs, sos, &old_unat);
 	return previous_current;
 }
 
@@ -884,28 +1170,92 @@ no_mod:
  */
 
 static void
-ia64_wait_for_slaves(int monarch)
+ia64_wait_for_slaves(int monarch, const char *type)
 {
-	int c, wait = 0;
-	for_each_online_cpu(c) {
-		if (c == monarch)
-			continue;
-		if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) {
-			udelay(1000);		/* short wait first */
-			wait = 1;
-			break;
+	int c, i , wait;
+
+	/*
+	 * wait 5 seconds total for slaves (arbitrary)
+	 */
+	for (i = 0; i < 5000; i++) {
+		wait = 0;
+		for_each_online_cpu(c) {
+			if (c == monarch)
+				continue;
+			if (ia64_mc_info.imi_rendez_checkin[c]
+					== IA64_MCA_RENDEZ_CHECKIN_NOTDONE) {
+				udelay(1000);		/* short wait */
+				wait = 1;
+				break;
+			}
 		}
+		if (!wait)
+			goto all_in;
 	}
-	if (!wait)
-		return;
+
+	/*
+	 * Maybe slave(s) dead. Print buffered messages immediately.
+	 */
+	ia64_mlogbuf_finish(0);
+	mprintk(KERN_INFO "OS %s slave did not rendezvous on cpu", type);
 	for_each_online_cpu(c) {
 		if (c == monarch)
 			continue;
-		if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) {
-			udelay(5*1000000);	/* wait 5 seconds for slaves (arbitrary) */
-			break;
+		if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE)
+			mprintk(" %d", c);
+	}
+	mprintk("\n");
+	return;
+
+all_in:
+	mprintk(KERN_INFO "All OS %s slaves have reached rendezvous\n", type);
+	return;
+}
+
+/*  mca_insert_tr
+ *
+ *  Switch rid when TR reload and needed!
+ *  iord: 1: itr, 2: itr;
+ *
+*/
+static void mca_insert_tr(u64 iord)
+{
+
+	int i;
+	u64 old_rr;
+	struct ia64_tr_entry *p;
+	unsigned long psr;
+	int cpu = smp_processor_id();
+
+	if (!ia64_idtrs[cpu])
+		return;
+
+	psr = ia64_clear_ic();
+	for (i = IA64_TR_ALLOC_BASE; i < IA64_TR_ALLOC_MAX; i++) {
+		p = ia64_idtrs[cpu] + (iord - 1) * IA64_TR_ALLOC_MAX;
+		if (p->pte & 0x1) {
+			old_rr = ia64_get_rr(p->ifa);
+			if (old_rr != p->rr) {
+				ia64_set_rr(p->ifa, p->rr);
+				ia64_srlz_d();
+			}
+			ia64_ptr(iord, p->ifa, p->itir >> 2);
+			ia64_srlz_i();
+			if (iord & 0x1) {
+				ia64_itr(0x1, i, p->ifa, p->pte, p->itir >> 2);
+				ia64_srlz_i();
+			}
+			if (iord & 0x2) {
+				ia64_itr(0x2, i, p->ifa, p->pte, p->itir >> 2);
+				ia64_srlz_i();
+			}
+			if (old_rr != p->rr) {
+				ia64_set_rr(p->ifa, old_rr);
+				ia64_srlz_d();
+			}
 		}
 	}
+	ia64_set_psr(psr);
 }
 
 /*
@@ -920,36 +1270,63 @@ ia64_wait_for_slaves(int monarch)
  *	further MCA logging is enabled by clearing logs.
  *	Monarch also has the duty of sending wakeup-IPIs to pull the
  *	slave processors out of rendezvous spinloop.
+ *
+ *	If multiple processors call into OS_MCA, the first will become
+ *	the monarch.  Subsequent cpus will be recorded in the mca_cpu
+ *	bitmask.  After the first monarch has processed its MCA, it
+ *	will wake up the next cpu in the mca_cpu bitmask and then go
+ *	into the rendezvous loop.  When all processors have serviced
+ *	their MCA, the last monarch frees up the rest of the processors.
  */
 void
 ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
 		 struct ia64_sal_os_state *sos)
 {
-	pal_processor_state_info_t *psp = (pal_processor_state_info_t *)
-		&sos->proc_state_param;
 	int recover, cpu = smp_processor_id();
-	task_t *previous_current;
+	struct task_struct *previous_current;
+	struct ia64_mca_notify_die nd =
+		{ .sos = sos, .monarch_cpu = &monarch_cpu, .data = &recover };
+	static atomic_t mca_count;
+	static cpumask_t mca_cpu;
+
+	if (atomic_add_return(1, &mca_count) == 1) {
+		monarch_cpu = cpu;
+		sos->monarch = 1;
+	} else {
+		cpu_set(cpu, mca_cpu);
+		sos->monarch = 0;
+	}
+	mprintk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d "
+		"monarch=%ld\n", sos->proc_state_param, cpu, sos->monarch);
 
-	oops_in_progress = 1;	/* FIXME: make printk NMI/MCA/INIT safe */
 	previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA");
-	monarch_cpu = cpu;
-	ia64_wait_for_slaves(cpu);
 
-	/* Wakeup all the processors which are spinning in the rendezvous loop.
-	 * They will leave SAL, then spin in the OS with interrupts disabled
-	 * until this monarch cpu leaves the MCA handler.  That gets control
-	 * back to the OS so we can backtrace the other cpus, backtrace when
-	 * spinning in SAL does not work.
-	 */
-	ia64_mca_wakeup_all();
+	NOTIFY_MCA(DIE_MCA_MONARCH_ENTER, regs, (long)&nd, 1);
+
+	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_CONCURRENT_MCA;
+	if (sos->monarch) {
+		ia64_wait_for_slaves(cpu, "MCA");
+
+		/* Wakeup all the processors which are spinning in the
+		 * rendezvous loop.  They will leave SAL, then spin in the OS
+		 * with interrupts disabled until this monarch cpu leaves the
+		 * MCA handler.  That gets control back to the OS so we can
+		 * backtrace the other cpus, backtrace when spinning in SAL
+		 * does not work.
+		 */
+		ia64_mca_wakeup_all();
+	} else {
+		while (cpu_isset(cpu, mca_cpu))
+			cpu_relax();	/* spin until monarch wakes us */
+	}
+
+	NOTIFY_MCA(DIE_MCA_MONARCH_PROCESS, regs, (long)&nd, 1);
 
 	/* Get the MCA error record and log it */
 	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA);
 
-	/* TLB error is only exist in this SAL error record */
-	recover = (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc))
-	/* other error recovery */
-	   || (ia64_mca_ucmc_extension
+	/* MCA error recovery */
+	recover = (ia64_mca_ucmc_extension
 		&& ia64_mca_ucmc_extension(
 			IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA),
 			sos));
@@ -959,14 +1336,44 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
 		rh->severity = sal_log_severity_corrected;
 		ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
 		sos->os_status = IA64_MCA_CORRECTED;
+	} else {
+		/* Dump buffered message to console */
+		ia64_mlogbuf_finish(1);
+	}
+
+	if (__get_cpu_var(ia64_mca_tr_reload)) {
+		mca_insert_tr(0x1); /*Reload dynamic itrs*/
+		mca_insert_tr(0x2); /*Reload dynamic itrs*/
 	}
 
+	NOTIFY_MCA(DIE_MCA_MONARCH_LEAVE, regs, (long)&nd, 1);
+
+	if (atomic_dec_return(&mca_count) > 0) {
+		int i;
+
+		/* wake up the next monarch cpu,
+		 * and put this cpu in the rendez loop.
+		 */
+		for_each_online_cpu(i) {
+			if (cpu_isset(i, mca_cpu)) {
+				monarch_cpu = i;
+				cpu_clear(i, mca_cpu);	/* wake next cpu */
+				while (monarch_cpu != -1)
+					cpu_relax();	/* spin until last cpu leaves */
+				set_curr_task(cpu, previous_current);
+				ia64_mc_info.imi_rendez_checkin[cpu]
+						= IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
+				return;
+			}
+		}
+	}
 	set_curr_task(cpu, previous_current);
-	monarch_cpu = -1;
+	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
+	monarch_cpu = -1;	/* This frees the slaves and previous monarchs */
 }
 
-static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd, NULL);
-static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd, NULL);
+static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd);
+static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd);
 
 /*
  * ia64_mca_cmc_int_handler
@@ -978,27 +1385,23 @@ static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd, NULL);
  * Inputs
  *      interrupt number
  *      client data arg ptr
- *      saved registers ptr
  *
  * Outputs
  *	None
  */
 static irqreturn_t
-ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs)
+ia64_mca_cmc_int_handler(int cmc_irq, void *arg)
 {
 	static unsigned long	cmc_history[CMC_HISTORY_LENGTH];
 	static int		index;
 	static DEFINE_SPINLOCK(cmc_history_lock);
 
 	IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
-		       __FUNCTION__, cmc_irq, smp_processor_id());
+		       __func__, cmc_irq, smp_processor_id());
 
 	/* SAL spec states this should run w/ interrupts enabled */
 	local_irq_enable();
 
-	/* Get the CMC error record and log it */
-	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC);
-
 	spin_lock(&cmc_history_lock);
 	if (!cmc_polling_enabled) {
 		int i, count = 1; /* we know 1 happened now */
@@ -1031,7 +1434,7 @@ ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs)
 			mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL);
 
 			/* lock already released, get out now */
-			return IRQ_HANDLED;
+			goto out;
 		} else {
 			cmc_history[index++] = now;
 			if (index == CMC_HISTORY_LENGTH)
@@ -1039,6 +1442,12 @@ ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs)
 		}
 	}
 	spin_unlock(&cmc_history_lock);
+out:
+	/* Get the CMC error record and log it */
+	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC);
+
+	local_irq_disable();
+
 	return IRQ_HANDLED;
 }
 
@@ -1052,12 +1461,11 @@ ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs)
  * Inputs
  *	interrupt number
  *	client data arg ptr
- *	saved registers ptr
  * Outputs
  * 	handled
  */
 static irqreturn_t
-ia64_mca_cmc_int_caller(int cmc_irq, void *arg, struct pt_regs *ptregs)
+ia64_mca_cmc_int_caller(int cmc_irq, void *arg)
 {
 	static int start_count = -1;
 	unsigned int cpuid;
@@ -1068,11 +1476,11 @@ ia64_mca_cmc_int_caller(int cmc_irq, void *arg, struct pt_regs *ptregs)
 	if (start_count == -1)
 		start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CMC);
 
-	ia64_mca_cmc_int_handler(cmc_irq, arg, ptregs);
+	ia64_mca_cmc_int_handler(cmc_irq, arg);
 
-	for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++);
+	cpuid = cpumask_next(cpuid+1, cpu_online_mask);
 
-	if (cpuid < NR_CPUS) {
+	if (cpuid < nr_cpu_ids) {
 		platform_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0);
 	} else {
 		/* If no log record, switch out of polling mode */
@@ -1106,7 +1514,8 @@ static void
 ia64_mca_cmc_poll (unsigned long dummy)
 {
 	/* Trigger a CMC interrupt cascade  */
-	platform_send_ipi(first_cpu(cpu_online_map), IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0);
+	platform_send_ipi(cpumask_first(cpu_online_mask), IA64_CMCP_VECTOR,
+							IA64_IPI_DM_INT, 0);
 }
 
 /*
@@ -1119,14 +1528,13 @@ ia64_mca_cmc_poll (unsigned long dummy)
  * Inputs
  *	interrupt number
  *	client data arg ptr
- *	saved registers ptr
  * Outputs
  * 	handled
  */
 #ifdef CONFIG_ACPI
 
 static irqreturn_t
-ia64_mca_cpe_int_caller(int cpe_irq, void *arg, struct pt_regs *ptregs)
+ia64_mca_cpe_int_caller(int cpe_irq, void *arg)
 {
 	static int start_count = -1;
 	static int poll_time = MIN_CPE_POLL_INTERVAL;
@@ -1138,9 +1546,9 @@ ia64_mca_cpe_int_caller(int cpe_irq, void *arg, struct pt_regs *ptregs)
 	if (start_count == -1)
 		start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CPE);
 
-	ia64_mca_cpe_int_handler(cpe_irq, arg, ptregs);
+	ia64_mca_cpe_int_handler(cpe_irq, arg);
 
-	for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++);
+	cpuid = cpumask_next(cpuid+1, cpu_online_mask);
 
 	if (cpuid < NR_CPUS) {
 		platform_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0);
@@ -1183,11 +1591,58 @@ static void
 ia64_mca_cpe_poll (unsigned long dummy)
 {
 	/* Trigger a CPE interrupt cascade  */
-	platform_send_ipi(first_cpu(cpu_online_map), IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0);
+	platform_send_ipi(cpumask_first(cpu_online_mask), IA64_CPEP_VECTOR,
+							IA64_IPI_DM_INT, 0);
 }
 
 #endif /* CONFIG_ACPI */
 
+static int
+default_monarch_init_process(struct notifier_block *self, unsigned long val, void *data)
+{
+	int c;
+	struct task_struct *g, *t;
+	if (val != DIE_INIT_MONARCH_PROCESS)
+		return NOTIFY_DONE;
+#ifdef CONFIG_KEXEC
+	if (atomic_read(&kdump_in_progress))
+		return NOTIFY_DONE;
+#endif
+
+	/*
+	 * FIXME: mlogbuf will brim over with INIT stack dumps.
+	 * To enable show_stack from INIT, we use oops_in_progress which should
+	 * be used in real oops. This would cause something wrong after INIT.
+	 */
+	BREAK_LOGLEVEL(console_loglevel);
+	ia64_mlogbuf_dump_from_init();
+
+	printk(KERN_ERR "Processes interrupted by INIT -");
+	for_each_online_cpu(c) {
+		struct ia64_sal_os_state *s;
+		t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET);
+		s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET);
+		g = s->prev_task;
+		if (g) {
+			if (g->pid)
+				printk(" %d", g->pid);
+			else
+				printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g);
+		}
+	}
+	printk("\n\n");
+	if (read_trylock(&tasklist_lock)) {
+		do_each_thread (g, t) {
+			printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm);
+			show_stack(t, NULL);
+		} while_each_thread (g, t);
+		read_unlock(&tasklist_lock);
+	}
+	/* FIXME: This will not restore zapped printk locks. */
+	RESTORE_LOGLEVEL(console_loglevel);
+	return NOTIFY_DONE;
+}
+
 /*
  * C portion of the OS INIT handler
  *
@@ -1211,14 +1666,14 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 {
 	static atomic_t slaves;
 	static atomic_t monarchs;
-	task_t *previous_current;
-	int cpu = smp_processor_id(), c;
-	struct task_struct *g, *t;
+	struct task_struct *previous_current;
+	int cpu = smp_processor_id();
+	struct ia64_mca_notify_die nd =
+		{ .sos = sos, .monarch_cpu = &monarch_cpu };
 
-	oops_in_progress = 1;	/* FIXME: make printk NMI/MCA/INIT safe */
-	console_loglevel = 15;	/* make sure printks make it to console */
+	NOTIFY_INIT(DIE_INIT_ENTER, regs, (long)&nd, 0);
 
-	printk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n",
+	mprintk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n",
 		sos->proc_state_param, cpu, sos->monarch);
 	salinfo_log_wakeup(SAL_INFO_TYPE_INIT, NULL, 0, 0);
 
@@ -1231,8 +1686,8 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 	 * fix their proms and get their customers updated.
 	 */
 	if (!sos->monarch && atomic_add_return(1, &slaves) == num_online_cpus()) {
-		printk(KERN_WARNING "%s: Promoting cpu %d to monarch.\n",
-		       __FUNCTION__, cpu);
+		mprintk(KERN_WARNING "%s: Promoting cpu %d to monarch.\n",
+		        __func__, cpu);
 		atomic_dec(&slaves);
 		sos->monarch = 1;
 	}
@@ -1243,19 +1698,37 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 	 * fix their proms and get their customers updated.
 	 */
 	if (sos->monarch && atomic_add_return(1, &monarchs) > 1) {
-		printk(KERN_WARNING "%s: Demoting cpu %d to slave.\n",
-			       __FUNCTION__, cpu);
+		mprintk(KERN_WARNING "%s: Demoting cpu %d to slave.\n",
+			       __func__, cpu);
 		atomic_dec(&monarchs);
 		sos->monarch = 0;
 	}
 
 	if (!sos->monarch) {
 		ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_INIT;
+
+#ifdef CONFIG_KEXEC
+		while (monarch_cpu == -1 && !atomic_read(&kdump_in_progress))
+			udelay(1000);
+#else
 		while (monarch_cpu == -1)
-		       cpu_relax();	/* spin until monarch enters */
+			cpu_relax();	/* spin until monarch enters */
+#endif
+
+		NOTIFY_INIT(DIE_INIT_SLAVE_ENTER, regs, (long)&nd, 1);
+		NOTIFY_INIT(DIE_INIT_SLAVE_PROCESS, regs, (long)&nd, 1);
+
+#ifdef CONFIG_KEXEC
+		while (monarch_cpu != -1 && !atomic_read(&kdump_in_progress))
+			udelay(1000);
+#else
 		while (monarch_cpu != -1)
-		       cpu_relax();	/* spin until monarch leaves */
-		printk("Slave on cpu %d returning to normal service.\n", cpu);
+			cpu_relax();	/* spin until monarch leaves */
+#endif
+
+		NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1);
+
+		mprintk("Slave on cpu %d returning to normal service.\n", cpu);
 		set_curr_task(cpu, previous_current);
 		ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
 		atomic_dec(&slaves);
@@ -1263,6 +1736,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 	}
 
 	monarch_cpu = cpu;
+	NOTIFY_INIT(DIE_INIT_MONARCH_ENTER, regs, (long)&nd, 1);
 
 	/*
 	 * Wait for a bit.  On some machines (e.g., HP's zx2000 and zx6000, INIT can be
@@ -1270,31 +1744,17 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 	 * same serial line, the user will need some time to switch out of the BMC before
 	 * the dump begins.
 	 */
-	printk("Delaying for 5 seconds...\n");
+	mprintk("Delaying for 5 seconds...\n");
 	udelay(5*1000000);
-	ia64_wait_for_slaves(cpu);
-	printk(KERN_ERR "Processes interrupted by INIT -");
-	for_each_online_cpu(c) {
-		struct ia64_sal_os_state *s;
-		t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET);
-		s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET);
-		g = s->prev_task;
-		if (g) {
-			if (g->pid)
-				printk(" %d", g->pid);
-			else
-				printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g);
-		}
-	}
-	printk("\n\n");
-	if (read_trylock(&tasklist_lock)) {
-		do_each_thread (g, t) {
-			printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm);
-			show_stack(t, NULL);
-		} while_each_thread (g, t);
-		read_unlock(&tasklist_lock);
-	}
-	printk("\nINIT dump complete.  Monarch on cpu %d returning to normal service.\n", cpu);
+	ia64_wait_for_slaves(cpu, "INIT");
+	/* If nobody intercepts DIE_INIT_MONARCH_PROCESS then we drop through
+	 * to default_monarch_init_process() above and just print all the
+	 * tasks.
+	 */
+	NOTIFY_INIT(DIE_INIT_MONARCH_PROCESS, regs, (long)&nd, 1);
+	NOTIFY_INIT(DIE_INIT_MONARCH_LEAVE, regs, (long)&nd, 1);
+
+	mprintk("\nINIT dump complete.  Monarch on cpu %d returning to normal service.\n", cpu);
 	atomic_dec(&monarchs);
 	set_curr_task(cpu, previous_current);
 	monarch_cpu = -1;
@@ -1312,38 +1772,32 @@ __setup("disable_cpe_poll", ia64_mca_disable_cpe_polling);
 
 static struct irqaction cmci_irqaction = {
 	.handler =	ia64_mca_cmc_int_handler,
-	.flags =	SA_INTERRUPT,
 	.name =		"cmc_hndlr"
 };
 
 static struct irqaction cmcp_irqaction = {
 	.handler =	ia64_mca_cmc_int_caller,
-	.flags =	SA_INTERRUPT,
 	.name =		"cmc_poll"
 };
 
 static struct irqaction mca_rdzv_irqaction = {
 	.handler =	ia64_mca_rendez_int_handler,
-	.flags =	SA_INTERRUPT,
 	.name =		"mca_rdzv"
 };
 
 static struct irqaction mca_wkup_irqaction = {
 	.handler =	ia64_mca_wakeup_int_handler,
-	.flags =	SA_INTERRUPT,
 	.name =		"mca_wkup"
 };
 
 #ifdef CONFIG_ACPI
 static struct irqaction mca_cpe_irqaction = {
 	.handler =	ia64_mca_cpe_int_handler,
-	.flags =	SA_INTERRUPT,
 	.name =		"cpe_hndlr"
 };
 
 static struct irqaction mca_cpep_irqaction = {
 	.handler =	ia64_mca_cpe_int_caller,
-	.flags =	SA_INTERRUPT,
 	.name =		"cpe_poll"
 };
 #endif /* CONFIG_ACPI */
@@ -1361,14 +1815,14 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
 	struct task_struct *p = (struct task_struct *)((char *)mca_data + offset);
 	struct thread_info *ti;
 	memset(p, 0, KERNEL_STACK_SIZE);
-	ti = (struct thread_info *)((char *)p + IA64_TASK_SIZE);
+	ti = task_thread_info(p);
 	ti->flags = _TIF_MCA_INIT;
 	ti->preempt_count = 1;
 	ti->task = p;
 	ti->cpu = cpu;
-	p->thread_info = ti;
+	p->stack = ti;
 	p->state = TASK_UNINTERRUPTIBLE;
-	__set_bit(cpu, &p->cpus_allowed);
+	cpu_set(cpu, p->cpus_allowed);
 	INIT_LIST_HEAD(&p->tasks);
 	p->parent = p->real_parent = p->group_leader = p;
 	INIT_LIST_HEAD(&p->children);
@@ -1376,41 +1830,45 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
 	strncpy(p->comm, type, sizeof(p->comm)-1);
 }
 
-/* Do per-CPU MCA-related initialization.  */
+/* Caller prevents this from being called after init */
+static void * __init_refok mca_bootmem(void)
+{
+	return __alloc_bootmem(sizeof(struct ia64_mca_cpu),
+	                    KERNEL_STACK_SIZE, 0);
+}
 
-void __devinit
+/* Do per-CPU MCA-related initialization.  */
+void
 ia64_mca_cpu_init(void *cpu_data)
 {
 	void *pal_vaddr;
-
-	if (smp_processor_id() == 0) {
-		void *mca_data;
-		int cpu;
-
-		mca_data = alloc_bootmem(sizeof(struct ia64_mca_cpu)
-					 * NR_CPUS + KERNEL_STACK_SIZE);
-		mca_data = (void *)(((unsigned long)mca_data +
-					KERNEL_STACK_SIZE - 1) &
-				(-KERNEL_STACK_SIZE));
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			format_mca_init_stack(mca_data,
-					offsetof(struct ia64_mca_cpu, mca_stack),
-					"MCA", cpu);
-			format_mca_init_stack(mca_data,
-					offsetof(struct ia64_mca_cpu, init_stack),
-					"INIT", cpu);
-			__per_cpu_mca[cpu] = __pa(mca_data);
-			mca_data += sizeof(struct ia64_mca_cpu);
-		}
-	}
+	void *data;
+	long sz = sizeof(struct ia64_mca_cpu);
+	int cpu = smp_processor_id();
+	static int first_time = 1;
 
 	/*
-	 * The MCA info structure was allocated earlier and its
-	 * physical address saved in __per_cpu_mca[cpu].  Copy that
-	 * address * to ia64_mca_data so we can access it as a per-CPU
-	 * variable.
+	 * Structure will already be allocated if cpu has been online,
+	 * then offlined.
 	 */
-	__get_cpu_var(ia64_mca_data) = __per_cpu_mca[smp_processor_id()];
+	if (__per_cpu_mca[cpu]) {
+		data = __va(__per_cpu_mca[cpu]);
+	} else {
+		if (first_time) {
+			data = mca_bootmem();
+			first_time = 0;
+		} else
+			data = (void *)__get_free_pages(GFP_KERNEL,
+							get_order(sz));
+		if (!data)
+			panic("Could not allocate MCA memory for cpu %d\n",
+					cpu);
+	}
+	format_mca_init_stack(data, offsetof(struct ia64_mca_cpu, mca_stack),
+		"MCA", cpu);
+	format_mca_init_stack(data, offsetof(struct ia64_mca_cpu, init_stack),
+		"INIT", cpu);
+	__get_cpu_var(ia64_mca_data) = __per_cpu_mca[cpu] = __pa(data);
 
 	/*
 	 * Stash away a copy of the PTE needed to map the per-CPU page.
@@ -1432,6 +1890,36 @@ ia64_mca_cpu_init(void *cpu_data)
 							      PAGE_KERNEL));
 }
 
+static void ia64_mca_cmc_vector_adjust(void *dummy)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (!cmc_polling_enabled)
+		ia64_mca_cmc_vector_enable(NULL);
+	local_irq_restore(flags);
+}
+
+static int mca_cpu_callback(struct notifier_block *nfb,
+				      unsigned long action,
+				      void *hcpu)
+{
+	int hotcpu = (unsigned long) hcpu;
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		smp_call_function_single(hotcpu, ia64_mca_cmc_vector_adjust,
+					 NULL, 0);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block mca_cpu_notifier = {
+	.notifier_call = mca_cpu_callback
+};
+
 /*
  * ia64_mca_init
  *
@@ -1459,11 +1947,15 @@ ia64_mca_init(void)
 	ia64_fptr_t *init_hldlr_ptr_slave = (ia64_fptr_t *)ia64_os_init_dispatch_slave;
 	ia64_fptr_t *mca_hldlr_ptr = (ia64_fptr_t *)ia64_os_mca_dispatch;
 	int i;
-	s64 rc;
+	long rc;
 	struct ia64_sal_retval isrv;
-	u64 timeout = IA64_MCA_RENDEZ_TIMEOUT;	/* platform specific */
+	unsigned long timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */
+	static struct notifier_block default_init_monarch_nb = {
+		.notifier_call = default_monarch_init_process,
+		.priority = 0/* we need to notified last */
+	};
 
-	IA64_MCA_DEBUG("%s: begin\n", __FUNCTION__);
+	IA64_MCA_DEBUG("%s: begin\n", __func__);
 
 	/* Clear the Rendez checkin flag for all cpus */
 	for(i = 0 ; i < NR_CPUS; i++)
@@ -1487,6 +1979,7 @@ ia64_mca_init(void)
 			printk(KERN_INFO "Increasing MCA rendezvous timeout from "
 				"%ld to %ld milliseconds\n", timeout, isrv.v0);
 			timeout = isrv.v0;
+			NOTIFY_MCA(DIE_MCA_NEW_TIMEOUT, NULL, timeout, 0);
 			continue;
 		}
 		printk(KERN_ERR "Failed to register rendezvous interrupt "
@@ -1506,7 +1999,7 @@ ia64_mca_init(void)
 		return;
 	}
 
-	IA64_MCA_DEBUG("%s: registered MCA rendezvous spinloop and wakeup mech.\n", __FUNCTION__);
+	IA64_MCA_DEBUG("%s: registered MCA rendezvous spinloop and wakeup mech.\n", __func__);
 
 	ia64_mc_info.imi_mca_handler        = ia64_tpa(mca_hldlr_ptr->fp);
 	/*
@@ -1527,7 +2020,7 @@ ia64_mca_init(void)
 		return;
 	}
 
-	IA64_MCA_DEBUG("%s: registered OS MCA handler with SAL at 0x%lx, gp = 0x%lx\n", __FUNCTION__,
+	IA64_MCA_DEBUG("%s: registered OS MCA handler with SAL at 0x%lx, gp = 0x%lx\n", __func__,
 		       ia64_mc_info.imi_mca_handler, ia64_tpa(mca_hldlr_ptr->gp));
 
 	/*
@@ -1539,7 +2032,7 @@ ia64_mca_init(void)
 	ia64_mc_info.imi_slave_init_handler		= ia64_tpa(init_hldlr_ptr_slave->fp);
 	ia64_mc_info.imi_slave_init_handler_size	= 0;
 
-	IA64_MCA_DEBUG("%s: OS INIT handler at %lx\n", __FUNCTION__,
+	IA64_MCA_DEBUG("%s: OS INIT handler at %lx\n", __func__,
 		       ia64_mc_info.imi_monarch_init_handler);
 
 	/* Register the os init handler with SAL */
@@ -1555,9 +2048,36 @@ ia64_mca_init(void)
 		       "(status %ld)\n", rc);
 		return;
 	}
+	if (register_die_notifier(&default_init_monarch_nb)) {
+		printk(KERN_ERR "Failed to register default monarch INIT process\n");
+		return;
+	}
+
+	IA64_MCA_DEBUG("%s: registered OS INIT handler with SAL\n", __func__);
+
+	/* Initialize the areas set aside by the OS to buffer the
+	 * platform/processor error states for MCA/INIT/CMC
+	 * handling.
+	 */
+	ia64_log_init(SAL_INFO_TYPE_MCA);
+	ia64_log_init(SAL_INFO_TYPE_INIT);
+	ia64_log_init(SAL_INFO_TYPE_CMC);
+	ia64_log_init(SAL_INFO_TYPE_CPE);
+
+	mca_init = 1;
+	printk(KERN_INFO "MCA related initialization done\n");
+}
 
-	IA64_MCA_DEBUG("%s: registered OS INIT handler with SAL\n", __FUNCTION__);
 
+/*
+ * These pieces cannot be done in ia64_mca_init() because it is called before
+ * early_irq_init() which would wipe out our percpu irq registrations. But we
+ * cannot leave them until ia64_mca_late_init() because by then all the other
+ * processors have been brought online and have set their own CMC vectors to
+ * point at a non-existant action. Called from arch_early_irq_init().
+ */
+void __init ia64_mca_irq_init(void)
+{
 	/*
 	 *  Configure the CMCI/P vector and handler. Interrupts for CMC are
 	 *  per-processor, so AP CMC interrupts are setup in smp_callin() (smpboot.c).
@@ -1576,18 +2096,6 @@ ia64_mca_init(void)
 	/* Setup the CPEI/P handler */
 	register_percpu_irq(IA64_CPEP_VECTOR, &mca_cpep_irqaction);
 #endif
-
-	/* Initialize the areas set aside by the OS to buffer the
-	 * platform/processor error states for MCA/INIT/CMC
-	 * handling.
-	 */
-	ia64_log_init(SAL_INFO_TYPE_MCA);
-	ia64_log_init(SAL_INFO_TYPE_INIT);
-	ia64_log_init(SAL_INFO_TYPE_CMC);
-	ia64_log_init(SAL_INFO_TYPE_CPE);
-
-	mca_init = 1;
-	printk(KERN_INFO "MCA related initialization done\n");
 }
 
 /*
@@ -1606,6 +2114,8 @@ ia64_mca_late_init(void)
 	if (!mca_init)
 		return 0;
 
+	register_hotcpu_notifier(&mca_cpu_notifier);
+
 	/* Setup the CMCI/P vector and handler */
 	init_timer(&cmc_poll_timer);
 	cmc_poll_timer.function = ia64_mca_cmc_poll;
@@ -1614,7 +2124,7 @@ ia64_mca_late_init(void)
 	cmc_polling_enabled = 0;
 	schedule_work(&cmc_enable_work);
 
-	IA64_MCA_DEBUG("%s: CMCI/P setup and enabled.\n", __FUNCTION__);
+	IA64_MCA_DEBUG("%s: CMCI/P setup and enabled.\n", __func__);
 
 #ifdef CONFIG_ACPI
 	/* Setup the CPEI/P vector and handler */
@@ -1623,26 +2133,29 @@ ia64_mca_late_init(void)
 	cpe_poll_timer.function = ia64_mca_cpe_poll;
 
 	{
-		irq_desc_t *desc;
 		unsigned int irq;
 
 		if (cpe_vector >= 0) {
 			/* If platform supports CPEI, enable the irq. */
-			cpe_poll_enabled = 0;
-			for (irq = 0; irq < NR_IRQS; ++irq)
-				if (irq_to_vector(irq) == cpe_vector) {
-					desc = irq_descp(irq);
-					desc->status |= IRQ_PER_CPU;
-					setup_irq(irq, &mca_cpe_irqaction);
-				}
-			ia64_mca_register_cpev(cpe_vector);
-			IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", __FUNCTION__);
-		} else {
-			/* If platform doesn't support CPEI, get the timer going. */
-			if (cpe_poll_enabled) {
-				ia64_mca_cpe_poll(0UL);
-				IA64_MCA_DEBUG("%s: CPEP setup and enabled.\n", __FUNCTION__);
+			irq = local_vector_to_irq(cpe_vector);
+			if (irq > 0) {
+				cpe_poll_enabled = 0;
+				irq_set_status_flags(irq, IRQ_PER_CPU);
+				setup_irq(irq, &mca_cpe_irqaction);
+				ia64_cpe_irq = irq;
+				ia64_mca_register_cpev(cpe_vector);
+				IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n",
+					__func__);
+				return 0;
 			}
+			printk(KERN_ERR "%s: Failed to find irq for CPE "
+					"interrupt handler, vector %d\n",
+					__func__, cpe_vector);
+		}
+		/* If platform doesn't support CPEI, get the timer going. */
+		if (cpe_poll_enabled) {
+			ia64_mca_cpe_poll(0UL);
+			IA64_MCA_DEBUG("%s: CPEP setup and enabled.\n", __func__);
 		}
 	}
 #endif
diff --git a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S
index db32fc1d393..d5bdf9de36b 100644
--- a/arch/ia64/kernel/mca_asm.S
+++ b/arch/ia64/kernel/mca_asm.S
@@ -1,25 +1,28 @@
-//
-// assembly portion of the IA64 MCA handling
-//
-// Mods by cfleck to integrate into kernel build
-// 00/03/15 davidm Added various stop bits to get a clean compile
-//
-// 00/03/29 cfleck Added code to save INIT handoff state in pt_regs format, switch to temp
-//		   kstack, switch modes, jump to C INIT handler
-//
-// 02/01/04 J.Hall <jenna.s.hall@intel.com>
-//		   Before entering virtual mode code:
-//		   1. Check for TLB CPU error
-//		   2. Restore current thread pointer to kr6
-//		   3. Move stack ptr 16 bytes to conform to C calling convention
-//
-// 04/11/12 Russ Anderson <rja@sgi.com>
-//		   Added per cpu MCA/INIT stack save areas.
-//
-// 12/08/05 Keith Owens <kaos@sgi.com>
-//		   Use per cpu MCA/INIT stacks for all data.
-//
-#include <linux/config.h>
+/*
+ * File:	mca_asm.S
+ * Purpose:	assembly portion of the IA64 MCA handling
+ *
+ * Mods by cfleck to integrate into kernel build
+ *
+ * 2000-03-15 David Mosberger-Tang <davidm@hpl.hp.com>
+ *		Added various stop bits to get a clean compile
+ *
+ * 2000-03-29 Chuck Fleckenstein <cfleck@co.intel.com>
+ *		Added code to save INIT handoff state in pt_regs format,
+ *		switch to temp kstack, switch modes, jump to C INIT handler
+ *
+ * 2002-01-04 J.Hall <jenna.s.hall@intel.com>
+ *		Before entering virtual mode code:
+ *		 1. Check for TLB CPU error
+ *		 2. Restore current thread pointer to kr6
+ *		 3. Move stack ptr 16 bytes to conform to C calling convention
+ *
+ * 2004-11-12 Russ Anderson <rja@sgi.com>
+ *		Added per cpu MCA/INIT stack save areas.
+ *
+ * 2005-12-08 Keith Owens <kaos@sgi.com>
+ *		Use per cpu MCA/INIT stacks for all data.
+ */
 #include <linux/threads.h>
 
 #include <asm/asmmacro.h>
@@ -37,6 +40,7 @@
 
 	.global ia64_do_tlb_purge
 	.global ia64_os_mca_dispatch
+	.global ia64_os_init_on_kdump
 	.global ia64_os_init_dispatch_monarch
 	.global ia64_os_init_dispatch_slave
 
@@ -55,7 +59,7 @@
 ia64_do_tlb_purge:
 #define O(member)	IA64_CPUINFO_##member##_OFFSET
 
-	GET_THIS_PADDR(r2, cpu_info)	// load phys addr of cpu_info into r2
+	GET_THIS_PADDR(r2, ia64_cpu_info) // load phys addr of cpu_info into r2
 	;;
 	addl r17=O(PTCE_STRIDE),r2
 	addl r2=O(PTCE_BASE),r2
@@ -102,14 +106,6 @@ ia64_do_tlb_purge:
 	;;
 	srlz.d
 	;;
-	// 2. Purge DTR for PERCPU data.
-	movl r16=PERCPU_ADDR
-	mov r18=PERCPU_PAGE_SHIFT<<2
-	;;
-	ptr.d r16,r18
-	;;
-	srlz.d
-	;;
 	// 3. Purge ITR for PAL code.
 	GET_THIS_PADDR(r2, ia64_mca_pal_base)
 	;;
@@ -142,14 +138,6 @@ ia64_do_tlb_purge:
 //StartMain////////////////////////////////////////////////////////////////////
 
 ia64_os_mca_dispatch:
-	// Serialize all MCA processing
-	mov	r3=1;;
-	LOAD_PHYSICAL(p0,r2,ia64_mca_serialize);;
-ia64_os_mca_spin:
-	xchg4	r4=[r2],r3;;
-	cmp.ne	p6,p0=r4,r0
-(p6)	br ia64_os_mca_spin
-
 	mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET	// use the MCA stack
 	LOAD_PHYSICAL(p0,r2,1f)			// return address
 	mov r19=1				// All MCA events are treated as monarch (for now)
@@ -159,7 +147,7 @@ ia64_os_mca_spin:
 	GET_IA64_MCA_DATA(r2)
 	// Using MCA stack, struct ia64_sal_os_state, variable proc_state_param
 	;;
-	add r3=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SOS_OFFSET+IA64_SAL_OS_STATE_PROC_STATE_PARAM_OFFSET, r2
+	add r3=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SOS_OFFSET+SOS(PROC_STATE_PARAM), r2
 	;;
 	ld8 r18=[r3]				// Get processor state parameter on existing PALE_CHECK.
 	;;
@@ -197,22 +185,6 @@ ia64_reload_tr:
 	srlz.i
 	srlz.d
 	;;
-	// 2. Reload DTR register for PERCPU data.
-	GET_THIS_PADDR(r2, ia64_mca_per_cpu_pte)
-	;;
-	movl r16=PERCPU_ADDR		// vaddr
-	movl r18=PERCPU_PAGE_SHIFT<<2
-	;;
-	mov cr.itir=r18
-	mov cr.ifa=r16
-	;;
-	ld8 r18=[r2]			// load per-CPU PTE
-	mov r16=IA64_TR_PERCPU_DATA;
-	;;
-	itr.d dtr[r16]=r18
-	;;
-	srlz.d
-	;;
 	// 3. Reload ITR for PAL code.
 	GET_THIS_PADDR(r2, ia64_mca_pal_pte)
 	;;
@@ -248,8 +220,13 @@ ia64_reload_tr:
 	mov r20=IA64_TR_CURRENT_STACK
 	;;
 	itr.d dtr[r20]=r16
+	GET_THIS_PADDR(r2, ia64_mca_tr_reload)
+	mov r18 = 1
 	;;
 	srlz.d
+	;;
+	st8 [r2] =r18
+	;;
 
 done_tlb_purge_and_reload:
 
@@ -316,10 +293,6 @@ END(ia64_os_mca_virtual_begin)
 
 	mov		b0=r12			// SAL_CHECK return address
 
-	// release lock
-	LOAD_PHYSICAL(p0,r3,ia64_mca_serialize);;
-	st4.rel		[r3]=r0
-
 	br		b0
 
 //EndMain//////////////////////////////////////////////////////////////////////
@@ -327,6 +300,25 @@ END(ia64_os_mca_virtual_begin)
 //StartMain////////////////////////////////////////////////////////////////////
 
 //
+// NOP init handler for kdump.  In panic situation, we may receive INIT
+// while kernel transition.  Since we initialize registers on leave from
+// current kernel, no longer monarch/slave handlers of current kernel in
+// virtual mode are called safely.
+// We can unregister these init handlers from SAL, however then the INIT
+// will result in warmboot by SAL and we cannot retrieve the crashdump.
+// Therefore register this NOP function to SAL, to prevent entering virtual
+// mode and resulting warmboot by SAL.
+//
+ia64_os_init_on_kdump:
+	mov		r8=r0		// IA64_INIT_RESUME
+	mov             r9=r10		// SAL_GP
+	mov		r22=r17		// *minstate
+	;;
+	mov		r10=r0		// return to same context
+	mov		b0=r12		// SAL_CHECK return address
+	br		b0
+
+//
 // SAL to OS entry point for INIT on all processors.  This has been defined for
 // registration purposes with SAL as a part of ia64_mca_init.  Monarch and
 // slave INIT have identical processing, except for the value of the
@@ -479,9 +471,11 @@ ia64_state_save:
 	st8 [temp2]=r11,16	// rv_rc
 	mov r11=cr.iipa
 	;;
-	st8 [temp1]=r18,16	// proc_state_param
-	st8 [temp2]=r19,16	// monarch
+	st8 [temp1]=r18		// proc_state_param
+	st8 [temp2]=r19		// monarch
 	mov r6=IA64_KR(CURRENT)
+	add temp1=SOS(SAL_RA), regs
+	add temp2=SOS(SAL_GP), regs
 	;;
 	st8 [temp1]=r12,16	// sal_ra
 	st8 [temp2]=r10,16	// sal_gp
@@ -503,16 +497,18 @@ ia64_state_save:
 	st8 [temp2]=r11,16	// cr.iipa
 	mov r12=cr.iim
 	;;
-	st8 [temp1]=r12,16	// cr.iim
+	st8 [temp1]=r12		// cr.iim
 (p1)	mov r12=IA64_MCA_COLD_BOOT
 (p2)	mov r12=IA64_INIT_WARM_BOOT
 	mov r6=cr.iha
+	add temp1=SOS(OS_STATUS), regs
 	;;
-	st8 [temp2]=r6,16	// cr.iha
+	st8 [temp2]=r6		// cr.iha
+	add temp2=SOS(CONTEXT), regs
 	st8 [temp1]=r12		// os_status, default is cold boot
 	mov r6=IA64_MCA_SAME_CONTEXT
 	;;
-	st8 [temp1]=r6		// context, default is same context
+	st8 [temp2]=r6		// context, default is same context
 
 	// Save the pt_regs data that is not in minstate.  The previous code
 	// left regs at sos.
@@ -820,14 +816,14 @@ ia64_state_restore:
 	// Restore the SAL to OS state. The previous code left regs at pt_regs.
 	add regs=MCA_SOS_OFFSET-MCA_PT_REGS_OFFSET, regs
 	;;
-	add temp1=IA64_SAL_OS_STATE_COMMON_OFFSET, regs
-	add temp2=IA64_SAL_OS_STATE_COMMON_OFFSET+8, regs
+	add temp1=SOS(SAL_RA), regs
+	add temp2=SOS(SAL_GP), regs
 	;;
 	ld8 r12=[temp1],16	// sal_ra
 	ld8 r9=[temp2],16	// sal_gp
 	;;
 	ld8 r22=[temp1],16	// pal_min_state, virtual
-	ld8 r21=[temp2],16	// prev_IA64_KR_CURRENT
+	ld8 r13=[temp2],16	// prev_IA64_KR_CURRENT
 	;;
 	ld8 r16=[temp1],16	// prev_IA64_KR_CURRENT_STACK
 	ld8 r20=[temp2],16	// prev_task
@@ -842,13 +838,15 @@ ia64_state_restore:
 	;;
 	mov cr.itir=temp3
 	mov cr.iipa=temp4
-	ld8 temp3=[temp1],16	// cr.iim
-	ld8 temp4=[temp2],16	// cr.iha
+	ld8 temp3=[temp1]	// cr.iim
+	ld8 temp4=[temp2]		// cr.iha
+	add temp1=SOS(OS_STATUS), regs
+	add temp2=SOS(CONTEXT), regs
 	;;
 	mov cr.iim=temp3
 	mov cr.iha=temp4
-	dep r22=0,r22,62,2	// pal_min_state, physical, uncached
-	mov IA64_KR(CURRENT)=r21
+	dep r22=0,r22,62,1	// pal_min_state, physical, uncached
+	mov IA64_KR(CURRENT)=r13
 	ld8 r8=[temp1]		// os_status
 	ld8 r10=[temp2]		// context
 
@@ -856,7 +854,7 @@ ia64_state_restore:
 	 * avoid any dependencies on the algorithm in ia64_switch_to(), just
 	 * purge any existing CURRENT_STACK mapping and insert the new one.
 	 *
-	 * r16 contains prev_IA64_KR_CURRENT_STACK, r21 contains
+	 * r16 contains prev_IA64_KR_CURRENT_STACK, r13 contains
 	 * prev_IA64_KR_CURRENT, these values may have been changed by the C
 	 * code.  Do not use r8, r9, r10, r22, they contain values ready for
 	 * the return to SAL.
@@ -873,7 +871,7 @@ ia64_state_restore:
 	;;
 	srlz.d
 
-	extr.u r19=r21,61,3			// r21 = prev_IA64_KR_CURRENT
+	extr.u r19=r13,61,3			// r13 = prev_IA64_KR_CURRENT
 	shl r20=r16,IA64_GRANULE_SHIFT		// r16 = prev_IA64_KR_CURRENT_STACK
 	movl r21=PAGE_KERNEL			// page properties
 	;;
@@ -883,7 +881,7 @@ ia64_state_restore:
 (p6)	br.spnt 1f				// the dreaded cpu 0 idle task in region 5:(
 	;;
 	mov cr.itir=r18
-	mov cr.ifa=r21
+	mov cr.ifa=r13
 	mov r20=IA64_TR_CURRENT_STACK
 	;;
 	itr.d dtr[r20]=r21
@@ -916,7 +914,7 @@ ia64_state_restore:
 
 ia64_new_stack:
 	add regs=MCA_PT_REGS_OFFSET, r3
-	add temp2=MCA_SOS_OFFSET+IA64_SAL_OS_STATE_PAL_MIN_STATE_OFFSET, r3
+	add temp2=MCA_SOS_OFFSET+SOS(PAL_MIN_STATE), r3
 	mov b0=r2			// save return address
 	GET_IA64_MCA_DATA(temp1)
 	invala
@@ -1020,18 +1018,13 @@ ia64_old_stack:
 
 ia64_set_kernel_registers:
 	add temp3=MCA_SP_OFFSET, r3
-	add temp4=MCA_SOS_OFFSET+IA64_SAL_OS_STATE_OS_GP_OFFSET, r3
 	mov b0=r2		// save return address
 	GET_IA64_MCA_DATA(temp1)
 	;;
-	add temp4=temp4, temp1	// &struct ia64_sal_os_state.os_gp
 	add r12=temp1, temp3	// kernel stack pointer on MCA/INIT stack
 	add r13=temp1, r3	// set current to start of MCA/INIT stack
 	add r20=temp1, r3	// physical start of MCA/INIT stack
 	;;
-	ld8 r1=[temp4]		// OS GP from SAL OS state
-	;;
-	DATA_PA_TO_VA(r1,temp1)
 	DATA_PA_TO_VA(r12,temp2)
 	DATA_PA_TO_VA(r13,temp3)
 	;;
@@ -1062,6 +1055,10 @@ ia64_set_kernel_registers:
 	mov cr.itir=r18
 	mov cr.ifa=r13
 	mov r20=IA64_TR_CURRENT_STACK
+
+	movl r17=FPSR_DEFAULT
+	;;
+	mov.m ar.fpsr=r17			// set ar.fpsr to kernel default value
 	;;
 	itr.d dtr[r20]=r21
 	;;
@@ -1096,3 +1093,30 @@ GLOBAL_ENTRY(ia64_get_rnat)
 	mov ar.rsc=3
 	br.ret.sptk.many rp
 END(ia64_get_rnat)
+
+
+// void ia64_set_psr_mc(void)
+//
+// Set psr.mc bit to mask MCA/INIT.
+GLOBAL_ENTRY(ia64_set_psr_mc)
+	rsm psr.i | psr.ic		// disable interrupts
+	;;
+	srlz.d
+	;;
+	mov r14 = psr			// get psr{36:35,31:0}
+	movl r15 = 1f
+	;;
+	dep r14 = -1, r14, PSR_MC, 1	// set psr.mc
+	;;
+	dep r14 = -1, r14, PSR_IC, 1	// set psr.ic
+	;;
+	dep r14 = -1, r14, PSR_BN, 1	// keep bank1 in use
+	;;
+	mov cr.ipsr = r14
+	mov cr.ifs = r0
+	mov cr.iip = r15
+	;;
+	rfi
+1:
+	br.ret.sptk.many rp
+END(ia64_set_psr_mc)
diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c
index f081c60ab20..94f8bf777af 100644
--- a/arch/ia64/kernel/mca_drv.c
+++ b/arch/ia64/kernel/mca_drv.c
@@ -3,18 +3,17 @@
  * Purpose:	Generic MCA handling layer
  *
  * Copyright (C) 2004 FUJITSU LIMITED
- * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com)
+ * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
  * Copyright (C) 2005 Silicon Graphics, Inc
  * Copyright (C) 2005 Keith Owens <kaos@sgi.com>
+ * Copyright (C) 2006 Russ Anderson <rja@sgi.com>
  */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/kallsyms.h>
-#include <linux/smp_lock.h>
 #include <linux/bootmem.h>
 #include <linux/acpi.h>
 #include <linux/timer.h>
@@ -23,12 +22,12 @@
 #include <linux/smp.h>
 #include <linux/workqueue.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 
 #include <asm/delay.h>
 #include <asm/machvec.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
-#include <asm/system.h>
 #include <asm/sal.h>
 #include <asm/mca.h>
 
@@ -61,6 +60,11 @@ typedef enum {
 	ISOLATE_NONE
 } isolate_status_t;
 
+typedef enum {
+	MCA_NOT_RECOVERED = 0,
+	MCA_RECOVERED	  = 1
+} recovery_status_t;
+
 /*
  *  This pool keeps pointers to the section part of SAL error record
  */
@@ -70,6 +74,34 @@ static struct {
 	int	     max_idx; /* Maximum index of section pointer list pool */
 } slidx_pool;
 
+static int
+fatal_mca(const char *fmt, ...)
+{
+	va_list args;
+	char buf[256];
+
+	va_start(args, fmt);
+	vsnprintf(buf, sizeof(buf), fmt, args);
+	va_end(args);
+	ia64_mca_printk(KERN_ALERT "MCA: %s\n", buf);
+
+	return MCA_NOT_RECOVERED;
+}
+
+static int
+mca_recovered(const char *fmt, ...)
+{
+	va_list args;
+	char buf[256];
+
+	va_start(args, fmt);
+	vsnprintf(buf, sizeof(buf), fmt, args);
+	va_end(args);
+	ia64_mca_printk(KERN_INFO "MCA: %s\n", buf);
+
+	return MCA_RECOVERED;
+}
+
 /**
  * mca_page_isolate - isolate a poisoned page in order not to use it later
  * @paddr:	poisoned memory location
@@ -88,7 +120,7 @@ mca_page_isolate(unsigned long paddr)
 	if (!ia64_phys_addr_valid(paddr))
 		return ISOLATE_NONE;
 
-	if (!pfn_valid(paddr))
+	if (!pfn_valid(paddr >> PAGE_SHIFT))
 		return ISOLATE_NONE;
 
 	/* convert physical address to physical page number */
@@ -108,6 +140,7 @@ mca_page_isolate(unsigned long paddr)
 		return ISOLATE_NG;
 
 	/* add attribute 'Reserved' and register the page */
+	get_page(p);
 	SetPageReserved(p);
 	page_isolate[num_page_isolate++] = p;
 
@@ -120,10 +153,14 @@ mca_page_isolate(unsigned long paddr)
  */
 
 void
-mca_handler_bh(unsigned long paddr)
+mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
 {
-	printk(KERN_DEBUG "OS_MCA: process [pid: %d](%s) encounters MCA.\n",
-		current->pid, current->comm);
+	ia64_mlogbuf_dump();
+	printk(KERN_ERR "OS_MCA: process [cpu %d, pid: %d, uid: %d, "
+		"iip: %p, psr: 0x%lx,paddr: 0x%lx](%s) encounters MCA.\n",
+	       raw_smp_processor_id(), current->pid,
+		from_kuid(&init_user_ns, current_uid()),
+		iip, ipsr, paddr, current->comm);
 
 	spin_lock(&mca_bh_lock);
 	switch (mca_page_isolate(paddr)) {
@@ -131,7 +168,7 @@ mca_handler_bh(unsigned long paddr)
 		printk(KERN_DEBUG "Page isolation: ( %lx ) success.\n", paddr);
 		break;
 	case ISOLATE_NG:
-		printk(KERN_DEBUG "Page isolation: ( %lx ) failure.\n", paddr);
+		printk(KERN_CRIT "Page isolation: ( %lx ) failure.\n", paddr);
 		break;
 	default:
 		break;
@@ -312,7 +349,7 @@ init_record_index_pools(void)
 
 	/* - 3 - */
 	slidx_pool.max_idx = (rec_max_size/sect_min_size) * 2 + 1;
-	slidx_pool.buffer = (slidx_list_t *)
+	slidx_pool.buffer =
 		kmalloc(slidx_pool.max_idx * sizeof(slidx_list_t), GFP_KERNEL);
 
 	return slidx_pool.buffer ? 0 : -ENOMEM;
@@ -398,6 +435,50 @@ is_mca_global(peidx_table_t *peidx, pal_bus_check_info_t *pbci,
 }
 
 /**
+ * get_target_identifier - Get the valid Cache or Bus check target identifier.
+ * @peidx:	pointer of index of processor error section
+ *
+ * Return value:
+ *	target address on Success / 0 on Failure
+ */
+static u64
+get_target_identifier(peidx_table_t *peidx)
+{
+	u64 target_address = 0;
+	sal_log_mod_error_info_t *smei;
+	pal_cache_check_info_t *pcci;
+	int i, level = 9;
+
+	/*
+	 * Look through the cache checks for a valid target identifier
+	 * If more than one valid target identifier, return the one
+	 * with the lowest cache level.
+	 */
+	for (i = 0; i < peidx_cache_check_num(peidx); i++) {
+		smei = (sal_log_mod_error_info_t *)peidx_cache_check(peidx, i);
+		if (smei->valid.target_identifier && smei->target_identifier) {
+			pcci = (pal_cache_check_info_t *)&(smei->check_info);
+			if (!target_address || (pcci->level < level)) {
+				target_address = smei->target_identifier;
+				level = pcci->level;
+				continue;
+			}
+		}
+	}
+	if (target_address)
+		return target_address;
+
+	/*
+	 * Look at the bus check for a valid target identifier
+	 */
+	smei = peidx_bus_check(peidx, 0);
+	if (smei && smei->valid.target_identifier)
+		return smei->target_identifier;
+
+	return 0;
+}
+
+/**
  * recover_from_read_error - Try to recover the errors which type are "read"s.
  * @slidx:	pointer of index of SAL error record
  * @peidx:	pointer of index of processor error section
@@ -413,14 +494,15 @@ recover_from_read_error(slidx_table_t *slidx,
 			peidx_table_t *peidx, pal_bus_check_info_t *pbci,
 			struct ia64_sal_os_state *sos)
 {
-	sal_log_mod_error_info_t *smei;
+	u64 target_identifier;
 	pal_min_state_area_t *pmsa;
 	struct ia64_psr *psr1, *psr2;
 	ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook;
 
 	/* Is target address valid? */
-	if (!pbci->tv)
-		return 0;
+	target_identifier = get_target_identifier(peidx);
+	if (!target_identifier)
+		return fatal_mca("target address not valid");
 
 	/*
 	 * cpu read or memory-mapped io read
@@ -436,39 +518,46 @@ recover_from_read_error(slidx_table_t *slidx,
 	 *    the process not have any locks of kernel.
 	 */
 
+	/* Is minstate valid? */
+	if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate))
+		return fatal_mca("minstate not valid");
 	psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr);
+	psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr);
 
 	/*
 	 *  Check the privilege level of interrupted context.
 	 *   If it is user-mode, then terminate affected process.
 	 */
-	if (psr1->cpl != 0) {
-		smei = peidx_bus_check(peidx, 0);
-		if (smei->valid.target_identifier) {
-			/*
-			 *  setup for resume to bottom half of MCA,
-			 * "mca_handler_bhhook"
-			 */
-			pmsa = sos->pal_min_state;
-			/* pass to bhhook as 1st argument (gr8) */
-			pmsa->pmsa_gr[8-1] = smei->target_identifier;
-			/* set interrupted return address (but no use) */
-			pmsa->pmsa_br0 = pmsa->pmsa_iip;
-			/* change resume address to bottom half */
-			pmsa->pmsa_iip = mca_hdlr_bh->fp;
-			pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp;
-			/* set cpl with kernel mode */
-			psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
-			psr2->cpl = 0;
-			psr2->ri  = 0;
-			psr2->i  = 0;
-
-			return 1;
-		}
 
+	pmsa = sos->pal_min_state;
+	if (psr1->cpl != 0 ||
+	   ((psr2->cpl != 0) && mca_recover_range(pmsa->pmsa_iip))) {
+		/*
+		 *  setup for resume to bottom half of MCA,
+		 * "mca_handler_bhhook"
+		 */
+		/* pass to bhhook as argument (gr8, ...) */
+		pmsa->pmsa_gr[8-1] = target_identifier;
+		pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
+		pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
+		/* set interrupted return address (but no use) */
+		pmsa->pmsa_br0 = pmsa->pmsa_iip;
+		/* change resume address to bottom half */
+		pmsa->pmsa_iip = mca_hdlr_bh->fp;
+		pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp;
+		/* set cpl with kernel mode */
+		psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
+		psr2->cpl = 0;
+		psr2->ri  = 0;
+		psr2->bn  = 1;
+		psr2->i  = 0;
+
+		return mca_recovered("user memory corruption. "
+				"kill affected process - recovered.");
 	}
 
-	return 0;
+	return fatal_mca("kernel context not recovered, iip 0x%lx\n",
+			 pmsa->pmsa_iip);
 }
 
 /**
@@ -513,11 +602,40 @@ recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx,
 		default:
 			break;
 		}
+	} else if (psp->cc && !psp->bc) {	/* Cache error */
+		status = recover_from_read_error(slidx, peidx, pbci, sos);
 	}
 
 	return status;
 }
 
+/*
+ * recover_from_tlb_check
+ * @peidx:	pointer of index of processor error section
+ *
+ * Return value:
+ *	1 on Success / 0 on Failure
+ */
+static int
+recover_from_tlb_check(peidx_table_t *peidx)
+{
+	sal_log_mod_error_info_t *smei;
+	pal_tlb_check_info_t *ptci;
+
+	smei = (sal_log_mod_error_info_t *)peidx_tlb_check(peidx, 0);
+	ptci = (pal_tlb_check_info_t *)&(smei->check_info);
+
+	/*
+	 * Look for signature of a duplicate TLB DTC entry, which is
+	 * a SW bug and always fatal.
+	 */
+	if (ptci->op == PAL_TLB_CHECK_OP_PURGE
+	    && !(ptci->itr || ptci->dtc || ptci->itc))
+		return fatal_mca("Duplicate TLB entry");
+
+	return mca_recovered("TLB check recovered");
+}
+
 /**
  * recover_from_processor_error
  * @platform:	whether there are some platform error section or not
@@ -529,13 +647,6 @@ recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx,
  * Return value:
  *	1 on Success / 0 on Failure
  */
-/*
- *  Later we try to recover when below all conditions are satisfied.
- *   1. Only one processor error section is exist.
- *   2. BUS_CHECK is exist and the others are not exist.(Except TLB_CHECK)
- *   3. The entry of BUS_CHECK_INFO is 1.
- *   4. "External bus error" flag is set and the others are not set.
- */
 
 static int
 recover_from_processor_error(int platform, slidx_table_t *slidx,
@@ -546,43 +657,60 @@ recover_from_processor_error(int platform, slidx_table_t *slidx,
 		(pal_processor_state_info_t*)peidx_psp(peidx);
 
 	/*
-	 * We cannot recover errors with other than bus_check.
+	 * Processor recovery status must key off of the PAL recovery
+	 * status in the Processor State Parameter.
 	 */
-	if (psp->cc || psp->rc || psp->uc)
-		return 0;
 
 	/*
-	 * If there is no bus error, record is weird but we need not to recover.
+	 * The machine check is corrected.
 	 */
-	if (psp->bc == 0 || pbci == NULL)
-		return 1;
+	if (psp->cm == 1)
+		return mca_recovered("machine check is already corrected.");
 
 	/*
-	 * Sorry, we cannot handle so many.
+	 * The error was not contained.  Software must be reset.
 	 */
-	if (peidx_bus_check_num(peidx) > 1)
-		return 0;
+	if (psp->us || psp->ci == 0)
+		return fatal_mca("error not contained");
+
 	/*
-	 * Well, here is only one bus error.
+	 * Look for recoverable TLB check
 	 */
-	if (pbci->ib || pbci->cc)
-		return 0;
+	if (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc))
+		return recover_from_tlb_check(peidx);
+
+	/*
+	 * The cache check and bus check bits have four possible states
+	 *   cc bc
+	 *    1  1	Memory error, attempt recovery
+	 *    1  0	Cache error, attempt recovery
+	 *    0  1	I/O error, attempt recovery
+	 *    0  0	Other error type, not recovered
+	 */
+	if (psp->cc == 0 && (psp->bc == 0 || pbci == NULL))
+		return fatal_mca("No cache or bus check");
+
+	/*
+	 * Cannot handle more than one bus check.
+	 */
+	if (peidx_bus_check_num(peidx) > 1)
+		return fatal_mca("Too many bus checks");
+
+	if (pbci->ib)
+		return fatal_mca("Internal Bus error");
 	if (pbci->eb && pbci->bsi > 0)
-		return 0;
-	if (psp->ci == 0)
-		return 0;
+		return fatal_mca("External bus check fatal status");
 
 	/*
-	 * This is a local MCA and estimated as recoverble external bus error.
-	 * (e.g. a load from poisoned memory)
-	 * This means "there are some platform errors".
+	 * This is a local MCA and estimated as a recoverable error.
 	 */
 	if (platform)
 		return recover_from_platform_error(slidx, peidx, pbci, sos);
+
 	/*
 	 * On account of strange SAL error record, we cannot recover.
 	 */
-	return 0;
+	return fatal_mca("Strange SAL record");
 }
 
 /**
@@ -611,12 +739,10 @@ mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos)
 
 	 /* Now, OS can recover when there is one processor error section */
 	if (n_proc_err > 1)
-		return 0;
-	else if (n_proc_err == 0) {
-		/* Weird SAL record ... We need not to recover */
-
-		return 1;
-	}
+		return fatal_mca("Too Many Errors");
+	else if (n_proc_err == 0)
+		/* Weird SAL record ... We can't do anything */
+		return fatal_mca("Weird SAL record");
 
 	/* Make index of processor error section */
 	mca_make_peidx((sal_log_processor_info_t*)
@@ -627,7 +753,7 @@ mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos)
 
 	/* Check whether MCA is global or not */
 	if (is_mca_global(&peidx, &pbci, sos))
-		return 0;
+		return fatal_mca("global MCA");
 	
 	/* Try to recover a processor error */
 	return recover_from_processor_error(platform_err, &slidx, &peidx,
diff --git a/arch/ia64/kernel/mca_drv.h b/arch/ia64/kernel/mca_drv.h
index e2f6fa1e0ef..53b8ecb5b4b 100644
--- a/arch/ia64/kernel/mca_drv.h
+++ b/arch/ia64/kernel/mca_drv.h
@@ -3,7 +3,7 @@
  * Purpose:	Define helpers for Generic MCA handling
  *
  * Copyright (C) 2004 FUJITSU LIMITED
- * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com)
+ * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
  */
 /*
  * Processor error section:
@@ -111,3 +111,12 @@ typedef struct slidx_table {
 	slidx_foreach_entry(__pos, &((slidx)->sec)) { __count++; }\
 	__count; })
 
+struct mca_table_entry {
+	int start_addr;	/* location-relative starting address of MCA recoverable range */
+	int end_addr;	/* location-relative ending address of MCA recoverable range */
+};
+
+extern const struct mca_table_entry *search_mca_tables (unsigned long addr);
+extern int mca_recover_range(unsigned long);
+extern void ia64_mlogbuf_dump(void);
+
diff --git a/arch/ia64/kernel/mca_drv_asm.S b/arch/ia64/kernel/mca_drv_asm.S
index 3f298ee4d00..767ac2c20d1 100644
--- a/arch/ia64/kernel/mca_drv_asm.S
+++ b/arch/ia64/kernel/mca_drv_asm.S
@@ -3,9 +3,8 @@
  * Purpose:     Assembly portion of Generic MCA handling
  *
  * Copyright (C) 2004 FUJITSU LIMITED
- * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com)
+ * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
  */
-#include <linux/config.h>
 #include <linux/threads.h>
 
 #include <asm/asmmacro.h>
@@ -14,15 +13,12 @@
 
 GLOBAL_ENTRY(mca_handler_bhhook)
 	invala				// clear RSE ?
-	;;
 	cover
 	;;
 	clrrrb
 	;;						
-	alloc	r16=ar.pfs,0,2,1,0	// make a new frame
-	;;
+	alloc	r16=ar.pfs,0,2,3,0	// make a new frame
 	mov	ar.rsc=0
-	;;
 	mov	r13=IA64_KR(CURRENT)	// current task pointer
 	;;
 	mov	r2=r13
@@ -30,7 +26,6 @@ GLOBAL_ENTRY(mca_handler_bhhook)
 	addl	r22=IA64_RBS_OFFSET,r2
 	;;
 	mov	ar.bspstore=r22
-	;;
 	addl	sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2
 	;;
 	adds	r2=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
@@ -40,12 +35,16 @@ GLOBAL_ENTRY(mca_handler_bhhook)
 	movl	loc1=mca_handler_bh	// recovery C function
 	;;
 	mov	out0=r8			// poisoned address
+	mov	out1=r9			// iip
+	mov	out2=r10		// psr
 	mov	b6=loc1
 	;;
 	mov	loc1=rp
+	ssm	psr.ic
 	;;
-	ssm	psr.i
+	srlz.i
 	;;
+	ssm	psr.i
 	br.call.sptk.many rp=b6		// does not return ...
 	;;
 	mov	ar.pfs=loc0
@@ -53,5 +52,4 @@ GLOBAL_ENTRY(mca_handler_bhhook)
 	;;
 	mov	r8=r0
 	br.ret.sptk.many rp
-	;;
 END(mca_handler_bhhook)
diff --git a/arch/ia64/kernel/minstate.h b/arch/ia64/kernel/minstate.h
index 85ed54179af..cc82a7d744c 100644
--- a/arch/ia64/kernel/minstate.h
+++ b/arch/ia64/kernel/minstate.h
@@ -1,8 +1,23 @@
-#include <linux/config.h>
 
 #include <asm/cache.h>
 
 #include "entry.h"
+#include "paravirt_inst.h"
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+/* read ar.itc in advance, and use it before leaving bank 0 */
+#define ACCOUNT_GET_STAMP				\
+(pUStk) mov.m r20=ar.itc;
+#define ACCOUNT_SYS_ENTER				\
+(pUStk) br.call.spnt rp=account_sys_enter		\
+	;;
+#else
+#define ACCOUNT_GET_STAMP
+#define ACCOUNT_SYS_ENTER
+#endif
+
+.section ".data..patch.rse", "a"
+.previous
 
 /*
  * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
@@ -29,16 +44,16 @@
  * Note that psr.ic is NOT turned on by this macro.  This is so that
  * we can pass interruption state as arguments to a handler.
  */
-#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)							\
+#define IA64_NATIVE_DO_SAVE_MIN(__COVER,SAVE_IFS,EXTRA,WORKAROUND)				\
 	mov r16=IA64_KR(CURRENT);	/* M */							\
 	mov r27=ar.rsc;			/* M */							\
 	mov r20=r1;			/* A */							\
 	mov r25=ar.unat;		/* M */							\
-	mov r29=cr.ipsr;		/* M */							\
+	MOV_FROM_IPSR(p0,r29);		/* M */							\
 	mov r26=ar.pfs;			/* I */							\
-	mov r28=cr.iip;			/* M */							\
+	MOV_FROM_IIP(r28);			/* M */						\
 	mov r21=ar.fpsr;		/* M */							\
-	COVER;				/* B;; (or nothing) */					\
+	__COVER;				/* B;; (or nothing) */				\
 	;;											\
 	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;						\
 	;;											\
@@ -76,6 +91,7 @@
 	tbit.nz p15,p0=r29,IA64_PSR_I_BIT;							\
 	mov r29=b0										\
 	;;											\
+	WORKAROUND;										\
 	adds r16=PT(R8),r1;	/* initialize first base pointer */				\
 	adds r17=PT(R9),r1;	/* initialize second base pointer */				\
 (pKStk)	mov r18=r0;		/* make sure r18 isn't NaT */					\
@@ -123,11 +139,13 @@
 	;;											\
 .mem.offset 0,0; st8.spill [r16]=r2,16;								\
 .mem.offset 8,0; st8.spill [r17]=r3,16;								\
+	ACCOUNT_GET_STAMP									\
 	adds r2=IA64_PT_REGS_R16_OFFSET,r1;							\
 	;;											\
 	EXTRA;											\
 	movl r1=__gp;		/* establish kernel global pointer */				\
 	;;											\
+	ACCOUNT_SYS_ENTER									\
 	bsw.1;			/* switch back to bank 1 (must be last in insn group) */	\
 	;;
 
@@ -193,6 +211,40 @@
 	st8 [r25]=r10;      	/* ar.ssd */	\
 	;;
 
-#define SAVE_MIN_WITH_COVER	DO_SAVE_MIN(cover, mov r30=cr.ifs,)
-#define SAVE_MIN_WITH_COVER_R19	DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
-#define SAVE_MIN		DO_SAVE_MIN(     , mov r30=r0, )
+#define RSE_WORKAROUND				\
+(pUStk) extr.u r17=r18,3,6;			\
+(pUStk)	sub r16=r18,r22;			\
+[1:](pKStk)	br.cond.sptk.many 1f;		\
+	.xdata4 ".data..patch.rse",1b-.		\
+	;;					\
+	cmp.ge p6,p7 = 33,r17;			\
+	;;					\
+(p6)	mov r17=0x310;				\
+(p7)	mov r17=0x308;				\
+	;;					\
+	cmp.leu p1,p0=r16,r17;			\
+(p1)	br.cond.sptk.many 1f;			\
+	dep.z r17=r26,0,62;			\
+	movl r16=2f;				\
+	;;					\
+	mov ar.pfs=r17;				\
+	dep r27=r0,r27,16,14;			\
+	mov b0=r16;				\
+	;;					\
+	br.ret.sptk b0;				\
+	;;					\
+2:						\
+	mov ar.rsc=r0				\
+	;;					\
+	flushrs;				\
+	;;					\
+	mov ar.bspstore=r22			\
+	;;					\
+	mov r18=ar.bsp;				\
+	;;					\
+1:						\
+	.pred.rel "mutex", pKStk, pUStk
+
+#define SAVE_MIN_WITH_COVER	DO_SAVE_MIN(COVER, mov r30=cr.ifs, , RSE_WORKAROUND)
+#define SAVE_MIN_WITH_COVER_R19	DO_SAVE_MIN(COVER, mov r30=cr.ifs, mov r15=r19, RSE_WORKAROUND)
+#define SAVE_MIN			DO_SAVE_MIN(     , mov r30=r0, , )
diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c
index 7a2f0a798d1..24603be24c1 100644
--- a/arch/ia64/kernel/module.c
+++ b/arch/ia64/kernel/module.c
@@ -25,7 +25,6 @@
    SEGREL64LSB
  */
 
-#include <linux/config.h>
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -136,15 +135,6 @@ static const char *reloc_name[256] = {
 
 #undef N
 
-struct got_entry {
-	uint64_t val;
-};
-
-struct fdesc {
-	uint64_t ip;
-	uint64_t gp;
-};
-
 /* Opaque struct for insns, to protect against derefs. */
 struct insn;
 
@@ -181,7 +171,8 @@ apply_imm60 (struct module *mod, struct insn *insn, uint64_t val)
 		return 0;
 	}
 	if (val + ((uint64_t) 1 << 59) >= (1UL << 60)) {
-		printk(KERN_ERR "%s: value %ld out of IMM60 range\n", mod->name, (int64_t) val);
+		printk(KERN_ERR "%s: value %ld out of IMM60 range\n",
+			mod->name, (long) val);
 		return 0;
 	}
 	ia64_patch_imm60((u64) insn, val);
@@ -192,7 +183,8 @@ static int
 apply_imm22 (struct module *mod, struct insn *insn, uint64_t val)
 {
 	if (val + (1 << 21) >= (1 << 22)) {
-		printk(KERN_ERR "%s: value %li out of IMM22 range\n", mod->name, (int64_t)val);
+		printk(KERN_ERR "%s: value %li out of IMM22 range\n",
+			mod->name, (long)val);
 		return 0;
 	}
 	ia64_patch((u64) insn, 0x01fffcfe000UL, (  ((val & 0x200000UL) << 15) /* bit 21 -> 36 */
@@ -206,7 +198,8 @@ static int
 apply_imm21b (struct module *mod, struct insn *insn, uint64_t val)
 {
 	if (val + (1 << 20) >= (1 << 21)) {
-		printk(KERN_ERR "%s: value %li out of IMM21b range\n", mod->name, (int64_t)val);
+		printk(KERN_ERR "%s: value %li out of IMM21b range\n",
+			mod->name, (long)val);
 		return 0;
 	}
 	ia64_patch((u64) insn, 0x11ffffe000UL, (  ((val & 0x100000UL) << 16) /* bit 20 -> 36 */
@@ -311,18 +304,11 @@ plt_target (struct plt_entry *plt)
 
 #endif /* !USE_BRL */
 
-void *
-module_alloc (unsigned long size)
-{
-	if (!size)
-		return NULL;
-	return vmalloc(size);
-}
-
 void
 module_free (struct module *mod, void *module_region)
 {
-	if (mod->arch.init_unw_table && module_region == mod->module_init) {
+	if (mod && mod->arch.init_unw_table &&
+	    module_region == mod->module_init) {
 		unw_remove_unwind_table(mod->arch.init_unw_table);
 		mod->arch.init_unw_table = NULL;
 	}
@@ -455,6 +441,14 @@ module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings,
 			mod->arch.opd = s;
 		else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0)
 			mod->arch.unwind = s;
+#ifdef CONFIG_PARAVIRT
+		else if (strcmp(".paravirt_bundles",
+				secstrings + s->sh_name) == 0)
+			mod->arch.paravirt_bundles = s;
+		else if (strcmp(".paravirt_insts",
+				secstrings + s->sh_name) == 0)
+			mod->arch.paravirt_insts = s;
+#endif
 
 	if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) {
 		printk(KERN_ERR "%s: sections missing\n", mod->name);
@@ -494,7 +488,7 @@ module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings,
 	mod->arch.opd->sh_addralign = 8;
 	mod->arch.opd->sh_size = fdescs * sizeof(struct fdesc);
 	DEBUGP("%s: core.plt=%lx, init.plt=%lx, got=%lx, fdesc=%lx\n",
-	       __FUNCTION__, mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size,
+	       __func__, mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size,
 	       mod->arch.got->sh_size, mod->arch.opd->sh_size);
 	return 0;
 }
@@ -534,8 +528,7 @@ get_ltoff (struct module *mod, uint64_t value, int *okp)
 			goto found;
 
 	/* Not enough GOT entries? */
-	if (e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size))
-		BUG();
+	BUG_ON(e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size));
 
 	e->val = value;
 	++mod->arch.next_got_entry;
@@ -586,7 +579,7 @@ get_plt (struct module *mod, const struct insn *insn, uint64_t value, int *okp)
 #if ARCH_MODULE_DEBUG
 	if (plt_target(plt) != target_ip) {
 		printk("%s: mistargeted PLT: wanted %lx, got %lx\n",
-		       __FUNCTION__, target_ip, plt_target(plt));
+		       __func__, target_ip, plt_target(plt));
 		*okp = 0;
 		return 0;
 	}
@@ -703,8 +696,9 @@ do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend,
 	      case RV_PCREL2:
 		if (r_type == R_IA64_PCREL21BI) {
 			if (!is_internal(mod, val)) {
-				printk(KERN_ERR "%s: %s reloc against non-local symbol (%lx)\n",
-				       __FUNCTION__, reloc_name[r_type], val);
+				printk(KERN_ERR "%s: %s reloc against "
+					"non-local symbol (%lx)\n", __func__,
+					reloc_name[r_type], (unsigned long)val);
 				return -ENOEXEC;
 			}
 			format = RF_INSN21B;
@@ -738,7 +732,7 @@ do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend,
 		      case R_IA64_LDXMOV:
 			if (gp_addressable(mod, val)) {
 				/* turn "ld8" into "mov": */
-				DEBUGP("%s: patching ld8 at %p to mov\n", __FUNCTION__, location);
+				DEBUGP("%s: patching ld8 at %p to mov\n", __func__, location);
 				ia64_patch((u64) location, 0x1fff80fe000UL, 0x10000000000UL);
 			}
 			return 0;
@@ -772,7 +766,7 @@ do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend,
 	if (!ok)
 		return -ENOEXEC;
 
-	DEBUGP("%s: [%p]<-%016lx = %s(%lx)\n", __FUNCTION__, location, val,
+	DEBUGP("%s: [%p]<-%016lx = %s(%lx)\n", __func__, location, val,
 	       reloc_name[r_type] ? reloc_name[r_type] : "?", sym->st_value + addend);
 
 	switch (format) {
@@ -808,7 +802,7 @@ apply_relocate_add (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symind
 	Elf64_Shdr *target_sec;
 	int ret;
 
-	DEBUGP("%s: applying section %u (%u relocs) to %u\n", __FUNCTION__,
+	DEBUGP("%s: applying section %u (%u relocs) to %u\n", __func__,
 	       relsec, n, sechdrs[relsec].sh_info);
 
 	target_sec = sechdrs + sechdrs[relsec].sh_info;
@@ -836,7 +830,7 @@ apply_relocate_add (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symind
 			gp = mod->core_size / 2;
 		gp = (uint64_t) mod->module_core + ((gp + 7) & -8);
 		mod->arch.gp = gp;
-		DEBUGP("%s: placing gp at 0x%lx\n", __FUNCTION__, gp);
+		DEBUGP("%s: placing gp at 0x%lx\n", __func__, gp);
 	}
 
 	for (i = 0; i < n; i++) {
@@ -851,18 +845,10 @@ apply_relocate_add (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symind
 	return 0;
 }
 
-int
-apply_relocate (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex,
-		unsigned int relsec, struct module *mod)
-{
-	printk(KERN_ERR "module %s: REL relocs in section %u unsupported\n", mod->name, relsec);
-	return -ENOEXEC;
-}
-
 /*
  * Modules contain a single unwind table which covers both the core and the init text
  * sections but since the two are not contiguous, we need to split this table up such that
- * we can register (and unregister) each "segment" seperately.  Fortunately, this sounds
+ * we can register (and unregister) each "segment" separately.  Fortunately, this sounds
  * more complicated than it really is.
  */
 static void
@@ -904,7 +890,7 @@ register_unwind_table (struct module *mod)
 		init = start + num_core;
 	}
 
-	DEBUGP("%s: name=%s, gp=%lx, num_init=%lu, num_core=%lu\n", __FUNCTION__,
+	DEBUGP("%s: name=%s, gp=%lx, num_init=%lu, num_core=%lu\n", __func__,
 	       mod->name, mod->arch.gp, num_init, num_core);
 
 	/*
@@ -913,13 +899,13 @@ register_unwind_table (struct module *mod)
 	if (num_core > 0) {
 		mod->arch.core_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp,
 								core, core + num_core);
-		DEBUGP("%s:  core: handle=%p [%p-%p)\n", __FUNCTION__,
+		DEBUGP("%s:  core: handle=%p [%p-%p)\n", __func__,
 		       mod->arch.core_unw_table, core, core + num_core);
 	}
 	if (num_init > 0) {
 		mod->arch.init_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp,
 								init, init + num_init);
-		DEBUGP("%s:  init: handle=%p [%p-%p)\n", __FUNCTION__,
+		DEBUGP("%s:  init: handle=%p [%p-%p)\n", __func__,
 		       mod->arch.init_unw_table, init, init + num_init);
 	}
 }
@@ -927,9 +913,33 @@ register_unwind_table (struct module *mod)
 int
 module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod)
 {
-	DEBUGP("%s: init: entry=%p\n", __FUNCTION__, mod->init);
+	DEBUGP("%s: init: entry=%p\n", __func__, mod->init);
 	if (mod->arch.unwind)
 		register_unwind_table(mod);
+#ifdef CONFIG_PARAVIRT
+        if (mod->arch.paravirt_bundles) {
+                struct paravirt_patch_site_bundle *start =
+                        (struct paravirt_patch_site_bundle *)
+                        mod->arch.paravirt_bundles->sh_addr;
+                struct paravirt_patch_site_bundle *end =
+                        (struct paravirt_patch_site_bundle *)
+                        (mod->arch.paravirt_bundles->sh_addr +
+                         mod->arch.paravirt_bundles->sh_size);
+
+                paravirt_patch_apply_bundle(start, end);
+        }
+        if (mod->arch.paravirt_insts) {
+                struct paravirt_patch_site_inst *start =
+                        (struct paravirt_patch_site_inst *)
+                        mod->arch.paravirt_insts->sh_addr;
+                struct paravirt_patch_site_inst *end =
+                        (struct paravirt_patch_site_inst *)
+                        (mod->arch.paravirt_insts->sh_addr +
+                         mod->arch.paravirt_insts->sh_size);
+
+                paravirt_patch_apply_inst(start, end);
+        }
+#endif
 	return 0;
 }
 
@@ -941,14 +951,3 @@ module_arch_cleanup (struct module *mod)
 	if (mod->arch.core_unw_table)
 		unw_remove_unwind_table(mod->arch.core_unw_table);
 }
-
-#ifdef CONFIG_SMP
-void
-percpu_modcopy (void *pcpudst, const void *src, unsigned long size)
-{
-	unsigned int i;
-	for_each_cpu(i) {
-		memcpy(pcpudst + __per_cpu_offset[i], src, size);
-	}
-}
-#endif /* CONFIG_SMP */
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
new file mode 100644
index 00000000000..c430f9198d1
--- /dev/null
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -0,0 +1,208 @@
+/*
+ * MSI hooks for standard x86 apic
+ */
+
+#include <linux/pci.h>
+#include <linux/irq.h>
+#include <linux/msi.h>
+#include <linux/dmar.h>
+#include <asm/smp.h>
+#include <asm/msidef.h>
+
+static struct irq_chip	ia64_msi_chip;
+
+#ifdef CONFIG_SMP
+static int ia64_set_msi_irq_affinity(struct irq_data *idata,
+				     const cpumask_t *cpu_mask, bool force)
+{
+	struct msi_msg msg;
+	u32 addr, data;
+	int cpu = cpumask_first_and(cpu_mask, cpu_online_mask);
+	unsigned int irq = idata->irq;
+
+	if (irq_prepare_move(irq, cpu))
+		return -1;
+
+	get_cached_msi_msg(irq, &msg);
+
+	addr = msg.address_lo;
+	addr &= MSI_ADDR_DEST_ID_MASK;
+	addr |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
+	msg.address_lo = addr;
+
+	data = msg.data;
+	data &= MSI_DATA_VECTOR_MASK;
+	data |= MSI_DATA_VECTOR(irq_to_vector(irq));
+	msg.data = data;
+
+	write_msi_msg(irq, &msg);
+	cpumask_copy(idata->affinity, cpumask_of(cpu));
+
+	return 0;
+}
+#endif /* CONFIG_SMP */
+
+int ia64_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
+{
+	struct msi_msg	msg;
+	unsigned long	dest_phys_id;
+	int	irq, vector;
+	cpumask_t mask;
+
+	irq = create_irq();
+	if (irq < 0)
+		return irq;
+
+	irq_set_msi_desc(irq, desc);
+	cpumask_and(&mask, &(irq_to_domain(irq)), cpu_online_mask);
+	dest_phys_id = cpu_physical_id(first_cpu(mask));
+	vector = irq_to_vector(irq);
+
+	msg.address_hi = 0;
+	msg.address_lo =
+		MSI_ADDR_HEADER |
+		MSI_ADDR_DEST_MODE_PHYS |
+		MSI_ADDR_REDIRECTION_CPU |
+		MSI_ADDR_DEST_ID_CPU(dest_phys_id);
+
+	msg.data =
+		MSI_DATA_TRIGGER_EDGE |
+		MSI_DATA_LEVEL_ASSERT |
+		MSI_DATA_DELIVERY_FIXED |
+		MSI_DATA_VECTOR(vector);
+
+	write_msi_msg(irq, &msg);
+	irq_set_chip_and_handler(irq, &ia64_msi_chip, handle_edge_irq);
+
+	return 0;
+}
+
+void ia64_teardown_msi_irq(unsigned int irq)
+{
+	destroy_irq(irq);
+}
+
+static void ia64_ack_msi_irq(struct irq_data *data)
+{
+	irq_complete_move(data->irq);
+	irq_move_irq(data);
+	ia64_eoi();
+}
+
+static int ia64_msi_retrigger_irq(struct irq_data *data)
+{
+	unsigned int vector = irq_to_vector(data->irq);
+	ia64_resend_irq(vector);
+
+	return 1;
+}
+
+/*
+ * Generic ops used on most IA64 platforms.
+ */
+static struct irq_chip ia64_msi_chip = {
+	.name			= "PCI-MSI",
+	.irq_mask		= mask_msi_irq,
+	.irq_unmask		= unmask_msi_irq,
+	.irq_ack		= ia64_ack_msi_irq,
+#ifdef CONFIG_SMP
+	.irq_set_affinity	= ia64_set_msi_irq_affinity,
+#endif
+	.irq_retrigger		= ia64_msi_retrigger_irq,
+};
+
+
+int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
+{
+	if (platform_setup_msi_irq)
+		return platform_setup_msi_irq(pdev, desc);
+
+	return ia64_setup_msi_irq(pdev, desc);
+}
+
+void arch_teardown_msi_irq(unsigned int irq)
+{
+	if (platform_teardown_msi_irq)
+		return platform_teardown_msi_irq(irq);
+
+	return ia64_teardown_msi_irq(irq);
+}
+
+#ifdef CONFIG_INTEL_IOMMU
+#ifdef CONFIG_SMP
+static int dmar_msi_set_affinity(struct irq_data *data,
+				 const struct cpumask *mask, bool force)
+{
+	unsigned int irq = data->irq;
+	struct irq_cfg *cfg = irq_cfg + irq;
+	struct msi_msg msg;
+	int cpu = cpumask_first_and(mask, cpu_online_mask);
+
+	if (irq_prepare_move(irq, cpu))
+		return -1;
+
+	dmar_msi_read(irq, &msg);
+
+	msg.data &= ~MSI_DATA_VECTOR_MASK;
+	msg.data |= MSI_DATA_VECTOR(cfg->vector);
+	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+	msg.address_lo |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
+
+	dmar_msi_write(irq, &msg);
+	cpumask_copy(data->affinity, mask);
+
+	return 0;
+}
+#endif /* CONFIG_SMP */
+
+static struct irq_chip dmar_msi_type = {
+	.name = "DMAR_MSI",
+	.irq_unmask = dmar_msi_unmask,
+	.irq_mask = dmar_msi_mask,
+	.irq_ack = ia64_ack_msi_irq,
+#ifdef CONFIG_SMP
+	.irq_set_affinity = dmar_msi_set_affinity,
+#endif
+	.irq_retrigger = ia64_msi_retrigger_irq,
+};
+
+static int
+msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+{
+	struct irq_cfg *cfg = irq_cfg + irq;
+	unsigned dest;
+	cpumask_t mask;
+
+	cpumask_and(&mask, &(irq_to_domain(irq)), cpu_online_mask);
+	dest = cpu_physical_id(first_cpu(mask));
+
+	msg->address_hi = 0;
+	msg->address_lo =
+		MSI_ADDR_HEADER |
+		MSI_ADDR_DEST_MODE_PHYS |
+		MSI_ADDR_REDIRECTION_CPU |
+		MSI_ADDR_DEST_ID_CPU(dest);
+
+	msg->data =
+		MSI_DATA_TRIGGER_EDGE |
+		MSI_DATA_LEVEL_ASSERT |
+		MSI_DATA_DELIVERY_FIXED |
+		MSI_DATA_VECTOR(cfg->vector);
+	return 0;
+}
+
+int arch_setup_dmar_msi(unsigned int irq)
+{
+	int ret;
+	struct msi_msg msg;
+
+	ret = msi_compose_msg(NULL, irq, &msg);
+	if (ret < 0)
+		return ret;
+	dmar_msi_write(irq, &msg);
+	irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
+				      "edge");
+	return 0;
+}
+#endif /* CONFIG_INTEL_IOMMU */
+
diff --git a/arch/ia64/kernel/nr-irqs.c b/arch/ia64/kernel/nr-irqs.c
new file mode 100644
index 00000000000..f6769cd54bd
--- /dev/null
+++ b/arch/ia64/kernel/nr-irqs.c
@@ -0,0 +1,21 @@
+/*
+ * calculate
+ * NR_IRQS = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, FOO_NR_IRQS...)
+ * depending on config.
+ * This must be calculated before processing asm-offset.c.
+ */
+
+#define ASM_OFFSETS_C 1
+
+#include <linux/kbuild.h>
+#include <linux/threads.h>
+#include <asm/native/irq.h>
+
+void foo(void)
+{
+	union paravirt_nr_irqs_max {
+		char ia64_native_nr_irqs[IA64_NATIVE_NR_IRQS];
+	};
+
+	DEFINE(NR_IRQS, sizeof (union paravirt_nr_irqs_max));
+}
diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c
index a68ce667809..d288cde9360 100644
--- a/arch/ia64/kernel/numa.c
+++ b/arch/ia64/kernel/numa.c
@@ -19,16 +19,46 @@
  * Copyright (C) 2004 Silicon Graphics, Inc.
  *   Jesse Barnes <jbarnes@sgi.com>
  */
-#include <linux/config.h>
 #include <linux/topology.h>
 #include <linux/module.h>
 #include <asm/processor.h>
 #include <asm/smp.h>
 
-u8 cpu_to_node_map[NR_CPUS] __cacheline_aligned;
+u16 cpu_to_node_map[NR_CPUS] __cacheline_aligned;
 EXPORT_SYMBOL(cpu_to_node_map);
 
 cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned;
+EXPORT_SYMBOL(node_to_cpu_mask);
+
+void map_cpu_to_node(int cpu, int nid)
+{
+	int oldnid;
+	if (nid < 0) { /* just initialize by zero */
+		cpu_to_node_map[cpu] = 0;
+		return;
+	}
+	/* sanity check first */
+	oldnid = cpu_to_node_map[cpu];
+	if (cpu_isset(cpu, node_to_cpu_mask[oldnid])) {
+		return; /* nothing to do */
+	}
+	/* we don't have cpu-driven node hot add yet...
+	   In usual case, node is created from SRAT at boot time. */
+	if (!node_online(nid))
+		nid = first_online_node;
+	cpu_to_node_map[cpu] = nid;
+	cpu_set(cpu, node_to_cpu_mask[nid]);
+	return;
+}
+
+void unmap_cpu_from_node(int cpu, int nid)
+{
+	WARN_ON(!cpu_isset(cpu, node_to_cpu_mask[nid]));
+	WARN_ON(cpu_to_node_map[cpu] != nid);
+	cpu_to_node_map[cpu] = 0;
+	cpu_clear(cpu, node_to_cpu_mask[nid]);
+}
+
 
 /**
  * build_cpu_to_node_map - setup cpu to node and node to cpumask arrays
@@ -43,15 +73,13 @@ void __init build_cpu_to_node_map(void)
 	for(node=0; node < MAX_NUMNODES; node++)
 		cpus_clear(node_to_cpu_mask[node]);
 
-	for(cpu = 0; cpu < NR_CPUS; ++cpu) {
+	for_each_possible_early_cpu(cpu) {
 		node = -1;
 		for (i = 0; i < NR_CPUS; ++i)
 			if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) {
 				node = node_cpuid[i].nid;
 				break;
 			}
-		cpu_to_node_map[cpu] = (node >= 0) ? node : 0;
-		if (node >= 0)
-			cpu_set(cpu, node_to_cpu_mask[node]);
+		map_cpu_to_node(cpu, node);
 	}
 }
diff --git a/arch/ia64/kernel/pal.S b/arch/ia64/kernel/pal.S
index 5018c7f2e7a..0b533441c3c 100644
--- a/arch/ia64/kernel/pal.S
+++ b/arch/ia64/kernel/pal.S
@@ -21,11 +21,12 @@ pal_entry_point:
 	.text
 
 /*
- * Set the PAL entry point address.  This could be written in C code, but we do it here
- * to keep it all in one module (besides, it's so trivial that it's
+ * Set the PAL entry point address.  This could be written in C code, but we
+ * do it here to keep it all in one module (besides, it's so trivial that it's
  * not a big deal).
  *
- * in0		Address of the PAL entry point (text address, NOT a function descriptor).
+ * in0		Address of the PAL entry point (text address, NOT a function
+ *		descriptor).
  */
 GLOBAL_ENTRY(ia64_pal_handler_init)
 	alloc r3=ar.pfs,1,0,0,0
@@ -36,9 +37,9 @@ GLOBAL_ENTRY(ia64_pal_handler_init)
 END(ia64_pal_handler_init)
 
 /*
- * Default PAL call handler.  This needs to be coded in assembly because it uses
- * the static calling convention, i.e., the RSE may not be used and calls are
- * done via "br.cond" (not "br.call").
+ * Default PAL call handler.  This needs to be coded in assembly because it
+ * uses the static calling convention, i.e., the RSE may not be used and
+ * calls are done via "br.cond" (not "br.call").
  */
 GLOBAL_ENTRY(ia64_pal_default_handler)
 	mov r8=-1
@@ -50,12 +51,10 @@ END(ia64_pal_default_handler)
  *
  * in0         Index of PAL service
  * in1 - in3   Remaining PAL arguments
- * in4	       1 ==> clear psr.ic,  0 ==> don't clear psr.ic
- *
  */
 GLOBAL_ENTRY(ia64_pal_call_static)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
-	alloc loc1 = ar.pfs,5,5,0,0
+	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
+	alloc loc1 = ar.pfs,4,5,0,0
 	movl loc2 = pal_entry_point
 1:	{
 	  mov r28 = in0
@@ -64,7 +63,6 @@ GLOBAL_ENTRY(ia64_pal_call_static)
 	}
 	;;
 	ld8 loc2 = [loc2]		// loc2 <- entry point
-	tbit.nz p6,p7 = in4, 0
 	adds r8 = 1f-1b,r8
 	mov loc4=ar.rsc			// save RSE configuration
 	;;
@@ -74,13 +72,11 @@ GLOBAL_ENTRY(ia64_pal_call_static)
 	.body
 	mov r30 = in2
 
-(p6)	rsm psr.i | psr.ic
 	mov r31 = in3
 	mov b7 = loc2
 
-(p7)	rsm psr.i
+	rsm psr.i
 	;;
-(p6)	srlz.i
 	mov rp = r8
 	br.cond.sptk.many b7
 1:	mov psr.l = loc3
@@ -96,8 +92,8 @@ END(ia64_pal_call_static)
  * Make a PAL call using the stacked registers calling convention.
  *
  * Inputs:
- * 	in0         Index of PAL service
- * 	in2 - in3   Remaning PAL arguments
+ *	in0         Index of PAL service
+ *	in2 - in3   Remaining PAL arguments
  */
 GLOBAL_ENTRY(ia64_pal_call_stacked)
 	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
@@ -131,18 +127,18 @@ END(ia64_pal_call_stacked)
  * Make a physical mode PAL call using the static registers calling convention.
  *
  * Inputs:
- * 	in0         Index of PAL service
- * 	in2 - in3   Remaning PAL arguments
+ *	in0         Index of PAL service
+ *	in2 - in3   Remaining PAL arguments
  *
  * PSR_LP, PSR_TB, PSR_ID, PSR_DA are never set by the kernel.
  * So we don't need to clear them.
  */
-#define PAL_PSR_BITS_TO_CLEAR							\
-	(IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT  | IA64_PSR_DB | IA64_PSR_RT |	\
-	 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED |		\
+#define PAL_PSR_BITS_TO_CLEAR						      \
+	(IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT  | IA64_PSR_DB | IA64_PSR_RT |\
+	 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED |	      \
 	 IA64_PSR_DFL | IA64_PSR_DFH)
 
-#define PAL_PSR_BITS_TO_SET							\
+#define PAL_PSR_BITS_TO_SET						      \
 	(IA64_PSR_BN)
 
 
@@ -178,7 +174,7 @@ GLOBAL_ENTRY(ia64_pal_call_phys_static)
 	;;
 	andcm r16=loc3,r16		// removes bits to clear from psr
 	br.call.sptk.many rp=ia64_switch_mode_phys
-.ret1:	mov rp = r8			// install return address (physical)
+	mov rp = r8			// install return address (physical)
 	mov loc5 = r19
 	mov loc6 = r20
 	br.cond.sptk.many b7
@@ -188,7 +184,6 @@ GLOBAL_ENTRY(ia64_pal_call_phys_static)
 	mov r19=loc5
 	mov r20=loc6
 	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
-.ret2:
 	mov psr.l = loc3		// restore init PSR
 
 	mov ar.pfs = loc1
@@ -203,8 +198,8 @@ END(ia64_pal_call_phys_static)
  * Make a PAL call using the stacked registers in physical mode.
  *
  * Inputs:
- * 	in0         Index of PAL service
- * 	in2 - in3   Remaning PAL arguments
+ *	in0         Index of PAL service
+ *	in2 - in3   Remaining PAL arguments
  */
 GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
 	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
@@ -212,17 +207,12 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
 	movl	loc2 = pal_entry_point
 1:	{
 	  mov r28  = in0		// copy procedure index
-	  mov loc0 = rp		// save rp
+	  mov loc0 = rp			// save rp
 	}
 	.body
 	;;
 	ld8 loc2 = [loc2]		// loc2 <- entry point
-	mov out0 = in0		// first argument
-	mov out1 = in1		// copy arg2
-	mov out2 = in2		// copy arg3
-	mov out3 = in3		// copy arg3
-	;;
-	mov loc3 = psr		// save psr
+	mov loc3 = psr			// save psr
 	;;
 	mov loc4=ar.rsc			// save RSE configuration
 	dep.z loc2=loc2,0,61		// convert pal entry point to physical
@@ -236,18 +226,23 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
 	;;
 	andcm r16=loc3,r16		// removes bits to clear from psr
 	br.call.sptk.many rp=ia64_switch_mode_phys
-.ret6:
+
+	mov out0 = in0			// first argument
+	mov out1 = in1			// copy arg2
+	mov out2 = in2			// copy arg3
+	mov out3 = in3			// copy arg3
 	mov loc5 = r19
 	mov loc6 = r20
+
 	br.call.sptk.many rp=b7		// now make the call
-.ret7:
+
 	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
 	mov r16=loc3			// r16= original psr
 	mov r19=loc5
 	mov r20=loc6
-	br.call.sptk.many rp=ia64_switch_mode_virt	// return to virtual mode
+	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
 
-.ret8:	mov psr.l  = loc3		// restore init PSR
+	mov psr.l  = loc3		// restore init PSR
 	mov ar.pfs = loc1
 	mov rp = loc0
 	;;
@@ -257,10 +252,11 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
 END(ia64_pal_call_phys_stacked)
 
 /*
- * Save scratch fp scratch regs which aren't saved in pt_regs already (fp10-fp15).
+ * Save scratch fp scratch regs which aren't saved in pt_regs already
+ * (fp10-fp15).
  *
- * NOTE: We need to do this since firmware (SAL and PAL) may use any of the scratch
- * regs fp-low partition.
+ * NOTE: We need to do this since firmware (SAL and PAL) may use any of the
+ * scratch regs fp-low partition.
  *
  * Inputs:
  *      in0	Address of stack storage for fp regs
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
index 89faa603c6b..c39c3cd3ac3 100644
--- a/arch/ia64/kernel/palinfo.c
+++ b/arch/ia64/kernel/palinfo.c
@@ -16,12 +16,13 @@
  * 02/05/2001   S.Eranian	fixed module support
  * 10/23/2001	S.Eranian	updated pal_perf_mon_info bug fixes
  * 03/24/2004	Ashok Raj	updated to work with CPU Hotplug
+ * 10/26/2006   Russ Anderson	updated processor features to rev 2.2 spec
  */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/efi.h>
@@ -41,7 +42,7 @@ MODULE_LICENSE("GPL");
 
 #define PALINFO_VERSION "0.5"
 
-typedef int (*palinfo_func_t)(char*);
+typedef int (*palinfo_func_t)(struct seq_file *);
 
 typedef struct {
 	const char		*name;		/* name of the proc entry */
@@ -54,7 +55,7 @@ typedef struct {
  *  A bunch of string array to get pretty printing
  */
 
-static char *cache_types[] = {
+static const char *cache_types[] = {
 	"",			/* not used */
 	"Instruction",
 	"Data",
@@ -122,19 +123,16 @@ static const char *mem_attrib[]={
  *	- a pointer to the end of the buffer
  *
  */
-static char *
-bitvector_process(char *p, u64 vector)
+static void bitvector_process(struct seq_file *m, u64 vector)
 {
 	int i,j;
-	const char *units[]={ "", "K", "M", "G", "T" };
+	static const char *units[]={ "", "K", "M", "G", "T" };
 
 	for (i=0, j=0; i < 64; i++ , j=i/10) {
-		if (vector & 0x1) {
-			p += sprintf(p, "%d%s ", 1 << (i-j*10), units[j]);
-		}
+		if (vector & 0x1)
+			seq_printf(m, "%d%s ", 1 << (i-j*10), units[j]);
 		vector >>= 1;
 	}
-	return p;
 }
 
 /*
@@ -149,8 +147,7 @@ bitvector_process(char *p, u64 vector)
  *	- a pointer to the end of the buffer
  *
  */
-static char *
-bitregister_process(char *p, u64 *reg_info, int max)
+static void bitregister_process(struct seq_file *m, u64 *reg_info, int max)
 {
 	int i, begin, skip = 0;
 	u64 value = reg_info[0];
@@ -163,9 +160,9 @@ bitregister_process(char *p, u64 *reg_info, int max)
 
 		if ((value & 0x1) == 0 && skip == 0) {
 			if (begin  <= i - 2)
-				p += sprintf(p, "%d-%d ", begin, i-1);
+				seq_printf(m, "%d-%d ", begin, i-1);
 			else
-				p += sprintf(p, "%d ", i-1);
+				seq_printf(m, "%d ", i-1);
 			skip  = 1;
 			begin = -1;
 		} else if ((value & 0x1) && skip == 1) {
@@ -176,19 +173,15 @@ bitregister_process(char *p, u64 *reg_info, int max)
 	}
 	if (begin > -1) {
 		if (begin < 127)
-			p += sprintf(p, "%d-127", begin);
+			seq_printf(m, "%d-127", begin);
 		else
-			p += sprintf(p, "127");
+			seq_puts(m, "127");
 	}
-
-	return p;
 }
 
-static int
-power_info(char *page)
+static int power_info(struct seq_file *m)
 {
 	s64 status;
-	char *p = page;
 	u64 halt_info_buffer[8];
 	pal_power_mgmt_info_u_t *halt_info =(pal_power_mgmt_info_u_t *)halt_info_buffer;
 	int i;
@@ -198,103 +191,103 @@ power_info(char *page)
 
 	for (i=0; i < 8 ; i++ ) {
 		if (halt_info[i].pal_power_mgmt_info_s.im == 1) {
-			p += sprintf(p,	"Power level %d:\n"
-				     "\tentry_latency       : %d cycles\n"
-				     "\texit_latency        : %d cycles\n"
-				     "\tpower consumption   : %d mW\n"
-				     "\tCache+TLB coherency : %s\n", i,
-				     halt_info[i].pal_power_mgmt_info_s.entry_latency,
-				     halt_info[i].pal_power_mgmt_info_s.exit_latency,
-				     halt_info[i].pal_power_mgmt_info_s.power_consumption,
-				     halt_info[i].pal_power_mgmt_info_s.co ? "Yes" : "No");
+			seq_printf(m,
+				   "Power level %d:\n"
+				   "\tentry_latency       : %d cycles\n"
+				   "\texit_latency        : %d cycles\n"
+				   "\tpower consumption   : %d mW\n"
+				   "\tCache+TLB coherency : %s\n", i,
+				   halt_info[i].pal_power_mgmt_info_s.entry_latency,
+				   halt_info[i].pal_power_mgmt_info_s.exit_latency,
+				   halt_info[i].pal_power_mgmt_info_s.power_consumption,
+				   halt_info[i].pal_power_mgmt_info_s.co ? "Yes" : "No");
 		} else {
-			p += sprintf(p,"Power level %d: not implemented\n",i);
+			seq_printf(m,"Power level %d: not implemented\n", i);
 		}
 	}
-	return p - page;
+	return 0;
 }
 
-static int
-cache_info(char *page)
+static int cache_info(struct seq_file *m)
 {
-	char *p = page;
-	u64 i, levels, unique_caches;
+	unsigned long i, levels, unique_caches;
 	pal_cache_config_info_t cci;
 	int j, k;
-	s64 status;
+	long status;
 
 	if ((status = ia64_pal_cache_summary(&levels, &unique_caches)) != 0) {
 		printk(KERN_ERR "ia64_pal_cache_summary=%ld\n", status);
 		return 0;
 	}
 
-	p += sprintf(p, "Cache levels  : %ld\nUnique caches : %ld\n\n", levels, unique_caches);
+	seq_printf(m, "Cache levels  : %ld\nUnique caches : %ld\n\n",
+		   levels, unique_caches);
 
 	for (i=0; i < levels; i++) {
-
 		for (j=2; j >0 ; j--) {
-
 			/* even without unification some level may not be present */
-			if ((status=ia64_pal_cache_config_info(i,j, &cci)) != 0) {
+			if ((status=ia64_pal_cache_config_info(i,j, &cci)) != 0)
 				continue;
-			}
-			p += sprintf(p,
-				     "%s Cache level %lu:\n"
-				     "\tSize           : %lu bytes\n"
-				     "\tAttributes     : ",
-				     cache_types[j+cci.pcci_unified], i+1,
-				     cci.pcci_cache_size);
-
-			if (cci.pcci_unified) p += sprintf(p, "Unified ");
-
-			p += sprintf(p, "%s\n", cache_mattrib[cci.pcci_cache_attr]);
-
-			p += sprintf(p,
-				     "\tAssociativity  : %d\n"
-				     "\tLine size      : %d bytes\n"
-				     "\tStride         : %d bytes\n",
-				     cci.pcci_assoc, 1<<cci.pcci_line_size, 1<<cci.pcci_stride);
+
+			seq_printf(m,
+				   "%s Cache level %lu:\n"
+				   "\tSize           : %u bytes\n"
+				   "\tAttributes     : ",
+				   cache_types[j+cci.pcci_unified], i+1,
+				   cci.pcci_cache_size);
+
+			if (cci.pcci_unified)
+				seq_puts(m, "Unified ");
+
+			seq_printf(m, "%s\n", cache_mattrib[cci.pcci_cache_attr]);
+
+			seq_printf(m,
+				   "\tAssociativity  : %d\n"
+				   "\tLine size      : %d bytes\n"
+				   "\tStride         : %d bytes\n",
+				   cci.pcci_assoc,
+				   1<<cci.pcci_line_size,
+				   1<<cci.pcci_stride);
 			if (j == 1)
-				p += sprintf(p, "\tStore latency  : N/A\n");
+				seq_puts(m, "\tStore latency  : N/A\n");
 			else
-				p += sprintf(p, "\tStore latency  : %d cycle(s)\n",
-						cci.pcci_st_latency);
+				seq_printf(m, "\tStore latency  : %d cycle(s)\n",
+					   cci.pcci_st_latency);
 
-			p += sprintf(p,
-				     "\tLoad latency   : %d cycle(s)\n"
-				     "\tStore hints    : ", cci.pcci_ld_latency);
+			seq_printf(m,
+				   "\tLoad latency   : %d cycle(s)\n"
+				   "\tStore hints    : ", cci.pcci_ld_latency);
 
 			for(k=0; k < 8; k++ ) {
 				if ( cci.pcci_st_hints & 0x1)
-					p += sprintf(p, "[%s]", cache_st_hints[k]);
+					seq_printf(m, "[%s]", cache_st_hints[k]);
 				cci.pcci_st_hints >>=1;
 			}
-			p += sprintf(p, "\n\tLoad hints     : ");
+			seq_puts(m, "\n\tLoad hints     : ");
 
 			for(k=0; k < 8; k++ ) {
 				if (cci.pcci_ld_hints & 0x1)
-					p += sprintf(p, "[%s]", cache_ld_hints[k]);
+					seq_printf(m, "[%s]", cache_ld_hints[k]);
 				cci.pcci_ld_hints >>=1;
 			}
-			p += sprintf(p,
-				     "\n\tAlias boundary : %d byte(s)\n"
-				     "\tTag LSB        : %d\n"
-				     "\tTag MSB        : %d\n",
-				     1<<cci.pcci_alias_boundary, cci.pcci_tag_lsb,
-				     cci.pcci_tag_msb);
+			seq_printf(m,
+				   "\n\tAlias boundary : %d byte(s)\n"
+				   "\tTag LSB        : %d\n"
+				   "\tTag MSB        : %d\n",
+				   1<<cci.pcci_alias_boundary, cci.pcci_tag_lsb,
+				   cci.pcci_tag_msb);
 
 			/* when unified, data(j=2) is enough */
-			if (cci.pcci_unified) break;
+			if (cci.pcci_unified)
+				break;
 		}
 	}
-	return p - page;
+	return 0;
 }
 
 
-static int
-vm_info(char *page)
+static int vm_info(struct seq_file *m)
 {
-	char *p = page;
 	u64 tr_pages =0, vw_pages=0, tc_pages;
 	u64 attrib;
 	pal_vm_info_1_u_t vm_info_1;
@@ -303,63 +296,70 @@ vm_info(char *page)
 	ia64_ptce_info_t ptce;
 	const char *sep;
 	int i, j;
-	s64 status;
+	long status;
 
 	if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) {
 		printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status);
 	} else {
 
-		p += sprintf(p,
+		seq_printf(m,
 		     "Physical Address Space         : %d bits\n"
 		     "Virtual Address Space          : %d bits\n"
 		     "Protection Key Registers(PKR)  : %d\n"
 		     "Implemented bits in PKR.key    : %d\n"
 		     "Hash Tag ID                    : 0x%x\n"
-		     "Size of RR.rid                 : %d\n",
+		     "Size of RR.rid                 : %d\n"
+		     "Max Purges                     : ",
 		     vm_info_1.pal_vm_info_1_s.phys_add_size,
 		     vm_info_2.pal_vm_info_2_s.impl_va_msb+1,
 		     vm_info_1.pal_vm_info_1_s.max_pkr+1,
 		     vm_info_1.pal_vm_info_1_s.key_size,
 		     vm_info_1.pal_vm_info_1_s.hash_tag_id,
 		     vm_info_2.pal_vm_info_2_s.rid_size);
+		if (vm_info_2.pal_vm_info_2_s.max_purges == PAL_MAX_PURGES)
+			seq_puts(m, "unlimited\n");
+		else
+			seq_printf(m, "%d\n",
+		     		vm_info_2.pal_vm_info_2_s.max_purges ?
+				vm_info_2.pal_vm_info_2_s.max_purges : 1);
 	}
 
 	if (ia64_pal_mem_attrib(&attrib) == 0) {
-		p += sprintf(p, "Supported memory attributes    : ");
+		seq_puts(m, "Supported memory attributes    : ");
 		sep = "";
 		for (i = 0; i < 8; i++) {
 			if (attrib & (1 << i)) {
-				p += sprintf(p, "%s%s", sep, mem_attrib[i]);
+				seq_printf(m, "%s%s", sep, mem_attrib[i]);
 				sep = ", ";
 			}
 		}
-		p += sprintf(p, "\n");
+		seq_putc(m, '\n');
 	}
 
 	if ((status = ia64_pal_vm_page_size(&tr_pages, &vw_pages)) !=0) {
 		printk(KERN_ERR "ia64_pal_vm_page_size=%ld\n", status);
 	} else {
 
-		p += sprintf(p,
-			     "\nTLB walker                     : %simplemented\n"
-			     "Number of DTR                  : %d\n"
-			     "Number of ITR                  : %d\n"
-			     "TLB insertable page sizes      : ",
-			     vm_info_1.pal_vm_info_1_s.vw ? "" : "not ",
-			     vm_info_1.pal_vm_info_1_s.max_dtr_entry+1,
-			     vm_info_1.pal_vm_info_1_s.max_itr_entry+1);
-
+		seq_printf(m,
+			   "\nTLB walker                     : %simplemented\n"
+			   "Number of DTR                  : %d\n"
+			   "Number of ITR                  : %d\n"
+			   "TLB insertable page sizes      : ",
+			   vm_info_1.pal_vm_info_1_s.vw ? "" : "not ",
+			   vm_info_1.pal_vm_info_1_s.max_dtr_entry+1,
+			   vm_info_1.pal_vm_info_1_s.max_itr_entry+1);
 
-		p = bitvector_process(p, tr_pages);
+		bitvector_process(m, tr_pages);
 
-		p += sprintf(p, "\nTLB purgeable page sizes       : ");
+		seq_puts(m, "\nTLB purgeable page sizes       : ");
 
-		p = bitvector_process(p, vw_pages);
+		bitvector_process(m, vw_pages);
 	}
-	if ((status=ia64_get_ptce(&ptce)) != 0) {
+
+	if ((status = ia64_get_ptce(&ptce)) != 0) {
 		printk(KERN_ERR "ia64_get_ptce=%ld\n", status);
 	} else {
-		p += sprintf(p,
+		seq_printf(m,
 		     "\nPurge base address             : 0x%016lx\n"
 		     "Purge outer loop count         : %d\n"
 		     "Purge inner loop count         : %d\n"
@@ -368,7 +368,7 @@ vm_info(char *page)
 		     ptce.base, ptce.count[0], ptce.count[1],
 		     ptce.stride[0], ptce.stride[1]);
 
-		p += sprintf(p,
+		seq_printf(m,
 		     "TC Levels                      : %d\n"
 		     "Unique TC(s)                   : %d\n",
 		     vm_info_1.pal_vm_info_1_s.num_tc_levels,
@@ -378,13 +378,11 @@ vm_info(char *page)
 			for (j=2; j>0 ; j--) {
 				tc_pages = 0; /* just in case */
 
-
 				/* even without unification, some levels may not be present */
-				if ((status=ia64_pal_vm_info(i,j, &tc_info, &tc_pages)) != 0) {
+				if ((status=ia64_pal_vm_info(i,j, &tc_info, &tc_pages)) != 0)
 					continue;
-				}
 
-				p += sprintf(p,
+				seq_printf(m,
 				     "\n%s Translation Cache Level %d:\n"
 				     "\tHash sets           : %d\n"
 				     "\tAssociativity       : %d\n"
@@ -396,15 +394,15 @@ vm_info(char *page)
 				     tc_info.tc_num_entries);
 
 				if (tc_info.tc_pf)
-					p += sprintf(p, "PreferredPageSizeOptimized ");
+					seq_puts(m, "PreferredPageSizeOptimized ");
 				if (tc_info.tc_unified)
-					p += sprintf(p, "Unified ");
+					seq_puts(m, "Unified ");
 				if (tc_info.tc_reduce_tr)
-					p += sprintf(p, "TCReduction");
+					seq_puts(m, "TCReduction");
 
-				p += sprintf(p, "\n\tSupported page sizes: ");
+				seq_puts(m, "\n\tSupported page sizes: ");
 
-				p = bitvector_process(p, tc_pages);
+				bitvector_process(m, tc_pages);
 
 				/* when unified date (j=2) is enough */
 				if (tc_info.tc_unified)
@@ -412,22 +410,20 @@ vm_info(char *page)
 			}
 		}
 	}
-	p += sprintf(p, "\n");
 
-	return p - page;
+	seq_putc(m, '\n');
+	return 0;
 }
 
 
-static int
-register_info(char *page)
+static int register_info(struct seq_file *m)
 {
-	char *p = page;
 	u64 reg_info[2];
 	u64 info;
-	u64 phys_stacked;
+	unsigned long phys_stacked;
 	pal_hints_u_t hints;
-	u64 iregs, dregs;
-	char *info_type[]={
+	unsigned long iregs, dregs;
+	static const char * const info_type[] = {
 		"Implemented AR(s)",
 		"AR(s) with read side-effects",
 		"Implemented CR(s)",
@@ -435,40 +431,40 @@ register_info(char *page)
 	};
 
 	for(info=0; info < 4; info++) {
-
-		if (ia64_pal_register_info(info, &reg_info[0], &reg_info[1]) != 0) return 0;
-
-		p += sprintf(p, "%-32s : ", info_type[info]);
-
-		p = bitregister_process(p, reg_info, 128);
-
-		p += sprintf(p, "\n");
+		if (ia64_pal_register_info(info, &reg_info[0], &reg_info[1]) != 0)
+			return 0;
+		seq_printf(m, "%-32s : ", info_type[info]);
+		bitregister_process(m, reg_info, 128);
+		seq_putc(m, '\n');
 	}
 
-	if (ia64_pal_rse_info(&phys_stacked, &hints) == 0) {
+	if (ia64_pal_rse_info(&phys_stacked, &hints) == 0)
+		seq_printf(m,
+			   "RSE stacked physical registers   : %ld\n"
+			   "RSE load/store hints             : %ld (%s)\n",
+			   phys_stacked, hints.ph_data,
+			   hints.ph_data < RSE_HINTS_COUNT ? rse_hints[hints.ph_data]: "(??)");
 
-	p += sprintf(p,
-		     "RSE stacked physical registers   : %ld\n"
-		     "RSE load/store hints             : %ld (%s)\n",
-		     phys_stacked, hints.ph_data,
-		     hints.ph_data < RSE_HINTS_COUNT ? rse_hints[hints.ph_data]: "(??)");
-	}
 	if (ia64_pal_debug_info(&iregs, &dregs))
 		return 0;
 
-	p += sprintf(p,
-		     "Instruction debug register pairs : %ld\n"
-		     "Data debug register pairs        : %ld\n", iregs, dregs);
+	seq_printf(m,
+		   "Instruction debug register pairs : %ld\n"
+		   "Data debug register pairs        : %ld\n", iregs, dregs);
 
-	return p - page;
+	return 0;
 }
 
-static const char *proc_features[]={
+static const char *const proc_features_0[]={		/* Feature set 0 */
 	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
 	NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,
 	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
 	NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL,
+	"Unimplemented instruction address fault",
+	"INIT, PMI, and LINT pins",
+	"Simple unimplemented instr addresses",
+	"Variable P-state performance",
+	"Virtual machine features implemented",
 	"XIP,XPSR,XFS implemented",
 	"XR1-XR3 implemented",
 	"Disable dynamic predicate prediction",
@@ -476,7 +472,11 @@ static const char *proc_features[]={
 	"Disable dynamic data cache prefetch",
 	"Disable dynamic inst cache prefetch",
 	"Disable dynamic branch prediction",
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL,
+	"Disable P-states",
+	"Enable MCA on Data Poisoning",
+	"Enable vmsw instruction",
+	"Enable extern environmental notification",
 	"Disable BINIT on processor time-out",
 	"Disable dynamic power management (DPM)",
 	"Disable coherency",
@@ -487,29 +487,91 @@ static const char *proc_features[]={
 	"Enable BERR promotion"
 };
 
+static const char *const proc_features_16[]={		/* Feature set 16 */
+	"Disable ETM",
+	"Enable ETM",
+	"Enable MCA on half-way timer",
+	"Enable snoop WC",
+	NULL,
+	"Enable Fast Deferral",
+	"Disable MCA on memory aliasing",
+	"Enable RSB",
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	"DP system processor",
+	"Low Voltage",
+	"HT supported",
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL
+};
 
-static int
-processor_info(char *page)
+static const char *const *const proc_features[]={
+	proc_features_0,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL,
+	proc_features_16,
+	NULL, NULL, NULL, NULL,
+};
+
+static void feature_set_info(struct seq_file *m, u64 avail, u64 status, u64 control,
+			     unsigned long set)
 {
-	char *p = page;
-	const char **v = proc_features;
-	u64 avail=1, status=1, control=1;
+	const char *const *vf, *const *v;
 	int i;
-	s64 ret;
 
-	if ((ret=ia64_pal_proc_get_features(&avail, &status, &control)) != 0) return 0;
+	vf = v = proc_features[set];
+	for(i=0; i < 64; i++, avail >>=1, status >>=1, control >>=1) {
 
-	for(i=0; i < 64; i++, v++,avail >>=1, status >>=1, control >>=1) {
-		if ( ! *v ) continue;
-		p += sprintf(p, "%-40s : %s%s %s\n", *v,
-				avail & 0x1 ? "" : "NotImpl",
-				avail & 0x1 ? (status & 0x1 ? "On" : "Off"): "",
-				avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): "");
+		if (!(control))		/* No remaining bits set */
+			break;
+		if (!(avail & 0x1))	/* Print only bits that are available */
+			continue;
+		if (vf)
+			v = vf + i;
+		if ( v && *v ) {
+			seq_printf(m, "%-40s : %s %s\n", *v,
+				avail & 0x1 ? (status & 0x1 ?
+					      "On " : "Off"): "",
+				avail & 0x1 ? (control & 0x1 ?
+						"Ctrl" : "NoCtrl"): "");
+		} else {
+			seq_printf(m, "Feature set %2ld bit %2d\t\t\t"
+					" : %s %s\n",
+				set, i,
+				avail & 0x1 ? (status & 0x1 ?
+						"On " : "Off"): "",
+				avail & 0x1 ? (control & 0x1 ?
+						"Ctrl" : "NoCtrl"): "");
+		}
 	}
-	return p - page;
 }
 
-static const char *bus_features[]={
+static int processor_info(struct seq_file *m)
+{
+	u64 avail=1, status=1, control=1, feature_set=0;
+	s64 ret;
+
+	do {
+		ret = ia64_pal_proc_get_features(&avail, &status, &control,
+						feature_set);
+		if (ret < 0)
+			return 0;
+
+		if (ret == 1) {
+			feature_set++;
+			continue;
+		}
+
+		feature_set_info(m, avail, status, control, feature_set);
+		feature_set++;
+	} while(1);
+
+	return 0;
+}
+
+static const char *const bus_features[]={
 	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
 	NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,
 	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
@@ -535,168 +597,155 @@ static const char *bus_features[]={
 };
 
 
-static int
-bus_info(char *page)
+static int bus_info(struct seq_file *m)
 {
-	char *p = page;
-	const char **v = bus_features;
+	const char *const *v = bus_features;
 	pal_bus_features_u_t av, st, ct;
 	u64 avail, status, control;
 	int i;
 	s64 ret;
 
-	if ((ret=ia64_pal_bus_get_features(&av, &st, &ct)) != 0) return 0;
+	if ((ret=ia64_pal_bus_get_features(&av, &st, &ct)) != 0)
+		return 0;
 
 	avail   = av.pal_bus_features_val;
 	status  = st.pal_bus_features_val;
 	control = ct.pal_bus_features_val;
 
 	for(i=0; i < 64; i++, v++, avail >>=1, status >>=1, control >>=1) {
-		if ( ! *v ) continue;
-		p += sprintf(p, "%-48s : %s%s %s\n", *v,
-				avail & 0x1 ? "" : "NotImpl",
-				avail & 0x1 ? (status  & 0x1 ? "On" : "Off"): "",
-				avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): "");
+		if ( ! *v )
+			continue;
+		seq_printf(m, "%-48s : %s%s %s\n", *v,
+			   avail & 0x1 ? "" : "NotImpl",
+			   avail & 0x1 ? (status  & 0x1 ? "On" : "Off"): "",
+			   avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): "");
 	}
-	return p - page;
+	return 0;
 }
 
-static int
-version_info(char *page)
+static int version_info(struct seq_file *m)
 {
 	pal_version_u_t min_ver, cur_ver;
-	char *p = page;
 
-	/* The PAL_VERSION call is advertised as being able to support
-	 * both physical and virtual mode calls. This seems to be a documentation
-	 * bug rather than firmware bug. In fact, it does only support physical mode.
-	 * So now the code reflects this fact and the pal_version() has been updated
-	 * accordingly.
-	 */
-	if (ia64_pal_version(&min_ver, &cur_ver) != 0) return 0;
-
-	p += sprintf(p,
-		     "PAL_vendor : 0x%02x (min=0x%02x)\n"
-		     "PAL_A      : %x.%x.%x (min=%x.%x.%x)\n"
-		     "PAL_B      : %x.%x.%x (min=%x.%x.%x)\n",
-		     cur_ver.pal_version_s.pv_pal_vendor, min_ver.pal_version_s.pv_pal_vendor,
-
-		     cur_ver.pal_version_s.pv_pal_a_model>>4,
-		     cur_ver.pal_version_s.pv_pal_a_model&0xf, cur_ver.pal_version_s.pv_pal_a_rev,
-		     min_ver.pal_version_s.pv_pal_a_model>>4,
-		     min_ver.pal_version_s.pv_pal_a_model&0xf, min_ver.pal_version_s.pv_pal_a_rev,
-
-		     cur_ver.pal_version_s.pv_pal_b_model>>4,
-		     cur_ver.pal_version_s.pv_pal_b_model&0xf, cur_ver.pal_version_s.pv_pal_b_rev,
-		     min_ver.pal_version_s.pv_pal_b_model>>4,
-		     min_ver.pal_version_s.pv_pal_b_model&0xf, min_ver.pal_version_s.pv_pal_b_rev);
-	return p - page;
+	if (ia64_pal_version(&min_ver, &cur_ver) != 0)
+		return 0;
+
+	seq_printf(m,
+		   "PAL_vendor : 0x%02x (min=0x%02x)\n"
+		   "PAL_A      : %02x.%02x (min=%02x.%02x)\n"
+		   "PAL_B      : %02x.%02x (min=%02x.%02x)\n",
+		   cur_ver.pal_version_s.pv_pal_vendor,
+		   min_ver.pal_version_s.pv_pal_vendor,
+		   cur_ver.pal_version_s.pv_pal_a_model,
+		   cur_ver.pal_version_s.pv_pal_a_rev,
+		   min_ver.pal_version_s.pv_pal_a_model,
+		   min_ver.pal_version_s.pv_pal_a_rev,
+		   cur_ver.pal_version_s.pv_pal_b_model,
+		   cur_ver.pal_version_s.pv_pal_b_rev,
+		   min_ver.pal_version_s.pv_pal_b_model,
+		   min_ver.pal_version_s.pv_pal_b_rev);
+	return 0;
 }
 
-static int
-perfmon_info(char *page)
+static int perfmon_info(struct seq_file *m)
 {
-	char *p = page;
 	u64 pm_buffer[16];
 	pal_perf_mon_info_u_t pm_info;
 
-	if (ia64_pal_perf_mon_info(pm_buffer, &pm_info) != 0) return 0;
-
-	p += sprintf(p,
-		     "PMC/PMD pairs                 : %d\n"
-		     "Counter width                 : %d bits\n"
-		     "Cycle event number            : %d\n"
-		     "Retired event number          : %d\n"
-		     "Implemented PMC               : ",
-		     pm_info.pal_perf_mon_info_s.generic, pm_info.pal_perf_mon_info_s.width,
-		     pm_info.pal_perf_mon_info_s.cycles, pm_info.pal_perf_mon_info_s.retired);
+	if (ia64_pal_perf_mon_info(pm_buffer, &pm_info) != 0)
+		return 0;
 
-	p = bitregister_process(p, pm_buffer, 256);
-	p += sprintf(p, "\nImplemented PMD               : ");
-	p = bitregister_process(p, pm_buffer+4, 256);
-	p += sprintf(p, "\nCycles count capable          : ");
-	p = bitregister_process(p, pm_buffer+8, 256);
-	p += sprintf(p, "\nRetired bundles count capable : ");
+	seq_printf(m,
+		   "PMC/PMD pairs                 : %d\n"
+		   "Counter width                 : %d bits\n"
+		   "Cycle event number            : %d\n"
+		   "Retired event number          : %d\n"
+		   "Implemented PMC               : ",
+		   pm_info.pal_perf_mon_info_s.generic,
+		   pm_info.pal_perf_mon_info_s.width,
+		   pm_info.pal_perf_mon_info_s.cycles,
+		   pm_info.pal_perf_mon_info_s.retired);
+
+	bitregister_process(m, pm_buffer, 256);
+	seq_puts(m, "\nImplemented PMD               : ");
+	bitregister_process(m, pm_buffer+4, 256);
+	seq_puts(m, "\nCycles count capable          : ");
+	bitregister_process(m, pm_buffer+8, 256);
+	seq_puts(m, "\nRetired bundles count capable : ");
 
 #ifdef CONFIG_ITANIUM
 	/*
 	 * PAL_PERF_MON_INFO reports that only PMC4 can be used to count CPU_CYCLES
 	 * which is wrong, both PMC4 and PMD5 support it.
 	 */
-	if (pm_buffer[12] == 0x10) pm_buffer[12]=0x30;
+	if (pm_buffer[12] == 0x10)
+		pm_buffer[12]=0x30;
 #endif
 
-	p = bitregister_process(p, pm_buffer+12, 256);
-
-	p += sprintf(p, "\n");
-
-	return p - page;
+	bitregister_process(m, pm_buffer+12, 256);
+	seq_putc(m, '\n');
+	return 0;
 }
 
-static int
-frequency_info(char *page)
+static int frequency_info(struct seq_file *m)
 {
-	char *p = page;
 	struct pal_freq_ratio proc, itc, bus;
-	u64 base;
+	unsigned long base;
 
 	if (ia64_pal_freq_base(&base) == -1)
-		p += sprintf(p, "Output clock            : not implemented\n");
+		seq_puts(m, "Output clock            : not implemented\n");
 	else
-		p += sprintf(p, "Output clock            : %ld ticks/s\n", base);
+		seq_printf(m, "Output clock            : %ld ticks/s\n", base);
 
 	if (ia64_pal_freq_ratios(&proc, &bus, &itc) != 0) return 0;
 
-	p += sprintf(p,
-		     "Processor/Clock ratio   : %ld/%ld\n"
-		     "Bus/Clock ratio         : %ld/%ld\n"
-		     "ITC/Clock ratio         : %ld/%ld\n",
+	seq_printf(m,
+		     "Processor/Clock ratio   : %d/%d\n"
+		     "Bus/Clock ratio         : %d/%d\n"
+		     "ITC/Clock ratio         : %d/%d\n",
 		     proc.num, proc.den, bus.num, bus.den, itc.num, itc.den);
-
-	return p - page;
+	return 0;
 }
 
-static int
-tr_info(char *page)
+static int tr_info(struct seq_file *m)
 {
-	char *p = page;
-	s64 status;
+	long status;
 	pal_tr_valid_u_t tr_valid;
 	u64 tr_buffer[4];
 	pal_vm_info_1_u_t vm_info_1;
 	pal_vm_info_2_u_t vm_info_2;
-	u64 i, j;
-	u64 max[3], pgm;
+	unsigned long i, j;
+	unsigned long max[3], pgm;
 	struct ifa_reg {
-		u64 valid:1;
-		u64 ig:11;
-		u64 vpn:52;
+		unsigned long valid:1;
+		unsigned long ig:11;
+		unsigned long vpn:52;
 	} *ifa_reg;
 	struct itir_reg {
-		u64 rv1:2;
-		u64 ps:6;
-		u64 key:24;
-		u64 rv2:32;
+		unsigned long rv1:2;
+		unsigned long ps:6;
+		unsigned long key:24;
+		unsigned long rv2:32;
 	} *itir_reg;
 	struct gr_reg {
-		u64 p:1;
-		u64 rv1:1;
-		u64 ma:3;
-		u64 a:1;
-		u64 d:1;
-		u64 pl:2;
-		u64 ar:3;
-		u64 ppn:38;
-		u64 rv2:2;
-		u64 ed:1;
-		u64 ig:11;
+		unsigned long p:1;
+		unsigned long rv1:1;
+		unsigned long ma:3;
+		unsigned long a:1;
+		unsigned long d:1;
+		unsigned long pl:2;
+		unsigned long ar:3;
+		unsigned long ppn:38;
+		unsigned long rv2:2;
+		unsigned long ed:1;
+		unsigned long ig:11;
 	} *gr_reg;
 	struct rid_reg {
-		u64 ig1:1;
-		u64 rv1:1;
-		u64 ig2:6;
-		u64 rid:24;
-		u64 rv2:32;
+		unsigned long ig1:1;
+		unsigned long rv1:1;
+		unsigned long ig2:6;
+		unsigned long rid:24;
+		unsigned long rv2:32;
 	} *rid_reg;
 
 	if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) {
@@ -718,39 +767,40 @@ tr_info(char *page)
 
 		ifa_reg  = (struct ifa_reg *)&tr_buffer[2];
 
-		if (ifa_reg->valid == 0) continue;
+		if (ifa_reg->valid == 0)
+			continue;
 
 		gr_reg   = (struct gr_reg *)tr_buffer;
 		itir_reg = (struct itir_reg *)&tr_buffer[1];
 		rid_reg  = (struct rid_reg *)&tr_buffer[3];
 
 		pgm	 = -1 << (itir_reg->ps - 12);
-		p += sprintf(p,
-			     "%cTR%lu: av=%d pv=%d dv=%d mv=%d\n"
-			     "\tppn  : 0x%lx\n"
-			     "\tvpn  : 0x%lx\n"
-			     "\tps   : ",
-			     "ID"[i], j,
-			     tr_valid.pal_tr_valid_s.access_rights_valid,
-			     tr_valid.pal_tr_valid_s.priv_level_valid,
-			     tr_valid.pal_tr_valid_s.dirty_bit_valid,
-			     tr_valid.pal_tr_valid_s.mem_attr_valid,
-			     (gr_reg->ppn & pgm)<< 12, (ifa_reg->vpn & pgm)<< 12);
-
-		p = bitvector_process(p, 1<< itir_reg->ps);
-
-		p += sprintf(p,
-			     "\n\tpl   : %d\n"
-			     "\tar   : %d\n"
-			     "\trid  : %x\n"
-			     "\tp    : %d\n"
-			     "\tma   : %d\n"
-			     "\td    : %d\n",
-			     gr_reg->pl, gr_reg->ar, rid_reg->rid, gr_reg->p, gr_reg->ma,
-			     gr_reg->d);
+		seq_printf(m,
+			   "%cTR%lu: av=%d pv=%d dv=%d mv=%d\n"
+			   "\tppn  : 0x%lx\n"
+			   "\tvpn  : 0x%lx\n"
+			   "\tps   : ",
+			   "ID"[i], j,
+			   tr_valid.pal_tr_valid_s.access_rights_valid,
+			   tr_valid.pal_tr_valid_s.priv_level_valid,
+			   tr_valid.pal_tr_valid_s.dirty_bit_valid,
+			   tr_valid.pal_tr_valid_s.mem_attr_valid,
+			   (gr_reg->ppn & pgm)<< 12, (ifa_reg->vpn & pgm)<< 12);
+
+		bitvector_process(m, 1<< itir_reg->ps);
+
+		seq_printf(m,
+			   "\n\tpl   : %d\n"
+			   "\tar   : %d\n"
+			   "\trid  : %x\n"
+			   "\tp    : %d\n"
+			   "\tma   : %d\n"
+			   "\td    : %d\n",
+			   gr_reg->pl, gr_reg->ar, rid_reg->rid, gr_reg->p, gr_reg->ma,
+			   gr_reg->d);
 		}
 	}
-	return p - page;
+	return 0;
 }
 
 
@@ -758,7 +808,7 @@ tr_info(char *page)
 /*
  * List {name,function} pairs for every entry in /proc/palinfo/cpu*
  */
-static palinfo_entry_t palinfo_entries[]={
+static const palinfo_entry_t palinfo_entries[]={
 	{ "version_info",	version_info, },
 	{ "vm_info",		vm_info, },
 	{ "cache_info",		cache_info, },
@@ -773,17 +823,6 @@ static palinfo_entry_t palinfo_entries[]={
 
 #define NR_PALINFO_ENTRIES	(int) ARRAY_SIZE(palinfo_entries)
 
-/*
- * this array is used to keep track of the proc entries we create. This is
- * required in the module mode when we need to remove all entries. The procfs code
- * does not do recursion of deletion
- *
- * Notes:
- *	- +1 accounts for the cpuN directory entry in /proc/pal
- */
-#define NR_PALINFO_PROC_ENTRIES	(NR_CPUS*(NR_PALINFO_ENTRIES+1))
-
-static struct proc_dir_entry *palinfo_proc_entries[NR_PALINFO_PROC_ENTRIES];
 static struct proc_dir_entry *palinfo_dir;
 
 /*
@@ -811,7 +850,7 @@ typedef union {
  */
 typedef struct {
 	palinfo_func_t	func;	/* pointer to function to call */
-	char		*page;	/* buffer to store results */
+	struct seq_file *m;	/* buffer to store results */
 	int		ret;	/* return value from call */
 } palinfo_smp_data_t;
 
@@ -824,13 +863,7 @@ static void
 palinfo_smp_call(void *info)
 {
 	palinfo_smp_data_t *data = (palinfo_smp_data_t *)info;
-	if (data == NULL) {
-		printk(KERN_ERR "palinfo: data pointer is NULL\n");
-		data->ret = 0; /* no output */
-		return;
-	}
-	/* does this actual call */
-	data->ret = (*data->func)(data->page);
+	data->ret = (*data->func)(data->m);
 }
 
 /*
@@ -840,18 +873,18 @@ palinfo_smp_call(void *info)
  *	otherwise how many bytes in the "page" buffer were written
  */
 static
-int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page)
+int palinfo_handle_smp(struct seq_file *m, pal_func_cpu_u_t *f)
 {
 	palinfo_smp_data_t ptr;
 	int ret;
 
 	ptr.func = palinfo_entries[f->func_id].proc_read;
-	ptr.page = page;
+	ptr.m = m;
 	ptr.ret  = 0; /* just in case */
 
 
 	/* will send IPI to other CPU and wait for completion of remote call */
-	if ((ret=smp_call_function_single(f->req_cpu, palinfo_smp_call, &ptr, 0, 1))) {
+	if ((ret=smp_call_function_single(f->req_cpu, palinfo_smp_call, &ptr, 1))) {
 		printk(KERN_ERR "palinfo: remote CPU call from %d to %d on function %d: "
 		       "error %d\n", smp_processor_id(), f->req_cpu, f->func_id, ret);
 		return 0;
@@ -860,7 +893,7 @@ int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page)
 }
 #else /* ! CONFIG_SMP */
 static
-int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page)
+int palinfo_handle_smp(struct seq_file *m, pal_func_cpu_u_t *f)
 {
 	printk(KERN_ERR "palinfo: should not be called with non SMP kernel\n");
 	return 0;
@@ -870,115 +903,84 @@ int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page)
 /*
  * Entry point routine: all calls go through this function
  */
-static int
-palinfo_read_entry(char *page, char **start, off_t off, int count, int *eof, void *data)
+static int proc_palinfo_show(struct seq_file *m, void *v)
 {
-	int len=0;
-	pal_func_cpu_u_t *f = (pal_func_cpu_u_t *)&data;
+	pal_func_cpu_u_t *f = (pal_func_cpu_u_t *)&m->private;
 
 	/*
 	 * in SMP mode, we may need to call another CPU to get correct
 	 * information. PAL, by definition, is processor specific
 	 */
 	if (f->req_cpu == get_cpu())
-		len = (*palinfo_entries[f->func_id].proc_read)(page);
+		(*palinfo_entries[f->func_id].proc_read)(m);
 	else
-		len = palinfo_handle_smp(f, page);
+		palinfo_handle_smp(m, f);
 
 	put_cpu();
+	return 0;
+}
 
-	if (len <= off+count) *eof = 1;
-
-	*start = page + off;
-	len   -= off;
-
-	if (len>count) len = count;
-	if (len<0) len = 0;
-
-	return len;
+static int proc_palinfo_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_palinfo_show, PDE_DATA(inode));
 }
 
+static const struct file_operations proc_palinfo_fops = {
+	.open		= proc_palinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static void
 create_palinfo_proc_entries(unsigned int cpu)
 {
-#	define CPUSTR	"cpu%d"
-
 	pal_func_cpu_u_t f;
-	struct proc_dir_entry **pdir;
 	struct proc_dir_entry *cpu_dir;
 	int j;
-	char cpustr[sizeof(CPUSTR)];
-
-
-	/*
-	 * we keep track of created entries in a depth-first order for
-	 * cleanup purposes. Each entry is stored into palinfo_proc_entries
-	 */
-	sprintf(cpustr,CPUSTR, cpu);
+	char cpustr[3+4+1];	/* cpu numbers are up to 4095 on itanic */
+	sprintf(cpustr, "cpu%d", cpu);
 
 	cpu_dir = proc_mkdir(cpustr, palinfo_dir);
+	if (!cpu_dir)
+		return;
 
 	f.req_cpu = cpu;
 
-	/*
-	 * Compute the location to store per cpu entries
-	 * We dont store the top level entry in this list, but
-	 * remove it finally after removing all cpu entries.
-	 */
-	pdir = &palinfo_proc_entries[cpu*(NR_PALINFO_ENTRIES+1)];
-	*pdir++ = cpu_dir;
 	for (j=0; j < NR_PALINFO_ENTRIES; j++) {
 		f.func_id = j;
-		*pdir = create_proc_read_entry(
-				palinfo_entries[j].name, 0, cpu_dir,
-				palinfo_read_entry, (void *)f.value);
-		if (*pdir)
-			(*pdir)->owner = THIS_MODULE;
-		pdir++;
+		proc_create_data(palinfo_entries[j].name, 0, cpu_dir,
+				 &proc_palinfo_fops, (void *)f.value);
 	}
 }
 
 static void
 remove_palinfo_proc_entries(unsigned int hcpu)
 {
-	int j;
-	struct proc_dir_entry *cpu_dir, **pdir;
-
-	pdir = &palinfo_proc_entries[hcpu*(NR_PALINFO_ENTRIES+1)];
-	cpu_dir = *pdir;
-	*pdir++=NULL;
-	for (j=0; j < (NR_PALINFO_ENTRIES); j++) {
-		if ((*pdir)) {
-			remove_proc_entry ((*pdir)->name, cpu_dir);
-			*pdir ++= NULL;
-		}
-	}
-
-	if (cpu_dir) {
-		remove_proc_entry(cpu_dir->name, palinfo_dir);
-	}
+	char cpustr[3+4+1];	/* cpu numbers are up to 4095 on itanic */
+	sprintf(cpustr, "cpu%d", hcpu);
+	remove_proc_subtree(cpustr, palinfo_dir);
 }
 
-static int __devinit palinfo_cpu_callback(struct notifier_block *nfb,
-								unsigned long action,
-								void *hcpu)
+static int palinfo_cpu_callback(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
 {
 	unsigned int hotcpu = (unsigned long)hcpu;
 
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		create_palinfo_proc_entries(hotcpu);
 		break;
-#ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		remove_palinfo_proc_entries(hotcpu);
 		break;
-#endif
 	}
 	return NOTIFY_OK;
 }
 
-static struct notifier_block palinfo_cpu_notifier =
+static struct notifier_block __refdata palinfo_cpu_notifier =
 {
 	.notifier_call = palinfo_cpu_callback,
 	.priority = 0,
@@ -991,6 +993,10 @@ palinfo_init(void)
 
 	printk(KERN_INFO "PAL Information Facility v%s\n", PALINFO_VERSION);
 	palinfo_dir = proc_mkdir("pal", NULL);
+	if (!palinfo_dir)
+		return -ENOMEM;
+
+	cpu_notifier_register_begin();
 
 	/* Create palinfo dirs in /proc for all online cpus */
 	for_each_online_cpu(i) {
@@ -998,7 +1004,9 @@ palinfo_init(void)
 	}
 
 	/* Register for future delivery via notify registration */
-	register_cpu_notifier(&palinfo_cpu_notifier);
+	__register_hotcpu_notifier(&palinfo_cpu_notifier);
+
+	cpu_notifier_register_done();
 
 	return 0;
 }
@@ -1006,22 +1014,8 @@ palinfo_init(void)
 static void __exit
 palinfo_exit(void)
 {
-	int i = 0;
-
-	/* remove all nodes: depth first pass. Could optimize this  */
-	for_each_online_cpu(i) {
-		remove_palinfo_proc_entries(i);
-	}
-
-	/*
-	 * Remove the top level entry finally
-	 */
-	remove_proc_entry(palinfo_dir->name, NULL);
-
-	/*
-	 * Unregister from cpu notifier callbacks
-	 */
-	unregister_cpu_notifier(&palinfo_cpu_notifier);
+	unregister_hotcpu_notifier(&palinfo_cpu_notifier);
+	remove_proc_subtree("pal", NULL);
 }
 
 module_init(palinfo_init);
diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c
new file mode 100644
index 00000000000..1b22f6de293
--- /dev/null
+++ b/arch/ia64/kernel/paravirt.c
@@ -0,0 +1,902 @@
+/******************************************************************************
+ * arch/ia64/kernel/paravirt.c
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *     Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/init.h>
+
+#include <linux/compiler.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include <asm/iosapic.h>
+#include <asm/paravirt.h>
+
+/***************************************************************************
+ * general info
+ */
+struct pv_info pv_info = {
+	.kernel_rpl = 0,
+	.paravirt_enabled = 0,
+	.name = "bare hardware"
+};
+
+/***************************************************************************
+ * pv_init_ops
+ * initialization hooks.
+ */
+
+static void __init
+ia64_native_patch_branch(unsigned long tag, unsigned long type);
+
+struct pv_init_ops pv_init_ops =
+{
+#ifdef ASM_SUPPORTED
+	.patch_bundle = ia64_native_patch_bundle,
+#endif
+	.patch_branch = ia64_native_patch_branch,
+};
+
+/***************************************************************************
+ * pv_cpu_ops
+ * intrinsics hooks.
+ */
+
+#ifndef ASM_SUPPORTED
+/* ia64_native_xxx are macros so that we have to make them real functions */
+
+#define DEFINE_VOID_FUNC1(name)					\
+	static void						\
+	ia64_native_ ## name ## _func(unsigned long arg)	\
+	{							\
+		ia64_native_ ## name(arg);			\
+	}
+
+#define DEFINE_VOID_FUNC1_VOID(name)				\
+	static void						\
+	ia64_native_ ## name ## _func(void *arg)		\
+	{							\
+		ia64_native_ ## name(arg);			\
+	}
+
+#define DEFINE_VOID_FUNC2(name)					\
+	static void						\
+	ia64_native_ ## name ## _func(unsigned long arg0,	\
+				      unsigned long arg1)	\
+	{							\
+		ia64_native_ ## name(arg0, arg1);		\
+	}
+
+#define DEFINE_FUNC0(name)			\
+	static unsigned long			\
+	ia64_native_ ## name ## _func(void)	\
+	{					\
+		return ia64_native_ ## name();	\
+	}
+
+#define DEFINE_FUNC1(name, type)			\
+	static unsigned long				\
+	ia64_native_ ## name ## _func(type arg)		\
+	{						\
+		return ia64_native_ ## name(arg);	\
+	}						\
+
+DEFINE_VOID_FUNC1_VOID(fc);
+DEFINE_VOID_FUNC1(intrin_local_irq_restore);
+
+DEFINE_VOID_FUNC2(ptcga);
+DEFINE_VOID_FUNC2(set_rr);
+
+DEFINE_FUNC0(get_psr_i);
+
+DEFINE_FUNC1(thash, unsigned long);
+DEFINE_FUNC1(get_cpuid, int);
+DEFINE_FUNC1(get_pmd, int);
+DEFINE_FUNC1(get_rr, unsigned long);
+
+static void
+ia64_native_ssm_i_func(void)
+{
+	ia64_native_ssm(IA64_PSR_I);
+}
+
+static void
+ia64_native_rsm_i_func(void)
+{
+	ia64_native_rsm(IA64_PSR_I);
+}
+
+static void
+ia64_native_set_rr0_to_rr4_func(unsigned long val0, unsigned long val1,
+				unsigned long val2, unsigned long val3,
+				unsigned long val4)
+{
+	ia64_native_set_rr0_to_rr4(val0, val1, val2, val3, val4);
+}
+
+#define CASE_GET_REG(id)				\
+	case _IA64_REG_ ## id:				\
+	res = ia64_native_getreg(_IA64_REG_ ## id);	\
+	break;
+#define CASE_GET_AR(id) CASE_GET_REG(AR_ ## id)
+#define CASE_GET_CR(id) CASE_GET_REG(CR_ ## id)
+
+unsigned long
+ia64_native_getreg_func(int regnum)
+{
+	unsigned long res = -1;
+	switch (regnum) {
+	CASE_GET_REG(GP);
+	/*CASE_GET_REG(IP);*/ /* returned ip value shouldn't be constant */
+	CASE_GET_REG(PSR);
+	CASE_GET_REG(TP);
+	CASE_GET_REG(SP);
+
+	CASE_GET_AR(KR0);
+	CASE_GET_AR(KR1);
+	CASE_GET_AR(KR2);
+	CASE_GET_AR(KR3);
+	CASE_GET_AR(KR4);
+	CASE_GET_AR(KR5);
+	CASE_GET_AR(KR6);
+	CASE_GET_AR(KR7);
+	CASE_GET_AR(RSC);
+	CASE_GET_AR(BSP);
+	CASE_GET_AR(BSPSTORE);
+	CASE_GET_AR(RNAT);
+	CASE_GET_AR(FCR);
+	CASE_GET_AR(EFLAG);
+	CASE_GET_AR(CSD);
+	CASE_GET_AR(SSD);
+	CASE_GET_AR(CFLAG);
+	CASE_GET_AR(FSR);
+	CASE_GET_AR(FIR);
+	CASE_GET_AR(FDR);
+	CASE_GET_AR(CCV);
+	CASE_GET_AR(UNAT);
+	CASE_GET_AR(FPSR);
+	CASE_GET_AR(ITC);
+	CASE_GET_AR(PFS);
+	CASE_GET_AR(LC);
+	CASE_GET_AR(EC);
+
+	CASE_GET_CR(DCR);
+	CASE_GET_CR(ITM);
+	CASE_GET_CR(IVA);
+	CASE_GET_CR(PTA);
+	CASE_GET_CR(IPSR);
+	CASE_GET_CR(ISR);
+	CASE_GET_CR(IIP);
+	CASE_GET_CR(IFA);
+	CASE_GET_CR(ITIR);
+	CASE_GET_CR(IIPA);
+	CASE_GET_CR(IFS);
+	CASE_GET_CR(IIM);
+	CASE_GET_CR(IHA);
+	CASE_GET_CR(LID);
+	CASE_GET_CR(IVR);
+	CASE_GET_CR(TPR);
+	CASE_GET_CR(EOI);
+	CASE_GET_CR(IRR0);
+	CASE_GET_CR(IRR1);
+	CASE_GET_CR(IRR2);
+	CASE_GET_CR(IRR3);
+	CASE_GET_CR(ITV);
+	CASE_GET_CR(PMV);
+	CASE_GET_CR(CMCV);
+	CASE_GET_CR(LRR0);
+	CASE_GET_CR(LRR1);
+
+	default:
+		printk(KERN_CRIT "wrong_getreg %d\n", regnum);
+		break;
+	}
+	return res;
+}
+
+#define CASE_SET_REG(id)				\
+	case _IA64_REG_ ## id:				\
+	ia64_native_setreg(_IA64_REG_ ## id, val);	\
+	break;
+#define CASE_SET_AR(id) CASE_SET_REG(AR_ ## id)
+#define CASE_SET_CR(id) CASE_SET_REG(CR_ ## id)
+
+void
+ia64_native_setreg_func(int regnum, unsigned long val)
+{
+	switch (regnum) {
+	case _IA64_REG_PSR_L:
+		ia64_native_setreg(_IA64_REG_PSR_L, val);
+		ia64_dv_serialize_data();
+		break;
+	CASE_SET_REG(SP);
+	CASE_SET_REG(GP);
+
+	CASE_SET_AR(KR0);
+	CASE_SET_AR(KR1);
+	CASE_SET_AR(KR2);
+	CASE_SET_AR(KR3);
+	CASE_SET_AR(KR4);
+	CASE_SET_AR(KR5);
+	CASE_SET_AR(KR6);
+	CASE_SET_AR(KR7);
+	CASE_SET_AR(RSC);
+	CASE_SET_AR(BSP);
+	CASE_SET_AR(BSPSTORE);
+	CASE_SET_AR(RNAT);
+	CASE_SET_AR(FCR);
+	CASE_SET_AR(EFLAG);
+	CASE_SET_AR(CSD);
+	CASE_SET_AR(SSD);
+	CASE_SET_AR(CFLAG);
+	CASE_SET_AR(FSR);
+	CASE_SET_AR(FIR);
+	CASE_SET_AR(FDR);
+	CASE_SET_AR(CCV);
+	CASE_SET_AR(UNAT);
+	CASE_SET_AR(FPSR);
+	CASE_SET_AR(ITC);
+	CASE_SET_AR(PFS);
+	CASE_SET_AR(LC);
+	CASE_SET_AR(EC);
+
+	CASE_SET_CR(DCR);
+	CASE_SET_CR(ITM);
+	CASE_SET_CR(IVA);
+	CASE_SET_CR(PTA);
+	CASE_SET_CR(IPSR);
+	CASE_SET_CR(ISR);
+	CASE_SET_CR(IIP);
+	CASE_SET_CR(IFA);
+	CASE_SET_CR(ITIR);
+	CASE_SET_CR(IIPA);
+	CASE_SET_CR(IFS);
+	CASE_SET_CR(IIM);
+	CASE_SET_CR(IHA);
+	CASE_SET_CR(LID);
+	CASE_SET_CR(IVR);
+	CASE_SET_CR(TPR);
+	CASE_SET_CR(EOI);
+	CASE_SET_CR(IRR0);
+	CASE_SET_CR(IRR1);
+	CASE_SET_CR(IRR2);
+	CASE_SET_CR(IRR3);
+	CASE_SET_CR(ITV);
+	CASE_SET_CR(PMV);
+	CASE_SET_CR(CMCV);
+	CASE_SET_CR(LRR0);
+	CASE_SET_CR(LRR1);
+	default:
+		printk(KERN_CRIT "wrong setreg %d\n", regnum);
+		break;
+	}
+}
+#else
+
+#define __DEFINE_FUNC(name, code)					\
+	extern const char ia64_native_ ## name ## _direct_start[];	\
+	extern const char ia64_native_ ## name ## _direct_end[];	\
+	asm (".align 32\n"						\
+	     ".proc ia64_native_" #name "_func\n"			\
+	     "ia64_native_" #name "_func:\n"				\
+	     "ia64_native_" #name "_direct_start:\n"			\
+	     code							\
+	     "ia64_native_" #name "_direct_end:\n"			\
+	     "br.cond.sptk.many b6\n"					\
+	     ".endp ia64_native_" #name "_func\n")
+
+#define DEFINE_VOID_FUNC0(name, code)				\
+	extern void						\
+	ia64_native_ ## name ## _func(void);			\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC1(name, code)				\
+	extern void						\
+	ia64_native_ ## name ## _func(unsigned long arg);	\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC1_VOID(name, code)			\
+	extern void						\
+	ia64_native_ ## name ## _func(void *arg);		\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC2(name, code)				\
+	extern void						\
+	ia64_native_ ## name ## _func(unsigned long arg0,	\
+				      unsigned long arg1);	\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_FUNC0(name, code)		\
+	extern unsigned long			\
+	ia64_native_ ## name ## _func(void);	\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_FUNC1(name, type, code)			\
+	extern unsigned long				\
+	ia64_native_ ## name ## _func(type arg);	\
+	__DEFINE_FUNC(name, code)
+
+DEFINE_VOID_FUNC1_VOID(fc,
+		       "fc r8\n");
+DEFINE_VOID_FUNC1(intrin_local_irq_restore,
+		  ";;\n"
+		  "     cmp.ne p6, p7 = r8, r0\n"
+		  ";;\n"
+		  "(p6) ssm psr.i\n"
+		  "(p7) rsm psr.i\n"
+		  ";;\n"
+		  "(p6) srlz.d\n");
+
+DEFINE_VOID_FUNC2(ptcga,
+		  "ptc.ga r8, r9\n");
+DEFINE_VOID_FUNC2(set_rr,
+		  "mov rr[r8] = r9\n");
+
+/* ia64_native_getreg(_IA64_REG_PSR) & IA64_PSR_I */
+DEFINE_FUNC0(get_psr_i,
+	     "mov r2 = " __stringify(1 << IA64_PSR_I_BIT) "\n"
+	     "mov r8 = psr\n"
+	     ";;\n"
+	     "and r8 = r2, r8\n");
+
+DEFINE_FUNC1(thash, unsigned long,
+	     "thash r8 = r8\n");
+DEFINE_FUNC1(get_cpuid, int,
+	     "mov r8 = cpuid[r8]\n");
+DEFINE_FUNC1(get_pmd, int,
+	     "mov r8 = pmd[r8]\n");
+DEFINE_FUNC1(get_rr, unsigned long,
+	     "mov r8 = rr[r8]\n");
+
+DEFINE_VOID_FUNC0(ssm_i,
+		  "ssm psr.i\n");
+DEFINE_VOID_FUNC0(rsm_i,
+		  "rsm psr.i\n");
+
+extern void
+ia64_native_set_rr0_to_rr4_func(unsigned long val0, unsigned long val1,
+				unsigned long val2, unsigned long val3,
+				unsigned long val4);
+__DEFINE_FUNC(set_rr0_to_rr4,
+	      "mov rr[r0] = r8\n"
+	      "movl r2 = 0x2000000000000000\n"
+	      ";;\n"
+	      "mov rr[r2] = r9\n"
+	      "shl r3 = r2, 1\n"	/* movl r3 = 0x4000000000000000 */
+	      ";;\n"
+	      "add r2 = r2, r3\n"	/* movl r2 = 0x6000000000000000 */
+	      "mov rr[r3] = r10\n"
+	      ";;\n"
+	      "mov rr[r2] = r11\n"
+	      "shl r3 = r3, 1\n"	/* movl r3 = 0x8000000000000000 */
+	      ";;\n"
+	      "mov rr[r3] = r14\n");
+
+extern unsigned long ia64_native_getreg_func(int regnum);
+asm(".global ia64_native_getreg_func\n");
+#define __DEFINE_GET_REG(id, reg)			\
+	"mov r2 = " __stringify(_IA64_REG_ ## id) "\n"	\
+	";;\n"						\
+	"cmp.eq p6, p0 = r2, r8\n"			\
+	";;\n"						\
+	"(p6) mov r8 = " #reg "\n"			\
+	"(p6) br.cond.sptk.many b6\n"			\
+	";;\n"
+#define __DEFINE_GET_AR(id, reg)	__DEFINE_GET_REG(AR_ ## id, ar.reg)
+#define __DEFINE_GET_CR(id, reg)	__DEFINE_GET_REG(CR_ ## id, cr.reg)
+
+__DEFINE_FUNC(getreg,
+	      __DEFINE_GET_REG(GP, gp)
+	      /*__DEFINE_GET_REG(IP, ip)*/ /* returned ip value shouldn't be constant */
+	      __DEFINE_GET_REG(PSR, psr)
+	      __DEFINE_GET_REG(TP, tp)
+	      __DEFINE_GET_REG(SP, sp)
+
+	      __DEFINE_GET_REG(AR_KR0, ar0)
+	      __DEFINE_GET_REG(AR_KR1, ar1)
+	      __DEFINE_GET_REG(AR_KR2, ar2)
+	      __DEFINE_GET_REG(AR_KR3, ar3)
+	      __DEFINE_GET_REG(AR_KR4, ar4)
+	      __DEFINE_GET_REG(AR_KR5, ar5)
+	      __DEFINE_GET_REG(AR_KR6, ar6)
+	      __DEFINE_GET_REG(AR_KR7, ar7)
+	      __DEFINE_GET_AR(RSC, rsc)
+	      __DEFINE_GET_AR(BSP, bsp)
+	      __DEFINE_GET_AR(BSPSTORE, bspstore)
+	      __DEFINE_GET_AR(RNAT, rnat)
+	      __DEFINE_GET_AR(FCR, fcr)
+	      __DEFINE_GET_AR(EFLAG, eflag)
+	      __DEFINE_GET_AR(CSD, csd)
+	      __DEFINE_GET_AR(SSD, ssd)
+	      __DEFINE_GET_REG(AR_CFLAG, ar27)
+	      __DEFINE_GET_AR(FSR, fsr)
+	      __DEFINE_GET_AR(FIR, fir)
+	      __DEFINE_GET_AR(FDR, fdr)
+	      __DEFINE_GET_AR(CCV, ccv)
+	      __DEFINE_GET_AR(UNAT, unat)
+	      __DEFINE_GET_AR(FPSR, fpsr)
+	      __DEFINE_GET_AR(ITC, itc)
+	      __DEFINE_GET_AR(PFS, pfs)
+	      __DEFINE_GET_AR(LC, lc)
+	      __DEFINE_GET_AR(EC, ec)
+
+	      __DEFINE_GET_CR(DCR, dcr)
+	      __DEFINE_GET_CR(ITM, itm)
+	      __DEFINE_GET_CR(IVA, iva)
+	      __DEFINE_GET_CR(PTA, pta)
+	      __DEFINE_GET_CR(IPSR, ipsr)
+	      __DEFINE_GET_CR(ISR, isr)
+	      __DEFINE_GET_CR(IIP, iip)
+	      __DEFINE_GET_CR(IFA, ifa)
+	      __DEFINE_GET_CR(ITIR, itir)
+	      __DEFINE_GET_CR(IIPA, iipa)
+	      __DEFINE_GET_CR(IFS, ifs)
+	      __DEFINE_GET_CR(IIM, iim)
+	      __DEFINE_GET_CR(IHA, iha)
+	      __DEFINE_GET_CR(LID, lid)
+	      __DEFINE_GET_CR(IVR, ivr)
+	      __DEFINE_GET_CR(TPR, tpr)
+	      __DEFINE_GET_CR(EOI, eoi)
+	      __DEFINE_GET_CR(IRR0, irr0)
+	      __DEFINE_GET_CR(IRR1, irr1)
+	      __DEFINE_GET_CR(IRR2, irr2)
+	      __DEFINE_GET_CR(IRR3, irr3)
+	      __DEFINE_GET_CR(ITV, itv)
+	      __DEFINE_GET_CR(PMV, pmv)
+	      __DEFINE_GET_CR(CMCV, cmcv)
+	      __DEFINE_GET_CR(LRR0, lrr0)
+	      __DEFINE_GET_CR(LRR1, lrr1)
+
+	      "mov r8 = -1\n"	/* unsupported case */
+	);
+
+extern void ia64_native_setreg_func(int regnum, unsigned long val);
+asm(".global ia64_native_setreg_func\n");
+#define __DEFINE_SET_REG(id, reg)			\
+	"mov r2 = " __stringify(_IA64_REG_ ## id) "\n"	\
+	";;\n"						\
+	"cmp.eq p6, p0 = r2, r9\n"			\
+	";;\n"						\
+	"(p6) mov " #reg " = r8\n"			\
+	"(p6) br.cond.sptk.many b6\n"			\
+	";;\n"
+#define __DEFINE_SET_AR(id, reg)	__DEFINE_SET_REG(AR_ ## id, ar.reg)
+#define __DEFINE_SET_CR(id, reg)	__DEFINE_SET_REG(CR_ ## id, cr.reg)
+__DEFINE_FUNC(setreg,
+	      "mov r2 = " __stringify(_IA64_REG_PSR_L) "\n"
+	      ";;\n"
+	      "cmp.eq p6, p0 = r2, r9\n"
+	      ";;\n"
+	      "(p6) mov psr.l = r8\n"
+#ifdef HAVE_SERIALIZE_DIRECTIVE
+	      ".serialize.data\n"
+#endif
+	      "(p6) br.cond.sptk.many b6\n"
+	      __DEFINE_SET_REG(GP, gp)
+	      __DEFINE_SET_REG(SP, sp)
+
+	      __DEFINE_SET_REG(AR_KR0, ar0)
+	      __DEFINE_SET_REG(AR_KR1, ar1)
+	      __DEFINE_SET_REG(AR_KR2, ar2)
+	      __DEFINE_SET_REG(AR_KR3, ar3)
+	      __DEFINE_SET_REG(AR_KR4, ar4)
+	      __DEFINE_SET_REG(AR_KR5, ar5)
+	      __DEFINE_SET_REG(AR_KR6, ar6)
+	      __DEFINE_SET_REG(AR_KR7, ar7)
+	      __DEFINE_SET_AR(RSC, rsc)
+	      __DEFINE_SET_AR(BSP, bsp)
+	      __DEFINE_SET_AR(BSPSTORE, bspstore)
+	      __DEFINE_SET_AR(RNAT, rnat)
+	      __DEFINE_SET_AR(FCR, fcr)
+	      __DEFINE_SET_AR(EFLAG, eflag)
+	      __DEFINE_SET_AR(CSD, csd)
+	      __DEFINE_SET_AR(SSD, ssd)
+	      __DEFINE_SET_REG(AR_CFLAG, ar27)
+	      __DEFINE_SET_AR(FSR, fsr)
+	      __DEFINE_SET_AR(FIR, fir)
+	      __DEFINE_SET_AR(FDR, fdr)
+	      __DEFINE_SET_AR(CCV, ccv)
+	      __DEFINE_SET_AR(UNAT, unat)
+	      __DEFINE_SET_AR(FPSR, fpsr)
+	      __DEFINE_SET_AR(ITC, itc)
+	      __DEFINE_SET_AR(PFS, pfs)
+	      __DEFINE_SET_AR(LC, lc)
+	      __DEFINE_SET_AR(EC, ec)
+
+	      __DEFINE_SET_CR(DCR, dcr)
+	      __DEFINE_SET_CR(ITM, itm)
+	      __DEFINE_SET_CR(IVA, iva)
+	      __DEFINE_SET_CR(PTA, pta)
+	      __DEFINE_SET_CR(IPSR, ipsr)
+	      __DEFINE_SET_CR(ISR, isr)
+	      __DEFINE_SET_CR(IIP, iip)
+	      __DEFINE_SET_CR(IFA, ifa)
+	      __DEFINE_SET_CR(ITIR, itir)
+	      __DEFINE_SET_CR(IIPA, iipa)
+	      __DEFINE_SET_CR(IFS, ifs)
+	      __DEFINE_SET_CR(IIM, iim)
+	      __DEFINE_SET_CR(IHA, iha)
+	      __DEFINE_SET_CR(LID, lid)
+	      __DEFINE_SET_CR(IVR, ivr)
+	      __DEFINE_SET_CR(TPR, tpr)
+	      __DEFINE_SET_CR(EOI, eoi)
+	      __DEFINE_SET_CR(IRR0, irr0)
+	      __DEFINE_SET_CR(IRR1, irr1)
+	      __DEFINE_SET_CR(IRR2, irr2)
+	      __DEFINE_SET_CR(IRR3, irr3)
+	      __DEFINE_SET_CR(ITV, itv)
+	      __DEFINE_SET_CR(PMV, pmv)
+	      __DEFINE_SET_CR(CMCV, cmcv)
+	      __DEFINE_SET_CR(LRR0, lrr0)
+	      __DEFINE_SET_CR(LRR1, lrr1)
+	);
+#endif
+
+struct pv_cpu_ops pv_cpu_ops = {
+	.fc		= ia64_native_fc_func,
+	.thash		= ia64_native_thash_func,
+	.get_cpuid	= ia64_native_get_cpuid_func,
+	.get_pmd	= ia64_native_get_pmd_func,
+	.ptcga		= ia64_native_ptcga_func,
+	.get_rr		= ia64_native_get_rr_func,
+	.set_rr		= ia64_native_set_rr_func,
+	.set_rr0_to_rr4	= ia64_native_set_rr0_to_rr4_func,
+	.ssm_i		= ia64_native_ssm_i_func,
+	.getreg		= ia64_native_getreg_func,
+	.setreg		= ia64_native_setreg_func,
+	.rsm_i		= ia64_native_rsm_i_func,
+	.get_psr_i	= ia64_native_get_psr_i_func,
+	.intrin_local_irq_restore
+			= ia64_native_intrin_local_irq_restore_func,
+};
+EXPORT_SYMBOL(pv_cpu_ops);
+
+/******************************************************************************
+ * replacement of hand written assembly codes.
+ */
+
+void
+paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch)
+{
+	extern unsigned long paravirt_switch_to_targ;
+	extern unsigned long paravirt_leave_syscall_targ;
+	extern unsigned long paravirt_work_processed_syscall_targ;
+	extern unsigned long paravirt_leave_kernel_targ;
+
+	paravirt_switch_to_targ = cpu_asm_switch->switch_to;
+	paravirt_leave_syscall_targ = cpu_asm_switch->leave_syscall;
+	paravirt_work_processed_syscall_targ =
+		cpu_asm_switch->work_processed_syscall;
+	paravirt_leave_kernel_targ = cpu_asm_switch->leave_kernel;
+}
+
+/***************************************************************************
+ * pv_iosapic_ops
+ * iosapic read/write hooks.
+ */
+
+static unsigned int
+ia64_native_iosapic_read(char __iomem *iosapic, unsigned int reg)
+{
+	return __ia64_native_iosapic_read(iosapic, reg);
+}
+
+static void
+ia64_native_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
+{
+	__ia64_native_iosapic_write(iosapic, reg, val);
+}
+
+struct pv_iosapic_ops pv_iosapic_ops = {
+	.pcat_compat_init = ia64_native_iosapic_pcat_compat_init,
+	.__get_irq_chip = ia64_native_iosapic_get_irq_chip,
+
+	.__read = ia64_native_iosapic_read,
+	.__write = ia64_native_iosapic_write,
+};
+
+/***************************************************************************
+ * pv_irq_ops
+ * irq operations
+ */
+
+struct pv_irq_ops pv_irq_ops = {
+	.register_ipi = ia64_native_register_ipi,
+
+	.assign_irq_vector = ia64_native_assign_irq_vector,
+	.free_irq_vector = ia64_native_free_irq_vector,
+	.register_percpu_irq = ia64_native_register_percpu_irq,
+
+	.resend_irq = ia64_native_resend_irq,
+};
+
+/***************************************************************************
+ * pv_time_ops
+ * time operations
+ */
+struct static_key paravirt_steal_enabled;
+struct static_key paravirt_steal_rq_enabled;
+
+static int
+ia64_native_do_steal_accounting(unsigned long *new_itm)
+{
+	return 0;
+}
+
+struct pv_time_ops pv_time_ops = {
+	.do_steal_accounting = ia64_native_do_steal_accounting,
+	.sched_clock = ia64_native_sched_clock,
+};
+
+/***************************************************************************
+ * binary pacthing
+ * pv_init_ops.patch_bundle
+ */
+
+#ifdef ASM_SUPPORTED
+#define IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg)	\
+	__DEFINE_FUNC(get_ ## name,			\
+		      ";;\n"				\
+		      "mov r8 = " #reg "\n"		\
+		      ";;\n")
+
+#define IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg)	\
+	__DEFINE_FUNC(set_ ## name,			\
+		      ";;\n"				\
+		      "mov " #reg " = r8\n"		\
+		      ";;\n")
+
+#define IA64_NATIVE_PATCH_DEFINE_REG(name, reg)		\
+	IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg);	\
+	IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg)	\
+
+#define IA64_NATIVE_PATCH_DEFINE_AR(name, reg)			\
+	IA64_NATIVE_PATCH_DEFINE_REG(ar_ ## name, ar.reg)
+
+#define IA64_NATIVE_PATCH_DEFINE_CR(name, reg)			\
+	IA64_NATIVE_PATCH_DEFINE_REG(cr_ ## name, cr.reg)
+
+
+IA64_NATIVE_PATCH_DEFINE_GET_REG(psr, psr);
+IA64_NATIVE_PATCH_DEFINE_GET_REG(tp, tp);
+
+/* IA64_NATIVE_PATCH_DEFINE_SET_REG(psr_l, psr.l); */
+__DEFINE_FUNC(set_psr_l,
+	      ";;\n"
+	      "mov psr.l = r8\n"
+#ifdef HAVE_SERIALIZE_DIRECTIVE
+	      ".serialize.data\n"
+#endif
+	      ";;\n");
+
+IA64_NATIVE_PATCH_DEFINE_REG(gp, gp);
+IA64_NATIVE_PATCH_DEFINE_REG(sp, sp);
+
+IA64_NATIVE_PATCH_DEFINE_REG(kr0, ar0);
+IA64_NATIVE_PATCH_DEFINE_REG(kr1, ar1);
+IA64_NATIVE_PATCH_DEFINE_REG(kr2, ar2);
+IA64_NATIVE_PATCH_DEFINE_REG(kr3, ar3);
+IA64_NATIVE_PATCH_DEFINE_REG(kr4, ar4);
+IA64_NATIVE_PATCH_DEFINE_REG(kr5, ar5);
+IA64_NATIVE_PATCH_DEFINE_REG(kr6, ar6);
+IA64_NATIVE_PATCH_DEFINE_REG(kr7, ar7);
+
+IA64_NATIVE_PATCH_DEFINE_AR(rsc, rsc);
+IA64_NATIVE_PATCH_DEFINE_AR(bsp, bsp);
+IA64_NATIVE_PATCH_DEFINE_AR(bspstore, bspstore);
+IA64_NATIVE_PATCH_DEFINE_AR(rnat, rnat);
+IA64_NATIVE_PATCH_DEFINE_AR(fcr, fcr);
+IA64_NATIVE_PATCH_DEFINE_AR(eflag, eflag);
+IA64_NATIVE_PATCH_DEFINE_AR(csd, csd);
+IA64_NATIVE_PATCH_DEFINE_AR(ssd, ssd);
+IA64_NATIVE_PATCH_DEFINE_REG(ar27, ar27);
+IA64_NATIVE_PATCH_DEFINE_AR(fsr, fsr);
+IA64_NATIVE_PATCH_DEFINE_AR(fir, fir);
+IA64_NATIVE_PATCH_DEFINE_AR(fdr, fdr);
+IA64_NATIVE_PATCH_DEFINE_AR(ccv, ccv);
+IA64_NATIVE_PATCH_DEFINE_AR(unat, unat);
+IA64_NATIVE_PATCH_DEFINE_AR(fpsr, fpsr);
+IA64_NATIVE_PATCH_DEFINE_AR(itc, itc);
+IA64_NATIVE_PATCH_DEFINE_AR(pfs, pfs);
+IA64_NATIVE_PATCH_DEFINE_AR(lc, lc);
+IA64_NATIVE_PATCH_DEFINE_AR(ec, ec);
+
+IA64_NATIVE_PATCH_DEFINE_CR(dcr, dcr);
+IA64_NATIVE_PATCH_DEFINE_CR(itm, itm);
+IA64_NATIVE_PATCH_DEFINE_CR(iva, iva);
+IA64_NATIVE_PATCH_DEFINE_CR(pta, pta);
+IA64_NATIVE_PATCH_DEFINE_CR(ipsr, ipsr);
+IA64_NATIVE_PATCH_DEFINE_CR(isr, isr);
+IA64_NATIVE_PATCH_DEFINE_CR(iip, iip);
+IA64_NATIVE_PATCH_DEFINE_CR(ifa, ifa);
+IA64_NATIVE_PATCH_DEFINE_CR(itir, itir);
+IA64_NATIVE_PATCH_DEFINE_CR(iipa, iipa);
+IA64_NATIVE_PATCH_DEFINE_CR(ifs, ifs);
+IA64_NATIVE_PATCH_DEFINE_CR(iim, iim);
+IA64_NATIVE_PATCH_DEFINE_CR(iha, iha);
+IA64_NATIVE_PATCH_DEFINE_CR(lid, lid);
+IA64_NATIVE_PATCH_DEFINE_CR(ivr, ivr);
+IA64_NATIVE_PATCH_DEFINE_CR(tpr, tpr);
+IA64_NATIVE_PATCH_DEFINE_CR(eoi, eoi);
+IA64_NATIVE_PATCH_DEFINE_CR(irr0, irr0);
+IA64_NATIVE_PATCH_DEFINE_CR(irr1, irr1);
+IA64_NATIVE_PATCH_DEFINE_CR(irr2, irr2);
+IA64_NATIVE_PATCH_DEFINE_CR(irr3, irr3);
+IA64_NATIVE_PATCH_DEFINE_CR(itv, itv);
+IA64_NATIVE_PATCH_DEFINE_CR(pmv, pmv);
+IA64_NATIVE_PATCH_DEFINE_CR(cmcv, cmcv);
+IA64_NATIVE_PATCH_DEFINE_CR(lrr0, lrr0);
+IA64_NATIVE_PATCH_DEFINE_CR(lrr1, lrr1);
+
+static const struct paravirt_patch_bundle_elem ia64_native_patch_bundle_elems[]
+__initdata_or_module =
+{
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM(name, type)		\
+	{							\
+		(void*)ia64_native_ ## name ## _direct_start,	\
+		(void*)ia64_native_ ## name ## _direct_end,	\
+		PARAVIRT_PATCH_TYPE_ ## type,			\
+	}
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(fc, FC),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(thash, THASH),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(ptcga, PTCGA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_rr, GET_RR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr, SET_RR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(ssm_i, SSM_I),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(rsm_i, RSM_I),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(intrin_local_irq_restore,
+				      INTRIN_LOCAL_IRQ_RESTORE),
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg)			\
+	{								\
+		(void*)ia64_native_get_ ## name ## _direct_start,	\
+		(void*)ia64_native_get_ ## name ## _direct_end,		\
+		PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg,		\
+	}
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg)			\
+	{								\
+		(void*)ia64_native_set_ ## name ## _direct_start,	\
+		(void*)ia64_native_set_ ## name ## _direct_end,		\
+		PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg,		\
+	}
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(name, reg)		\
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg),	\
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg)		\
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(name, reg)		\
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar_ ## name, AR_ ## reg)
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(name, reg)		\
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(cr_ ## name, CR_ ## reg)
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(psr, PSR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(tp, TP),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(psr_l, PSR_L),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(gp, GP),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(sp, SP),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr0, AR_KR0),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr1, AR_KR1),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr2, AR_KR2),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr3, AR_KR3),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr4, AR_KR4),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr5, AR_KR5),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr6, AR_KR6),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr7, AR_KR7),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rsc, RSC),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bsp, BSP),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bspstore, BSPSTORE),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rnat, RNAT),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fcr, FCR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(eflag, EFLAG),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(csd, CSD),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ssd, SSD),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar27, AR_CFLAG),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fsr, FSR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fir, FIR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fdr, FDR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ccv, CCV),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(unat, UNAT),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fpsr, FPSR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(itc, ITC),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(pfs, PFS),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(lc, LC),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ec, EC),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(dcr, DCR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itm, ITM),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iva, IVA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pta, PTA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ipsr, IPSR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(isr, ISR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iip, IIP),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifa, IFA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itir, ITIR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iipa, IIPA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifs, IFS),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iim, IIM),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iha, IHA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lid, LID),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ivr, IVR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(tpr, TPR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(eoi, EOI),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr0, IRR0),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr1, IRR1),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr2, IRR2),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr3, IRR3),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itv, ITV),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pmv, PMV),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(cmcv, CMCV),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr0, LRR0),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr1, LRR1),
+};
+
+unsigned long __init_or_module
+ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type)
+{
+	const unsigned long nelems = sizeof(ia64_native_patch_bundle_elems) /
+		sizeof(ia64_native_patch_bundle_elems[0]);
+
+	return __paravirt_patch_apply_bundle(sbundle, ebundle, type,
+					      ia64_native_patch_bundle_elems,
+					      nelems, NULL);
+}
+#endif /* ASM_SUPPOTED */
+
+extern const char ia64_native_switch_to[];
+extern const char ia64_native_leave_syscall[];
+extern const char ia64_native_work_processed_syscall[];
+extern const char ia64_native_leave_kernel[];
+
+const struct paravirt_patch_branch_target ia64_native_branch_target[]
+__initconst = {
+#define PARAVIRT_BR_TARGET(name, type)			\
+	{						\
+		ia64_native_ ## name,			\
+		PARAVIRT_PATCH_TYPE_BR_ ## type,	\
+	}
+	PARAVIRT_BR_TARGET(switch_to, SWITCH_TO),
+	PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL),
+	PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL),
+	PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL),
+};
+
+static void __init
+ia64_native_patch_branch(unsigned long tag, unsigned long type)
+{
+	const unsigned long nelem =
+		sizeof(ia64_native_branch_target) /
+		sizeof(ia64_native_branch_target[0]);
+	__paravirt_patch_apply_branch(tag, type,
+				      ia64_native_branch_target, nelem);
+}
diff --git a/arch/ia64/kernel/paravirt_inst.h b/arch/ia64/kernel/paravirt_inst.h
new file mode 100644
index 00000000000..1ad7512b5f6
--- /dev/null
+++ b/arch/ia64/kernel/paravirt_inst.h
@@ -0,0 +1,28 @@
+/******************************************************************************
+ * linux/arch/ia64/xen/paravirt_inst.h
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#ifdef __IA64_ASM_PARAVIRTUALIZED_PVCHECK
+#include <asm/native/pvchk_inst.h>
+#else
+#include <asm/native/inst.h>
+#endif
+
diff --git a/arch/ia64/kernel/paravirt_patch.c b/arch/ia64/kernel/paravirt_patch.c
new file mode 100644
index 00000000000..bfdfef1b1ff
--- /dev/null
+++ b/arch/ia64/kernel/paravirt_patch.c
@@ -0,0 +1,514 @@
+/******************************************************************************
+ * linux/arch/ia64/xen/paravirt_patch.c
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/intrinsics.h>
+#include <asm/kprobes.h>
+#include <asm/paravirt.h>
+#include <asm/paravirt_patch.h>
+
+typedef union ia64_inst {
+        struct {
+		unsigned long long qp : 6;
+		unsigned long long : 31;
+		unsigned long long opcode : 4;
+		unsigned long long reserved : 23;
+        } generic;
+        unsigned long long l;
+} ia64_inst_t;
+
+/*
+ * flush_icache_range() can't be used here.
+ * we are here before cpu_init() which initializes
+ * ia64_i_cache_stride_shift. flush_icache_range() uses it.
+ */
+void __init_or_module
+paravirt_flush_i_cache_range(const void *instr, unsigned long size)
+{
+	extern void paravirt_fc_i(const void *addr);
+	unsigned long i;
+
+	for (i = 0; i < size; i += sizeof(bundle_t))
+		paravirt_fc_i(instr + i);
+}
+
+bundle_t* __init_or_module
+paravirt_get_bundle(unsigned long tag)
+{
+	return (bundle_t *)(tag & ~3UL);
+}
+
+unsigned long __init_or_module
+paravirt_get_slot(unsigned long tag)
+{
+	return tag & 3UL;
+}
+
+unsigned long __init_or_module
+paravirt_get_num_inst(unsigned long stag, unsigned long etag)
+{
+	bundle_t *sbundle = paravirt_get_bundle(stag);
+	unsigned long sslot = paravirt_get_slot(stag);
+	bundle_t *ebundle = paravirt_get_bundle(etag);
+	unsigned long eslot = paravirt_get_slot(etag);
+
+	return (ebundle - sbundle) * 3 + eslot - sslot + 1;
+}
+
+unsigned long __init_or_module
+paravirt_get_next_tag(unsigned long tag)
+{
+	unsigned long slot = paravirt_get_slot(tag);
+
+	switch (slot) {
+	case 0:
+	case 1:
+		return tag + 1;
+	case 2: {
+		bundle_t *bundle = paravirt_get_bundle(tag);
+		return (unsigned long)(bundle + 1);
+	}
+	default:
+		BUG();
+	}
+	/* NOTREACHED */
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_slot0(const bundle_t *bundle)
+{
+	ia64_inst_t inst;
+	inst.l = bundle->quad0.slot0;
+	return inst;
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_slot1(const bundle_t *bundle)
+{
+	ia64_inst_t inst;
+	inst.l = bundle->quad0.slot1_p0 |
+		((unsigned long long)bundle->quad1.slot1_p1 << 18UL);
+	return inst;
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_slot2(const bundle_t *bundle)
+{
+	ia64_inst_t inst;
+	inst.l = bundle->quad1.slot2;
+	return inst;
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_inst(unsigned long tag)
+{
+	bundle_t *bundle = paravirt_get_bundle(tag);
+	unsigned long slot = paravirt_get_slot(tag);
+
+	switch (slot) {
+	case 0:
+		return paravirt_read_slot0(bundle);
+	case 1:
+		return paravirt_read_slot1(bundle);
+	case 2:
+		return paravirt_read_slot2(bundle);
+	default:
+		BUG();
+	}
+	/* NOTREACHED */
+}
+
+void __init_or_module
+paravirt_write_slot0(bundle_t *bundle, ia64_inst_t inst)
+{
+	bundle->quad0.slot0 = inst.l;
+}
+
+void __init_or_module
+paravirt_write_slot1(bundle_t *bundle, ia64_inst_t inst)
+{
+	bundle->quad0.slot1_p0 = inst.l;
+	bundle->quad1.slot1_p1 = inst.l >> 18UL;
+}
+
+void __init_or_module
+paravirt_write_slot2(bundle_t *bundle, ia64_inst_t inst)
+{
+	bundle->quad1.slot2 = inst.l;
+}
+
+void __init_or_module
+paravirt_write_inst(unsigned long tag, ia64_inst_t inst)
+{
+	bundle_t *bundle = paravirt_get_bundle(tag);
+	unsigned long slot = paravirt_get_slot(tag);
+
+	switch (slot) {
+	case 0:
+		paravirt_write_slot0(bundle, inst);
+		break;
+	case 1:
+		paravirt_write_slot1(bundle, inst);
+		break;
+	case 2:
+		paravirt_write_slot2(bundle, inst);
+		break;
+	default:
+		BUG();
+		break;
+	}
+	paravirt_flush_i_cache_range(bundle, sizeof(*bundle));
+}
+
+/* for debug */
+void
+paravirt_print_bundle(const bundle_t *bundle)
+{
+	const unsigned long *quad = (const unsigned long *)bundle;
+	ia64_inst_t slot0 = paravirt_read_slot0(bundle);
+	ia64_inst_t slot1 = paravirt_read_slot1(bundle);
+	ia64_inst_t slot2 = paravirt_read_slot2(bundle);
+
+	printk(KERN_DEBUG
+	       "bundle 0x%p 0x%016lx 0x%016lx\n", bundle, quad[0], quad[1]);
+	printk(KERN_DEBUG
+	       "bundle template 0x%x\n",
+	       bundle->quad0.template);
+	printk(KERN_DEBUG
+	       "slot0 0x%lx slot1_p0 0x%lx slot1_p1 0x%lx slot2 0x%lx\n",
+	       (unsigned long)bundle->quad0.slot0,
+	       (unsigned long)bundle->quad0.slot1_p0,
+	       (unsigned long)bundle->quad1.slot1_p1,
+	       (unsigned long)bundle->quad1.slot2);
+	printk(KERN_DEBUG
+	       "slot0 0x%016llx slot1 0x%016llx slot2 0x%016llx\n",
+	       slot0.l, slot1.l, slot2.l);
+}
+
+static int noreplace_paravirt __init_or_module = 0;
+
+static int __init setup_noreplace_paravirt(char *str)
+{
+	noreplace_paravirt = 1;
+	return 1;
+}
+__setup("noreplace-paravirt", setup_noreplace_paravirt);
+
+#ifdef ASM_SUPPORTED
+static void __init_or_module
+fill_nop_bundle(void *sbundle, void *ebundle)
+{
+	extern const char paravirt_nop_bundle[];
+	extern const unsigned long paravirt_nop_bundle_size;
+
+	void *bundle = sbundle;
+
+	BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0);
+	BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0);
+
+	while (bundle < ebundle) {
+		memcpy(bundle, paravirt_nop_bundle, paravirt_nop_bundle_size);
+
+		bundle += paravirt_nop_bundle_size;
+	}
+}
+
+/* helper function */
+unsigned long __init_or_module
+__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type,
+			      const struct paravirt_patch_bundle_elem *elems,
+			      unsigned long nelems,
+			      const struct paravirt_patch_bundle_elem **found)
+{
+	unsigned long used = 0;
+	unsigned long i;
+
+	BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0);
+	BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0);
+
+	found = NULL;
+	for (i = 0; i < nelems; i++) {
+		const struct paravirt_patch_bundle_elem *p = &elems[i];
+		if (p->type == type) {
+			unsigned long need = p->ebundle - p->sbundle;
+			unsigned long room = ebundle - sbundle;
+
+			if (found != NULL)
+				*found = p;
+
+			if (room < need) {
+				/* no room to replace. skip it */
+				printk(KERN_DEBUG
+				       "the space is too small to put "
+				       "bundles. type %ld need %ld room %ld\n",
+				       type, need, room);
+				break;
+			}
+
+			used = need;
+			memcpy(sbundle, p->sbundle, used);
+			break;
+		}
+	}
+
+	return used;
+}
+
+void __init_or_module
+paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start,
+			    const struct paravirt_patch_site_bundle *end)
+{
+	const struct paravirt_patch_site_bundle *p;
+
+	if (noreplace_paravirt)
+		return;
+	if (pv_init_ops.patch_bundle == NULL)
+		return;
+
+	for (p = start; p < end; p++) {
+		unsigned long used;
+
+		used = (*pv_init_ops.patch_bundle)(p->sbundle, p->ebundle,
+						   p->type);
+		if (used == 0)
+			continue;
+
+		fill_nop_bundle(p->sbundle + used, p->ebundle);
+		paravirt_flush_i_cache_range(p->sbundle,
+					     p->ebundle - p->sbundle);
+	}
+	ia64_sync_i();
+	ia64_srlz_i();
+}
+
+/*
+ * nop.i, nop.m, nop.f instruction are same format.
+ * but nop.b has differennt format.
+ * This doesn't support nop.b for now.
+ */
+static void __init_or_module
+fill_nop_inst(unsigned long stag, unsigned long etag)
+{
+	extern const bundle_t paravirt_nop_mfi_inst_bundle[];
+	unsigned long tag;
+	const ia64_inst_t nop_inst =
+		paravirt_read_slot0(paravirt_nop_mfi_inst_bundle);
+
+	for (tag = stag; tag < etag; tag = paravirt_get_next_tag(tag))
+		paravirt_write_inst(tag, nop_inst);
+}
+
+void __init_or_module
+paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start,
+			  const struct paravirt_patch_site_inst *end)
+{
+	const struct paravirt_patch_site_inst *p;
+
+	if (noreplace_paravirt)
+		return;
+	if (pv_init_ops.patch_inst == NULL)
+		return;
+
+	for (p = start; p < end; p++) {
+		unsigned long tag;
+		bundle_t *sbundle;
+		bundle_t *ebundle;
+
+		tag = (*pv_init_ops.patch_inst)(p->stag, p->etag, p->type);
+		if (tag == p->stag)
+			continue;
+
+		fill_nop_inst(tag, p->etag);
+		sbundle = paravirt_get_bundle(p->stag);
+		ebundle = paravirt_get_bundle(p->etag) + 1;
+		paravirt_flush_i_cache_range(sbundle, (ebundle - sbundle) *
+					     sizeof(bundle_t));
+	}
+	ia64_sync_i();
+	ia64_srlz_i();
+}
+#endif /* ASM_SUPPOTED */
+
+/* brl.cond.sptk.many <target64> X3 */
+typedef union inst_x3_op {
+	ia64_inst_t inst;
+	struct {
+		unsigned long qp: 6;
+		unsigned long btyp: 3;
+		unsigned long unused: 3;
+		unsigned long p: 1;
+		unsigned long imm20b: 20;
+		unsigned long wh: 2;
+		unsigned long d: 1;
+		unsigned long i: 1;
+		unsigned long opcode: 4;
+	};
+	unsigned long l;
+} inst_x3_op_t;
+
+typedef union inst_x3_imm {
+	ia64_inst_t inst;
+	struct {
+		unsigned long unused: 2;
+		unsigned long imm39: 39;
+	};
+	unsigned long l;
+} inst_x3_imm_t;
+
+void __init_or_module
+paravirt_patch_reloc_brl(unsigned long tag, const void *target)
+{
+	unsigned long tag_op = paravirt_get_next_tag(tag);
+	unsigned long tag_imm = tag;
+	bundle_t *bundle = paravirt_get_bundle(tag);
+
+	ia64_inst_t inst_op = paravirt_read_inst(tag_op);
+	ia64_inst_t inst_imm = paravirt_read_inst(tag_imm);
+
+	inst_x3_op_t inst_x3_op = { .l = inst_op.l };
+	inst_x3_imm_t inst_x3_imm = { .l = inst_imm.l };
+
+	unsigned long imm60 =
+		((unsigned long)target - (unsigned long)bundle) >> 4;
+
+	BUG_ON(paravirt_get_slot(tag) != 1); /* MLX */
+	BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0);
+
+	/* imm60[59] 1bit */
+	inst_x3_op.i = (imm60 >> 59) & 1;
+	/* imm60[19:0] 20bit */
+	inst_x3_op.imm20b = imm60 & ((1UL << 20) - 1);
+	/* imm60[58:20] 39bit */
+	inst_x3_imm.imm39 = (imm60 >> 20) & ((1UL << 39) - 1);
+
+	inst_op.l = inst_x3_op.l;
+	inst_imm.l = inst_x3_imm.l;
+
+	paravirt_write_inst(tag_op, inst_op);
+	paravirt_write_inst(tag_imm, inst_imm);
+}
+
+/* br.cond.sptk.many <target25>	B1 */
+typedef union inst_b1 {
+	ia64_inst_t inst;
+	struct {
+		unsigned long qp: 6;
+		unsigned long btype: 3;
+		unsigned long unused: 3;
+		unsigned long p: 1;
+		unsigned long imm20b: 20;
+		unsigned long wh: 2;
+		unsigned long d: 1;
+		unsigned long s: 1;
+		unsigned long opcode: 4;
+	};
+	unsigned long l;
+} inst_b1_t;
+
+void __init
+paravirt_patch_reloc_br(unsigned long tag, const void *target)
+{
+	bundle_t *bundle = paravirt_get_bundle(tag);
+	ia64_inst_t inst = paravirt_read_inst(tag);
+	unsigned long target25 = (unsigned long)target - (unsigned long)bundle;
+	inst_b1_t inst_b1;
+
+	BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0);
+
+	inst_b1.l = inst.l;
+	if (target25 & (1UL << 63))
+		inst_b1.s = 1;
+	else
+		inst_b1.s = 0;
+
+	inst_b1.imm20b = target25 >> 4;
+	inst.l = inst_b1.l;
+
+	paravirt_write_inst(tag, inst);
+}
+
+void __init
+__paravirt_patch_apply_branch(
+	unsigned long tag, unsigned long type,
+	const struct paravirt_patch_branch_target *entries,
+	unsigned int nr_entries)
+{
+	unsigned int i;
+	for (i = 0; i < nr_entries; i++) {
+		if (entries[i].type == type) {
+			paravirt_patch_reloc_br(tag, entries[i].entry);
+			break;
+		}
+	}
+}
+
+static void __init
+paravirt_patch_apply_branch(const struct paravirt_patch_site_branch *start,
+			    const struct paravirt_patch_site_branch *end)
+{
+	const struct paravirt_patch_site_branch *p;
+
+	if (noreplace_paravirt)
+		return;
+	if (pv_init_ops.patch_branch == NULL)
+		return;
+
+	for (p = start; p < end; p++)
+		(*pv_init_ops.patch_branch)(p->tag, p->type);
+
+	ia64_sync_i();
+	ia64_srlz_i();
+}
+
+void __init
+paravirt_patch_apply(void)
+{
+	extern const char __start_paravirt_bundles[];
+	extern const char __stop_paravirt_bundles[];
+	extern const char __start_paravirt_insts[];
+	extern const char __stop_paravirt_insts[];
+	extern const char __start_paravirt_branches[];
+	extern const char __stop_paravirt_branches[];
+
+	paravirt_patch_apply_bundle((const struct paravirt_patch_site_bundle *)
+				    __start_paravirt_bundles,
+				    (const struct paravirt_patch_site_bundle *)
+				    __stop_paravirt_bundles);
+	paravirt_patch_apply_inst((const struct paravirt_patch_site_inst *)
+				  __start_paravirt_insts,
+				  (const struct paravirt_patch_site_inst *)
+				  __stop_paravirt_insts);
+	paravirt_patch_apply_branch((const struct paravirt_patch_site_branch *)
+				    __start_paravirt_branches,
+				    (const struct paravirt_patch_site_branch *)
+				    __stop_paravirt_branches);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "linux"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * indent-tabs-mode: t
+ * End:
+ */
diff --git a/arch/ia64/kernel/paravirt_patchlist.c b/arch/ia64/kernel/paravirt_patchlist.c
new file mode 100644
index 00000000000..0a70720662e
--- /dev/null
+++ b/arch/ia64/kernel/paravirt_patchlist.c
@@ -0,0 +1,81 @@
+/******************************************************************************
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <asm/paravirt.h>
+
+#define DECLARE(name)						\
+	extern unsigned long					\
+		__ia64_native_start_gate_##name##_patchlist[];	\
+	extern unsigned long					\
+		__ia64_native_end_gate_##name##_patchlist[]
+
+DECLARE(fsyscall);
+DECLARE(brl_fsys_bubble_down);
+DECLARE(vtop);
+DECLARE(mckinley_e9);
+
+extern unsigned long __start_gate_section[];
+
+#define ASSIGN(name)							    \
+	.start_##name##_patchlist =					    \
+		(unsigned long)__ia64_native_start_gate_##name##_patchlist, \
+	.end_##name##_patchlist =					    \
+		(unsigned long)__ia64_native_end_gate_##name##_patchlist
+
+struct pv_patchdata pv_patchdata __initdata = {
+	ASSIGN(fsyscall),
+	ASSIGN(brl_fsys_bubble_down),
+	ASSIGN(vtop),
+	ASSIGN(mckinley_e9),
+
+	.gate_section = (void*)__start_gate_section,
+};
+
+
+unsigned long __init
+paravirt_get_gate_patchlist(enum pv_gate_patchlist type)
+{
+
+#define CASE(NAME, name)					\
+	case PV_GATE_START_##NAME:				\
+		return pv_patchdata.start_##name##_patchlist;	\
+	case PV_GATE_END_##NAME:				\
+		return pv_patchdata.end_##name##_patchlist;	\
+
+	switch (type) {
+		CASE(FSYSCALL, fsyscall);
+		CASE(BRL_FSYS_BUBBLE_DOWN, brl_fsys_bubble_down);
+		CASE(VTOP, vtop);
+		CASE(MCKINLEY_E9, mckinley_e9);
+	default:
+		BUG();
+		break;
+	}
+	return 0;
+}
+
+void * __init
+paravirt_get_gate_section(void)
+{
+	return pv_patchdata.gate_section;
+}
diff --git a/arch/ia64/kernel/paravirt_patchlist.h b/arch/ia64/kernel/paravirt_patchlist.h
new file mode 100644
index 00000000000..67cffc3643a
--- /dev/null
+++ b/arch/ia64/kernel/paravirt_patchlist.h
@@ -0,0 +1,24 @@
+/******************************************************************************
+ * linux/arch/ia64/xen/paravirt_patchlist.h
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <asm/native/patchlist.h>
+
diff --git a/arch/ia64/kernel/paravirtentry.S b/arch/ia64/kernel/paravirtentry.S
new file mode 100644
index 00000000000..92d880c4d3d
--- /dev/null
+++ b/arch/ia64/kernel/paravirtentry.S
@@ -0,0 +1,121 @@
+/******************************************************************************
+ * linux/arch/ia64/xen/paravirtentry.S
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/asmmacro.h>
+#include <asm/asm-offsets.h>
+#include <asm/paravirt_privop.h>
+#include <asm/paravirt_patch.h>
+#include "entry.h"
+
+#define DATA8(sym, init_value)			\
+	.pushsection .data..read_mostly ;	\
+	.align 8 ;				\
+	.global sym ;				\
+	sym: ;					\
+	data8 init_value ;			\
+	.popsection
+
+#define BRANCH(targ, reg, breg, type)					\
+	PARAVIRT_PATCH_SITE_BR(PARAVIRT_PATCH_TYPE_BR_ ## type) ;	\
+	;;								\
+	movl reg=targ ;							\
+	;;								\
+	ld8 reg=[reg] ;							\
+	;;								\
+	mov breg=reg ;							\
+	br.cond.sptk.many breg
+
+#define BRANCH_PROC(sym, reg, breg, type)				\
+	DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ;		\
+	GLOBAL_ENTRY(paravirt_ ## sym) ;				\
+		BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ;	\
+	END(paravirt_ ## sym)
+
+#define BRANCH_PROC_UNWINFO(sym, reg, breg, type)			\
+	DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ;		\
+	GLOBAL_ENTRY(paravirt_ ## sym) ;				\
+		PT_REGS_UNWIND_INFO(0) ;				\
+		BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ;	\
+	END(paravirt_ ## sym)
+
+
+BRANCH_PROC(switch_to, r22, b7, SWITCH_TO)
+BRANCH_PROC_UNWINFO(leave_syscall, r22, b7, LEAVE_SYSCALL)
+BRANCH_PROC(work_processed_syscall, r2, b7, WORK_PROCESSED_SYSCALL)
+BRANCH_PROC_UNWINFO(leave_kernel, r22, b7, LEAVE_KERNEL)
+
+
+#ifdef CONFIG_MODULES
+#define __INIT_OR_MODULE	.text
+#define __INITDATA_OR_MODULE	.data
+#else
+#define __INIT_OR_MODULE	__INIT
+#define __INITDATA_OR_MODULE	__INITDATA
+#endif /* CONFIG_MODULES */
+
+	__INIT_OR_MODULE
+	GLOBAL_ENTRY(paravirt_fc_i)
+	fc.i r32
+	br.ret.sptk.many rp
+	END(paravirt_fc_i)
+	__FINIT
+
+	__INIT_OR_MODULE
+	.align 32
+	GLOBAL_ENTRY(paravirt_nop_b_inst_bundle)
+	{
+		nop.b 0
+		nop.b 0
+		nop.b 0
+	}
+	END(paravirt_nop_b_inst_bundle)
+	__FINIT
+
+	/* NOTE: nop.[mfi] has same format */
+	__INIT_OR_MODULE
+	GLOBAL_ENTRY(paravirt_nop_mfi_inst_bundle)
+	{
+		nop.m 0
+		nop.f 0
+		nop.i 0
+	}
+	END(paravirt_nop_mfi_inst_bundle)
+	__FINIT
+
+	__INIT_OR_MODULE
+	GLOBAL_ENTRY(paravirt_nop_bundle)
+paravirt_nop_bundle_start:
+	{
+		nop 0
+		nop 0
+		nop 0
+	}
+paravirt_nop_bundle_end:
+	END(paravirt_nop_bundle)
+	__FINIT
+
+	__INITDATA_OR_MODULE
+	.align 8
+	.global paravirt_nop_bundle_size
+paravirt_nop_bundle_size:
+	data8	paravirt_nop_bundle_end - paravirt_nop_bundle_start
diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c
index 6a4ac7d70b3..1cf09179371 100644
--- a/arch/ia64/kernel/patch.c
+++ b/arch/ia64/kernel/patch.c
@@ -7,10 +7,10 @@
 #include <linux/init.h>
 #include <linux/string.h>
 
+#include <asm/paravirt.h>
 #include <asm/patch.h>
 #include <asm/processor.h>
 #include <asm/sections.h>
-#include <asm/system.h>
 #include <asm/unistd.h>
 
 /*
@@ -115,7 +115,30 @@ ia64_patch_vtop (unsigned long start, unsigned long end)
 	ia64_srlz_i();
 }
 
-void
+/*
+ * Disable the RSE workaround by turning the conditional branch
+ * that we tagged in each place the workaround was used into an
+ * unconditional branch.
+ */
+void __init
+ia64_patch_rse (unsigned long start, unsigned long end)
+{
+	s32 *offp = (s32 *) start;
+	u64 ip, *b;
+
+	while (offp < (s32 *) end) {
+		ip = (u64) offp + *offp;
+
+		b = (u64 *)(ip & -16);
+		b[1] &= ~0xf800000L;
+		ia64_fc((void *) ip);
+		++offp;
+	}
+	ia64_sync_i();
+	ia64_srlz_i();
+}
+
+void __init
 ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
 {
 	static int first_time = 1;
@@ -129,19 +152,16 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
 		first_time = 0;
 		if (need_workaround)
 			printk(KERN_INFO "Leaving McKinley Errata 9 workaround enabled\n");
-		else
-			printk(KERN_INFO "McKinley Errata 9 workaround not needed; "
-			       "disabling it\n");
 	}
 	if (need_workaround)
 		return;
 
 	while (offp < (s32 *) end) {
 		wp = (u64 *) ia64_imva((char *) offp + *offp);
-		wp[0] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */
-		wp[1] = 0x0004000000000200UL;
-		wp[2] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */
-		wp[3] = 0x0084006880000200UL;
+		wp[0] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */
+		wp[1] = 0x0084006880000200UL;
+		wp[2] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */
+		wp[3] = 0x0004000000000200UL;
 		ia64_fc(wp); ia64_fc(wp + 2);
 		++offp;
 	}
@@ -149,16 +169,35 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
 	ia64_srlz_i();
 }
 
-static void
+extern unsigned long ia64_native_fsyscall_table[NR_syscalls];
+extern char ia64_native_fsys_bubble_down[];
+struct pv_fsys_data pv_fsys_data __initdata = {
+	.fsyscall_table = (unsigned long *)ia64_native_fsyscall_table,
+	.fsys_bubble_down = (void *)ia64_native_fsys_bubble_down,
+};
+
+unsigned long * __init
+paravirt_get_fsyscall_table(void)
+{
+	return pv_fsys_data.fsyscall_table;
+}
+
+char * __init
+paravirt_get_fsys_bubble_down(void)
+{
+	return pv_fsys_data.fsys_bubble_down;
+}
+
+static void __init
 patch_fsyscall_table (unsigned long start, unsigned long end)
 {
-	extern unsigned long fsyscall_table[NR_syscalls];
+	u64 fsyscall_table = (u64)paravirt_get_fsyscall_table();
 	s32 *offp = (s32 *) start;
 	u64 ip;
 
 	while (offp < (s32 *) end) {
 		ip = (u64) ia64_imva((char *) offp + *offp);
-		ia64_patch_imm64(ip, (u64) fsyscall_table);
+		ia64_patch_imm64(ip, fsyscall_table);
 		ia64_fc((void *) ip);
 		++offp;
 	}
@@ -166,10 +205,10 @@ patch_fsyscall_table (unsigned long start, unsigned long end)
 	ia64_srlz_i();
 }
 
-static void
+static void __init
 patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
 {
-	extern char fsys_bubble_down[];
+	u64 fsys_bubble_down = (u64)paravirt_get_fsys_bubble_down();
 	s32 *offp = (s32 *) start;
 	u64 ip;
 
@@ -184,14 +223,34 @@ patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
 	ia64_srlz_i();
 }
 
-void
+void __init
 ia64_patch_gate (void)
 {
-#	define START(name)	((unsigned long) __start_gate_##name##_patchlist)
-#	define END(name)	((unsigned long)__end_gate_##name##_patchlist)
+#	define START(name)	paravirt_get_gate_patchlist(PV_GATE_START_##name)
+#	define END(name)	paravirt_get_gate_patchlist(PV_GATE_END_##name)
+
+	patch_fsyscall_table(START(FSYSCALL), END(FSYSCALL));
+	patch_brl_fsys_bubble_down(START(BRL_FSYS_BUBBLE_DOWN), END(BRL_FSYS_BUBBLE_DOWN));
+	ia64_patch_vtop(START(VTOP), END(VTOP));
+	ia64_patch_mckinley_e9(START(MCKINLEY_E9), END(MCKINLEY_E9));
+}
+
+void ia64_patch_phys_stack_reg(unsigned long val)
+{
+	s32 * offp = (s32 *) __start___phys_stack_reg_patchlist;
+	s32 * end = (s32 *) __end___phys_stack_reg_patchlist;
+	u64 ip, mask, imm;
 
-	patch_fsyscall_table(START(fsyscall), END(fsyscall));
-	patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down));
-	ia64_patch_vtop(START(vtop), END(vtop));
-	ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9));
+	/* see instruction format A4: adds r1 = imm13, r3 */
+	mask = (0x3fUL << 27) | (0x7f << 13);
+	imm = (((val >> 7) & 0x3f) << 27) | (val & 0x7f) << 13;
+
+	while (offp < end) {
+		ip = (u64) offp + *offp;
+		ia64_patch(ip, mask, imm);
+		ia64_fc((void *)ip);
+		++offp;
+	}
+	ia64_sync_i();
+	ia64_srlz_i();
 }
diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c
new file mode 100644
index 00000000000..992c1098c52
--- /dev/null
+++ b/arch/ia64/kernel/pci-dma.c
@@ -0,0 +1,110 @@
+/*
+ * Dynamic DMA mapping support.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/dmar.h>
+#include <asm/iommu.h>
+#include <asm/machvec.h>
+#include <linux/dma-mapping.h>
+
+
+#ifdef CONFIG_INTEL_IOMMU
+
+#include <linux/kernel.h>
+
+#include <asm/page.h>
+
+dma_addr_t bad_dma_address __read_mostly;
+EXPORT_SYMBOL(bad_dma_address);
+
+static int iommu_sac_force __read_mostly;
+
+int no_iommu __read_mostly;
+#ifdef CONFIG_IOMMU_DEBUG
+int force_iommu __read_mostly = 1;
+#else
+int force_iommu __read_mostly;
+#endif
+
+int iommu_pass_through;
+
+extern struct dma_map_ops intel_dma_ops;
+
+static int __init pci_iommu_init(void)
+{
+	if (iommu_detected)
+		intel_iommu_init();
+
+	return 0;
+}
+
+/* Must execute after PCI subsystem */
+fs_initcall(pci_iommu_init);
+
+void pci_iommu_shutdown(void)
+{
+	return;
+}
+
+void __init
+iommu_dma_init(void)
+{
+	return;
+}
+
+int iommu_dma_supported(struct device *dev, u64 mask)
+{
+	/* Copied from i386. Doesn't make much sense, because it will
+	   only work for pci_alloc_coherent.
+	   The caller just has to use GFP_DMA in this case. */
+	if (mask < DMA_BIT_MASK(24))
+		return 0;
+
+	/* Tell the device to use SAC when IOMMU force is on.  This
+	   allows the driver to use cheaper accesses in some cases.
+
+	   Problem with this is that if we overflow the IOMMU area and
+	   return DAC as fallback address the device may not handle it
+	   correctly.
+
+	   As a special case some controllers have a 39bit address
+	   mode that is as efficient as 32bit (aic79xx). Don't force
+	   SAC for these.  Assume all masks <= 40 bits are of this
+	   type. Normally this doesn't make any difference, but gives
+	   more gentle handling of IOMMU overflow. */
+	if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) {
+		dev_info(dev, "Force SAC with mask %llx\n", mask);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(iommu_dma_supported);
+
+void __init pci_iommu_alloc(void)
+{
+	dma_ops = &intel_dma_ops;
+
+	dma_ops->sync_single_for_cpu = machvec_dma_sync_single;
+	dma_ops->sync_sg_for_cpu = machvec_dma_sync_sg;
+	dma_ops->sync_single_for_device = machvec_dma_sync_single;
+	dma_ops->sync_sg_for_device = machvec_dma_sync_sg;
+	dma_ops->dma_supported = iommu_dma_supported;
+
+	/*
+	 * The order of these functions is important for
+	 * fall-back/fail-over reasons
+	 */
+	detect_intel_iommu();
+
+#ifdef CONFIG_SWIOTLB
+	pci_swiotlb_init();
+#endif
+}
+
+#endif
diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
new file mode 100644
index 00000000000..939260aeac9
--- /dev/null
+++ b/arch/ia64/kernel/pci-swiotlb.c
@@ -0,0 +1,67 @@
+/* Glue code to lib/swiotlb.c */
+
+#include <linux/pci.h>
+#include <linux/gfp.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+#include <asm/iommu.h>
+#include <asm/machvec.h>
+
+int swiotlb __read_mostly;
+EXPORT_SYMBOL(swiotlb);
+
+static void *ia64_swiotlb_alloc_coherent(struct device *dev, size_t size,
+					 dma_addr_t *dma_handle, gfp_t gfp,
+					 struct dma_attrs *attrs)
+{
+	if (dev->coherent_dma_mask != DMA_BIT_MASK(64))
+		gfp |= GFP_DMA;
+	return swiotlb_alloc_coherent(dev, size, dma_handle, gfp);
+}
+
+static void ia64_swiotlb_free_coherent(struct device *dev, size_t size,
+				       void *vaddr, dma_addr_t dma_addr,
+				       struct dma_attrs *attrs)
+{
+	swiotlb_free_coherent(dev, size, vaddr, dma_addr);
+}
+
+struct dma_map_ops swiotlb_dma_ops = {
+	.alloc = ia64_swiotlb_alloc_coherent,
+	.free = ia64_swiotlb_free_coherent,
+	.map_page = swiotlb_map_page,
+	.unmap_page = swiotlb_unmap_page,
+	.map_sg = swiotlb_map_sg_attrs,
+	.unmap_sg = swiotlb_unmap_sg_attrs,
+	.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+	.sync_single_for_device = swiotlb_sync_single_for_device,
+	.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+	.sync_sg_for_device = swiotlb_sync_sg_for_device,
+	.dma_supported = swiotlb_dma_supported,
+	.mapping_error = swiotlb_dma_mapping_error,
+};
+
+void __init swiotlb_dma_init(void)
+{
+	dma_ops = &swiotlb_dma_ops;
+	swiotlb_init(1);
+}
+
+void __init pci_swiotlb_init(void)
+{
+	if (!iommu_detected) {
+#ifdef CONFIG_IA64_GENERIC
+		swiotlb = 1;
+		printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
+		machvec_init("dig");
+		swiotlb_init(1);
+		dma_ops = &swiotlb_dma_ops;
+#else
+		panic("Unable to find Intel IOMMU");
+#endif
+	}
+}
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index f7dfc107cb7..5845ffea67c 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -19,12 +19,10 @@
  * 	http://www.hpl.hp.com/research/linux/perfmon
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/interrupt.h>
-#include <linux/smp_lock.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/init.h>
@@ -35,10 +33,16 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/vfs.h>
+#include <linux/smp.h>
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/bitops.h>
+#include <linux/capability.h>
 #include <linux/rcupdate.h>
+#include <linux/completion.h>
+#include <linux/tracehook.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
 
 #include <asm/errno.h>
 #include <asm/intrinsics.h>
@@ -46,7 +50,6 @@
 #include <asm/perfmon.h>
 #include <asm/processor.h>
 #include <asm/signal.h>
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/delay.h>
 
@@ -61,6 +64,9 @@
 
 #define PFM_INVALID_ACTIVATION	(~0UL)
 
+#define PFM_NUM_PMC_REGS	64	/* PMC save area for ctxsw */
+#define PFM_NUM_PMD_REGS	64	/* PMD save area for ctxsw */
+
 /*
  * depth of message queue
  */
@@ -142,7 +148,7 @@
  * in UP:
  * 	- we need to protect against PMU overflow interrupts (local_irq_disable)
  *
- * spin_lock_irqsave()/spin_lock_irqrestore():
+ * spin_lock_irqsave()/spin_unlock_irqrestore():
  * 	in SMP: local_irq_disable + spin_lock
  * 	in UP : local_irq_disable
  *
@@ -154,14 +160,14 @@
  */
 #define PROTECT_CTX(c, f) \
 	do {  \
-		DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, current->pid)); \
+		DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, task_pid_nr(current))); \
 		spin_lock_irqsave(&(c)->ctx_lock, f); \
-		DPRINT(("spinlocked ctx %p  by [%d]\n", c, current->pid)); \
+		DPRINT(("spinlocked ctx %p  by [%d]\n", c, task_pid_nr(current))); \
 	} while(0)
 
 #define UNPROTECT_CTX(c, f) \
 	do { \
-		DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, current->pid)); \
+		DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, task_pid_nr(current))); \
 		spin_unlock_irqrestore(&(c)->ctx_lock, f); \
 	} while(0)
 
@@ -223,12 +229,12 @@
 #ifdef PFM_DEBUGGING
 #define DPRINT(a) \
 	do { \
-		if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \
+		if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __func__, __LINE__, smp_processor_id(), task_pid_nr(current)); printk a; } \
 	} while (0)
 
 #define DPRINT_ovfl(a) \
 	do { \
-		if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \
+		if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __func__, __LINE__, smp_processor_id(), task_pid_nr(current)); printk a; } \
 	} while (0)
 #endif
 
@@ -285,7 +291,7 @@ typedef struct pfm_context {
 
 	unsigned long		ctx_ovfl_regs[4];	/* which registers overflowed (notification) */
 
-	struct semaphore	ctx_restart_sem;   	/* use for blocking notification mode */
+	struct completion	ctx_restart_done;  	/* use for blocking notification mode */
 
 	unsigned long		ctx_used_pmds[4];	/* bitmask of PMD used            */
 	unsigned long		ctx_all_pmds[4];	/* bitmask of all accessible PMDs */
@@ -295,16 +301,19 @@ typedef struct pfm_context {
 	unsigned long		ctx_reload_pmcs[4];	/* bitmask of force reload PMC on ctxsw in */
 	unsigned long		ctx_used_monitors[4];	/* bitmask of monitor PMC being used */
 
-	unsigned long		ctx_pmcs[IA64_NUM_PMC_REGS];	/*  saved copies of PMC values */
+	unsigned long		ctx_pmcs[PFM_NUM_PMC_REGS];	/*  saved copies of PMC values */
 
 	unsigned int		ctx_used_ibrs[1];		/* bitmask of used IBR (speedup ctxsw in) */
 	unsigned int		ctx_used_dbrs[1];		/* bitmask of used DBR (speedup ctxsw in) */
 	unsigned long		ctx_dbrs[IA64_NUM_DBG_REGS];	/* DBR values (cache) when not loaded */
 	unsigned long		ctx_ibrs[IA64_NUM_DBG_REGS];	/* IBR values (cache) when not loaded */
 
-	pfm_counter_t		ctx_pmds[IA64_NUM_PMD_REGS]; /* software state for PMDS */
+	pfm_counter_t		ctx_pmds[PFM_NUM_PMD_REGS]; /* software state for PMDS */
+
+	unsigned long		th_pmcs[PFM_NUM_PMC_REGS];	/* PMC thread save state */
+	unsigned long		th_pmds[PFM_NUM_PMD_REGS];	/* PMD thread save state */
 
-	u64			ctx_saved_psr_up;	/* only contains psr.up value */
+	unsigned long		ctx_saved_psr_up;	/* only contains psr.up value */
 
 	unsigned long		ctx_last_activation;	/* context last activation number for last_cpu */
 	unsigned int		ctx_last_cpu;		/* CPU id of current or last CPU used (SMP only) */
@@ -512,25 +521,56 @@ static pmu_config_t		*pmu_conf;
 pfm_sysctl_t pfm_sysctl;
 EXPORT_SYMBOL(pfm_sysctl);
 
-static ctl_table pfm_ctl_table[]={
-	{1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
-	{2, "debug_ovfl", &pfm_sysctl.debug_ovfl, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
-	{3, "fastctxsw", &pfm_sysctl.fastctxsw, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},
-	{4, "expert_mode", &pfm_sysctl.expert_mode, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},
-	{ 0, },
+static struct ctl_table pfm_ctl_table[] = {
+	{
+		.procname	= "debug",
+		.data		= &pfm_sysctl.debug,
+		.maxlen		= sizeof(int),
+		.mode		= 0666,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "debug_ovfl",
+		.data		= &pfm_sysctl.debug_ovfl,
+		.maxlen		= sizeof(int),
+		.mode		= 0666,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "fastctxsw",
+		.data		= &pfm_sysctl.fastctxsw,
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "expert_mode",
+		.data		= &pfm_sysctl.expert_mode,
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec,
+	},
+	{}
 };
-static ctl_table pfm_sysctl_dir[] = {
-	{1, "perfmon", NULL, 0, 0755, pfm_ctl_table, },
- 	{0,},
+static struct ctl_table pfm_sysctl_dir[] = {
+	{
+		.procname	= "perfmon",
+		.mode		= 0555,
+		.child		= pfm_ctl_table,
+	},
+ 	{}
 };
-static ctl_table pfm_sysctl_root[] = {
-	{1, "kernel", NULL, 0, 0755, pfm_sysctl_dir, },
- 	{0,},
+static struct ctl_table pfm_sysctl_root[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= pfm_sysctl_dir,
+	},
+ 	{}
 };
 static struct ctl_table_header *pfm_sysctl_header;
 
 static int pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
-static int pfm_flush(struct file *filp);
 
 #define pfm_get_cpu_var(v)		__ia64_per_cpu_var(v)
 #define pfm_get_cpu_data(a,b)		per_cpu(a, b)
@@ -542,21 +582,6 @@ pfm_put_task(struct task_struct *task)
 }
 
 static inline void
-pfm_set_task_notify(struct task_struct *task)
-{
-	struct thread_info *info;
-
-	info = (struct thread_info *) ((char *) task + IA64_TASK_SIZE);
-	set_bit(TIF_NOTIFY_RESUME, &info->flags);
-}
-
-static inline void
-pfm_clear_task_notify(void)
-{
-	clear_thread_flag(TIF_NOTIFY_RESUME);
-}
-
-static inline void
 pfm_reserve_page(unsigned long a)
 {
 	SetPageReserved(vmalloc_to_page((void *)a));
@@ -580,30 +605,22 @@ pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f)
 	spin_unlock(&(x)->ctx_lock);
 }
 
-static inline unsigned int
-pfm_do_munmap(struct mm_struct *mm, unsigned long addr, size_t len, int acct)
-{
-	return do_munmap(mm, addr, len);
-}
-
-static inline unsigned long 
-pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec)
-{
-	return get_unmapped_area(file, addr, len, pgoff, flags);
-}
-
+/* forward declaration */
+static const struct dentry_operations pfmfs_dentry_operations;
 
-static struct super_block *
-pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
+static struct dentry *
+pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
 {
-	return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC);
+	return mount_pseudo(fs_type, "pfm:", NULL, &pfmfs_dentry_operations,
+			PFMFS_MAGIC);
 }
 
 static struct file_system_type pfm_fs_type = {
 	.name     = "pfmfs",
-	.get_sb   = pfmfs_get_sb,
+	.mount    = pfmfs_mount,
 	.kill_sb  = kill_anon_super,
 };
+MODULE_ALIAS_FS("pfmfs");
 
 DEFINE_PER_CPU(unsigned long, pfm_syst_info);
 DEFINE_PER_CPU(struct task_struct *, pmu_owner);
@@ -613,7 +630,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(pfm_syst_info);
 
 
 /* forward declaration */
-static struct file_operations pfm_file_ops;
+static const struct file_operations pfm_file_ops;
 
 /*
  * forward declarations
@@ -627,9 +644,11 @@ static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count,
 
 #include "perfmon_itanium.h"
 #include "perfmon_mckinley.h"
+#include "perfmon_montecito.h"
 #include "perfmon_generic.h"
 
 static pmu_config_t *pmu_confs[]={
+	&pmu_conf_mont,
 	&pmu_conf_mck,
 	&pmu_conf_ita,
 	&pmu_conf_gen, /* must be last */
@@ -802,10 +821,9 @@ pfm_rvmalloc(unsigned long size)
 	unsigned long addr;
 
 	size = PAGE_ALIGN(size);
-	mem  = vmalloc(size);
+	mem  = vzalloc(size);
 	if (mem) {
 		//printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem);
-		memset(mem, 0, size);
 		addr = (unsigned long)mem;
 		while (size > 0) {
 			pfm_reserve_page(addr);
@@ -835,7 +853,7 @@ pfm_rvfree(void *mem, unsigned long size)
 }
 
 static pfm_context_t *
-pfm_context_alloc(void)
+pfm_context_alloc(int ctx_flags)
 {
 	pfm_context_t *ctx;
 
@@ -843,10 +861,49 @@ pfm_context_alloc(void)
 	 * allocate context descriptor 
 	 * must be able to free with interrupts disabled
 	 */
-	ctx = kmalloc(sizeof(pfm_context_t), GFP_KERNEL);
+	ctx = kzalloc(sizeof(pfm_context_t), GFP_KERNEL);
 	if (ctx) {
-		memset(ctx, 0, sizeof(pfm_context_t));
 		DPRINT(("alloc ctx @%p\n", ctx));
+
+		/*
+		 * init context protection lock
+		 */
+		spin_lock_init(&ctx->ctx_lock);
+
+		/*
+		 * context is unloaded
+		 */
+		ctx->ctx_state = PFM_CTX_UNLOADED;
+
+		/*
+		 * initialization of context's flags
+		 */
+		ctx->ctx_fl_block       = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0;
+		ctx->ctx_fl_system      = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0;
+		ctx->ctx_fl_no_msg      = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0;
+		/*
+		 * will move to set properties
+		 * ctx->ctx_fl_excl_idle   = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0;
+		 */
+
+		/*
+		 * init restart semaphore to locked
+		 */
+		init_completion(&ctx->ctx_restart_done);
+
+		/*
+		 * activation is used in SMP only
+		 */
+		ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
+		SET_LAST_CPU(ctx, -1);
+
+		/*
+		 * initialize notification message queue
+		 */
+		ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0;
+		init_waitqueue_head(&ctx->ctx_msgq_wait);
+		init_waitqueue_head(&ctx->ctx_zombieq);
+
 	}
 	return ctx;
 }
@@ -864,11 +921,10 @@ static void
 pfm_mask_monitoring(struct task_struct *task)
 {
 	pfm_context_t *ctx = PFM_GET_CTX(task);
-	struct thread_struct *th = &task->thread;
 	unsigned long mask, val, ovfl_mask;
 	int i;
 
-	DPRINT_ovfl(("masking monitoring for [%d]\n", task->pid));
+	DPRINT_ovfl(("masking monitoring for [%d]\n", task_pid_nr(task)));
 
 	ovfl_mask = pmu_conf->ovfl_val;
 	/*
@@ -885,7 +941,7 @@ pfm_mask_monitoring(struct task_struct *task)
 	 * So in both cases, the live register contains the owner's
 	 * state. We can ONLY touch the PMU registers and NOT the PSR.
 	 *
-	 * As a consequence to this call, the thread->pmds[] array
+	 * As a consequence to this call, the ctx->th_pmds[] array
 	 * contains stale information which must be ignored
 	 * when context is reloaded AND monitoring is active (see
 	 * pfm_restart).
@@ -920,9 +976,9 @@ pfm_mask_monitoring(struct task_struct *task)
 	mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER;
 	for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) {
 		if ((mask & 0x1) == 0UL) continue;
-		ia64_set_pmc(i, th->pmcs[i] & ~0xfUL);
-		th->pmcs[i] &= ~0xfUL;
-		DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, th->pmcs[i]));
+		ia64_set_pmc(i, ctx->th_pmcs[i] & ~0xfUL);
+		ctx->th_pmcs[i] &= ~0xfUL;
+		DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, ctx->th_pmcs[i]));
 	}
 	/*
 	 * make all of this visible
@@ -939,7 +995,6 @@ static void
 pfm_restore_monitoring(struct task_struct *task)
 {
 	pfm_context_t *ctx = PFM_GET_CTX(task);
-	struct thread_struct *th = &task->thread;
 	unsigned long mask, ovfl_mask;
 	unsigned long psr, val;
 	int i, is_system;
@@ -948,12 +1003,12 @@ pfm_restore_monitoring(struct task_struct *task)
 	ovfl_mask = pmu_conf->ovfl_val;
 
 	if (task != current) {
-		printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid);
+		printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task_pid_nr(task), task_pid_nr(current));
 		return;
 	}
 	if (ctx->ctx_state != PFM_CTX_MASKED) {
 		printk(KERN_ERR "perfmon.%d: task[%d] current[%d] invalid state=%d\n", __LINE__,
-			task->pid, current->pid, ctx->ctx_state);
+			task_pid_nr(task), task_pid_nr(current), ctx->ctx_state);
 		return;
 	}
 	psr = pfm_get_psr();
@@ -1005,9 +1060,10 @@ pfm_restore_monitoring(struct task_struct *task)
 	mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER;
 	for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) {
 		if ((mask & 0x1) == 0UL) continue;
-		th->pmcs[i] = ctx->ctx_pmcs[i];
-		ia64_set_pmc(i, th->pmcs[i]);
-		DPRINT(("[%d] pmc[%d]=0x%lx\n", task->pid, i, th->pmcs[i]));
+		ctx->th_pmcs[i] = ctx->ctx_pmcs[i];
+		ia64_set_pmc(i, ctx->th_pmcs[i]);
+		DPRINT(("[%d] pmc[%d]=0x%lx\n",
+					task_pid_nr(task), i, ctx->th_pmcs[i]));
 	}
 	ia64_srlz_d();
 
@@ -1066,7 +1122,6 @@ pfm_restore_pmds(unsigned long *pmds, unsigned long mask)
 static inline void
 pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx)
 {
-	struct thread_struct *thread = &task->thread;
 	unsigned long ovfl_val = pmu_conf->ovfl_val;
 	unsigned long mask = ctx->ctx_all_pmds[0];
 	unsigned long val;
@@ -1088,11 +1143,11 @@ pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx)
 			ctx->ctx_pmds[i].val = val & ~ovfl_val;
 			 val &= ovfl_val;
 		}
-		thread->pmds[i] = val;
+		ctx->th_pmds[i] = val;
 
 		DPRINT(("pmd[%d]=0x%lx soft_val=0x%lx\n",
 			i,
-			thread->pmds[i],
+			ctx->th_pmds[i],
 			ctx->ctx_pmds[i].val));
 	}
 }
@@ -1103,7 +1158,6 @@ pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx)
 static inline void
 pfm_copy_pmcs(struct task_struct *task, pfm_context_t *ctx)
 {
-	struct thread_struct *thread = &task->thread;
 	unsigned long mask = ctx->ctx_all_pmcs[0];
 	int i;
 
@@ -1111,8 +1165,8 @@ pfm_copy_pmcs(struct task_struct *task, pfm_context_t *ctx)
 
 	for (i=0; mask; i++, mask>>=1) {
 		/* masking 0 with ovfl_val yields 0 */
-		thread->pmcs[i] = ctx->ctx_pmcs[i];
-		DPRINT(("pmc[%d]=0x%lx\n", i, thread->pmcs[i]));
+		ctx->th_pmcs[i] = ctx->ctx_pmcs[i];
+		DPRINT(("pmc[%d]=0x%lx\n", i, ctx->th_pmcs[i]));
 	}
 }
 
@@ -1269,14 +1323,12 @@ out:
 }
 EXPORT_SYMBOL(pfm_unregister_buffer_fmt);
 
-extern void update_pal_halt_status(int);
-
 static int
 pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu)
 {
 	unsigned long flags;
 	/*
-	 * validy checks on cpu_mask have been done upstream
+	 * validity checks on cpu_mask have been done upstream
 	 */
 	LOCK_PFS(flags);
 
@@ -1318,9 +1370,9 @@ pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu)
 		cpu));
 
 	/*
-	 * disable default_idle() to go to PAL_HALT
+	 * Force idle() into poll mode
 	 */
-	update_pal_halt_status(0);
+	cpu_idle_poll_ctrl(true);
 
 	UNLOCK_PFS(flags);
 
@@ -1328,7 +1380,7 @@ pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu)
 
 error_conflict:
 	DPRINT(("system wide not possible, conflicting session [%d] on CPU%d\n",
-  		pfm_sessions.pfs_sys_session[cpu]->pid,
+  		task_pid_nr(pfm_sessions.pfs_sys_session[cpu]),
 		cpu));
 abort:
 	UNLOCK_PFS(flags);
@@ -1342,7 +1394,7 @@ pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu)
 {
 	unsigned long flags;
 	/*
-	 * validy checks on cpu_mask have been done upstream
+	 * validity checks on cpu_mask have been done upstream
 	 */
 	LOCK_PFS(flags);
 
@@ -1377,11 +1429,8 @@ pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu)
 		is_syswide,
 		cpu));
 
-	/*
-	 * if possible, enable default_idle() to go into PAL_HALT
-	 */
-	if (pfm_sessions.pfs_task_sessions == 0 && pfm_sessions.pfs_sys_sessions == 0)
-		update_pal_halt_status(1);
+	/* Undo forced polling. Last session reenables pal_halt */
+	cpu_idle_poll_ctrl(false);
 
 	UNLOCK_PFS(flags);
 
@@ -1394,13 +1443,14 @@ pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu)
  * a PROTECT_CTX() section.
  */
 static int
-pfm_remove_smpl_mapping(struct task_struct *task, void *vaddr, unsigned long size)
+pfm_remove_smpl_mapping(void *vaddr, unsigned long size)
 {
+	struct task_struct *task = current;
 	int r;
 
 	/* sanity checks */
 	if (task->mm == NULL || size == 0UL || vaddr == NULL) {
-		printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task->pid, task->mm);
+		printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task_pid_nr(task), task->mm);
 		return -EINVAL;
 	}
 
@@ -1409,15 +1459,10 @@ pfm_remove_smpl_mapping(struct task_struct *task, void *vaddr, unsigned long siz
 	/*
 	 * does the actual unmapping
 	 */
-	down_write(&task->mm->mmap_sem);
-
-	DPRINT(("down_write done smpl_vaddr=%p size=%lu\n", vaddr, size));
-
-	r = pfm_do_munmap(task->mm, (unsigned long)vaddr, size, 0);
+	r = vm_munmap((unsigned long)vaddr, size);
 
-	up_write(&task->mm->mmap_sem);
 	if (r !=0) {
-		printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task->pid, vaddr, size);
+		printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task_pid_nr(task), vaddr, size);
 	}
 
 	DPRINT(("do_unmap(%p, %lu)=%d\n", vaddr, size, r));
@@ -1459,7 +1504,7 @@ pfm_free_smpl_buffer(pfm_context_t *ctx)
 	return 0;
 
 invalid_free:
-	printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", current->pid);
+	printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", task_pid_nr(current));
 	return -EINVAL;
 }
 #endif
@@ -1479,7 +1524,7 @@ pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt)
  * any operations on the root directory. However, we need a non-trivial
  * d_name - pfm: will go nicely and kill the special-casing in procfs.
  */
-static struct vfsmount *pfmfs_mnt;
+static struct vfsmount *pfmfs_mnt __read_mostly;
 
 static int __init
 init_pfm_fs(void)
@@ -1496,13 +1541,6 @@ init_pfm_fs(void)
 	return err;
 }
 
-static void __exit
-exit_pfm_fs(void)
-{
-	unregister_filesystem(&pfm_fs_type);
-	mntput(pfmfs_mnt);
-}
-
 static ssize_t
 pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos)
 {
@@ -1512,13 +1550,13 @@ pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos)
 	unsigned long flags;
   	DECLARE_WAITQUEUE(wait, current);
 	if (PFM_IS_FILE(filp) == 0) {
-		printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid);
+		printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", task_pid_nr(current));
 		return -EINVAL;
 	}
 
-	ctx = (pfm_context_t *)filp->private_data;
+	ctx = filp->private_data;
 	if (ctx == NULL) {
-		printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", current->pid);
+		printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", task_pid_nr(current));
 		return -EINVAL;
 	}
 
@@ -1572,7 +1610,7 @@ pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos)
 
 		PROTECT_CTX(ctx, flags);
 	}
-	DPRINT(("[%d] back to running ret=%ld\n", current->pid, ret));
+	DPRINT(("[%d] back to running ret=%ld\n", task_pid_nr(current), ret));
   	set_current_state(TASK_RUNNING);
 	remove_wait_queue(&ctx->ctx_msgq_wait, &wait);
 
@@ -1581,7 +1619,7 @@ pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos)
 	ret = -EINVAL;
 	msg = pfm_get_next_msg(ctx);
 	if (msg == NULL) {
-		printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, current->pid);
+		printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, task_pid_nr(current));
 		goto abort_locked;
 	}
 
@@ -1612,13 +1650,13 @@ pfm_poll(struct file *filp, poll_table * wait)
 	unsigned int mask = 0;
 
 	if (PFM_IS_FILE(filp) == 0) {
-		printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid);
+		printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", task_pid_nr(current));
 		return 0;
 	}
 
-	ctx = (pfm_context_t *)filp->private_data;
+	ctx = filp->private_data;
 	if (ctx == NULL) {
-		printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", current->pid);
+		printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", task_pid_nr(current));
 		return 0;
 	}
 
@@ -1639,8 +1677,8 @@ pfm_poll(struct file *filp, poll_table * wait)
 	return mask;
 }
 
-static int
-pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
+static long
+pfm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	DPRINT(("pfm_ioctl called\n"));
 	return -EINVAL;
@@ -1657,7 +1695,7 @@ pfm_do_fasync(int fd, struct file *filp, pfm_context_t *ctx, int on)
 	ret = fasync_helper (fd, filp, on, &ctx->ctx_async_queue);
 
 	DPRINT(("pfm_fasync called by [%d] on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
-		current->pid,
+		task_pid_nr(current),
 		fd,
 		on,
 		ctx->ctx_async_queue, ret));
@@ -1672,13 +1710,13 @@ pfm_fasync(int fd, struct file *filp, int on)
 	int ret;
 
 	if (PFM_IS_FILE(filp) == 0) {
-		printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", current->pid);
+		printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", task_pid_nr(current));
 		return -EBADF;
 	}
 
-	ctx = (pfm_context_t *)filp->private_data;
+	ctx = filp->private_data;
 	if (ctx == NULL) {
-		printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid);
+		printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", task_pid_nr(current));
 		return -EBADF;
 	}
 	/*
@@ -1709,7 +1747,7 @@ static void
 pfm_syswide_force_stop(void *info)
 {
 	pfm_context_t   *ctx = (pfm_context_t *)info;
-	struct pt_regs *regs = ia64_task_regs(current);
+	struct pt_regs *regs = task_pt_regs(current);
 	struct task_struct *owner;
 	unsigned long flags;
 	int ret;
@@ -1724,7 +1762,7 @@ pfm_syswide_force_stop(void *info)
 	if (owner != ctx->ctx_task) {
 		printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected owner [%d] instead of [%d]\n",
 			smp_processor_id(),
-			owner->pid, ctx->ctx_task->pid);
+			task_pid_nr(owner), task_pid_nr(ctx->ctx_task));
 		return;
 	}
 	if (GET_PMU_CTX() != ctx) {
@@ -1734,7 +1772,7 @@ pfm_syswide_force_stop(void *info)
 		return;
 	}
 
-	DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), ctx->ctx_task->pid));	
+	DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), task_pid_nr(ctx->ctx_task)));
 	/*
 	 * the context is already protected in pfm_close(), we simply
 	 * need to mask interrupts to avoid a PMU interrupt race on
@@ -1759,7 +1797,7 @@ pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx)
 	int ret;
 
 	DPRINT(("calling CPU%d for cleanup\n", ctx->ctx_cpu));
-	ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 0, 1);
+	ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 1);
 	DPRINT(("called CPU%d for cleanup ret=%d\n", ctx->ctx_cpu, ret));
 }
 #endif /* CONFIG_SMP */
@@ -1769,7 +1807,7 @@ pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx)
  * When caller is self-monitoring, the context is unloaded.
  */
 static int
-pfm_flush(struct file *filp)
+pfm_flush(struct file *filp, fl_owner_t id)
 {
 	pfm_context_t *ctx;
 	struct task_struct *task;
@@ -1784,16 +1822,16 @@ pfm_flush(struct file *filp)
 		return -EBADF;
 	}
 
-	ctx = (pfm_context_t *)filp->private_data;
+	ctx = filp->private_data;
 	if (ctx == NULL) {
-		printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", current->pid);
+		printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", task_pid_nr(current));
 		return -EBADF;
 	}
 
 	/*
 	 * remove our file from the async queue, if we use this mode.
 	 * This can be done without the context being protected. We come
-	 * here when the context has become unreacheable by other tasks.
+	 * here when the context has become unreachable by other tasks.
 	 *
 	 * We may still have active monitoring at this point and we may
 	 * end up in pfm_overflow_handler(). However, fasync_helper()
@@ -1803,18 +1841,13 @@ pfm_flush(struct file *filp)
 	 * invoked after, it will find an empty queue and no
 	 * signal will be sent. In both case, we are safe
 	 */
-	if (filp->f_flags & FASYNC) {
-		DPRINT(("cleaning up async_queue=%p\n", ctx->ctx_async_queue));
-		pfm_do_fasync (-1, filp, ctx, 0);
-	}
-
 	PROTECT_CTX(ctx, flags);
 
 	state     = ctx->ctx_state;
 	is_system = ctx->ctx_fl_system;
 
 	task = PFM_CTX_TASK(ctx);
-	regs = ia64_task_regs(task);
+	regs = task_pt_regs(task);
 
 	DPRINT(("ctx_state=%d is_current=%d\n",
 		state,
@@ -1893,7 +1926,7 @@ pfm_flush(struct file *filp)
 	 * because some VM function reenables interrupts.
 	 *
 	 */
-	if (smpl_buf_vaddr) pfm_remove_smpl_mapping(current, smpl_buf_vaddr, smpl_buf_size);
+	if (smpl_buf_vaddr) pfm_remove_smpl_mapping(smpl_buf_vaddr, smpl_buf_size);
 
 	return 0;
 }
@@ -1932,9 +1965,9 @@ pfm_close(struct inode *inode, struct file *filp)
 		return -EBADF;
 	}
 	
-	ctx = (pfm_context_t *)filp->private_data;
+	ctx = filp->private_data;
 	if (ctx == NULL) {
-		printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", current->pid);
+		printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", task_pid_nr(current));
 		return -EBADF;
 	}
 
@@ -1944,7 +1977,7 @@ pfm_close(struct inode *inode, struct file *filp)
 	is_system = ctx->ctx_fl_system;
 
 	task = PFM_CTX_TASK(ctx);
-	regs = ia64_task_regs(task);
+	regs = task_pt_regs(task);
 
 	DPRINT(("ctx_state=%d is_current=%d\n", 
 		state,
@@ -1988,7 +2021,7 @@ pfm_close(struct inode *inode, struct file *filp)
 		/*
 		 * force task to wake up from MASKED state
 		 */
-		up(&ctx->ctx_restart_sem);
+		complete(&ctx->ctx_restart_done);
 
 		DPRINT(("waking up ctx_state=%d\n", state));
 
@@ -2031,7 +2064,7 @@ pfm_close(struct inode *inode, struct file *filp)
 	 	 */
 		ctx->ctx_state = PFM_CTX_ZOMBIE;
 
-		DPRINT(("zombie ctx for [%d]\n", task->pid));
+		DPRINT(("zombie ctx for [%d]\n", task_pid_nr(task)));
 		/*
 		 * cannot free the context on the spot. deferred until
 		 * the task notices the ZOMBIE state
@@ -2090,7 +2123,7 @@ doit:
 	filp->private_data = NULL;
 
 	/*
-	 * if we free on the spot, the context is now completely unreacheable
+	 * if we free on the spot, the context is now completely unreachable
 	 * from the callers side. The monitored task side is also cut, so we
 	 * can freely cut.
 	 *
@@ -2121,116 +2154,73 @@ pfm_no_open(struct inode *irrelevant, struct file *dontcare)
 
 
 
-static struct file_operations pfm_file_ops = {
-	.llseek   = no_llseek,
-	.read     = pfm_read,
-	.write    = pfm_write,
-	.poll     = pfm_poll,
-	.ioctl    = pfm_ioctl,
-	.open     = pfm_no_open,	/* special open code to disallow open via /proc */
-	.fasync   = pfm_fasync,
-	.release  = pfm_close,
-	.flush	  = pfm_flush
+static const struct file_operations pfm_file_ops = {
+	.llseek		= no_llseek,
+	.read		= pfm_read,
+	.write		= pfm_write,
+	.poll		= pfm_poll,
+	.unlocked_ioctl = pfm_ioctl,
+	.open		= pfm_no_open,	/* special open code to disallow open via /proc */
+	.fasync		= pfm_fasync,
+	.release	= pfm_close,
+	.flush		= pfm_flush
 };
 
-static int
-pfmfs_delete_dentry(struct dentry *dentry)
+static char *pfmfs_dname(struct dentry *dentry, char *buffer, int buflen)
 {
-	return 1;
+	return dynamic_dname(dentry, buffer, buflen, "pfm:[%lu]",
+			     dentry->d_inode->i_ino);
 }
 
-static struct dentry_operations pfmfs_dentry_operations = {
-	.d_delete = pfmfs_delete_dentry,
+static const struct dentry_operations pfmfs_dentry_operations = {
+	.d_delete = always_delete_dentry,
+	.d_dname = pfmfs_dname,
 };
 
 
-static int
-pfm_alloc_fd(struct file **cfile)
+static struct file *
+pfm_alloc_file(pfm_context_t *ctx)
 {
-	int fd, ret = 0;
-	struct file *file = NULL;
-	struct inode * inode;
-	char name[32];
-	struct qstr this;
-
-	fd = get_unused_fd();
-	if (fd < 0) return -ENFILE;
-
-	ret = -ENFILE;
-
-	file = get_empty_filp();
-	if (!file) goto out;
+	struct file *file;
+	struct inode *inode;
+	struct path path;
+	struct qstr this = { .name = "" };
 
 	/*
 	 * allocate a new inode
 	 */
 	inode = new_inode(pfmfs_mnt->mnt_sb);
-	if (!inode) goto out;
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
 
 	DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode));
 
 	inode->i_mode = S_IFCHR|S_IRUGO;
-	inode->i_uid  = current->fsuid;
-	inode->i_gid  = current->fsgid;
-
-	sprintf(name, "[%lu]", inode->i_ino);
-	this.name = name;
-	this.len  = strlen(name);
-	this.hash = inode->i_ino;
-
-	ret = -ENOMEM;
+	inode->i_uid  = current_fsuid();
+	inode->i_gid  = current_fsgid();
 
 	/*
 	 * allocate a new dcache entry
 	 */
-	file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this);
-	if (!file->f_dentry) goto out;
+	path.dentry = d_alloc(pfmfs_mnt->mnt_root, &this);
+	if (!path.dentry) {
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+	path.mnt = mntget(pfmfs_mnt);
 
-	file->f_dentry->d_op = &pfmfs_dentry_operations;
+	d_add(path.dentry, inode);
 
-	d_add(file->f_dentry, inode);
-	file->f_vfsmnt = mntget(pfmfs_mnt);
-	file->f_mapping = inode->i_mapping;
+	file = alloc_file(&path, FMODE_READ, &pfm_file_ops);
+	if (IS_ERR(file)) {
+		path_put(&path);
+		return file;
+	}
 
-	file->f_op    = &pfm_file_ops;
-	file->f_mode  = FMODE_READ;
 	file->f_flags = O_RDONLY;
-	file->f_pos   = 0;
-
-	/*
-	 * may have to delay until context is attached?
-	 */
-	fd_install(fd, file);
+	file->private_data = ctx;
 
-	/*
-	 * the file structure we will use
-	 */
-	*cfile = file;
-
-	return fd;
-out:
-	if (file) put_filp(file);
-	put_unused_fd(fd);
-	return ret;
-}
-
-static void
-pfm_free_fd(int fd, struct file *file)
-{
-	struct files_struct *files = current->files;
-	struct fdtable *fdt;
-
-	/* 
-	 * there ie no fd_uninstall(), so we do it here
-	 */
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	rcu_assign_pointer(fdt->fd[fd], NULL);
-	spin_unlock(&files->file_lock);
-
-	if (file)
-		put_filp(file);
-	put_unused_fd(fd);
+	return file;
 }
 
 static int
@@ -2256,7 +2246,7 @@ pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long ad
  * allocate a sampling buffer and remaps it into the user address space of the task
  */
 static int
-pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned long rsize, void **user_vaddr)
+pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t *ctx, unsigned long rsize, void **user_vaddr)
 {
 	struct mm_struct *mm = task->mm;
 	struct vm_area_struct *vma = NULL;
@@ -2279,7 +2269,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
 	 * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur)
 	 * 	return -ENOMEM;
 	 */
-	if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur)
+	if (size > task_rlimit(task, RLIMIT_MEMLOCK))
 		return -ENOMEM;
 
 	/*
@@ -2296,18 +2286,19 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
 	DPRINT(("smpl_buf @%p\n", smpl_buf));
 
 	/* allocate vma */
-	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 	if (!vma) {
 		DPRINT(("Cannot allocate vma\n"));
 		goto error_kmem;
 	}
-	memset(vma, 0, sizeof(*vma));
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
 
 	/*
 	 * partially initialize the vma for the sampling buffer
 	 */
 	vma->vm_mm	     = mm;
-	vma->vm_flags	     = VM_READ| VM_MAYREAD |VM_RESERVED;
+	vma->vm_file	     = get_file(filp);
+	vma->vm_flags	     = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP;
 	vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */
 
 	/*
@@ -2327,8 +2318,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
 	down_write(&task->mm->mmap_sem);
 
 	/* find some free area in address space, must have mmap sem held */
-	vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0);
-	if (vma->vm_start == 0UL) {
+	vma->vm_start = get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS);
+	if (IS_ERR_VALUE(vma->vm_start)) {
 		DPRINT(("Cannot find unmapped area for size %ld\n", size));
 		up_write(&task->mm->mmap_sem);
 		goto error;
@@ -2351,7 +2342,6 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
 	 */
 	insert_vm_struct(mm, vma);
 
-	mm->total_vm  += size >> PAGE_SHIFT;
 	vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
 							vma_pages(vma));
 	up_write(&task->mm->mmap_sem);
@@ -2378,22 +2368,33 @@ error_kmem:
 static int
 pfm_bad_permissions(struct task_struct *task)
 {
+	const struct cred *tcred;
+	kuid_t uid = current_uid();
+	kgid_t gid = current_gid();
+	int ret;
+
+	rcu_read_lock();
+	tcred = __task_cred(task);
+
 	/* inspired by ptrace_attach() */
 	DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n",
-		current->uid,
-		current->gid,
-		task->euid,
-		task->suid,
-		task->uid,
-		task->egid,
-		task->sgid));
-
-	return ((current->uid != task->euid)
-	    || (current->uid != task->suid)
-	    || (current->uid != task->uid)
-	    || (current->gid != task->egid)
-	    || (current->gid != task->sgid)
-	    || (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE);
+		from_kuid(&init_user_ns, uid),
+		from_kgid(&init_user_ns, gid),
+		from_kuid(&init_user_ns, tcred->euid),
+		from_kuid(&init_user_ns, tcred->suid),
+		from_kuid(&init_user_ns, tcred->uid),
+		from_kgid(&init_user_ns, tcred->egid),
+		from_kgid(&init_user_ns, tcred->sgid)));
+
+	ret = ((!uid_eq(uid, tcred->euid))
+	       || (!uid_eq(uid, tcred->suid))
+	       || (!uid_eq(uid, tcred->uid))
+	       || (!gid_eq(gid, tcred->egid))
+	       || (!gid_eq(gid, tcred->sgid))
+	       || (!gid_eq(gid, tcred->gid))) && !capable(CAP_SYS_PTRACE);
+
+	rcu_read_unlock();
+	return ret;
 }
 
 static int
@@ -2422,7 +2423,7 @@ pfarg_is_sane(struct task_struct *task, pfarg_context_t *pfx)
 }
 
 static int
-pfm_setup_buffer_fmt(struct task_struct *task, pfm_context_t *ctx, unsigned int ctx_flags,
+pfm_setup_buffer_fmt(struct task_struct *task, struct file *filp, pfm_context_t *ctx, unsigned int ctx_flags,
 		     unsigned int cpu, pfarg_context_t *arg)
 {
 	pfm_buffer_fmt_t *fmt = NULL;
@@ -2435,7 +2436,7 @@ pfm_setup_buffer_fmt(struct task_struct *task, pfm_context_t *ctx, unsigned int
 	/* invoke and lock buffer format, if found */
 	fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id);
 	if (fmt == NULL) {
-		DPRINT(("[%d] cannot find buffer format\n", task->pid));
+		DPRINT(("[%d] cannot find buffer format\n", task_pid_nr(task)));
 		return -EINVAL;
 	}
 
@@ -2446,12 +2447,13 @@ pfm_setup_buffer_fmt(struct task_struct *task, pfm_context_t *ctx, unsigned int
 
 	ret = pfm_buf_fmt_validate(fmt, task, ctx_flags, cpu, fmt_arg);
 
-	DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task->pid, ctx_flags, cpu, fmt_arg, ret));
+	DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task_pid_nr(task), ctx_flags, cpu, fmt_arg, ret));
 
 	if (ret) goto error;
 
 	/* link buffer format and context */
 	ctx->ctx_buf_fmt = fmt;
+	ctx->ctx_fl_is_sampling = 1; /* assume record() is defined */
 
 	/*
 	 * check if buffer format wants to use perfmon buffer allocation/mapping service
@@ -2463,7 +2465,7 @@ pfm_setup_buffer_fmt(struct task_struct *task, pfm_context_t *ctx, unsigned int
 		/*
 		 * buffer is always remapped into the caller's address space
 		 */
-		ret = pfm_smpl_buffer_alloc(current, ctx, size, &uaddr);
+		ret = pfm_smpl_buffer_alloc(current, filp, ctx, size, &uaddr);
 		if (ret) goto error;
 
 		/* keep track of user address of buffer */
@@ -2518,7 +2520,7 @@ pfm_reset_pmu_state(pfm_context_t *ctx)
 	ctx->ctx_all_pmcs[0] = pmu_conf->impl_pmcs[0] & ~0x1;
 
 	/*
-	 * bitmask of all PMDs that are accesible to this context
+	 * bitmask of all PMDs that are accessible to this context
 	 */
 	ctx->ctx_all_pmds[0] = pmu_conf->impl_pmds[0];
 
@@ -2568,23 +2570,23 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task)
 	 * no kernel task or task not owner by caller
 	 */
 	if (task->mm == NULL) {
-		DPRINT(("task [%d] has not memory context (kernel thread)\n", task->pid));
+		DPRINT(("task [%d] has not memory context (kernel thread)\n", task_pid_nr(task)));
 		return -EPERM;
 	}
 	if (pfm_bad_permissions(task)) {
-		DPRINT(("no permission to attach to  [%d]\n", task->pid));
+		DPRINT(("no permission to attach to  [%d]\n", task_pid_nr(task)));
 		return -EPERM;
 	}
 	/*
 	 * cannot block in self-monitoring mode
 	 */
 	if (CTX_OVFL_NOBLOCK(ctx) == 0 && task == current) {
-		DPRINT(("cannot load a blocking context on self for [%d]\n", task->pid));
+		DPRINT(("cannot load a blocking context on self for [%d]\n", task_pid_nr(task)));
 		return -EINVAL;
 	}
 
 	if (task->exit_state == EXIT_ZOMBIE) {
-		DPRINT(("cannot attach to  zombie task [%d]\n", task->pid));
+		DPRINT(("cannot attach to  zombie task [%d]\n", task_pid_nr(task)));
 		return -EBUSY;
 	}
 
@@ -2593,14 +2595,14 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task)
 	 */
 	if (task == current) return 0;
 
-	if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
-		DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state));
+	if (!task_is_stopped_or_traced(task)) {
+		DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task_pid_nr(task), task->state));
 		return -EBUSY;
 	}
 	/*
 	 * make sure the task is off any CPU
 	 */
-	wait_task_inactive(task);
+	wait_task_inactive(task, 0);
 
 	/* more to come... */
 
@@ -2616,11 +2618,11 @@ pfm_get_task(pfm_context_t *ctx, pid_t pid, struct task_struct **task)
 	/* XXX: need to add more checks here */
 	if (pid < 2) return -EPERM;
 
-	if (pid != current->pid) {
+	if (pid != task_pid_vnr(current)) {
 
 		read_lock(&tasklist_lock);
 
-		p = find_task_by_pid(pid);
+		p = find_task_by_vpid(pid);
 
 		/* make sure task cannot go away while we operate on it */
 		if (p) get_task_struct(p);
@@ -2646,79 +2648,46 @@ pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg
 {
 	pfarg_context_t *req = (pfarg_context_t *)arg;
 	struct file *filp;
+	struct path path;
 	int ctx_flags;
+	int fd;
 	int ret;
 
 	/* let's check the arguments first */
 	ret = pfarg_is_sane(current, req);
-	if (ret < 0) return ret;
+	if (ret < 0)
+		return ret;
 
 	ctx_flags = req->ctx_flags;
 
 	ret = -ENOMEM;
 
-	ctx = pfm_context_alloc();
-	if (!ctx) goto error;
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
 
-	ret = pfm_alloc_fd(&filp);
-	if (ret < 0) goto error_file;
+	ctx = pfm_context_alloc(ctx_flags);
+	if (!ctx)
+		goto error;
 
-	req->ctx_fd = ctx->ctx_fd = ret;
+	filp = pfm_alloc_file(ctx);
+	if (IS_ERR(filp)) {
+		ret = PTR_ERR(filp);
+		goto error_file;
+	}
 
-	/*
-	 * attach context to file
-	 */
-	filp->private_data = ctx;
+	req->ctx_fd = ctx->ctx_fd = fd;
 
 	/*
 	 * does the user want to sample?
 	 */
 	if (pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) {
-		ret = pfm_setup_buffer_fmt(current, ctx, ctx_flags, 0, req);
-		if (ret) goto buffer_error;
+		ret = pfm_setup_buffer_fmt(current, filp, ctx, ctx_flags, 0, req);
+		if (ret)
+			goto buffer_error;
 	}
 
-	/*
-	 * init context protection lock
-	 */
-	spin_lock_init(&ctx->ctx_lock);
-
-	/*
-	 * context is unloaded
-	 */
-	ctx->ctx_state = PFM_CTX_UNLOADED;
-
-	/*
-	 * initialization of context's flags
-	 */
-	ctx->ctx_fl_block       = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0;
-	ctx->ctx_fl_system      = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0;
-	ctx->ctx_fl_is_sampling = ctx->ctx_buf_fmt ? 1 : 0; /* assume record() is defined */
-	ctx->ctx_fl_no_msg      = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0;
-	/*
-	 * will move to set properties
-	 * ctx->ctx_fl_excl_idle   = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0;
-	 */
-
-	/*
-	 * init restart semaphore to locked
-	 */
-	sema_init(&ctx->ctx_restart_sem, 0);
-
-	/*
-	 * activation is used in SMP only
-	 */
-	ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
-	SET_LAST_CPU(ctx, -1);
-
-	/*
-	 * initialize notification message queue
-	 */
-	ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0;
-	init_waitqueue_head(&ctx->ctx_msgq_wait);
-	init_waitqueue_head(&ctx->ctx_zombieq);
-
-	DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n",
+	DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d\n",
 		ctx,
 		ctx_flags,
 		ctx->ctx_fl_system,
@@ -2732,10 +2701,14 @@ pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg
 	 */
 	pfm_reset_pmu_state(ctx);
 
+	fd_install(fd, filp);
+
 	return 0;
 
 buffer_error:
-	pfm_free_fd(ctx->ctx_fd, filp);
+	path = filp->f_path;
+	put_filp(filp);
+	path_put(&path);
 
 	if (ctx->ctx_buf_fmt) {
 		pfm_buf_fmt_exit(ctx->ctx_buf_fmt, current, NULL, regs);
@@ -2744,6 +2717,7 @@ error_file:
 	pfm_context_free(ctx);
 
 error:
+	put_unused_fd(fd);
 	return ret;
 }
 
@@ -2856,7 +2830,6 @@ pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset)
 static int
 pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 {
-	struct thread_struct *thread = NULL;
 	struct task_struct *task;
 	pfarg_reg_t *req = (pfarg_reg_t *)arg;
 	unsigned long value, pmc_pm;
@@ -2877,7 +2850,6 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 	if (state == PFM_CTX_ZOMBIE) return -EINVAL;
 
 	if (is_loaded) {
-		thread = &task->thread;
 		/*
 		 * In system wide and when the context is loaded, access can only happen
 		 * when the caller is running on the CPU being monitored by the session.
@@ -3032,7 +3004,7 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 		 *
 		 * The value in ctx_pmcs[] can only be changed in pfm_write_pmcs().
 		 *
-		 * The value in thread->pmcs[] may be modified on overflow, i.e.,  when
+		 * The value in th_pmcs[] may be modified on overflow, i.e.,  when
 		 * monitoring needs to be stopped.
 		 */
 		if (is_monitor) CTX_USED_MONITOR(ctx, 1UL << cnum);
@@ -3046,7 +3018,7 @@ pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 			/*
 			 * write thread state
 			 */
-			if (is_system == 0) thread->pmcs[cnum] = value;
+			if (is_system == 0) ctx->th_pmcs[cnum] = value;
 
 			/*
 			 * write hardware register if we can
@@ -3098,7 +3070,6 @@ error:
 static int
 pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 {
-	struct thread_struct *thread = NULL;
 	struct task_struct *task;
 	pfarg_reg_t *req = (pfarg_reg_t *)arg;
 	unsigned long value, hw_value, ovfl_mask;
@@ -3122,7 +3093,6 @@ pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 	 * the owner of the local PMU.
 	 */
 	if (likely(is_loaded)) {
-		thread = &task->thread;
 		/*
 		 * In system wide and when the context is loaded, access can only happen
 		 * when the caller is running on the CPU being monitored by the session.
@@ -3230,7 +3200,7 @@ pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 			/*
 		 	 * write thread state
 		 	 */
-			if (is_system == 0) thread->pmds[cnum] = hw_value;
+			if (is_system == 0) ctx->th_pmds[cnum] = hw_value;
 
 			/*
 			 * write hardware register if we can
@@ -3296,7 +3266,6 @@ abort_mission:
 static int
 pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 {
-	struct thread_struct *thread = NULL;
 	struct task_struct *task;
 	unsigned long val = 0UL, lval, ovfl_mask, sval;
 	pfarg_reg_t *req = (pfarg_reg_t *)arg;
@@ -3320,7 +3289,6 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 	if (state == PFM_CTX_ZOMBIE) return -EINVAL;
 
 	if (likely(is_loaded)) {
-		thread = &task->thread;
 		/*
 		 * In system wide and when the context is loaded, access can only happen
 		 * when the caller is running on the CPU being monitored by the session.
@@ -3357,7 +3325,7 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 		if (unlikely(!PMD_IS_IMPL(cnum))) goto error;
 		/*
 		 * we can only read the register that we use. That includes
-		 * the one we explicitely initialize AND the one we want included
+		 * the one we explicitly initialize AND the one we want included
 		 * in the sampling buffer (smpl_regs).
 		 *
 		 * Having this restriction allows optimization in the ctxsw routine
@@ -3382,7 +3350,7 @@ pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 			 * if context is zombie, then task does not exist anymore.
 			 * In this case, we use the full value saved in the context (pfm_flush_regs()).
 			 */
-			val = is_loaded ? thread->pmds[cnum] : 0UL;
+			val = is_loaded ? ctx->th_pmds[cnum] : 0UL;
 		}
 		rd_func = pmu_conf->pmd_desc[cnum].read_check;
 
@@ -3481,7 +3449,7 @@ pfm_use_debug_registers(struct task_struct *task)
 
 	if (pmu_conf->use_rr_dbregs == 0) return 0;
 
-	DPRINT(("called for [%d]\n", task->pid));
+	DPRINT(("called for [%d]\n", task_pid_nr(task)));
 
 	/*
 	 * do it only once
@@ -3512,7 +3480,7 @@ pfm_use_debug_registers(struct task_struct *task)
 	DPRINT(("ptrace_use_dbregs=%u  sys_use_dbregs=%u by [%d] ret = %d\n",
 		  pfm_sessions.pfs_ptrace_use_dbregs,
 		  pfm_sessions.pfs_sys_use_dbregs,
-		  task->pid, ret));
+		  task_pid_nr(task), ret));
 
 	UNLOCK_PFS(flags);
 
@@ -3524,7 +3492,7 @@ pfm_use_debug_registers(struct task_struct *task)
  * IA64_THREAD_DBG_VALID set. This indicates a task which was
  * able to use the debug registers for debugging purposes via
  * ptrace(). Therefore we know it was not using them for
- * perfmormance monitoring, so we only decrement the number
+ * performance monitoring, so we only decrement the number
  * of "ptraced" debug register users to keep the count up to date
  */
 int
@@ -3537,7 +3505,7 @@ pfm_release_debug_registers(struct task_struct *task)
 
 	LOCK_PFS(flags);
 	if (pfm_sessions.pfs_ptrace_use_dbregs == 0) {
-		printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task->pid);
+		printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task_pid_nr(task));
 		ret = -1;
 	}  else {
 		pfm_sessions.pfs_ptrace_use_dbregs--;
@@ -3589,7 +3557,7 @@ pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 
 	/* sanity check */
 	if (unlikely(task == NULL)) {
-		printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", current->pid);
+		printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", task_pid_nr(current));
 		return -EINVAL;
 	}
 
@@ -3598,7 +3566,7 @@ pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 		fmt = ctx->ctx_buf_fmt;
 
 		DPRINT(("restarting self %d ovfl=0x%lx\n",
-			task->pid,
+			task_pid_nr(task),
 			ctx->ctx_ovfl_regs[0]));
 
 		if (CTX_HAS_SMPL(ctx)) {
@@ -3622,11 +3590,11 @@ pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 				pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET);
 
 			if (rst_ctrl.bits.mask_monitoring == 0) {
-				DPRINT(("resuming monitoring for [%d]\n", task->pid));
+				DPRINT(("resuming monitoring for [%d]\n", task_pid_nr(task)));
 
 				if (state == PFM_CTX_MASKED) pfm_restore_monitoring(task);
 			} else {
-				DPRINT(("keeping monitoring stopped for [%d]\n", task->pid));
+				DPRINT(("keeping monitoring stopped for [%d]\n", task_pid_nr(task)));
 
 				// cannot use pfm_stop_monitoring(task, regs);
 			}
@@ -3677,22 +3645,22 @@ pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 	 * if non-blocking, then we ensure that the task will go into
 	 * pfm_handle_work() before returning to user mode.
 	 *
-	 * We cannot explicitely reset another task, it MUST always
+	 * We cannot explicitly reset another task, it MUST always
 	 * be done by the task itself. This works for system wide because
 	 * the tool that is controlling the session is logically doing 
 	 * "self-monitoring".
 	 */
 	if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) {
-		DPRINT(("unblocking [%d] \n", task->pid));
-		up(&ctx->ctx_restart_sem);
+		DPRINT(("unblocking [%d]\n", task_pid_nr(task)));
+		complete(&ctx->ctx_restart_done);
 	} else {
-		DPRINT(("[%d] armed exit trap\n", task->pid));
+		DPRINT(("[%d] armed exit trap\n", task_pid_nr(task)));
 
 		ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_RESET;
 
 		PFM_SET_WORK_PENDING(task, 1);
 
-		pfm_set_task_notify(task);
+		set_notify_resume(task);
 
 		/*
 		 * XXX: send reschedule if task runs on another CPU
@@ -3774,7 +3742,7 @@ pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_
 	 * don't bother if we are loaded and task is being debugged
 	 */
 	if (is_loaded && (thread->flags & IA64_THREAD_DBG_VALID) != 0) {
-		DPRINT(("debug registers already in use for [%d]\n", task->pid));
+		DPRINT(("debug registers already in use for [%d]\n", task_pid_nr(task)));
 		return -EBUSY;
 	}
 
@@ -3815,7 +3783,7 @@ pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_
 	 * is shared by all processes running on it
  	 */
 	if (first_time && can_access_pmu) {
-		DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid));
+		DPRINT(("[%d] clearing ibrs, dbrs\n", task_pid_nr(task)));
 		for (i=0; i < pmu_conf->num_ibrs; i++) {
 			ia64_set_ibr(i, 0UL);
 			ia64_dv_serialize_instruction();
@@ -4004,7 +3972,7 @@ pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 		return -EBUSY;
 	}
 	DPRINT(("task [%d] ctx_state=%d is_system=%d\n",
-		PFM_CTX_TASK(ctx)->pid,
+		task_pid_nr(PFM_CTX_TASK(ctx)),
 		state,
 		is_system));
 	/*
@@ -4051,7 +4019,7 @@ pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 	 	 */
 		ia64_psr(regs)->up = 0;
 	} else {
-		tregs = ia64_task_regs(task);
+		tregs = task_pt_regs(task);
 
 		/*
 	 	 * stop monitoring at the user level
@@ -4062,7 +4030,7 @@ pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 		 * monitoring disabled in kernel at next reschedule
 		 */
 		ctx->ctx_saved_psr_up = 0;
-		DPRINT(("task=[%d]\n", task->pid));
+		DPRINT(("task=[%d]\n", task_pid_nr(task)));
 	}
 	return 0;
 }
@@ -4133,7 +4101,7 @@ pfm_start(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 		ia64_psr(regs)->up = 1;
 
 	} else {
-		tregs = ia64_task_regs(ctx->ctx_task);
+		tregs = task_pt_regs(ctx->ctx_task);
 
 		/*
 		 * start monitoring at the kernel level the next
@@ -4187,10 +4155,10 @@ pfm_check_task_exist(pfm_context_t *ctx)
 	do_each_thread (g, t) {
 		if (t->thread.pfm_context == ctx) {
 			ret = 0;
-			break;
+			goto out;
 		}
 	} while_each_thread (g, t);
-
+out:
 	read_unlock(&tasklist_lock);
 
 	DPRINT(("pfm_check_task_exist: ret=%d ctx=%p\n", ret, ctx));
@@ -4267,11 +4235,12 @@ pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 
 		if (is_system) {
 			if (pfm_sessions.pfs_ptrace_use_dbregs) {
-				DPRINT(("cannot load [%d] dbregs in use\n", task->pid));
+				DPRINT(("cannot load [%d] dbregs in use\n",
+							task_pid_nr(task)));
 				ret = -EBUSY;
 			} else {
 				pfm_sessions.pfs_sys_use_dbregs++;
-				DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task->pid, pfm_sessions.pfs_sys_use_dbregs));
+				DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task_pid_nr(task), pfm_sessions.pfs_sys_use_dbregs));
 				set_dbregs = 1;
 			}
 		}
@@ -4351,8 +4320,8 @@ pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 	pfm_copy_pmds(task, ctx);
 	pfm_copy_pmcs(task, ctx);
 
-	pmcs_source = thread->pmcs;
-	pmds_source = thread->pmds;
+	pmcs_source = ctx->th_pmcs;
+	pmds_source = ctx->th_pmds;
 
 	/*
 	 * always the case for system-wide
@@ -4363,7 +4332,7 @@ pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 
 			/* allow user level control */
 			ia64_psr(regs)->sp = 0;
-			DPRINT(("clearing psr.sp for [%d]\n", task->pid));
+			DPRINT(("clearing psr.sp for [%d]\n", task_pid_nr(task)));
 
 			SET_LAST_CPU(ctx, smp_processor_id());
 			INC_ACTIVATION();
@@ -4398,12 +4367,12 @@ pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 		 */
 		SET_PMU_OWNER(task, ctx);
 
-		DPRINT(("context loaded on PMU for [%d]\n", task->pid));
+		DPRINT(("context loaded on PMU for [%d]\n", task_pid_nr(task)));
 	} else {
 		/*
 		 * when not current, task MUST be stopped, so this is safe
 		 */
-		regs = ia64_task_regs(task);
+		regs = task_pt_regs(task);
 
 		/* force a full reload */
 		ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
@@ -4462,7 +4431,7 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg
 	int prev_state, is_system;
 	int ret;
 
-	DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1));
+	DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task_pid_nr(task) : -1));
 
 	prev_state = ctx->ctx_state;
 	is_system  = ctx->ctx_fl_system;
@@ -4529,7 +4498,7 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg
 	/*
 	 * per-task mode
 	 */
-	tregs = task == current ? regs : ia64_task_regs(task);
+	tregs = task == current ? regs : task_pt_regs(task);
 
 	if (task == current) {
 		/*
@@ -4537,7 +4506,7 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg
 		 */
 		ia64_psr(regs)->sp = 1;
 
-		DPRINT(("setting psr.sp for [%d]\n", task->pid));
+		DPRINT(("setting psr.sp for [%d]\n", task_pid_nr(task)));
 	}
 	/*
 	 * save PMDs to context
@@ -4577,7 +4546,7 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg
 	ctx->ctx_fl_can_restart  = 0;
 	ctx->ctx_fl_going_zombie = 0;
 
-	DPRINT(("disconnected [%d] from context\n", task->pid));
+	DPRINT(("disconnected [%d] from context\n", task_pid_nr(task)));
 
 	return 0;
 }
@@ -4592,7 +4561,7 @@ pfm_exit_thread(struct task_struct *task)
 {
 	pfm_context_t *ctx;
 	unsigned long flags;
-	struct pt_regs *regs = ia64_task_regs(task);
+	struct pt_regs *regs = task_pt_regs(task);
 	int ret, state;
 	int free_ok = 0;
 
@@ -4600,22 +4569,22 @@ pfm_exit_thread(struct task_struct *task)
 
 	PROTECT_CTX(ctx, flags);
 
-	DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task->pid));
+	DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task_pid_nr(task)));
 
 	state = ctx->ctx_state;
 	switch(state) {
 		case PFM_CTX_UNLOADED:
 			/*
-	 		 * only comes to thios function if pfm_context is not NULL, i.e., cannot
+	 		 * only comes to this function if pfm_context is not NULL, i.e., cannot
 			 * be in unloaded state
 	 		 */
-			printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task->pid);
+			printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task_pid_nr(task));
 			break;
 		case PFM_CTX_LOADED:
 		case PFM_CTX_MASKED:
 			ret = pfm_context_unload(ctx, NULL, 0, regs);
 			if (ret) {
-				printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret);
+				printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task_pid_nr(task), state, ret);
 			}
 			DPRINT(("ctx unloaded for current state was %d\n", state));
 
@@ -4624,12 +4593,12 @@ pfm_exit_thread(struct task_struct *task)
 		case PFM_CTX_ZOMBIE:
 			ret = pfm_context_unload(ctx, NULL, 0, regs);
 			if (ret) {
-				printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret);
+				printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task_pid_nr(task), state, ret);
 			}
 			free_ok = 1;
 			break;
 		default:
-			printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task->pid, state);
+			printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task_pid_nr(task), state);
 			break;
 	}
 	UNPROTECT_CTX(ctx, flags);
@@ -4713,7 +4682,7 @@ recheck:
 	DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n",
 		ctx->ctx_fd,
 		state,
-		task->pid,
+		task_pid_nr(task),
 		task->state, PFM_CMD_STOPPED(cmd)));
 
 	/*
@@ -4759,8 +4728,8 @@ recheck:
 	 * the task must be stopped.
 	 */
 	if (PFM_CMD_STOPPED(cmd)) {
-		if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
-			DPRINT(("[%d] task not in stopped state\n", task->pid));
+		if (!task_is_stopped_or_traced(task)) {
+			DPRINT(("[%d] task not in stopped state\n", task_pid_nr(task)));
 			return -EBUSY;
 		}
 		/*
@@ -4781,7 +4750,7 @@ recheck:
 
 		UNPROTECT_CTX(ctx, flags);
 
-		wait_task_inactive(task);
+		wait_task_inactive(task, 0);
 
 		PROTECT_CTX(ctx, flags);
 
@@ -4802,7 +4771,7 @@ recheck:
 asmlinkage long
 sys_perfmonctl (int fd, int cmd, void __user *arg, int count)
 {
-	struct file *file = NULL;
+	struct fd f = {NULL, 0};
 	pfm_context_t *ctx = NULL;
 	unsigned long flags = 0UL;
 	void *args_k = NULL;
@@ -4853,7 +4822,7 @@ restart_args:
 	 * limit abuse to min page size
 	 */
 	if (unlikely(sz > PFM_MAX_ARGSIZE)) {
-		printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", current->pid, sz);
+		printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", task_pid_nr(current), sz);
 		return -E2BIG;
 	}
 
@@ -4899,17 +4868,17 @@ restart_args:
 
 	ret = -EBADF;
 
-	file = fget(fd);
-	if (unlikely(file == NULL)) {
+	f = fdget(fd);
+	if (unlikely(f.file == NULL)) {
 		DPRINT(("invalid fd %d\n", fd));
 		goto error_args;
 	}
-	if (unlikely(PFM_IS_FILE(file) == 0)) {
+	if (unlikely(PFM_IS_FILE(f.file) == 0)) {
 		DPRINT(("fd %d not related to perfmon\n", fd));
 		goto error_args;
 	}
 
-	ctx = (pfm_context_t *)file->private_data;
+	ctx = f.file->private_data;
 	if (unlikely(ctx == NULL)) {
 		DPRINT(("no context for fd %d\n", fd));
 		goto error_args;
@@ -4925,7 +4894,7 @@ restart_args:
 	if (unlikely(ret)) goto abort_locked;
 
 skip_fd:
-	ret = (*func)(ctx, args_k, count, ia64_task_regs(current));
+	ret = (*func)(ctx, args_k, count, task_pt_regs(current));
 
 	call_made = 1;
 
@@ -4933,14 +4902,16 @@ abort_locked:
 	if (likely(ctx)) {
 		DPRINT(("context unlocked\n"));
 		UNPROTECT_CTX(ctx, flags);
-		fput(file);
 	}
 
 	/* copy argument back to user, if needed */
 	if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT;
 
 error_args:
-	if (args_k) kfree(args_k);
+	if (f.file)
+		fdput(f);
+
+	kfree(args_k);
 
 	DPRINT(("cmd=%s ret=%ld\n", PFM_CMD_NAME(cmd), ret));
 
@@ -4998,11 +4969,11 @@ pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs)
 {
 	int ret;
 
-	DPRINT(("entering for [%d]\n", current->pid));
+	DPRINT(("entering for [%d]\n", task_pid_nr(current)));
 
 	ret = pfm_context_unload(ctx, NULL, 0, regs);
 	if (ret) {
-		printk(KERN_ERR "pfm_context_force_terminate: [%d] unloaded failed with %d\n", current->pid, ret);
+		printk(KERN_ERR "pfm_context_force_terminate: [%d] unloaded failed with %d\n", task_pid_nr(current), ret);
 	}
 
 	/*
@@ -5018,12 +4989,13 @@ pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs)
 }
 
 static int pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds);
+
  /*
   * pfm_handle_work() can be called with interrupts enabled
   * (TIF_NEED_RESCHED) or disabled. The down_interruptible
   * call may sleep, therefore we must re-enable interrupts
   * to avoid deadlocks. It is safe to do so because this function
-  * is called ONLY when returning to user level (PUStk=1), in which case
+  * is called ONLY when returning to user level (pUStk=1), in which case
   * there is no risk of kernel stack overflow due to deep
   * interrupt nesting.
   */
@@ -5039,7 +5011,8 @@ pfm_handle_work(void)
 
 	ctx = PFM_GET_CTX(current);
 	if (ctx == NULL) {
-		printk(KERN_ERR "perfmon: [%d] has no PFM context\n", current->pid);
+		printk(KERN_ERR "perfmon: [%d] has no PFM context\n",
+			task_pid_nr(current));
 		return;
 	}
 
@@ -5047,9 +5020,7 @@ pfm_handle_work(void)
 
 	PFM_SET_WORK_PENDING(current, 0);
 
-	pfm_clear_task_notify();
-
-	regs = ia64_task_regs(current);
+	regs = task_pt_regs(current);
 
 	/*
 	 * extract reason for being here and clear
@@ -5063,11 +5034,12 @@ pfm_handle_work(void)
 	/*
 	 * must be done before we check for simple-reset mode
 	 */
-	if (ctx->ctx_fl_going_zombie || ctx->ctx_state == PFM_CTX_ZOMBIE) goto do_zombie;
-
+	if (ctx->ctx_fl_going_zombie || ctx->ctx_state == PFM_CTX_ZOMBIE)
+		goto do_zombie;
 
 	//if (CTX_OVFL_NOBLOCK(ctx)) goto skip_blocking;
-	if (reason == PFM_TRAP_REASON_RESET) goto skip_blocking;
+	if (reason == PFM_TRAP_REASON_RESET)
+		goto skip_blocking;
 
 	/*
 	 * restore interrupt mask to what it was on entry.
@@ -5086,7 +5058,7 @@ pfm_handle_work(void)
 	 * may go through without blocking on SMP systems
 	 * if restart has been received already by the time we call down()
 	 */
-	ret = down_interruptible(&ctx->ctx_restart_sem);
+	ret = wait_for_completion_interruptible(&ctx->ctx_restart_done);
 
 	DPRINT(("after block sleeping ret=%d\n", ret));
 
@@ -5115,7 +5087,8 @@ do_zombie:
 	/*
 	 * in case of interruption of down() we don't restart anything
 	 */
-	if (ret < 0) goto nothing_to_do;
+	if (ret < 0)
+		goto nothing_to_do;
 
 skip_blocking:
 	pfm_resume_after_ovfl(ctx, ovfl_regs, regs);
@@ -5207,10 +5180,10 @@ pfm_end_notify_user(pfm_context_t *ctx)
 
 /*
  * main overflow processing routine.
- * it can be called from the interrupt path or explicitely during the context switch code
+ * it can be called from the interrupt path or explicitly during the context switch code
  */
-static void
-pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs)
+static void pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx,
+				unsigned long pmc0, struct pt_regs *regs)
 {
 	pfm_ovfl_arg_t *ovfl_arg;
 	unsigned long mask;
@@ -5236,7 +5209,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str
 	DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s "
 		     "used_pmds=0x%lx\n",
 			pmc0,
-			task ? task->pid: -1,
+			task ? task_pid_nr(task): -1,
 			(regs ? regs->cr_iip : 0),
 			CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking",
 			ctx->ctx_used_pmds[0]));
@@ -5415,7 +5388,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str
 			 * when coming from ctxsw, current still points to the
 			 * previous task, therefore we must work with task and not current.
 			 */
-			pfm_set_task_notify(task);
+			set_notify_resume(task);
 		}
 		/*
 		 * defer until state is changed (shorten spin window). the context is locked
@@ -5425,7 +5398,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str
 	}
 
 	DPRINT_ovfl(("owner [%d] pending=%ld reason=%u ovfl_pmds=0x%lx ovfl_notify=0x%lx masked=%d\n",
-			GET_PMU_OWNER() ? GET_PMU_OWNER()->pid : -1,
+			GET_PMU_OWNER() ? task_pid_nr(GET_PMU_OWNER()) : -1,
 			PFM_GET_WORK_PENDING(task),
 			ctx->ctx_fl_trap_reason,
 			ovfl_pmds,
@@ -5450,7 +5423,7 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str
 sanity_check:
 	printk(KERN_ERR "perfmon: CPU%d overflow handler [%d] pmc0=0x%lx\n",
 			smp_processor_id(),
-			task ? task->pid : -1,
+			task ? task_pid_nr(task) : -1,
 			pmc0);
 	return;
 
@@ -5483,7 +5456,7 @@ stop_monitoring:
 	 *
 	 * Overall pretty hairy stuff....
 	 */
-	DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task->pid: -1));
+	DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task_pid_nr(task): -1));
 	pfm_clear_psr_up();
 	ia64_psr(regs)->up = 0;
 	ia64_psr(regs)->sp = 1;
@@ -5491,7 +5464,7 @@ stop_monitoring:
 }
 
 static int
-pfm_do_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
+pfm_do_interrupt_handler(void *arg, struct pt_regs *regs)
 {
 	struct task_struct *task;
 	pfm_context_t *ctx;
@@ -5544,24 +5517,25 @@ pfm_do_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
 
 report_spurious1:
 	printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d has no PFM context\n",
-		this_cpu, task->pid);
+		this_cpu, task_pid_nr(task));
 	pfm_unfreeze_pmu();
 	return -1;
 report_spurious2:
 	printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d, invalid flag\n", 
 		this_cpu, 
-		task->pid);
+		task_pid_nr(task));
 	pfm_unfreeze_pmu();
 	return -1;
 }
 
 static irqreturn_t
-pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
+pfm_interrupt_handler(int irq, void *arg)
 {
 	unsigned long start_cycles, total_cycles;
 	unsigned long min, max;
 	int this_cpu;
 	int ret;
+	struct pt_regs *regs = get_irq_regs();
 
 	this_cpu = get_cpu();
 	if (likely(!pfm_alt_intr_handler)) {
@@ -5570,7 +5544,7 @@ pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
 
 		start_cycles = ia64_get_itc();
 
-		ret = pfm_do_interrupt_handler(irq, arg, regs);
+		ret = pfm_do_interrupt_handler(arg, regs);
 
 		total_cycles = ia64_get_itc();
 
@@ -5590,7 +5564,7 @@ pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
 		(*pfm_alt_intr_handler->handler)(irq, arg, regs);
 	}
 
-	put_cpu_no_resched();
+	put_cpu();
 	return IRQ_HANDLED;
 }
 
@@ -5598,7 +5572,7 @@ pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
  * /proc/perfmon interface, for debug only
  */
 
-#define PFM_PROC_SHOW_HEADER	((void *)NR_CPUS+1)
+#define PFM_PROC_SHOW_HEADER	((void *)(long)nr_cpu_ids+1)
 
 static void *
 pfm_proc_start(struct seq_file *m, loff_t *pos)
@@ -5607,7 +5581,7 @@ pfm_proc_start(struct seq_file *m, loff_t *pos)
 		return PFM_PROC_SHOW_HEADER;
 	}
 
-	while (*pos <= NR_CPUS) {
+	while (*pos <= nr_cpu_ids) {
 		if (cpu_online(*pos - 1)) {
 			return (void *)*pos;
 		}
@@ -5667,24 +5641,8 @@ pfm_proc_show_header(struct seq_file *m)
 
 	list_for_each(pos, &pfm_buffer_fmt_list) {
 		entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list);
-		seq_printf(m, "format                    : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n",
-			entry->fmt_uuid[0],
-			entry->fmt_uuid[1],
-			entry->fmt_uuid[2],
-			entry->fmt_uuid[3],
-			entry->fmt_uuid[4],
-			entry->fmt_uuid[5],
-			entry->fmt_uuid[6],
-			entry->fmt_uuid[7],
-			entry->fmt_uuid[8],
-			entry->fmt_uuid[9],
-			entry->fmt_uuid[10],
-			entry->fmt_uuid[11],
-			entry->fmt_uuid[12],
-			entry->fmt_uuid[13],
-			entry->fmt_uuid[14],
-			entry->fmt_uuid[15],
-			entry->fmt_name);
+		seq_printf(m, "format                    : %16phD %s\n",
+			   entry->fmt_uuid, entry->fmt_name);
 	}
 	spin_unlock(&pfm_buffer_fmt_lock);
 
@@ -5759,7 +5717,7 @@ pfm_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-struct seq_operations pfm_seq_ops = {
+const struct seq_operations pfm_seq_ops = {
 	.start =	pfm_proc_start,
  	.next =		pfm_proc_next,
  	.stop =		pfm_proc_stop,
@@ -5793,7 +5751,7 @@ pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_c
 	 * on every CPU, so we can rely on the pid to identify the idle task.
 	 */
 	if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 || task->pid) {
-		regs = ia64_task_regs(task);
+		regs = task_pt_regs(task);
 		ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0;
 		return;
 	}
@@ -5836,7 +5794,8 @@ pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs)
 	ia64_psr(regs)->sp = 1;
 
 	if (GET_PMU_OWNER() == task) {
-		DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid));
+		DPRINT(("cleared ownership for [%d]\n",
+					task_pid_nr(ctx->ctx_task)));
 		SET_PMU_OWNER(NULL, NULL);
 	}
 
@@ -5848,7 +5807,7 @@ pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs)
 	task->thread.pfm_context  = NULL;
 	task->thread.flags       &= ~IA64_THREAD_PM_VALID;
 
-	DPRINT(("force cleanup for [%d]\n",  task->pid));
+	DPRINT(("force cleanup for [%d]\n",  task_pid_nr(task)));
 }
 
 
@@ -5859,14 +5818,12 @@ void
 pfm_save_regs(struct task_struct *task)
 {
 	pfm_context_t *ctx;
-	struct thread_struct *t;
 	unsigned long flags;
 	u64 psr;
 
 
 	ctx = PFM_GET_CTX(task);
 	if (ctx == NULL) return;
-	t = &task->thread;
 
 	/*
  	 * we always come here with interrupts ALREADY disabled by
@@ -5876,7 +5833,7 @@ pfm_save_regs(struct task_struct *task)
 	flags = pfm_protect_ctx_ctxsw(ctx);
 
 	if (ctx->ctx_state == PFM_CTX_ZOMBIE) {
-		struct pt_regs *regs = ia64_task_regs(task);
+		struct pt_regs *regs = task_pt_regs(task);
 
 		pfm_clear_psr_up();
 
@@ -5924,19 +5881,19 @@ pfm_save_regs(struct task_struct *task)
 	 * guarantee we will be schedule at that same
 	 * CPU again.
 	 */
-	pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]);
+	pfm_save_pmds(ctx->th_pmds, ctx->ctx_used_pmds[0]);
 
 	/*
 	 * save pmc0 ia64_srlz_d() done in pfm_save_pmds()
 	 * we will need it on the restore path to check
 	 * for pending overflow.
 	 */
-	t->pmcs[0] = ia64_get_pmc(0);
+	ctx->th_pmcs[0] = ia64_get_pmc(0);
 
 	/*
 	 * unfreeze PMU if had pending overflows
 	 */
-	if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu();
+	if (ctx->th_pmcs[0] & ~0x1UL) pfm_unfreeze_pmu();
 
 	/*
 	 * finally, allow context access.
@@ -5981,7 +5938,6 @@ static void
 pfm_lazy_save_regs (struct task_struct *task)
 {
 	pfm_context_t *ctx;
-	struct thread_struct *t;
 	unsigned long flags;
 
 	{ u64 psr  = pfm_get_psr();
@@ -5989,7 +5945,6 @@ pfm_lazy_save_regs (struct task_struct *task)
 	}
 
 	ctx = PFM_GET_CTX(task);
-	t   = &task->thread;
 
 	/*
 	 * we need to mask PMU overflow here to
@@ -6014,19 +5969,19 @@ pfm_lazy_save_regs (struct task_struct *task)
 	/*
 	 * save all the pmds we use
 	 */
-	pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]);
+	pfm_save_pmds(ctx->th_pmds, ctx->ctx_used_pmds[0]);
 
 	/*
 	 * save pmc0 ia64_srlz_d() done in pfm_save_pmds()
 	 * it is needed to check for pended overflow
 	 * on the restore path
 	 */
-	t->pmcs[0] = ia64_get_pmc(0);
+	ctx->th_pmcs[0] = ia64_get_pmc(0);
 
 	/*
 	 * unfreeze PMU if had pending overflows
 	 */
-	if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu();
+	if (ctx->th_pmcs[0] & ~0x1UL) pfm_unfreeze_pmu();
 
 	/*
 	 * now get can unmask PMU interrupts, they will
@@ -6045,7 +6000,6 @@ void
 pfm_load_regs (struct task_struct *task)
 {
 	pfm_context_t *ctx;
-	struct thread_struct *t;
 	unsigned long pmc_mask = 0UL, pmd_mask = 0UL;
 	unsigned long flags;
 	u64 psr, psr_up;
@@ -6056,11 +6010,10 @@ pfm_load_regs (struct task_struct *task)
 
 	BUG_ON(GET_PMU_OWNER());
 
-	t     = &task->thread;
 	/*
 	 * possible on unload
 	 */
-	if (unlikely((t->flags & IA64_THREAD_PM_VALID) == 0)) return;
+	if (unlikely((task->thread.flags & IA64_THREAD_PM_VALID) == 0)) return;
 
 	/*
  	 * we always come here with interrupts ALREADY disabled by
@@ -6076,7 +6029,7 @@ pfm_load_regs (struct task_struct *task)
 	BUG_ON(psr & IA64_PSR_I);
 
 	if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) {
-		struct pt_regs *regs = ia64_task_regs(task);
+		struct pt_regs *regs = task_pt_regs(task);
 
 		BUG_ON(ctx->ctx_smpl_hdr);
 
@@ -6142,26 +6095,26 @@ pfm_load_regs (struct task_struct *task)
 	 *
 	 * XXX: optimize here
 	 */
-	if (pmd_mask) pfm_restore_pmds(t->pmds, pmd_mask);
-	if (pmc_mask) pfm_restore_pmcs(t->pmcs, pmc_mask);
+	if (pmd_mask) pfm_restore_pmds(ctx->th_pmds, pmd_mask);
+	if (pmc_mask) pfm_restore_pmcs(ctx->th_pmcs, pmc_mask);
 
 	/*
 	 * check for pending overflow at the time the state
 	 * was saved.
 	 */
-	if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) {
+	if (unlikely(PMC0_HAS_OVFL(ctx->th_pmcs[0]))) {
 		/*
 		 * reload pmc0 with the overflow information
 		 * On McKinley PMU, this will trigger a PMU interrupt
 		 */
-		ia64_set_pmc(0, t->pmcs[0]);
+		ia64_set_pmc(0, ctx->th_pmcs[0]);
 		ia64_srlz_d();
-		t->pmcs[0] = 0UL;
+		ctx->th_pmcs[0] = 0UL;
 
 		/*
 		 * will replay the PMU interrupt
 		 */
-		if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
+		if (need_irq_resend) ia64_resend_irq(IA64_PERFMON_VECTOR);
 
 		pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
 	}
@@ -6209,7 +6162,6 @@ pfm_load_regs (struct task_struct *task)
 void
 pfm_load_regs (struct task_struct *task)
 {
-	struct thread_struct *t;
 	pfm_context_t *ctx;
 	struct task_struct *owner;
 	unsigned long pmd_mask, pmc_mask;
@@ -6218,7 +6170,6 @@ pfm_load_regs (struct task_struct *task)
 
 	owner = GET_PMU_OWNER();
 	ctx   = PFM_GET_CTX(task);
-	t     = &task->thread;
 	psr   = pfm_get_psr();
 
 	BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP));
@@ -6281,27 +6232,27 @@ pfm_load_regs (struct task_struct *task)
 	 */
 	pmc_mask = ctx->ctx_all_pmcs[0];
 
-	pfm_restore_pmds(t->pmds, pmd_mask);
-	pfm_restore_pmcs(t->pmcs, pmc_mask);
+	pfm_restore_pmds(ctx->th_pmds, pmd_mask);
+	pfm_restore_pmcs(ctx->th_pmcs, pmc_mask);
 
 	/*
 	 * check for pending overflow at the time the state
 	 * was saved.
 	 */
-	if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) {
+	if (unlikely(PMC0_HAS_OVFL(ctx->th_pmcs[0]))) {
 		/*
 		 * reload pmc0 with the overflow information
 		 * On McKinley PMU, this will trigger a PMU interrupt
 		 */
-		ia64_set_pmc(0, t->pmcs[0]);
+		ia64_set_pmc(0, ctx->th_pmcs[0]);
 		ia64_srlz_d();
 
-		t->pmcs[0] = 0UL;
+		ctx->th_pmcs[0] = 0UL;
 
 		/*
 		 * will replay the PMU interrupt
 		 */
-		if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
+		if (need_irq_resend) ia64_resend_irq(IA64_PERFMON_VECTOR);
 
 		pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
 	}
@@ -6371,11 +6322,11 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx)
 		 */
 		pfm_unfreeze_pmu();
 	} else {
-		pmc0 = task->thread.pmcs[0];
+		pmc0 = ctx->th_pmcs[0];
 		/*
 		 * clear whatever overflow status bits there were
 		 */
-		task->thread.pmcs[0] = 0;
+		ctx->th_pmcs[0] = 0;
 	}
 	ovfl_val = pmu_conf->ovfl_val;
 	/*
@@ -6396,11 +6347,11 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx)
 		/*
 		 * can access PMU always true in system wide mode
 		 */
-		val = pmd_val = can_access_pmu ? ia64_get_pmd(i) : task->thread.pmds[i];
+		val = pmd_val = can_access_pmu ? ia64_get_pmd(i) : ctx->th_pmds[i];
 
 		if (PMD_IS_COUNTING(i)) {
 			DPRINT(("[%d] pmd[%d] ctx_pmd=0x%lx hw_pmd=0x%lx\n",
-				task->pid,
+				task_pid_nr(task),
 				i,
 				ctx->ctx_pmds[i].val,
 				val & ovfl_val));
@@ -6422,13 +6373,13 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx)
 			 */
 			if (pmc0 & (1UL << i)) {
 				val += 1 + ovfl_val;
-				DPRINT(("[%d] pmd[%d] overflowed\n", task->pid, i));
+				DPRINT(("[%d] pmd[%d] overflowed\n", task_pid_nr(task), i));
 			}
 		}
 
-		DPRINT(("[%d] ctx_pmd[%d]=0x%lx  pmd_val=0x%lx\n", task->pid, i, val, pmd_val));
+		DPRINT(("[%d] ctx_pmd[%d]=0x%lx  pmd_val=0x%lx\n", task_pid_nr(task), i, val, pmd_val));
 
-		if (is_self) task->thread.pmds[i] = pmd_val;
+		if (is_self) ctx->th_pmds[i] = pmd_val;
 
 		ctx->ctx_pmds[i].val = val;
 	}
@@ -6436,7 +6387,6 @@ pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx)
 
 static struct irqaction perfmon_irqaction = {
 	.handler = pfm_interrupt_handler,
-	.flags   = SA_INTERRUPT,
 	.name    = "perfmon"
 };
 
@@ -6445,7 +6395,7 @@ pfm_alt_save_pmu_state(void *data)
 {
 	struct pt_regs *regs;
 
-	regs = ia64_task_regs(current);
+	regs = task_pt_regs(current);
 
 	DPRINT(("called\n"));
 
@@ -6471,7 +6421,7 @@ pfm_alt_restore_pmu_state(void *data)
 {
 	struct pt_regs *regs;
 
-	regs = ia64_task_regs(current);
+	regs = task_pt_regs(current);
 
 	DPRINT(("called\n"));
 
@@ -6515,7 +6465,7 @@ pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl)
 	}
 
 	/* save the current system wide pmu states */
-	ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 0, 1);
+	ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 1);
 	if (ret) {
 		DPRINT(("on_each_cpu() failed: %d\n", ret));
 		goto cleanup_reserve;
@@ -6560,7 +6510,7 @@ pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl)
 
 	pfm_alt_intr_handler = NULL;
 
-	ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 0, 1);
+	ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 1);
 	if (ret) {
 		DPRINT(("on_each_cpu() failed: %d\n", ret));
 	}
@@ -6603,7 +6553,7 @@ found:
 	return 0;
 }
 
-static struct file_operations pfm_proc_fops = {
+static const struct file_operations pfm_proc_fops = {
 	.open		= pfm_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -6672,7 +6622,7 @@ pfm_init(void)
 	       ffz(pmu_conf->ovfl_val));
 
 	/* sanity check */
-	if (pmu_conf->num_pmds >= IA64_NUM_PMD_REGS || pmu_conf->num_pmcs >= IA64_NUM_PMC_REGS) {
+	if (pmu_conf->num_pmds >= PFM_NUM_PMD_REGS || pmu_conf->num_pmcs >= PFM_NUM_PMC_REGS) {
 		printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n");
 		pmu_conf = NULL;
 		return -1;
@@ -6681,21 +6631,17 @@ pfm_init(void)
 	/*
 	 * create /proc/perfmon (mostly for debugging purposes)
 	 */
- 	perfmon_dir = create_proc_entry("perfmon", S_IRUGO, NULL);
+	perfmon_dir = proc_create("perfmon", S_IRUGO, NULL, &pfm_proc_fops);
 	if (perfmon_dir == NULL) {
 		printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n");
 		pmu_conf = NULL;
 		return -1;
 	}
-  	/*
- 	 * install customized file operations for /proc/perfmon entry
- 	 */
- 	perfmon_dir->proc_fops = &pfm_proc_fops;
 
 	/*
 	 * create /proc/sys/kernel/perfmon (for debugging purposes)
 	 */
-	pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0);
+	pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root);
 
 	/*
 	 * initialize all our spinlocks
@@ -6718,6 +6664,7 @@ __initcall(pfm_init);
 void
 pfm_init_percpu (void)
 {
+	static int first_time=1;
 	/*
 	 * make sure no measurement is active
 	 * (may inherit programmed PMCs from EFI).
@@ -6730,8 +6677,10 @@ pfm_init_percpu (void)
 	 */
 	pfm_unfreeze_pmu();
 
-	if (smp_processor_id() == 0)
+	if (first_time) {
 		register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);
+		first_time=0;
+	}
 
 	ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR);
 	ia64_srlz_d();
@@ -6744,7 +6693,6 @@ void
 dump_pmu_state(const char *from)
 {
 	struct task_struct *task;
-	struct thread_struct *t;
 	struct pt_regs *regs;
 	pfm_context_t *ctx;
 	unsigned long psr, dcr, info, flags;
@@ -6753,7 +6701,7 @@ dump_pmu_state(const char *from)
 	local_irq_save(flags);
 
 	this_cpu = smp_processor_id();
-	regs     = ia64_task_regs(current);
+	regs     = task_pt_regs(current);
 	info     = PFM_CPUINFO_GET();
 	dcr      = ia64_getreg(_IA64_REG_CR_DCR);
 
@@ -6765,14 +6713,14 @@ dump_pmu_state(const char *from)
 	printk("CPU%d from %s() current [%d] iip=0x%lx %s\n", 
 		this_cpu, 
 		from, 
-		current->pid, 
+		task_pid_nr(current),
 		regs->cr_iip,
 		current->comm);
 
 	task = GET_PMU_OWNER();
 	ctx  = GET_PMU_CTX();
 
-	printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task->pid : -1, ctx);
+	printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task_pid_nr(task) : -1, ctx);
 
 	psr = pfm_get_psr();
 
@@ -6789,16 +6737,14 @@ dump_pmu_state(const char *from)
 	ia64_psr(regs)->up = 0;
 	ia64_psr(regs)->pp = 0;
 
-	t = &current->thread;
-
 	for (i=1; PMC_IS_LAST(i) == 0; i++) {
 		if (PMC_IS_IMPL(i) == 0) continue;
-		printk("->CPU%d pmc[%d]=0x%lx thread_pmc[%d]=0x%lx\n", this_cpu, i, ia64_get_pmc(i), i, t->pmcs[i]);
+		printk("->CPU%d pmc[%d]=0x%lx thread_pmc[%d]=0x%lx\n", this_cpu, i, ia64_get_pmc(i), i, ctx->th_pmcs[i]);
 	}
 
 	for (i=1; PMD_IS_LAST(i) == 0; i++) {
 		if (PMD_IS_IMPL(i) == 0) continue;
-		printk("->CPU%d pmd[%d]=0x%lx thread_pmd[%d]=0x%lx\n", this_cpu, i, ia64_get_pmd(i), i, t->pmds[i]);
+		printk("->CPU%d pmd[%d]=0x%lx thread_pmd[%d]=0x%lx\n", this_cpu, i, ia64_get_pmd(i), i, ctx->th_pmds[i]);
 	}
 
 	if (ctx) {
@@ -6822,7 +6768,7 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs)
 {
 	struct thread_struct *thread;
 
-	DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task->pid));
+	DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task_pid_nr(task)));
 
 	thread = &task->thread;
 
diff --git a/arch/ia64/kernel/perfmon_default_smpl.c b/arch/ia64/kernel/perfmon_default_smpl.c
index 344941db0a9..30c644ea44c 100644
--- a/arch/ia64/kernel/perfmon_default_smpl.c
+++ b/arch/ia64/kernel/perfmon_default_smpl.c
@@ -8,7 +8,6 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/module.h>
-#include <linux/config.h>
 #include <linux/init.h>
 #include <asm/delay.h>
 #include <linux/smp.h>
@@ -25,12 +24,12 @@ MODULE_LICENSE("GPL");
 #ifdef DEFAULT_DEBUG
 #define DPRINT(a) \
 	do { \
-		if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
+		if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d ", __func__, __LINE__, smp_processor_id()); printk a; } \
 	} while (0)
 
 #define DPRINT_ovfl(a) \
 	do { \
-		if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
+		if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d ", __func__, __LINE__, smp_processor_id()); printk a; } \
 	} while (0)
 
 #else
@@ -45,11 +44,11 @@ default_validate(struct task_struct *task, unsigned int flags, int cpu, void *da
 	int ret = 0;
 
 	if (data == NULL) {
-		DPRINT(("[%d] no argument passed\n", task->pid));
+		DPRINT(("[%d] no argument passed\n", task_pid_nr(task)));
 		return -EINVAL;
 	}
 
-	DPRINT(("[%d] validate flags=0x%x CPU%d\n", task->pid, flags, cpu));
+	DPRINT(("[%d] validate flags=0x%x CPU%d\n", task_pid_nr(task), flags, cpu));
 
 	/*
 	 * must hold at least the buffer header + one minimally sized entry
@@ -89,7 +88,7 @@ default_init(struct task_struct *task, void *buf, unsigned int flags, int cpu, v
 	hdr->hdr_count        = 0UL;
 
 	DPRINT(("[%d] buffer=%p buf_size=%lu hdr_size=%lu hdr_version=%u cur_offs=%lu\n",
-		task->pid,
+		task_pid_nr(task),
 		buf,
 		hdr->hdr_buf_size,
 		sizeof(*hdr),
@@ -151,7 +150,7 @@ default_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct
 	 * current = task running at the time of the overflow.
 	 *
 	 * per-task mode:
-	 * 	- this is ususally the task being monitored.
+	 * 	- this is usually the task being monitored.
 	 * 	  Under certain conditions, it might be a different task
 	 *
 	 * system-wide:
@@ -246,7 +245,7 @@ default_restart(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, stru
 static int
 default_exit(struct task_struct *task, void *buf, struct pt_regs *regs)
 {
-	DPRINT(("[%d] exit(%p)\n", task->pid, buf));
+	DPRINT(("[%d] exit(%p)\n", task_pid_nr(task), buf));
 	return 0;
 }
 
diff --git a/arch/ia64/kernel/perfmon_mckinley.h b/arch/ia64/kernel/perfmon_mckinley.h
index 9becccda289..c4bec7a9d18 100644
--- a/arch/ia64/kernel/perfmon_mckinley.h
+++ b/arch/ia64/kernel/perfmon_mckinley.h
@@ -181,7 +181,7 @@ static pmu_config_t pmu_conf_mck={
 	.pmc_desc      = pfm_mck_pmc_desc,
 	.num_ibrs       = 8,
 	.num_dbrs       = 8,
-	.use_rr_dbregs = 1 /* debug register are use for range retrictions */
+	.use_rr_dbregs = 1 /* debug register are use for range restrictions */
 };
 
 
diff --git a/arch/ia64/kernel/perfmon_montecito.h b/arch/ia64/kernel/perfmon_montecito.h
new file mode 100644
index 00000000000..7f8da4c7ca6
--- /dev/null
+++ b/arch/ia64/kernel/perfmon_montecito.h
@@ -0,0 +1,269 @@
+/*
+ * This file contains the Montecito PMU register description tables
+ * and pmc checker used by perfmon.c.
+ *
+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
+ *               Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ */
+static int pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
+
+#define RDEP_MONT_ETB	(RDEP(38)|RDEP(39)|RDEP(48)|RDEP(49)|RDEP(50)|RDEP(51)|RDEP(52)|RDEP(53)|RDEP(54)|\
+			 RDEP(55)|RDEP(56)|RDEP(57)|RDEP(58)|RDEP(59)|RDEP(60)|RDEP(61)|RDEP(62)|RDEP(63))
+#define RDEP_MONT_DEAR  (RDEP(32)|RDEP(33)|RDEP(36))
+#define RDEP_MONT_IEAR  (RDEP(34)|RDEP(35))
+
+static pfm_reg_desc_t pfm_mont_pmc_desc[PMU_MAX_PMCS]={
+/* pmc0  */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}},
+/* pmc1  */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}},
+/* pmc2  */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}},
+/* pmc3  */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}},
+/* pmc4  */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(4),0, 0, 0}, {0,0, 0, 0}},
+/* pmc5  */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(5),0, 0, 0}, {0,0, 0, 0}},
+/* pmc6  */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(6),0, 0, 0}, {0,0, 0, 0}},
+/* pmc7  */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(7),0, 0, 0}, {0,0, 0, 0}},
+/* pmc8  */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(8),0, 0, 0}, {0,0, 0, 0}},
+/* pmc9  */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(9),0, 0, 0}, {0,0, 0, 0}},
+/* pmc10 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(10),0, 0, 0}, {0,0, 0, 0}},
+/* pmc11 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(11),0, 0, 0}, {0,0, 0, 0}},
+/* pmc12 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(12),0, 0, 0}, {0,0, 0, 0}},
+/* pmc13 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(13),0, 0, 0}, {0,0, 0, 0}},
+/* pmc14 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(14),0, 0, 0}, {0,0, 0, 0}},
+/* pmc15 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(15),0, 0, 0}, {0,0, 0, 0}},
+/* pmc16 */ { PFM_REG_NOTIMPL, },
+/* pmc17 */ { PFM_REG_NOTIMPL, },
+/* pmc18 */ { PFM_REG_NOTIMPL, },
+/* pmc19 */ { PFM_REG_NOTIMPL, },
+/* pmc20 */ { PFM_REG_NOTIMPL, },
+/* pmc21 */ { PFM_REG_NOTIMPL, },
+/* pmc22 */ { PFM_REG_NOTIMPL, },
+/* pmc23 */ { PFM_REG_NOTIMPL, },
+/* pmc24 */ { PFM_REG_NOTIMPL, },
+/* pmc25 */ { PFM_REG_NOTIMPL, },
+/* pmc26 */ { PFM_REG_NOTIMPL, },
+/* pmc27 */ { PFM_REG_NOTIMPL, },
+/* pmc28 */ { PFM_REG_NOTIMPL, },
+/* pmc29 */ { PFM_REG_NOTIMPL, },
+/* pmc30 */ { PFM_REG_NOTIMPL, },
+/* pmc31 */ { PFM_REG_NOTIMPL, },
+/* pmc32 */ { PFM_REG_CONFIG,  0, 0x30f01ffffffffffUL, 0x30f01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
+/* pmc33 */ { PFM_REG_CONFIG,  0, 0x0,  0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
+/* pmc34 */ { PFM_REG_CONFIG,  0, 0xf01ffffffffffUL, 0xf01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
+/* pmc35 */ { PFM_REG_CONFIG,  0, 0x0,  0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
+/* pmc36 */ { PFM_REG_CONFIG,  0, 0xfffffff0, 0xf, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
+/* pmc37 */ { PFM_REG_MONITOR, 4, 0x0, 0x3fff, NULL, pfm_mont_pmc_check, {RDEP_MONT_IEAR, 0, 0, 0}, {0, 0, 0, 0}},
+/* pmc38 */ { PFM_REG_CONFIG,  0, 0xdb6, 0x2492, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
+/* pmc39 */ { PFM_REG_MONITOR, 6, 0x0, 0xffcf, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}},
+/* pmc40 */ { PFM_REG_MONITOR, 6, 0x2000000, 0xf01cf, NULL, pfm_mont_pmc_check, {RDEP_MONT_DEAR,0, 0, 0}, {0,0, 0, 0}},
+/* pmc41 */ { PFM_REG_CONFIG,  0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
+/* pmc42 */ { PFM_REG_MONITOR, 6, 0x0, 0x7ff4f, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}},
+	    { PFM_REG_END    , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */
+};
+
+static pfm_reg_desc_t pfm_mont_pmd_desc[PMU_MAX_PMDS]={
+/* pmd0  */ { PFM_REG_NOTIMPL, }, 
+/* pmd1  */ { PFM_REG_NOTIMPL, },
+/* pmd2  */ { PFM_REG_NOTIMPL, },
+/* pmd3  */ { PFM_REG_NOTIMPL, },
+/* pmd4  */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(4),0, 0, 0}},
+/* pmd5  */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(5),0, 0, 0}},
+/* pmd6  */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(6),0, 0, 0}},
+/* pmd7  */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(7),0, 0, 0}},
+/* pmd8  */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(8),0, 0, 0}}, 
+/* pmd9  */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(9),0, 0, 0}},
+/* pmd10 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(10),0, 0, 0}},
+/* pmd11 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(11),0, 0, 0}},
+/* pmd12 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(12),0, 0, 0}},
+/* pmd13 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(13),0, 0, 0}},
+/* pmd14 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(14),0, 0, 0}},
+/* pmd15 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(15),0, 0, 0}},
+/* pmd16 */ { PFM_REG_NOTIMPL, },
+/* pmd17 */ { PFM_REG_NOTIMPL, },
+/* pmd18 */ { PFM_REG_NOTIMPL, },
+/* pmd19 */ { PFM_REG_NOTIMPL, },
+/* pmd20 */ { PFM_REG_NOTIMPL, },
+/* pmd21 */ { PFM_REG_NOTIMPL, },
+/* pmd22 */ { PFM_REG_NOTIMPL, },
+/* pmd23 */ { PFM_REG_NOTIMPL, },
+/* pmd24 */ { PFM_REG_NOTIMPL, },
+/* pmd25 */ { PFM_REG_NOTIMPL, },
+/* pmd26 */ { PFM_REG_NOTIMPL, },
+/* pmd27 */ { PFM_REG_NOTIMPL, },
+/* pmd28 */ { PFM_REG_NOTIMPL, },
+/* pmd29 */ { PFM_REG_NOTIMPL, },
+/* pmd30 */ { PFM_REG_NOTIMPL, },
+/* pmd31 */ { PFM_REG_NOTIMPL, },
+/* pmd32 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(33)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}},
+/* pmd33 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}},
+/* pmd34 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(35),0, 0, 0}, {RDEP(37),0, 0, 0}},
+/* pmd35 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(34),0, 0, 0}, {RDEP(37),0, 0, 0}},
+/* pmd36 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(33),0, 0, 0}, {RDEP(40),0, 0, 0}},
+/* pmd37 */ { PFM_REG_NOTIMPL, },
+/* pmd38 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd39 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd40 */ { PFM_REG_NOTIMPL, },
+/* pmd41 */ { PFM_REG_NOTIMPL, },
+/* pmd42 */ { PFM_REG_NOTIMPL, },
+/* pmd43 */ { PFM_REG_NOTIMPL, },
+/* pmd44 */ { PFM_REG_NOTIMPL, },
+/* pmd45 */ { PFM_REG_NOTIMPL, },
+/* pmd46 */ { PFM_REG_NOTIMPL, },
+/* pmd47 */ { PFM_REG_NOTIMPL, },
+/* pmd48 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd49 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd50 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd51 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd52 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd53 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd54 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd55 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd56 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd57 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd58 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd59 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd60 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd61 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd62 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+/* pmd63 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
+	    { PFM_REG_END   , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */
+};
+
+/*
+ * PMC reserved fields must have their power-up values preserved
+ */
+static int
+pfm_mont_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs)
+{
+	unsigned long tmp1, tmp2, ival = *val;
+
+	/* remove reserved areas from user value */
+	tmp1 = ival & PMC_RSVD_MASK(cnum);
+
+	/* get reserved fields values */
+	tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum);
+
+	*val = tmp1 | tmp2;
+
+	DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n",
+		  cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val));
+	return 0;
+}
+
+/*
+ * task can be NULL if the context is unloaded
+ */
+static int
+pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs)
+{
+	int ret = 0;
+	unsigned long val32 = 0, val38 = 0, val41 = 0;
+	unsigned long tmpval;
+	int check_case1 = 0;
+	int is_loaded;
+
+	/* first preserve the reserved fields */
+	pfm_mont_reserved(cnum, val, regs);
+
+	tmpval = *val;
+
+	/* sanity check */
+	if (ctx == NULL) return -EINVAL;
+
+	is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED;
+
+	/*
+	 * we must clear the debug registers if pmc41 has a value which enable
+	 * memory pipeline event constraints. In this case we need to clear the
+	 * the debug registers if they have not yet been accessed. This is required
+	 * to avoid picking stale state.
+	 * PMC41 is "active" if:
+	 * 	one of the pmc41.cfg_dtagXX field is different from 0x3
+	 * AND
+	 * 	at the corresponding pmc41.en_dbrpXX is set.
+	 * AND
+	 *	ctx_fl_using_dbreg == 0  (i.e., dbr not yet used)
+	 */
+	DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, tmpval, ctx->ctx_fl_using_dbreg, is_loaded));
+
+	if (cnum == 41 && is_loaded 
+	    && (tmpval & 0x1e00000000000UL) && (tmpval & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) {
+
+		DPRINT(("pmc[%d]=0x%lx has active pmc41 settings, clearing dbr\n", cnum, tmpval));
+
+		/* don't mix debug with perfmon */
+		if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
+
+		/*
+		 * a count of 0 will mark the debug registers if:
+		 * AND
+		 */
+		ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs);
+		if (ret) return ret;
+	}
+	/*
+	 * we must clear the (instruction) debug registers if:
+	 * 	pmc38.ig_ibrpX is 0 (enabled)
+	 * AND
+	 *	ctx_fl_using_dbreg == 0  (i.e., dbr not yet used)
+	 */
+	if (cnum == 38 && is_loaded && ((tmpval & 0x492UL) != 0x492UL) && ctx->ctx_fl_using_dbreg == 0) {
+
+		DPRINT(("pmc38=0x%lx has active pmc38 settings, clearing ibr\n", tmpval));
+
+		/* don't mix debug with perfmon */
+		if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
+
+		/*
+		 * a count of 0 will mark the debug registers as in use and also
+		 * ensure that they are properly cleared.
+		 */
+		ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs);
+		if (ret) return ret;
+
+	}
+	switch(cnum) {
+		case  32: val32 = *val;
+			  val38 = ctx->ctx_pmcs[38];
+			  val41 = ctx->ctx_pmcs[41];
+			  check_case1 = 1;
+			  break;
+		case  38: val38 = *val;
+			  val32 = ctx->ctx_pmcs[32];
+			  val41 = ctx->ctx_pmcs[41];
+			  check_case1 = 1;
+			  break;
+		case  41: val41 = *val;
+			  val32 = ctx->ctx_pmcs[32];
+			  val38 = ctx->ctx_pmcs[38];
+			  check_case1 = 1;
+			  break;
+	}
+	/* check illegal configuration which can produce inconsistencies in tagging
+	 * i-side events in L1D and L2 caches
+	 */
+	if (check_case1) {
+		ret =   (((val41 >> 45) & 0xf) == 0 && ((val32>>57) & 0x1) == 0)
+		     && ((((val38>>1) & 0x3) == 0x2 || ((val38>>1) & 0x3) == 0)
+		     ||  (((val38>>4) & 0x3) == 0x2 || ((val38>>4) & 0x3) == 0));
+		if (ret) {
+			DPRINT(("invalid config pmc38=0x%lx pmc41=0x%lx pmc32=0x%lx\n", val38, val41, val32));
+			return -EINVAL;
+		}
+	}
+	*val = tmpval;
+	return 0;
+}
+
+/*
+ * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
+ */
+static pmu_config_t pmu_conf_mont={
+	.pmu_name        = "Montecito",
+	.pmu_family      = 0x20,
+	.flags           = PFM_PMU_IRQ_RESEND,
+	.ovfl_val        = (1UL << 47) - 1,
+	.pmd_desc        = pfm_mont_pmd_desc,
+	.pmc_desc        = pfm_mont_pmc_desc,
+	.num_ibrs        = 8,
+	.num_dbrs        = 8,
+	.use_rr_dbregs   = 1 /* debug register are use for range retrictions */
+};
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 051e050359e..55d4ba47a90 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -4,10 +4,10 @@
  * Copyright (C) 1998-2003 Hewlett-Packard Co
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  * 04/11/17 Ashok Raj	<ashok.raj@intel.com> Added CPU Hotplug Support
+ *
+ * 2005-10-07 Keith Owens <kaos@sgi.com>
+ *	      Add notify_die() hooks.
  */
-#define __KERNEL_SYSCALLS__	/* see <asm/unistd.h> */
-#include <linux/config.h>
-
 #include <linux/cpu.h>
 #include <linux/pm.h>
 #include <linux/elf.h>
@@ -15,28 +15,31 @@
 #include <linux/kallsyms.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/notifier.h>
 #include <linux/personality.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/stddef.h>
 #include <linux/thread_info.h>
 #include <linux/unistd.h>
 #include <linux/efi.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
-#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/utsname.h>
+#include <linux/tracehook.h>
+#include <linux/rcupdate.h>
 
 #include <asm/cpu.h>
 #include <asm/delay.h>
 #include <asm/elf.h>
-#include <asm/ia32.h>
 #include <asm/irq.h>
+#include <asm/kexec.h>
 #include <asm/pgalloc.h>
 #include <asm/processor.h>
 #include <asm/sal.h>
+#include <asm/switch_to.h>
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 #include <asm/unwind.h>
@@ -51,10 +54,11 @@
 #include "sigframe.h"
 
 void (*ia64_mark_idle)(int);
-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
 
-unsigned long boot_option_idle_override = 0;
+unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
 EXPORT_SYMBOL(boot_option_idle_override);
+void (*pm_power_off) (void);
+EXPORT_SYMBOL(pm_power_off);
 
 void
 ia64_do_show_stack (struct unw_frame_info *info, void *arg)
@@ -92,22 +96,16 @@ show_stack (struct task_struct *task, unsigned long *sp)
 }
 
 void
-dump_stack (void)
-{
-	show_stack(NULL, NULL);
-}
-
-EXPORT_SYMBOL(dump_stack);
-
-void
 show_regs (struct pt_regs *regs)
 {
 	unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri;
 
 	print_modules();
-	printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm);
-	printk("psr : %016lx ifs : %016lx ip  : [<%016lx>]    %s\n",
-	       regs->cr_ipsr, regs->cr_ifs, ip, print_tainted());
+	printk("\n");
+	show_regs_print_info(KERN_DEFAULT);
+	printk("psr : %016lx ifs : %016lx ip  : [<%016lx>]    %s (%s)\n",
+	       regs->cr_ipsr, regs->cr_ifs, ip, print_tainted(),
+	       init_utsname()->release);
 	print_symbol("ip is at %s\n", ip);
 	printk("unat: %016lx pfs : %016lx rsc : %016lx\n",
 	       regs->ar_unat, regs->ar_pfs, regs->ar_rsc);
@@ -154,11 +152,21 @@ show_regs (struct pt_regs *regs)
 		show_stack(NULL, NULL);
 }
 
+/* local support for deprecated console_print */
+void
+console_print(const char *s)
+{
+	printk(KERN_EMERG "%s", s);
+}
+
 void
-do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall)
+do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall)
 {
 	if (fsys_mode(current, &scr->pt)) {
-		/* defer signal-handling etc. until we return to privilege-level 0.  */
+		/*
+		 * defer signal-handling etc. until we return to
+		 * privilege-level 0.
+		 */
 		if (!ia64_psr(&scr->pt)->lp)
 			ia64_psr(&scr->pt)->lp = 1;
 		return;
@@ -166,49 +174,44 @@ do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall
 
 #ifdef CONFIG_PERFMON
 	if (current->thread.pfm_needs_checking)
+		/*
+		 * Note: pfm_handle_work() allow us to call it with interrupts
+		 * disabled, and may enable interrupts within the function.
+		 */
 		pfm_handle_work();
 #endif
 
 	/* deal with pending signal delivery */
-	if (test_thread_flag(TIF_SIGPENDING))
-		ia64_do_signal(oldset, scr, in_syscall);
-}
+	if (test_thread_flag(TIF_SIGPENDING)) {
+		local_irq_enable();	/* force interrupt enable */
+		ia64_do_signal(scr, in_syscall);
+	}
 
-static int pal_halt        = 1;
-static int can_do_pal_halt = 1;
+	if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) {
+		local_irq_enable();	/* force interrupt enable */
+		tracehook_notify_resume(&scr->pt);
+	}
 
-static int __init nohalt_setup(char * str)
-{
-	pal_halt = can_do_pal_halt = 0;
-	return 1;
-}
-__setup("nohalt", nohalt_setup);
+	/* copy user rbs to kernel rbs */
+	if (unlikely(test_thread_flag(TIF_RESTORE_RSE))) {
+		local_irq_enable();	/* force interrupt enable */
+		ia64_sync_krbs();
+	}
 
-void
-update_pal_halt_status(int status)
-{
-	can_do_pal_halt = pal_halt && status;
+	local_irq_disable();	/* force interrupt disable */
 }
 
-/*
- * We use this if we don't have any better idle routine..
- */
-void
-default_idle (void)
+static int __init nohalt_setup(char * str)
 {
-	local_irq_enable();
-	while (!need_resched())
-		if (can_do_pal_halt)
-			safe_halt();
-		else
-			cpu_relax();
+	cpu_idle_poll_ctrl(true);
+	return 1;
 }
+__setup("nohalt", nohalt_setup);
 
 #ifdef CONFIG_HOTPLUG_CPU
 /* We don't actually take CPU down, just spin without interrupts. */
 static inline void play_dead(void)
 {
-	extern void ia64_cpu_local_tick (void);
 	unsigned int this_cpu = smp_processor_id();
 
 	/* Ack it */
@@ -231,72 +234,29 @@ static inline void play_dead(void)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-void cpu_idle_wait(void)
+void arch_cpu_idle_dead(void)
 {
-	unsigned int cpu, this_cpu = get_cpu();
-	cpumask_t map;
-
-	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
-	put_cpu();
-
-	cpus_clear(map);
-	for_each_online_cpu(cpu) {
-		per_cpu(cpu_idle_state, cpu) = 1;
-		cpu_set(cpu, map);
-	}
-
-	__get_cpu_var(cpu_idle_state) = 0;
-
-	wmb();
-	do {
-		ssleep(1);
-		for_each_online_cpu(cpu) {
-			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
-				cpu_clear(cpu, map);
-		}
-		cpus_and(map, map, cpu_online_map);
-	} while (!cpus_empty(map));
+	play_dead();
 }
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
 
-void __attribute__((noreturn))
-cpu_idle (void)
+void arch_cpu_idle(void)
 {
 	void (*mark_idle)(int) = ia64_mark_idle;
 
-	/* endless idle loop with no priority at all */
-	while (1) {
 #ifdef CONFIG_SMP
-		if (!need_resched())
-			min_xtp();
+	min_xtp();
 #endif
-		while (!need_resched()) {
-			void (*idle)(void);
-
-			if (__get_cpu_var(cpu_idle_state))
-				__get_cpu_var(cpu_idle_state) = 0;
-
-			rmb();
-			if (mark_idle)
-				(*mark_idle)(1);
+	rmb();
+	if (mark_idle)
+		(*mark_idle)(1);
 
-			idle = pm_idle;
-			if (!idle)
-				idle = default_idle;
-			(*idle)();
-		}
-
-		if (mark_idle)
-			(*mark_idle)(0);
+	safe_halt();
 
+	if (mark_idle)
+		(*mark_idle)(0);
 #ifdef CONFIG_SMP
-		normal_xtp();
+	normal_xtp();
 #endif
-		schedule();
-		check_pgt_cache();
-		if (cpu_is_offline(smp_processor_id()))
-			play_dead();
-	}
 }
 
 void
@@ -317,11 +277,6 @@ ia64_save_extra (struct task_struct *task)
 	if (info & PFM_CPUINFO_SYST_WIDE)
 		pfm_syst_wide_update_task(task, info, 0);
 #endif
-
-#ifdef CONFIG_IA32_SUPPORT
-	if (IS_IA32_PROCESS(ia64_task_regs(task)))
-		ia32_save_state(task);
-#endif
 }
 
 void
@@ -342,11 +297,6 @@ ia64_load_extra (struct task_struct *task)
 	if (info & PFM_CPUINFO_SYST_WIDE) 
 		pfm_syst_wide_update_task(task, info, 1);
 #endif
-
-#ifdef CONFIG_IA32_SUPPORT
-	if (IS_IA32_PROCESS(ia64_task_regs(task)))
-		ia32_load_state(task);
-#endif
 }
 
 /*
@@ -381,77 +331,26 @@ ia64_load_extra (struct task_struct *task)
  * so there is nothing to worry about.
  */
 int
-copy_thread (int nr, unsigned long clone_flags,
+copy_thread(unsigned long clone_flags,
 	     unsigned long user_stack_base, unsigned long user_stack_size,
-	     struct task_struct *p, struct pt_regs *regs)
+	     struct task_struct *p)
 {
-	extern char ia64_ret_from_clone, ia32_ret_from_clone;
+	extern char ia64_ret_from_clone;
 	struct switch_stack *child_stack, *stack;
 	unsigned long rbs, child_rbs, rbs_size;
 	struct pt_regs *child_ptregs;
+	struct pt_regs *regs = current_pt_regs();
 	int retval = 0;
 
-#ifdef CONFIG_SMP
-	/*
-	 * For SMP idle threads, fork_by_hand() calls do_fork with
-	 * NULL regs.
-	 */
-	if (!regs)
-		return 0;
-#endif
-
-	stack = ((struct switch_stack *) regs) - 1;
-
 	child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1;
 	child_stack = (struct switch_stack *) child_ptregs - 1;
 
-	/* copy parent's switch_stack & pt_regs to child: */
-	memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack));
-
 	rbs = (unsigned long) current + IA64_RBS_OFFSET;
 	child_rbs = (unsigned long) p + IA64_RBS_OFFSET;
-	rbs_size = stack->ar_bspstore - rbs;
-
-	/* copy the parent's register backing store to the child: */
-	memcpy((void *) child_rbs, (void *) rbs, rbs_size);
-
-	if (likely(user_mode(child_ptregs))) {
-		if ((clone_flags & CLONE_SETTLS) && !IS_IA32_PROCESS(regs))
-			child_ptregs->r13 = regs->r16;	/* see sys_clone2() in entry.S */
-		if (user_stack_base) {
-			child_ptregs->r12 = user_stack_base + user_stack_size - 16;
-			child_ptregs->ar_bspstore = user_stack_base;
-			child_ptregs->ar_rnat = 0;
-			child_ptregs->loadrs = 0;
-		}
-	} else {
-		/*
-		 * Note: we simply preserve the relative position of
-		 * the stack pointer here.  There is no need to
-		 * allocate a scratch area here, since that will have
-		 * been taken care of by the caller of sys_clone()
-		 * already.
-		 */
-		child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */
-		child_ptregs->r13 = (unsigned long) p;		/* set `current' pointer */
-	}
-	child_stack->ar_bspstore = child_rbs + rbs_size;
-	if (IS_IA32_PROCESS(regs))
-		child_stack->b0 = (unsigned long) &ia32_ret_from_clone;
-	else
-		child_stack->b0 = (unsigned long) &ia64_ret_from_clone;
 
 	/* copy parts of thread_struct: */
 	p->thread.ksp = (unsigned long) child_stack - 16;
 
-	/* stop some PSR bits from being inherited.
-	 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve()
-	 * therefore we must specify them explicitly here and not include them in
-	 * IA64_PSR_BITS_TO_CLEAR.
-	 */
-	child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
-				 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
-
 	/*
 	 * NOTE: The calling convention considers all floating point
 	 * registers in the high partition (fph) to be scratch.  Since
@@ -473,22 +372,65 @@ copy_thread (int nr, unsigned long clone_flags,
 #	define THREAD_FLAGS_TO_SET	0
 	p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR)
 			   | THREAD_FLAGS_TO_SET);
+
 	ia64_drop_fpu(p);	/* don't pick up stale state from a CPU's fph */
-#ifdef CONFIG_IA32_SUPPORT
-	/*
-	 * If we're cloning an IA32 task then save the IA32 extra
-	 * state from the current task to the new task
-	 */
-	if (IS_IA32_PROCESS(ia64_task_regs(current))) {
-		ia32_save_state(p);
-		if (clone_flags & CLONE_SETTLS)
-			retval = ia32_clone_tls(p, child_ptregs);
-
-		/* Copy partially mapped page list */
-		if (!retval)
-			retval = ia32_copy_partial_page_list(p, clone_flags);
+
+	if (unlikely(p->flags & PF_KTHREAD)) {
+		if (unlikely(!user_stack_base)) {
+			/* fork_idle() called us */
+			return 0;
+		}
+		memset(child_stack, 0, sizeof(*child_ptregs) + sizeof(*child_stack));
+		child_stack->r4 = user_stack_base;	/* payload */
+		child_stack->r5 = user_stack_size;	/* argument */
+		/*
+		 * Preserve PSR bits, except for bits 32-34 and 37-45,
+		 * which we can't read.
+		 */
+		child_ptregs->cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
+		/* mark as valid, empty frame */
+		child_ptregs->cr_ifs = 1UL << 63;
+		child_stack->ar_fpsr = child_ptregs->ar_fpsr
+			= ia64_getreg(_IA64_REG_AR_FPSR);
+		child_stack->pr = (1 << PRED_KERNEL_STACK);
+		child_stack->ar_bspstore = child_rbs;
+		child_stack->b0 = (unsigned long) &ia64_ret_from_clone;
+
+		/* stop some PSR bits from being inherited.
+		 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve()
+		 * therefore we must specify them explicitly here and not include them in
+		 * IA64_PSR_BITS_TO_CLEAR.
+		 */
+		child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
+				 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
+
+		return 0;
 	}
-#endif
+	stack = ((struct switch_stack *) regs) - 1;
+	/* copy parent's switch_stack & pt_regs to child: */
+	memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack));
+
+	/* copy the parent's register backing store to the child: */
+	rbs_size = stack->ar_bspstore - rbs;
+	memcpy((void *) child_rbs, (void *) rbs, rbs_size);
+	if (clone_flags & CLONE_SETTLS)
+		child_ptregs->r13 = regs->r16;	/* see sys_clone2() in entry.S */
+	if (user_stack_base) {
+		child_ptregs->r12 = user_stack_base + user_stack_size - 16;
+		child_ptregs->ar_bspstore = user_stack_base;
+		child_ptregs->ar_rnat = 0;
+		child_ptregs->loadrs = 0;
+	}
+	child_stack->ar_bspstore = child_rbs + rbs_size;
+	child_stack->b0 = (unsigned long) &ia64_ret_from_clone;
+
+	/* stop some PSR bits from being inherited.
+	 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve()
+	 * therefore we must specify them explicitly here and not include them in
+	 * IA64_PSR_BITS_TO_CLEAR.
+	 */
+	child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
+				 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
 
 #ifdef CONFIG_PERFMON
 	if (current->thread.pfm_context)
@@ -500,7 +442,8 @@ copy_thread (int nr, unsigned long clone_flags,
 static void
 do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg)
 {
-	unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm;
+	unsigned long mask, sp, nat_bits = 0, ar_rnat, urbs_end, cfm;
+	unsigned long uninitialized_var(ip);	/* GCC be quiet */
 	elf_greg_t *dst = arg;
 	struct pt_regs *pt;
 	char nat;
@@ -602,21 +545,6 @@ do_dump_fpu (struct unw_frame_info *info, void *arg)
 	do_dump_task_fpu(current, info, arg);
 }
 
-int
-dump_task_regs(struct task_struct *task, elf_gregset_t *regs)
-{
-	struct unw_frame_info tcore_info;
-
-	if (current == task) {
-		unw_init_running(do_copy_regs, regs);
-	} else {
-		memset(&tcore_info, 0, sizeof(tcore_info));
-		unw_init_from_blocked_task(&tcore_info, task);
-		do_copy_task_regs(task, &tcore_info, regs);
-	}
-	return 1;
-}
-
 void
 ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst)
 {
@@ -624,103 +552,21 @@ ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst)
 }
 
 int
-dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst)
-{
-	struct unw_frame_info tcore_info;
-
-	if (current == task) {
-		unw_init_running(do_dump_fpu, dst);
-	} else {
-		memset(&tcore_info, 0, sizeof(tcore_info));
-		unw_init_from_blocked_task(&tcore_info, task);
-		do_dump_task_fpu(task, &tcore_info, dst);
-	}
-	return 1;
-}
-
-int
 dump_fpu (struct pt_regs *pt, elf_fpregset_t dst)
 {
 	unw_init_running(do_dump_fpu, dst);
 	return 1;	/* f0-f31 are always valid so we always return 1 */
 }
 
-long
-sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp,
-	    struct pt_regs *regs)
-{
-	char *fname;
-	int error;
-
-	fname = getname(filename);
-	error = PTR_ERR(fname);
-	if (IS_ERR(fname))
-		goto out;
-	error = do_execve(fname, argv, envp, regs);
-	putname(fname);
-out:
-	return error;
-}
-
-pid_t
-kernel_thread (int (*fn)(void *), void *arg, unsigned long flags)
-{
-	extern void start_kernel_thread (void);
-	unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread;
-	struct {
-		struct switch_stack sw;
-		struct pt_regs pt;
-	} regs;
-
-	memset(&regs, 0, sizeof(regs));
-	regs.pt.cr_iip = helper_fptr[0];	/* set entry point (IP) */
-	regs.pt.r1 = helper_fptr[1];		/* set GP */
-	regs.pt.r9 = (unsigned long) fn;	/* 1st argument */
-	regs.pt.r11 = (unsigned long) arg;	/* 2nd argument */
-	/* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read.  */
-	regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
-	regs.pt.cr_ifs = 1UL << 63;		/* mark as valid, empty frame */
-	regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR);
-	regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET;
-	regs.sw.pr = (1 << PRED_KERNEL_STACK);
-	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs.pt, 0, NULL, NULL);
-}
-EXPORT_SYMBOL(kernel_thread);
-
-/* This gets called from kernel_thread() via ia64_invoke_thread_helper().  */
-int
-kernel_thread_helper (int (*fn)(void *), void *arg)
-{
-#ifdef CONFIG_IA32_SUPPORT
-	if (IS_IA32_PROCESS(ia64_task_regs(current))) {
-		/* A kernel thread is always a 64-bit process. */
-		current->thread.map_base  = DEFAULT_MAP_BASE;
-		current->thread.task_size = DEFAULT_TASK_SIZE;
-		ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob);
-		ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1);
-	}
-#endif
-	return (*fn)(arg);
-}
-
 /*
  * Flush thread state.  This is called when a thread does an execve().
  */
 void
 flush_thread (void)
 {
-	/*
-	 * Remove function-return probe instances associated with this task
-	 * and put them back on the free list. Do not insert an exit probe for
-	 * this function, it will be disabled by kprobe_flush_task if you do.
-	 */
-	kprobe_flush_task(current);
-
 	/* drop floating-point and debug-register state if it exists: */
 	current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID);
 	ia64_drop_fpu(current);
-	if (IS_IA32_PROCESS(ia64_task_regs(current)))
-		ia32_drop_partial_page_list(current);
 }
 
 /*
@@ -731,13 +577,6 @@ void
 exit_thread (void)
 {
 
-	/*
-	 * Remove function-return probe instances associated with this task
-	 * and put them back on the free list. Do not insert an exit probe for
-	 * this function, it will be disabled by kprobe_flush_task if you do.
-	 */
-	kprobe_flush_task(current);
-
 	ia64_drop_fpu(current);
 #ifdef CONFIG_PERFMON
        /* if needed, stop monitoring and flush state to perfmon context */
@@ -748,8 +587,6 @@ exit_thread (void)
 	if (current->thread.flags & IA64_THREAD_DBG_VALID)
 		pfm_release_debug_registers(current);
 #endif
-	if (IS_IA32_PROCESS(ia64_task_regs(current)))
-		ia32_drop_partial_page_list(current);
 }
 
 unsigned long
@@ -759,6 +596,9 @@ get_wchan (struct task_struct *p)
 	unsigned long ip;
 	int count = 0;
 
+	if (!p || p == current || p->state == TASK_RUNNING)
+		return 0;
+
 	/*
 	 * Note: p may not be a blocked task (it could be current or
 	 * another process running on some other CPU.  Rather than
@@ -769,6 +609,8 @@ get_wchan (struct task_struct *p)
 	 */
 	unw_init_from_blocked_task(&info, p);
 	do {
+		if (p->state == TASK_RUNNING)
+			return 0;
 		if (unw_unwind(&info) < 0)
 			return 0;
 		unw_get_ip(&info, &ip);
@@ -801,15 +643,32 @@ cpu_halt (void)
 		ia64_pal_halt(min_power_state);
 }
 
+void machine_shutdown(void)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		if (cpu != smp_processor_id())
+			cpu_down(cpu);
+	}
+#endif
+#ifdef CONFIG_KEXEC
+	kexec_disable_iosapic();
+#endif
+}
+
 void
 machine_restart (char *restart_cmd)
 {
+	(void) notify_die(DIE_MACHINE_RESTART, restart_cmd, NULL, 0, 0, 0);
 	(*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL);
 }
 
 void
 machine_halt (void)
 {
+	(void) notify_die(DIE_MACHINE_HALT, "", NULL, 0, 0, 0);
 	cpu_halt();
 }
 
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 4b19d041063..b7a5fffe092 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -3,27 +3,29 @@
  *
  * Copyright (C) 1999-2005 Hewlett-Packard Co
  *	David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2006 Intel Co
+ *  2006-08-12	- IA64 Native Utrace implementation support added by
+ *	Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  *
  * Derived from the x86 and Alpha versions.
  */
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
 #include <linux/ptrace.h>
-#include <linux/smp_lock.h>
 #include <linux/user.h>
 #include <linux/security.h>
 #include <linux/audit.h>
 #include <linux/signal.h>
+#include <linux/regset.h>
+#include <linux/elf.h>
+#include <linux/tracehook.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/ptrace_offsets.h>
 #include <asm/rse.h>
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/unwind.h>
 #ifdef CONFIG_PERFMON
@@ -254,7 +256,7 @@ get_rnat (struct task_struct *task, struct switch_stack *sw,
 	long num_regs, nbits;
 	struct pt_regs *pt;
 
-	pt = ia64_task_regs(task);
+	pt = task_pt_regs(task);
 	kbsp = (unsigned long *) sw->ar_bspstore;
 	ubspstore = (unsigned long *) pt->ar_bspstore;
 
@@ -314,7 +316,7 @@ put_rnat (struct task_struct *task, struct switch_stack *sw,
 	struct pt_regs *pt;
 	unsigned long cfm, *urbs_kargs;
 
-	pt = ia64_task_regs(task);
+	pt = task_pt_regs(task);
 	kbsp = (unsigned long *) sw->ar_bspstore;
 	ubspstore = (unsigned long *) pt->ar_bspstore;
 
@@ -407,7 +409,7 @@ ia64_peek (struct task_struct *child, struct switch_stack *child_stack,
 
 	urbs_end = (long *) user_rbs_end;
 	laddr = (unsigned long *) addr;
-	child_regs = ia64_task_regs(child);
+	child_regs = task_pt_regs(child);
 	bspstore = (unsigned long *) child_regs->ar_bspstore;
 	krbs = (unsigned long *) child + IA64_RBS_OFFSET/8;
 	if (on_kernel_rbs(addr, (unsigned long) bspstore,
@@ -467,7 +469,7 @@ ia64_poke (struct task_struct *child, struct switch_stack *child_stack,
 	struct pt_regs *child_regs;
 
 	laddr = (unsigned long *) addr;
-	child_regs = ia64_task_regs(child);
+	child_regs = task_pt_regs(child);
 	bspstore = (unsigned long *) child_regs->ar_bspstore;
 	krbs = (unsigned long *) child + IA64_RBS_OFFSET/8;
 	if (on_kernel_rbs(addr, (unsigned long) bspstore,
@@ -548,77 +550,126 @@ ia64_sync_user_rbs (struct task_struct *child, struct switch_stack *sw,
 	return 0;
 }
 
-static inline int
-thread_matches (struct task_struct *thread, unsigned long addr)
+static long
+ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw,
+		unsigned long user_rbs_start, unsigned long user_rbs_end)
 {
-	unsigned long thread_rbs_end;
-	struct pt_regs *thread_regs;
+	unsigned long addr, val;
+	long ret;
 
-	if (ptrace_check_attach(thread, 0) < 0)
-		/*
-		 * If the thread is not in an attachable state, we'll
-		 * ignore it.  The net effect is that if ADDR happens
-		 * to overlap with the portion of the thread's
-		 * register backing store that is currently residing
-		 * on the thread's kernel stack, then ptrace() may end
-		 * up accessing a stale value.  But if the thread
-		 * isn't stopped, that's a problem anyhow, so we're
-		 * doing as well as we can...
-		 */
-		return 0;
+	/* now copy word for word from user rbs to kernel rbs: */
+	for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) {
+		if (access_process_vm(child, addr, &val, sizeof(val), 0)
+				!= sizeof(val))
+			return -EIO;
 
-	thread_regs = ia64_task_regs(thread);
-	thread_rbs_end = ia64_get_user_rbs_end(thread, thread_regs, NULL);
-	if (!on_kernel_rbs(addr, thread_regs->ar_bspstore, thread_rbs_end))
-		return 0;
+		ret = ia64_poke(child, sw, user_rbs_end, addr, val);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
+typedef long (*syncfunc_t)(struct task_struct *, struct switch_stack *,
+			    unsigned long, unsigned long);
+
+static void do_sync_rbs(struct unw_frame_info *info, void *arg)
+{
+	struct pt_regs *pt;
+	unsigned long urbs_end;
+	syncfunc_t fn = arg;
 
-	return 1;	/* looks like we've got a winner */
+	if (unw_unwind_to_user(info) < 0)
+		return;
+	pt = task_pt_regs(info->task);
+	urbs_end = ia64_get_user_rbs_end(info->task, pt, NULL);
+
+	fn(info->task, info->sw, pt->ar_bspstore, urbs_end);
 }
 
 /*
- * GDB apparently wants to be able to read the register-backing store
- * of any thread when attached to a given process.  If we are peeking
- * or poking an address that happens to reside in the kernel-backing
- * store of another thread, we need to attach to that thread, because
- * otherwise we end up accessing stale data.
- *
- * task_list_lock must be read-locked before calling this routine!
+ * when a thread is stopped (ptraced), debugger might change thread's user
+ * stack (change memory directly), and we must avoid the RSE stored in kernel
+ * to override user stack (user space's RSE is newer than kernel's in the
+ * case). To workaround the issue, we copy kernel RSE to user RSE before the
+ * task is stopped, so user RSE has updated data.  we then copy user RSE to
+ * kernel after the task is resummed from traced stop and kernel will use the
+ * newer RSE to return to user. TIF_RESTORE_RSE is the flag to indicate we need
+ * synchronize user RSE to kernel.
  */
-static struct task_struct *
-find_thread_for_addr (struct task_struct *child, unsigned long addr)
+void ia64_ptrace_stop(void)
 {
-	struct task_struct *p;
-	struct mm_struct *mm;
-	struct list_head *this, *next;
-	int mm_users;
+	if (test_and_set_tsk_thread_flag(current, TIF_RESTORE_RSE))
+		return;
+	set_notify_resume(current);
+	unw_init_running(do_sync_rbs, ia64_sync_user_rbs);
+}
 
-	if (!(mm = get_task_mm(child)))
-		return child;
+/*
+ * This is called to read back the register backing store.
+ */
+void ia64_sync_krbs(void)
+{
+	clear_tsk_thread_flag(current, TIF_RESTORE_RSE);
 
-	/* -1 because of our get_task_mm(): */
-	mm_users = atomic_read(&mm->mm_users) - 1;
-	if (mm_users <= 1)
-		goto out;		/* not multi-threaded */
+	unw_init_running(do_sync_rbs, ia64_sync_kernel_rbs);
+}
+
+/*
+ * After PTRACE_ATTACH, a thread's register backing store area in user
+ * space is assumed to contain correct data whenever the thread is
+ * stopped.  arch_ptrace_stop takes care of this on tracing stops.
+ * But if the child was already stopped for job control when we attach
+ * to it, then it might not ever get into ptrace_stop by the time we
+ * want to examine the user memory containing the RBS.
+ */
+void
+ptrace_attach_sync_user_rbs (struct task_struct *child)
+{
+	int stopped = 0;
+	struct unw_frame_info info;
 
 	/*
-	 * Traverse the current process' children list.  Every task that
-	 * one attaches to becomes a child.  And it is only attached children
-	 * of the debugger that are of interest (ptrace_check_attach checks
-	 * for this).
+	 * If the child is in TASK_STOPPED, we need to change that to
+	 * TASK_TRACED momentarily while we operate on it.  This ensures
+	 * that the child won't be woken up and return to user mode while
+	 * we are doing the sync.  (It can only be woken up for SIGKILL.)
 	 */
- 	list_for_each_safe(this, next, &current->children) {
-		p = list_entry(this, struct task_struct, sibling);
-		if (p->mm != mm)
-			continue;
-		if (thread_matches(p, addr)) {
-			child = p;
-			goto out;
+
+	read_lock(&tasklist_lock);
+	if (child->sighand) {
+		spin_lock_irq(&child->sighand->siglock);
+		if (child->state == TASK_STOPPED &&
+		    !test_and_set_tsk_thread_flag(child, TIF_RESTORE_RSE)) {
+			set_notify_resume(child);
+
+			child->state = TASK_TRACED;
+			stopped = 1;
 		}
+		spin_unlock_irq(&child->sighand->siglock);
 	}
+	read_unlock(&tasklist_lock);
 
-  out:
-	mmput(mm);
-	return child;
+	if (!stopped)
+		return;
+
+	unw_init_from_blocked_task(&info, child);
+	do_sync_rbs(&info, ia64_sync_user_rbs);
+
+	/*
+	 * Now move the child back into TASK_STOPPED if it should be in a
+	 * job control stop, so that SIGCONT can be used to wake it up.
+	 */
+	read_lock(&tasklist_lock);
+	if (child->sighand) {
+		spin_lock_irq(&child->sighand->siglock);
+		if (child->state == TASK_TRACED &&
+		    (child->signal->flags & SIGNAL_STOP_STOPPED)) {
+			child->state = TASK_STOPPED;
+		}
+		spin_unlock_irq(&child->sighand->siglock);
+	}
+	read_unlock(&tasklist_lock);
 }
 
 /*
@@ -627,7 +678,7 @@ find_thread_for_addr (struct task_struct *child, unsigned long addr)
 inline void
 ia64_flush_fph (struct task_struct *task)
 {
-	struct ia64_psr *psr = ia64_psr(ia64_task_regs(task));
+	struct ia64_psr *psr = ia64_psr(task_pt_regs(task));
 
 	/*
 	 * Prevent migrating this task while
@@ -653,7 +704,7 @@ ia64_flush_fph (struct task_struct *task)
 void
 ia64_sync_fph (struct task_struct *task)
 {
-	struct ia64_psr *psr = ia64_psr(ia64_task_regs(task));
+	struct ia64_psr *psr = ia64_psr(task_pt_regs(task));
 
 	ia64_flush_fph(task);
 	if (!(task->thread.flags & IA64_THREAD_FPH_VALID)) {
@@ -664,25 +715,6 @@ ia64_sync_fph (struct task_struct *task)
 	psr->dfh = 1;
 }
 
-static int
-access_fr (struct unw_frame_info *info, int regnum, int hi,
-	   unsigned long *data, int write_access)
-{
-	struct ia64_fpreg fpval;
-	int ret;
-
-	ret = unw_get_fr(info, regnum, &fpval);
-	if (ret < 0)
-		return ret;
-
-	if (write_access) {
-		fpval.u.bits[hi] = *data;
-		ret = unw_set_fr(info, regnum, fpval);
-	} else
-		*data = fpval.u.bits[hi];
-	return ret;
-}
-
 /*
  * Change the machine-state of CHILD such that it will return via the normal
  * kernel exit-path, rather than the syscall-exit path.
@@ -704,14 +736,14 @@ convert_to_non_syscall (struct task_struct *child, struct pt_regs  *pt,
 		if ((long)((unsigned long)child + IA64_STK_OFFSET - sp)
 		    < IA64_PT_REGS_SIZE) {
 			dprintk("ptrace.%s: ran off the top of the kernel "
-				"stack\n", __FUNCTION__);
+				"stack\n", __func__);
 			return;
 		}
 		if (unw_get_pr (&prev_info, &pr) < 0) {
 			unw_get_rp(&prev_info, &ip);
 			dprintk("ptrace.%s: failed to read "
 				"predicate register (ip=0x%lx)\n",
-				__FUNCTION__, ip);
+				__func__, ip);
 			return;
 		}
 		if (unw_is_intr_frame(&info)
@@ -784,326 +816,7 @@ access_nat_bits (struct task_struct *child, struct pt_regs *pt,
 
 static int
 access_uarea (struct task_struct *child, unsigned long addr,
-	      unsigned long *data, int write_access)
-{
-	unsigned long *ptr, regnum, urbs_end, rnat_addr, cfm;
-	struct switch_stack *sw;
-	struct pt_regs *pt;
-#	define pt_reg_addr(pt, reg)	((void *)			    \
-					 ((unsigned long) (pt)		    \
-					  + offsetof(struct pt_regs, reg)))
-
-
-	pt = ia64_task_regs(child);
-	sw = (struct switch_stack *) (child->thread.ksp + 16);
-
-	if ((addr & 0x7) != 0) {
-		dprintk("ptrace: unaligned register address 0x%lx\n", addr);
-		return -1;
-	}
-
-	if (addr < PT_F127 + 16) {
-		/* accessing fph */
-		if (write_access)
-			ia64_sync_fph(child);
-		else
-			ia64_flush_fph(child);
-		ptr = (unsigned long *)
-			((unsigned long) &child->thread.fph + addr);
-	} else if ((addr >= PT_F10) && (addr < PT_F11 + 16)) {
-		/* scratch registers untouched by kernel (saved in pt_regs) */
-		ptr = pt_reg_addr(pt, f10) + (addr - PT_F10);
-	} else if (addr >= PT_F12 && addr < PT_F15 + 16) {
-		/*
-		 * Scratch registers untouched by kernel (saved in
-		 * switch_stack).
-		 */
-		ptr = (unsigned long *) ((long) sw
-					 + (addr - PT_NAT_BITS - 32));
-	} else if (addr < PT_AR_LC + 8) {
-		/* preserved state: */
-		struct unw_frame_info info;
-		char nat = 0;
-		int ret;
-
-		unw_init_from_blocked_task(&info, child);
-		if (unw_unwind_to_user(&info) < 0)
-			return -1;
-
-		switch (addr) {
-		      case PT_NAT_BITS:
-			return access_nat_bits(child, pt, &info,
-					       data, write_access);
-
-		      case PT_R4: case PT_R5: case PT_R6: case PT_R7:
-			if (write_access) {
-				/* read NaT bit first: */
-				unsigned long dummy;
-
-				ret = unw_get_gr(&info, (addr - PT_R4)/8 + 4,
-						 &dummy, &nat);
-				if (ret < 0)
-					return ret;
-			}
-			return unw_access_gr(&info, (addr - PT_R4)/8 + 4, data,
-					     &nat, write_access);
-
-		      case PT_B1: case PT_B2: case PT_B3:
-		      case PT_B4: case PT_B5:
-			return unw_access_br(&info, (addr - PT_B1)/8 + 1, data,
-					     write_access);
-
-		      case PT_AR_EC:
-			return unw_access_ar(&info, UNW_AR_EC, data,
-					     write_access);
-
-		      case PT_AR_LC:
-			return unw_access_ar(&info, UNW_AR_LC, data,
-					     write_access);
-
-		      default:
-			if (addr >= PT_F2 && addr < PT_F5 + 16)
-				return access_fr(&info, (addr - PT_F2)/16 + 2,
-						 (addr & 8) != 0, data,
-						 write_access);
-			else if (addr >= PT_F16 && addr < PT_F31 + 16)
-				return access_fr(&info,
-						 (addr - PT_F16)/16 + 16,
-						 (addr & 8) != 0,
-						 data, write_access);
-			else {
-				dprintk("ptrace: rejecting access to register "
-					"address 0x%lx\n", addr);
-				return -1;
-			}
-		}
-	} else if (addr < PT_F9+16) {
-		/* scratch state */
-		switch (addr) {
-		      case PT_AR_BSP:
-			/*
-			 * By convention, we use PT_AR_BSP to refer to
-			 * the end of the user-level backing store.
-			 * Use ia64_rse_skip_regs(PT_AR_BSP, -CFM.sof)
-			 * to get the real value of ar.bsp at the time
-			 * the kernel was entered.
-			 *
-			 * Furthermore, when changing the contents of
-			 * PT_AR_BSP (or PT_CFM) we MUST copy any
-			 * users-level stacked registers that are
-			 * stored on the kernel stack back to
-			 * user-space because otherwise, we might end
-			 * up clobbering kernel stacked registers.
-			 * Also, if this happens while the task is
-			 * blocked in a system call, which convert the
-			 * state such that the non-system-call exit
-			 * path is used.  This ensures that the proper
-			 * state will be picked up when resuming
-			 * execution.  However, it *also* means that
-			 * once we write PT_AR_BSP/PT_CFM, it won't be
-			 * possible to modify the syscall arguments of
-			 * the pending system call any longer.  This
-			 * shouldn't be an issue because modifying
-			 * PT_AR_BSP/PT_CFM generally implies that
-			 * we're either abandoning the pending system
-			 * call or that we defer it's re-execution
-			 * (e.g., due to GDB doing an inferior
-			 * function call).
-			 */
-			urbs_end = ia64_get_user_rbs_end(child, pt, &cfm);
-			if (write_access) {
-				if (*data != urbs_end) {
-					if (ia64_sync_user_rbs(child, sw,
-							       pt->ar_bspstore,
-							       urbs_end) < 0)
-						return -1;
-					if (in_syscall(pt))
-						convert_to_non_syscall(child,
-								       pt,
-								       cfm);
-					/*
-					 * Simulate user-level write
-					 * of ar.bsp:
-					 */
-					pt->loadrs = 0;
-					pt->ar_bspstore = *data;
-				}
-			} else
-				*data = urbs_end;
-			return 0;
-
-		      case PT_CFM:
-			urbs_end = ia64_get_user_rbs_end(child, pt, &cfm);
-			if (write_access) {
-				if (((cfm ^ *data) & PFM_MASK) != 0) {
-					if (ia64_sync_user_rbs(child, sw,
-							       pt->ar_bspstore,
-							       urbs_end) < 0)
-						return -1;
-					if (in_syscall(pt))
-						convert_to_non_syscall(child,
-								       pt,
-								       cfm);
-					pt->cr_ifs = ((pt->cr_ifs & ~PFM_MASK)
-						      | (*data & PFM_MASK));
-				}
-			} else
-				*data = cfm;
-			return 0;
-
-		      case PT_CR_IPSR:
-			if (write_access)
-				pt->cr_ipsr = ((*data & IPSR_MASK)
-					       | (pt->cr_ipsr & ~IPSR_MASK));
-			else
-				*data = (pt->cr_ipsr & IPSR_MASK);
-			return 0;
-
-		      case PT_AR_RSC:
-			if (write_access)
-				pt->ar_rsc = *data | (3 << 2); /* force PL3 */
-			else
-				*data = pt->ar_rsc;
-			return 0;
-
-		      case PT_AR_RNAT:
-			urbs_end = ia64_get_user_rbs_end(child, pt, NULL);
-			rnat_addr = (long) ia64_rse_rnat_addr((long *)
-							      urbs_end);
-			if (write_access)
-				return ia64_poke(child, sw, urbs_end,
-						 rnat_addr, *data);
-			else
-				return ia64_peek(child, sw, urbs_end,
-						 rnat_addr, data);
-
-		      case PT_R1:
-			ptr = pt_reg_addr(pt, r1);
-			break;
-		      case PT_R2:  case PT_R3:
-			ptr = pt_reg_addr(pt, r2) + (addr - PT_R2);
-			break;
-		      case PT_R8:  case PT_R9:  case PT_R10: case PT_R11:
-			ptr = pt_reg_addr(pt, r8) + (addr - PT_R8);
-			break;
-		      case PT_R12: case PT_R13:
-			ptr = pt_reg_addr(pt, r12) + (addr - PT_R12);
-			break;
-		      case PT_R14:
-			ptr = pt_reg_addr(pt, r14);
-			break;
-		      case PT_R15:
-			ptr = pt_reg_addr(pt, r15);
-			break;
-		      case PT_R16: case PT_R17: case PT_R18: case PT_R19:
-		      case PT_R20: case PT_R21: case PT_R22: case PT_R23:
-		      case PT_R24: case PT_R25: case PT_R26: case PT_R27:
-		      case PT_R28: case PT_R29: case PT_R30: case PT_R31:
-			ptr = pt_reg_addr(pt, r16) + (addr - PT_R16);
-			break;
-		      case PT_B0:
-			ptr = pt_reg_addr(pt, b0);
-			break;
-		      case PT_B6:
-			ptr = pt_reg_addr(pt, b6);
-			break;
-		      case PT_B7:
-			ptr = pt_reg_addr(pt, b7);
-			break;
-		      case PT_F6:  case PT_F6+8: case PT_F7: case PT_F7+8:
-		      case PT_F8:  case PT_F8+8: case PT_F9: case PT_F9+8:
-			ptr = pt_reg_addr(pt, f6) + (addr - PT_F6);
-			break;
-		      case PT_AR_BSPSTORE:
-			ptr = pt_reg_addr(pt, ar_bspstore);
-			break;
-		      case PT_AR_UNAT:
-			ptr = pt_reg_addr(pt, ar_unat);
-			break;
-		      case PT_AR_PFS:
-			ptr = pt_reg_addr(pt, ar_pfs);
-			break;
-		      case PT_AR_CCV:
-			ptr = pt_reg_addr(pt, ar_ccv);
-			break;
-		      case PT_AR_FPSR:
-			ptr = pt_reg_addr(pt, ar_fpsr);
-			break;
-		      case PT_CR_IIP:
-			ptr = pt_reg_addr(pt, cr_iip);
-			break;
-		      case PT_PR:
-			ptr = pt_reg_addr(pt, pr);
-			break;
-			/* scratch register */
-
-		      default:
-			/* disallow accessing anything else... */
-			dprintk("ptrace: rejecting access to register "
-				"address 0x%lx\n", addr);
-			return -1;
-		}
-	} else if (addr <= PT_AR_SSD) {
-		ptr = pt_reg_addr(pt, ar_csd) + (addr - PT_AR_CSD);
-	} else {
-		/* access debug registers */
-
-		if (addr >= PT_IBR) {
-			regnum = (addr - PT_IBR) >> 3;
-			ptr = &child->thread.ibr[0];
-		} else {
-			regnum = (addr - PT_DBR) >> 3;
-			ptr = &child->thread.dbr[0];
-		}
-
-		if (regnum >= 8) {
-			dprintk("ptrace: rejecting access to register "
-				"address 0x%lx\n", addr);
-			return -1;
-		}
-#ifdef CONFIG_PERFMON
-		/*
-		 * Check if debug registers are used by perfmon. This
-		 * test must be done once we know that we can do the
-		 * operation, i.e. the arguments are all valid, but
-		 * before we start modifying the state.
-		 *
-		 * Perfmon needs to keep a count of how many processes
-		 * are trying to modify the debug registers for system
-		 * wide monitoring sessions.
-		 *
-		 * We also include read access here, because they may
-		 * cause the PMU-installed debug register state
-		 * (dbr[], ibr[]) to be reset. The two arrays are also
-		 * used by perfmon, but we do not use
-		 * IA64_THREAD_DBG_VALID. The registers are restored
-		 * by the PMU context switch code.
-		 */
-		if (pfm_use_debug_registers(child)) return -1;
-#endif
-
-		if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) {
-			child->thread.flags |= IA64_THREAD_DBG_VALID;
-			memset(child->thread.dbr, 0,
-			       sizeof(child->thread.dbr));
-			memset(child->thread.ibr, 0,
-			       sizeof(child->thread.ibr));
-		}
-
-		ptr += regnum;
-
-		if ((regnum & 1) && write_access) {
-			/* don't let the user set kernel-level breakpoints: */
-			*ptr = *data & ~(7UL << 56);
-			return 0;
-		}
-	}
-	if (write_access)
-		*ptr = *data;
-	else
-		*data = *ptr;
-	return 0;
-}
+	      unsigned long *data, int write_access);
 
 static long
 ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
@@ -1120,7 +833,7 @@ ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
 	if (!access_ok(VERIFY_WRITE, ppr, sizeof(struct pt_all_user_regs)))
 		return -EIO;
 
-	pt = ia64_task_regs(child);
+	pt = task_pt_regs(child);
 	sw = (struct switch_stack *) (child->thread.ksp + 16);
 	unw_init_from_blocked_task(&info, child);
 	if (unw_unwind_to_user(&info) < 0) {
@@ -1265,7 +978,7 @@ ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
 	if (!access_ok(VERIFY_READ, ppr, sizeof(struct pt_all_user_regs)))
 		return -EIO;
 
-	pt = ia64_task_regs(child);
+	pt = task_pt_regs(child);
 	sw = (struct switch_stack *) (child->thread.ksp + 16);
 	unw_init_from_blocked_task(&info, child);
 	if (unw_unwind_to_user(&info) < 0) {
@@ -1395,6 +1108,35 @@ ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
 	return ret;
 }
 
+void
+user_enable_single_step (struct task_struct *child)
+{
+	struct ia64_psr *child_psr = ia64_psr(task_pt_regs(child));
+
+	set_tsk_thread_flag(child, TIF_SINGLESTEP);
+	child_psr->ss = 1;
+}
+
+void
+user_enable_block_step (struct task_struct *child)
+{
+	struct ia64_psr *child_psr = ia64_psr(task_pt_regs(child));
+
+	set_tsk_thread_flag(child, TIF_SINGLESTEP);
+	child_psr->tb = 1;
+}
+
+void
+user_disable_single_step (struct task_struct *child)
+{
+	struct ia64_psr *child_psr = ia64_psr(task_pt_regs(child));
+
+	/* make sure the single step/taken-branch trap bits are not set: */
+	clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+	child_psr->ss = 0;
+	child_psr->tb = 0;
+}
+
 /*
  * Called by kernel/ptrace.c when detaching..
  *
@@ -1403,270 +1145,1050 @@ ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
 void
 ptrace_disable (struct task_struct *child)
 {
-	struct ia64_psr *child_psr = ia64_psr(ia64_task_regs(child));
+	user_disable_single_step(child);
+}
 
-	/* make sure the single step/taken-branch trap bits are not set: */
-	child_psr->ss = 0;
-	child_psr->tb = 0;
+long
+arch_ptrace (struct task_struct *child, long request,
+	     unsigned long addr, unsigned long data)
+{
+	switch (request) {
+	case PTRACE_PEEKTEXT:
+	case PTRACE_PEEKDATA:
+		/* read word at location addr */
+		if (access_process_vm(child, addr, &data, sizeof(data), 0)
+		    != sizeof(data))
+			return -EIO;
+		/* ensure return value is not mistaken for error code */
+		force_successful_syscall_return();
+		return data;
+
+	/* PTRACE_POKETEXT and PTRACE_POKEDATA is handled
+	 * by the generic ptrace_request().
+	 */
+
+	case PTRACE_PEEKUSR:
+		/* read the word at addr in the USER area */
+		if (access_uarea(child, addr, &data, 0) < 0)
+			return -EIO;
+		/* ensure return value is not mistaken for error code */
+		force_successful_syscall_return();
+		return data;
+
+	case PTRACE_POKEUSR:
+		/* write the word at addr in the USER area */
+		if (access_uarea(child, addr, &data, 1) < 0)
+			return -EIO;
+		return 0;
+
+	case PTRACE_OLD_GETSIGINFO:
+		/* for backwards-compatibility */
+		return ptrace_request(child, PTRACE_GETSIGINFO, addr, data);
+
+	case PTRACE_OLD_SETSIGINFO:
+		/* for backwards-compatibility */
+		return ptrace_request(child, PTRACE_SETSIGINFO, addr, data);
+
+	case PTRACE_GETREGS:
+		return ptrace_getregs(child,
+				      (struct pt_all_user_regs __user *) data);
+
+	case PTRACE_SETREGS:
+		return ptrace_setregs(child,
+				      (struct pt_all_user_regs __user *) data);
+
+	default:
+		return ptrace_request(child, request, addr, data);
+	}
 }
 
+
+/* "asmlinkage" so the input arguments are preserved... */
+
 asmlinkage long
-sys_ptrace (long request, pid_t pid, unsigned long addr, unsigned long data)
+syscall_trace_enter (long arg0, long arg1, long arg2, long arg3,
+		     long arg4, long arg5, long arg6, long arg7,
+		     struct pt_regs regs)
+{
+	if (test_thread_flag(TIF_SYSCALL_TRACE))
+		if (tracehook_report_syscall_entry(&regs))
+			return -ENOSYS;
+
+	/* copy user rbs to kernel rbs */
+	if (test_thread_flag(TIF_RESTORE_RSE))
+		ia64_sync_krbs();
+
+
+	audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3);
+
+	return 0;
+}
+
+/* "asmlinkage" so the input arguments are preserved... */
+
+asmlinkage void
+syscall_trace_leave (long arg0, long arg1, long arg2, long arg3,
+		     long arg4, long arg5, long arg6, long arg7,
+		     struct pt_regs regs)
+{
+	int step;
+
+	audit_syscall_exit(&regs);
+
+	step = test_thread_flag(TIF_SINGLESTEP);
+	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
+		tracehook_report_syscall_exit(&regs, step);
+
+	/* copy user rbs to kernel rbs */
+	if (test_thread_flag(TIF_RESTORE_RSE))
+		ia64_sync_krbs();
+}
+
+/* Utrace implementation starts here */
+struct regset_get {
+	void *kbuf;
+	void __user *ubuf;
+};
+
+struct regset_set {
+	const void *kbuf;
+	const void __user *ubuf;
+};
+
+struct regset_getset {
+	struct task_struct *target;
+	const struct user_regset *regset;
+	union {
+		struct regset_get get;
+		struct regset_set set;
+	} u;
+	unsigned int pos;
+	unsigned int count;
+	int ret;
+};
+
+static int
+access_elf_gpreg(struct task_struct *target, struct unw_frame_info *info,
+		unsigned long addr, unsigned long *data, int write_access)
 {
 	struct pt_regs *pt;
-	unsigned long urbs_end, peek_or_poke;
-	struct task_struct *child;
-	struct switch_stack *sw;
-	long ret;
+	unsigned long *ptr = NULL;
+	int ret;
+	char nat = 0;
 
-	lock_kernel();
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
+	pt = task_pt_regs(target);
+	switch (addr) {
+	case ELF_GR_OFFSET(1):
+		ptr = &pt->r1;
+		break;
+	case ELF_GR_OFFSET(2):
+	case ELF_GR_OFFSET(3):
+		ptr = (void *)&pt->r2 + (addr - ELF_GR_OFFSET(2));
+		break;
+	case ELF_GR_OFFSET(4) ... ELF_GR_OFFSET(7):
+		if (write_access) {
+			/* read NaT bit first: */
+			unsigned long dummy;
+
+			ret = unw_get_gr(info, addr/8, &dummy, &nat);
+			if (ret < 0)
+				return ret;
+		}
+		return unw_access_gr(info, addr/8, data, &nat, write_access);
+	case ELF_GR_OFFSET(8) ... ELF_GR_OFFSET(11):
+		ptr = (void *)&pt->r8 + addr - ELF_GR_OFFSET(8);
+		break;
+	case ELF_GR_OFFSET(12):
+	case ELF_GR_OFFSET(13):
+		ptr = (void *)&pt->r12 + addr - ELF_GR_OFFSET(12);
+		break;
+	case ELF_GR_OFFSET(14):
+		ptr = &pt->r14;
+		break;
+	case ELF_GR_OFFSET(15):
+		ptr = &pt->r15;
 	}
+	if (write_access)
+		*ptr = *data;
+	else
+		*data = *ptr;
+	return 0;
+}
 
-	peek_or_poke = (request == PTRACE_PEEKTEXT
-			|| request == PTRACE_PEEKDATA
-			|| request == PTRACE_POKETEXT
-			|| request == PTRACE_POKEDATA);
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	{
-		child = find_task_by_pid(pid);
-		if (child) {
-			if (peek_or_poke)
-				child = find_thread_for_addr(child, addr);
-			get_task_struct(child);
+static int
+access_elf_breg(struct task_struct *target, struct unw_frame_info *info,
+		unsigned long addr, unsigned long *data, int write_access)
+{
+	struct pt_regs *pt;
+	unsigned long *ptr = NULL;
+
+	pt = task_pt_regs(target);
+	switch (addr) {
+	case ELF_BR_OFFSET(0):
+		ptr = &pt->b0;
+		break;
+	case ELF_BR_OFFSET(1) ... ELF_BR_OFFSET(5):
+		return unw_access_br(info, (addr - ELF_BR_OFFSET(0))/8,
+				     data, write_access);
+	case ELF_BR_OFFSET(6):
+		ptr = &pt->b6;
+		break;
+	case ELF_BR_OFFSET(7):
+		ptr = &pt->b7;
+	}
+	if (write_access)
+		*ptr = *data;
+	else
+		*data = *ptr;
+	return 0;
+}
+
+static int
+access_elf_areg(struct task_struct *target, struct unw_frame_info *info,
+		unsigned long addr, unsigned long *data, int write_access)
+{
+	struct pt_regs *pt;
+	unsigned long cfm, urbs_end;
+	unsigned long *ptr = NULL;
+
+	pt = task_pt_regs(target);
+	if (addr >= ELF_AR_RSC_OFFSET && addr <= ELF_AR_SSD_OFFSET) {
+		switch (addr) {
+		case ELF_AR_RSC_OFFSET:
+			/* force PL3 */
+			if (write_access)
+				pt->ar_rsc = *data | (3 << 2);
+			else
+				*data = pt->ar_rsc;
+			return 0;
+		case ELF_AR_BSP_OFFSET:
+			/*
+			 * By convention, we use PT_AR_BSP to refer to
+			 * the end of the user-level backing store.
+			 * Use ia64_rse_skip_regs(PT_AR_BSP, -CFM.sof)
+			 * to get the real value of ar.bsp at the time
+			 * the kernel was entered.
+			 *
+			 * Furthermore, when changing the contents of
+			 * PT_AR_BSP (or PT_CFM) while the task is
+			 * blocked in a system call, convert the state
+			 * so that the non-system-call exit
+			 * path is used.  This ensures that the proper
+			 * state will be picked up when resuming
+			 * execution.  However, it *also* means that
+			 * once we write PT_AR_BSP/PT_CFM, it won't be
+			 * possible to modify the syscall arguments of
+			 * the pending system call any longer.  This
+			 * shouldn't be an issue because modifying
+			 * PT_AR_BSP/PT_CFM generally implies that
+			 * we're either abandoning the pending system
+			 * call or that we defer it's re-execution
+			 * (e.g., due to GDB doing an inferior
+			 * function call).
+			 */
+			urbs_end = ia64_get_user_rbs_end(target, pt, &cfm);
+			if (write_access) {
+				if (*data != urbs_end) {
+					if (in_syscall(pt))
+						convert_to_non_syscall(target,
+								       pt,
+								       cfm);
+					/*
+					 * Simulate user-level write
+					 * of ar.bsp:
+					 */
+					pt->loadrs = 0;
+					pt->ar_bspstore = *data;
+				}
+			} else
+				*data = urbs_end;
+			return 0;
+		case ELF_AR_BSPSTORE_OFFSET:
+			ptr = &pt->ar_bspstore;
+			break;
+		case ELF_AR_RNAT_OFFSET:
+			ptr = &pt->ar_rnat;
+			break;
+		case ELF_AR_CCV_OFFSET:
+			ptr = &pt->ar_ccv;
+			break;
+		case ELF_AR_UNAT_OFFSET:
+			ptr = &pt->ar_unat;
+			break;
+		case ELF_AR_FPSR_OFFSET:
+			ptr = &pt->ar_fpsr;
+			break;
+		case ELF_AR_PFS_OFFSET:
+			ptr = &pt->ar_pfs;
+			break;
+		case ELF_AR_LC_OFFSET:
+			return unw_access_ar(info, UNW_AR_LC, data,
+					     write_access);
+		case ELF_AR_EC_OFFSET:
+			return unw_access_ar(info, UNW_AR_EC, data,
+					     write_access);
+		case ELF_AR_CSD_OFFSET:
+			ptr = &pt->ar_csd;
+			break;
+		case ELF_AR_SSD_OFFSET:
+			ptr = &pt->ar_ssd;
 		}
+	} else if (addr >= ELF_CR_IIP_OFFSET && addr <= ELF_CR_IPSR_OFFSET) {
+		switch (addr) {
+		case ELF_CR_IIP_OFFSET:
+			ptr = &pt->cr_iip;
+			break;
+		case ELF_CFM_OFFSET:
+			urbs_end = ia64_get_user_rbs_end(target, pt, &cfm);
+			if (write_access) {
+				if (((cfm ^ *data) & PFM_MASK) != 0) {
+					if (in_syscall(pt))
+						convert_to_non_syscall(target,
+								       pt,
+								       cfm);
+					pt->cr_ifs = ((pt->cr_ifs & ~PFM_MASK)
+						      | (*data & PFM_MASK));
+				}
+			} else
+				*data = cfm;
+			return 0;
+		case ELF_CR_IPSR_OFFSET:
+			if (write_access) {
+				unsigned long tmp = *data;
+				/* psr.ri==3 is a reserved value: SDM 2:25 */
+				if ((tmp & IA64_PSR_RI) == IA64_PSR_RI)
+					tmp &= ~IA64_PSR_RI;
+				pt->cr_ipsr = ((tmp & IPSR_MASK)
+					       | (pt->cr_ipsr & ~IPSR_MASK));
+			} else
+				*data = (pt->cr_ipsr & IPSR_MASK);
+			return 0;
+		}
+	} else if (addr == ELF_NAT_OFFSET)
+		return access_nat_bits(target, pt, info,
+				       data, write_access);
+	else if (addr == ELF_PR_OFFSET)
+		ptr = &pt->pr;
+	else
+		return -1;
+
+	if (write_access)
+		*ptr = *data;
+	else
+		*data = *ptr;
+
+	return 0;
+}
+
+static int
+access_elf_reg(struct task_struct *target, struct unw_frame_info *info,
+		unsigned long addr, unsigned long *data, int write_access)
+{
+	if (addr >= ELF_GR_OFFSET(1) && addr <= ELF_GR_OFFSET(15))
+		return access_elf_gpreg(target, info, addr, data, write_access);
+	else if (addr >= ELF_BR_OFFSET(0) && addr <= ELF_BR_OFFSET(7))
+		return access_elf_breg(target, info, addr, data, write_access);
+	else
+		return access_elf_areg(target, info, addr, data, write_access);
+}
+
+void do_gpregs_get(struct unw_frame_info *info, void *arg)
+{
+	struct pt_regs *pt;
+	struct regset_getset *dst = arg;
+	elf_greg_t tmp[16];
+	unsigned int i, index, min_copy;
+
+	if (unw_unwind_to_user(info) < 0)
+		return;
+
+	/*
+	 * coredump format:
+	 *      r0-r31
+	 *      NaT bits (for r0-r31; bit N == 1 iff rN is a NaT)
+	 *      predicate registers (p0-p63)
+	 *      b0-b7
+	 *      ip cfm user-mask
+	 *      ar.rsc ar.bsp ar.bspstore ar.rnat
+	 *      ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec
+	 */
+
+
+	/* Skip r0 */
+	if (dst->count > 0 && dst->pos < ELF_GR_OFFSET(1)) {
+		dst->ret = user_regset_copyout_zero(&dst->pos, &dst->count,
+						      &dst->u.get.kbuf,
+						      &dst->u.get.ubuf,
+						      0, ELF_GR_OFFSET(1));
+		if (dst->ret || dst->count == 0)
+			return;
 	}
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-	ret = -EPERM;
-	if (pid == 1)		/* no messing around with init! */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
+
+	/* gr1 - gr15 */
+	if (dst->count > 0 && dst->pos < ELF_GR_OFFSET(16)) {
+		index = (dst->pos - ELF_GR_OFFSET(1)) / sizeof(elf_greg_t);
+		min_copy = ELF_GR_OFFSET(16) > (dst->pos + dst->count) ?
+			 (dst->pos + dst->count) : ELF_GR_OFFSET(16);
+		for (i = dst->pos; i < min_copy; i += sizeof(elf_greg_t),
+				index++)
+			if (access_elf_reg(dst->target, info, i,
+						&tmp[index], 0) < 0) {
+				dst->ret = -EIO;
+				return;
+			}
+		dst->ret = user_regset_copyout(&dst->pos, &dst->count,
+				&dst->u.get.kbuf, &dst->u.get.ubuf, tmp,
+				ELF_GR_OFFSET(1), ELF_GR_OFFSET(16));
+		if (dst->ret || dst->count == 0)
+			return;
 	}
 
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret < 0)
-		goto out_tsk;
+	/* r16-r31 */
+	if (dst->count > 0 && dst->pos < ELF_NAT_OFFSET) {
+		pt = task_pt_regs(dst->target);
+		dst->ret = user_regset_copyout(&dst->pos, &dst->count,
+				&dst->u.get.kbuf, &dst->u.get.ubuf, &pt->r16,
+				ELF_GR_OFFSET(16), ELF_NAT_OFFSET);
+		if (dst->ret || dst->count == 0)
+			return;
+	}
 
-	pt = ia64_task_regs(child);
-	sw = (struct switch_stack *) (child->thread.ksp + 16);
+	/* nat, pr, b0 - b7 */
+	if (dst->count > 0 && dst->pos < ELF_CR_IIP_OFFSET) {
+		index = (dst->pos - ELF_NAT_OFFSET) / sizeof(elf_greg_t);
+		min_copy = ELF_CR_IIP_OFFSET > (dst->pos + dst->count) ?
+			 (dst->pos + dst->count) : ELF_CR_IIP_OFFSET;
+		for (i = dst->pos; i < min_copy; i += sizeof(elf_greg_t),
+				index++)
+			if (access_elf_reg(dst->target, info, i,
+						&tmp[index], 0) < 0) {
+				dst->ret = -EIO;
+				return;
+			}
+		dst->ret = user_regset_copyout(&dst->pos, &dst->count,
+				&dst->u.get.kbuf, &dst->u.get.ubuf, tmp,
+				ELF_NAT_OFFSET, ELF_CR_IIP_OFFSET);
+		if (dst->ret || dst->count == 0)
+			return;
+	}
 
-	switch (request) {
-	      case PTRACE_PEEKTEXT:
-	      case PTRACE_PEEKDATA:
-		/* read word at location addr */
-		urbs_end = ia64_get_user_rbs_end(child, pt, NULL);
-		ret = ia64_peek(child, sw, urbs_end, addr, &data);
-		if (ret == 0) {
-			ret = data;
-			/* ensure "ret" is not mistaken as an error code: */
-			force_successful_syscall_return();
-		}
-		goto out_tsk;
+	/* ip cfm psr ar.rsc ar.bsp ar.bspstore ar.rnat
+	 * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec ar.csd ar.ssd
+	 */
+	if (dst->count > 0 && dst->pos < (ELF_AR_END_OFFSET)) {
+		index = (dst->pos - ELF_CR_IIP_OFFSET) / sizeof(elf_greg_t);
+		min_copy = ELF_AR_END_OFFSET > (dst->pos + dst->count) ?
+			 (dst->pos + dst->count) : ELF_AR_END_OFFSET;
+		for (i = dst->pos; i < min_copy; i += sizeof(elf_greg_t),
+				index++)
+			if (access_elf_reg(dst->target, info, i,
+						&tmp[index], 0) < 0) {
+				dst->ret = -EIO;
+				return;
+			}
+		dst->ret = user_regset_copyout(&dst->pos, &dst->count,
+				&dst->u.get.kbuf, &dst->u.get.ubuf, tmp,
+				ELF_CR_IIP_OFFSET, ELF_AR_END_OFFSET);
+	}
+}
 
-	      case PTRACE_POKETEXT:
-	      case PTRACE_POKEDATA:
-		/* write the word at location addr */
-		urbs_end = ia64_get_user_rbs_end(child, pt, NULL);
-		ret = ia64_poke(child, sw, urbs_end, addr, data);
-		goto out_tsk;
+void do_gpregs_set(struct unw_frame_info *info, void *arg)
+{
+	struct pt_regs *pt;
+	struct regset_getset *dst = arg;
+	elf_greg_t tmp[16];
+	unsigned int i, index;
 
-	      case PTRACE_PEEKUSR:
-		/* read the word at addr in the USER area */
-		if (access_uarea(child, addr, &data, 0) < 0) {
-			ret = -EIO;
-			goto out_tsk;
-		}
-		ret = data;
-		/* ensure "ret" is not mistaken as an error code */
-		force_successful_syscall_return();
-		goto out_tsk;
+	if (unw_unwind_to_user(info) < 0)
+		return;
 
-	      case PTRACE_POKEUSR:
-		/* write the word at addr in the USER area */
-		if (access_uarea(child, addr, &data, 1) < 0) {
-			ret = -EIO;
-			goto out_tsk;
-		}
-		ret = 0;
-		goto out_tsk;
+	/* Skip r0 */
+	if (dst->count > 0 && dst->pos < ELF_GR_OFFSET(1)) {
+		dst->ret = user_regset_copyin_ignore(&dst->pos, &dst->count,
+						       &dst->u.set.kbuf,
+						       &dst->u.set.ubuf,
+						       0, ELF_GR_OFFSET(1));
+		if (dst->ret || dst->count == 0)
+			return;
+	}
 
-	      case PTRACE_OLD_GETSIGINFO:
-		/* for backwards-compatibility */
-		ret = ptrace_request(child, PTRACE_GETSIGINFO, addr, data);
-		goto out_tsk;
+	/* gr1-gr15 */
+	if (dst->count > 0 && dst->pos < ELF_GR_OFFSET(16)) {
+		i = dst->pos;
+		index = (dst->pos - ELF_GR_OFFSET(1)) / sizeof(elf_greg_t);
+		dst->ret = user_regset_copyin(&dst->pos, &dst->count,
+				&dst->u.set.kbuf, &dst->u.set.ubuf, tmp,
+				ELF_GR_OFFSET(1), ELF_GR_OFFSET(16));
+		if (dst->ret)
+			return;
+		for ( ; i < dst->pos; i += sizeof(elf_greg_t), index++)
+			if (access_elf_reg(dst->target, info, i,
+						&tmp[index], 1) < 0) {
+				dst->ret = -EIO;
+				return;
+			}
+		if (dst->count == 0)
+			return;
+	}
 
-	      case PTRACE_OLD_SETSIGINFO:
-		/* for backwards-compatibility */
-		ret = ptrace_request(child, PTRACE_SETSIGINFO, addr, data);
-		goto out_tsk;
-
-	      case PTRACE_SYSCALL:
-		/* continue and stop at next (return from) syscall */
-	      case PTRACE_CONT:
-		/* restart after signal. */
-		ret = -EIO;
-		if (!valid_signal(data))
-			goto out_tsk;
-		if (request == PTRACE_SYSCALL)
-			set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+	/* gr16-gr31 */
+	if (dst->count > 0 && dst->pos < ELF_NAT_OFFSET) {
+		pt = task_pt_regs(dst->target);
+		dst->ret = user_regset_copyin(&dst->pos, &dst->count,
+				&dst->u.set.kbuf, &dst->u.set.ubuf, &pt->r16,
+				ELF_GR_OFFSET(16), ELF_NAT_OFFSET);
+		if (dst->ret || dst->count == 0)
+			return;
+	}
+
+	/* nat, pr, b0 - b7 */
+	if (dst->count > 0 && dst->pos < ELF_CR_IIP_OFFSET) {
+		i = dst->pos;
+		index = (dst->pos - ELF_NAT_OFFSET) / sizeof(elf_greg_t);
+		dst->ret = user_regset_copyin(&dst->pos, &dst->count,
+				&dst->u.set.kbuf, &dst->u.set.ubuf, tmp,
+				ELF_NAT_OFFSET, ELF_CR_IIP_OFFSET);
+		if (dst->ret)
+			return;
+		for (; i < dst->pos; i += sizeof(elf_greg_t), index++)
+			if (access_elf_reg(dst->target, info, i,
+						&tmp[index], 1) < 0) {
+				dst->ret = -EIO;
+				return;
+			}
+		if (dst->count == 0)
+			return;
+	}
+
+	/* ip cfm psr ar.rsc ar.bsp ar.bspstore ar.rnat
+	 * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec ar.csd ar.ssd
+	 */
+	if (dst->count > 0 && dst->pos < (ELF_AR_END_OFFSET)) {
+		i = dst->pos;
+		index = (dst->pos - ELF_CR_IIP_OFFSET) / sizeof(elf_greg_t);
+		dst->ret = user_regset_copyin(&dst->pos, &dst->count,
+				&dst->u.set.kbuf, &dst->u.set.ubuf, tmp,
+				ELF_CR_IIP_OFFSET, ELF_AR_END_OFFSET);
+		if (dst->ret)
+			return;
+		for ( ; i < dst->pos; i += sizeof(elf_greg_t), index++)
+			if (access_elf_reg(dst->target, info, i,
+						&tmp[index], 1) < 0) {
+				dst->ret = -EIO;
+				return;
+			}
+	}
+}
+
+#define ELF_FP_OFFSET(i)	(i * sizeof(elf_fpreg_t))
+
+void do_fpregs_get(struct unw_frame_info *info, void *arg)
+{
+	struct regset_getset *dst = arg;
+	struct task_struct *task = dst->target;
+	elf_fpreg_t tmp[30];
+	int index, min_copy, i;
+
+	if (unw_unwind_to_user(info) < 0)
+		return;
+
+	/* Skip pos 0 and 1 */
+	if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(2)) {
+		dst->ret = user_regset_copyout_zero(&dst->pos, &dst->count,
+						      &dst->u.get.kbuf,
+						      &dst->u.get.ubuf,
+						      0, ELF_FP_OFFSET(2));
+		if (dst->count == 0 || dst->ret)
+			return;
+	}
+
+	/* fr2-fr31 */
+	if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(32)) {
+		index = (dst->pos - ELF_FP_OFFSET(2)) / sizeof(elf_fpreg_t);
+
+		min_copy = min(((unsigned int)ELF_FP_OFFSET(32)),
+				dst->pos + dst->count);
+		for (i = dst->pos; i < min_copy; i += sizeof(elf_fpreg_t),
+				index++)
+			if (unw_get_fr(info, i / sizeof(elf_fpreg_t),
+					 &tmp[index])) {
+				dst->ret = -EIO;
+				return;
+			}
+		dst->ret = user_regset_copyout(&dst->pos, &dst->count,
+				&dst->u.get.kbuf, &dst->u.get.ubuf, tmp,
+				ELF_FP_OFFSET(2), ELF_FP_OFFSET(32));
+		if (dst->count == 0 || dst->ret)
+			return;
+	}
+
+	/* fph */
+	if (dst->count > 0) {
+		ia64_flush_fph(dst->target);
+		if (task->thread.flags & IA64_THREAD_FPH_VALID)
+			dst->ret = user_regset_copyout(
+				&dst->pos, &dst->count,
+				&dst->u.get.kbuf, &dst->u.get.ubuf,
+				&dst->target->thread.fph,
+				ELF_FP_OFFSET(32), -1);
 		else
-			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		child->exit_code = data;
+			/* Zero fill instead.  */
+			dst->ret = user_regset_copyout_zero(
+				&dst->pos, &dst->count,
+				&dst->u.get.kbuf, &dst->u.get.ubuf,
+				ELF_FP_OFFSET(32), -1);
+	}
+}
 
-		/*
-		 * Make sure the single step/taken-branch trap bits
-		 * are not set:
-		 */
-		ia64_psr(pt)->ss = 0;
-		ia64_psr(pt)->tb = 0;
+void do_fpregs_set(struct unw_frame_info *info, void *arg)
+{
+	struct regset_getset *dst = arg;
+	elf_fpreg_t fpreg, tmp[30];
+	int index, start, end;
 
-		wake_up_process(child);
-		ret = 0;
-		goto out_tsk;
+	if (unw_unwind_to_user(info) < 0)
+		return;
 
-	      case PTRACE_KILL:
-		/*
-		 * Make the child exit.  Best I can do is send it a
-		 * sigkill.  Perhaps it should be put in the status
-		 * that it wants to exit.
-		 */
-		if (child->exit_state == EXIT_ZOMBIE)
-			/* already dead */
-			goto out_tsk;
-		child->exit_code = SIGKILL;
-
-		ptrace_disable(child);
-		wake_up_process(child);
-		ret = 0;
-		goto out_tsk;
-
-	      case PTRACE_SINGLESTEP:
-		/* let child execute for one instruction */
-	      case PTRACE_SINGLEBLOCK:
-		ret = -EIO;
-		if (!valid_signal(data))
-			goto out_tsk;
-
-		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		if (request == PTRACE_SINGLESTEP) {
-			ia64_psr(pt)->ss = 1;
-		} else {
-			ia64_psr(pt)->tb = 1;
+	/* Skip pos 0 and 1 */
+	if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(2)) {
+		dst->ret = user_regset_copyin_ignore(&dst->pos, &dst->count,
+						       &dst->u.set.kbuf,
+						       &dst->u.set.ubuf,
+						       0, ELF_FP_OFFSET(2));
+		if (dst->count == 0 || dst->ret)
+			return;
+	}
+
+	/* fr2-fr31 */
+	if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(32)) {
+		start = dst->pos;
+		end = min(((unsigned int)ELF_FP_OFFSET(32)),
+			 dst->pos + dst->count);
+		dst->ret = user_regset_copyin(&dst->pos, &dst->count,
+				&dst->u.set.kbuf, &dst->u.set.ubuf, tmp,
+				ELF_FP_OFFSET(2), ELF_FP_OFFSET(32));
+		if (dst->ret)
+			return;
+
+		if (start & 0xF) { /* only write high part */
+			if (unw_get_fr(info, start / sizeof(elf_fpreg_t),
+					 &fpreg)) {
+				dst->ret = -EIO;
+				return;
+			}
+			tmp[start / sizeof(elf_fpreg_t) - 2].u.bits[0]
+				= fpreg.u.bits[0];
+			start &= ~0xFUL;
+		}
+		if (end & 0xF) { /* only write low part */
+			if (unw_get_fr(info, end / sizeof(elf_fpreg_t),
+					&fpreg)) {
+				dst->ret = -EIO;
+				return;
+			}
+			tmp[end / sizeof(elf_fpreg_t) - 2].u.bits[1]
+				= fpreg.u.bits[1];
+			end = (end + 0xF) & ~0xFUL;
 		}
-		child->exit_code = data;
-
-		/* give it a chance to run. */
-		wake_up_process(child);
-		ret = 0;
-		goto out_tsk;
-
-	      case PTRACE_DETACH:
-		/* detach a process that was attached. */
-		ret = ptrace_detach(child, data);
-		goto out_tsk;
-
-	      case PTRACE_GETREGS:
-		ret = ptrace_getregs(child,
-				     (struct pt_all_user_regs __user *) data);
-		goto out_tsk;
-
-	      case PTRACE_SETREGS:
-		ret = ptrace_setregs(child,
-				     (struct pt_all_user_regs __user *) data);
-		goto out_tsk;
-
-	      default:
-		ret = ptrace_request(child, request, addr, data);
-		goto out_tsk;
+
+		for ( ;	start < end ; start += sizeof(elf_fpreg_t)) {
+			index = start / sizeof(elf_fpreg_t);
+			if (unw_set_fr(info, index, tmp[index - 2])) {
+				dst->ret = -EIO;
+				return;
+			}
+		}
+		if (dst->ret || dst->count == 0)
+			return;
+	}
+
+	/* fph */
+	if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(128)) {
+		ia64_sync_fph(dst->target);
+		dst->ret = user_regset_copyin(&dst->pos, &dst->count,
+						&dst->u.set.kbuf,
+						&dst->u.set.ubuf,
+						&dst->target->thread.fph,
+						ELF_FP_OFFSET(32), -1);
 	}
-  out_tsk:
-	put_task_struct(child);
-  out:
-	unlock_kernel();
-	return ret;
 }
 
+static int
+do_regset_call(void (*call)(struct unw_frame_info *, void *),
+	       struct task_struct *target,
+	       const struct user_regset *regset,
+	       unsigned int pos, unsigned int count,
+	       const void *kbuf, const void __user *ubuf)
+{
+	struct regset_getset info = { .target = target, .regset = regset,
+				 .pos = pos, .count = count,
+				 .u.set = { .kbuf = kbuf, .ubuf = ubuf },
+				 .ret = 0 };
 
-void
-syscall_trace (void)
+	if (target == current)
+		unw_init_running(call, &info);
+	else {
+		struct unw_frame_info ufi;
+		memset(&ufi, 0, sizeof(ufi));
+		unw_init_from_blocked_task(&ufi, target);
+		(*call)(&ufi, &info);
+	}
+
+	return info.ret;
+}
+
+static int
+gpregs_get(struct task_struct *target,
+	   const struct user_regset *regset,
+	   unsigned int pos, unsigned int count,
+	   void *kbuf, void __user *ubuf)
 {
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return;
-	if (!(current->ptrace & PT_PTRACED))
-		return;
-	/*
-	 * The 0x80 provides a way for the tracing parent to
-	 * distinguish between a syscall stop and SIGTRAP delivery.
-	 */
-	ptrace_notify(SIGTRAP
-		      | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
+	return do_regset_call(do_gpregs_get, target, regset, pos, count,
+		kbuf, ubuf);
+}
 
+static int gpregs_set(struct task_struct *target,
+		const struct user_regset *regset,
+		unsigned int pos, unsigned int count,
+		const void *kbuf, const void __user *ubuf)
+{
+	return do_regset_call(do_gpregs_set, target, regset, pos, count,
+		kbuf, ubuf);
+}
+
+static void do_gpregs_writeback(struct unw_frame_info *info, void *arg)
+{
+	do_sync_rbs(info, ia64_sync_user_rbs);
+}
+
+/*
+ * This is called to write back the register backing store.
+ * ptrace does this before it stops, so that a tracer reading the user
+ * memory after the thread stops will get the current register data.
+ */
+static int
+gpregs_writeback(struct task_struct *target,
+		 const struct user_regset *regset,
+		 int now)
+{
+	if (test_and_set_tsk_thread_flag(target, TIF_RESTORE_RSE))
+		return 0;
+	set_notify_resume(target);
+	return do_regset_call(do_gpregs_writeback, target, regset, 0, 0,
+		NULL, NULL);
+}
+
+static int
+fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+	return (target->thread.flags & IA64_THREAD_FPH_VALID) ? 128 : 32;
+}
+
+static int fpregs_get(struct task_struct *target,
+		const struct user_regset *regset,
+		unsigned int pos, unsigned int count,
+		void *kbuf, void __user *ubuf)
+{
+	return do_regset_call(do_fpregs_get, target, regset, pos, count,
+		kbuf, ubuf);
+}
+
+static int fpregs_set(struct task_struct *target,
+		const struct user_regset *regset,
+		unsigned int pos, unsigned int count,
+		const void *kbuf, const void __user *ubuf)
+{
+	return do_regset_call(do_fpregs_set, target, regset, pos, count,
+		kbuf, ubuf);
+}
+
+static int
+access_uarea(struct task_struct *child, unsigned long addr,
+	      unsigned long *data, int write_access)
+{
+	unsigned int pos = -1; /* an invalid value */
+	int ret;
+	unsigned long *ptr, regnum;
+
+	if ((addr & 0x7) != 0) {
+		dprintk("ptrace: unaligned register address 0x%lx\n", addr);
+		return -1;
+	}
+	if ((addr >= PT_NAT_BITS + 8 && addr < PT_F2) ||
+		(addr >= PT_R7 + 8 && addr < PT_B1) ||
+		(addr >= PT_AR_LC + 8 && addr < PT_CR_IPSR) ||
+		(addr >= PT_AR_SSD + 8 && addr < PT_DBR)) {
+		dprintk("ptrace: rejecting access to register "
+					"address 0x%lx\n", addr);
+		return -1;
+	}
+
+	switch (addr) {
+	case PT_F32 ... (PT_F127 + 15):
+		pos = addr - PT_F32 + ELF_FP_OFFSET(32);
+		break;
+	case PT_F2 ... (PT_F5 + 15):
+		pos = addr - PT_F2 + ELF_FP_OFFSET(2);
+		break;
+	case PT_F10 ... (PT_F31 + 15):
+		pos = addr - PT_F10 + ELF_FP_OFFSET(10);
+		break;
+	case PT_F6 ... (PT_F9 + 15):
+		pos = addr - PT_F6 + ELF_FP_OFFSET(6);
+		break;
+	}
+
+	if (pos != -1) {
+		if (write_access)
+			ret = fpregs_set(child, NULL, pos,
+				sizeof(unsigned long), data, NULL);
+		else
+			ret = fpregs_get(child, NULL, pos,
+				sizeof(unsigned long), data, NULL);
+		if (ret != 0)
+			return -1;
+		return 0;
+	}
+
+	switch (addr) {
+	case PT_NAT_BITS:
+		pos = ELF_NAT_OFFSET;
+		break;
+	case PT_R4 ... PT_R7:
+		pos = addr - PT_R4 + ELF_GR_OFFSET(4);
+		break;
+	case PT_B1 ... PT_B5:
+		pos = addr - PT_B1 + ELF_BR_OFFSET(1);
+		break;
+	case PT_AR_EC:
+		pos = ELF_AR_EC_OFFSET;
+		break;
+	case PT_AR_LC:
+		pos = ELF_AR_LC_OFFSET;
+		break;
+	case PT_CR_IPSR:
+		pos = ELF_CR_IPSR_OFFSET;
+		break;
+	case PT_CR_IIP:
+		pos = ELF_CR_IIP_OFFSET;
+		break;
+	case PT_CFM:
+		pos = ELF_CFM_OFFSET;
+		break;
+	case PT_AR_UNAT:
+		pos = ELF_AR_UNAT_OFFSET;
+		break;
+	case PT_AR_PFS:
+		pos = ELF_AR_PFS_OFFSET;
+		break;
+	case PT_AR_RSC:
+		pos = ELF_AR_RSC_OFFSET;
+		break;
+	case PT_AR_RNAT:
+		pos = ELF_AR_RNAT_OFFSET;
+		break;
+	case PT_AR_BSPSTORE:
+		pos = ELF_AR_BSPSTORE_OFFSET;
+		break;
+	case PT_PR:
+		pos = ELF_PR_OFFSET;
+		break;
+	case PT_B6:
+		pos = ELF_BR_OFFSET(6);
+		break;
+	case PT_AR_BSP:
+		pos = ELF_AR_BSP_OFFSET;
+		break;
+	case PT_R1 ... PT_R3:
+		pos = addr - PT_R1 + ELF_GR_OFFSET(1);
+		break;
+	case PT_R12 ... PT_R15:
+		pos = addr - PT_R12 + ELF_GR_OFFSET(12);
+		break;
+	case PT_R8 ... PT_R11:
+		pos = addr - PT_R8 + ELF_GR_OFFSET(8);
+		break;
+	case PT_R16 ... PT_R31:
+		pos = addr - PT_R16 + ELF_GR_OFFSET(16);
+		break;
+	case PT_AR_CCV:
+		pos = ELF_AR_CCV_OFFSET;
+		break;
+	case PT_AR_FPSR:
+		pos = ELF_AR_FPSR_OFFSET;
+		break;
+	case PT_B0:
+		pos = ELF_BR_OFFSET(0);
+		break;
+	case PT_B7:
+		pos = ELF_BR_OFFSET(7);
+		break;
+	case PT_AR_CSD:
+		pos = ELF_AR_CSD_OFFSET;
+		break;
+	case PT_AR_SSD:
+		pos = ELF_AR_SSD_OFFSET;
+		break;
+	}
+
+	if (pos != -1) {
+		if (write_access)
+			ret = gpregs_set(child, NULL, pos,
+				sizeof(unsigned long), data, NULL);
+		else
+			ret = gpregs_get(child, NULL, pos,
+				sizeof(unsigned long), data, NULL);
+		if (ret != 0)
+			return -1;
+		return 0;
+	}
+
+	/* access debug registers */
+	if (addr >= PT_IBR) {
+		regnum = (addr - PT_IBR) >> 3;
+		ptr = &child->thread.ibr[0];
+	} else {
+		regnum = (addr - PT_DBR) >> 3;
+		ptr = &child->thread.dbr[0];
+	}
+
+	if (regnum >= 8) {
+		dprintk("ptrace: rejecting access to register "
+				"address 0x%lx\n", addr);
+		return -1;
+	}
+#ifdef CONFIG_PERFMON
 	/*
-	 * This isn't the same as continuing with a signal, but it
-	 * will do for normal use.  strace only continues with a
-	 * signal if the stopping signal is not SIGTRAP.  -brl
+	 * Check if debug registers are used by perfmon. This
+	 * test must be done once we know that we can do the
+	 * operation, i.e. the arguments are all valid, but
+	 * before we start modifying the state.
+	 *
+	 * Perfmon needs to keep a count of how many processes
+	 * are trying to modify the debug registers for system
+	 * wide monitoring sessions.
+	 *
+	 * We also include read access here, because they may
+	 * cause the PMU-installed debug register state
+	 * (dbr[], ibr[]) to be reset. The two arrays are also
+	 * used by perfmon, but we do not use
+	 * IA64_THREAD_DBG_VALID. The registers are restored
+	 * by the PMU context switch code.
 	 */
-	if (current->exit_code) {
-		send_sig(current->exit_code, current, 1);
-		current->exit_code = 0;
+	if (pfm_use_debug_registers(child))
+		return -1;
+#endif
+
+	if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) {
+		child->thread.flags |= IA64_THREAD_DBG_VALID;
+		memset(child->thread.dbr, 0,
+				sizeof(child->thread.dbr));
+		memset(child->thread.ibr, 0,
+				sizeof(child->thread.ibr));
+	}
+
+	ptr += regnum;
+
+	if ((regnum & 1) && write_access) {
+		/* don't let the user set kernel-level breakpoints: */
+		*ptr = *data & ~(7UL << 56);
+		return 0;
 	}
+	if (write_access)
+		*ptr = *data;
+	else
+		*data = *ptr;
+	return 0;
 }
 
-/* "asmlinkage" so the input arguments are preserved... */
+static const struct user_regset native_regsets[] = {
+	{
+		.core_note_type = NT_PRSTATUS,
+		.n = ELF_NGREG,
+		.size = sizeof(elf_greg_t), .align = sizeof(elf_greg_t),
+		.get = gpregs_get, .set = gpregs_set,
+		.writeback = gpregs_writeback
+	},
+	{
+		.core_note_type = NT_PRFPREG,
+		.n = ELF_NFPREG,
+		.size = sizeof(elf_fpreg_t), .align = sizeof(elf_fpreg_t),
+		.get = fpregs_get, .set = fpregs_set, .active = fpregs_active
+	},
+};
+
+static const struct user_regset_view user_ia64_view = {
+	.name = "ia64",
+	.e_machine = EM_IA_64,
+	.regsets = native_regsets, .n = ARRAY_SIZE(native_regsets)
+};
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *tsk)
+{
+	return &user_ia64_view;
+}
 
-asmlinkage void
-syscall_trace_enter (long arg0, long arg1, long arg2, long arg3,
-		     long arg4, long arg5, long arg6, long arg7,
-		     struct pt_regs regs)
+struct syscall_get_set_args {
+	unsigned int i;
+	unsigned int n;
+	unsigned long *args;
+	struct pt_regs *regs;
+	int rw;
+};
+
+static void syscall_get_set_args_cb(struct unw_frame_info *info, void *data)
 {
-	if (test_thread_flag(TIF_SYSCALL_TRACE) 
-	    && (current->ptrace & PT_PTRACED))
-		syscall_trace();
-
-	if (unlikely(current->audit_context)) {
-		long syscall;
-		int arch;
-
-		if (IS_IA32_PROCESS(&regs)) {
-			syscall = regs.r1;
-			arch = AUDIT_ARCH_I386;
-		} else {
-			syscall = regs.r15;
-			arch = AUDIT_ARCH_IA64;
-		}
+	struct syscall_get_set_args *args = data;
+	struct pt_regs *pt = args->regs;
+	unsigned long *krbs, cfm, ndirty;
+	int i, count;
 
-		audit_syscall_entry(current, arch, syscall, arg0, arg1, arg2, arg3);
+	if (unw_unwind_to_user(info) < 0)
+		return;
+
+	cfm = pt->cr_ifs;
+	krbs = (unsigned long *)info->task + IA64_RBS_OFFSET/8;
+	ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19));
+
+	count = 0;
+	if (in_syscall(pt))
+		count = min_t(int, args->n, cfm & 0x7f);
+
+	for (i = 0; i < count; i++) {
+		if (args->rw)
+			*ia64_rse_skip_regs(krbs, ndirty + i + args->i) =
+				args->args[i];
+		else
+			args->args[i] = *ia64_rse_skip_regs(krbs,
+				ndirty + i + args->i);
 	}
 
+	if (!args->rw) {
+		while (i < args->n) {
+			args->args[i] = 0;
+			i++;
+		}
+	}
 }
 
-/* "asmlinkage" so the input arguments are preserved... */
-
-asmlinkage void
-syscall_trace_leave (long arg0, long arg1, long arg2, long arg3,
-		     long arg4, long arg5, long arg6, long arg7,
-		     struct pt_regs regs)
+void ia64_syscall_get_set_arguments(struct task_struct *task,
+	struct pt_regs *regs, unsigned int i, unsigned int n,
+	unsigned long *args, int rw)
 {
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(current, AUDITSC_RESULT(regs.r10), regs.r8);
-
-	if (test_thread_flag(TIF_SYSCALL_TRACE)
-	    && (current->ptrace & PT_PTRACED))
-		syscall_trace();
+	struct syscall_get_set_args data = {
+		.i = i,
+		.n = n,
+		.args = args,
+		.regs = regs,
+		.rw = rw,
+	};
+
+	if (task == current)
+		unw_init_running(syscall_get_set_args_cb, &data);
+	else {
+		struct unw_frame_info ufi;
+		memset(&ufi, 0, sizeof(ufi));
+		unw_init_from_blocked_task(&ufi, task);
+		syscall_get_set_args_cb(&ufi, &data);
+	}
 }
diff --git a/arch/ia64/kernel/relocate_kernel.S b/arch/ia64/kernel/relocate_kernel.S
new file mode 100644
index 00000000000..c370e02f006
--- /dev/null
+++ b/arch/ia64/kernel/relocate_kernel.S
@@ -0,0 +1,325 @@
+/*
+ * arch/ia64/kernel/relocate_kernel.S
+ *
+ * Relocate kexec'able kernel and start it
+ *
+ * Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright (C) 2005 Khalid Aziz  <khalid.aziz@hp.com>
+ * Copyright (C) 2005 Intel Corp,  Zou Nan hai <nanhai.zou@intel.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+#include <asm/asmmacro.h>
+#include <asm/kregs.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/mca_asm.h>
+
+       /* Must be relocatable PIC code callable as a C function
+        */
+GLOBAL_ENTRY(relocate_new_kernel)
+	.prologue
+	alloc r31=ar.pfs,4,0,0,0
+        .body
+.reloc_entry:
+{
+	rsm psr.i| psr.ic
+	mov r2=ip
+}
+	;;
+{
+        flushrs                         // must be first insn in group
+        srlz.i
+}
+	;;
+	dep r2=0,r2,61,3		//to physical address
+	;;
+	//first switch to physical mode
+	add r3=1f-.reloc_entry, r2
+	movl r16 = IA64_PSR_AC|IA64_PSR_BN|IA64_PSR_IC
+	mov ar.rsc=0	          	// put RSE in enforced lazy mode
+	;;
+	add sp=(memory_stack_end - 16 - .reloc_entry),r2
+	add r8=(register_stack - .reloc_entry),r2
+	;;
+	mov r18=ar.rnat
+	mov ar.bspstore=r8
+	;;
+        mov cr.ipsr=r16
+        mov cr.iip=r3
+        mov cr.ifs=r0
+	srlz.i
+	;;
+	mov ar.rnat=r18
+	rfi				// note: this unmask MCA/INIT (psr.mc)
+	;;
+1:
+	//physical mode code begin
+	mov b6=in1
+	dep r28=0,in2,61,3	//to physical address
+
+	// purge all TC entries
+#define O(member)       IA64_CPUINFO_##member##_OFFSET
+        GET_THIS_PADDR(r2, ia64_cpu_info) // load phys addr of cpu_info into r2
+        ;;
+        addl r17=O(PTCE_STRIDE),r2
+        addl r2=O(PTCE_BASE),r2
+        ;;
+        ld8 r18=[r2],(O(PTCE_COUNT)-O(PTCE_BASE));;    	// r18=ptce_base
+        ld4 r19=[r2],4                                  // r19=ptce_count[0]
+        ld4 r21=[r17],4                                 // r21=ptce_stride[0]
+        ;;
+        ld4 r20=[r2]                                    // r20=ptce_count[1]
+        ld4 r22=[r17]                                   // r22=ptce_stride[1]
+        mov r24=r0
+        ;;
+        adds r20=-1,r20
+        ;;
+#undef O
+2:
+        cmp.ltu p6,p7=r24,r19
+(p7)    br.cond.dpnt.few 4f
+        mov ar.lc=r20
+3:
+        ptc.e r18
+        ;;
+        add r18=r22,r18
+        br.cloop.sptk.few 3b
+        ;;
+        add r18=r21,r18
+        add r24=1,r24
+        ;;
+        br.sptk.few 2b
+4:
+        srlz.i
+        ;;
+	// purge TR entry for kernel text and data
+        movl r16=KERNEL_START
+        mov r18=KERNEL_TR_PAGE_SHIFT<<2
+        ;;
+        ptr.i r16, r18
+        ptr.d r16, r18
+        ;;
+        srlz.i
+        ;;
+
+        // purge TR entry for pal code
+        mov r16=in3
+        mov r18=IA64_GRANULE_SHIFT<<2
+        ;;
+        ptr.i r16,r18
+        ;;
+        srlz.i
+	;;
+
+        // purge TR entry for stack
+        mov r16=IA64_KR(CURRENT_STACK)
+        ;;
+        shl r16=r16,IA64_GRANULE_SHIFT
+        movl r19=PAGE_OFFSET
+        ;;
+        add r16=r19,r16
+        mov r18=IA64_GRANULE_SHIFT<<2
+        ;;
+        ptr.d r16,r18
+        ;;
+        srlz.i
+	;;
+
+	//copy segments
+	movl r16=PAGE_MASK
+        mov  r30=in0                    // in0 is page_list
+        br.sptk.few .dest_page
+	;;
+.loop:
+	ld8  r30=[in0], 8;;
+.dest_page:
+	tbit.z p0, p6=r30, 0;;    	// 0x1 dest page
+(p6)	and r17=r30, r16
+(p6)	br.cond.sptk.few .loop;;
+
+	tbit.z p0, p6=r30, 1;;		// 0x2 indirect page
+(p6)	and in0=r30, r16
+(p6)	br.cond.sptk.few .loop;;
+
+	tbit.z p0, p6=r30, 2;;		// 0x4 end flag
+(p6)	br.cond.sptk.few .end_loop;;
+
+	tbit.z p6, p0=r30, 3;;		// 0x8 source page
+(p6)	br.cond.sptk.few .loop
+
+	and r18=r30, r16
+
+	// simple copy page, may optimize later
+	movl r14=PAGE_SIZE/8 - 1;;
+	mov ar.lc=r14;;
+1:
+	ld8 r14=[r18], 8;;
+	st8 [r17]=r14;;
+	fc.i r17
+	add r17=8, r17
+	br.ctop.sptk.few 1b
+	br.sptk.few .loop
+	;;
+
+.end_loop:
+	sync.i			// for fc.i
+	;;
+	srlz.i
+	;;
+	srlz.d
+	;;
+	br.call.sptk.many b0=b6;;
+
+.align  32
+memory_stack:
+	.fill           8192, 1, 0
+memory_stack_end:
+register_stack:
+	.fill           8192, 1, 0
+register_stack_end:
+relocate_new_kernel_end:
+END(relocate_new_kernel)
+
+.global relocate_new_kernel_size
+relocate_new_kernel_size:
+	data8	relocate_new_kernel_end - relocate_new_kernel
+
+GLOBAL_ENTRY(ia64_dump_cpu_regs)
+        .prologue
+        alloc loc0=ar.pfs,1,2,0,0
+        .body
+        mov     ar.rsc=0                // put RSE in enforced lazy mode
+        add     loc1=4*8, in0           // save r4 and r5 first
+        ;;
+{
+        flushrs                         // flush dirty regs to backing store
+        srlz.i
+}
+        st8 [loc1]=r4, 8
+        ;;
+        st8 [loc1]=r5, 8
+        ;;
+        add loc1=32*8, in0
+        mov r4=ar.rnat
+        ;;
+        st8 [in0]=r0, 8			// r0
+        st8 [loc1]=r4, 8		// rnat
+        mov r5=pr
+        ;;
+        st8 [in0]=r1, 8			// r1
+        st8 [loc1]=r5, 8		// pr
+        mov r4=b0
+        ;;
+        st8 [in0]=r2, 8			// r2
+        st8 [loc1]=r4, 8		// b0
+        mov r5=b1;
+        ;;
+        st8 [in0]=r3, 24		// r3
+        st8 [loc1]=r5, 8		// b1
+        mov r4=b2
+        ;;
+        st8 [in0]=r6, 8			// r6
+        st8 [loc1]=r4, 8		// b2
+	mov r5=b3
+        ;;
+        st8 [in0]=r7, 8			// r7
+        st8 [loc1]=r5, 8		// b3
+        mov r4=b4
+        ;;
+        st8 [in0]=r8, 8			// r8
+        st8 [loc1]=r4, 8		// b4
+        mov r5=b5
+        ;;
+        st8 [in0]=r9, 8			// r9
+        st8 [loc1]=r5, 8		// b5
+        mov r4=b6
+        ;;
+        st8 [in0]=r10, 8		// r10
+        st8 [loc1]=r5, 8		// b6
+        mov r5=b7
+        ;;
+        st8 [in0]=r11, 8		// r11
+        st8 [loc1]=r5, 8		// b7
+        mov r4=b0
+        ;;
+        st8 [in0]=r12, 8		// r12
+        st8 [loc1]=r4, 8		// ip
+        mov r5=loc0
+	;;
+        st8 [in0]=r13, 8		// r13
+        extr.u r5=r5, 0, 38		// ar.pfs.pfm
+	mov r4=r0			// user mask
+        ;;
+        st8 [in0]=r14, 8		// r14
+        st8 [loc1]=r5, 8		// cfm
+        ;;
+        st8 [in0]=r15, 8		// r15
+        st8 [loc1]=r4, 8        	// user mask
+	mov r5=ar.rsc
+        ;;
+        st8 [in0]=r16, 8		// r16
+        st8 [loc1]=r5, 8        	// ar.rsc
+        mov r4=ar.bsp
+        ;;
+        st8 [in0]=r17, 8		// r17
+        st8 [loc1]=r4, 8        	// ar.bsp
+        mov r5=ar.bspstore
+        ;;
+        st8 [in0]=r18, 8		// r18
+        st8 [loc1]=r5, 8        	// ar.bspstore
+        mov r4=ar.rnat
+        ;;
+        st8 [in0]=r19, 8		// r19
+        st8 [loc1]=r4, 8        	// ar.rnat
+        mov r5=ar.ccv
+        ;;
+        st8 [in0]=r20, 8		// r20
+	st8 [loc1]=r5, 8        	// ar.ccv
+        mov r4=ar.unat
+        ;;
+        st8 [in0]=r21, 8		// r21
+        st8 [loc1]=r4, 8        	// ar.unat
+        mov r5 = ar.fpsr
+        ;;
+        st8 [in0]=r22, 8		// r22
+        st8 [loc1]=r5, 8        	// ar.fpsr
+        mov r4 = ar.unat
+        ;;
+        st8 [in0]=r23, 8		// r23
+        st8 [loc1]=r4, 8        	// unat
+        mov r5 = ar.fpsr
+        ;;
+        st8 [in0]=r24, 8		// r24
+        st8 [loc1]=r5, 8        	// fpsr
+        mov r4 = ar.pfs
+        ;;
+        st8 [in0]=r25, 8		// r25
+        st8 [loc1]=r4, 8        	// ar.pfs
+        mov r5 = ar.lc
+        ;;
+        st8 [in0]=r26, 8		// r26
+        st8 [loc1]=r5, 8        	// ar.lc
+        mov r4 = ar.ec
+        ;;
+        st8 [in0]=r27, 8		// r27
+        st8 [loc1]=r4, 8        	// ar.ec
+        mov r5 = ar.csd
+        ;;
+        st8 [in0]=r28, 8		// r28
+        st8 [loc1]=r5, 8        	// ar.csd
+        mov r4 = ar.ssd
+        ;;
+        st8 [in0]=r29, 8		// r29
+        st8 [loc1]=r4, 8        	// ar.ssd
+        ;;
+        st8 [in0]=r30, 8		// r30
+        ;;
+	st8 [in0]=r31, 8		// r31
+        mov ar.pfs=loc0
+        ;;
+        br.ret.sptk.many rp
+END(ia64_dump_cpu_regs)
+
+
diff --git a/arch/ia64/kernel/sal.c b/arch/ia64/kernel/sal.c
index acc0f132f86..0464173ea56 100644
--- a/arch/ia64/kernel/sal.c
+++ b/arch/ia64/kernel/sal.c
@@ -6,7 +6,6 @@
  * Copyright (C) 1999 VA Linux Systems
  * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
  */
-#include <linux/config.h>
 
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -14,6 +13,7 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 
+#include <asm/delay.h>
 #include <asm/page.h>
 #include <asm/sal.h>
 #include <asm/pal.h>
@@ -109,6 +109,13 @@ check_versions (struct ia64_sal_systab *systab)
 		sal_revision = SAL_VERSION_CODE(2, 8);
 		sal_version = SAL_VERSION_CODE(0, 0);
 	}
+
+	if (ia64_platform_is("sn2") && (sal_revision == SAL_VERSION_CODE(2, 9)))
+		/*
+		 * SGI Altix has hard-coded version 2.9 in their prom
+		 * but they actually implement 3.2, so let's fix it here.
+		 */
+		sal_revision = SAL_VERSION_CODE(3, 2);
 }
 
 static void __init
@@ -134,7 +141,7 @@ set_smp_redirect (int flag)
 	 * interrupt redirection. The reason is this would require that
 	 * All interrupts be stopped and hard bind the irq to a cpu.
 	 * Later when the interrupt is fired we need to set the redir hint
-	 * on again in the vector. This is combersome for something that the
+	 * on again in the vector. This is cumbersome for something that the
 	 * user mode irq balancer will solve anyways.
 	 */
 	no_int_routing=1;
@@ -194,9 +201,8 @@ static void __init
 chk_nointroute_opt(void)
 {
 	char *cp;
-	extern char saved_command_line[];
 
-	for (cp = saved_command_line; *cp; ) {
+	for (cp = boot_command_line; *cp; ) {
 		if (memcmp(cp, "nointroute", 10) == 0) {
 			no_int_routing = 1;
 			printk ("no_int_routing on\n");
@@ -214,6 +220,89 @@ chk_nointroute_opt(void)
 static void __init sal_desc_ap_wakeup(void *p) { }
 #endif
 
+/*
+ * HP rx5670 firmware polls for interrupts during SAL_CACHE_FLUSH by reading
+ * cr.ivr, but it never writes cr.eoi.  This leaves any interrupt marked as
+ * "in-service" and masks other interrupts of equal or lower priority.
+ *
+ * HP internal defect reports: F1859, F2775, F3031.
+ */
+static int sal_cache_flush_drops_interrupts;
+
+static int __init
+force_pal_cache_flush(char *str)
+{
+	sal_cache_flush_drops_interrupts = 1;
+	return 0;
+}
+early_param("force_pal_cache_flush", force_pal_cache_flush);
+
+void __init
+check_sal_cache_flush (void)
+{
+	unsigned long flags;
+	int cpu;
+	u64 vector, cache_type = 3;
+	struct ia64_sal_retval isrv;
+
+	if (sal_cache_flush_drops_interrupts)
+		return;
+
+	cpu = get_cpu();
+	local_irq_save(flags);
+
+	/*
+	 * Send ourselves a timer interrupt, wait until it's reported, and see
+	 * if SAL_CACHE_FLUSH drops it.
+	 */
+	platform_send_ipi(cpu, IA64_TIMER_VECTOR, IA64_IPI_DM_INT, 0);
+
+	while (!ia64_get_irr(IA64_TIMER_VECTOR))
+		cpu_relax();
+
+	SAL_CALL(isrv, SAL_CACHE_FLUSH, cache_type, 0, 0, 0, 0, 0, 0);
+
+	if (isrv.status)
+		printk(KERN_ERR "SAL_CAL_FLUSH failed with %ld\n", isrv.status);
+
+	if (ia64_get_irr(IA64_TIMER_VECTOR)) {
+		vector = ia64_get_ivr();
+		ia64_eoi();
+		WARN_ON(vector != IA64_TIMER_VECTOR);
+	} else {
+		sal_cache_flush_drops_interrupts = 1;
+		printk(KERN_ERR "SAL: SAL_CACHE_FLUSH drops interrupts; "
+			"PAL_CACHE_FLUSH will be used instead\n");
+		ia64_eoi();
+	}
+
+	local_irq_restore(flags);
+	put_cpu();
+}
+
+s64
+ia64_sal_cache_flush (u64 cache_type)
+{
+	struct ia64_sal_retval isrv;
+
+	if (sal_cache_flush_drops_interrupts) {
+		unsigned long flags;
+		u64 progress;
+		s64 rc;
+
+		progress = 0;
+		local_irq_save(flags);
+		rc = ia64_pal_cache_flush(cache_type,
+			PAL_CACHE_FLUSH_INVALIDATE, &progress, NULL);
+		local_irq_restore(flags);
+		return rc;
+	}
+
+	SAL_CALL(isrv, SAL_CACHE_FLUSH, cache_type, 0, 0, 0, 0, 0, 0);
+	return isrv.status;
+}
+EXPORT_SYMBOL_GPL(ia64_sal_cache_flush);
+
 void __init
 ia64_sal_init (struct ia64_sal_systab *systab)
 {
@@ -262,6 +351,7 @@ ia64_sal_init (struct ia64_sal_systab *systab)
 		}
 		p += SAL_DESC_SIZE(*p);
 	}
+
 }
 
 int
@@ -300,3 +390,16 @@ ia64_sal_oemcall_reentrant(struct ia64_sal_retval *isrvp, u64 oemfunc,
 	return 0;
 }
 EXPORT_SYMBOL(ia64_sal_oemcall_reentrant);
+
+long
+ia64_sal_freq_base (unsigned long which, unsigned long *ticks_per_second,
+		    unsigned long *drift_info)
+{
+	struct ia64_sal_retval isrv;
+
+	SAL_CALL(isrv, SAL_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
+	*ticks_per_second = isrv.v0;
+	*drift_info = isrv.v1;
+	return isrv.status;
+}
+EXPORT_SYMBOL_GPL(ia64_sal_freq_base);
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index ca68e6e44a7..ee9719eebb1 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -3,7 +3,7 @@
  *
  * Creates entries in /proc/sal for various system features.
  *
- * Copyright (c) 2003 Silicon Graphics, Inc.  All rights reserved.
+ * Copyright (c) 2003, 2006 Silicon Graphics, Inc.  All rights reserved.
  * Copyright (c) 2003 Hewlett-Packard Co
  *	Bjorn Helgaas <bjorn.helgaas@hp.com>
  *
@@ -27,17 +27,26 @@
  *   mca.c may not pass a buffer, a NULL buffer just indicates that a new
  *   record is available in SAL.
  *   Replace some NR_CPUS by cpus_online, for hotplug cpu.
+ *
+ * Jan  5 2006        kaos@sgi.com
+ *   Handle hotplug cpus coming online.
+ *   Handle hotplug cpus going offline while they still have outstanding records.
+ *   Use the cpu_* macros consistently.
+ *   Replace the counting semaphore with a mutex and a test if the cpumask is non-empty.
+ *   Modify the locking to make the test for "work to do" an atomic operation.
  */
 
+#include <linux/capability.h>
+#include <linux/cpu.h>
 #include <linux/types.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/timer.h>
 #include <linux/vmalloc.h>
+#include <linux/semaphore.h>
 
-#include <asm/semaphore.h>
 #include <asm/sal.h>
 #include <asm/uaccess.h>
 
@@ -45,7 +54,7 @@ MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>");
 MODULE_DESCRIPTION("/proc interface to IA-64 SAL features");
 MODULE_LICENSE("GPL");
 
-static int salinfo_read(char *page, char **start, off_t off, int count, int *eof, void *data);
+static const struct file_operations proc_salinfo_fops;
 
 typedef struct {
 	const char		*name;		/* name of the proc entry */
@@ -57,7 +66,7 @@ typedef struct {
  * List {name,feature} pairs for every entry in /proc/sal/<feature>
  * that this module exports
  */
-static salinfo_entry_t salinfo_entries[]={
+static const salinfo_entry_t salinfo_entries[]={
 	{ "bus_lock",           IA64_SAL_PLATFORM_FEATURE_BUS_LOCK, },
 	{ "irq_redirection",	IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT, },
 	{ "ipi_redirection",	IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT, },
@@ -131,8 +140,8 @@ enum salinfo_state {
 };
 
 struct salinfo_data {
-	volatile cpumask_t	cpu_event;	/* which cpus have outstanding events */
-	struct semaphore	sem;		/* count of cpus with outstanding events (bits set in cpu_event) */
+	cpumask_t		cpu_event;	/* which cpus have outstanding events */
+	struct semaphore	mutex;
 	u8			*log_buffer;
 	u64			log_size;
 	u8			*oemdata;	/* decoded oem data */
@@ -154,7 +163,7 @@ static DEFINE_SPINLOCK(data_saved_lock);
 /** salinfo_platform_oemdata - optional callback to decode oemdata from an error
  * record.
  * @sect_header: pointer to the start of the section to decode.
- * @oemdata: returns vmalloc area containing the decded output.
+ * @oemdata: returns vmalloc area containing the decoded output.
  * @oemdata_size: returns length of decoded output (strlen).
  *
  * Description: If user space asks for oem data to be decoded by the kernel
@@ -173,6 +182,21 @@ struct salinfo_platform_oemdata_parms {
 	int ret;
 };
 
+/* Kick the mutex that tells user space that there is work to do.  Instead of
+ * trying to track the state of the mutex across multiple cpus, in user
+ * context, interrupt context, non-maskable interrupt context and hotplug cpu,
+ * it is far easier just to grab the mutex if it is free then release it.
+ *
+ * This routine must be called with data_saved_lock held, to make the down/up
+ * operation atomic.
+ */
+static void
+salinfo_work_to_do(struct salinfo_data *data)
+{
+	(void)(down_trylock(&data->mutex) ?: 0);
+	up(&data->mutex);
+}
+
 static void
 salinfo_platform_oemdata_cpu(void *context)
 {
@@ -211,9 +235,9 @@ salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe)
 
 	BUG_ON(type >= ARRAY_SIZE(salinfo_log_name));
 
+	if (irqsafe)
+		spin_lock_irqsave(&data_saved_lock, flags);
 	if (buffer) {
-		if (irqsafe)
-			spin_lock_irqsave(&data_saved_lock, flags);
 		for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) {
 			if (!data_saved->buffer)
 				break;
@@ -231,39 +255,36 @@ salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe)
 			data_saved->size = size;
 			data_saved->buffer = buffer;
 		}
-		if (irqsafe)
-			spin_unlock_irqrestore(&data_saved_lock, flags);
 	}
-
-	if (!test_and_set_bit(smp_processor_id(), &data->cpu_event)) {
-		if (irqsafe)
-			up(&data->sem);
+	cpu_set(smp_processor_id(), data->cpu_event);
+	if (irqsafe) {
+		salinfo_work_to_do(data);
+		spin_unlock_irqrestore(&data_saved_lock, flags);
 	}
 }
 
 /* Check for outstanding MCA/INIT records every minute (arbitrary) */
 #define SALINFO_TIMER_DELAY (60*HZ)
 static struct timer_list salinfo_timer;
+extern void ia64_mlogbuf_dump(void);
 
 static void
 salinfo_timeout_check(struct salinfo_data *data)
 {
-	int i;
+	unsigned long flags;
 	if (!data->open)
 		return;
-	for_each_online_cpu(i) {
-		if (test_bit(i, &data->cpu_event)) {
-			/* double up() is not a problem, user space will see no
-			 * records for the additional "events".
-			 */
-			up(&data->sem);
-		}
+	if (!cpus_empty(data->cpu_event)) {
+		spin_lock_irqsave(&data_saved_lock, flags);
+		salinfo_work_to_do(data);
+		spin_unlock_irqrestore(&data_saved_lock, flags);
 	}
 }
 
-static void 
+static void
 salinfo_timeout (unsigned long arg)
 {
+	ia64_mlogbuf_dump();
 	salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA);
 	salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_INIT);
 	salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY;
@@ -281,40 +302,41 @@ salinfo_event_open(struct inode *inode, struct file *file)
 static ssize_t
 salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
 {
-	struct inode *inode = file->f_dentry->d_inode;
-	struct proc_dir_entry *entry = PDE(inode);
-	struct salinfo_data *data = entry->data;
+	struct salinfo_data *data = PDE_DATA(file_inode(file));
 	char cmd[32];
 	size_t size;
 	int i, n, cpu = -1;
 
 retry:
-	if (down_trylock(&data->sem)) {
+	if (cpus_empty(data->cpu_event) && down_trylock(&data->mutex)) {
 		if (file->f_flags & O_NONBLOCK)
 			return -EAGAIN;
-		if (down_interruptible(&data->sem))
-			return -ERESTARTSYS;
+		if (down_interruptible(&data->mutex))
+			return -EINTR;
 	}
 
 	n = data->cpu_check;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (test_bit(n, &data->cpu_event) && cpu_online(n)) {
+	for (i = 0; i < nr_cpu_ids; i++) {
+		if (cpu_isset(n, data->cpu_event)) {
+			if (!cpu_online(n)) {
+				cpu_clear(n, data->cpu_event);
+				continue;
+			}
 			cpu = n;
 			break;
 		}
-		if (++n == NR_CPUS)
+		if (++n == nr_cpu_ids)
 			n = 0;
 	}
 
 	if (cpu == -1)
 		goto retry;
 
-	/* events are sticky until the user says "clear" */
-	up(&data->sem);
+	ia64_mlogbuf_dump();
 
 	/* for next read, start checking at next CPU */
 	data->cpu_check = cpu;
-	if (++data->cpu_check == NR_CPUS)
+	if (++data->cpu_check == nr_cpu_ids)
 		data->cpu_check = 0;
 
 	snprintf(cmd, sizeof(cmd), "read %d\n", cpu);
@@ -328,16 +350,16 @@ retry:
 	return size;
 }
 
-static struct file_operations salinfo_event_fops = {
+static const struct file_operations salinfo_event_fops = {
 	.open  = salinfo_event_open,
 	.read  = salinfo_event_read,
+	.llseek = noop_llseek,
 };
 
 static int
 salinfo_log_open(struct inode *inode, struct file *file)
 {
-	struct proc_dir_entry *entry = PDE(inode);
-	struct salinfo_data *data = entry->data;
+	struct salinfo_data *data = PDE_DATA(inode);
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -362,8 +384,7 @@ salinfo_log_open(struct inode *inode, struct file *file)
 static int
 salinfo_log_release(struct inode *inode, struct file *file)
 {
-	struct proc_dir_entry *entry = PDE(inode);
-	struct salinfo_data *data = entry->data;
+	struct salinfo_data *data = PDE_DATA(inode);
 
 	if (data->state == STATE_NO_DATA) {
 		vfree(data->log_buffer);
@@ -380,13 +401,10 @@ salinfo_log_release(struct inode *inode, struct file *file)
 static void
 call_on_cpu(int cpu, void (*fn)(void *), void *arg)
 {
-	cpumask_t save_cpus_allowed, new_cpus_allowed;
-	memcpy(&save_cpus_allowed, &current->cpus_allowed, sizeof(save_cpus_allowed));
-	memset(&new_cpus_allowed, 0, sizeof(new_cpus_allowed));
-	set_bit(cpu, &new_cpus_allowed);
-	set_cpus_allowed(current, new_cpus_allowed);
+	cpumask_t save_cpus_allowed = current->cpus_allowed;
+	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 	(*fn)(arg);
-	set_cpus_allowed(current, save_cpus_allowed);
+	set_cpus_allowed_ptr(current, &save_cpus_allowed);
 }
 
 static void
@@ -432,19 +450,17 @@ retry:
 	if (!data->saved_num)
 		call_on_cpu(cpu, salinfo_log_read_cpu, data);
 	if (!data->log_size) {
-	        data->state = STATE_NO_DATA;
-	        clear_bit(cpu, &data->cpu_event);
+		data->state = STATE_NO_DATA;
+		cpu_clear(cpu, data->cpu_event);
 	} else {
-	        data->state = STATE_LOG_RECORD;
+		data->state = STATE_LOG_RECORD;
 	}
 }
 
 static ssize_t
 salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
 {
-	struct inode *inode = file->f_dentry->d_inode;
-	struct proc_dir_entry *entry = PDE(inode);
-	struct salinfo_data *data = entry->data;
+	struct salinfo_data *data = PDE_DATA(file_inode(file));
 	u8 *buf;
 	u64 bufsize;
 
@@ -472,36 +488,38 @@ static int
 salinfo_log_clear(struct salinfo_data *data, int cpu)
 {
 	sal_log_record_header_t *rh;
+	unsigned long flags;
+	spin_lock_irqsave(&data_saved_lock, flags);
 	data->state = STATE_NO_DATA;
-	if (!test_bit(cpu, &data->cpu_event))
+	if (!cpu_isset(cpu, data->cpu_event)) {
+		spin_unlock_irqrestore(&data_saved_lock, flags);
 		return 0;
-	down(&data->sem);
-	clear_bit(cpu, &data->cpu_event);
+	}
+	cpu_clear(cpu, data->cpu_event);
 	if (data->saved_num) {
-		unsigned long flags;
-		spin_lock_irqsave(&data_saved_lock, flags);
-		shift1_data_saved(data, data->saved_num - 1 );
+		shift1_data_saved(data, data->saved_num - 1);
 		data->saved_num = 0;
-		spin_unlock_irqrestore(&data_saved_lock, flags);
 	}
+	spin_unlock_irqrestore(&data_saved_lock, flags);
 	rh = (sal_log_record_header_t *)(data->log_buffer);
 	/* Corrected errors have already been cleared from SAL */
 	if (rh->severity != sal_log_severity_corrected)
 		call_on_cpu(cpu, salinfo_log_clear_cpu, data);
 	/* clearing a record may make a new record visible */
 	salinfo_log_new_read(cpu, data);
-	if (data->state == STATE_LOG_RECORD &&
-	    !test_and_set_bit(cpu,  &data->cpu_event))
-		up(&data->sem);
+	if (data->state == STATE_LOG_RECORD) {
+		spin_lock_irqsave(&data_saved_lock, flags);
+		cpu_set(cpu, data->cpu_event);
+		salinfo_work_to_do(data);
+		spin_unlock_irqrestore(&data_saved_lock, flags);
+	}
 	return 0;
 }
 
 static ssize_t
 salinfo_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos)
 {
-	struct inode *inode = file->f_dentry->d_inode;
-	struct proc_dir_entry *entry = PDE(inode);
-	struct salinfo_data *data = entry->data;
+	struct salinfo_data *data = PDE_DATA(file_inode(file));
 	char cmd[32];
 	size_t size;
 	u32 offset;
@@ -542,11 +560,59 @@ salinfo_log_write(struct file *file, const char __user *buffer, size_t count, lo
 	return count;
 }
 
-static struct file_operations salinfo_data_fops = {
+static const struct file_operations salinfo_data_fops = {
 	.open    = salinfo_log_open,
 	.release = salinfo_log_release,
 	.read    = salinfo_log_read,
 	.write   = salinfo_log_write,
+	.llseek  = default_llseek,
+};
+
+static int
+salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
+{
+	unsigned int i, cpu = (unsigned long)hcpu;
+	unsigned long flags;
+	struct salinfo_data *data;
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		spin_lock_irqsave(&data_saved_lock, flags);
+		for (i = 0, data = salinfo_data;
+		     i < ARRAY_SIZE(salinfo_data);
+		     ++i, ++data) {
+			cpu_set(cpu, data->cpu_event);
+			salinfo_work_to_do(data);
+		}
+		spin_unlock_irqrestore(&data_saved_lock, flags);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		spin_lock_irqsave(&data_saved_lock, flags);
+		for (i = 0, data = salinfo_data;
+		     i < ARRAY_SIZE(salinfo_data);
+		     ++i, ++data) {
+			struct salinfo_data_saved *data_saved;
+			int j;
+			for (j = ARRAY_SIZE(data->data_saved) - 1, data_saved = data->data_saved + j;
+			     j >= 0;
+			     --j, --data_saved) {
+				if (data_saved->buffer && data_saved->cpu == cpu) {
+					shift1_data_saved(data, j);
+				}
+			}
+			cpu_clear(cpu, data->cpu_event);
+		}
+		spin_unlock_irqrestore(&data_saved_lock, flags);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block salinfo_cpu_notifier =
+{
+	.notifier_call = salinfo_cpu_callback,
+	.priority = 0,
 };
 
 static int __init
@@ -556,7 +622,7 @@ salinfo_init(void)
 	struct proc_dir_entry **sdir = salinfo_proc_entries; /* keeps track of every entry */
 	struct proc_dir_entry *dir, *entry;
 	struct salinfo_data *data;
-	int i, j, online;
+	int i, j;
 
 	salinfo_dir = proc_mkdir("sal", NULL);
 	if (!salinfo_dir)
@@ -564,39 +630,36 @@ salinfo_init(void)
 
 	for (i=0; i < NR_SALINFO_ENTRIES; i++) {
 		/* pass the feature bit in question as misc data */
-		*sdir++ = create_proc_read_entry (salinfo_entries[i].name, 0, salinfo_dir,
-						  salinfo_read, (void *)salinfo_entries[i].feature);
+		*sdir++ = proc_create_data(salinfo_entries[i].name, 0, salinfo_dir,
+					   &proc_salinfo_fops,
+					   (void *)salinfo_entries[i].feature);
 	}
 
+	cpu_notifier_register_begin();
+
 	for (i = 0; i < ARRAY_SIZE(salinfo_log_name); i++) {
 		data = salinfo_data + i;
 		data->type = i;
-		sema_init(&data->sem, 0);
+		sema_init(&data->mutex, 1);
 		dir = proc_mkdir(salinfo_log_name[i], salinfo_dir);
 		if (!dir)
 			continue;
 
-		entry = create_proc_entry("event", S_IRUSR, dir);
+		entry = proc_create_data("event", S_IRUSR, dir,
+					 &salinfo_event_fops, data);
 		if (!entry)
 			continue;
-		entry->data = data;
-		entry->proc_fops = &salinfo_event_fops;
 		*sdir++ = entry;
 
-		entry = create_proc_entry("data", S_IRUSR | S_IWUSR, dir);
+		entry = proc_create_data("data", S_IRUSR | S_IWUSR, dir,
+					 &salinfo_data_fops, data);
 		if (!entry)
 			continue;
-		entry->data = data;
-		entry->proc_fops = &salinfo_data_fops;
 		*sdir++ = entry;
 
 		/* we missed any events before now */
-		online = 0;
-		for_each_online_cpu(j) {
-			set_bit(j, &data->cpu_event);
-			++online;
-		}
-		sema_init(&data->sem, online);
+		for_each_online_cpu(j)
+			cpu_set(j, data->cpu_event);
 
 		*sdir++ = dir;
 	}
@@ -608,6 +671,10 @@ salinfo_init(void)
 	salinfo_timer.function = &salinfo_timeout;
 	add_timer(&salinfo_timer);
 
+	__register_hotcpu_notifier(&salinfo_cpu_notifier);
+
+	cpu_notifier_register_done();
+
 	return 0;
 }
 
@@ -615,22 +682,23 @@ salinfo_init(void)
  * 'data' contains an integer that corresponds to the feature we're
  * testing
  */
-static int
-salinfo_read(char *page, char **start, off_t off, int count, int *eof, void *data)
+static int proc_salinfo_show(struct seq_file *m, void *v)
 {
-	int len = 0;
-
-	len = sprintf(page, (sal_platform_features & (unsigned long)data) ? "1\n" : "0\n");
-
-	if (len <= off+count) *eof = 1;
-
-	*start = page + off;
-	len   -= off;
-
-	if (len>count) len = count;
-	if (len<0) len = 0;
+	unsigned long data = (unsigned long)v;
+	seq_puts(m, (sal_platform_features & data) ? "1\n" : "0\n");
+	return 0;
+}
 
-	return len;
+static int proc_salinfo_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_salinfo_show, PDE_DATA(inode));
 }
 
+static const struct file_operations proc_salinfo_fops = {
+	.open		= proc_salinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 module_init(salinfo_init);
diff --git a/arch/ia64/kernel/semaphore.c b/arch/ia64/kernel/semaphore.c
deleted file mode 100644
index 2724ef3fbae..00000000000
--- a/arch/ia64/kernel/semaphore.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * IA-64 semaphore implementation (derived from x86 version).
- *
- * Copyright (C) 1999-2000, 2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-/*
- * Semaphores are implemented using a two-way counter: The "count"
- * variable is decremented for each process that tries to acquire the
- * semaphore, while the "sleepers" variable is a count of such
- * acquires.
- *
- * Notably, the inline "up()" and "down()" functions can efficiently
- * test if they need to do any extra work (up needs to do something
- * only if count was negative before the increment operation.
- *
- * "sleeping" and the contention routine ordering is protected
- * by the spinlock in the semaphore's waitqueue head.
- *
- * Note that these functions are only called when there is contention
- * on the lock, and as such all this is the "non-critical" part of the
- * whole semaphore business. The critical part is the inline stuff in
- * <asm/semaphore.h> where we want to avoid any extra jumps and calls.
- */
-#include <linux/sched.h>
-#include <linux/init.h>
-
-#include <asm/errno.h>
-#include <asm/semaphore.h>
-
-/*
- * Logic:
- *  - Only on a boundary condition do we need to care. When we go
- *    from a negative count to a non-negative, we wake people up.
- *  - When we go from a non-negative count to a negative do we
- *    (a) synchronize with the "sleepers" count and (b) make sure
- *    that we're on the wakeup list before we synchronize so that
- *    we cannot lose wakeup events.
- */
-
-void
-__up (struct semaphore *sem)
-{
-	wake_up(&sem->wait);
-}
-
-void __sched __down (struct semaphore *sem)
-{
-	struct task_struct *tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
-	unsigned long flags;
-
-	tsk->state = TASK_UNINTERRUPTIBLE;
-	spin_lock_irqsave(&sem->wait.lock, flags);
-	add_wait_queue_exclusive_locked(&sem->wait, &wait);
-
-	sem->sleepers++;
-	for (;;) {
-		int sleepers = sem->sleepers;
-
-		/*
-		 * Add "everybody else" into it. They aren't
-		 * playing, because we own the spinlock in
-		 * the wait_queue_head.
-		 */
-		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
-			sem->sleepers = 0;
-			break;
-		}
-		sem->sleepers = 1;	/* us - see -1 above */
-		spin_unlock_irqrestore(&sem->wait.lock, flags);
-
-		schedule();
-
-		spin_lock_irqsave(&sem->wait.lock, flags);
-		tsk->state = TASK_UNINTERRUPTIBLE;
-	}
-	remove_wait_queue_locked(&sem->wait, &wait);
-	wake_up_locked(&sem->wait);
-	spin_unlock_irqrestore(&sem->wait.lock, flags);
-	tsk->state = TASK_RUNNING;
-}
-
-int __sched __down_interruptible (struct semaphore * sem)
-{
-	int retval = 0;
-	struct task_struct *tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
-	unsigned long flags;
-
-	tsk->state = TASK_INTERRUPTIBLE;
-	spin_lock_irqsave(&sem->wait.lock, flags);
-	add_wait_queue_exclusive_locked(&sem->wait, &wait);
-
-	sem->sleepers ++;
-	for (;;) {
-		int sleepers = sem->sleepers;
-
-		/*
-		 * With signals pending, this turns into
-		 * the trylock failure case - we won't be
-		 * sleeping, and we* can't get the lock as
-		 * it has contention. Just correct the count
-		 * and exit.
-		 */
-		if (signal_pending(current)) {
-			retval = -EINTR;
-			sem->sleepers = 0;
-			atomic_add(sleepers, &sem->count);
-			break;
-		}
-
-		/*
-		 * Add "everybody else" into it. They aren't
-		 * playing, because we own the spinlock in
-		 * wait_queue_head. The "-1" is because we're
-		 * still hoping to get the semaphore.
-		 */
-		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
-			sem->sleepers = 0;
-			break;
-		}
-		sem->sleepers = 1;	/* us - see -1 above */
-		spin_unlock_irqrestore(&sem->wait.lock, flags);
-
-		schedule();
-
-		spin_lock_irqsave(&sem->wait.lock, flags);
-		tsk->state = TASK_INTERRUPTIBLE;
-	}
-	remove_wait_queue_locked(&sem->wait, &wait);
-	wake_up_locked(&sem->wait);
-	spin_unlock_irqrestore(&sem->wait.lock, flags);
-
-	tsk->state = TASK_RUNNING;
-	return retval;
-}
-
-/*
- * Trylock failed - make sure we correct for having decremented the
- * count.
- */
-int
-__down_trylock (struct semaphore *sem)
-{
-	unsigned long flags;
-	int sleepers;
-
-	spin_lock_irqsave(&sem->wait.lock, flags);
-	sleepers = sem->sleepers + 1;
-	sem->sleepers = 0;
-
-	/*
-	 * Add "everybody else" and us into it. They aren't
-	 * playing, because we own the spinlock in the
-	 * wait_queue_head.
-	 */
-	if (!atomic_add_negative(sleepers, &sem->count)) {
-		wake_up_locked(&sem->wait);
-	}
-
-	spin_unlock_irqrestore(&sem->wait.lock, flags);
-	return 1;
-}
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 3af6de36a48..d86669bcdfb 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -22,7 +22,6 @@
  * 06/24/99 W.Drummond	added boot_cpu_data.
  * 05/28/05 Z. Menyhart	Dynamic stride size for "flush_icache_range()"
  */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
 
@@ -36,29 +35,33 @@
 #include <linux/seq_file.h>
 #include <linux/string.h>
 #include <linux/threads.h>
-#include <linux/tty.h>
+#include <linux/screen_info.h>
+#include <linux/dmi.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
 #include <linux/efi.h>
 #include <linux/initrd.h>
-#include <linux/platform.h>
 #include <linux/pm.h>
+#include <linux/cpufreq.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
 
-#include <asm/ia32.h>
 #include <asm/machvec.h>
 #include <asm/mca.h>
 #include <asm/meminit.h>
 #include <asm/page.h>
+#include <asm/paravirt.h>
+#include <asm/paravirt_patch.h>
 #include <asm/patch.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/sal.h>
 #include <asm/sections.h>
-#include <asm/serial.h>
 #include <asm/setup.h>
 #include <asm/smp.h>
-#include <asm/system.h>
+#include <asm/tlbflush.h>
 #include <asm/unistd.h>
+#include <asm/hpsim.h>
 
 #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
 # error "struct cpuinfo_ia64 too big!"
@@ -69,9 +72,8 @@ unsigned long __per_cpu_offset[NR_CPUS];
 EXPORT_SYMBOL(__per_cpu_offset);
 #endif
 
-DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
+DEFINE_PER_CPU(struct cpuinfo_ia64, ia64_cpu_info);
 DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
-DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8);
 unsigned long ia64_cycles_per_usec;
 struct ia64_boot_param *ia64_boot_param;
 struct screen_info screen_info;
@@ -87,17 +89,13 @@ static struct resource code_resource = {
 	.name	= "Kernel code",
 	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
 };
-extern void efi_initialize_iomem_resources(struct resource *,
-		struct resource *);
-extern char _text[], _end[], _etext[];
 
-unsigned long ia64_max_cacheline_size;
+static struct resource bss_resource = {
+	.name	= "Kernel bss",
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
 
-int dma_get_cache_alignment(void)
-{
-        return ia64_max_cacheline_size;
-}
-EXPORT_SYMBOL(dma_get_cache_alignment);
+unsigned long ia64_max_cacheline_size;
 
 unsigned long ia64_iobase;	/* virtual address for I/O accesses */
 EXPORT_SYMBOL(ia64_iobase);
@@ -111,6 +109,13 @@ unsigned int num_io_spaces;
  */
 #define	I_CACHE_STRIDE_SHIFT	5	/* Safest way to go: 32 bytes by 32 bytes */
 unsigned long ia64_i_cache_stride_shift = ~0;
+/*
+ * "clflush_cache_range()" needs to know what processor dependent stride size to
+ * use when it flushes cache lines including both d-cache and i-cache.
+ */
+/* Safest way to go: 32 bytes by 32 bytes */
+#define	CACHE_STRIDE_SHIFT	5
+unsigned long ia64_cache_stride_shift = ~0;
 
 /*
  * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1).  This
@@ -127,8 +132,8 @@ EXPORT_SYMBOL(ia64_max_iommu_merge_mask);
 /*
  * We use a special marker for the end of memory and it uses the extra (+1) slot
  */
-struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1];
-int num_rsvd_regions;
+struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1] __initdata;
+int num_rsvd_regions __initdata;
 
 
 /*
@@ -137,10 +142,10 @@ int num_rsvd_regions;
  * caller-specified function is called with the memory ranges that remain after filtering.
  * This routine does not assume the incoming segments are sorted.
  */
-int
-filter_rsvd_memory (unsigned long start, unsigned long end, void *arg)
+int __init
+filter_rsvd_memory (u64 start, u64 end, void *arg)
 {
-	unsigned long range_start, range_end, prev_start;
+	u64 range_start, range_end, prev_start;
 	void (*func)(unsigned long, unsigned long, int);
 	int i;
 
@@ -173,7 +178,30 @@ filter_rsvd_memory (unsigned long start, unsigned long end, void *arg)
 	return 0;
 }
 
-static void
+/*
+ * Similar to "filter_rsvd_memory()", but the reserved memory ranges
+ * are not filtered out.
+ */
+int __init
+filter_memory(u64 start, u64 end, void *arg)
+{
+	void (*func)(unsigned long, unsigned long, int);
+
+#if IGNORE_PFN0
+	if (start == PAGE_OFFSET) {
+		printk(KERN_WARNING "warning: skipping physical page 0\n");
+		start += PAGE_SIZE;
+		if (start >= end)
+			return 0;
+	}
+#endif
+	func = arg;
+	if (start < end)
+		call_pernode_memory(__pa(start), end - start, func);
+	return 0;
+}
+
+static void __init
 sort_regions (struct rsvd_region *rsvd_region, int max)
 {
 	int j;
@@ -191,6 +219,23 @@ sort_regions (struct rsvd_region *rsvd_region, int max)
 	}
 }
 
+/* merge overlaps */
+static int __init
+merge_regions (struct rsvd_region *rsvd_region, int max)
+{
+	int i;
+	for (i = 1; i < max; ++i) {
+		if (rsvd_region[i].start >= rsvd_region[i-1].end)
+			continue;
+		if (rsvd_region[i].end > rsvd_region[i-1].end)
+			rsvd_region[i-1].end = rsvd_region[i].end;
+		--max;
+		memmove(&rsvd_region[i], &rsvd_region[i+1],
+			(max - i) * sizeof(struct rsvd_region));
+	}
+	return max;
+}
+
 /*
  * Request address space for all standard resources
  */
@@ -199,25 +244,101 @@ static int __init register_memory(void)
 	code_resource.start = ia64_tpa(_text);
 	code_resource.end   = ia64_tpa(_etext) - 1;
 	data_resource.start = ia64_tpa(_etext);
-	data_resource.end   = ia64_tpa(_end) - 1;
-	efi_initialize_iomem_resources(&code_resource, &data_resource);
+	data_resource.end   = ia64_tpa(_edata) - 1;
+	bss_resource.start  = ia64_tpa(__bss_start);
+	bss_resource.end    = ia64_tpa(_end) - 1;
+	efi_initialize_iomem_resources(&code_resource, &data_resource,
+			&bss_resource);
 
 	return 0;
 }
 
 __initcall(register_memory);
 
+
+#ifdef CONFIG_KEXEC
+
+/*
+ * This function checks if the reserved crashkernel is allowed on the specific
+ * IA64 machine flavour. Machines without an IO TLB use swiotlb and require
+ * some memory below 4 GB (i.e. in 32 bit area), see the implementation of
+ * lib/swiotlb.c. The hpzx1 architecture has an IO TLB but cannot use that
+ * in kdump case. See the comment in sba_init() in sba_iommu.c.
+ *
+ * So, the only machvec that really supports loading the kdump kernel
+ * over 4 GB is "sn2".
+ */
+static int __init check_crashkernel_memory(unsigned long pbase, size_t size)
+{
+	if (ia64_platform_is("sn2") || ia64_platform_is("uv"))
+		return 1;
+	else
+		return pbase < (1UL << 32);
+}
+
+static void __init setup_crashkernel(unsigned long total, int *n)
+{
+	unsigned long long base = 0, size = 0;
+	int ret;
+
+	ret = parse_crashkernel(boot_command_line, total,
+			&size, &base);
+	if (ret == 0 && size > 0) {
+		if (!base) {
+			sort_regions(rsvd_region, *n);
+			*n = merge_regions(rsvd_region, *n);
+			base = kdump_find_rsvd_region(size,
+					rsvd_region, *n);
+		}
+
+		if (!check_crashkernel_memory(base, size)) {
+			pr_warning("crashkernel: There would be kdump memory "
+				"at %ld GB but this is unusable because it "
+				"must\nbe below 4 GB. Change the memory "
+				"configuration of the machine.\n",
+				(unsigned long)(base >> 30));
+			return;
+		}
+
+		if (base != ~0UL) {
+			printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+					"for crashkernel (System RAM: %ldMB)\n",
+					(unsigned long)(size >> 20),
+					(unsigned long)(base >> 20),
+					(unsigned long)(total >> 20));
+			rsvd_region[*n].start =
+				(unsigned long)__va(base);
+			rsvd_region[*n].end =
+				(unsigned long)__va(base + size);
+			(*n)++;
+			crashk_res.start = base;
+			crashk_res.end = base + size - 1;
+		}
+	}
+	efi_memmap_res.start = ia64_boot_param->efi_memmap;
+	efi_memmap_res.end = efi_memmap_res.start +
+		ia64_boot_param->efi_memmap_size;
+	boot_param_res.start = __pa(ia64_boot_param);
+	boot_param_res.end = boot_param_res.start +
+		sizeof(*ia64_boot_param);
+}
+#else
+static inline void __init setup_crashkernel(unsigned long total, int *n)
+{}
+#endif
+
 /**
  * reserve_memory - setup reserved memory areas
  *
  * Setup the reserved memory areas set aside for the boot parameters,
  * initrd, etc.  There are currently %IA64_MAX_RSVD_REGIONS defined,
- * see include/asm-ia64/meminit.h if you need to define more.
+ * see arch/ia64/include/asm/meminit.h if you need to define more.
  */
-void
+void __init
 reserve_memory (void)
 {
 	int n = 0;
+	unsigned long total_memory;
 
 	/*
 	 * none of the entries in this table overlap
@@ -239,6 +360,8 @@ reserve_memory (void)
 	rsvd_region[n].end   = (unsigned long) ia64_imva(_end);
 	n++;
 
+	n += paravirt_reserve_memory(&rsvd_region[n]);
+
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (ia64_boot_param->initrd_start) {
 		rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start);
@@ -247,26 +370,37 @@ reserve_memory (void)
 	}
 #endif
 
-	efi_memmap_init(&rsvd_region[n].start, &rsvd_region[n].end);
+#ifdef CONFIG_CRASH_DUMP
+	if (reserve_elfcorehdr(&rsvd_region[n].start,
+			       &rsvd_region[n].end) == 0)
+		n++;
+#endif
+
+	total_memory = efi_memmap_init(&rsvd_region[n].start, &rsvd_region[n].end);
 	n++;
 
+	setup_crashkernel(total_memory, &n);
+
 	/* end of memory marker */
 	rsvd_region[n].start = ~0UL;
 	rsvd_region[n].end   = ~0UL;
 	n++;
 
 	num_rsvd_regions = n;
+	BUG_ON(IA64_MAX_RSVD_REGIONS + 1 < n);
 
 	sort_regions(rsvd_region, num_rsvd_regions);
+	num_rsvd_regions = merge_regions(rsvd_region, num_rsvd_regions);
 }
 
+
 /**
  * find_initrd - get initrd parameters from the boot parameter structure
  *
  * Grab the initrd start and end from the boot parameter struct given us by
  * the boot loader.
  */
-void
+void __init
 find_initrd (void)
 {
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -274,7 +408,7 @@ find_initrd (void)
 		initrd_start = (unsigned long)__va(ia64_boot_param->initrd_start);
 		initrd_end   = initrd_start+ia64_boot_param->initrd_size;
 
-		printk(KERN_INFO "Initial ramdisk at: 0x%lx (%lu bytes)\n",
+		printk(KERN_INFO "Initial ramdisk at: 0x%lx (%llu bytes)\n",
 		       initrd_start, ia64_boot_param->initrd_size);
 	}
 #endif
@@ -340,10 +474,8 @@ early_console_setup (char *cmdline)
 	if (!efi_setup_pcdp_console(cmdline))
 		earlycons++;
 #endif
-#ifdef CONFIG_SERIAL_8250_CONSOLE
-	if (!early_serial_console_init(cmdline))
+	if (!simcons_register())
 		earlycons++;
-#endif
 
 	return (earlycons) ? 0 : -1;
 }
@@ -353,118 +485,117 @@ mark_bsp_online (void)
 {
 #ifdef CONFIG_SMP
 	/* If we register an early console, allow CPU 0 to printk */
-	cpu_set(smp_processor_id(), cpu_online_map);
+	set_cpu_online(smp_processor_id(), true);
 #endif
 }
 
-#ifdef CONFIG_SMP
-static void
-check_for_logical_procs (void)
+static __initdata int nomca;
+static __init int setup_nomca(char *s)
 {
-	pal_logical_to_physical_t info;
-	s64 status;
-
-	status = ia64_pal_logical_to_phys(0, &info);
-	if (status == -1) {
-		printk(KERN_INFO "No logical to physical processor mapping "
-		       "available\n");
-		return;
-	}
-	if (status) {
-		printk(KERN_ERR "ia64_pal_logical_to_phys failed with %ld\n",
-		       status);
-		return;
-	}
-	/*
-	 * Total number of siblings that BSP has.  Though not all of them 
-	 * may have booted successfully. The correct number of siblings 
-	 * booted is in info.overview_num_log.
+	nomca = 1;
+	return 0;
+}
+early_param("nomca", setup_nomca);
+
+#ifdef CONFIG_CRASH_DUMP
+int __init reserve_elfcorehdr(u64 *start, u64 *end)
+{
+	u64 length;
+
+	/* We get the address using the kernel command line,
+	 * but the size is extracted from the EFI tables.
+	 * Both address and size are required for reservation
+	 * to work properly.
 	 */
-	smp_num_siblings = info.overview_tpc;
-	smp_num_cpucores = info.overview_cpp;
+
+	if (!is_vmcore_usable())
+		return -EINVAL;
+
+	if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) {
+		vmcore_unusable();
+		return -EINVAL;
+	}
+
+	*start = (unsigned long)__va(elfcorehdr_addr);
+	*end = *start + length;
+	return 0;
 }
-#endif
+
+#endif /* CONFIG_PROC_VMCORE */
 
 void __init
 setup_arch (char **cmdline_p)
 {
 	unw_init();
 
+	paravirt_arch_setup_early();
+
 	ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist);
+	paravirt_patch_apply();
 
 	*cmdline_p = __va(ia64_boot_param->command_line);
-	strlcpy(saved_command_line, *cmdline_p, COMMAND_LINE_SIZE);
+	strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE);
 
 	efi_init();
 	io_port_init();
 
 #ifdef CONFIG_IA64_GENERIC
-	{
-		const char *mvec_name = strstr (*cmdline_p, "machvec=");
-		char str[64];
-
-		if (mvec_name) {
-			const char *end;
-			size_t len;
-
-			mvec_name += 8;
-			end = strchr (mvec_name, ' ');
-			if (end)
-				len = end - mvec_name;
-			else
-				len = strlen (mvec_name);
-			len = min(len, sizeof (str) - 1);
-			strncpy (str, mvec_name, len);
-			str[len] = '\0';
-			mvec_name = str;
-		} else
-			mvec_name = acpi_get_sysname();
-		machvec_init(mvec_name);
-	}
+	/* machvec needs to be parsed from the command line
+	 * before parse_early_param() is called to ensure
+	 * that ia64_mv is initialised before any command line
+	 * settings may cause console setup to occur
+	 */
+	machvec_init_from_cmdline(*cmdline_p);
 #endif
 
+	parse_early_param();
+
 	if (early_console_setup(*cmdline_p) == 0)
 		mark_bsp_online();
 
 #ifdef CONFIG_ACPI
 	/* Initialize the ACPI boot-time table parser */
 	acpi_table_init();
+	early_acpi_boot_init();
 # ifdef CONFIG_ACPI_NUMA
 	acpi_numa_init();
-# endif
-#else
-# ifdef CONFIG_SMP
-	smp_build_cpu_map();	/* happens, e.g., with the Ski simulator */
+#  ifdef CONFIG_ACPI_HOTPLUG_CPU
+	prefill_possible_map();
+#  endif
+	per_cpu_scan_finalize((cpus_weight(early_cpu_possible_map) == 0 ?
+		32 : cpus_weight(early_cpu_possible_map)),
+		additional_cpus > 0 ? additional_cpus : 0);
 # endif
 #endif /* CONFIG_APCI_BOOT */
 
+#ifdef CONFIG_SMP
+	smp_build_cpu_map();
+#endif
 	find_memory();
 
 	/* process SAL system table: */
-	ia64_sal_init(efi.sal_systab);
+	ia64_sal_init(__va(efi.sal_systab));
+
+#ifdef CONFIG_ITANIUM
+	ia64_patch_rse((u64) __start___rse_patchlist, (u64) __end___rse_patchlist);
+#else
+	{
+		unsigned long num_phys_stacked;
+
+		if (ia64_pal_rse_info(&num_phys_stacked, 0) == 0 && num_phys_stacked > 96)
+			ia64_patch_rse((u64) __start___rse_patchlist, (u64) __end___rse_patchlist);
+	}
+#endif
 
 #ifdef CONFIG_SMP
 	cpu_physical_id(0) = hard_smp_processor_id();
-
-	cpu_set(0, cpu_sibling_map[0]);
-	cpu_set(0, cpu_core_map[0]);
-
-	check_for_logical_procs();
-	if (smp_num_cpucores > 1)
-		printk(KERN_INFO
-		       "cpu package is Multi-Core capable: number of cores=%d\n",
-		       smp_num_cpucores);
-	if (smp_num_siblings > 1)
-		printk(KERN_INFO
-		       "cpu package is Multi-Threading capable: number of siblings=%d\n",
-		       smp_num_siblings);
 #endif
 
 	cpu_init();	/* initialize the bootstrap CPU */
+	mmu_context_init();	/* initialize context_id bitmap */
 
-#ifdef CONFIG_ACPI
-	acpi_boot_init();
-#endif
+	paravirt_banner();
+	paravirt_arch_setup_console(cmdline_p);
 
 #ifdef CONFIG_VT
 	if (!conswitchp) {
@@ -485,15 +616,20 @@ setup_arch (char **cmdline_p)
 #endif
 
 	/* enable IA-64 Machine Check Abort Handling unless disabled */
-	if (!strstr(saved_command_line, "nomca"))
+	if (paravirt_arch_setup_nomca())
+		nomca = 1;
+	if (!nomca)
 		ia64_mca_init();
 
 	platform_setup(cmdline_p);
+#ifndef CONFIG_IA64_HP_SIM
+	check_sal_cache_flush();
+#endif
 	paging_init();
 }
 
 /*
- * Display cpu info for all cpu's.
+ * Display cpu info for all CPUs.
  */
 static int
 show_cpuinfo (struct seq_file *m, void *v)
@@ -513,68 +649,67 @@ show_cpuinfo (struct seq_file *m, void *v)
 		{ 1UL << 1, "spontaneous deferral"},
 		{ 1UL << 2, "16-byte atomic ops" }
 	};
-	char family[32], features[128], *cp, sep;
+	char features[128], *cp, *sep;
 	struct cpuinfo_ia64 *c = v;
 	unsigned long mask;
-	int i;
+	unsigned long proc_freq;
+	int i, size;
 
 	mask = c->features;
 
-	switch (c->family) {
-	      case 0x07:	memcpy(family, "Itanium", 8); break;
-	      case 0x1f:	memcpy(family, "Itanium 2", 10); break;
-	      default:		sprintf(family, "%u", c->family); break;
-	}
-
 	/* build the feature string: */
-	memcpy(features, " standard", 10);
+	memcpy(features, "standard", 9);
 	cp = features;
-	sep = 0;
-	for (i = 0; i < (int) ARRAY_SIZE(feature_bits); ++i) {
+	size = sizeof(features);
+	sep = "";
+	for (i = 0; i < ARRAY_SIZE(feature_bits) && size > 1; ++i) {
 		if (mask & feature_bits[i].mask) {
-			if (sep)
-				*cp++ = sep;
-			sep = ',';
-			*cp++ = ' ';
-			strcpy(cp, feature_bits[i].feature_name);
-			cp += strlen(feature_bits[i].feature_name);
+			cp += snprintf(cp, size, "%s%s", sep,
+				       feature_bits[i].feature_name),
+			sep = ", ";
 			mask &= ~feature_bits[i].mask;
+			size = sizeof(features) - (cp - features);
 		}
 	}
-	if (mask) {
-		/* print unknown features as a hex value: */
-		if (sep)
-			*cp++ = sep;
-		sprintf(cp, " 0x%lx", mask);
+	if (mask && size > 1) {
+		/* print unknown features as a hex value */
+		snprintf(cp, size, "%s0x%lx", sep, mask);
 	}
 
+	proc_freq = cpufreq_quick_get(cpunum);
+	if (!proc_freq)
+		proc_freq = c->proc_freq / 1000;
+
 	seq_printf(m,
 		   "processor  : %d\n"
 		   "vendor     : %s\n"
 		   "arch       : IA-64\n"
-		   "family     : %s\n"
+		   "family     : %u\n"
 		   "model      : %u\n"
+		   "model name : %s\n"
 		   "revision   : %u\n"
 		   "archrev    : %u\n"
-		   "features   :%s\n"	/* don't change this---it _is_ right! */
+		   "features   : %s\n"
 		   "cpu number : %lu\n"
 		   "cpu regs   : %u\n"
-		   "cpu MHz    : %lu.%06lu\n"
+		   "cpu MHz    : %lu.%03lu\n"
 		   "itc MHz    : %lu.%06lu\n"
 		   "BogoMIPS   : %lu.%02lu\n",
-		   cpunum, c->vendor, family, c->model, c->revision, c->archrev,
+		   cpunum, c->vendor, c->family, c->model,
+		   c->model_name, c->revision, c->archrev,
 		   features, c->ppn, c->number,
-		   c->proc_freq / 1000000, c->proc_freq % 1000000,
+		   proc_freq / 1000, proc_freq % 1000,
 		   c->itc_freq / 1000000, c->itc_freq % 1000000,
 		   lpj*HZ/500000, (lpj*HZ/5000) % 100);
 #ifdef CONFIG_SMP
 	seq_printf(m, "siblings   : %u\n", cpus_weight(cpu_core_map[cpunum]));
+	if (c->socket_id != -1)
+		seq_printf(m, "physical id: %u\n", c->socket_id);
 	if (c->threads_per_core > 1 || c->cores_per_socket > 1)
 		seq_printf(m,
-		   	   "physical id: %u\n"
-		   	   "core id    : %u\n"
-		   	   "thread id  : %u\n",
-		   	   c->socket_id, c->core_id, c->thread_id);
+			   "core id    : %u\n"
+			   "thread id  : %u\n",
+			   c->core_id, c->thread_id);
 #endif
 	seq_printf(m,"\n");
 
@@ -585,10 +720,10 @@ static void *
 c_start (struct seq_file *m, loff_t *pos)
 {
 #ifdef CONFIG_SMP
-	while (*pos < NR_CPUS && !cpu_isset(*pos, cpu_online_map))
+	while (*pos < nr_cpu_ids && !cpu_online(*pos))
 		++*pos;
 #endif
-	return *pos < NR_CPUS ? cpu_data(*pos) : NULL;
+	return *pos < nr_cpu_ids ? cpu_data(*pos) : NULL;
 }
 
 static void *
@@ -603,14 +738,47 @@ c_stop (struct seq_file *m, void *v)
 {
 }
 
-struct seq_operations cpuinfo_op = {
+const struct seq_operations cpuinfo_op = {
 	.start =	c_start,
 	.next =		c_next,
 	.stop =		c_stop,
 	.show =		show_cpuinfo
 };
 
-void
+#define MAX_BRANDS	8
+static char brandname[MAX_BRANDS][128];
+
+static char *
+get_model_name(__u8 family, __u8 model)
+{
+	static int overflow;
+	char brand[128];
+	int i;
+
+	memcpy(brand, "Unknown", 8);
+	if (ia64_pal_get_brand_info(brand)) {
+		if (family == 0x7)
+			memcpy(brand, "Merced", 7);
+		else if (family == 0x1f) switch (model) {
+			case 0: memcpy(brand, "McKinley", 9); break;
+			case 1: memcpy(brand, "Madison", 8); break;
+			case 2: memcpy(brand, "Madison up to 9M cache", 23); break;
+		}
+	}
+	for (i = 0; i < MAX_BRANDS; i++)
+		if (strcmp(brandname[i], brand) == 0)
+			return brandname[i];
+	for (i = 0; i < MAX_BRANDS; i++)
+		if (brandname[i][0] == '\0')
+			return strcpy(brandname[i], brand);
+	if (overflow++ == 0)
+		printk(KERN_ERR
+		       "%s: Table overflow. Some processor model information will be missing\n",
+		       __func__);
+	return "Unknown";
+}
+
+static void
 identify_cpu (struct cpuinfo_ia64 *c)
 {
 	union {
@@ -639,7 +807,6 @@ identify_cpu (struct cpuinfo_ia64 *c)
 	pal_status_t status;
 	unsigned long impl_va_msb = 50, phys_addr_size = 44;	/* Itanium defaults */
 	int i;
-
 	for (i = 0; i < 5; ++i)
 		cpuid.bits[i] = ia64_get_cpuid(i);
 
@@ -648,12 +815,15 @@ identify_cpu (struct cpuinfo_ia64 *c)
 	c->cpu = smp_processor_id();
 
 	/* below default values will be overwritten  by identify_siblings() 
-	 * for Multi-Threading/Multi-Core capable cpu's
+	 * for Multi-Threading/Multi-Core capable CPUs
 	 */
 	c->threads_per_core = c->cores_per_socket = c->num_log = 1;
 	c->socket_id = -1;
 
 	identify_siblings(c);
+
+	if (c->threads_per_core > smp_num_siblings)
+		smp_num_siblings = c->threads_per_core;
 #endif
 	c->ppn = cpuid.field.ppn;
 	c->number = cpuid.field.number;
@@ -662,6 +832,7 @@ identify_cpu (struct cpuinfo_ia64 *c)
 	c->family = cpuid.field.family;
 	c->archrev = cpuid.field.archrev;
 	c->features = cpuid.field.features;
+	c->model_name = get_model_name(c->family, c->model);
 
 	status = ia64_pal_vm_summary(&vm1, &vm2);
 	if (status == PAL_STATUS_SUCCESS) {
@@ -672,60 +843,63 @@ identify_cpu (struct cpuinfo_ia64 *c)
 	c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1));
 }
 
-void
-setup_per_cpu_areas (void)
-{
-	/* start_kernel() requires this... */
-}
-
 /*
- * Calculate the max. cache line size.
+ * Do the following calculations:
  *
- * In addition, the minimum of the i-cache stride sizes is calculated for
- * "flush_icache_range()".
+ * 1. the max. cache line size.
+ * 2. the minimum of the i-cache stride sizes for "flush_icache_range()".
+ * 3. the minimum of the cache stride sizes for "clflush_cache_range()".
  */
 static void
-get_max_cacheline_size (void)
+get_cache_info(void)
 {
 	unsigned long line_size, max = 1;
-	u64 l, levels, unique_caches;
-        pal_cache_config_info_t cci;
-        s64 status;
+	unsigned long l, levels, unique_caches;
+	pal_cache_config_info_t cci;
+	long status;
 
         status = ia64_pal_cache_summary(&levels, &unique_caches);
         if (status != 0) {
                 printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n",
-                       __FUNCTION__, status);
+                       __func__, status);
                 max = SMP_CACHE_BYTES;
 		/* Safest setup for "flush_icache_range()" */
 		ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT;
+		/* Safest setup for "clflush_cache_range()" */
+		ia64_cache_stride_shift = CACHE_STRIDE_SHIFT;
 		goto out;
         }
 
 	for (l = 0; l < levels; ++l) {
-		status = ia64_pal_cache_config_info(l, /* cache_type (data_or_unified)= */ 2,
-						    &cci);
+		/* cache_type (data_or_unified)=2 */
+		status = ia64_pal_cache_config_info(l, 2, &cci);
 		if (status != 0) {
-			printk(KERN_ERR
-			       "%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n",
-			       __FUNCTION__, l, status);
+			printk(KERN_ERR "%s: ia64_pal_cache_config_info"
+				"(l=%lu, 2) failed (status=%ld)\n",
+				__func__, l, status);
 			max = SMP_CACHE_BYTES;
 			/* The safest setup for "flush_icache_range()" */
 			cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
+			/* The safest setup for "clflush_cache_range()" */
+			ia64_cache_stride_shift = CACHE_STRIDE_SHIFT;
 			cci.pcci_unified = 1;
+		} else {
+			if (cci.pcci_stride < ia64_cache_stride_shift)
+				ia64_cache_stride_shift = cci.pcci_stride;
+
+			line_size = 1 << cci.pcci_line_size;
+			if (line_size > max)
+				max = line_size;
 		}
-		line_size = 1 << cci.pcci_line_size;
-		if (line_size > max)
-			max = line_size;
+
 		if (!cci.pcci_unified) {
-			status = ia64_pal_cache_config_info(l,
-						    /* cache_type (instruction)= */ 1,
-						    &cci);
+			/* cache_type (instruction)=1*/
+			status = ia64_pal_cache_config_info(l, 1, &cci);
 			if (status != 0) {
-				printk(KERN_ERR
-				"%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n",
-					__FUNCTION__, l, status);
-				/* The safest setup for "flush_icache_range()" */
+				printk(KERN_ERR "%s: ia64_pal_cache_config_info"
+					"(l=%lu, 1) failed (status=%ld)\n",
+					__func__, l, status);
+				/* The safest setup for flush_icache_range() */
 				cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
 			}
 		}
@@ -744,7 +918,8 @@ get_max_cacheline_size (void)
 void
 cpu_init (void)
 {
-	extern void __devinit ia64_mmu_init (void *);
+	extern void ia64_mmu_init(void *);
+	static unsigned long max_num_phys_stacked = IA64_NUM_PHYS_STACK_REG;
 	unsigned long num_phys_stacked;
 	pal_vm_info_2_u_t vmi;
 	unsigned int max_ctx;
@@ -752,16 +927,28 @@ cpu_init (void)
 	void *cpu_data;
 
 	cpu_data = per_cpu_init();
-
+#ifdef CONFIG_SMP
 	/*
-	 * We set ar.k3 so that assembly code in MCA handler can compute
-	 * physical addresses of per cpu variables with a simple:
-	 *   phys = ar.k3 + &per_cpu_var
+	 * insert boot cpu into sibling and core mapes
+	 * (must be done after per_cpu area is setup)
 	 */
-	ia64_set_kr(IA64_KR_PER_CPU_DATA,
-		    ia64_tpa(cpu_data) - (long) __per_cpu_start);
+	if (smp_processor_id() == 0) {
+		cpu_set(0, per_cpu(cpu_sibling_map, 0));
+		cpu_set(0, cpu_core_map[0]);
+	} else {
+		/*
+		 * Set ar.k3 so that assembly code in MCA handler can compute
+		 * physical addresses of per cpu variables with a simple:
+		 *   phys = ar.k3 + &per_cpu_var
+		 * and the alt-dtlb-miss handler can set per-cpu mapping into
+		 * the TLB when needed. head.S already did this for cpu0.
+		 */
+		ia64_set_kr(IA64_KR_PER_CPU_DATA,
+			    ia64_tpa(cpu_data) - (long) __per_cpu_start);
+	}
+#endif
 
-	get_max_cacheline_size();
+	get_cache_info();
 
 	/*
 	 * We can't pass "local_cpu_data" to identify_cpu() because we haven't called
@@ -769,7 +956,7 @@ cpu_init (void)
 	 * depends on the data returned by identify_cpu().  We break the dependency by
 	 * accessing cpu_data() through the canonical per-CPU address.
 	 */
-	cpu_info = cpu_data + ((char *) &__ia64_per_cpu_var(cpu_info) - __per_cpu_start);
+	cpu_info = cpu_data + ((char *) &__ia64_per_cpu_var(ia64_cpu_info) - __per_cpu_start);
 	identify_cpu(cpu_info);
 
 #ifdef CONFIG_MCKINLEY
@@ -787,7 +974,7 @@ cpu_init (void)
 #endif
 
 	/* Clear the stack memory reserved for pt_regs: */
-	memset(ia64_task_regs(current), 0, sizeof(struct pt_regs));
+	memset(task_pt_regs(current), 0, sizeof(struct pt_regs));
 
 	ia64_set_kr(IA64_KR_FPU_OWNER, 0);
 
@@ -813,17 +1000,12 @@ cpu_init (void)
 					| IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
 	atomic_inc(&init_mm.mm_count);
 	current->active_mm = &init_mm;
-	if (current->mm)
-		BUG();
+	BUG_ON(current->mm);
 
 	ia64_mmu_init(ia64_imva(cpu_data));
 	ia64_mca_cpu_init(ia64_imva(cpu_data));
 
-#ifdef CONFIG_IA32_SUPPORT
-	ia32_cpu_init();
-#endif
-
-	/* Clear ITC to eliminiate sched_clock() overflows in human time.  */
+	/* Clear ITC to eliminate sched_clock() overflows in human time.  */
 	ia64_set_itc(0);
 
 	/* disable all local interrupt sources: */
@@ -835,14 +1017,20 @@ cpu_init (void)
 
 	/* clear TPR & XTP to enable all interrupt classes: */
 	ia64_setreg(_IA64_REG_CR_TPR, 0);
+
+	/* Clear any pending interrupts left by SAL/EFI */
+	while (ia64_get_ivr() != IA64_SPURIOUS_INT_VECTOR)
+		ia64_eoi();
+
 #ifdef CONFIG_SMP
 	normal_xtp();
 #endif
 
 	/* set ia64_ctx.max_rid to the maximum RID that is supported by all CPUs: */
-	if (ia64_pal_vm_summary(NULL, &vmi) == 0)
+	if (ia64_pal_vm_summary(NULL, &vmi) == 0) {
 		max_ctx = (1U << (vmi.pal_vm_info_2_s.rid_size - 3)) - 1;
-	else {
+		setup_ptcg_sem(vmi.pal_vm_info_2_s.max_purges, NPTCG_FROM_PAL);
+	} else {
 		printk(KERN_WARNING "cpu_init: PAL VM summary failed, assuming 18 RID bits\n");
 		max_ctx = (1U << 15) - 1;	/* use architected minimum */
 	}
@@ -858,14 +1046,25 @@ cpu_init (void)
 		num_phys_stacked = 96;
 	}
 	/* size of physical stacked register partition plus 8 bytes: */
-	__get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8;
+	if (num_phys_stacked > max_num_phys_stacked) {
+		ia64_patch_phys_stack_reg(num_phys_stacked*8 + 8);
+		max_num_phys_stacked = num_phys_stacked;
+	}
 	platform_cpu_init();
-	pm_idle = default_idle;
 }
 
-void
+void __init
 check_bugs (void)
 {
 	ia64_patch_mckinley_e9((unsigned long) __start___mckinley_e9_bundles,
 			       (unsigned long) __end___mckinley_e9_bundles);
 }
+
+static int __init run_dmi_scan(void)
+{
+	dmi_scan_machine();
+	dmi_memdev_walk();
+	dmi_set_dump_stack_arch_desc();
+	return 0;
+}
+core_initcall(run_dmi_scan);
diff --git a/arch/ia64/kernel/sigframe.h b/arch/ia64/kernel/sigframe.h
index 37b986cb86e..9fd9a1933b3 100644
--- a/arch/ia64/kernel/sigframe.h
+++ b/arch/ia64/kernel/sigframe.h
@@ -22,4 +22,4 @@ struct sigframe {
 	struct sigcontext sc;
 };
 
-extern long ia64_do_signal (sigset_t *, struct sigscratch *, long);
+extern void ia64_do_signal (struct sigscratch *, long);
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 774f34b675c..33cab9a8adf 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -7,22 +7,20 @@
  * Derived from i386 and Alpha versions.
  */
 
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/ptrace.h>
+#include <linux/tracehook.h>
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/stddef.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
 #include <linux/unistd.h>
 #include <linux/wait.h>
 
-#include <asm/ia32.h>
 #include <asm/intrinsics.h>
 #include <asm/uaccess.h>
 #include <asm/rse.h>
@@ -32,7 +30,6 @@
 
 #define DEBUG_SIG	0
 #define STACK_ALIGN	16		/* minimal alignment for stack pointer */
-#define _BLOCKABLE	(~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
 
 #if _NSIG_WORDS > 1
 # define PUT_SIGSET(k,u)	__copy_to_user((u)->sig, (k)->sig, sizeof(sigset_t))
@@ -42,55 +39,6 @@
 # define GET_SIGSET(k,u)	__get_user((k)->sig[0], &(u)->sig[0])
 #endif
 
-long
-ia64_rt_sigsuspend (sigset_t __user *uset, size_t sigsetsize, struct sigscratch *scr)
-{
-	sigset_t oldset, set;
-
-	/* XXX: Don't preclude handling different sized sigset_t's.  */
-	if (sigsetsize != sizeof(sigset_t))
-		return -EINVAL;
-
-	if (!access_ok(VERIFY_READ, uset, sigsetsize))
-		return -EFAULT;
-
-	if (GET_SIGSET(&set, uset))
-		return -EFAULT;
-
-	sigdelsetmask(&set, ~_BLOCKABLE);
-
-	spin_lock_irq(&current->sighand->siglock);
-	{
-		oldset = current->blocked;
-		current->blocked = set;
-		recalc_sigpending();
-	}
-	spin_unlock_irq(&current->sighand->siglock);
-
-	/*
-	 * The return below usually returns to the signal handler.  We need to
-	 * pre-set the correct error code here to ensure that the right values
-	 * get saved in sigcontext by ia64_do_signal.
-	 */
-	scr->pt.r8 = EINTR;
-	scr->pt.r10 = -1;
-
-	while (1) {
-		current->state = TASK_INTERRUPTIBLE;
-		schedule();
-		if (ia64_do_signal(&oldset, scr, 1))
-			return -EINTR;
-	}
-}
-
-asmlinkage long
-sys_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, long arg2,
-		 long arg3, long arg4, long arg5, long arg6, long arg7,
-		 struct pt_regs regs)
-{
-	return do_sigaltstack(uss, uoss, regs.r12);
-}
-
 static long
 restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr)
 {
@@ -141,7 +89,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr)
 	if ((flags & IA64_SC_FLAG_FPH_VALID) != 0) {
 		struct ia64_psr *psr = ia64_psr(&scr->pt);
 
-		__copy_from_user(current->thread.fph, &sc->sc_fr[32], 96*16);
+		err |= __copy_from_user(current->thread.fph, &sc->sc_fr[32], 96*16);
 		psr->mfh = 0;	/* drop signal handler's fph contents... */
 		preempt_disable();
 		if (psr->dfh)
@@ -157,7 +105,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr)
 }
 
 int
-copy_siginfo_to_user (siginfo_t __user *to, siginfo_t *from)
+copy_siginfo_to_user (siginfo_t __user *to, const siginfo_t *from)
 {
 	if (!access_ok(VERIFY_WRITE, to, sizeof(siginfo_t)))
 		return -EFAULT;
@@ -243,14 +191,7 @@ ia64_rt_sigreturn (struct sigscratch *scr)
 	if (GET_SIGSET(&set, &sc->sc_mask))
 		goto give_sigsegv;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
-
-	spin_lock_irq(&current->sighand->siglock);
-	{
-		current->blocked = set;
-		recalc_sigpending();
-	}
-	spin_unlock_irq(&current->sighand->siglock);
+	set_current_blocked(&set);
 
 	if (restore_sigcontext(sc, scr))
 		goto give_sigsegv;
@@ -259,19 +200,16 @@ ia64_rt_sigreturn (struct sigscratch *scr)
 	printk("SIG return (%s:%d): sp=%lx ip=%lx\n",
 	       current->comm, current->pid, scr->pt.r12, scr->pt.cr_iip);
 #endif
-	/*
-	 * It is more difficult to avoid calling this function than to
-	 * call it and ignore errors.
-	 */
-	do_sigaltstack(&sc->sc_stack, NULL, scr->pt.r12);
+	if (restore_altstack(&sc->sc_stack))
+		goto give_sigsegv;
 	return retval;
 
   give_sigsegv:
 	si.si_signo = SIGSEGV;
 	si.si_errno = 0;
 	si.si_code = SI_KERNEL;
-	si.si_pid = current->pid;
-	si.si_uid = current->uid;
+	si.si_pid = task_pid_vnr(current);
+	si.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 	si.si_addr = sc;
 	force_sig_info(SIGSEGV, &si, current);
 	return retval;
@@ -287,7 +225,7 @@ static long
 setup_sigcontext (struct sigcontext __user *sc, sigset_t *mask, struct sigscratch *scr)
 {
 	unsigned long flags = 0, ifs, cfm, nat;
-	long err;
+	long err = 0;
 
 	ifs = scr->pt.cr_ifs;
 
@@ -300,12 +238,12 @@ setup_sigcontext (struct sigcontext __user *sc, sigset_t *mask, struct sigscratc
 	ia64_flush_fph(current);
 	if ((current->thread.flags & IA64_THREAD_FPH_VALID)) {
 		flags |= IA64_SC_FLAG_FPH_VALID;
-		__copy_to_user(&sc->sc_fr[32], current->thread.fph, 96*16);
+		err = __copy_to_user(&sc->sc_fr[32], current->thread.fph, 96*16);
 	}
 
 	nat = ia64_get_scratch_nat_bits(&scr->pt, scr->scratch_unat);
 
-	err  = __put_user(flags, &sc->sc_flags);
+	err |= __put_user(flags, &sc->sc_flags);
 	err |= __put_user(nat, &sc->sc_nat);
 	err |= PUT_SIGSET(mask, &sc->sc_mask);
 	err |= __put_user(cfm, &sc->sc_cfm);
@@ -323,15 +261,7 @@ setup_sigcontext (struct sigcontext __user *sc, sigset_t *mask, struct sigscratc
 	err |= __copy_to_user(&sc->sc_gr[15], &scr->pt.r15, 8);		/* r15 */
 	err |= __put_user(scr->pt.cr_iip + ia64_psr(&scr->pt)->ri, &sc->sc_ip);
 
-	if (flags & IA64_SC_FLAG_IN_SYSCALL) {
-		/* Clear scratch registers if the signal interrupted a system call. */
-		err |= __put_user(0, &sc->sc_ar_ccv);				/* ar.ccv */
-		err |= __put_user(0, &sc->sc_br[7]);				/* b7 */
-		err |= __put_user(0, &sc->sc_gr[14]);				/* r14 */
-		err |= __clear_user(&sc->sc_ar25, 2*8);			/* ar.csd & ar.ssd */
-		err |= __clear_user(&sc->sc_gr[2], 2*8);			/* r2-r3 */
-		err |= __clear_user(&sc->sc_gr[16], 16*8);			/* r16-r31 */
-	} else {
+	if (!(flags & IA64_SC_FLAG_IN_SYSCALL)) {
 		/* Copy scratch regs to sigcontext if the signal didn't interrupt a syscall. */
 		err |= __put_user(scr->pt.ar_ccv, &sc->sc_ar_ccv);		/* ar.ccv */
 		err |= __put_user(scr->pt.b7, &sc->sc_br[7]);			/* b7 */
@@ -375,8 +305,8 @@ force_sigsegv_info (int sig, void __user *addr)
 	si.si_signo = SIGSEGV;
 	si.si_errno = 0;
 	si.si_code = SI_KERNEL;
-	si.si_pid = current->pid;
-	si.si_uid = current->uid;
+	si.si_pid = task_pid_vnr(current);
+	si.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 	si.si_addr = addr;
 	force_sig_info(SIGSEGV, &si, current);
 	return 0;
@@ -387,24 +317,41 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set,
 	     struct sigscratch *scr)
 {
 	extern char __kernel_sigtramp[];
-	unsigned long tramp_addr, new_rbs = 0;
+	unsigned long tramp_addr, new_rbs = 0, new_sp;
 	struct sigframe __user *frame;
 	long err;
 
-	frame = (void __user *) scr->pt.r12;
+	new_sp = scr->pt.r12;
 	tramp_addr = (unsigned long) __kernel_sigtramp;
-	if ((ka->sa.sa_flags & SA_ONSTACK) && sas_ss_flags((unsigned long) frame) == 0) {
-		frame = (void __user *) ((current->sas_ss_sp + current->sas_ss_size)
-					 & ~(STACK_ALIGN - 1));
-		/*
-		 * We need to check for the register stack being on the signal stack
-		 * separately, because it's switched separately (memory stack is switched
-		 * in the kernel, register stack is switched in the signal trampoline).
-		 */
-		if (!rbs_on_sig_stack(scr->pt.ar_bspstore))
-			new_rbs = (current->sas_ss_sp + sizeof(long) - 1) & ~(sizeof(long) - 1);
+	if (ka->sa.sa_flags & SA_ONSTACK) {
+		int onstack = sas_ss_flags(new_sp);
+
+		if (onstack == 0) {
+			new_sp = current->sas_ss_sp + current->sas_ss_size;
+			/*
+			 * We need to check for the register stack being on the
+			 * signal stack separately, because it's switched
+			 * separately (memory stack is switched in the kernel,
+			 * register stack is switched in the signal trampoline).
+			 */
+			if (!rbs_on_sig_stack(scr->pt.ar_bspstore))
+				new_rbs = ALIGN(current->sas_ss_sp,
+						sizeof(long));
+		} else if (onstack == SS_ONSTACK) {
+			unsigned long check_sp;
+
+			/*
+			 * If we are on the alternate signal stack and would
+			 * overflow it, don't. Return an always-bogus address
+			 * instead so we will die with SIGSEGV.
+			 */
+			check_sp = (new_sp - sizeof(*frame)) & -STACK_ALIGN;
+			if (!likely(on_sig_stack(check_sp)))
+				return force_sigsegv_info(sig, (void __user *)
+							  check_sp);
+		}
 	}
-	frame = (void __user *) frame - ((sizeof(*frame) + STACK_ALIGN - 1) & ~(STACK_ALIGN - 1));
+	frame = (void __user *) ((new_sp - sizeof(*frame)) & -STACK_ALIGN);
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		return force_sigsegv_info(sig, frame);
@@ -418,9 +365,7 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set,
 
 	err |= copy_siginfo_to_user(&frame->info, info);
 
-	err |= __put_user(current->sas_ss_sp, &frame->sc.sc_stack.ss_sp);
-	err |= __put_user(current->sas_ss_size, &frame->sc.sc_stack.ss_size);
-	err |= __put_user(sas_ss_flags(scr->pt.r12), &frame->sc.sc_stack.ss_flags);
+	err |= __save_altstack(&frame->sc.sc_stack, scr->pt.r12);
 	err |= setup_sigcontext(&frame->sc, set, scr);
 
 	if (unlikely(err))
@@ -455,24 +400,15 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set,
 }
 
 static long
-handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset,
+handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
 	       struct sigscratch *scr)
 {
-	if (IS_IA32_PROCESS(&scr->pt)) {
-		/* send signal to IA-32 process */
-		if (!ia32_setup_frame1(sig, ka, info, oldset, &scr->pt))
-			return 0;
-	} else
-		/* send signal to IA-64 process */
-		if (!setup_frame(sig, ka, info, oldset, scr))
-			return 0;
-
-	spin_lock_irq(&current->sighand->siglock);
-	sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
-	if (!(ka->sa.sa_flags & SA_NODEFER))
-		sigaddset(&current->blocked, sig);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
+	if (!setup_frame(sig, ka, info, sigmask_to_save(), scr))
+		return 0;
+
+	signal_delivered(sig, info, ka, &scr->pt,
+				 test_thread_flag(TIF_SINGLESTEP));
+
 	return 1;
 }
 
@@ -480,25 +416,13 @@ handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigse
  * Note that `init' is a special process: it doesn't get signals it doesn't want to
  * handle.  Thus you cannot kill init even with a SIGKILL even by mistake.
  */
-long
-ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall)
+void
+ia64_do_signal (struct sigscratch *scr, long in_syscall)
 {
 	struct k_sigaction ka;
 	siginfo_t info;
 	long restart = in_syscall;
 	long errno = scr->pt.r8;
-#	define ERR_CODE(c)	(IS_IA32_PROCESS(&scr->pt) ? -(c) : (c))
-
-	/*
-	 * In the ia64_leave_kernel code path, we want the common case to go fast, which
-	 * is why we may in certain cases get here from kernel mode. Just return without
-	 * doing anything if so.
-	 */
-	if (!user_mode(&scr->pt))
-		return 0;
-
-	if (!oldset)
-		oldset = &current->blocked;
 
 	/*
 	 * This only loops in the rare cases of handle_signal() failing, in which case we
@@ -513,14 +437,7 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall)
 		 * inferior call), thus it's important to check for restarting _after_
 		 * get_signal_to_deliver().
 		 */
-		if (IS_IA32_PROCESS(&scr->pt)) {
-			if (in_syscall) {
-				if (errno >= 0)
-					restart = 0;
-				else
-					errno = -errno;
-			}
-		} else if ((long) scr->pt.r10 != -1)
+		if ((long) scr->pt.r10 != -1)
 			/*
 			 * A system calls has to be restarted only if one of the error codes
 			 * ERESTARTNOHAND, ERESTARTSYS, or ERESTARTNOINTR is returned.  If r10
@@ -536,22 +453,18 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall)
 			switch (errno) {
 			      case ERESTART_RESTARTBLOCK:
 			      case ERESTARTNOHAND:
-				scr->pt.r8 = ERR_CODE(EINTR);
+				scr->pt.r8 = EINTR;
 				/* note: scr->pt.r10 is already -1 */
 				break;
 
 			      case ERESTARTSYS:
 				if ((ka.sa.sa_flags & SA_RESTART) == 0) {
-					scr->pt.r8 = ERR_CODE(EINTR);
+					scr->pt.r8 = EINTR;
 					/* note: scr->pt.r10 is already -1 */
 					break;
 				}
 			      case ERESTARTNOINTR:
-				if (IS_IA32_PROCESS(&scr->pt)) {
-					scr->pt.r8 = scr->pt.r1;
-					scr->pt.cr_iip -= 2;
-				} else
-					ia64_decrement_ip(&scr->pt);
+				ia64_decrement_ip(&scr->pt);
 				restart = 0; /* don't restart twice if handle_signal() fails... */
 			}
 		}
@@ -560,8 +473,8 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall)
 		 * Whee!  Actually deliver the signal.  If the delivery failed, we need to
 		 * continue to iterate in this loop so we can deliver the SIGSEGV...
 		 */
-		if (handle_signal(signr, &ka, &info, oldset, scr))
-			return 1;
+		if (handle_signal(signr, &ka, &info, scr))
+			return;
 	}
 
 	/* Did we come from a system call? */
@@ -570,123 +483,18 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall)
 		if (errno == ERESTARTNOHAND || errno == ERESTARTSYS || errno == ERESTARTNOINTR
 		    || errno == ERESTART_RESTARTBLOCK)
 		{
-			if (IS_IA32_PROCESS(&scr->pt)) {
-				scr->pt.r8 = scr->pt.r1;
-				scr->pt.cr_iip -= 2;
-				if (errno == ERESTART_RESTARTBLOCK)
-					scr->pt.r8 = 0;	/* x86 version of __NR_restart_syscall */
-			} else {
-				/*
-				 * Note: the syscall number is in r15 which is saved in
-				 * pt_regs so all we need to do here is adjust ip so that
-				 * the "break" instruction gets re-executed.
-				 */
-				ia64_decrement_ip(&scr->pt);
-				if (errno == ERESTART_RESTARTBLOCK)
-					scr->pt.r15 = __NR_restart_syscall;
-			}
-		}
-	}
-	return 0;
-}
-
-/* Set a delayed signal that was detected in MCA/INIT/NMI/PMI context where it
- * could not be delivered.  It is important that the target process is not
- * allowed to do any more work in user space.  Possible cases for the target
- * process:
- *
- * - It is sleeping and will wake up soon.  Store the data in the current task,
- *   the signal will be sent when the current task returns from the next
- *   interrupt.
- *
- * - It is running in user context.  Store the data in the current task, the
- *   signal will be sent when the current task returns from the next interrupt.
- *
- * - It is running in kernel context on this or another cpu and will return to
- *   user context.  Store the data in the target task, the signal will be sent
- *   to itself when the target task returns to user space.
- *
- * - It is running in kernel context on this cpu and will sleep before
- *   returning to user context.  Because this is also the current task, the
- *   signal will not get delivered and the task could sleep indefinitely.
- *   Store the data in the idle task for this cpu, the signal will be sent
- *   after the idle task processes its next interrupt.
- *
- * To cover all cases, store the data in the target task, the current task and
- * the idle task on this cpu.  Whatever happens, the signal will be delivered
- * to the target task before it can do any useful user space work.  Multiple
- * deliveries have no unwanted side effects.
- *
- * Note: This code is executed in MCA/INIT/NMI/PMI context, with interrupts
- * disabled.  It must not take any locks nor use kernel structures or services
- * that require locks.
- */
-
-/* To ensure that we get the right pid, check its start time.  To avoid extra
- * include files in thread_info.h, convert the task start_time to unsigned long,
- * giving us a cycle time of > 580 years.
- */
-static inline unsigned long
-start_time_ul(const struct task_struct *t)
-{
-	return t->start_time.tv_sec * NSEC_PER_SEC + t->start_time.tv_nsec;
-}
-
-void
-set_sigdelayed(pid_t pid, int signo, int code, void __user *addr)
-{
-	struct task_struct *t;
-	unsigned long start_time =  0;
-	int i;
-
-	for (i = 1; i <= 3; ++i) {
-		switch (i) {
-		case 1:
-			t = find_task_by_pid(pid);
-			if (t)
-				start_time = start_time_ul(t);
-			break;
-		case 2:
-			t = current;
-			break;
-		default:
-			t = idle_task(smp_processor_id());
-			break;
+			/*
+			 * Note: the syscall number is in r15 which is saved in
+			 * pt_regs so all we need to do here is adjust ip so that
+			 * the "break" instruction gets re-executed.
+			 */
+			ia64_decrement_ip(&scr->pt);
+			if (errno == ERESTART_RESTARTBLOCK)
+				scr->pt.r15 = __NR_restart_syscall;
 		}
-
-		if (!t)
-			return;
-		t->thread_info->sigdelayed.signo = signo;
-		t->thread_info->sigdelayed.code = code;
-		t->thread_info->sigdelayed.addr = addr;
-		t->thread_info->sigdelayed.start_time = start_time;
-		t->thread_info->sigdelayed.pid = pid;
-		wmb();
-		set_tsk_thread_flag(t, TIF_SIGDELAYED);
 	}
-}
-
-/* Called from entry.S when it detects TIF_SIGDELAYED, a delayed signal that
- * was detected in MCA/INIT/NMI/PMI context where it could not be delivered.
- */
 
-void
-do_sigdelayed(void)
-{
-	struct siginfo siginfo;
-	pid_t pid;
-	struct task_struct *t;
-
-	clear_thread_flag(TIF_SIGDELAYED);
-	memset(&siginfo, 0, sizeof(siginfo));
-	siginfo.si_signo = current_thread_info()->sigdelayed.signo;
-	siginfo.si_code = current_thread_info()->sigdelayed.code;
-	siginfo.si_addr = current_thread_info()->sigdelayed.addr;
-	pid = current_thread_info()->sigdelayed.pid;
-	t = find_task_by_pid(pid);
-	if (!t)
-		return;
-	if (current_thread_info()->sigdelayed.start_time != start_time_ul(t))
-		return;
-	force_sig_info(siginfo.si_signo, &siginfo, t);
+	/* if there's no signal to deliver, we just put the saved sigmask
+	 * back */
+	restore_saved_sigmask();
 }
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index 657ac99a451..9fcd4e63048 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -30,8 +30,9 @@
 #include <linux/delay.h>
 #include <linux/efi.h>
 #include <linux/bitops.h>
+#include <linux/kexec.h>
 
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/current.h>
 #include <asm/delay.h>
 #include <asm/machvec.h>
@@ -43,54 +44,39 @@
 #include <asm/processor.h>
 #include <asm/ptrace.h>
 #include <asm/sal.h>
-#include <asm/system.h>
 #include <asm/tlbflush.h>
 #include <asm/unistd.h>
 #include <asm/mca.h>
 
 /*
- * Structure and data for smp_call_function(). This is designed to minimise static memory
- * requirements. It also looks cleaner.
+ * Note: alignment of 4 entries/cacheline was empirically determined
+ * to be a good tradeoff between hot cachelines & spreading the array
+ * across too many cacheline.
  */
-static  __cacheline_aligned DEFINE_SPINLOCK(call_lock);
+static struct local_tlb_flush_counts {
+	unsigned int count;
+} __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS];
 
-struct call_data_struct {
-	void (*func) (void *info);
-	void *info;
-	long wait;
-	atomic_t started;
-	atomic_t finished;
-};
-
-static volatile struct call_data_struct *call_data;
+static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned short [NR_CPUS],
+				     shadow_flush_counts);
 
 #define IPI_CALL_FUNC		0
 #define IPI_CPU_STOP		1
+#define IPI_CALL_FUNC_SINGLE	2
+#define IPI_KDUMP_CPU_STOP	3
 
 /* This needs to be cacheline aligned because it is written to by *other* CPUs.  */
-static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned;
+static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, ipi_operation);
 
 extern void cpu_halt (void);
 
-void
-lock_ipi_calllock(void)
-{
-	spin_lock_irq(&call_lock);
-}
-
-void
-unlock_ipi_calllock(void)
-{
-	spin_unlock_irq(&call_lock);
-}
-
 static void
-stop_this_cpu (void)
+stop_this_cpu(void)
 {
 	/*
 	 * Remove this CPU:
 	 */
-	cpu_clear(smp_processor_id(), cpu_online_map);
+	set_cpu_online(smp_processor_id(), false);
 	max_xtp();
 	local_irq_disable();
 	cpu_halt();
@@ -108,7 +94,7 @@ cpu_die(void)
 }
 
 irqreturn_t
-handle_IPI (int irq, void *dev_id, struct pt_regs *regs)
+handle_IPI (int irq, void *dev_id)
 {
 	int this_cpu = get_cpu();
 	unsigned long *pending_ipis = &__ia64_per_cpu_var(ipi_operation);
@@ -124,40 +110,23 @@ handle_IPI (int irq, void *dev_id, struct pt_regs *regs)
 			ops &= ~(1 << which);
 
 			switch (which) {
-			      case IPI_CALL_FUNC:
-			      {
-				      struct call_data_struct *data;
-				      void (*func)(void *info);
-				      void *info;
-				      int wait;
-
-				      /* release the 'pointer lock' */
-				      data = (struct call_data_struct *) call_data;
-				      func = data->func;
-				      info = data->info;
-				      wait = data->wait;
-
-				      mb();
-				      atomic_inc(&data->started);
-				      /*
-				       * At this point the structure may be gone unless
-				       * wait is true.
-				       */
-				      (*func)(info);
-
-				      /* Notify the sending CPU that the task is done.  */
-				      mb();
-				      if (wait)
-					      atomic_inc(&data->finished);
-			      }
-			      break;
-
-			      case IPI_CPU_STOP:
+			case IPI_CPU_STOP:
 				stop_this_cpu();
 				break;
-
-			      default:
-				printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n", this_cpu, which);
+			case IPI_CALL_FUNC:
+				generic_smp_call_function_interrupt();
+				break;
+			case IPI_CALL_FUNC_SINGLE:
+				generic_smp_call_function_single_interrupt();
+				break;
+#ifdef CONFIG_KEXEC
+			case IPI_KDUMP_CPU_STOP:
+				unw_init_running(kdump_cpu_freeze, NULL);
+				break;
+#endif
+			default:
+				printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n",
+						this_cpu, which);
 				break;
 			}
 		} while (ops);
@@ -167,8 +136,10 @@ handle_IPI (int irq, void *dev_id, struct pt_regs *regs)
 	return IRQ_HANDLED;
 }
 
+
+
 /*
- * Called with preeemption disabled.
+ * Called with preemption disabled.
  */
 static inline void
 send_IPI_single (int dest_cpu, int op)
@@ -178,7 +149,7 @@ send_IPI_single (int dest_cpu, int op)
 }
 
 /*
- * Called with preeemption disabled.
+ * Called with preemption disabled.
  */
 static inline void
 send_IPI_allbutself (int op)
@@ -192,7 +163,20 @@ send_IPI_allbutself (int op)
 }
 
 /*
- * Called with preeemption disabled.
+ * Called with preemption disabled.
+ */
+static inline void
+send_IPI_mask(const struct cpumask *mask, int op)
+{
+	unsigned int cpu;
+
+	for_each_cpu(cpu, mask) {
+			send_IPI_single(cpu, op);
+	}
+}
+
+/*
+ * Called with preemption disabled.
  */
 static inline void
 send_IPI_all (int op)
@@ -205,7 +189,7 @@ send_IPI_all (int op)
 }
 
 /*
- * Called with preeemption disabled.
+ * Called with preemption disabled.
  */
 static inline void
 send_IPI_self (int op)
@@ -213,155 +197,134 @@ send_IPI_self (int op)
 	send_IPI_single(smp_processor_id(), op);
 }
 
+#ifdef CONFIG_KEXEC
+void
+kdump_smp_send_stop(void)
+{
+ 	send_IPI_allbutself(IPI_KDUMP_CPU_STOP);
+}
+
+void
+kdump_smp_send_init(void)
+{
+	unsigned int cpu, self_cpu;
+	self_cpu = smp_processor_id();
+	for_each_online_cpu(cpu) {
+		if (cpu != self_cpu) {
+			if(kdump_status[cpu] == 0)
+				platform_send_ipi(cpu, 0, IA64_IPI_DM_INIT, 0);
+		}
+	}
+}
+#endif
 /*
- * Called with preeemption disabled.
+ * Called with preemption disabled.
  */
 void
 smp_send_reschedule (int cpu)
 {
 	platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0);
 }
+EXPORT_SYMBOL_GPL(smp_send_reschedule);
 
-void
-smp_flush_tlb_all (void)
+/*
+ * Called with preemption disabled.
+ */
+static void
+smp_send_local_flush_tlb (int cpu)
 {
-	on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1, 1);
+	platform_send_ipi(cpu, IA64_IPI_LOCAL_TLB_FLUSH, IA64_IPI_DM_INT, 0);
 }
 
 void
-smp_flush_tlb_mm (struct mm_struct *mm)
+smp_local_flush_tlb(void)
 {
-	preempt_disable();
-	/* this happens for the common case of a single-threaded fork():  */
-	if (likely(mm == current->active_mm && atomic_read(&mm->mm_users) == 1))
-	{
-		local_finish_flush_tlb_mm(mm);
-		preempt_enable();
-		return;
-	}
-
-	preempt_enable();
 	/*
-	 * We could optimize this further by using mm->cpu_vm_mask to track which CPUs
-	 * have been running in the address space.  It's not clear that this is worth the
-	 * trouble though: to avoid races, we have to raise the IPI on the target CPU
-	 * anyhow, and once a CPU is interrupted, the cost of local_flush_tlb_all() is
-	 * rather trivial.
+	 * Use atomic ops. Otherwise, the load/increment/store sequence from
+	 * a "++" operation can have the line stolen between the load & store.
+	 * The overhead of the atomic op in negligible in this case & offers
+	 * significant benefit for the brief periods where lots of cpus
+	 * are simultaneously flushing TLBs.
 	 */
-	on_each_cpu((void (*)(void *))local_finish_flush_tlb_mm, mm, 1, 1);
+	ia64_fetchadd(1, &local_tlb_flush_counts[smp_processor_id()].count, acq);
+	local_flush_tlb_all();
 }
 
-/*
- * Run a function on another CPU
- *  <func>	The function to run. This must be fast and non-blocking.
- *  <info>	An arbitrary pointer to pass to the function.
- *  <nonatomic>	Currently unused.
- *  <wait>	If true, wait until function has completed on other CPUs.
- *  [RETURNS]   0 on success, else a negative status code.
- *
- * Does not return until the remote CPU is nearly ready to execute <func>
- * or is or has executed.
- */
+#define FLUSH_DELAY	5 /* Usec backoff to eliminate excessive cacheline bouncing */
 
-int
-smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int nonatomic,
-			  int wait)
+void
+smp_flush_tlb_cpumask(cpumask_t xcpumask)
 {
-	struct call_data_struct data;
-	int cpus = 1;
-	int me = get_cpu(); /* prevent preemption and reschedule on another processor */
-
-	if (cpuid == me) {
-		printk(KERN_INFO "%s: trying to call self\n", __FUNCTION__);
-		put_cpu();
-		return -EBUSY;
-	}
+	unsigned short *counts = __ia64_per_cpu_var(shadow_flush_counts);
+	cpumask_t cpumask = xcpumask;
+	int mycpu, cpu, flush_mycpu = 0;
 
-	data.func = func;
-	data.info = info;
-	atomic_set(&data.started, 0);
-	data.wait = wait;
-	if (wait)
-		atomic_set(&data.finished, 0);
+	preempt_disable();
+	mycpu = smp_processor_id();
 
-	spin_lock_bh(&call_lock);
+	for_each_cpu_mask(cpu, cpumask)
+		counts[cpu] = local_tlb_flush_counts[cpu].count & 0xffff;
 
-	call_data = &data;
-	mb();	/* ensure store to call_data precedes setting of IPI_CALL_FUNC */
-  	send_IPI_single(cpuid, IPI_CALL_FUNC);
+	mb();
+	for_each_cpu_mask(cpu, cpumask) {
+		if (cpu == mycpu)
+			flush_mycpu = 1;
+		else
+			smp_send_local_flush_tlb(cpu);
+	}
 
-	/* Wait for response */
-	while (atomic_read(&data.started) != cpus)
-		cpu_relax();
+	if (flush_mycpu)
+		smp_local_flush_tlb();
 
-	if (wait)
-		while (atomic_read(&data.finished) != cpus)
-			cpu_relax();
-	call_data = NULL;
+	for_each_cpu_mask(cpu, cpumask)
+		while(counts[cpu] == (local_tlb_flush_counts[cpu].count & 0xffff))
+			udelay(FLUSH_DELAY);
 
-	spin_unlock_bh(&call_lock);
-	put_cpu();
-	return 0;
+	preempt_enable();
 }
-EXPORT_SYMBOL(smp_call_function_single);
 
-/*
- * this function sends a 'generic call function' IPI to all other CPUs
- * in the system.
- */
-
-/*
- *  [SUMMARY]	Run a function on all other CPUs.
- *  <func>	The function to run. This must be fast and non-blocking.
- *  <info>	An arbitrary pointer to pass to the function.
- *  <nonatomic>	currently unused.
- *  <wait>	If true, wait (atomically) until function has completed on other CPUs.
- *  [RETURNS]   0 on success, else a negative status code.
- *
- * Does not return until remote CPUs are nearly ready to execute <func> or are or have
- * executed.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-int
-smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wait)
+void
+smp_flush_tlb_all (void)
 {
-	struct call_data_struct data;
-	int cpus = num_online_cpus()-1;
-
-	if (!cpus)
-		return 0;
-
-	/* Can deadlock when called with interrupts disabled */
-	WARN_ON(irqs_disabled());
-
-	data.func = func;
-	data.info = info;
-	atomic_set(&data.started, 0);
-	data.wait = wait;
-	if (wait)
-		atomic_set(&data.finished, 0);
-
-	spin_lock(&call_lock);
-
-	call_data = &data;
-	mb();	/* ensure store to call_data precedes setting of IPI_CALL_FUNC */
-	send_IPI_allbutself(IPI_CALL_FUNC);
+	on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1);
+}
 
-	/* Wait for response */
-	while (atomic_read(&data.started) != cpus)
-		cpu_relax();
+void
+smp_flush_tlb_mm (struct mm_struct *mm)
+{
+	cpumask_var_t cpus;
+	preempt_disable();
+	/* this happens for the common case of a single-threaded fork():  */
+	if (likely(mm == current->active_mm && atomic_read(&mm->mm_users) == 1))
+	{
+		local_finish_flush_tlb_mm(mm);
+		preempt_enable();
+		return;
+	}
+	if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) {
+		smp_call_function((void (*)(void *))local_finish_flush_tlb_mm,
+			mm, 1);
+	} else {
+		cpumask_copy(cpus, mm_cpumask(mm));
+		smp_call_function_many(cpus,
+			(void (*)(void *))local_finish_flush_tlb_mm, mm, 1);
+		free_cpumask_var(cpus);
+	}
+	local_irq_disable();
+	local_finish_flush_tlb_mm(mm);
+	local_irq_enable();
+	preempt_enable();
+}
 
-	if (wait)
-		while (atomic_read(&data.finished) != cpus)
-			cpu_relax();
-	call_data = NULL;
+void arch_send_call_function_single_ipi(int cpu)
+{
+	send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE);
+}
 
-	spin_unlock(&call_lock);
-	return 0;
+void arch_send_call_function_ipi_mask(const struct cpumask *mask)
+{
+	send_IPI_mask(mask, IPI_CALL_FUNC);
 }
-EXPORT_SYMBOL(smp_call_function);
 
 /*
  * this function calls the 'stop' function on all other CPUs in the system.
@@ -372,7 +335,7 @@ smp_send_stop (void)
 	send_IPI_allbutself(IPI_CPU_STOP);
 }
 
-int __init
+int
 setup_profiling_timer (unsigned int multiplier)
 {
 	return -EINVAL;
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 400a4898712..547a48d78bd 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -21,7 +21,6 @@
  * 05/01/30 Suresh Siddha <suresh.b.siddha@intel.com>
  *						Setup cpu_sibling_map and cpu_core_map
  */
-#include <linux/config.h>
 
 #include <linux/module.h>
 #include <linux/acpi.h>
@@ -36,30 +35,29 @@
 #include <linux/mm.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/efi.h>
 #include <linux/percpu.h>
 #include <linux/bitops.h>
 
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/cache.h>
 #include <asm/current.h>
 #include <asm/delay.h>
-#include <asm/ia32.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/machvec.h>
 #include <asm/mca.h>
 #include <asm/page.h>
+#include <asm/paravirt.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/ptrace.h>
 #include <asm/sal.h>
-#include <asm/system.h>
 #include <asm/tlbflush.h>
 #include <asm/unistd.h>
+#include <asm/sn/arch.h>
 
 #define SMP_DEBUG 0
 
@@ -70,12 +68,11 @@
 #endif
 
 #ifdef CONFIG_HOTPLUG_CPU
-/*
- * Store all idle threads, this can be reused instead of creating
- * a new thread. Also avoids complicated thread destroy functionality
- * for idle threads.
- */
-struct task_struct *idle_thread_array[NR_CPUS];
+#ifdef CONFIG_PERMIT_BSP_REMOVE
+#define bsp_remove_ok	1
+#else
+#define bsp_remove_ok	0
+#endif
 
 /*
  * Global array allocated for NR_CPUS at boot time
@@ -90,13 +87,7 @@ struct sal_to_os_boot *sal_state_for_booting_cpu = &sal_boot_rendez_state[0];
 
 #define set_brendez_area(x) (sal_state_for_booting_cpu = &sal_boot_rendez_state[(x)]);
 
-#define get_idle_for_cpu(x)		(idle_thread_array[(x)])
-#define set_idle_for_cpu(x,p)	(idle_thread_array[(x)] = (p))
-
 #else
-
-#define get_idle_for_cpu(x)		(NULL)
-#define set_idle_for_cpu(x,p)
 #define set_brendez_area(x)
 #endif
 
@@ -104,7 +95,7 @@ struct sal_to_os_boot *sal_state_for_booting_cpu = &sal_boot_rendez_state[0];
 /*
  * ITC synchronization related stuff:
  */
-#define MASTER	0
+#define MASTER	(0)
 #define SLAVE	(SMP_CACHE_BYTES/8)
 
 #define NUM_ROUNDS	64	/* magic value */
@@ -115,27 +106,22 @@ static volatile unsigned long go[SLAVE + 1];
 
 #define DEBUG_ITC_SYNC	0
 
-extern void __devinit calibrate_delay (void);
 extern void start_ap (void);
 extern unsigned long ia64_iobase;
 
-task_t *task_for_booting_cpu;
+struct task_struct *task_for_booting_cpu;
 
 /*
  * State for each CPU
  */
 DEFINE_PER_CPU(int, cpu_state);
 
-/* Bitmasks of currently online, and possible CPUs */
-cpumask_t cpu_online_map;
-EXPORT_SYMBOL(cpu_online_map);
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
-
 cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
-cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
+EXPORT_SYMBOL(cpu_core_map);
+DEFINE_PER_CPU_SHARED_ALIGNED(cpumask_t, cpu_sibling_map);
+EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
+
 int smp_num_siblings = 1;
-int smp_num_cpucores = 1;
 
 /* which logical CPU number maps to which CPU (physical APIC ID) */
 volatile int ia64_cpu_to_sapicid[NR_CPUS];
@@ -151,6 +137,27 @@ char __initdata no_int_routing;
 
 unsigned char smp_int_redirect; /* are INT and IPI redirectable by the chipset? */
 
+#ifdef CONFIG_FORCE_CPEI_RETARGET
+#define CPEI_OVERRIDE_DEFAULT	(1)
+#else
+#define CPEI_OVERRIDE_DEFAULT	(0)
+#endif
+
+unsigned int force_cpei_retarget = CPEI_OVERRIDE_DEFAULT;
+
+static int __init
+cmdl_force_cpei(char *str)
+{
+	int value=0;
+
+	get_option (&str, &value);
+	force_cpei_retarget = value;
+
+	return 1;
+}
+
+__setup("force_cpei=", cmdl_force_cpei);
+
 static int __init
 nointroute (char *str)
 {
@@ -161,6 +168,27 @@ nointroute (char *str)
 
 __setup("nointroute", nointroute);
 
+static void fix_b0_for_bsp(void)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+	int cpuid;
+	static int fix_bsp_b0 = 1;
+
+	cpuid = smp_processor_id();
+
+	/*
+	 * Cache the b0 value on the first AP that comes up
+	 */
+	if (!(fix_bsp_b0 && cpuid))
+		return;
+
+	sal_boot_rendez_state[0].br[0] = sal_boot_rendez_state[cpuid].br[0];
+	printk ("Fixed BSP b0 value from CPU %d\n", cpuid);
+
+	fix_bsp_b0 = 0;
+#endif
+}
+
 void
 sync_master (void *arg)
 {
@@ -270,7 +298,7 @@ ia64_sync_itc (unsigned int master)
 
 	go[MASTER] = 1;
 
-	if (smp_call_function_single(master, sync_master, NULL, 1, 0) < 0) {
+	if (smp_call_function_single(master, sync_master, NULL, 0) < 0) {
 		printk(KERN_ERR "sync_itc: failed to get attention of CPU %u!\n", master);
 		return;
 	}
@@ -319,16 +347,17 @@ ia64_sync_itc (unsigned int master)
 /*
  * Ideally sets up per-cpu profiling hooks.  Doesn't do much now...
  */
-static inline void __devinit
-smp_setup_percpu_timer (void)
+static inline void smp_setup_percpu_timer(void)
 {
 }
 
-static void __devinit
+static void
 smp_callin (void)
 {
-	int cpuid, phys_id;
+	int cpuid, phys_id, itc_master;
+	struct cpuinfo_ia64 *last_cpuinfo, *this_cpuinfo;
 	extern void ia64_init_itm(void);
+	extern volatile int time_keeper_id;
 
 #ifdef CONFIG_PERFMON
 	extern void pfm_init_percpu(void);
@@ -336,6 +365,7 @@ smp_callin (void)
 
 	cpuid = smp_processor_id();
 	phys_id = hard_smp_processor_id();
+	itc_master = time_keeper_id;
 
 	if (cpu_online(cpuid)) {
 		printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n",
@@ -343,10 +373,21 @@ smp_callin (void)
 		BUG();
 	}
 
-	lock_ipi_calllock();
-	cpu_set(cpuid, cpu_online_map);
-	unlock_ipi_calllock();
+	fix_b0_for_bsp();
+
+	/*
+	 * numa_node_id() works after this.
+	 */
+	set_numa_node(cpu_to_node_map[cpuid]);
+	set_numa_mem(local_memory_node(cpu_to_node_map[cpuid]));
+
+	spin_lock(&vector_lock);
+	/* Setup the per cpu irq handling data structures */
+	__setup_vector_irq(cpuid);
+	notify_cpu_starting(cpuid);
+	set_cpu_online(cpuid, true);
 	per_cpu(cpu_state, cpuid) = CPU_ONLINE;
+	spin_unlock(&vector_lock);
 
 	smp_setup_percpu_timer();
 
@@ -365,20 +406,30 @@ smp_callin (void)
 		 * calls spin_unlock_bh(), which calls spin_unlock_bh(), which calls
 		 * local_bh_enable(), which bugs out if irqs are not enabled...
 		 */
-		Dprintk("Going to syncup ITC with BP.\n");
-		ia64_sync_itc(0);
+		Dprintk("Going to syncup ITC with ITC Master.\n");
+		ia64_sync_itc(itc_master);
 	}
 
 	/*
 	 * Get our bogomips.
 	 */
 	ia64_init_itm();
-	calibrate_delay();
-	local_cpu_data->loops_per_jiffy = loops_per_jiffy;
 
-#ifdef CONFIG_IA32_SUPPORT
-	ia32_gdt_init();
-#endif
+	/*
+	 * Delay calibration can be skipped if new processor is identical to the
+	 * previous processor.
+	 */
+	last_cpuinfo = cpu_data(cpuid - 1);
+	this_cpuinfo = local_cpu_data;
+	if (last_cpuinfo->itc_freq != this_cpuinfo->itc_freq ||
+	    last_cpuinfo->proc_freq != this_cpuinfo->proc_freq ||
+	    last_cpuinfo->features != this_cpuinfo->features ||
+	    last_cpuinfo->revision != this_cpuinfo->revision ||
+	    last_cpuinfo->family != this_cpuinfo->family ||
+	    last_cpuinfo->archrev != this_cpuinfo->archrev ||
+	    last_cpuinfo->model != this_cpuinfo->model)
+		calibrate_delay();
+	local_cpu_data->loops_per_jiffy = loops_per_jiffy;
 
 	/*
 	 * Allow the master to continue.
@@ -391,74 +442,29 @@ smp_callin (void)
 /*
  * Activate a secondary processor.  head.S calls this.
  */
-int __devinit
+int
 start_secondary (void *unused)
 {
 	/* Early console may use I/O ports */
 	ia64_set_kr(IA64_KR_IO_BASE, __pa(ia64_iobase));
+#ifndef CONFIG_PRINTK_TIME
 	Dprintk("start_secondary: starting CPU 0x%x\n", hard_smp_processor_id());
+#endif
 	efi_map_pal_code();
 	cpu_init();
+	preempt_disable();
 	smp_callin();
 
-	cpu_idle();
+	cpu_startup_entry(CPUHP_ONLINE);
 	return 0;
 }
 
-struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
-{
-	return NULL;
-}
-
-struct create_idle {
-	struct task_struct *idle;
-	struct completion done;
-	int cpu;
-};
-
-void
-do_fork_idle(void *_c_idle)
-{
-	struct create_idle *c_idle = _c_idle;
-
-	c_idle->idle = fork_idle(c_idle->cpu);
-	complete(&c_idle->done);
-}
-
-static int __devinit
-do_boot_cpu (int sapicid, int cpu)
+static int
+do_boot_cpu (int sapicid, int cpu, struct task_struct *idle)
 {
 	int timeout;
-	struct create_idle c_idle = {
-		.cpu	= cpu,
-		.done	= COMPLETION_INITIALIZER(c_idle.done),
-	};
-	DECLARE_WORK(work, do_fork_idle, &c_idle);
-
- 	c_idle.idle = get_idle_for_cpu(cpu);
- 	if (c_idle.idle) {
-		init_idle(c_idle.idle, cpu);
- 		goto do_rest;
-	}
-
-	/*
-	 * We can't use kernel_thread since we must avoid to reschedule the child.
-	 */
-	if (!keventd_up() || current_is_keventd())
-		work.func(work.data);
-	else {
-		schedule_work(&work);
-		wait_for_completion(&c_idle.done);
-	}
-
-	if (IS_ERR(c_idle.idle))
-		panic("failed fork for CPU %d", cpu);
-
-	set_idle_for_cpu(cpu, c_idle.idle);
-
-do_rest:
-	task_for_booting_cpu = c_idle.idle;
 
+	task_for_booting_cpu = idle;
 	Dprintk("Sending wakeup vector %lu to AP 0x%x/0x%x.\n", ap_wakeup_vector, cpu, sapicid);
 
 	set_brendez_area(cpu);
@@ -478,7 +484,7 @@ do_rest:
 	if (!cpu_isset(cpu, cpu_callin_map)) {
 		printk(KERN_ERR "Processor 0x%x/0x%x is stuck.\n", cpu, sapicid);
 		ia64_cpu_to_sapicid[cpu] = -1;
-		cpu_clear(cpu, cpu_online_map);  /* was set in smp_callin() */
+		set_cpu_online(cpu, false);  /* was set in smp_callin() */
 		return -EINVAL;
 	}
 	return 0;
@@ -505,21 +511,17 @@ smp_build_cpu_map (void)
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
 		ia64_cpu_to_sapicid[cpu] = -1;
-#ifdef CONFIG_HOTPLUG_CPU
-		cpu_set(cpu, cpu_possible_map);
-#endif
 	}
 
 	ia64_cpu_to_sapicid[0] = boot_cpu_id;
-	cpus_clear(cpu_present_map);
-	cpu_set(0, cpu_present_map);
-	cpu_set(0, cpu_possible_map);
+	init_cpu_present(cpumask_of(0));
+	set_cpu_possible(0, true);
 	for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) {
 		sapicid = smp_boot_data.cpu_phys_id[i];
 		if (sapicid == boot_cpu_id)
 			continue;
-		cpu_set(cpu, cpu_present_map);
-		cpu_set(cpu, cpu_possible_map);
+		set_cpu_present(cpu, true);
+		set_cpu_possible(cpu, true);
 		ia64_cpu_to_sapicid[cpu] = sapicid;
 		cpu++;
 	}
@@ -539,10 +541,6 @@ smp_prepare_cpus (unsigned int max_cpus)
 
 	smp_setup_percpu_timer();
 
-	/*
-	 * We have the boot CPU online for sure.
-	 */
-	cpu_set(0, cpu_online_map);
 	cpu_set(0, cpu_callin_map);
 
 	local_cpu_data->loops_per_jiffy = loops_per_jiffy;
@@ -557,59 +555,34 @@ smp_prepare_cpus (unsigned int max_cpus)
 	 */
 	if (!max_cpus) {
 		printk(KERN_INFO "SMP mode deactivated.\n");
-		cpus_clear(cpu_online_map);
-		cpus_clear(cpu_present_map);
-		cpus_clear(cpu_possible_map);
-		cpu_set(0, cpu_online_map);
-		cpu_set(0, cpu_present_map);
-		cpu_set(0, cpu_possible_map);
+		init_cpu_online(cpumask_of(0));
+		init_cpu_present(cpumask_of(0));
+		init_cpu_possible(cpumask_of(0));
 		return;
 	}
 }
 
-void __devinit smp_prepare_boot_cpu(void)
+void smp_prepare_boot_cpu(void)
 {
-	cpu_set(smp_processor_id(), cpu_online_map);
+	set_cpu_online(smp_processor_id(), true);
 	cpu_set(smp_processor_id(), cpu_callin_map);
+	set_numa_node(cpu_to_node_map[smp_processor_id()]);
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+	paravirt_post_smp_prepare_boot_cpu();
 }
 
-/*
- * mt_info[] is a temporary store for all info returned by
- * PAL_LOGICAL_TO_PHYSICAL, to be copied into cpuinfo_ia64 when the
- * specific cpu comes.
- */
-static struct {
-	__u32   socket_id;
-	__u16   core_id;
-	__u16   thread_id;
-	__u16   proc_fixed_addr;
-	__u8    valid;
-} mt_info[NR_CPUS] __devinitdata;
-
 #ifdef CONFIG_HOTPLUG_CPU
 static inline void
-remove_from_mtinfo(int cpu)
-{
-	int i;
-
-	for_each_cpu(i)
-		if (mt_info[i].valid &&  mt_info[i].socket_id ==
-		    				cpu_data(cpu)->socket_id)
-			mt_info[i].valid = 0;
-}
-
-static inline void
 clear_cpu_sibling_map(int cpu)
 {
 	int i;
 
-	for_each_cpu_mask(i, cpu_sibling_map[cpu])
-		cpu_clear(cpu, cpu_sibling_map[i]);
+	for_each_cpu_mask(i, per_cpu(cpu_sibling_map, cpu))
+		cpu_clear(cpu, per_cpu(cpu_sibling_map, i));
 	for_each_cpu_mask(i, cpu_core_map[cpu])
 		cpu_clear(cpu, cpu_core_map[i]);
 
-	cpu_sibling_map[cpu] = cpu_core_map[cpu] = CPU_MASK_NONE;
+	per_cpu(cpu_sibling_map, cpu) = cpu_core_map[cpu] = CPU_MASK_NONE;
 }
 
 static void
@@ -620,7 +593,7 @@ remove_siblinginfo(int cpu)
 	if (cpu_data(cpu)->threads_per_core == 1 &&
 	    cpu_data(cpu)->cores_per_socket == 1) {
 		cpu_clear(cpu, cpu_core_map[cpu]);
-		cpu_clear(cpu, cpu_sibling_map[cpu]);
+		cpu_clear(cpu, per_cpu(cpu_sibling_map, cpu));
 		return;
 	}
 
@@ -628,15 +601,50 @@ remove_siblinginfo(int cpu)
 
 	/* remove it from all sibling map's */
 	clear_cpu_sibling_map(cpu);
+}
+
+extern void fixup_irqs(void);
 
-	/* if this cpu is the last in the core group, remove all its info 
-	 * from mt_info structure
+int migrate_platform_irqs(unsigned int cpu)
+{
+	int new_cpei_cpu;
+	struct irq_data *data = NULL;
+	const struct cpumask *mask;
+	int 		retval = 0;
+
+	/*
+	 * dont permit CPEI target to removed.
 	 */
-	if (last)
-		remove_from_mtinfo(cpu);
+	if (cpe_vector > 0 && is_cpu_cpei_target(cpu)) {
+		printk ("CPU (%d) is CPEI Target\n", cpu);
+		if (can_cpei_retarget()) {
+			/*
+			 * Now re-target the CPEI to a different processor
+			 */
+			new_cpei_cpu = cpumask_any(cpu_online_mask);
+			mask = cpumask_of(new_cpei_cpu);
+			set_cpei_target_cpu(new_cpei_cpu);
+			data = irq_get_irq_data(ia64_cpe_irq);
+			/*
+			 * Switch for now, immediately, we need to do fake intr
+			 * as other interrupts, but need to study CPEI behaviour with
+			 * polling before making changes.
+			 */
+			if (data && data->chip) {
+				data->chip->irq_disable(data);
+				data->chip->irq_set_affinity(data, mask, false);
+				data->chip->irq_enable(data);
+				printk ("Re-targeting CPEI to cpu %d\n", new_cpei_cpu);
+			}
+		}
+		if (!data) {
+			printk ("Unable to retarget CPEI, offline cpu [%d] failed\n", cpu);
+			retval = -EBUSY;
+		}
+	}
+	return retval;
 }
 
-extern void fixup_irqs(void);
 /* must be called with cpucontrol mutex held */
 int __cpu_disable(void)
 {
@@ -645,11 +653,24 @@ int __cpu_disable(void)
 	/*
 	 * dont permit boot processor for now
 	 */
-	if (cpu == 0)
+	if (cpu == 0 && !bsp_remove_ok) {
+		printk ("Your platform does not support removal of BSP\n");
+		return (-EBUSY);
+	}
+
+	if (ia64_platform_is("sn2")) {
+		if (!sn_cpu_disable_allowed(cpu))
+			return -EBUSY;
+	}
+
+	set_cpu_online(cpu, false);
+
+	if (migrate_platform_irqs(cpu)) {
+		set_cpu_online(cpu, true);
 		return -EBUSY;
+	}
 
 	remove_siblinginfo(cpu);
-	cpu_clear(cpu, cpu_online_map);
 	fixup_irqs();
 	local_flush_tlb_all();
 	cpu_clear(cpu, cpu_callin_map);
@@ -671,17 +692,6 @@ void __cpu_die(unsigned int cpu)
 	}
  	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
 }
-#else /* !CONFIG_HOTPLUG_CPU */
-int __cpu_disable(void)
-{
-	return -ENOSYS;
-}
-
-void __cpu_die(unsigned int cpu)
-{
-	/* We said "no" in __cpu_disable */
-	BUG();
-}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void
@@ -702,8 +712,7 @@ smp_cpus_done (unsigned int dummy)
 	       (int)num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100);
 }
 
-static inline void __devinit
-set_cpu_sibling_map(int cpu)
+static inline void set_cpu_sibling_map(int cpu)
 {
 	int i;
 
@@ -712,15 +721,15 @@ set_cpu_sibling_map(int cpu)
 			cpu_set(i, cpu_core_map[cpu]);
 			cpu_set(cpu, cpu_core_map[i]);
 			if (cpu_data(cpu)->core_id == cpu_data(i)->core_id) {
-				cpu_set(i, cpu_sibling_map[cpu]);
-				cpu_set(cpu, cpu_sibling_map[i]);
+				cpu_set(i, per_cpu(cpu_sibling_map, cpu));
+				cpu_set(cpu, per_cpu(cpu_sibling_map, i));
 			}
 		}
 	}
 }
 
-int __devinit
-__cpu_up (unsigned int cpu)
+int
+__cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int ret;
 	int sapicid;
@@ -738,13 +747,13 @@ __cpu_up (unsigned int cpu)
 
 	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 	/* Processor goes to start_secondary(), sets online flag */
-	ret = do_boot_cpu(sapicid, cpu);
+	ret = do_boot_cpu(sapicid, cpu, tidle);
 	if (ret < 0)
 		return ret;
 
 	if (cpu_data(cpu)->threads_per_core == 1 &&
 	    cpu_data(cpu)->cores_per_socket == 1) {
-		cpu_set(cpu, cpu_sibling_map[cpu]);
+		cpu_set(cpu, per_cpu(cpu_sibling_map, cpu));
 		cpu_set(cpu, cpu_core_map[cpu]);
 		return 0;
 	}
@@ -755,7 +764,7 @@ __cpu_up (unsigned int cpu)
 }
 
 /*
- * Assume that CPU's have been discovered by some platform-dependent interface.  For
+ * Assume that CPUs have been discovered by some platform-dependent interface.  For
  * SoftSDV/Lion, that would be ACPI.
  *
  * Setup of the IPI irq handler is done in irq.c:init_IRQ_SMP().
@@ -769,7 +778,7 @@ init_smp_config(void)
 	} *ap_startup;
 	long sal_ret;
 
-	/* Tell SAL where to drop the AP's.  */
+	/* Tell SAL where to drop the APs.  */
 	ap_startup = (struct fptr *) start_ap;
 	sal_ret = ia64_sal_set_vectors(SAL_VECTOR_OS_BOOT_RENDEZ,
 				       ia64_tpa(ap_startup->fp), ia64_tpa(ap_startup->gp), 0, 0, 0, 0);
@@ -778,106 +787,72 @@ init_smp_config(void)
 		       ia64_sal_strerror(sal_ret));
 }
 
-static inline int __devinit
-check_for_mtinfo_index(void)
-{
-	int i;
-	
-	for_each_cpu(i)
-		if (!mt_info[i].valid)
-			return i;
-
-	return -1;
-}
-
-/*
- * Search the mt_info to find out if this socket's cid/tid information is
- * cached or not. If the socket exists, fill in the core_id and thread_id 
- * in cpuinfo
- */
-static int __devinit
-check_for_new_socket(__u16 logical_address, struct cpuinfo_ia64 *c)
-{
-	int i;
-	__u32 sid = c->socket_id;
-
-	for_each_cpu(i) {
-		if (mt_info[i].valid && mt_info[i].proc_fixed_addr == logical_address
-		    && mt_info[i].socket_id == sid) {
-			c->core_id = mt_info[i].core_id;
-			c->thread_id = mt_info[i].thread_id;
-			return 1; /* not a new socket */
-		}
-	}
-	return 0;
-}
-
 /*
  * identify_siblings(cpu) gets called from identify_cpu. This populates the 
  * information related to logical execution units in per_cpu_data structure.
  */
-void __devinit
-identify_siblings(struct cpuinfo_ia64 *c)
+void identify_siblings(struct cpuinfo_ia64 *c)
 {
-	s64 status;
+	long status;
 	u16 pltid;
-	u64 proc_fixed_addr;
-	int count, i;
 	pal_logical_to_physical_t info;
 
-	if (smp_num_cpucores == 1 && smp_num_siblings == 1)
-		return;
+	status = ia64_pal_logical_to_phys(-1, &info);
+	if (status != PAL_STATUS_SUCCESS) {
+		if (status != PAL_STATUS_UNIMPLEMENTED) {
+			printk(KERN_ERR
+				"ia64_pal_logical_to_phys failed with %ld\n",
+				status);
+			return;
+		}
 
-	if ((status = ia64_pal_logical_to_phys(0, &info)) != PAL_STATUS_SUCCESS) {
-		printk(KERN_ERR "ia64_pal_logical_to_phys failed with %ld\n",
-		       status);
-		return;
-	}
-	if ((status = ia64_sal_physical_id_info(&pltid)) != PAL_STATUS_SUCCESS) {
-		printk(KERN_ERR "ia64_sal_pltid failed with %ld\n", status);
-		return;
+		info.overview_ppid = 0;
+		info.overview_cpp  = 1;
+		info.overview_tpc  = 1;
 	}
-	if ((status = ia64_pal_fixed_addr(&proc_fixed_addr)) != PAL_STATUS_SUCCESS) {
-		printk(KERN_ERR "ia64_pal_fixed_addr failed with %ld\n", status);
+
+	status = ia64_sal_physical_id_info(&pltid);
+	if (status != PAL_STATUS_SUCCESS) {
+		if (status != PAL_STATUS_UNIMPLEMENTED)
+			printk(KERN_ERR
+				"ia64_sal_pltid failed with %ld\n",
+				status);
 		return;
 	}
 
 	c->socket_id =  (pltid << 8) | info.overview_ppid;
-	c->cores_per_socket = info.overview_cpp;
-	c->threads_per_core = info.overview_tpc;
-	count = c->num_log = info.overview_num_log;
 
-	/* If the thread and core id information is already cached, then
-	 * we will simply update cpu_info and return. Otherwise, we will
-	 * do the PAL calls and cache core and thread id's of all the siblings.
-	 */
-	if (check_for_new_socket(proc_fixed_addr, c))
+	if (info.overview_cpp == 1 && info.overview_tpc == 1)
 		return;
 
-	for (i = 0; i < count; i++) {
-		int index;
-
-		if (i && (status = ia64_pal_logical_to_phys(i, &info))
-			  != PAL_STATUS_SUCCESS) {
-                	printk(KERN_ERR "ia64_pal_logical_to_phys failed"
-					" with %ld\n", status);
-                	return;
-		}
-		if (info.log2_la == proc_fixed_addr) {
-			c->core_id = info.log1_cid;
-			c->thread_id = info.log1_tid;
-		}
+	c->cores_per_socket = info.overview_cpp;
+	c->threads_per_core = info.overview_tpc;
+	c->num_log = info.overview_num_log;
 
-		index = check_for_mtinfo_index();
-		/* We will not do the mt_info caching optimization in this case.
-		 */
-		if (index < 0)
-			continue;
+	c->core_id = info.log1_cid;
+	c->thread_id = info.log1_tid;
+}
 
-		mt_info[index].valid = 1;
-		mt_info[index].socket_id = c->socket_id;
-		mt_info[index].core_id = info.log1_cid;
-		mt_info[index].thread_id = info.log1_tid;
-		mt_info[index].proc_fixed_addr = info.log2_la;
+/*
+ * returns non zero, if multi-threading is enabled
+ * on at least one physical package. Due to hotplug cpu
+ * and (maxcpus=), all threads may not necessarily be enabled
+ * even though the processor supports multi-threading.
+ */
+int is_multithreading_enabled(void)
+{
+	int i, j;
+
+	for_each_present_cpu(i) {
+		for_each_present_cpu(j) {
+			if (j == i)
+				continue;
+			if ((cpu_data(j)->socket_id == cpu_data(i)->socket_id)) {
+				if (cpu_data(j)->core_id == cpu_data(i)->core_id)
+					return 1;
+			}
+		}
 	}
+	return 0;
 }
+EXPORT_SYMBOL_GPL(is_multithreading_enabled);
diff --git a/arch/ia64/kernel/stacktrace.c b/arch/ia64/kernel/stacktrace.c
new file mode 100644
index 00000000000..5af2783a87f
--- /dev/null
+++ b/arch/ia64/kernel/stacktrace.c
@@ -0,0 +1,39 @@
+/*
+ * arch/ia64/kernel/stacktrace.c
+ *
+ * Stack trace management functions
+ *
+ */
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+#include <linux/module.h>
+
+static void
+ia64_do_save_stack(struct unw_frame_info *info, void *arg)
+{
+	struct stack_trace *trace = arg;
+	unsigned long ip;
+	int skip = trace->skip;
+
+	trace->nr_entries = 0;
+	do {
+		unw_get_ip(info, &ip);
+		if (ip == 0)
+			break;
+		if (skip == 0) {
+			trace->entries[trace->nr_entries++] = ip;
+			if (trace->nr_entries == trace->max_entries)
+				break;
+		} else
+			skip--;
+	} while (unw_unwind(info) >= 0);
+}
+
+/*
+ * Save stack-backtrace addresses into a stack_trace buffer.
+ */
+void save_stack_trace(struct stack_trace *trace)
+{
+	unw_init_running(ia64_do_save_stack, trace);
+}
+EXPORT_SYMBOL(save_stack_trace);
diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c
index f2dbcd1db0d..41e33f84c18 100644
--- a/arch/ia64/kernel/sys_ia64.c
+++ b/arch/ia64/kernel/sys_ia64.c
@@ -5,7 +5,6 @@
  * Copyright (C) 1999-2000, 2002-2003, 2005 Hewlett-Packard Co
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  */
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
@@ -14,7 +13,6 @@
 #include <linux/shm.h>
 #include <linux/file.h>		/* doh, must come after sched.h... */
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/syscalls.h>
 #include <linux/highuid.h>
 #include <linux/hugetlb.h>
@@ -27,19 +25,26 @@ arch_get_unmapped_area (struct file *filp, unsigned long addr, unsigned long len
 			unsigned long pgoff, unsigned long flags)
 {
 	long map_shared = (flags & MAP_SHARED);
-	unsigned long start_addr, align_mask = PAGE_SIZE - 1;
+	unsigned long align_mask = 0;
 	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
+	struct vm_unmapped_area_info info;
 
 	if (len > RGN_MAP_LIMIT)
 		return -ENOMEM;
 
+	/* handle fixed mapping: prevent overlap with huge pages */
+	if (flags & MAP_FIXED) {
+		if (is_hugepage_only_range(mm, addr, len))
+			return -EINVAL;
+		return addr;
+	}
+
 #ifdef CONFIG_HUGETLB_PAGE
 	if (REGION_NUMBER(addr) == RGN_HPAGE)
 		addr = 0;
 #endif
 	if (!addr)
-		addr = mm->free_area_cache;
+		addr = TASK_UNMAPPED_BASE;
 
 	if (map_shared && (TASK_SIZE > 0xfffffffful))
 		/*
@@ -48,28 +53,15 @@ arch_get_unmapped_area (struct file *filp, unsigned long addr, unsigned long len
 		 * tasks, we prefer to avoid exhausting the address space too quickly by
 		 * limiting alignment to a single page.
 		 */
-		align_mask = SHMLBA - 1;
-
-  full_search:
-	start_addr = addr = (addr + align_mask) & ~align_mask;
-
-	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-		/* At this point:  (!vma || addr < vma->vm_end). */
-		if (TASK_SIZE - len < addr || RGN_MAP_LIMIT - len < REGION_OFFSET(addr)) {
-			if (start_addr != TASK_UNMAPPED_BASE) {
-				/* Start a new search --- just in case we missed some holes.  */
-				addr = TASK_UNMAPPED_BASE;
-				goto full_search;
-			}
-			return -ENOMEM;
-		}
-		if (!vma || addr + len <= vma->vm_start) {
-			/* Remember the address where we stopped this search:  */
-			mm->free_area_cache = addr + len;
-			return addr;
-		}
-		addr = (vma->vm_end + align_mask) & ~align_mask;
-	}
+		align_mask = PAGE_MASK & (SHMLBA - 1);
+
+	info.flags = 0;
+	info.length = len;
+	info.low_limit = addr;
+	info.high_limit = TASK_SIZE;
+	info.align_mask = align_mask;
+	info.align_offset = 0;
+	return vm_unmapped_area(&info);
 }
 
 asmlinkage long
@@ -95,51 +87,7 @@ sys_getpagesize (void)
 asmlinkage unsigned long
 ia64_brk (unsigned long brk)
 {
-	unsigned long rlim, retval, newbrk, oldbrk;
-	struct mm_struct *mm = current->mm;
-
-	/*
-	 * Most of this replicates the code in sys_brk() except for an additional safety
-	 * check and the clearing of r8.  However, we can't call sys_brk() because we need
-	 * to acquire the mmap_sem before we can do the test...
-	 */
-	down_write(&mm->mmap_sem);
-
-	if (brk < mm->end_code)
-		goto out;
-	newbrk = PAGE_ALIGN(brk);
-	oldbrk = PAGE_ALIGN(mm->brk);
-	if (oldbrk == newbrk)
-		goto set_brk;
-
-	/* Always allow shrinking brk. */
-	if (brk <= mm->brk) {
-		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
-			goto set_brk;
-		goto out;
-	}
-
-	/* Check against unimplemented/unmapped addresses: */
-	if ((newbrk - oldbrk) > RGN_MAP_LIMIT || REGION_OFFSET(newbrk) > RGN_MAP_LIMIT)
-		goto out;
-
-	/* Check against rlimit.. */
-	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
-	if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
-		goto out;
-
-	/* Check against existing mmap mappings. */
-	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
-		goto out;
-
-	/* Ok, looks good - let it rip. */
-	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
-		goto out;
-set_brk:
-	mm->brk = brk;
-out:
-	retval = mm->brk;
-	up_write(&mm->mmap_sem);
+	unsigned long retval = sys_brk(brk);
 	force_successful_syscall_return();
 	return retval;
 }
@@ -149,13 +97,13 @@ out:
  * and r9) as this is faster than doing a copy_to_user().
  */
 asmlinkage long
-sys_pipe (void)
+sys_ia64_pipe (void)
 {
-	struct pt_regs *regs = ia64_task_regs(current);
+	struct pt_regs *regs = task_pt_regs(current);
 	int fd[2];
 	int retval;
 
-	retval = do_pipe(fd);
+	retval = do_pipe_flags(fd, 0);
 	if (retval)
 		goto out;
 	retval = fd[0];
@@ -164,49 +112,20 @@ sys_pipe (void)
 	return retval;
 }
 
-static inline unsigned long
-do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, unsigned long pgoff)
+int ia64_mmap_check(unsigned long addr, unsigned long len,
+		unsigned long flags)
 {
 	unsigned long roff;
-	struct file *file = NULL;
-
-	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
-	if (!(flags & MAP_ANONYMOUS)) {
-		file = fget(fd);
-		if (!file)
-			return -EBADF;
-
-		if (!file->f_op || !file->f_op->mmap) {
-			addr = -ENODEV;
-			goto out;
-		}
-	}
-
-	/* Careful about overflows.. */
-	len = PAGE_ALIGN(len);
-	if (!len || len > TASK_SIZE) {
-		addr = -EINVAL;
-		goto out;
-	}
 
 	/*
-	 * Don't permit mappings into unmapped space, the virtual page table of a region,
-	 * or across a region boundary.  Note: RGN_MAP_LIMIT is equal to 2^n-PAGE_SIZE
-	 * (for some integer n <= 61) and len > 0.
+	 * Don't permit mappings into unmapped space, the virtual page table
+	 * of a region, or across a region boundary.  Note: RGN_MAP_LIMIT is
+	 * equal to 2^n-PAGE_SIZE (for some integer n <= 61) and len > 0.
 	 */
 	roff = REGION_OFFSET(addr);
-	if ((len > RGN_MAP_LIMIT) || (roff > (RGN_MAP_LIMIT - len))) {
-		addr = -EINVAL;
-		goto out;
-	}
-
-	down_write(&current->mm->mmap_sem);
-	addr = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
-	up_write(&current->mm->mmap_sem);
-
-out:	if (file)
-		fput(file);
-	return addr;
+	if ((len > RGN_MAP_LIMIT) || (roff > (RGN_MAP_LIMIT - len)))
+		return -EINVAL;
+	return 0;
 }
 
 /*
@@ -217,7 +136,7 @@ out:	if (file)
 asmlinkage unsigned long
 sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff)
 {
-	addr = do_mmap2(addr, len, prot, flags, fd, pgoff);
+	addr = sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
 	if (!IS_ERR((void *) addr))
 		force_successful_syscall_return();
 	return addr;
@@ -229,7 +148,7 @@ sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, lo
 	if (offset_in_page(off) != 0)
 		return -EINVAL;
 
-	addr = do_mmap2(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
+	addr = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
 	if (!IS_ERR((void *) addr))
 		force_successful_syscall_return();
 	return addr;
@@ -239,22 +158,9 @@ asmlinkage unsigned long
 ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags,
 	     unsigned long new_addr)
 {
-	extern unsigned long do_mremap (unsigned long addr,
-					unsigned long old_len,
-					unsigned long new_len,
-					unsigned long flags,
-					unsigned long new_addr);
-
-	down_write(&current->mm->mmap_sem);
-	{
-		addr = do_mremap(addr, old_len, new_len, flags, new_addr);
-	}
-	up_write(&current->mm->mmap_sem);
-
-	if (IS_ERR((void *) addr))
-		return addr;
-
-	force_successful_syscall_return();
+	addr = sys_mremap(addr, old_len, new_len, flags, new_addr);
+	if (!IS_ERR((void *) addr))
+		force_successful_syscall_return();
 	return addr;
 }
 
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 5b7e736f3b4..71c52bc7c28 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -8,7 +8,6 @@
  * Copyright (C) 1999-2000 VA Linux Systems
  * Copyright (C) 1999-2000 Walt Drummond <drummond@valinux.com>
  */
-#include <linux/config.h>
 
 #include <linux/cpu.h>
 #include <linux/init.h>
@@ -19,20 +18,27 @@
 #include <linux/time.h>
 #include <linux/interrupt.h>
 #include <linux/efi.h>
-#include <linux/profile.h>
 #include <linux/timex.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/platform_device.h>
 
 #include <asm/machvec.h>
 #include <asm/delay.h>
 #include <asm/hw_irq.h>
+#include <asm/paravirt.h>
 #include <asm/ptrace.h>
 #include <asm/sal.h>
 #include <asm/sections.h>
-#include <asm/system.h>
 
-extern unsigned long wall_jiffies;
+#include "fsyscall_gtod_data.h"
+
+static cycle_t itc_get_cycles(struct clocksource *cs);
+
+struct fsyscall_gtod_data_t fsyscall_gtod_data;
+
+struct itc_jitter_data_t itc_jitter_data;
 
-#define TIME_KEEPER_ID	0	/* smp_processor_id() of time-keeper */
+volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */
 
 #ifdef CONFIG_IA64_DEBUG_IRQ
 
@@ -41,22 +47,114 @@ EXPORT_SYMBOL(last_cli_ip);
 
 #endif
 
-static struct time_interpolator itc_interpolator = {
-	.shift = 16,
-	.mask = 0xffffffffffffffffLL,
-	.source = TIME_SOURCE_CPU
+#ifdef CONFIG_PARAVIRT
+/* We need to define a real function for sched_clock, to override the
+   weak default version */
+unsigned long long sched_clock(void)
+{
+        return paravirt_sched_clock();
+}
+#endif
+
+#ifdef CONFIG_PARAVIRT
+static void
+paravirt_clocksource_resume(struct clocksource *cs)
+{
+	if (pv_time_ops.clocksource_resume)
+		pv_time_ops.clocksource_resume();
+}
+#endif
+
+static struct clocksource clocksource_itc = {
+	.name           = "itc",
+	.rating         = 350,
+	.read           = itc_get_cycles,
+	.mask           = CLOCKSOURCE_MASK(64),
+	.flags          = CLOCK_SOURCE_IS_CONTINUOUS,
+#ifdef CONFIG_PARAVIRT
+	.resume		= paravirt_clocksource_resume,
+#endif
 };
+static struct clocksource *itc_clocksource;
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+
+#include <linux/kernel_stat.h>
+
+extern cputime_t cycle_to_cputime(u64 cyc);
+
+void vtime_account_user(struct task_struct *tsk)
+{
+	cputime_t delta_utime;
+	struct thread_info *ti = task_thread_info(tsk);
+
+	if (ti->ac_utime) {
+		delta_utime = cycle_to_cputime(ti->ac_utime);
+		account_user_time(tsk, delta_utime, delta_utime);
+		ti->ac_utime = 0;
+	}
+}
+
+/*
+ * Called from the context switch with interrupts disabled, to charge all
+ * accumulated times to the current process, and to prepare accounting on
+ * the next process.
+ */
+void arch_vtime_task_switch(struct task_struct *prev)
+{
+	struct thread_info *pi = task_thread_info(prev);
+	struct thread_info *ni = task_thread_info(current);
+
+	pi->ac_stamp = ni->ac_stamp;
+	ni->ac_stime = ni->ac_utime = 0;
+}
+
+/*
+ * Account time for a transition between system, hard irq or soft irq state.
+ * Note that this function is called with interrupts enabled.
+ */
+static cputime_t vtime_delta(struct task_struct *tsk)
+{
+	struct thread_info *ti = task_thread_info(tsk);
+	cputime_t delta_stime;
+	__u64 now;
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	now = ia64_get_itc();
+
+	delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp));
+	ti->ac_stime = 0;
+	ti->ac_stamp = now;
+
+	return delta_stime;
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+	cputime_t delta = vtime_delta(tsk);
+
+	account_system_time(tsk, 0, delta, delta);
+}
+EXPORT_SYMBOL_GPL(vtime_account_system);
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+	account_idle_time(vtime_delta(tsk));
+}
+
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 static irqreturn_t
-timer_interrupt (int irq, void *dev_id, struct pt_regs *regs)
+timer_interrupt (int irq, void *dev_id)
 {
 	unsigned long new_itm;
 
-	if (unlikely(cpu_is_offline(smp_processor_id()))) {
+	if (cpu_is_offline(smp_processor_id())) {
 		return IRQ_HANDLED;
 	}
 
-	platform_timer_interrupt(irq, dev_id, regs);
+	platform_timer_interrupt(irq, dev_id);
 
 	new_itm = local_cpu_data->itm_next;
 
@@ -64,38 +162,40 @@ timer_interrupt (int irq, void *dev_id, struct pt_regs *regs)
 		printk(KERN_ERR "Oops: timer tick before it's due (itc=%lx,itm=%lx)\n",
 		       ia64_get_itc(), new_itm);
 
-	profile_tick(CPU_PROFILING, regs);
+	profile_tick(CPU_PROFILING);
+
+	if (paravirt_do_steal_accounting(&new_itm))
+		goto skip_process_time_accounting;
 
 	while (1) {
-		update_process_times(user_mode(regs));
+		update_process_times(user_mode(get_irq_regs()));
 
 		new_itm += local_cpu_data->itm_delta;
 
-		if (smp_processor_id() == TIME_KEEPER_ID) {
-			/*
-			 * Here we are in the timer irq handler. We have irqs locally
-			 * disabled, but we don't know if the timer_bh is running on
-			 * another CPU. We need to avoid to SMP race by acquiring the
-			 * xtime_lock.
-			 */
-			write_seqlock(&xtime_lock);
-			do_timer(regs);
-			local_cpu_data->itm_next = new_itm;
-			write_sequnlock(&xtime_lock);
-		} else
-			local_cpu_data->itm_next = new_itm;
+		if (smp_processor_id() == time_keeper_id)
+			xtime_update(1);
+
+		local_cpu_data->itm_next = new_itm;
 
 		if (time_after(new_itm, ia64_get_itc()))
 			break;
+
+		/*
+		 * Allow IPIs to interrupt the timer loop.
+		 */
+		local_irq_enable();
+		local_irq_disable();
 	}
 
+skip_process_time_accounting:
+
 	do {
 		/*
 		 * If we're too close to the next clock tick for
 		 * comfort, we increase the safety margin by
 		 * intentionally dropping the next tick(s).  We do NOT
 		 * update itm.next because that would force us to call
-		 * do_timer() which in turn would let our clock run
+		 * xtime_update() which in turn would let our clock run
 		 * too fast (with the potentially devastating effect
 		 * of losing monotony of time).
 		 */
@@ -144,8 +244,7 @@ static int __init nojitter_setup(char *str)
 __setup("nojitter", nojitter_setup);
 
 
-void __devinit
-ia64_init_itm (void)
+void ia64_init_itm(void)
 {
 	unsigned long platform_base_freq, itc_freq;
 	struct pal_freq_ratio itc_ratio, proc_ratio;
@@ -188,7 +287,7 @@ ia64_init_itm (void)
 	itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den;
 
 	local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ;
-	printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%lu/%lu, "
+	printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%u/%u, "
 	       "ITC freq=%lu.%03luMHz", smp_processor_id(),
 	       platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000,
 	       itc_ratio.num, itc_ratio.den, itc_freq / 1000000, (itc_freq / 1000) % 1000);
@@ -208,8 +307,6 @@ ia64_init_itm (void)
 					+ itc_freq/2)/itc_freq;
 
 	if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) {
-		itc_interpolator.frequency = local_cpu_data->itc_freq;
-		itc_interpolator.drift = itc_drift;
 #ifdef CONFIG_SMP
 		/* On IA64 in an SMP configuration ITCs are never accurately synchronized.
 		 * Jitter compensation requires a cmpxchg which may limit
@@ -221,31 +318,154 @@ ia64_init_itm (void)
 		 * even going backward) if the ITC offsets between the individual CPUs
 		 * are too large.
 		 */
-		if (!nojitter) itc_interpolator.jitter = 1;
+		if (!nojitter)
+			itc_jitter_data.itc_jitter = 1;
 #endif
-		register_time_interpolator(&itc_interpolator);
-	}
+	} else
+		/*
+		 * ITC is drifty and we have not synchronized the ITCs in smpboot.c.
+		 * ITC values may fluctuate significantly between processors.
+		 * Clock should not be used for hrtimers. Mark itc as only
+		 * useful for boot and testing.
+		 *
+		 * Note that jitter compensation is off! There is no point of
+		 * synchronizing ITCs since they may be large differentials
+		 * that change over time.
+		 *
+		 * The only way to fix this would be to repeatedly sync the
+		 * ITCs. Until that time we have to avoid ITC.
+		 */
+		clocksource_itc.rating = 50;
+
+	paravirt_init_missing_ticks_accounting(smp_processor_id());
+
+	/* avoid softlock up message when cpu is unplug and plugged again. */
+	touch_softlockup_watchdog();
 
 	/* Setup the CPU local timer tick */
 	ia64_cpu_local_tick();
+
+	if (!itc_clocksource) {
+		clocksource_register_hz(&clocksource_itc,
+						local_cpu_data->itc_freq);
+		itc_clocksource = &clocksource_itc;
+	}
+}
+
+static cycle_t itc_get_cycles(struct clocksource *cs)
+{
+	unsigned long lcycle, now, ret;
+
+	if (!itc_jitter_data.itc_jitter)
+		return get_cycles();
+
+	lcycle = itc_jitter_data.itc_lastcycle;
+	now = get_cycles();
+	if (lcycle && time_after(lcycle, now))
+		return lcycle;
+
+	/*
+	 * Keep track of the last timer value returned.
+	 * In an SMP environment, you could lose out in contention of
+	 * cmpxchg. If so, your cmpxchg returns new value which the
+	 * winner of contention updated to. Use the new value instead.
+	 */
+	ret = cmpxchg(&itc_jitter_data.itc_lastcycle, lcycle, now);
+	if (unlikely(ret != lcycle))
+		return ret;
+
+	return now;
 }
 
+
 static struct irqaction timer_irqaction = {
 	.handler =	timer_interrupt,
-	.flags =	SA_INTERRUPT,
+	.flags =	IRQF_IRQPOLL,
 	.name =		"timer"
 };
 
+static struct platform_device rtc_efi_dev = {
+	.name = "rtc-efi",
+	.id = -1,
+};
+
+static int __init rtc_init(void)
+{
+	if (platform_device_register(&rtc_efi_dev) < 0)
+		printk(KERN_ERR "unable to register rtc device...\n");
+
+	/* not necessarily an error */
+	return 0;
+}
+module_init(rtc_init);
+
+void read_persistent_clock(struct timespec *ts)
+{
+	efi_gettimeofday(ts);
+}
+
 void __init
 time_init (void)
 {
 	register_percpu_irq(IA64_TIMER_VECTOR, &timer_irqaction);
-	efi_gettimeofday(&xtime);
 	ia64_init_itm();
+}
 
-	/*
-	 * Initialize wall_to_monotonic such that adding it to xtime will yield zero, the
-	 * tv_nsec field must be normalized (i.e., 0 <= nsec < NSEC_PER_SEC).
-	 */
-	set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec);
+/*
+ * Generic udelay assumes that if preemption is allowed and the thread
+ * migrates to another CPU, that the ITC values are synchronized across
+ * all CPUs.
+ */
+static void
+ia64_itc_udelay (unsigned long usecs)
+{
+	unsigned long start = ia64_get_itc();
+	unsigned long end = start + usecs*local_cpu_data->cyc_per_usec;
+
+	while (time_before(ia64_get_itc(), end))
+		cpu_relax();
+}
+
+void (*ia64_udelay)(unsigned long usecs) = &ia64_itc_udelay;
+
+void
+udelay (unsigned long usecs)
+{
+	(*ia64_udelay)(usecs);
+}
+EXPORT_SYMBOL(udelay);
+
+/* IA64 doesn't cache the timezone */
+void update_vsyscall_tz(void)
+{
 }
+
+void update_vsyscall_old(struct timespec *wall, struct timespec *wtm,
+			struct clocksource *c, u32 mult)
+{
+	write_seqcount_begin(&fsyscall_gtod_data.seq);
+
+        /* copy fsyscall clock data */
+        fsyscall_gtod_data.clk_mask = c->mask;
+        fsyscall_gtod_data.clk_mult = mult;
+        fsyscall_gtod_data.clk_shift = c->shift;
+        fsyscall_gtod_data.clk_fsys_mmio = c->archdata.fsys_mmio;
+        fsyscall_gtod_data.clk_cycle_last = c->cycle_last;
+
+	/* copy kernel time structures */
+        fsyscall_gtod_data.wall_time.tv_sec = wall->tv_sec;
+        fsyscall_gtod_data.wall_time.tv_nsec = wall->tv_nsec;
+	fsyscall_gtod_data.monotonic_time.tv_sec = wtm->tv_sec
+							+ wall->tv_sec;
+	fsyscall_gtod_data.monotonic_time.tv_nsec = wtm->tv_nsec
+							+ wall->tv_nsec;
+
+	/* normalize */
+	while (fsyscall_gtod_data.monotonic_time.tv_nsec >= NSEC_PER_SEC) {
+		fsyscall_gtod_data.monotonic_time.tv_nsec -= NSEC_PER_SEC;
+		fsyscall_gtod_data.monotonic_time.tv_sec++;
+	}
+
+	write_seqcount_end(&fsyscall_gtod_data.seq);
+}
+
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 706b7734e19..f295f9abba4 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -9,60 +9,65 @@
  * 		2002/08/07 Erich Focht <efocht@ess.nec.de>
  * Populate cpu entries in sysfs for non-numa systems as well
  *  	Intel Corporation - Ashok Raj
+ * 02/27/2006 Zhang, Yanmin
+ *	Populate cpu cache entries in sysfs for cpu cache info
  */
 
-#include <linux/config.h>
 #include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/node.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/nodemask.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
 #include <asm/mmzone.h>
 #include <asm/numa.h>
 #include <asm/cpu.h>
 
-#ifdef CONFIG_NUMA
-static struct node *sysfs_nodes;
-#endif
 static struct ia64_cpu *sysfs_cpus;
 
-int arch_register_cpu(int num)
+void arch_fix_phys_package_id(int num, u32 slot)
 {
-	struct node *parent = NULL;
-	
-#ifdef CONFIG_NUMA
-	parent = &sysfs_nodes[cpu_to_node(num)];
-#endif /* CONFIG_NUMA */
+#ifdef CONFIG_SMP
+	if (cpu_data(num)->socket_id == -1)
+		cpu_data(num)->socket_id = slot;
+#endif
+}
+EXPORT_SYMBOL_GPL(arch_fix_phys_package_id);
 
+
+#ifdef CONFIG_HOTPLUG_CPU
+int __ref arch_register_cpu(int num)
+{
 #ifdef CONFIG_ACPI
 	/*
-	 * If CPEI cannot be re-targetted, and this is
-	 * CPEI target, then dont create the control file
+	 * If CPEI can be re-targeted or if this is not
+	 * CPEI target, then it is hotpluggable
 	 */
-	if (!can_cpei_retarget() && is_cpu_cpei_target(num))
-		sysfs_cpus[num].cpu.no_control = 1;
+	if (can_cpei_retarget() || !is_cpu_cpei_target(num))
+		sysfs_cpus[num].cpu.hotpluggable = 1;
+	map_cpu_to_node(num, node_cpuid[num].nid);
 #endif
-
-	return register_cpu(&sysfs_cpus[num].cpu, num, parent);
+	return register_cpu(&sysfs_cpus[num].cpu, num);
 }
+EXPORT_SYMBOL(arch_register_cpu);
 
-#ifdef CONFIG_HOTPLUG_CPU
-
-void arch_unregister_cpu(int num)
+void __ref arch_unregister_cpu(int num)
 {
-	struct node *parent = NULL;
-
-#ifdef CONFIG_NUMA
-	int node = cpu_to_node(num);
-	parent = &sysfs_nodes[node];
-#endif /* CONFIG_NUMA */
-
-	return unregister_cpu(&sysfs_cpus[num].cpu, parent);
+	unregister_cpu(&sysfs_cpus[num].cpu);
+#ifdef CONFIG_ACPI
+	unmap_cpu_from_node(num, cpu_to_node(num));
+#endif
 }
-EXPORT_SYMBOL(arch_register_cpu);
 EXPORT_SYMBOL(arch_unregister_cpu);
+#else
+static int __init arch_register_cpu(int num)
+{
+	return register_cpu(&sysfs_cpus[num].cpu, num);
+}
 #endif /*CONFIG_HOTPLUG_CPU*/
 
 
@@ -71,31 +76,397 @@ static int __init topology_init(void)
 	int i, err = 0;
 
 #ifdef CONFIG_NUMA
-	sysfs_nodes = kmalloc(sizeof(struct node) * MAX_NUMNODES, GFP_KERNEL);
-	if (!sysfs_nodes) {
-		err = -ENOMEM;
-		goto out;
-	}
-	memset(sysfs_nodes, 0, sizeof(struct node) * MAX_NUMNODES);
-
-	/* MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes? */
-	for_each_online_node(i)
-		if ((err = register_node(&sysfs_nodes[i], i, 0)))
+	/*
+	 * MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes?
+	 */
+	for_each_online_node(i) {
+		if ((err = register_one_node(i)))
 			goto out;
+	}
 #endif
 
-	sysfs_cpus = kmalloc(sizeof(struct ia64_cpu) * NR_CPUS, GFP_KERNEL);
-	if (!sysfs_cpus) {
-		err = -ENOMEM;
-		goto out;
-	}
-	memset(sysfs_cpus, 0, sizeof(struct ia64_cpu) * NR_CPUS);
+	sysfs_cpus = kzalloc(sizeof(struct ia64_cpu) * NR_CPUS, GFP_KERNEL);
+	if (!sysfs_cpus)
+		panic("kzalloc in topology_init failed - NR_CPUS too big?");
 
-	for_each_present_cpu(i)
+	for_each_present_cpu(i) {
 		if((err = arch_register_cpu(i)))
 			goto out;
+	}
 out:
 	return err;
 }
 
-__initcall(topology_init);
+subsys_initcall(topology_init);
+
+
+/*
+ * Export cpu cache information through sysfs
+ */
+
+/*
+ *  A bunch of string array to get pretty printing
+ */
+static const char *cache_types[] = {
+	"",			/* not used */
+	"Instruction",
+	"Data",
+	"Unified"	/* unified */
+};
+
+static const char *cache_mattrib[]={
+	"WriteThrough",
+	"WriteBack",
+	"",		/* reserved */
+	""		/* reserved */
+};
+
+struct cache_info {
+	pal_cache_config_info_t	cci;
+	cpumask_t shared_cpu_map;
+	int level;
+	int type;
+	struct kobject kobj;
+};
+
+struct cpu_cache_info {
+	struct cache_info *cache_leaves;
+	int	num_cache_leaves;
+	struct kobject kobj;
+};
+
+static struct cpu_cache_info	all_cpu_cache_info[NR_CPUS];
+#define LEAF_KOBJECT_PTR(x,y)    (&all_cpu_cache_info[x].cache_leaves[y])
+
+#ifdef CONFIG_SMP
+static void cache_shared_cpu_map_setup(unsigned int cpu,
+		struct cache_info * this_leaf)
+{
+	pal_cache_shared_info_t	csi;
+	int num_shared, i = 0;
+	unsigned int j;
+
+	if (cpu_data(cpu)->threads_per_core <= 1 &&
+		cpu_data(cpu)->cores_per_socket <= 1) {
+		cpu_set(cpu, this_leaf->shared_cpu_map);
+		return;
+	}
+
+	if (ia64_pal_cache_shared_info(this_leaf->level,
+					this_leaf->type,
+					0,
+					&csi) != PAL_STATUS_SUCCESS)
+		return;
+
+	num_shared = (int) csi.num_shared;
+	do {
+		for_each_possible_cpu(j)
+			if (cpu_data(cpu)->socket_id == cpu_data(j)->socket_id
+				&& cpu_data(j)->core_id == csi.log1_cid
+				&& cpu_data(j)->thread_id == csi.log1_tid)
+				cpu_set(j, this_leaf->shared_cpu_map);
+
+		i++;
+	} while (i < num_shared &&
+		ia64_pal_cache_shared_info(this_leaf->level,
+				this_leaf->type,
+				i,
+				&csi) == PAL_STATUS_SUCCESS);
+}
+#else
+static void cache_shared_cpu_map_setup(unsigned int cpu,
+		struct cache_info * this_leaf)
+{
+	cpu_set(cpu, this_leaf->shared_cpu_map);
+	return;
+}
+#endif
+
+static ssize_t show_coherency_line_size(struct cache_info *this_leaf,
+					char *buf)
+{
+	return sprintf(buf, "%u\n", 1 << this_leaf->cci.pcci_line_size);
+}
+
+static ssize_t show_ways_of_associativity(struct cache_info *this_leaf,
+					char *buf)
+{
+	return sprintf(buf, "%u\n", this_leaf->cci.pcci_assoc);
+}
+
+static ssize_t show_attributes(struct cache_info *this_leaf, char *buf)
+{
+	return sprintf(buf,
+			"%s\n",
+			cache_mattrib[this_leaf->cci.pcci_cache_attr]);
+}
+
+static ssize_t show_size(struct cache_info *this_leaf, char *buf)
+{
+	return sprintf(buf, "%uK\n", this_leaf->cci.pcci_cache_size / 1024);
+}
+
+static ssize_t show_number_of_sets(struct cache_info *this_leaf, char *buf)
+{
+	unsigned number_of_sets = this_leaf->cci.pcci_cache_size;
+	number_of_sets /= this_leaf->cci.pcci_assoc;
+	number_of_sets /= 1 << this_leaf->cci.pcci_line_size;
+
+	return sprintf(buf, "%u\n", number_of_sets);
+}
+
+static ssize_t show_shared_cpu_map(struct cache_info *this_leaf, char *buf)
+{
+	ssize_t	len;
+	cpumask_t shared_cpu_map;
+
+	cpumask_and(&shared_cpu_map,
+				&this_leaf->shared_cpu_map, cpu_online_mask);
+	len = cpumask_scnprintf(buf, NR_CPUS+1, &shared_cpu_map);
+	len += sprintf(buf+len, "\n");
+	return len;
+}
+
+static ssize_t show_type(struct cache_info *this_leaf, char *buf)
+{
+	int type = this_leaf->type + this_leaf->cci.pcci_unified;
+	return sprintf(buf, "%s\n", cache_types[type]);
+}
+
+static ssize_t show_level(struct cache_info *this_leaf, char *buf)
+{
+	return sprintf(buf, "%u\n", this_leaf->level);
+}
+
+struct cache_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct cache_info *, char *);
+	ssize_t (*store)(struct cache_info *, const char *, size_t count);
+};
+
+#ifdef define_one_ro
+	#undef define_one_ro
+#endif
+#define define_one_ro(_name) \
+	static struct cache_attr _name = \
+__ATTR(_name, 0444, show_##_name, NULL)
+
+define_one_ro(level);
+define_one_ro(type);
+define_one_ro(coherency_line_size);
+define_one_ro(ways_of_associativity);
+define_one_ro(size);
+define_one_ro(number_of_sets);
+define_one_ro(shared_cpu_map);
+define_one_ro(attributes);
+
+static struct attribute * cache_default_attrs[] = {
+	&type.attr,
+	&level.attr,
+	&coherency_line_size.attr,
+	&ways_of_associativity.attr,
+	&attributes.attr,
+	&size.attr,
+	&number_of_sets.attr,
+	&shared_cpu_map.attr,
+	NULL
+};
+
+#define to_object(k) container_of(k, struct cache_info, kobj)
+#define to_attr(a) container_of(a, struct cache_attr, attr)
+
+static ssize_t ia64_cache_show(struct kobject * kobj, struct attribute * attr, char * buf)
+{
+	struct cache_attr *fattr = to_attr(attr);
+	struct cache_info *this_leaf = to_object(kobj);
+	ssize_t ret;
+
+	ret = fattr->show ? fattr->show(this_leaf, buf) : 0;
+	return ret;
+}
+
+static const struct sysfs_ops cache_sysfs_ops = {
+	.show   = ia64_cache_show
+};
+
+static struct kobj_type cache_ktype = {
+	.sysfs_ops	= &cache_sysfs_ops,
+	.default_attrs	= cache_default_attrs,
+};
+
+static struct kobj_type cache_ktype_percpu_entry = {
+	.sysfs_ops	= &cache_sysfs_ops,
+};
+
+static void cpu_cache_sysfs_exit(unsigned int cpu)
+{
+	kfree(all_cpu_cache_info[cpu].cache_leaves);
+	all_cpu_cache_info[cpu].cache_leaves = NULL;
+	all_cpu_cache_info[cpu].num_cache_leaves = 0;
+	memset(&all_cpu_cache_info[cpu].kobj, 0, sizeof(struct kobject));
+	return;
+}
+
+static int cpu_cache_sysfs_init(unsigned int cpu)
+{
+	unsigned long i, levels, unique_caches;
+	pal_cache_config_info_t cci;
+	int j;
+	long status;
+	struct cache_info *this_cache;
+	int num_cache_leaves = 0;
+
+	if ((status = ia64_pal_cache_summary(&levels, &unique_caches)) != 0) {
+		printk(KERN_ERR "ia64_pal_cache_summary=%ld\n", status);
+		return -1;
+	}
+
+	this_cache=kzalloc(sizeof(struct cache_info)*unique_caches,
+			GFP_KERNEL);
+	if (this_cache == NULL)
+		return -ENOMEM;
+
+	for (i=0; i < levels; i++) {
+		for (j=2; j >0 ; j--) {
+			if ((status=ia64_pal_cache_config_info(i,j, &cci)) !=
+					PAL_STATUS_SUCCESS)
+				continue;
+
+			this_cache[num_cache_leaves].cci = cci;
+			this_cache[num_cache_leaves].level = i + 1;
+			this_cache[num_cache_leaves].type = j;
+
+			cache_shared_cpu_map_setup(cpu,
+					&this_cache[num_cache_leaves]);
+			num_cache_leaves ++;
+		}
+	}
+
+	all_cpu_cache_info[cpu].cache_leaves = this_cache;
+	all_cpu_cache_info[cpu].num_cache_leaves = num_cache_leaves;
+
+	memset(&all_cpu_cache_info[cpu].kobj, 0, sizeof(struct kobject));
+
+	return 0;
+}
+
+/* Add cache interface for CPU device */
+static int cache_add_dev(struct device *sys_dev)
+{
+	unsigned int cpu = sys_dev->id;
+	unsigned long i, j;
+	struct cache_info *this_object;
+	int retval = 0;
+	cpumask_t oldmask;
+
+	if (all_cpu_cache_info[cpu].kobj.parent)
+		return 0;
+
+	oldmask = current->cpus_allowed;
+	retval = set_cpus_allowed_ptr(current, cpumask_of(cpu));
+	if (unlikely(retval))
+		return retval;
+
+	retval = cpu_cache_sysfs_init(cpu);
+	set_cpus_allowed_ptr(current, &oldmask);
+	if (unlikely(retval < 0))
+		return retval;
+
+	retval = kobject_init_and_add(&all_cpu_cache_info[cpu].kobj,
+				      &cache_ktype_percpu_entry, &sys_dev->kobj,
+				      "%s", "cache");
+	if (unlikely(retval < 0)) {
+		cpu_cache_sysfs_exit(cpu);
+		return retval;
+	}
+
+	for (i = 0; i < all_cpu_cache_info[cpu].num_cache_leaves; i++) {
+		this_object = LEAF_KOBJECT_PTR(cpu,i);
+		retval = kobject_init_and_add(&(this_object->kobj),
+					      &cache_ktype,
+					      &all_cpu_cache_info[cpu].kobj,
+					      "index%1lu", i);
+		if (unlikely(retval)) {
+			for (j = 0; j < i; j++) {
+				kobject_put(&(LEAF_KOBJECT_PTR(cpu,j)->kobj));
+			}
+			kobject_put(&all_cpu_cache_info[cpu].kobj);
+			cpu_cache_sysfs_exit(cpu);
+			return retval;
+		}
+		kobject_uevent(&(this_object->kobj), KOBJ_ADD);
+	}
+	kobject_uevent(&all_cpu_cache_info[cpu].kobj, KOBJ_ADD);
+	return retval;
+}
+
+/* Remove cache interface for CPU device */
+static int cache_remove_dev(struct device *sys_dev)
+{
+	unsigned int cpu = sys_dev->id;
+	unsigned long i;
+
+	for (i = 0; i < all_cpu_cache_info[cpu].num_cache_leaves; i++)
+		kobject_put(&(LEAF_KOBJECT_PTR(cpu,i)->kobj));
+
+	if (all_cpu_cache_info[cpu].kobj.parent) {
+		kobject_put(&all_cpu_cache_info[cpu].kobj);
+		memset(&all_cpu_cache_info[cpu].kobj,
+			0,
+			sizeof(struct kobject));
+	}
+
+	cpu_cache_sysfs_exit(cpu);
+
+	return 0;
+}
+
+/*
+ * When a cpu is hot-plugged, do a check and initiate
+ * cache kobject if necessary
+ */
+static int cache_cpu_callback(struct notifier_block *nfb,
+		unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	struct device *sys_dev;
+
+	sys_dev = get_cpu_device(cpu);
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		cache_add_dev(sys_dev);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		cache_remove_dev(sys_dev);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cache_cpu_notifier =
+{
+	.notifier_call = cache_cpu_callback
+};
+
+static int __init cache_sysfs_init(void)
+{
+	int i;
+
+	cpu_notifier_register_begin();
+
+	for_each_online_cpu(i) {
+		struct device *sys_dev = get_cpu_device((unsigned int)i);
+		cache_add_dev(sys_dev);
+	}
+
+	__register_hotcpu_notifier(&cache_cpu_notifier);
+
+	cpu_notifier_register_done();
+
+	return 0;
+}
+
+device_initcall(cache_sysfs_init);
+
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index f970359e7ed..d3636e67a98 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -7,7 +7,6 @@
  * 05/12/00 grao <goutham.rao@intel.com> : added isr in siginfo for SIGFPE
  */
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/sched.h>
@@ -16,32 +15,18 @@
 #include <linux/module.h>       /* for EXPORT_SYMBOL */
 #include <linux/hardirq.h>
 #include <linux/kprobes.h>
+#include <linux/delay.h>		/* for ssleep() */
+#include <linux/kdebug.h>
 
 #include <asm/fpswa.h>
-#include <asm/ia32.h>
 #include <asm/intrinsics.h>
 #include <asm/processor.h>
 #include <asm/uaccess.h>
-#include <asm/kdebug.h>
-
-extern spinlock_t timerlist_lock;
+#include <asm/setup.h>
 
 fpswa_interface_t *fpswa_interface;
 EXPORT_SYMBOL(fpswa_interface);
 
-struct notifier_block *ia64die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);
-
-int register_die_notifier(struct notifier_block *nb)
-{
-	int err = 0;
-	unsigned long flags;
-	spin_lock_irqsave(&die_notifier_lock, flags);
-	err = notifier_chain_register(&ia64die_chain, nb);
-	spin_unlock_irqrestore(&die_notifier_lock, flags);
-	return err;
-}
-
 void __init
 trap_init (void)
 {
@@ -50,35 +35,7 @@ trap_init (void)
 		fpswa_interface = __va(ia64_boot_param->fpswa);
 }
 
-/*
- * Unlock any spinlocks which will prevent us from getting the message out (timerlist_lock
- * is acquired through the console unblank code)
- */
-void
-bust_spinlocks (int yes)
-{
-	int loglevel_save = console_loglevel;
-
-	if (yes) {
-		oops_in_progress = 1;
-		return;
-	}
-
-#ifdef CONFIG_VT
-	unblank_screen();
-#endif
-	oops_in_progress = 0;
-	/*
-	 * OK, the message is on the console.  Now we call printk() without
-	 * oops_in_progress set so that printk will give klogd a poke.  Hold onto
-	 * your hats...
-	 */
-	console_loglevel = 15;		/* NMI oopser may have shut the console up */
-	printk(" ");
-	console_loglevel = loglevel_save;
-}
-
-void
+int
 die (const char *str, struct pt_regs *regs, long err)
 {
 	static struct {
@@ -86,9 +43,9 @@ die (const char *str, struct pt_regs *regs, long err)
 		u32 lock_owner;
 		int lock_owner_depth;
 	} die = {
-		.lock =			SPIN_LOCK_UNLOCKED,
-		.lock_owner =		-1,
-		.lock_owner_depth =	0
+		.lock =	__SPIN_LOCK_UNLOCKED(die.lock),
+		.lock_owner = -1,
+		.lock_owner_depth = 0
 	};
 	static int die_counter;
 	int cpu = get_cpu();
@@ -104,22 +61,36 @@ die (const char *str, struct pt_regs *regs, long err)
 
 	if (++die.lock_owner_depth < 3) {
 		printk("%s[%d]: %s %ld [%d]\n",
-			current->comm, current->pid, str, err, ++die_counter);
-		show_regs(regs);
+		current->comm, task_pid_nr(current), str, err, ++die_counter);
+		if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV)
+	            != NOTIFY_STOP)
+			show_regs(regs);
+		else
+			regs = NULL;
   	} else
 		printk(KERN_ERR "Recursive die() failure, output suppressed\n");
 
 	bust_spinlocks(0);
 	die.lock_owner = -1;
+	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
 	spin_unlock_irq(&die.lock);
+
+	if (!regs)
+		return 1;
+
+	if (panic_on_oops)
+		panic("Fatal exception");
+
   	do_exit(SIGSEGV);
+	return 0;
 }
 
-void
+int
 die_if_kernel (char *str, struct pt_regs *regs, long err)
 {
 	if (!user_mode(regs))
-		die(str, regs, err);
+		return die(str, regs, err);
+	return 0;
 }
 
 void
@@ -128,24 +99,6 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
 	siginfo_t siginfo;
 	int sig, code;
 
-	/* break.b always sets cr.iim to 0, which causes problems for
-	 * debuggers.  Get the real break number from the original instruction,
-	 * but only for kernel code.  User space break.b is left alone, to
-	 * preserve the existing behaviour.  All break codings have the same
-	 * format, so there is no need to check the slot type.
-	 */
-	if (break_num == 0 && !user_mode(regs)) {
-		struct ia64_psr *ipsr = ia64_psr(regs);
-		unsigned long *bundle = (unsigned long *)regs->cr_iip;
-		unsigned long slot;
-		switch (ipsr->ri) {
-		      case 0:  slot = (bundle[0] >>  5); break;
-		      case 1:  slot = (bundle[0] >> 46) | (bundle[1] << 18); break;
-		      default: slot = (bundle[1] >> 23); break;
-		}
-		break_num = ((slot >> 36 & 1) << 20) | (slot >> 6 & 0xfffff);
-	}
-
 	/* SIGILL, SIGFPE, SIGSEGV, and SIGBUS want these field initialized: */
 	siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri);
 	siginfo.si_imm = break_num;
@@ -155,10 +108,10 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
 	switch (break_num) {
 	      case 0: /* unknown error (used by GCC for __builtin_abort()) */
 		if (notify_die(DIE_BREAK, "break 0", regs, break_num, TRAP_BRKPT, SIGTRAP)
-			       	== NOTIFY_STOP) {
+			       	== NOTIFY_STOP)
+			return;
+		if (die_if_kernel("bugcheck!", regs, break_num))
 			return;
-		}
-		die_if_kernel("bugcheck!", regs, break_num);
 		sig = SIGILL; code = ILL_ILLOPC;
 		break;
 
@@ -210,22 +163,17 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
 		sig = SIGILL; code = __ILL_BNDMOD;
 		break;
 
-	      case 0x80200:
-	      case 0x80300:
-		if (notify_die(DIE_BREAK, "kprobe", regs, break_num, TRAP_BRKPT, SIGTRAP)
-			       	== NOTIFY_STOP) {
-			return;
-		}
-		sig = SIGTRAP; code = TRAP_BRKPT;
-		break;
-
 	      default:
-		if (break_num < 0x40000 || break_num > 0x100000)
-			die_if_kernel("Bad break", regs, break_num);
+		if ((break_num < 0x40000 || break_num > 0x100000)
+		    && die_if_kernel("Bad break", regs, break_num))
+			return;
 
 		if (break_num < 0x80000) {
 			sig = SIGILL; code = __ILL_BREAK;
 		} else {
+			if (notify_die(DIE_BREAK, "bad break", regs, break_num, TRAP_BRKPT, SIGTRAP)
+					== NOTIFY_STOP)
+				return;
 			sig = SIGTRAP; code = TRAP_BRKPT;
 		}
 	}
@@ -324,6 +272,15 @@ fp_emulate (int fp_fault, void *bundle, long *ipsr, long *fpsr, long *isr, long
 	return ret.status;
 }
 
+struct fpu_swa_msg {
+	unsigned long count;
+	unsigned long time;
+};
+static DEFINE_PER_CPU(struct fpu_swa_msg, cpulast);
+DECLARE_PER_CPU(struct fpu_swa_msg, cpulast);
+static struct fpu_swa_msg last __cacheline_aligned;
+
+
 /*
  * Handle floating-point assist faults and traps.
  */
@@ -333,8 +290,6 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
 	long exception, bundle[2];
 	unsigned long fault_ip;
 	struct siginfo siginfo;
-	static int fpu_swa_count = 0;
-	static unsigned long last_time;
 
 	fault_ip = regs->cr_iip;
 	if (!fp_fault && (ia64_psr(regs)->ri == 0))
@@ -342,14 +297,37 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
 	if (copy_from_user(bundle, (void __user *) fault_ip, sizeof(bundle)))
 		return -1;
 
-	if (jiffies - last_time > 5*HZ)
-		fpu_swa_count = 0;
-	if ((fpu_swa_count < 4) && !(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT)) {
-		last_time = jiffies;
-		++fpu_swa_count;
-		printk(KERN_WARNING
-		       "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n",
-		       current->comm, current->pid, regs->cr_iip + ia64_psr(regs)->ri, isr);
+	if (!(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT))  {
+		unsigned long count, current_jiffies = jiffies;
+		struct fpu_swa_msg *cp = &__get_cpu_var(cpulast);
+
+		if (unlikely(current_jiffies > cp->time))
+			cp->count = 0;
+		if (unlikely(cp->count < 5)) {
+			cp->count++;
+			cp->time = current_jiffies + 5 * HZ;
+
+			/* minimize races by grabbing a copy of count BEFORE checking last.time. */
+			count = last.count;
+			barrier();
+
+			/*
+			 * Lower 4 bits are used as a count. Upper bits are a sequence
+			 * number that is updated when count is reset. The cmpxchg will
+			 * fail is seqno has changed. This minimizes mutiple cpus
+			 * resetting the count.
+			 */
+			if (current_jiffies > last.time)
+				(void) cmpxchg_acq(&last.count, count, 16 + (count & ~15));
+
+			/* used fetchadd to atomically update the count */
+			if ((last.count & 15) < 5 && (ia64_fetchadd(1, &last.count, acq) & 15) < 5) {
+				last.time = current_jiffies + 5 * HZ;
+				printk(KERN_WARNING
+		       			"%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n",
+		       			current->comm, task_pid_nr(current), regs->cr_iip + ia64_psr(regs)->ri, isr);
+			}
+		}
 	}
 
 	exception = fp_emulate(fp_fault, bundle, &regs->cr_ipsr, &regs->ar_fpsr, &isr, &regs->pr,
@@ -434,14 +412,15 @@ ia64_illegal_op_fault (unsigned long ec, long arg1, long arg2, long arg3,
 #endif
 
 	sprintf(buf, "IA-64 Illegal operation fault");
-	die_if_kernel(buf, &regs, 0);
+	rv.fkt = 0;
+	if (die_if_kernel(buf, &regs, 0))
+		return rv;
 
 	memset(&si, 0, sizeof(si));
 	si.si_signo = SIGILL;
 	si.si_code = ILL_ILLOPC;
 	si.si_addr = (void __user *) (regs.cr_iip + ia64_psr(&regs)->ri);
 	force_sig_info(SIGILL, &si, current);
-	rv.fkt = 0;
 	return rv;
 }
 
@@ -485,7 +464,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 		if (code == 8) {
 # ifdef CONFIG_IA64_PRINT_HAZARDS
 			printk("%s[%d]: possible hazard @ ip=%016lx (pr = %016lx)\n",
-			       current->comm, current->pid,
+			       current->comm, task_pid_nr(current),
 			       regs.cr_iip + ia64_psr(&regs)->ri, regs.pr);
 # endif
 			return;
@@ -551,12 +530,15 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 		if (fsys_mode(current, &regs)) {
 			extern char __kernel_syscall_via_break[];
 			/*
-			 * Got a trap in fsys-mode: Taken Branch Trap and Single Step trap
-			 * need special handling; Debug trap is not supposed to happen.
+			 * Got a trap in fsys-mode: Taken Branch Trap
+			 * and Single Step trap need special handling;
+			 * Debug trap is ignored (we disable it here
+			 * and re-enable it in the lower-privilege trap).
 			 */
 			if (unlikely(vector == 29)) {
-				die("Got debug trap in fsys-mode---not supposed to happen!",
-				    &regs, 0);
+				set_thread_flag(TIF_DB_DISABLED);
+				ia64_psr(&regs)->db = 0;
+				ia64_psr(&regs)->lp = 1;
 				return;
 			}
 			/* re-do the system call via break 0x100000: */
@@ -578,12 +560,11 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 #endif
 			break;
 		      case 35: siginfo.si_code = TRAP_BRANCH; ifa = 0; break;
-		      case 36:
-			      if (notify_die(DIE_SS, "ss", &regs, vector,
-					     vector, SIGTRAP) == NOTIFY_STOP)
-				      return;
-			      siginfo.si_code = TRAP_TRACE; ifa = 0; break;
+		      case 36: siginfo.si_code = TRAP_TRACE; ifa = 0; break;
 		}
+		if (notify_die(DIE_FAULT, "ia64_fault", &regs, vector, siginfo.si_code, SIGTRAP)
+			       	== NOTIFY_STOP)
+			return;
 		siginfo.si_signo = SIGTRAP;
 		siginfo.si_errno = 0;
 		siginfo.si_addr  = (void __user *) ifa;
@@ -611,10 +592,19 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 	      case 34:
 		if (isr & 0x2) {
 			/* Lower-Privilege Transfer Trap */
+
+			/* If we disabled debug traps during an fsyscall,
+			 * re-enable them here.
+			 */
+			if (test_thread_flag(TIF_DB_DISABLED)) {
+				clear_thread_flag(TIF_DB_DISABLED);
+				ia64_psr(&regs)->db = 1;
+			}
+
 			/*
-			 * Just clear PSR.lp and then return immediately: all the
-			 * interesting work (e.g., signal delivery is done in the kernel
-			 * exit path).
+			 * Just clear PSR.lp and then return immediately:
+			 * all the interesting work (e.g., signal delivery)
+			 * is done in the kernel exit path.
 			 */
 			ia64_psr(&regs)->lp = 0;
 			return;
@@ -636,21 +626,13 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 		break;
 
 	      case 45:
-#ifdef CONFIG_IA32_SUPPORT
-		if (ia32_exception(&regs, isr) == 0)
-			return;
-#endif
 		printk(KERN_ERR "Unexpected IA-32 exception (Trap 45)\n");
 		printk(KERN_ERR "  iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx\n",
 		       iip, ifa, isr);
 		force_sig(SIGSEGV, current);
-		break;
+		return;
 
 	      case 46:
-#ifdef CONFIG_IA32_SUPPORT
-		if (ia32_intercept(&regs, isr) == 0)
-			return;
-#endif
 		printk(KERN_ERR "Unexpected IA-32 intercept trap (Trap 46)\n");
 		printk(KERN_ERR "  iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx, iim - 0x%lx\n",
 		       iip, ifa, isr, iim);
@@ -665,6 +647,6 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 		sprintf(buf, "Fault %lu", vector);
 		break;
 	}
-	die_if_kernel(buf, &regs, error);
-	force_sig(SIGILL, current);
+	if (!die_if_kernel(buf, &regs, error))
+		force_sig(SIGILL, current);
 }
diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
index 43b45b65ee5..622772b7fb6 100644
--- a/arch/ia64/kernel/unaligned.c
+++ b/arch/ia64/kernel/unaligned.c
@@ -13,10 +13,11 @@
  * 2001/08/13	Correct size of extended floats (float_fsz) from 16 to 10 bytes.
  * 2001/01/17	Add support emulation of unaligned kernel accesses.
  */
+#include <linux/jiffies.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/tty.h>
+#include <linux/ratelimit.h>
 
 #include <asm/intrinsics.h>
 #include <asm/processor.h>
@@ -24,12 +25,12 @@
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 
-extern void die_if_kernel(char *str, struct pt_regs *regs, long err) __attribute__ ((noreturn));
+extern int die_if_kernel(char *str, struct pt_regs *regs, long err);
 
 #undef DEBUG_UNALIGNED_TRAP
 
 #ifdef DEBUG_UNALIGNED_TRAP
-# define DPRINT(a...)	do { printk("%s %u: ", __FUNCTION__, __LINE__); printk (a); } while (0)
+# define DPRINT(a...)	do { printk("%s %u: ", __func__, __LINE__); printk (a); } while (0)
 # define DDUMP(str,vp,len)	dump(str, vp, len)
 
 static void
@@ -53,6 +54,15 @@ dump (const char *str, void *vp, size_t len)
 #define SIGN_EXT9		0xffffffffffffff00ul
 
 /*
+ *  sysctl settable hook which tells the kernel whether to honor the
+ *  IA64_THREAD_UAC_NOPRINT prctl.  Because this is user settable, we want
+ *  to allow the super user to enable/disable this for security reasons
+ *  (i.e. don't allow attacker to fill up logs with unaligned accesses).
+ */
+int no_unaligned_warning;
+int unaligned_dump_stack;
+
+/*
  * For M-unit:
  *
  *  opcode |   m  |   x6    |
@@ -666,9 +676,10 @@ emulate_load_updates (update_t type, load_store_t ld, struct pt_regs *regs, unsi
 	 * just in case.
 	 */
 	if (ld.x6_op == 1 || ld.x6_op == 3) {
-		printk(KERN_ERR "%s: register update on speculative load, error\n", __FUNCTION__);
-		die_if_kernel("unaligned reference on speculative load with register update\n",
-			      regs, 30);
+		printk(KERN_ERR "%s: register update on speculative load, error\n", __func__);
+		if (die_if_kernel("unaligned reference on speculative load with register update\n",
+				  regs, 30))
+			return;
 	}
 
 
@@ -1095,7 +1106,7 @@ emulate_load_floatpair (unsigned long ifa, load_store_t ld, struct pt_regs *regs
 		 */
 		if (ld.x6_op == 1 || ld.x6_op == 3)
 			printk(KERN_ERR "%s: register update on speculative load pair, error\n",
-			       __FUNCTION__);
+			       __func__);
 
 		setreg(ld.r3, ifa, 0, regs);
 	}
@@ -1273,23 +1284,9 @@ emulate_store_float (unsigned long ifa, load_store_t ld, struct pt_regs *regs)
 /*
  * Make sure we log the unaligned access, so that user/sysadmin can notice it and
  * eventually fix the program.  However, we don't want to do that for every access so we
- * pace it with jiffies.  This isn't really MP-safe, but it doesn't really have to be
- * either...
+ * pace it with jiffies.
  */
-static int
-within_logging_rate_limit (void)
-{
-	static unsigned long count, last_time;
-
-	if (jiffies - last_time > 5*HZ)
-		count = 0;
-	if (++count < 5) {
-		last_time = jiffies;
-		return 1;
-	}
-	return 0;
-
-}
+static DEFINE_RATELIMIT_STATE(logging_rate_limit, 5 * HZ, 5);
 
 void
 ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
@@ -1308,7 +1305,8 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 
 	if (ia64_psr(regs)->be) {
 		/* we don't support big-endian accesses */
-		die_if_kernel("big-endian unaligned accesses are not supported", regs, 0);
+		if (die_if_kernel("big-endian unaligned accesses are not supported", regs, 0))
+			return;
 		goto force_sigbus;
 	}
 
@@ -1323,14 +1321,16 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 		if ((current->thread.flags & IA64_THREAD_UAC_SIGBUS) != 0)
 			goto force_sigbus;
 
-		if (!(current->thread.flags & IA64_THREAD_UAC_NOPRINT)
-		    && within_logging_rate_limit())
+		if (!no_unaligned_warning &&
+		    !(current->thread.flags & IA64_THREAD_UAC_NOPRINT) &&
+		    __ratelimit(&logging_rate_limit))
 		{
 			char buf[200];	/* comm[] is at most 16 bytes... */
 			size_t len;
 
 			len = sprintf(buf, "%s(%d): unaligned access to 0x%016lx, "
-				      "ip=0x%016lx\n\r", current->comm, current->pid,
+				      "ip=0x%016lx\n\r", current->comm,
+				      task_pid_nr(current),
 				      ifa, regs->cr_iip + ipsr->ri);
 			/*
 			 * Don't call tty_write_message() if we're in the kernel; we might
@@ -1339,12 +1339,29 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 			if (user_mode(regs))
 				tty_write_message(current->signal->tty, buf);
 			buf[len-1] = '\0';	/* drop '\r' */
-			printk(KERN_WARNING "%s", buf);	/* watch for command names containing %s */
+			/* watch for command names containing %s */
+			printk(KERN_WARNING "%s", buf);
+		} else {
+			if (no_unaligned_warning) {
+				printk_once(KERN_WARNING "%s(%d) encountered an "
+				       "unaligned exception which required\n"
+				       "kernel assistance, which degrades "
+				       "the performance of the application.\n"
+				       "Unaligned exception warnings have "
+				       "been disabled by the system "
+				       "administrator\n"
+				       "echo 0 > /proc/sys/kernel/ignore-"
+				       "unaligned-usertrap to re-enable\n",
+				       current->comm, task_pid_nr(current));
+			}
 		}
 	} else {
-		if (within_logging_rate_limit())
+		if (__ratelimit(&logging_rate_limit)) {
 			printk(KERN_WARNING "kernel unaligned access to 0x%016lx, ip=0x%016lx\n",
 			       ifa, regs->cr_iip + ipsr->ri);
+			if (unaligned_dump_stack)
+				dump_stack();
+		}
 		set_fs(KERNEL_DS);
 	}
 
@@ -1462,16 +1479,19 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 	      case LDFA_OP:
 	      case LDFCCLR_OP:
 	      case LDFCNC_OP:
-	      case LDF_IMM_OP:
-	      case LDFA_IMM_OP:
-	      case LDFCCLR_IMM_OP:
-	      case LDFCNC_IMM_OP:
 		if (u.insn.x)
 			ret = emulate_load_floatpair(ifa, u.insn, regs);
 		else
 			ret = emulate_load_float(ifa, u.insn, regs);
 		break;
 
+	      case LDF_IMM_OP:
+	      case LDFA_IMM_OP:
+	      case LDFCCLR_IMM_OP:
+	      case LDFCNC_IMM_OP:
+		ret = emulate_load_float(ifa, u.insn, regs);
+		break;
+
 	      case STF_OP:
 	      case STF_IMM_OP:
 		ret = emulate_store_float(ifa, u.insn, regs);
@@ -1505,7 +1525,8 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 			ia64_handle_exception(regs, eh);
 			goto done;
 		}
-		die_if_kernel("error during unaligned kernel access\n", regs, ret);
+		if (die_if_kernel("error during unaligned kernel access\n", regs, ret))
+			return;
 		/* NOT_REACHED */
 	}
   force_sigbus:
diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index c6d40446c2c..20e8a9b21d7 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2001-2005 Silicon Graphics, Inc.  All rights reserved.
+ * Copyright (C) 2001-2008 Silicon Graphics, Inc.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License
@@ -18,79 +18,97 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/string.h>
-#include <linux/slab.h>
 #include <linux/efi.h>
 #include <linux/genalloc.h>
+#include <linux/gfp.h>
 #include <asm/page.h>
 #include <asm/pal.h>
-#include <asm/system.h>
 #include <asm/pgtable.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/tlbflush.h>
 #include <asm/sn/arch.h>
 
-#define DEBUG	0
 
-#if DEBUG
-#define dprintk			printk
-#else
-#define dprintk(x...)		do { } while (0)
-#endif
+extern void __init efi_memmap_walk_uc(efi_freemem_callback_t, void *);
 
-void __init efi_memmap_walk_uc (efi_freemem_callback_t callback);
+struct uncached_pool {
+	struct gen_pool *pool;
+	struct mutex add_chunk_mutex;	/* serialize adding a converted chunk */
+	int nchunks_added;		/* #of converted chunks added to pool */
+	atomic_t status;		/* smp called function's return status*/
+};
 
-#define MAX_UNCACHED_GRANULES	5
-static int allocated_granules;
+#define MAX_CONVERTED_CHUNKS_PER_NODE	2
 
-struct gen_pool *uncached_pool[MAX_NUMNODES];
+struct uncached_pool uncached_pools[MAX_NUMNODES];
 
 
 static void uncached_ipi_visibility(void *data)
 {
 	int status;
+	struct uncached_pool *uc_pool = (struct uncached_pool *)data;
 
 	status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL);
 	if ((status != PAL_VISIBILITY_OK) &&
 	    (status != PAL_VISIBILITY_OK_REMOTE_NEEDED))
-		printk(KERN_DEBUG "pal_prefetch_visibility() returns %i on "
-		       "CPU %i\n", status, get_cpu());
+		atomic_inc(&uc_pool->status);
 }
 
 
 static void uncached_ipi_mc_drain(void *data)
 {
 	int status;
+	struct uncached_pool *uc_pool = (struct uncached_pool *)data;
+
 	status = ia64_pal_mc_drain();
-	if (status)
-		printk(KERN_WARNING "ia64_pal_mc_drain() failed with %i on "
-		       "CPU %i\n", status, get_cpu());
+	if (status != PAL_STATUS_SUCCESS)
+		atomic_inc(&uc_pool->status);
 }
 
 
-static unsigned long
-uncached_get_new_chunk(struct gen_pool *poolp)
+/*
+ * Add a new chunk of uncached memory pages to the specified pool.
+ *
+ * @pool: pool to add new chunk of uncached memory to
+ * @nid: node id of node to allocate memory from, or -1
+ *
+ * This is accomplished by first allocating a granule of cached memory pages
+ * and then converting them to uncached memory pages.
+ */
+static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
 {
 	struct page *page;
-	void *tmp;
-	int status, i;
-	unsigned long addr, node;
+	int status, i, nchunks_added = uc_pool->nchunks_added;
+	unsigned long c_addr, uc_addr;
 
-	if (allocated_granules >= MAX_UNCACHED_GRANULES)
+	if (mutex_lock_interruptible(&uc_pool->add_chunk_mutex) != 0)
+		return -1;	/* interrupted by a signal */
+
+	if (uc_pool->nchunks_added > nchunks_added) {
+		/* someone added a new chunk while we were waiting */
+		mutex_unlock(&uc_pool->add_chunk_mutex);
 		return 0;
+	}
+
+	if (uc_pool->nchunks_added >= MAX_CONVERTED_CHUNKS_PER_NODE) {
+		mutex_unlock(&uc_pool->add_chunk_mutex);
+		return -1;
+	}
+
+	/* attempt to allocate a granule's worth of cached memory pages */
 
-	node = poolp->private;
-	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
+	page = alloc_pages_exact_node(nid,
+				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
 				IA64_GRANULE_SHIFT-PAGE_SHIFT);
+	if (!page) {
+		mutex_unlock(&uc_pool->add_chunk_mutex);
+		return -1;
+	}
 
-	dprintk(KERN_INFO "get_new_chunk page %p, addr %lx\n",
-		page, (unsigned long)(page-vmem_map) << PAGE_SHIFT);
+	/* convert the memory pages from cached to uncached */
 
-	/*
-	 * Do magic if no mem on local node! XXX
-	 */
-	if (!page)
-		return 0;
-	tmp = page_address(page);
+	c_addr = (unsigned long)page_address(page);
+	uc_addr = c_addr - PAGE_OFFSET + __IA64_UNCACHED_OFFSET;
 
 	/*
 	 * There's a small race here where it's possible for someone to
@@ -100,76 +118,99 @@ uncached_get_new_chunk(struct gen_pool *poolp)
 	for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++)
 		SetPageUncached(&page[i]);
 
-	flush_tlb_kernel_range(tmp, tmp + IA64_GRANULE_SIZE);
+	flush_tlb_kernel_range(uc_addr, uc_addr + IA64_GRANULE_SIZE);
 
 	status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL);
+	if (status == PAL_VISIBILITY_OK_REMOTE_NEEDED) {
+		atomic_set(&uc_pool->status, 0);
+		status = smp_call_function(uncached_ipi_visibility, uc_pool, 1);
+		if (status || atomic_read(&uc_pool->status))
+			goto failed;
+	} else if (status != PAL_VISIBILITY_OK)
+		goto failed;
 
-	dprintk(KERN_INFO "pal_prefetch_visibility() returns %i on cpu %i\n",
-		status, get_cpu());
-
-	if (!status) {
-		status = smp_call_function(uncached_ipi_visibility, NULL, 0, 1);
-		if (status)
-			printk(KERN_WARNING "smp_call_function failed for "
-			       "uncached_ipi_visibility! (%i)\n", status);
-	}
+	preempt_disable();
 
 	if (ia64_platform_is("sn2"))
-		sn_flush_all_caches((unsigned long)tmp, IA64_GRANULE_SIZE);
+		sn_flush_all_caches(uc_addr, IA64_GRANULE_SIZE);
 	else
-		flush_icache_range((unsigned long)tmp,
-				   (unsigned long)tmp+IA64_GRANULE_SIZE);
+		flush_icache_range(uc_addr, uc_addr + IA64_GRANULE_SIZE);
 
-	ia64_pal_mc_drain();
-	status = smp_call_function(uncached_ipi_mc_drain, NULL, 0, 1);
+	/* flush the just introduced uncached translation from the TLB */
+	local_flush_tlb_all();
+
+	preempt_enable();
+
+	status = ia64_pal_mc_drain();
+	if (status != PAL_STATUS_SUCCESS)
+		goto failed;
+	atomic_set(&uc_pool->status, 0);
+	status = smp_call_function(uncached_ipi_mc_drain, uc_pool, 1);
+	if (status || atomic_read(&uc_pool->status))
+		goto failed;
+
+	/*
+	 * The chunk of memory pages has been converted to uncached so now we
+	 * can add it to the pool.
+	 */
+	status = gen_pool_add(uc_pool->pool, uc_addr, IA64_GRANULE_SIZE, nid);
 	if (status)
-		printk(KERN_WARNING "smp_call_function failed for "
-		       "uncached_ipi_mc_drain! (%i)\n", status);
+		goto failed;
+
+	uc_pool->nchunks_added++;
+	mutex_unlock(&uc_pool->add_chunk_mutex);
+	return 0;
 
-	addr = (unsigned long)tmp - PAGE_OFFSET + __IA64_UNCACHED_OFFSET;
+	/* failed to convert or add the chunk so give it back to the kernel */
+failed:
+	for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++)
+		ClearPageUncached(&page[i]);
 
-	allocated_granules++;
-	return addr;
+	free_pages(c_addr, IA64_GRANULE_SHIFT-PAGE_SHIFT);
+	mutex_unlock(&uc_pool->add_chunk_mutex);
+	return -1;
 }
 
 
 /*
  * uncached_alloc_page
  *
- * Allocate 1 uncached page. Allocates on the requested node. If no
- * uncached pages are available on the requested node, roundrobin starting
- * with higher nodes.
+ * @starting_nid: node id of node to start with, or -1
+ * @n_pages: number of contiguous pages to allocate
+ *
+ * Allocate the specified number of contiguous uncached pages on the
+ * the requested node. If not enough contiguous uncached pages are available
+ * on the requested node, roundrobin starting with the next higher node.
  */
-unsigned long
-uncached_alloc_page(int nid)
+unsigned long uncached_alloc_page(int starting_nid, int n_pages)
 {
-	unsigned long maddr;
+	unsigned long uc_addr;
+	struct uncached_pool *uc_pool;
+	int nid;
 
-	maddr = gen_pool_alloc(uncached_pool[nid], PAGE_SIZE);
+	if (unlikely(starting_nid >= MAX_NUMNODES))
+		return 0;
 
-	dprintk(KERN_DEBUG "uncached_alloc_page returns %lx on node %i\n",
-		maddr, nid);
+	if (starting_nid < 0)
+		starting_nid = numa_node_id();
+	nid = starting_nid;
 
-	/*
-	 * If no memory is availble on our local node, try the
-	 * remaining nodes in the system.
-	 */
-	if (!maddr) {
-		int i;
-
-		for (i = MAX_NUMNODES - 1; i >= 0; i--) {
-			if (i == nid || !node_online(i))
-				continue;
-			maddr = gen_pool_alloc(uncached_pool[i], PAGE_SIZE);
-			dprintk(KERN_DEBUG "uncached_alloc_page alternate search "
-				"returns %lx on node %i\n", maddr, i);
-			if (maddr) {
-				break;
-			}
-		}
-	}
+	do {
+		if (!node_state(nid, N_HIGH_MEMORY))
+			continue;
+		uc_pool = &uncached_pools[nid];
+		if (uc_pool->pool == NULL)
+			continue;
+		do {
+			uc_addr = gen_pool_alloc(uc_pool->pool,
+						 n_pages * PAGE_SIZE);
+			if (uc_addr != 0)
+				return uc_addr;
+		} while (uncached_add_chunk(uc_pool, nid) == 0);
 
-	return maddr;
+	} while ((nid = (nid + 1) % MAX_NUMNODES) != starting_nid);
+
+	return 0;
 }
 EXPORT_SYMBOL(uncached_alloc_page);
 
@@ -177,21 +218,23 @@ EXPORT_SYMBOL(uncached_alloc_page);
 /*
  * uncached_free_page
  *
- * Free a single uncached page.
+ * @uc_addr: uncached address of first page to free
+ * @n_pages: number of contiguous pages to free
+ *
+ * Free the specified number of uncached pages.
  */
-void
-uncached_free_page(unsigned long maddr)
+void uncached_free_page(unsigned long uc_addr, int n_pages)
 {
-	int node;
+	int nid = paddr_to_nid(uc_addr - __IA64_UNCACHED_OFFSET);
+	struct gen_pool *pool = uncached_pools[nid].pool;
 
-	node = paddr_to_nid(maddr - __IA64_UNCACHED_OFFSET);
+	if (unlikely(pool == NULL))
+		return;
 
-	dprintk(KERN_DEBUG "uncached_free_page(%lx) on node %i\n", maddr, node);
+	if ((uc_addr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET)
+		panic("uncached_free_page invalid address %lx\n", uc_addr);
 
-	if ((maddr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET)
-		panic("uncached_free_page invalid address %lx\n", maddr);
-
-	gen_pool_free(uncached_pool[node], maddr, PAGE_SIZE);
+	gen_pool_free(pool, uc_addr, n_pages * PAGE_SIZE);
 }
 EXPORT_SYMBOL(uncached_free_page);
 
@@ -199,42 +242,39 @@ EXPORT_SYMBOL(uncached_free_page);
 /*
  * uncached_build_memmap,
  *
+ * @uc_start: uncached starting address of a chunk of uncached memory
+ * @uc_end: uncached ending address of a chunk of uncached memory
+ * @arg: ignored, (NULL argument passed in on call to efi_memmap_walk_uc())
+ *
  * Called at boot time to build a map of pages that can be used for
  * memory special operations.
  */
-static int __init
-uncached_build_memmap(unsigned long start, unsigned long end, void *arg)
+static int __init uncached_build_memmap(u64 uc_start, u64 uc_end, void *arg)
 {
-	long length = end - start;
-	int node;
-
-	dprintk(KERN_ERR "uncached_build_memmap(%lx %lx)\n", start, end);
+	int nid = paddr_to_nid(uc_start - __IA64_UNCACHED_OFFSET);
+	struct gen_pool *pool = uncached_pools[nid].pool;
+	size_t size = uc_end - uc_start;
 
-	memset((char *)start, 0, length);
+	touch_softlockup_watchdog();
 
-	node = paddr_to_nid(start - __IA64_UNCACHED_OFFSET);
-
-	for (; start < end ; start += PAGE_SIZE) {
-		dprintk(KERN_INFO "sticking %lx into the pool!\n", start);
-		gen_pool_free(uncached_pool[node], start, PAGE_SIZE);
+	if (pool != NULL) {
+		memset((char *)uc_start, 0, size);
+		(void) gen_pool_add(pool, uc_start, size, nid);
 	}
-
 	return 0;
 }
 
 
-static int __init uncached_init(void) {
-	int i;
+static int __init uncached_init(void)
+{
+	int nid;
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		if (!node_online(i))
-			continue;
-		uncached_pool[i] = gen_pool_create(0, IA64_GRANULE_SHIFT,
-						   &uncached_get_new_chunk, i);
+	for_each_node_state(nid, N_ONLINE) {
+		uncached_pools[nid].pool = gen_pool_create(PAGE_SHIFT, nid);
+		mutex_init(&uncached_pools[nid].add_chunk_mutex);
 	}
 
-	efi_memmap_walk_uc(uncached_build_memmap);
-
+	efi_memmap_walk_uc(uncached_build_memmap, NULL);
 	return 0;
 }
 
diff --git a/arch/ia64/kernel/unwind.c b/arch/ia64/kernel/unwind.c
index 93d5a3b41f6..8f66195999e 100644
--- a/arch/ia64/kernel/unwind.c
+++ b/arch/ia64/kernel/unwind.c
@@ -2,7 +2,7 @@
  * Copyright (C) 1999-2004 Hewlett-Packard Co
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  * Copyright (C) 2003 Fenghua Yu <fenghua.yu@intel.com>
- * 	- Change pt_regs_off() to make it less dependant on pt_regs structure.
+ * 	- Change pt_regs_off() to make it less dependent on pt_regs structure.
  */
 /*
  * This file implements call frame unwind support for the Linux
@@ -41,7 +41,6 @@
 #include <asm/ptrace_offsets.h>
 #include <asm/rse.h>
 #include <asm/sections.h>
-#include <asm/system.h>
 #include <asm/uaccess.h>
 
 #include "entry.h"
@@ -60,6 +59,7 @@
 #  define UNW_DEBUG_ON(n)	unw_debug_level >= n
    /* Do not code a printk level, not all debug lines end in newline */
 #  define UNW_DPRINT(n, ...)  if (UNW_DEBUG_ON(n)) printk(__VA_ARGS__)
+#  undef inline
 #  define inline
 #else /* !UNW_DEBUG */
 #  define UNW_DEBUG_ON(n)  0
@@ -145,7 +145,7 @@ static struct {
 # endif
 } unw = {
 	.tables = &unw.kernel_table,
-	.lock = SPIN_LOCK_UNLOCKED,
+	.lock = __SPIN_LOCK_UNLOCKED(unw.lock),
 	.save_order = {
 		UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR,
 		UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR
@@ -256,7 +256,7 @@ pt_regs_off (unsigned long reg)
 		off = unw.pt_regs_offsets[reg];
 
 	if (off < 0) {
-		UNW_DPRINT(0, "unwind.%s: bad scratch reg r%lu\n", __FUNCTION__, reg);
+		UNW_DPRINT(0, "unwind.%s: bad scratch reg r%lu\n", __func__, reg);
 		off = 0;
 	}
 	return (unsigned long) off;
@@ -267,13 +267,13 @@ get_scratch_regs (struct unw_frame_info *info)
 {
 	if (!info->pt) {
 		/* This should not happen with valid unwind info.  */
-		UNW_DPRINT(0, "unwind.%s: bad unwind info: resetting info->pt\n", __FUNCTION__);
+		UNW_DPRINT(0, "unwind.%s: bad unwind info: resetting info->pt\n", __func__);
 		if (info->flags & UNW_FLAG_INTERRUPT_FRAME)
 			info->pt = (unsigned long) ((struct pt_regs *) info->psp - 1);
 		else
 			info->pt = info->sp - 16;
 	}
-	UNW_DPRINT(3, "unwind.%s: sp 0x%lx pt 0x%lx\n", __FUNCTION__, info->sp, info->pt);
+	UNW_DPRINT(3, "unwind.%s: sp 0x%lx pt 0x%lx\n", __func__, info->sp, info->pt);
 	return (struct pt_regs *) info->pt;
 }
 
@@ -293,7 +293,7 @@ unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char
 			return 0;
 		}
 		UNW_DPRINT(0, "unwind.%s: trying to access non-existent r%u\n",
-			   __FUNCTION__, regnum);
+			   __func__, regnum);
 		return -1;
 	}
 
@@ -340,7 +340,7 @@ unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char
 					{
 						UNW_DPRINT(0, "unwind.%s: %p outside of regstk "
 							"[0x%lx-0x%lx)\n",
-							__FUNCTION__, (void *) addr,
+							__func__, (void *) addr,
 							info->regstk.limit,
 							info->regstk.top);
 						return -1;
@@ -373,7 +373,7 @@ unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char
 		    || (unsigned long) addr >= info->regstk.top)
 		{
 			UNW_DPRINT(0, "unwind.%s: ignoring attempt to access register outside "
-				   "of rbs\n",  __FUNCTION__);
+				   "of rbs\n",  __func__);
 			return -1;
 		}
 		if ((unsigned long) nat_addr >= info->regstk.top)
@@ -384,7 +384,7 @@ unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char
 	if (write) {
 		if (read_only(addr)) {
 			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__FUNCTION__);
+				__func__);
 		} else {
 			*addr = *val;
 			if (*nat)
@@ -426,13 +426,13 @@ unw_access_br (struct unw_frame_info *info, int regnum, unsigned long *val, int
 
 	      default:
 		UNW_DPRINT(0, "unwind.%s: trying to access non-existent b%u\n",
-			   __FUNCTION__, regnum);
+			   __func__, regnum);
 		return -1;
 	}
 	if (write)
 		if (read_only(addr)) {
 			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__FUNCTION__);
+				__func__);
 		} else
 			*addr = *val;
 	else
@@ -449,7 +449,7 @@ unw_access_fr (struct unw_frame_info *info, int regnum, struct ia64_fpreg *val,
 
 	if ((unsigned) (regnum - 2) >= 126) {
 		UNW_DPRINT(0, "unwind.%s: trying to access non-existent f%u\n",
-			   __FUNCTION__, regnum);
+			   __func__, regnum);
 		return -1;
 	}
 
@@ -481,7 +481,7 @@ unw_access_fr (struct unw_frame_info *info, int regnum, struct ia64_fpreg *val,
 	if (write)
 		if (read_only(addr)) {
 			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__FUNCTION__);
+				__func__);
 		} else
 			*addr = *val;
 	else
@@ -571,14 +571,14 @@ unw_access_ar (struct unw_frame_info *info, int regnum, unsigned long *val, int
 
 	      default:
 		UNW_DPRINT(0, "unwind.%s: trying to access non-existent ar%u\n",
-			   __FUNCTION__, regnum);
+			   __func__, regnum);
 		return -1;
 	}
 
 	if (write) {
 		if (read_only(addr)) {
 			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__FUNCTION__);
+				__func__);
 		} else
 			*addr = *val;
 	} else
@@ -599,7 +599,7 @@ unw_access_pr (struct unw_frame_info *info, unsigned long *val, int write)
 	if (write) {
 		if (read_only(addr)) {
 			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__FUNCTION__);
+				__func__);
 		} else
 			*addr = *val;
 	} else
@@ -698,7 +698,7 @@ decode_abreg (unsigned char abreg, int memory)
 	      default:
 		break;
 	}
-	UNW_DPRINT(0, "unwind.%s: bad abreg=0x%x\n", __FUNCTION__, abreg);
+	UNW_DPRINT(0, "unwind.%s: bad abreg=0x%x\n", __func__, abreg);
 	return UNW_REG_LC;
 }
 
@@ -738,7 +738,7 @@ spill_next_when (struct unw_reg_info **regp, struct unw_reg_info *lim, unw_word
 			return;
 		}
 	}
-	UNW_DPRINT(0, "unwind.%s: excess spill!\n",  __FUNCTION__);
+	UNW_DPRINT(0, "unwind.%s: excess spill!\n",  __func__);
 }
 
 static inline void
@@ -854,11 +854,11 @@ desc_abi (unsigned char abi, unsigned char context, struct unw_state_record *sr)
 {
 	if (abi == 3 && context == 'i') {
 		sr->flags |= UNW_FLAG_INTERRUPT_FRAME;
-		UNW_DPRINT(3, "unwind.%s: interrupt frame\n",  __FUNCTION__);
+		UNW_DPRINT(3, "unwind.%s: interrupt frame\n",  __func__);
 	}
 	else
 		UNW_DPRINT(0, "unwind%s: ignoring unwabi(abi=0x%x,context=0x%x)\n",
-				__FUNCTION__, abi, context);
+				__func__, abi, context);
 }
 
 static inline void
@@ -1203,10 +1203,10 @@ desc_spill_sprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word
 static inline unw_hash_index_t
 hash (unsigned long ip)
 {
-#	define hashmagic	0x9e3779b97f4a7c16UL	/* based on (sqrt(5)/2-1)*2^64 */
+	/* magic number = ((sqrt(5)-1)/2)*2^64 */
+	static const unsigned long hashmagic = 0x9e3779b97f4a7c16UL;
 
-	return (ip >> 4)*hashmagic >> (64 - UNW_LOG_HASH_SIZE);
-#undef hashmagic
+	return (ip >> 4) * hashmagic >> (64 - UNW_LOG_HASH_SIZE);
 }
 
 static inline long
@@ -1346,7 +1346,7 @@ script_emit (struct unw_script *script, struct unw_insn insn)
 {
 	if (script->count >= UNW_MAX_SCRIPT_LEN) {
 		UNW_DPRINT(0, "unwind.%s: script exceeds maximum size of %u instructions!\n",
-			__FUNCTION__, UNW_MAX_SCRIPT_LEN);
+			__func__, UNW_MAX_SCRIPT_LEN);
 		return;
 	}
 	script->insn[script->count++] = insn;
@@ -1388,7 +1388,7 @@ emit_nat_info (struct unw_state_record *sr, int i, struct unw_script *script)
 
 	      default:
 		UNW_DPRINT(0, "unwind.%s: don't know how to emit nat info for where = %u\n",
-			   __FUNCTION__, r->where);
+			   __func__, r->where);
 		return;
 	}
 	insn.opc = opc;
@@ -1445,7 +1445,7 @@ compile_reg (struct unw_state_record *sr, int i, struct unw_script *script)
 				val = offsetof(struct pt_regs, f6) + 16*(rval - 6);
 			else
 				UNW_DPRINT(0, "unwind.%s: kernel may not touch f%lu\n",
-					   __FUNCTION__, rval);
+					   __func__, rval);
 		}
 		break;
 
@@ -1473,7 +1473,7 @@ compile_reg (struct unw_state_record *sr, int i, struct unw_script *script)
 
 	      default:
 		UNW_DPRINT(0, "unwind%s: register %u has unexpected `where' value of %u\n",
-			   __FUNCTION__, i, r->where);
+			   __func__, i, r->where);
 		break;
 	}
 	insn.opc = opc;
@@ -1530,7 +1530,7 @@ build_script (struct unw_frame_info *info)
 	struct unw_labeled_state *ls, *next;
 	unsigned long ip = info->ip;
 	struct unw_state_record sr;
-	struct unw_table *table;
+	struct unw_table *table, *prev;
 	struct unw_reg_info *r;
 	struct unw_insn insn;
 	u8 *dp, *desc_end;
@@ -1546,10 +1546,10 @@ build_script (struct unw_frame_info *info)
 		r->when = UNW_WHEN_NEVER;
 	sr.pr_val = info->pr;
 
-	UNW_DPRINT(3, "unwind.%s: ip 0x%lx\n", __FUNCTION__, ip);
+	UNW_DPRINT(3, "unwind.%s: ip 0x%lx\n", __func__, ip);
 	script = script_new(ip);
 	if (!script) {
-		UNW_DPRINT(0, "unwind.%s: failed to create unwind script\n",  __FUNCTION__);
+		UNW_DPRINT(0, "unwind.%s: failed to create unwind script\n",  __func__);
 		STAT(unw.stat.script.build_time += ia64_get_itc() - start);
 		return NULL;
 	}
@@ -1559,16 +1559,31 @@ build_script (struct unw_frame_info *info)
 
 	STAT(parse_start = ia64_get_itc());
 
+	prev = NULL;
 	for (table = unw.tables; table; table = table->next) {
 		if (ip >= table->start && ip < table->end) {
+			/*
+			 * Leave the kernel unwind table at the very front,
+			 * lest moving it breaks some assumption elsewhere.
+			 * Otherwise, move the matching table to the second
+			 * position in the list so that traversals can benefit
+			 * from commonality in backtrace paths.
+			 */
+			if (prev && prev != unw.tables) {
+				/* unw is safe - we're already spinlocked */
+				prev->next = table->next;
+				table->next = unw.tables->next;
+				unw.tables->next = table;
+			}
 			e = lookup(table, ip - table->segment_base);
 			break;
 		}
+		prev = table;
 	}
 	if (!e) {
 		/* no info, return default unwinder (leaf proc, no mem stack, no saved regs)  */
 		UNW_DPRINT(1, "unwind.%s: no unwind info for ip=0x%lx (prev ip=0x%lx)\n",
-			__FUNCTION__, ip, unw.cache[info->prev_script].ip);
+			__func__, ip, unw.cache[info->prev_script].ip);
 		sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR;
 		sr.curr.reg[UNW_REG_RP].when = -1;
 		sr.curr.reg[UNW_REG_RP].val = 0;
@@ -1617,13 +1632,13 @@ build_script (struct unw_frame_info *info)
 		sr.curr.reg[UNW_REG_RP].when = -1;
 		sr.curr.reg[UNW_REG_RP].val = sr.return_link_reg;
 		UNW_DPRINT(1, "unwind.%s: using default for rp at ip=0x%lx where=%d val=0x%lx\n",
-			   __FUNCTION__, ip, sr.curr.reg[UNW_REG_RP].where,
+			   __func__, ip, sr.curr.reg[UNW_REG_RP].where,
 			   sr.curr.reg[UNW_REG_RP].val);
 	}
 
 #ifdef UNW_DEBUG
 	UNW_DPRINT(1, "unwind.%s: state record for func 0x%lx, t=%u:\n",
-		__FUNCTION__, table->segment_base + e->start_offset, sr.when_target);
+		__func__, table->segment_base + e->start_offset, sr.when_target);
 	for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r) {
 		if (r->where != UNW_WHERE_NONE || r->when != UNW_WHEN_NEVER) {
 			UNW_DPRINT(1, "  %s <- ", unw.preg_name[r - sr.curr.reg]);
@@ -1745,7 +1760,7 @@ run_script (struct unw_script *script, struct unw_frame_info *state)
 			} else {
 				s[dst] = 0;
 				UNW_DPRINT(0, "unwind.%s: no state->pt, dst=%ld, val=%ld\n",
-					   __FUNCTION__, dst, val);
+					   __func__, dst, val);
 			}
 			break;
 
@@ -1755,7 +1770,7 @@ run_script (struct unw_script *script, struct unw_frame_info *state)
 			else {
 				s[dst] = 0;
 				UNW_DPRINT(0, "unwind.%s: UNW_INSN_MOVE_CONST bad val=%ld\n",
-					   __FUNCTION__, val);
+					   __func__, val);
 			}
 			break;
 
@@ -1790,7 +1805,7 @@ run_script (struct unw_script *script, struct unw_frame_info *state)
 			    || s[val] < TASK_SIZE)
 			{
 				UNW_DPRINT(0, "unwind.%s: rejecting bad psp=0x%lx\n",
-					   __FUNCTION__, s[val]);
+					   __func__, s[val]);
 				break;
 			}
 #endif
@@ -1824,7 +1839,7 @@ find_save_locs (struct unw_frame_info *info)
 	if ((info->ip & (local_cpu_data->unimpl_va_mask | 0xf)) || info->ip < TASK_SIZE) {
 		/* don't let obviously bad addresses pollute the cache */
 		/* FIXME: should really be level 0 but it occurs too often. KAO */
-		UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __FUNCTION__, info->ip);
+		UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __func__, info->ip);
 		info->rp_loc = NULL;
 		return -1;
 	}
@@ -1837,7 +1852,7 @@ find_save_locs (struct unw_frame_info *info)
 			spin_unlock_irqrestore(&unw.lock, flags);
 			UNW_DPRINT(0,
 				   "unwind.%s: failed to locate/build unwind script for ip %lx\n",
-				   __FUNCTION__, info->ip);
+				   __func__, info->ip);
 			return -1;
 		}
 		have_write_lock = 1;
@@ -1855,6 +1870,14 @@ find_save_locs (struct unw_frame_info *info)
 	return 0;
 }
 
+static int
+unw_valid(const struct unw_frame_info *info, unsigned long* p)
+{
+	unsigned long loc = (unsigned long)p;
+	return (loc >= info->regstk.limit && loc < info->regstk.top) ||
+	       (loc >= info->memstk.top && loc < info->memstk.limit);
+}
+
 int
 unw_unwind (struct unw_frame_info *info)
 {
@@ -1869,27 +1892,29 @@ unw_unwind (struct unw_frame_info *info)
 	prev_sp = info->sp;
 	prev_bsp = info->bsp;
 
-	/* restore the ip */
-	if (!info->rp_loc) {
+	/* validate the return IP pointer */
+	if (!unw_valid(info, info->rp_loc)) {
 		/* FIXME: should really be level 0 but it occurs too often. KAO */
 		UNW_DPRINT(1, "unwind.%s: failed to locate return link (ip=0x%lx)!\n",
-			   __FUNCTION__, info->ip);
+			   __func__, info->ip);
 		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
 		return -1;
 	}
+	/* restore the ip */
 	ip = info->ip = *info->rp_loc;
 	if (ip < GATE_ADDR) {
-		UNW_DPRINT(2, "unwind.%s: reached user-space (ip=0x%lx)\n", __FUNCTION__, ip);
+		UNW_DPRINT(2, "unwind.%s: reached user-space (ip=0x%lx)\n", __func__, ip);
 		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
 		return -1;
 	}
 
-	/* restore the cfm: */
-	if (!info->pfs_loc) {
-		UNW_DPRINT(0, "unwind.%s: failed to locate ar.pfs!\n", __FUNCTION__);
+	/* validate the previous stack frame pointer */
+	if (!unw_valid(info, info->pfs_loc)) {
+		UNW_DPRINT(0, "unwind.%s: failed to locate ar.pfs!\n", __func__);
 		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
 		return -1;
 	}
+	/* restore the cfm: */
 	info->cfm_loc = info->pfs_loc;
 
 	/* restore the bsp: */
@@ -1901,13 +1926,13 @@ unw_unwind (struct unw_frame_info *info)
 			num_regs = *info->cfm_loc & 0x7f;		/* size of frame */
 		info->pfs_loc =
 			(unsigned long *) (info->pt + offsetof(struct pt_regs, ar_pfs));
-		UNW_DPRINT(3, "unwind.%s: interrupt_frame pt 0x%lx\n", __FUNCTION__, info->pt);
+		UNW_DPRINT(3, "unwind.%s: interrupt_frame pt 0x%lx\n", __func__, info->pt);
 	} else
 		num_regs = (*info->cfm_loc >> 7) & 0x7f;	/* size of locals */
 	info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->bsp, -num_regs);
 	if (info->bsp < info->regstk.limit || info->bsp > info->regstk.top) {
 		UNW_DPRINT(0, "unwind.%s: bsp (0x%lx) out of range [0x%lx-0x%lx]\n",
-			__FUNCTION__, info->bsp, info->regstk.limit, info->regstk.top);
+			__func__, info->bsp, info->regstk.limit, info->regstk.top);
 		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
 		return -1;
 	}
@@ -1916,14 +1941,14 @@ unw_unwind (struct unw_frame_info *info)
 	info->sp = info->psp;
 	if (info->sp < info->memstk.top || info->sp > info->memstk.limit) {
 		UNW_DPRINT(0, "unwind.%s: sp (0x%lx) out of range [0x%lx-0x%lx]\n",
-			__FUNCTION__, info->sp, info->memstk.top, info->memstk.limit);
+			__func__, info->sp, info->memstk.top, info->memstk.limit);
 		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
 		return -1;
 	}
 
 	if (info->ip == prev_ip && info->sp == prev_sp && info->bsp == prev_bsp) {
 		UNW_DPRINT(0, "unwind.%s: ip, sp, bsp unchanged; stopping here (ip=0x%lx)\n",
-			   __FUNCTION__, ip);
+			   __func__, ip);
 		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
 		return -1;
 	}
@@ -1943,14 +1968,14 @@ EXPORT_SYMBOL(unw_unwind);
 int
 unw_unwind_to_user (struct unw_frame_info *info)
 {
-	unsigned long ip, sp, pr = 0;
+	unsigned long ip, sp, pr = info->pr;
 
-	while (unw_unwind(info) >= 0) {
+	do {
 		unw_get_sp(info, &sp);
 		if ((long)((unsigned long)info->task + IA64_STK_OFFSET - sp)
 		    < IA64_PT_REGS_SIZE) {
 			UNW_DPRINT(0, "unwind.%s: ran off the top of the kernel stack\n",
-				   __FUNCTION__);
+				   __func__);
 			break;
 		}
 		if (unw_is_intr_frame(info) &&
@@ -1960,13 +1985,13 @@ unw_unwind_to_user (struct unw_frame_info *info)
 			unw_get_rp(info, &ip);
 			UNW_DPRINT(0, "unwind.%s: failed to read "
 				   "predicate register (ip=0x%lx)\n",
-				__FUNCTION__, ip);
+				__func__, ip);
 			return -1;
 		}
-	}
+	} while (unw_unwind(info) >= 0);
 	unw_get_ip(info, &ip);
 	UNW_DPRINT(0, "unwind.%s: failed to unwind to user-level (ip=0x%lx)\n",
-		   __FUNCTION__, ip);
+		   __func__, ip);
 	return -1;
 }
 EXPORT_SYMBOL(unw_unwind_to_user);
@@ -1991,13 +2016,16 @@ init_frame_info (struct unw_frame_info *info, struct task_struct *t,
 	memset(info, 0, sizeof(*info));
 
 	rbslimit = (unsigned long) t + IA64_RBS_OFFSET;
+	stklimit = (unsigned long) t + IA64_STK_OFFSET;
+
 	rbstop   = sw->ar_bspstore;
-	if (rbstop - (unsigned long) t >= IA64_STK_OFFSET)
+	if (rbstop > stklimit || rbstop < rbslimit)
 		rbstop = rbslimit;
 
-	stklimit = (unsigned long) t + IA64_STK_OFFSET;
 	if (stktop <= rbstop)
 		stktop = rbstop;
+	if (stktop > stklimit)
+		stktop = stklimit;
 
 	info->regstk.limit = rbslimit;
 	info->regstk.top   = rbstop;
@@ -2014,7 +2042,7 @@ init_frame_info (struct unw_frame_info *info, struct task_struct *t,
 		   "  pr     0x%lx\n"
 		   "  sw     0x%lx\n"
 		   "  sp     0x%lx\n",
-		   __FUNCTION__, (unsigned long) t, rbslimit, rbstop, stktop, stklimit,
+		   __func__, (unsigned long) t, rbslimit, rbstop, stktop, stklimit,
 		   info->pr, (unsigned long) info->sw, info->sp);
 	STAT(unw.stat.api.init_time += ia64_get_itc() - start; local_irq_restore(flags));
 }
@@ -2033,7 +2061,7 @@ unw_init_frame_info (struct unw_frame_info *info, struct task_struct *t, struct
 		   "  bsp    0x%lx\n"
 		   "  sol    0x%lx\n"
 		   "  ip     0x%lx\n",
-		   __FUNCTION__, info->bsp, sol, info->ip);
+		   __func__, info->bsp, sol, info->ip);
 	find_save_locs(info);
 }
 
@@ -2044,7 +2072,7 @@ unw_init_from_blocked_task (struct unw_frame_info *info, struct task_struct *t)
 {
 	struct switch_stack *sw = (struct switch_stack *) (t->thread.ksp + 16);
 
-	UNW_DPRINT(1, "unwind.%s\n", __FUNCTION__);
+	UNW_DPRINT(1, "unwind.%s\n", __func__);
 	unw_init_frame_info(info, t, sw);
 }
 EXPORT_SYMBOL(unw_init_from_blocked_task);
@@ -2074,7 +2102,7 @@ unw_add_unwind_table (const char *name, unsigned long segment_base, unsigned lon
 
 	if (end - start <= 0) {
 		UNW_DPRINT(0, "unwind.%s: ignoring attempt to insert empty unwind table\n",
-			   __FUNCTION__);
+			   __func__);
 		return NULL;
 	}
 
@@ -2105,14 +2133,14 @@ unw_remove_unwind_table (void *handle)
 
 	if (!handle) {
 		UNW_DPRINT(0, "unwind.%s: ignoring attempt to remove non-existent unwind table\n",
-			   __FUNCTION__);
+			   __func__);
 		return;
 	}
 
 	table = handle;
 	if (table == &unw.kernel_table) {
 		UNW_DPRINT(0, "unwind.%s: sorry, freeing the kernel's unwind table is a "
-			   "no-can-do!\n", __FUNCTION__);
+			   "no-can-do!\n", __func__);
 		return;
 	}
 
@@ -2125,7 +2153,7 @@ unw_remove_unwind_table (void *handle)
 				break;
 		if (!prev) {
 			UNW_DPRINT(0, "unwind.%s: failed to find unwind table %p\n",
-				   __FUNCTION__, (void *) table);
+				   __func__, (void *) table);
 			spin_unlock_irqrestore(&unw.lock, flags);
 			return;
 		}
@@ -2135,7 +2163,7 @@ unw_remove_unwind_table (void *handle)
 
 	/* next, remove hash table entries for this table */
 
-	for (index = 0; index <= UNW_HASH_SIZE; ++index) {
+	for (index = 0; index < UNW_HASH_SIZE; ++index) {
 		tmp = unw.cache + unw.hash[index];
 		if (unw.hash[index] >= UNW_CACHE_SIZE
 		    || tmp->ip < table->start || tmp->ip >= table->end)
@@ -2171,7 +2199,7 @@ create_gate_table (void)
 		}
 
 	if (!punw) {
-		printk("%s: failed to find gate DSO's unwind table!\n", __FUNCTION__);
+		printk("%s: failed to find gate DSO's unwind table!\n", __func__);
 		return 0;
 	}
 
@@ -2188,7 +2216,7 @@ create_gate_table (void)
 	unw.gate_table = kmalloc(size, GFP_KERNEL);
 	if (!unw.gate_table) {
 		unw.gate_table_size = 0;
-		printk(KERN_ERR "%s: unable to create unwind data for gate page!\n", __FUNCTION__);
+		printk(KERN_ERR "%s: unable to create unwind data for gate page!\n", __func__);
 		return 0;
 	}
 	unw.gate_table_size = size;
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 30d8564e960..84f8a52ac5a 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -1,257 +1,248 @@
-#include <linux/config.h>
 
 #include <asm/cache.h>
 #include <asm/ptrace.h>
-#include <asm/system.h>
 #include <asm/pgtable.h>
 
-#define LOAD_OFFSET	(KERNEL_START - KERNEL_TR_PAGE_SIZE)
 #include <asm-generic/vmlinux.lds.h>
 
-#define IVT_TEXT							\
-		VMLINUX_SYMBOL(__start_ivt_text) = .;			\
-		*(.text.ivt)						\
-		VMLINUX_SYMBOL(__end_ivt_text) = .;
-
 OUTPUT_FORMAT("elf64-ia64-little")
 OUTPUT_ARCH(ia64)
 ENTRY(phys_start)
 jiffies = jiffies_64;
+
 PHDRS {
-  code   PT_LOAD;
-  percpu PT_LOAD;
-  data   PT_LOAD;
+	code   PT_LOAD;
+	percpu PT_LOAD;
+	data   PT_LOAD;
+	note   PT_NOTE;
+	unwind 0x70000001; /* PT_IA_64_UNWIND, but ld doesn't match the name */
 }
-SECTIONS
-{
-  /* Sections to be discarded */
-  /DISCARD/ : {
-	*(.exit.text)
-	*(.exit.data)
-	*(.exitcall.exit)
-	*(.IA_64.unwind.exit.text)
-	*(.IA_64.unwind_info.exit.text)
-	}
 
-  v = PAGE_OFFSET;	/* this symbol is here to make debugging easier... */
-  phys_start = _start - LOAD_OFFSET;
-
-  code : { } :code
-  . = KERNEL_START;
-
-  _text = .;
-  _stext = .;
-
-  .text : AT(ADDR(.text) - LOAD_OFFSET)
-    {
-	IVT_TEXT
-	*(.text)
-	SCHED_TEXT
-	LOCK_TEXT
-	KPROBES_TEXT
-	*(.gnu.linkonce.t*)
-    }
-  .text2 : AT(ADDR(.text2) - LOAD_OFFSET)
-	{ *(.text2) }
-#ifdef CONFIG_SMP
-  .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET)
-	{ *(.text.lock) }
-#endif
-  _etext = .;
+SECTIONS {
+	/*
+	 * unwind exit sections must be discarded before
+	 * the rest of the sections get included.
+	 */
+	/DISCARD/ : {
+		*(.IA_64.unwind.exit.text)
+		*(.IA_64.unwind_info.exit.text)
+		*(.comment)
+		*(.note)
+	}
 
-  /* Read-only data */
+	v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */
+	phys_start = _start - LOAD_OFFSET;
+
+	code : {
+	} :code
+	. = KERNEL_START;
+
+	_text = .;
+	_stext = .;
+
+	.text : AT(ADDR(.text) - LOAD_OFFSET) {
+		__start_ivt_text = .;
+		*(.text..ivt)
+		__end_ivt_text = .;
+		TEXT_TEXT
+		SCHED_TEXT
+		LOCK_TEXT
+		KPROBES_TEXT
+		*(.gnu.linkonce.t*)
+	}
 
-  /* Exception table */
-  . = ALIGN(16);
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET)
-	{
-	  __start___ex_table = .;
-	  *(__ex_table)
-	  __stop___ex_table = .;
+	.text2 : AT(ADDR(.text2) - LOAD_OFFSET)	{
+		*(.text2)
 	}
 
-  .data.patch.vtop : AT(ADDR(.data.patch.vtop) - LOAD_OFFSET)
-	{
-	  __start___vtop_patchlist = .;
-	  *(.data.patch.vtop)
-	  __end___vtop_patchlist = .;
+#ifdef CONFIG_SMP
+	.text..lock : AT(ADDR(.text..lock) - LOAD_OFFSET) {
+		*(.text..lock)
+	}
+#endif
+	_etext = .;
+
+	/*
+	 * Read-only data
+	 */
+	NOTES :code :note       /* put .notes in text and mark in PT_NOTE  */
+	code_continues : {
+	} : code               /* switch back to regular program...  */
+
+	EXCEPTION_TABLE(16)
+
+	/* MCA table */
+	. = ALIGN(16);
+	__mca_table : AT(ADDR(__mca_table) - LOAD_OFFSET) {
+		__start___mca_table = .;
+		*(__mca_table)
+		__stop___mca_table = .;
 	}
 
-  .data.patch.mckinley_e9 : AT(ADDR(.data.patch.mckinley_e9) - LOAD_OFFSET)
-	{
-	  __start___mckinley_e9_bundles = .;
-	  *(.data.patch.mckinley_e9)
-	  __end___mckinley_e9_bundles = .;
+	.data..patch.phys_stack_reg : AT(ADDR(.data..patch.phys_stack_reg) - LOAD_OFFSET) {
+		__start___phys_stack_reg_patchlist = .;
+		*(.data..patch.phys_stack_reg)
+		__end___phys_stack_reg_patchlist = .;
 	}
 
-  /* Global data */
-  _data = .;
+	/*
+	 * Global data
+	 */
+	_data = .;
 
-#if defined(CONFIG_IA64_GENERIC)
-  /* Machine Vector */
-  . = ALIGN(16);
-  .machvec : AT(ADDR(.machvec) - LOAD_OFFSET)
-	{
-	  machvec_start = .;
-	  *(.machvec)
-	  machvec_end = .;
+	/* Unwind info & table: */
+	. = ALIGN(8);
+	.IA_64.unwind_info : AT(ADDR(.IA_64.unwind_info) - LOAD_OFFSET) {
+		*(.IA_64.unwind_info*)
 	}
-#endif
-
-  /* Unwind info & table: */
-  . = ALIGN(8);
-  .IA_64.unwind_info : AT(ADDR(.IA_64.unwind_info) - LOAD_OFFSET)
-	{ *(.IA_64.unwind_info*) }
-  .IA_64.unwind : AT(ADDR(.IA_64.unwind) - LOAD_OFFSET)
-	{
-	  __start_unwind = .;
-	  *(.IA_64.unwind*)
-	  __end_unwind = .;
+	.IA_64.unwind : AT(ADDR(.IA_64.unwind) - LOAD_OFFSET) {
+		__start_unwind = .;
+		*(.IA_64.unwind*)
+		__end_unwind = .;
+	} :code :unwind
+	code_continues2 : {
+	} : code
+
+	RODATA
+
+	.opd : AT(ADDR(.opd) - LOAD_OFFSET) {
+		*(.opd)
 	}
 
-  RODATA
+	/*
+	 * Initialization code and data:
+	 */
+	. = ALIGN(PAGE_SIZE);
+	__init_begin = .;
 
-  .opd : AT(ADDR(.opd) - LOAD_OFFSET)
-	{ *(.opd) }
+	INIT_TEXT_SECTION(PAGE_SIZE)
+	INIT_DATA_SECTION(16)
 
-  /* Initialization code and data: */
+	.data..patch.vtop : AT(ADDR(.data..patch.vtop) - LOAD_OFFSET) {
+		__start___vtop_patchlist = .;
+		*(.data..patch.vtop)
+		__end___vtop_patchlist = .;
+	}
 
-  . = ALIGN(PAGE_SIZE);
-  __init_begin = .;
-  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET)
-	{
-	  _sinittext = .;
-	  *(.init.text)
-	  _einittext = .;
+	.data..patch.rse : AT(ADDR(.data..patch.rse) - LOAD_OFFSET) {
+		__start___rse_patchlist = .;
+		*(.data..patch.rse)
+		__end___rse_patchlist = .;
 	}
 
-  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET)
-	{ *(.init.data) }
+	.data..patch.mckinley_e9 : AT(ADDR(.data..patch.mckinley_e9) - LOAD_OFFSET) {
+		__start___mckinley_e9_bundles = .;
+		*(.data..patch.mckinley_e9)
+		__end___mckinley_e9_bundles = .;
+	}
 
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET)
-	{
-	  __initramfs_start = .;
-	  *(.init.ramfs)
-	  __initramfs_end = .;
+#if defined(CONFIG_PARAVIRT)
+	. = ALIGN(16);
+	.paravirt_bundles : AT(ADDR(.paravirt_bundles) - LOAD_OFFSET) {
+		__start_paravirt_bundles = .;
+		*(.paravirt_bundles)
+		__stop_paravirt_bundles = .;
+	}
+	. = ALIGN(16);
+	.paravirt_insts : AT(ADDR(.paravirt_insts) - LOAD_OFFSET) {
+		__start_paravirt_insts = .;
+		*(.paravirt_insts)
+		__stop_paravirt_insts = .;
 	}
+	. = ALIGN(16);
+	.paravirt_branches : AT(ADDR(.paravirt_branches) - LOAD_OFFSET) {
+		__start_paravirt_branches = .;
+		*(.paravirt_branches)
+		__stop_paravirt_branches = .;
+	}
+#endif
 
-   . = ALIGN(16);
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET)
-        {
-	  __setup_start = .;
-	  *(.init.setup)
-	  __setup_end = .;
+#if defined(CONFIG_IA64_GENERIC)
+	/* Machine Vector */
+	. = ALIGN(16);
+	.machvec : AT(ADDR(.machvec) - LOAD_OFFSET) {
+		machvec_start = .;
+		*(.machvec)
+		machvec_end = .;
 	}
-  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET)
-	{
-	  __initcall_start = .;
-	  *(.initcall1.init)
-	  *(.initcall2.init)
-	  *(.initcall3.init)
-	  *(.initcall4.init)
-	  *(.initcall5.init)
-	  *(.initcall6.init)
-	  *(.initcall7.init)
-	  __initcall_end = .;
+#endif
+
+#ifdef	CONFIG_SMP
+	. = ALIGN(PERCPU_PAGE_SIZE);
+	__cpu0_per_cpu = .;
+	. = . + PERCPU_PAGE_SIZE;   /* cpu0 per-cpu space */
+#endif
+
+	. = ALIGN(PAGE_SIZE);
+	__init_end = .;
+
+	.data..page_aligned : AT(ADDR(.data..page_aligned) - LOAD_OFFSET) {
+		PAGE_ALIGNED_DATA(PAGE_SIZE)
+		. = ALIGN(PAGE_SIZE);
+		__start_gate_section = .;
+		*(.data..gate)
+		__stop_gate_section = .;
 	}
-   __con_initcall_start = .;
-  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET)
-	{ *(.con_initcall.init) }
-  __con_initcall_end = .;
-  __security_initcall_start = .;
-  .security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET)
-	{ *(.security_initcall.init) }
-  __security_initcall_end = .;
-  . = ALIGN(PAGE_SIZE);
-  __init_end = .;
-
-  /* The initial task and kernel stack */
-  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET)
-	{ *(.data.init_task) }
-
-  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET)
-        { *(__special_page_section)
-	  __start_gate_section = .;
-	  *(.data.gate)
-	  __stop_gate_section = .;
+	/*
+	 * make sure the gate page doesn't expose
+	 * kernel data
+	 */
+	. = ALIGN(PAGE_SIZE);
+
+	/* Per-cpu data: */
+	. = ALIGN(PERCPU_PAGE_SIZE);
+	PERCPU_VADDR(SMP_CACHE_BYTES, PERCPU_ADDR, :percpu)
+	__phys_per_cpu_start = __per_cpu_load;
+	/*
+	 * ensure percpu data fits
+	 * into percpu page size
+	 */
+	. = __phys_per_cpu_start + PERCPU_PAGE_SIZE;
+
+	data : {
+	} :data
+	.data : AT(ADDR(.data) - LOAD_OFFSET) {
+		_sdata  =  .;
+		INIT_TASK_DATA(PAGE_SIZE)
+		CACHELINE_ALIGNED_DATA(SMP_CACHE_BYTES)
+		READ_MOSTLY_DATA(SMP_CACHE_BYTES)
+		DATA_DATA
+		*(.data1)
+		*(.gnu.linkonce.d*)
+		CONSTRUCTORS
 	}
-  . = ALIGN(PAGE_SIZE);		/* make sure the gate page doesn't expose kernel data */
-
-  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET)
-        { *(.data.cacheline_aligned) }
-
-  /* Per-cpu data: */
-  percpu : { } :percpu
-  . = ALIGN(PERCPU_PAGE_SIZE);
-  __phys_per_cpu_start = .;
-  .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET)
-	{
-		__per_cpu_start = .;
-		*(.data.percpu)
-		__per_cpu_end = .;
+
+	. = ALIGN(16);	/* gp must be 16-byte aligned for exc. table */
+	.got : AT(ADDR(.got) - LOAD_OFFSET) {
+		*(.got.plt)
+		*(.got)
+	}
+	__gp = ADDR(.got) + 0x200000;
+
+	/*
+	 * We want the small data sections together,
+	 * so single-instruction offsets can access
+	 * them all, and initialized data all before
+	 * uninitialized, so we can shorten the
+	 * on-disk segment size.
+	 */
+	.sdata : AT(ADDR(.sdata) - LOAD_OFFSET) {
+		*(.sdata)
+		*(.sdata1)
+		*(.srdata)
 	}
-  . = __phys_per_cpu_start + PERCPU_PAGE_SIZE;	/* ensure percpu data fits into percpu page size */
-
-  data : { } :data
-  .data : AT(ADDR(.data) - LOAD_OFFSET)
-	{ *(.data) *(.data1) *(.gnu.linkonce.d*) CONSTRUCTORS }
-
-  . = ALIGN(16);	/* gp must be 16-byte aligned for exc. table */
-  .got : AT(ADDR(.got) - LOAD_OFFSET)
-	{ *(.got.plt) *(.got) }
-  __gp = ADDR(.got) + 0x200000;
-  /* We want the small data sections together, so single-instruction offsets
-     can access them all, and initialized data all before uninitialized, so
-     we can shorten the on-disk segment size.  */
-  .sdata : AT(ADDR(.sdata) - LOAD_OFFSET)
-	{ *(.sdata) *(.sdata1) *(.srdata) }
-  _edata  =  .;
-  _bss = .;
-  .sbss : AT(ADDR(.sbss) - LOAD_OFFSET)
-	{ *(.sbss) *(.scommon) }
-  .bss : AT(ADDR(.bss) - LOAD_OFFSET)
-	{ *(.bss) *(COMMON) }
-
-  _end = .;
-
-  code : { } :code
-  /* Stabs debugging sections.  */
-  .stab 0 : { *(.stab) }
-  .stabstr 0 : { *(.stabstr) }
-  .stab.excl 0 : { *(.stab.excl) }
-  .stab.exclstr 0 : { *(.stab.exclstr) }
-  .stab.index 0 : { *(.stab.index) }
-  .stab.indexstr 0 : { *(.stab.indexstr) }
-  /* DWARF debug sections.
-     Symbols in the DWARF debugging sections are relative to the beginning
-     of the section so we begin them at 0.  */
-  /* DWARF 1 */
-  .debug          0 : { *(.debug) }
-  .line           0 : { *(.line) }
-  /* GNU DWARF 1 extensions */
-  .debug_srcinfo  0 : { *(.debug_srcinfo) }
-  .debug_sfnames  0 : { *(.debug_sfnames) }
-  /* DWARF 1.1 and DWARF 2 */
-  .debug_aranges  0 : { *(.debug_aranges) }
-  .debug_pubnames 0 : { *(.debug_pubnames) }
-  /* DWARF 2 */
-  .debug_info     0 : { *(.debug_info) }
-  .debug_abbrev   0 : { *(.debug_abbrev) }
-  .debug_line     0 : { *(.debug_line) }
-  .debug_frame    0 : { *(.debug_frame) }
-  .debug_str      0 : { *(.debug_str) }
-  .debug_loc      0 : { *(.debug_loc) }
-  .debug_macinfo  0 : { *(.debug_macinfo) }
-  /* SGI/MIPS DWARF 2 extensions */
-  .debug_weaknames 0 : { *(.debug_weaknames) }
-  .debug_funcnames 0 : { *(.debug_funcnames) }
-  .debug_typenames 0 : { *(.debug_typenames) }
-  .debug_varnames  0 : { *(.debug_varnames) }
-  /* These must appear regardless of  .  */
-  /* Discard them for now since Intel SoftSDV cannot handle them.
-  .comment 0 : { *(.comment) }
-  .note 0 : { *(.note) }
-  */
-  /DISCARD/ : { *(.comment) }
-  /DISCARD/ : { *(.note) }
+	_edata  =  .;
+
+	BSS_SECTION(0, 0, 0)
+
+	_end = .;
+
+	code : {
+	} :code
+
+	STABS_DEBUG
+	DWARF_DEBUG
+
+	/* Default discards */
+	DISCARDS
 }