diff options
Diffstat (limited to 'arch')
-rw-r--r-- | arch/mips/au1000/Kconfig | 1 | ||||
-rw-r--r-- | arch/mips/au1000/common/irq.c | 251 | ||||
-rw-r--r-- | arch/mips/au1000/common/power.c | 7 | ||||
-rw-r--r-- | arch/mips/au1000/pb1200/irqmap.c | 2 | ||||
-rw-r--r-- | arch/mips/configs/mtx1_defconfig | 2 | ||||
-rw-r--r-- | arch/mips/kernel/head.S | 4 | ||||
-rw-r--r-- | arch/mips/kernel/time.c | 47 | ||||
-rw-r--r-- | arch/mips/kernel/traps.c | 164 | ||||
-rw-r--r-- | arch/mips/sgi-ip22/ip22-time.c | 9 | ||||
-rw-r--r-- | arch/mips/sibyte/bcm1480/time.c | 2 | ||||
-rw-r--r-- | arch/mips/sibyte/sb1250/time.c | 8 | ||||
-rw-r--r-- | arch/x86/kernel/alternative.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/asm-offsets_32.c | 14 | ||||
-rw-r--r-- | arch/x86/kernel/entry_32.S | 2 | ||||
-rw-r--r-- | arch/x86/kernel/paravirt_32.c | 224 | ||||
-rw-r--r-- | arch/x86/kernel/vmi_32.c | 201 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 22 | ||||
-rw-r--r-- | arch/x86/xen/enlighten.c | 232 | ||||
-rw-r--r-- | arch/x86/xen/mmu.c | 144 | ||||
-rw-r--r-- | arch/x86/xen/multicalls.c | 52 | ||||
-rw-r--r-- | arch/x86/xen/multicalls.h | 5 | ||||
-rw-r--r-- | arch/x86/xen/smp.c | 14 | ||||
-rw-r--r-- | arch/x86/xen/time.c | 6 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 10 |
24 files changed, 876 insertions, 551 deletions
diff --git a/arch/mips/au1000/Kconfig b/arch/mips/au1000/Kconfig index 29c95d97217..a23d4154da0 100644 --- a/arch/mips/au1000/Kconfig +++ b/arch/mips/au1000/Kconfig @@ -137,6 +137,7 @@ config SOC_AU1200 config SOC_AU1X00 bool select 64BIT_PHYS_ADDR + select IRQ_CPU select SYS_HAS_CPU_MIPS32_R1 select SYS_SUPPORTS_32BIT_KERNEL select SYS_SUPPORTS_APM_EMULATION diff --git a/arch/mips/au1000/common/irq.c b/arch/mips/au1000/common/irq.c index c00f308fd50..59e932a928d 100644 --- a/arch/mips/au1000/common/irq.c +++ b/arch/mips/au1000/common/irq.c @@ -1,11 +1,10 @@ /* - * BRIEF MODULE DESCRIPTION - * Au1000 interrupt routines. - * * Copyright 2001 MontaVista Software Inc. * Author: MontaVista Software, Inc. * ppopov@mvista.com or source@mvista.com * + * Copyright (C) 2007 Ralf Baechle (ralf@linux-mips.org) + * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your @@ -32,6 +31,7 @@ #include <linux/interrupt.h> #include <linux/irq.h> +#include <asm/irq_cpu.h> #include <asm/mipsregs.h> #include <asm/mach-au1x00/au1000.h> #ifdef CONFIG_MIPS_PB1000 @@ -44,7 +44,7 @@ #define EXT_INTC1_REQ1 5 /* IP 5 */ #define MIPS_TIMER_IP 7 /* IP 7 */ -void (*board_init_irq)(void); +void (*board_init_irq)(void) __initdata = NULL; static DEFINE_SPINLOCK(irq_lock); @@ -134,12 +134,14 @@ void restore_au1xxx_intctl(void) inline void local_enable_irq(unsigned int irq_nr) { - if (irq_nr > AU1000_LAST_INTC0_INT) { - au_writel(1 << (irq_nr - 32), IC1_MASKSET); - au_writel(1 << (irq_nr - 32), IC1_WAKESET); + unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE; + + if (bit >= 32) { + au_writel(1 << (bit - 32), IC1_MASKSET); + au_writel(1 << (bit - 32), IC1_WAKESET); } else { - au_writel(1 << irq_nr, IC0_MASKSET); - au_writel(1 << irq_nr, IC0_WAKESET); + au_writel(1 << bit, IC0_MASKSET); + au_writel(1 << bit, IC0_WAKESET); } au_sync(); } @@ -147,12 +149,14 @@ inline void local_enable_irq(unsigned int irq_nr) inline void local_disable_irq(unsigned int irq_nr) { - if (irq_nr > AU1000_LAST_INTC0_INT) { - au_writel(1 << (irq_nr - 32), IC1_MASKCLR); - au_writel(1 << (irq_nr - 32), IC1_WAKECLR); + unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE; + + if (bit >= 32) { + au_writel(1 << (bit - 32), IC1_MASKCLR); + au_writel(1 << (bit - 32), IC1_WAKECLR); } else { - au_writel(1 << irq_nr, IC0_MASKCLR); - au_writel(1 << irq_nr, IC0_WAKECLR); + au_writel(1 << bit, IC0_MASKCLR); + au_writel(1 << bit, IC0_WAKECLR); } au_sync(); } @@ -160,12 +164,14 @@ inline void local_disable_irq(unsigned int irq_nr) static inline void mask_and_ack_rise_edge_irq(unsigned int irq_nr) { - if (irq_nr > AU1000_LAST_INTC0_INT) { - au_writel(1 << (irq_nr - 32), IC1_RISINGCLR); - au_writel(1 << (irq_nr - 32), IC1_MASKCLR); + unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE; + + if (bit >= 32) { + au_writel(1 << (bit - 32), IC1_RISINGCLR); + au_writel(1 << (bit - 32), IC1_MASKCLR); } else { - au_writel(1 << irq_nr, IC0_RISINGCLR); - au_writel(1 << irq_nr, IC0_MASKCLR); + au_writel(1 << bit, IC0_RISINGCLR); + au_writel(1 << bit, IC0_MASKCLR); } au_sync(); } @@ -173,12 +179,14 @@ static inline void mask_and_ack_rise_edge_irq(unsigned int irq_nr) static inline void mask_and_ack_fall_edge_irq(unsigned int irq_nr) { - if (irq_nr > AU1000_LAST_INTC0_INT) { - au_writel(1 << (irq_nr - 32), IC1_FALLINGCLR); - au_writel(1 << (irq_nr - 32), IC1_MASKCLR); + unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE; + + if (bit >= 32) { + au_writel(1 << (bit - 32), IC1_FALLINGCLR); + au_writel(1 << (bit - 32), IC1_MASKCLR); } else { - au_writel(1 << irq_nr, IC0_FALLINGCLR); - au_writel(1 << irq_nr, IC0_MASKCLR); + au_writel(1 << bit, IC0_FALLINGCLR); + au_writel(1 << bit, IC0_MASKCLR); } au_sync(); } @@ -186,17 +194,20 @@ static inline void mask_and_ack_fall_edge_irq(unsigned int irq_nr) static inline void mask_and_ack_either_edge_irq(unsigned int irq_nr) { - /* This may assume that we don't get interrupts from + unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE; + + /* + * This may assume that we don't get interrupts from * both edges at once, or if we do, that we don't care. */ - if (irq_nr > AU1000_LAST_INTC0_INT) { - au_writel(1 << (irq_nr - 32), IC1_FALLINGCLR); - au_writel(1 << (irq_nr - 32), IC1_RISINGCLR); - au_writel(1 << (irq_nr - 32), IC1_MASKCLR); + if (bit >= 32) { + au_writel(1 << (bit - 32), IC1_FALLINGCLR); + au_writel(1 << (bit - 32), IC1_RISINGCLR); + au_writel(1 << (bit - 32), IC1_MASKCLR); } else { - au_writel(1 << irq_nr, IC0_FALLINGCLR); - au_writel(1 << irq_nr, IC0_RISINGCLR); - au_writel(1 << irq_nr, IC0_MASKCLR); + au_writel(1 << bit, IC0_FALLINGCLR); + au_writel(1 << bit, IC0_RISINGCLR); + au_writel(1 << bit, IC0_MASKCLR); } au_sync(); } @@ -213,10 +224,8 @@ static inline void mask_and_ack_level_irq(unsigned int irq_nr) au_sync(); } #endif - return; } - static void end_irq(unsigned int irq_nr) { if (!(irq_desc[irq_nr].status & (IRQ_DISABLED | IRQ_INPROGRESS))) @@ -341,114 +350,118 @@ void startup_match20_interrupt(irq_handler_t handler) } #endif -static void setup_local_irq(unsigned int irq_nr, int type, int int_req) +static void __init setup_local_irq(unsigned int irq_nr, int type, int int_req) { - if (irq_nr > AU1000_MAX_INTR) return; + unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE; + + if (irq_nr > AU1000_MAX_INTR) + return; + /* Config2[n], Config1[n], Config0[n] */ - if (irq_nr > AU1000_LAST_INTC0_INT) { + if (bit >= 32) { switch (type) { case INTC_INT_RISE_EDGE: /* 0:0:1 */ - au_writel(1 << (irq_nr - 32), IC1_CFG2CLR); - au_writel(1 << (irq_nr - 32), IC1_CFG1CLR); - au_writel(1 << (irq_nr - 32), IC1_CFG0SET); + au_writel(1 << (bit - 32), IC1_CFG2CLR); + au_writel(1 << (bit - 32), IC1_CFG1CLR); + au_writel(1 << (bit - 32), IC1_CFG0SET); set_irq_chip(irq_nr, &rise_edge_irq_type); break; case INTC_INT_FALL_EDGE: /* 0:1:0 */ - au_writel(1 << (irq_nr - 32), IC1_CFG2CLR); - au_writel(1 << (irq_nr - 32), IC1_CFG1SET); - au_writel(1 << (irq_nr - 32), IC1_CFG0CLR); + au_writel(1 << (bit - 32), IC1_CFG2CLR); + au_writel(1 << (bit - 32), IC1_CFG1SET); + au_writel(1 << (bit - 32), IC1_CFG0CLR); set_irq_chip(irq_nr, &fall_edge_irq_type); break; case INTC_INT_RISE_AND_FALL_EDGE: /* 0:1:1 */ - au_writel(1 << (irq_nr - 32), IC1_CFG2CLR); - au_writel(1 << (irq_nr - 32), IC1_CFG1SET); - au_writel(1 << (irq_nr - 32), IC1_CFG0SET); + au_writel(1 << (bit - 32), IC1_CFG2CLR); + au_writel(1 << (bit - 32), IC1_CFG1SET); + au_writel(1 << (bit - 32), IC1_CFG0SET); set_irq_chip(irq_nr, &either_edge_irq_type); break; case INTC_INT_HIGH_LEVEL: /* 1:0:1 */ - au_writel(1 << (irq_nr - 32), IC1_CFG2SET); - au_writel(1 << (irq_nr - 32), IC1_CFG1CLR); - au_writel(1 << (irq_nr - 32), IC1_CFG0SET); + au_writel(1 << (bit - 32), IC1_CFG2SET); + au_writel(1 << (bit - 32), IC1_CFG1CLR); + au_writel(1 << (bit - 32), IC1_CFG0SET); set_irq_chip(irq_nr, &level_irq_type); break; case INTC_INT_LOW_LEVEL: /* 1:1:0 */ - au_writel(1 << (irq_nr - 32), IC1_CFG2SET); - au_writel(1 << (irq_nr - 32), IC1_CFG1SET); - au_writel(1 << (irq_nr - 32), IC1_CFG0CLR); + au_writel(1 << (bit - 32), IC1_CFG2SET); + au_writel(1 << (bit - 32), IC1_CFG1SET); + au_writel(1 << (bit - 32), IC1_CFG0CLR); set_irq_chip(irq_nr, &level_irq_type); break; case INTC_INT_DISABLED: /* 0:0:0 */ - au_writel(1 << (irq_nr - 32), IC1_CFG0CLR); - au_writel(1 << (irq_nr - 32), IC1_CFG1CLR); - au_writel(1 << (irq_nr - 32), IC1_CFG2CLR); + au_writel(1 << (bit - 32), IC1_CFG0CLR); + au_writel(1 << (bit - 32), IC1_CFG1CLR); + au_writel(1 << (bit - 32), IC1_CFG2CLR); break; default: /* disable the interrupt */ printk(KERN_WARNING "unexpected int type %d (irq %d)\n", type, irq_nr); - au_writel(1 << (irq_nr - 32), IC1_CFG0CLR); - au_writel(1 << (irq_nr - 32), IC1_CFG1CLR); - au_writel(1 << (irq_nr - 32), IC1_CFG2CLR); + au_writel(1 << (bit - 32), IC1_CFG0CLR); + au_writel(1 << (bit - 32), IC1_CFG1CLR); + au_writel(1 << (bit - 32), IC1_CFG2CLR); return; } if (int_req) /* assign to interrupt request 1 */ - au_writel(1 << (irq_nr - 32), IC1_ASSIGNCLR); + au_writel(1 << (bit - 32), IC1_ASSIGNCLR); else /* assign to interrupt request 0 */ - au_writel(1 << (irq_nr - 32), IC1_ASSIGNSET); - au_writel(1 << (irq_nr - 32), IC1_SRCSET); - au_writel(1 << (irq_nr - 32), IC1_MASKCLR); - au_writel(1 << (irq_nr - 32), IC1_WAKECLR); + au_writel(1 << (bit - 32), IC1_ASSIGNSET); + au_writel(1 << (bit - 32), IC1_SRCSET); + au_writel(1 << (bit - 32), IC1_MASKCLR); + au_writel(1 << (bit - 32), IC1_WAKECLR); } else { switch (type) { case INTC_INT_RISE_EDGE: /* 0:0:1 */ - au_writel(1 << irq_nr, IC0_CFG2CLR); - au_writel(1 << irq_nr, IC0_CFG1CLR); - au_writel(1 << irq_nr, IC0_CFG0SET); + au_writel(1 << bit, IC0_CFG2CLR); + au_writel(1 << bit, IC0_CFG1CLR); + au_writel(1 << bit, IC0_CFG0SET); set_irq_chip(irq_nr, &rise_edge_irq_type); break; case INTC_INT_FALL_EDGE: /* 0:1:0 */ - au_writel(1 << irq_nr, IC0_CFG2CLR); - au_writel(1 << irq_nr, IC0_CFG1SET); - au_writel(1 << irq_nr, IC0_CFG0CLR); + au_writel(1 << bit, IC0_CFG2CLR); + au_writel(1 << bit, IC0_CFG1SET); + au_writel(1 << bit, IC0_CFG0CLR); set_irq_chip(irq_nr, &fall_edge_irq_type); break; case INTC_INT_RISE_AND_FALL_EDGE: /* 0:1:1 */ - au_writel(1 << irq_nr, IC0_CFG2CLR); - au_writel(1 << irq_nr, IC0_CFG1SET); - au_writel(1 << irq_nr, IC0_CFG0SET); + au_writel(1 << bit, IC0_CFG2CLR); + au_writel(1 << bit, IC0_CFG1SET); + au_writel(1 << bit, IC0_CFG0SET); set_irq_chip(irq_nr, &either_edge_irq_type); break; case INTC_INT_HIGH_LEVEL: /* 1:0:1 */ - au_writel(1 << irq_nr, IC0_CFG2SET); - au_writel(1 << irq_nr, IC0_CFG1CLR); - au_writel(1 << irq_nr, IC0_CFG0SET); + au_writel(1 << bit, IC0_CFG2SET); + au_writel(1 << bit, IC0_CFG1CLR); + au_writel(1 << bit, IC0_CFG0SET); set_irq_chip(irq_nr, &level_irq_type); break; case INTC_INT_LOW_LEVEL: /* 1:1:0 */ - au_writel(1 << irq_nr, IC0_CFG2SET); - au_writel(1 << irq_nr, IC0_CFG1SET); - au_writel(1 << irq_nr, IC0_CFG0CLR); + au_writel(1 << bit, IC0_CFG2SET); + au_writel(1 << bit, IC0_CFG1SET); + au_writel(1 << bit, IC0_CFG0CLR); set_irq_chip(irq_nr, &level_irq_type); break; case INTC_INT_DISABLED: /* 0:0:0 */ - au_writel(1 << irq_nr, IC0_CFG0CLR); - au_writel(1 << irq_nr, IC0_CFG1CLR); - au_writel(1 << irq_nr, IC0_CFG2CLR); + au_writel(1 << bit, IC0_CFG0CLR); + au_writel(1 << bit, IC0_CFG1CLR); + au_writel(1 << bit, IC0_CFG2CLR); break; default: /* disable the interrupt */ printk(KERN_WARNING "unexpected int type %d (irq %d)\n", type, irq_nr); - au_writel(1 << irq_nr, IC0_CFG0CLR); - au_writel(1 << irq_nr, IC0_CFG1CLR); - au_writel(1 << irq_nr, IC0_CFG2CLR); + au_writel(1 << bit, IC0_CFG0CLR); + au_writel(1 << bit, IC0_CFG1CLR); + au_writel(1 << bit, IC0_CFG2CLR); return; } if (int_req) /* assign to interrupt request 1 */ - au_writel(1 << irq_nr, IC0_ASSIGNCLR); + au_writel(1 << bit, IC0_ASSIGNCLR); else /* assign to interrupt request 0 */ - au_writel(1 << irq_nr, IC0_ASSIGNSET); - au_writel(1 << irq_nr, IC0_SRCSET); - au_writel(1 << irq_nr, IC0_MASKCLR); - au_writel(1 << irq_nr, IC0_WAKECLR); + au_writel(1 << bit, IC0_ASSIGNSET); + au_writel(1 << bit, IC0_SRCSET); + au_writel(1 << bit, IC0_MASKCLR); + au_writel(1 << bit, IC0_WAKECLR); } au_sync(); } @@ -461,8 +474,8 @@ static void setup_local_irq(unsigned int irq_nr, int type, int int_req) static void intc0_req0_irqdispatch(void) { - int irq = 0; static unsigned long intc0_req0; + unsigned int bit; intc0_req0 |= au_readl(IC0_REQ0INT); @@ -481,25 +494,25 @@ static void intc0_req0_irqdispatch(void) return; } #endif - irq = ffs(intc0_req0); - intc0_req0 &= ~(1 << irq); - do_IRQ(irq); + bit = ffs(intc0_req0); + intc0_req0 &= ~(1 << bit); + do_IRQ(MIPS_CPU_IRQ_BASE + bit); } static void intc0_req1_irqdispatch(void) { - int irq = 0; static unsigned long intc0_req1; + unsigned int bit; intc0_req1 |= au_readl(IC0_REQ1INT); if (!intc0_req1) return; - irq = ffs(intc0_req1); - intc0_req1 &= ~(1 << irq); - do_IRQ(irq); + bit = ffs(intc0_req1); + intc0_req1 &= ~(1 << bit); + do_IRQ(bit); } @@ -509,43 +522,41 @@ static void intc0_req1_irqdispatch(void) */ static void intc1_req0_irqdispatch(void) { - int irq = 0; static unsigned long intc1_req0; + unsigned int bit; intc1_req0 |= au_readl(IC1_REQ0INT); if (!intc1_req0) return; - irq = ffs(intc1_req0); - intc1_req0 &= ~(1 << irq); - irq += 32; - do_IRQ(irq); + bit = ffs(intc1_req0); + intc1_req0 &= ~(1 << bit); + do_IRQ(MIPS_CPU_IRQ_BASE + 32 + bit); } static void intc1_req1_irqdispatch(void) { - int irq = 0; static unsigned long intc1_req1; + unsigned int bit; intc1_req1 |= au_readl(IC1_REQ1INT); if (!intc1_req1) return; - irq = ffs(intc1_req1); - intc1_req1 &= ~(1 << irq); - irq += 32; - do_IRQ(irq); + bit = ffs(intc1_req1); + intc1_req1 &= ~(1 << bit); + do_IRQ(MIPS_CPU_IRQ_BASE + 32 + bit); } asmlinkage void plat_irq_dispatch(void) { - unsigned int pending = read_c0_status() & read_c0_cause() & ST0_IM; + unsigned int pending = read_c0_status() & read_c0_cause(); if (pending & CAUSEF_IP7) - do_IRQ(63); + do_IRQ(MIPS_CPU_IRQ_BASE + 7); else if (pending & CAUSEF_IP2) intc0_req0_irqdispatch(); else if (pending & CAUSEF_IP3) @@ -561,17 +572,15 @@ asmlinkage void plat_irq_dispatch(void) void __init arch_init_irq(void) { int i; - unsigned long cp0_status; struct au1xxx_irqmap *imp; extern struct au1xxx_irqmap au1xxx_irq_map[]; extern struct au1xxx_irqmap au1xxx_ic0_map[]; extern int au1xxx_nr_irqs; extern int au1xxx_ic0_nr_irqs; - cp0_status = read_c0_status(); - - /* Initialize interrupt controllers to a safe state. - */ + /* + * Initialize interrupt controllers to a safe state. + */ au_writel(0xffffffff, IC0_CFG0CLR); au_writel(0xffffffff, IC0_CFG1CLR); au_writel(0xffffffff, IC0_CFG2CLR); @@ -594,16 +603,20 @@ void __init arch_init_irq(void) au_writel(0xffffffff, IC1_RISINGCLR); au_writel(0x00000000, IC1_TESTBIT); - /* Initialize IC0, which is fixed per processor. - */ + mips_cpu_irq_init(); + + /* + * Initialize IC0, which is fixed per processor. + */ imp = au1xxx_ic0_map; for (i = 0; i < au1xxx_ic0_nr_irqs; i++) { setup_local_irq(imp->im_irq, imp->im_type, imp->im_request); imp++; } - /* Now set up the irq mapping for the board. - */ + /* + * Now set up the irq mapping for the board. + */ imp = au1xxx_irq_map; for (i = 0; i < au1xxx_nr_irqs; i++) { setup_local_irq(imp->im_irq, imp->im_type, imp->im_request); @@ -615,5 +628,5 @@ void __init arch_init_irq(void) /* Board specific IRQ initialization. */ if (board_init_irq) - (*board_init_irq)(); + board_init_irq(); } diff --git a/arch/mips/au1000/common/power.c b/arch/mips/au1000/common/power.c index 6f57f72a7d5..54047d69b82 100644 --- a/arch/mips/au1000/common/power.c +++ b/arch/mips/au1000/common/power.c @@ -403,9 +403,9 @@ static int pm_do_freq(ctl_table * ctl, int write, struct file *file, } - /* We don't want _any_ interrupts other than - * match20. Otherwise our au1000_calibrate_delay() - * calculation will be off, potentially a lot. + /* + * We don't want _any_ interrupts other than match20. Otherwise our + * au1000_calibrate_delay() calculation will be off, potentially a lot. */ intc0_mask = save_local_and_disable(0); intc1_mask = save_local_and_disable(1); @@ -414,6 +414,7 @@ static int pm_do_freq(ctl_table * ctl, int write, struct file *file, au1000_calibrate_delay(); restore_local_and_enable(0, intc0_mask); restore_local_and_enable(1, intc1_mask); + return retval; } diff --git a/arch/mips/au1000/pb1200/irqmap.c b/arch/mips/au1000/pb1200/irqmap.c index 3bee274445f..5f48b060379 100644 --- a/arch/mips/au1000/pb1200/irqmap.c +++ b/arch/mips/au1000/pb1200/irqmap.c @@ -74,7 +74,7 @@ irqreturn_t pb1200_cascade_handler( int irq, void *dev_id) bcsr->int_status = bisr; for( ; bisr; bisr &= (bisr-1) ) { - extirq_nr = PB1200_INT_BEGIN + au_ffs(bisr); + extirq_nr = PB1200_INT_BEGIN + ffs(bisr); /* Ack and dispatch IRQ */ do_IRQ(extirq_nr); } diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig index 0280ef389d8..b536d7c6379 100644 --- a/arch/mips/configs/mtx1_defconfig +++ b/arch/mips/configs/mtx1_defconfig @@ -3021,7 +3021,7 @@ CONFIG_MAGIC_SYSRQ=y # CONFIG_DEBUG_FS is not set # CONFIG_HEADERS_CHECK is not set # CONFIG_DEBUG_KERNEL is not set -# CONFIG_CROSSCOMPILE is not set +CONFIG_CROSSCOMPILE=y CONFIG_CMDLINE="" CONFIG_SYS_SUPPORTS_KGDB=y diff --git a/arch/mips/kernel/head.S b/arch/mips/kernel/head.S index e46782b0ebc..bf164a562ac 100644 --- a/arch/mips/kernel/head.S +++ b/arch/mips/kernel/head.S @@ -140,7 +140,7 @@ EXPORT(_stext) -#ifndef CONFIG_BOOT_RAW +#ifdef CONFIG_BOOT_RAW /* * Give us a fighting chance of running if execution beings at the * kernel load address. This is needed because this platform does @@ -149,6 +149,8 @@ EXPORT(_stext) __INIT #endif + __INIT_REFOK + NESTED(kernel_entry, 16, sp) # kernel entry point kernel_entry_setup # cpu specific setup diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index 05b365167a0..e4b5e647b14 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -391,6 +391,50 @@ static void mips_event_handler(struct clock_event_device *dev) { } +/* + * FIXME: This doesn't hold for the relocated E9000 compare interrupt. + */ +static int c0_compare_int_pending(void) +{ + return (read_c0_cause() >> cp0_compare_irq) & 0x100; +} + +static int c0_compare_int_usable(void) +{ + const unsigned int delta = 0x300000; + unsigned int cnt; + + /* + * IP7 already pending? Try to clear it by acking the timer. + */ + if (c0_compare_int_pending()) { + write_c0_compare(read_c0_compare()); + irq_disable_hazard(); + if (c0_compare_int_pending()) + return 0; + } + + cnt = read_c0_count(); + cnt += delta; + write_c0_compare(cnt); + + while ((long)(read_c0_count() - cnt) <= 0) + ; /* Wait for expiry */ + + if (!c0_compare_int_pending()) + return 0; + + write_c0_compare(read_c0_compare()); + irq_disable_hazard(); + if (c0_compare_int_pending()) + return 0; + + /* + * Feels like a real count / compare timer. + */ + return 1; +} + void __cpuinit mips_clockevent_init(void) { uint64_t mips_freq = mips_hpt_frequency; @@ -412,6 +456,9 @@ void __cpuinit mips_clockevent_init(void) return; #endif + if (!c0_compare_int_usable()) + return; + cd = &per_cpu(mips_clockevent_device, cpu); cd->name = "MIPS"; diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 9c0c478d71a..bbf01b81a4f 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -9,9 +9,10 @@ * Copyright (C) 1999 Silicon Graphics, Inc. * Kevin D. Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com * Copyright (C) 2000, 01 MIPS Technologies, Inc. - * Copyright (C) 2002, 2003, 2004, 2005 Maciej W. Rozycki + * Copyright (C) 2002, 2003, 2004, 2005, 2007 Maciej W. Rozycki */ #include <linux/bug.h> +#include <linux/compiler.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/module.h> @@ -410,7 +411,7 @@ asmlinkage void do_be(struct pt_regs *regs) } /* - * ll/sc emulation + * ll/sc, rdhwr, sync emulation */ #define OPCODE 0xfc000000 @@ -419,9 +420,11 @@ asmlinkage void do_be(struct pt_regs *regs) #define OFFSET 0x0000ffff #define LL 0xc0000000 #define SC 0xe0000000 +#define SPEC0 0x00000000 #define SPEC3 0x7c000000 #define RD 0x0000f800 #define FUNC 0x0000003f +#define SYNC 0x0000000f #define RDHWR 0x0000003b /* @@ -432,11 +435,10 @@ unsigned long ll_bit; static struct task_struct *ll_task = NULL; -static inline void simulate_ll(struct pt_regs *regs, unsigned int opcode) +static inline int simulate_ll(struct pt_regs *regs, unsigned int opcode) { unsigned long value, __user *vaddr; long offset; - int signal = 0; /* * analyse the ll instruction that just caused a ri exception @@ -451,14 +453,10 @@ static inline void simulate_ll(struct pt_regs *regs, unsigned int opcode) vaddr = (unsigned long __user *) ((unsigned long)(regs->regs[(opcode & BASE) >> 21]) + offset); - if ((unsigned long)vaddr & 3) { - signal = SIGBUS; - goto sig; - } - if (get_user(value, vaddr)) { - signal = SIGSEGV; - goto sig; - } + if ((unsigned long)vaddr & 3) + return SIGBUS; + if (get_user(value, vaddr)) + return SIGSEGV; preempt_disable(); @@ -471,22 +469,16 @@ static inline void simulate_ll(struct pt_regs *regs, unsigned int opcode) preempt_enable(); - compute_return_epc(regs); - regs->regs[(opcode & RT) >> 16] = value; - return; - -sig: - force_sig(signal, current); + return 0; } -static inline void simulate_sc(struct pt_regs *regs, unsigned int opcode) +static inline int simulate_sc(struct pt_regs *regs, unsigned int opcode) { unsigned long __user *vaddr; unsigned long reg; long offset; - int signal = 0; /* * analyse the sc instruction that just caused a ri exception @@ -502,34 +494,25 @@ static inline void simulate_sc(struct pt_regs *regs, unsigned int opcode) ((unsigned long)(regs->regs[(opcode & BASE) >> 21]) + offset); reg = (opcode & RT) >> 16; - if ((unsigned long)vaddr & 3) { - signal = SIGBUS; - goto sig; - } + if ((unsigned long)vaddr & 3) + return SIGBUS; preempt_disable(); if (ll_bit == 0 || ll_task != current) { - compute_return_epc(regs); regs->regs[reg] = 0; preempt_enable(); - return; + return 0; } preempt_enable(); - if (put_user(regs->regs[reg], vaddr)) { - signal = SIGSEGV; - goto sig; - } + if (put_user(regs->regs[reg], vaddr)) + return SIGSEGV; - compute_return_epc(regs); regs->regs[reg] = 1; - return; - -sig: - force_sig(signal, current); + return 0; } /* @@ -539,27 +522,14 @@ sig: * few processors such as NEC's VR4100 throw reserved instruction exceptions * instead, so we're doing the emulation thing in both exception handlers. */ -static inline int simulate_llsc(struct pt_regs *regs) +static int simulate_llsc(struct pt_regs *regs, unsigned int opcode) { - unsigned int opcode; - - if (get_user(opcode, (unsigned int __user *) exception_epc(regs))) - goto out_sigsegv; - - if ((opcode & OPCODE) == LL) { - simulate_ll(regs, opcode); - return 0; - } - if ((opcode & OPCODE) == SC) { - simulate_sc(regs, opcode); - return 0; - } - - return -EFAULT; /* Strange things going on ... */ + if ((opcode & OPCODE) == LL) + return simulate_ll(regs, opcode); + if ((opcode & OPCODE) == SC) + return simulate_sc(regs, opcode); -out_sigsegv: - force_sig(SIGSEGV, current); - return -EFAULT; + return -1; /* Must be something else ... */ } /* @@ -567,16 +537,9 @@ out_sigsegv: * registers not implemented in hardware. The only current use of this * is the thread area pointer. */ -static inline int simulate_rdhwr(struct pt_regs *regs) +static int simulate_rdhwr(struct pt_regs *regs, unsigned int opcode) { struct thread_info *ti = task_thread_info(current); - unsigned int opcode; - - if (get_user(opcode, (unsigned int __user *) exception_epc(regs))) - goto out_sigsegv; - - if (unlikely(compute_return_epc(regs))) - return -EFAULT; if ((opcode & OPCODE) == SPEC3 && (opcode & FUNC) == RDHWR) { int rd = (opcode & RD) >> 11; @@ -586,16 +549,20 @@ static inline int simulate_rdhwr(struct pt_regs *regs) regs->regs[rt] = ti->tp_value; return 0; default: - return -EFAULT; + return -1; } } /* Not ours. */ - return -EFAULT; + return -1; +} -out_sigsegv: - force_sig(SIGSEGV, current); - return -EFAULT; +static int simulate_sync(struct pt_regs *regs, unsigned int opcode) +{ + if ((opcode & OPCODE) == SPEC0 && (opcode & FUNC) == SYNC) + return 0; + + return -1; /* Must be something else ... */ } asmlinkage void do_ov(struct pt_regs *regs) @@ -767,16 +734,35 @@ out_sigsegv: asmlinkage void do_ri(struct pt_regs *regs) { - die_if_kernel("Reserved instruction in kernel code", regs); + unsigned int __user *epc = (unsigned int __user *)exception_epc(regs); + unsigned long old_epc = regs->cp0_epc; + unsigned int opcode = 0; + int status = -1; - if (!cpu_has_llsc) - if (!simulate_llsc(regs)) - return; + die_if_kernel("Reserved instruction in kernel code", regs); - if (!simulate_rdhwr(regs)) + if (unlikely(compute_return_epc(regs) < 0)) return; - force_sig(SIGILL, current); + if (unlikely(get_user(opcode, epc) < 0)) + status = SIGSEGV; + + if (!cpu_has_llsc && status < 0) + status = simulate_llsc(regs, opcode); + + if (status < 0) + status = simulate_rdhwr(regs, opcode); + + if (status < 0) + status = simulate_sync(regs, opcode); + + if (status < 0) + status = SIGILL; + + if (unlikely(status > 0)) { + regs->cp0_epc = old_epc; /* Undo skip-over. */ + force_sig(status, current); + } } /* @@ -808,7 +794,11 @@ static void mt_ase_fp_affinity(void) asmlinkage void do_cpu(struct pt_regs *regs) { + unsigned int __user *epc; + unsigned long old_epc; + unsigned int opcode; unsigned int cpid; + int status; die_if_kernel("do_cpu invoked from kernel context!", regs); @@ -816,14 +806,32 @@ asmlinkage void do_cpu(struct pt_regs *regs) switch (cpid) { case 0: - if (!cpu_has_llsc) - if (!simulate_llsc(regs)) - return; + epc = (unsigned int __user *)exception_epc(regs); + old_epc = regs->cp0_epc; + opcode = 0; + status = -1; - if (!simulate_rdhwr(regs)) + if (unlikely(compute_return_epc(regs) < 0)) return; - break; + if (unlikely(get_user(opcode, epc) < 0)) + status = SIGSEGV; + + if (!cpu_has_llsc && status < 0) + status = simulate_llsc(regs, opcode); + + if (status < 0) + status = simulate_rdhwr(regs, opcode); + + if (status < 0) + status = SIGILL; + + if (unlikely(status > 0)) { + regs->cp0_epc = old_epc; /* Undo skip-over. */ + force_sig(status, current); + } + + return; case 1: if (used_math()) /* Using the FPU again. */ diff --git a/arch/mips/sgi-ip22/ip22-time.c b/arch/mips/sgi-ip22/ip22-time.c index 9b9bffd2e8f..10e50549165 100644 --- a/arch/mips/sgi-ip22/ip22-time.c +++ b/arch/mips/sgi-ip22/ip22-time.c @@ -192,12 +192,3 @@ void indy_8254timer_irq(void) ArcEnterInteractiveMode(); irq_exit(); } - -void __init plat_timer_setup(struct irqaction *irq) -{ - /* over-write the handler, we use our own way */ - irq->handler = no_action; - - /* setup irqaction */ - setup_irq(SGI_TIMER_IRQ, irq); -} diff --git a/arch/mips/sibyte/bcm1480/time.c b/arch/mips/sibyte/bcm1480/time.c index 40d7126cd5b..5b4bfbbb5a2 100644 --- a/arch/mips/sibyte/bcm1480/time.c +++ b/arch/mips/sibyte/bcm1480/time.c @@ -84,7 +84,7 @@ static void sibyte_set_mode(enum clock_event_mode mode, void __iomem *timer_cfg, *timer_init; timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); - timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); + timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT)); switch (mode) { case CLOCK_EVT_MODE_PERIODIC: diff --git a/arch/mips/sibyte/sb1250/time.c b/arch/mips/sibyte/sb1250/time.c index 38199ad8fc5..fe11fed8e0d 100644 --- a/arch/mips/sibyte/sb1250/time.c +++ b/arch/mips/sibyte/sb1250/time.c @@ -83,7 +83,7 @@ static void sibyte_set_mode(enum clock_event_mode mode, void __iomem *timer_cfg, *timer_init; timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); - timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); + timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT)); switch(mode) { case CLOCK_EVT_MODE_PERIODIC: @@ -111,7 +111,7 @@ sibyte_next_event(unsigned long delta, struct clock_event_device *evt) void __iomem *timer_cfg, *timer_init; timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); - timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); + timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT)); __raw_writeq(0, timer_cfg); __raw_writeq(delta, timer_init); @@ -155,7 +155,7 @@ static void sibyte_set_mode(enum clock_event_mode mode, void __iomem *timer_cfg, *timer_init; timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); - timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); + timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT)); switch (mode) { case CLOCK_EVT_MODE_PERIODIC: @@ -183,7 +183,7 @@ sibyte_next_event(unsigned long delta, struct clock_event_device *evt) void __iomem *timer_cfg, *timer_init; timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); - timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); + timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT)); __raw_writeq(0, timer_cfg); __raw_writeq(delta, timer_init); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a3ae8e6c8b3..3bd2688bd44 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -390,8 +390,8 @@ void apply_paravirt(struct paravirt_patch_site *start, BUG_ON(p->len > MAX_PATCH_LEN); /* prep the buffer with the original instructions */ memcpy(insnbuf, p->instr, p->len); - used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf, - (unsigned long)p->instr, p->len); + used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf, + (unsigned long)p->instr, p->len); BUG_ON(used > p->len); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 8029742c0fc..f1b7cdda82b 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -116,12 +116,14 @@ void foo(void) #ifdef CONFIG_PARAVIRT BLANK(); - OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); - OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); - OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); - OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); - OFFSET(PARAVIRT_iret, paravirt_ops, iret); - OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); + OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); + OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); + OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); + OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); + OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); + OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); + OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); #endif #ifdef CONFIG_XEN diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 8099fea0a72..dc7f938e501 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -437,7 +437,7 @@ ldt_ss: * is still available to implement the setting of the high * 16-bits in the INTERRUPT_RETURN paravirt-op. */ - cmpl $0, paravirt_ops+PARAVIRT_enabled + cmpl $0, pv_info+PARAVIRT_enabled jne restore_nocheck #endif diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c index 739cfb207dd..6a80d67c212 100644 --- a/arch/x86/kernel/paravirt_32.c +++ b/arch/x86/kernel/paravirt_32.c @@ -42,32 +42,33 @@ void _paravirt_nop(void) static void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", - paravirt_ops.name); + pv_info.name); } char *memory_setup(void) { - return paravirt_ops.memory_setup(); + return pv_init_ops.memory_setup(); } /* Simple instruction patching code. */ -#define DEF_NATIVE(name, code) \ - extern const char start_##name[], end_##name[]; \ - asm("start_" #name ": " code "; end_" #name ":") - -DEF_NATIVE(irq_disable, "cli"); -DEF_NATIVE(irq_enable, "sti"); -DEF_NATIVE(restore_fl, "push %eax; popf"); -DEF_NATIVE(save_fl, "pushf; pop %eax"); -DEF_NATIVE(iret, "iret"); -DEF_NATIVE(irq_enable_sysexit, "sti; sysexit"); -DEF_NATIVE(read_cr2, "mov %cr2, %eax"); -DEF_NATIVE(write_cr3, "mov %eax, %cr3"); -DEF_NATIVE(read_cr3, "mov %cr3, %eax"); -DEF_NATIVE(clts, "clts"); -DEF_NATIVE(read_tsc, "rdtsc"); - -DEF_NATIVE(ud2a, "ud2a"); +#define DEF_NATIVE(ops, name, code) \ + extern const char start_##ops##_##name[], end_##ops##_##name[]; \ + asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") + +DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); +DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); +DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); +DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); +DEF_NATIVE(pv_cpu_ops, iret, "iret"); +DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); +DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); +DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); +DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); +DEF_NATIVE(pv_cpu_ops, clts, "clts"); +DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); + +/* Undefined instruction for dealing with missing ops pointers. */ +static const unsigned char ud2a[] = { 0x0f, 0x0b }; static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) @@ -76,37 +77,29 @@ static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned ret; switch(type) { -#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site - SITE(irq_disable); - SITE(irq_enable); - SITE(restore_fl); - SITE(save_fl); - SITE(iret); - SITE(irq_enable_sysexit); - SITE(read_cr2); - SITE(read_cr3); - SITE(write_cr3); - SITE(clts); - SITE(read_tsc); +#define SITE(ops, x) \ + case PARAVIRT_PATCH(ops.x): \ + start = start_##ops##_##x; \ + end = end_##ops##_##x; \ + goto patch_site + + SITE(pv_irq_ops, irq_disable); + SITE(pv_irq_ops, irq_enable); + SITE(pv_irq_ops, restore_fl); + SITE(pv_irq_ops, save_fl); + SITE(pv_cpu_ops, iret); + SITE(pv_cpu_ops, irq_enable_sysexit); + SITE(pv_mmu_ops, read_cr2); + SITE(pv_mmu_ops, read_cr3); + SITE(pv_mmu_ops, write_cr3); + SITE(pv_cpu_ops, clts); + SITE(pv_cpu_ops, read_tsc); #undef SITE patch_site: ret = paravirt_patch_insns(ibuf, len, start, end); break; - case PARAVIRT_PATCH(make_pgd): - case PARAVIRT_PATCH(make_pte): - case PARAVIRT_PATCH(pgd_val): - case PARAVIRT_PATCH(pte_val): -#ifdef CONFIG_X86_PAE - case PARAVIRT_PATCH(make_pmd): - case PARAVIRT_PATCH(pmd_val): -#endif - /* These functions end up returning exactly what - they're passed, in the same registers. */ - ret = paravirt_patch_nop(); - break; - default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; @@ -150,7 +143,7 @@ unsigned paravirt_patch_call(void *insnbuf, return 5; } -unsigned paravirt_patch_jmp(const void *target, void *insnbuf, +unsigned paravirt_patch_jmp(void *insnbuf, const void *target, unsigned long addr, unsigned len) { struct branch *b = insnbuf; @@ -165,22 +158,37 @@ unsigned paravirt_patch_jmp(const void *target, void *insnbuf, return 5; } +/* Neat trick to map patch type back to the call within the + * corresponding structure. */ +static void *get_call_destination(u8 type) +{ + struct paravirt_patch_template tmpl = { + .pv_init_ops = pv_init_ops, + .pv_time_ops = pv_time_ops, + .pv_cpu_ops = pv_cpu_ops, + .pv_irq_ops = pv_irq_ops, + .pv_apic_ops = pv_apic_ops, + .pv_mmu_ops = pv_mmu_ops, + }; + return *((void **)&tmpl + type); +} + unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, unsigned long addr, unsigned len) { - void *opfunc = *((void **)¶virt_ops + type); + void *opfunc = get_call_destination(type); unsigned ret; if (opfunc == NULL) /* If there's no function, patch it with a ud2a (BUG) */ - ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a); + ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); else if (opfunc == paravirt_nop) /* If the operation is a nop, then nop the callsite */ ret = paravirt_patch_nop(); - else if (type == PARAVIRT_PATCH(iret) || - type == PARAVIRT_PATCH(irq_enable_sysexit)) + else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || + type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit)) /* If operation requires a jmp, then jmp */ - ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len); + ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); else /* Otherwise call the function; assume target could clobber any caller-save reg */ @@ -205,7 +213,7 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len, void init_IRQ(void) { - paravirt_ops.init_IRQ(); + pv_irq_ops.init_IRQ(); } static void native_flush_tlb(void) @@ -233,7 +241,7 @@ extern void native_irq_enable_sysexit(void); static int __init print_banner(void) { - paravirt_ops.banner(); + pv_init_ops.banner(); return 0; } core_initcall(print_banner); @@ -273,47 +281,96 @@ int paravirt_disable_iospace(void) return ret; } -struct paravirt_ops paravirt_ops = { +static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; + +static inline void enter_lazy(enum paravirt_lazy_mode mode) +{ + BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(preemptible()); + + x86_write_percpu(paravirt_lazy_mode, mode); +} + +void paravirt_leave_lazy(enum paravirt_lazy_mode mode) +{ + BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode); + BUG_ON(preemptible()); + + x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); +} + +void paravirt_enter_lazy_mmu(void) +{ + enter_lazy(PARAVIRT_LAZY_MMU); +} + +void paravirt_leave_lazy_mmu(void) +{ + paravirt_leave_lazy(PARAVIRT_LAZY_MMU); +} + +void paravirt_enter_lazy_cpu(void) +{ + enter_lazy(PARAVIRT_LAZY_CPU); +} + +void paravirt_leave_lazy_cpu(void) +{ + paravirt_leave_lazy(PARAVIRT_LAZY_CPU); +} + +enum paravirt_lazy_mode paravirt_get_lazy_mode(void) +{ + return x86_read_percpu(paravirt_lazy_mode); +} + +struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ +}; - .patch = native_patch, +struct pv_init_ops pv_init_ops = { + .patch = native_patch, .banner = default_banner, .arch_setup = paravirt_nop, .memory_setup = machine_specific_memory_setup, +}; + +struct pv_time_ops pv_time_ops = { + .time_init = hpet_time_init, .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, - .time_init = hpet_time_init, + .sched_clock = native_sched_clock, + .get_cpu_khz = native_calculate_cpu_khz, +}; + +struct pv_irq_ops pv_irq_ops = { .init_IRQ = native_init_IRQ, + .save_fl = native_save_fl, + .restore_fl = native_restore_fl, + .irq_disable = native_irq_disable, + .irq_enable = native_irq_enable, + .safe_halt = native_safe_halt, + .halt = native_halt, +}; +struct pv_cpu_ops pv_cpu_ops = { .cpuid = native_cpuid, .get_debugreg = native_get_debugreg, .set_debugreg = native_set_debugreg, .clts = native_clts, .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, - .read_cr2 = native_read_cr2, - .write_cr2 = native_write_cr2, - .read_cr3 = native_read_cr3, - .write_cr3 = native_write_cr3, .read_cr4 = native_read_cr4, .read_cr4_safe = native_read_cr4_safe, .write_cr4 = native_write_cr4, - .save_fl = native_save_fl, - .restore_fl = native_restore_fl, - .irq_disable = native_irq_disable, - .irq_enable = native_irq_enable, - .safe_halt = native_safe_halt, - .halt = native_halt, .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .sched_clock = native_sched_clock, - .get_cpu_khz = native_calculate_cpu_khz, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, .load_gdt = native_load_gdt, @@ -327,9 +384,19 @@ struct paravirt_ops paravirt_ops = { .write_idt_entry = write_dt_entry, .load_esp0 = native_load_esp0, + .irq_enable_sysexit = native_irq_enable_sysexit, + .iret = native_iret, + .set_iopl_mask = native_set_iopl_mask, .io_delay = native_io_delay, + .lazy_mode = { + .enter = paravirt_nop, + .leave = paravirt_nop, + }, +}; + +struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = native_apic_write, .apic_write_atomic = native_apic_write_atomic, @@ -338,11 +405,17 @@ struct paravirt_ops paravirt_ops = { .setup_secondary_clock = setup_secondary_APIC_clock, .startup_ipi_hook = paravirt_nop, #endif - .set_lazy_mode = paravirt_nop, +}; +struct pv_mmu_ops pv_mmu_ops = { .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, + .read_cr2 = native_read_cr2, + .write_cr2 = native_write_cr2, + .read_cr3 = native_read_cr3, + .write_cr3 = native_write_cr3, + .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_single = native_flush_tlb_single, @@ -381,12 +454,19 @@ struct paravirt_ops paravirt_ops = { .make_pte = native_make_pte, .make_pgd = native_make_pgd, - .irq_enable_sysexit = native_irq_enable_sysexit, - .iret = native_iret, - .dup_mmap = paravirt_nop, .exit_mmap = paravirt_nop, .activate_mm = paravirt_nop, + + .lazy_mode = { + .enter = paravirt_nop, + .leave = paravirt_nop, + }, }; -EXPORT_SYMBOL(paravirt_ops); +EXPORT_SYMBOL_GPL(pv_time_ops); +EXPORT_SYMBOL_GPL(pv_cpu_ops); +EXPORT_SYMBOL_GPL(pv_mmu_ops); +EXPORT_SYMBOL_GPL(pv_apic_ops); +EXPORT_SYMBOL_GPL(pv_info); +EXPORT_SYMBOL (pv_irq_ops); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 18673e0f193..f02bad68aba 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -134,21 +134,21 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned long eip, unsigned len) { switch (type) { - case PARAVIRT_PATCH(irq_disable): + case PARAVIRT_PATCH(pv_irq_ops.irq_disable): return patch_internal(VMI_CALL_DisableInterrupts, len, insns, eip); - case PARAVIRT_PATCH(irq_enable): + case PARAVIRT_PATCH(pv_irq_ops.irq_enable): return patch_internal(VMI_CALL_EnableInterrupts, len, insns, eip); - case PARAVIRT_PATCH(restore_fl): + case PARAVIRT_PATCH(pv_irq_ops.restore_fl): return patch_internal(VMI_CALL_SetInterruptMask, len, insns, eip); - case PARAVIRT_PATCH(save_fl): + case PARAVIRT_PATCH(pv_irq_ops.save_fl): return patch_internal(VMI_CALL_GetInterruptMask, len, insns, eip); - case PARAVIRT_PATCH(iret): + case PARAVIRT_PATCH(pv_cpu_ops.iret): return patch_internal(VMI_CALL_IRET, len, insns, eip); - case PARAVIRT_PATCH(irq_enable_sysexit): + case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); default: break; @@ -552,24 +552,22 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif -static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode) +static void vmi_enter_lazy_cpu(void) { - static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode); - - if (!vmi_ops.set_lazy_mode) - return; + paravirt_enter_lazy_cpu(); + vmi_ops.set_lazy_mode(2); +} - /* Modes should never nest or overlap */ - BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE || - mode == PARAVIRT_LAZY_FLUSH)); +static void vmi_enter_lazy_mmu(void) +{ + paravirt_enter_lazy_mmu(); + vmi_ops.set_lazy_mode(1); +} - if (mode == PARAVIRT_LAZY_FLUSH) { - vmi_ops.set_lazy_mode(0); - vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode)); - } else { - vmi_ops.set_lazy_mode(mode); - __get_cpu_var(lazy_mode) = mode; - } +static void vmi_leave_lazy(void) +{ + paravirt_leave_lazy(paravirt_get_lazy_mode()); + vmi_ops.set_lazy_mode(0); } static inline int __init check_vmi_rom(struct vrom_header *rom) @@ -690,9 +688,9 @@ do { \ reloc = call_vrom_long_func(vmi_rom, get_reloc, \ VMI_CALL_##vmicall); \ if (rel->type == VMI_RELOCATION_CALL_REL) \ - paravirt_ops.opname = (void *)rel->eip; \ + opname = (void *)rel->eip; \ else if (rel->type == VMI_RELOCATION_NOP) \ - paravirt_ops.opname = (void *)vmi_nop; \ + opname = (void *)vmi_nop; \ else if (rel->type != VMI_RELOCATION_NONE) \ printk(KERN_WARNING "VMI: Unknown relocation " \ "type %d for " #vmicall"\n",\ @@ -712,7 +710,7 @@ do { \ VMI_CALL_##vmicall); \ BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \ if (rel->type == VMI_RELOCATION_CALL_REL) { \ - paravirt_ops.opname = wrapper; \ + opname = wrapper; \ vmi_ops.cache = (void *)rel->eip; \ } \ } while (0) @@ -732,11 +730,11 @@ static inline int __init activate_vmi(void) } savesegment(cs, kernel_cs); - paravirt_ops.paravirt_enabled = 1; - paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; + pv_info.paravirt_enabled = 1; + pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; + pv_info.name = "vmi"; - paravirt_ops.patch = vmi_patch; - paravirt_ops.name = "vmi"; + pv_init_ops.patch = vmi_patch; /* * Many of these operations are ABI compatible with VMI. @@ -754,26 +752,26 @@ static inline int __init activate_vmi(void) */ /* CPUID is special, so very special it gets wrapped like a present */ - para_wrap(cpuid, vmi_cpuid, cpuid, CPUID); - - para_fill(clts, CLTS); - para_fill(get_debugreg, GetDR); - para_fill(set_debugreg, SetDR); - para_fill(read_cr0, GetCR0); - para_fill(read_cr2, GetCR2); - para_fill(read_cr3, GetCR3); - para_fill(read_cr4, GetCR4); - para_fill(write_cr0, SetCR0); - para_fill(write_cr2, SetCR2); - para_fill(write_cr3, SetCR3); - para_fill(write_cr4, SetCR4); - para_fill(save_fl, GetInterruptMask); - para_fill(restore_fl, SetInterruptMask); - para_fill(irq_disable, DisableInterrupts); - para_fill(irq_enable, EnableInterrupts); - - para_fill(wbinvd, WBINVD); - para_fill(read_tsc, RDTSC); + para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID); + + para_fill(pv_cpu_ops.clts, CLTS); + para_fill(pv_cpu_ops.get_debugreg, GetDR); + para_fill(pv_cpu_ops.set_debugreg, SetDR); + para_fill(pv_cpu_ops.read_cr0, GetCR0); + para_fill(pv_mmu_ops.read_cr2, GetCR2); + para_fill(pv_mmu_ops.read_cr3, GetCR3); + para_fill(pv_cpu_ops.read_cr4, GetCR4); + para_fill(pv_cpu_ops.write_cr0, SetCR0); + para_fill(pv_mmu_ops.write_cr2, SetCR2); + para_fill(pv_mmu_ops.write_cr3, SetCR3); + para_fill(pv_cpu_ops.write_cr4, SetCR4); + para_fill(pv_irq_ops.save_fl, GetInterruptMask); + para_fill(pv_irq_ops.restore_fl, SetInterruptMask); + para_fill(pv_irq_ops.irq_disable, DisableInterrupts); + para_fill(pv_irq_ops.irq_enable, EnableInterrupts); + + para_fill(pv_cpu_ops.wbinvd, WBINVD); + para_fill(pv_cpu_ops.read_tsc, RDTSC); /* The following we emulate with trap and emulate for now */ /* paravirt_ops.read_msr = vmi_rdmsr */ @@ -781,29 +779,38 @@ static inline int __init activate_vmi(void) /* paravirt_ops.rdpmc = vmi_rdpmc */ /* TR interface doesn't pass TR value, wrap */ - para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR); + para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR); /* LDT is special, too */ - para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT); - - para_fill(load_gdt, SetGDT); - para_fill(load_idt, SetIDT); - para_fill(store_gdt, GetGDT); - para_fill(store_idt, GetIDT); - para_fill(store_tr, GetTR); - paravirt_ops.load_tls = vmi_load_tls; - para_fill(write_ldt_entry, WriteLDTEntry); - para_fill(write_gdt_entry, WriteGDTEntry); - para_fill(write_idt_entry, WriteIDTEntry); - para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); - para_fill(set_iopl_mask, SetIOPLMask); - para_fill(io_delay, IODelay); - para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); + para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT); + + para_fill(pv_cpu_ops.load_gdt, SetGDT); + para_fill(pv_cpu_ops.load_idt, SetIDT); + para_fill(pv_cpu_ops.store_gdt, GetGDT); + para_fill(pv_cpu_ops.store_idt, GetIDT); + para_fill(pv_cpu_ops.store_tr, GetTR); + pv_cpu_ops.load_tls = vmi_load_tls; + para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry); + para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry); + para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry); + para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); + para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); + para_fill(pv_cpu_ops.io_delay, IODelay); + + para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, + set_lazy_mode, SetLazyMode); + para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, + set_lazy_mode, SetLazyMode); + + para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, + set_lazy_mode, SetLazyMode); + para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, + set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ - para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); - para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); - para_fill(flush_tlb_single, InvalPage); + para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); + para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); + para_fill(pv_mmu_ops.flush_tlb_single, InvalPage); /* * Until a standard flag format can be agreed on, we need to @@ -819,41 +826,41 @@ static inline int __init activate_vmi(void) #endif if (vmi_ops.set_pte) { - paravirt_ops.set_pte = vmi_set_pte; - paravirt_ops.set_pte_at = vmi_set_pte_at; - paravirt_ops.set_pmd = vmi_set_pmd; + pv_mmu_ops.set_pte = vmi_set_pte; + pv_mmu_ops.set_pte_at = vmi_set_pte_at; + pv_mmu_ops.set_pmd = vmi_set_pmd; #ifdef CONFIG_X86_PAE - paravirt_ops.set_pte_atomic = vmi_set_pte_atomic; - paravirt_ops.set_pte_present = vmi_set_pte_present; - paravirt_ops.set_pud = vmi_set_pud; - paravirt_ops.pte_clear = vmi_pte_clear; - paravirt_ops.pmd_clear = vmi_pmd_clear; + pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic; + pv_mmu_ops.set_pte_present = vmi_set_pte_present; + pv_mmu_ops.set_pud = vmi_set_pud; + pv_mmu_ops.pte_clear = vmi_pte_clear; + pv_mmu_ops.pmd_clear = vmi_pmd_clear; #endif } if (vmi_ops.update_pte) { - paravirt_ops.pte_update = vmi_update_pte; - paravirt_ops.pte_update_defer = vmi_update_pte_defer; + pv_mmu_ops.pte_update = vmi_update_pte; + pv_mmu_ops.pte_update_defer = vmi_update_pte_defer; } vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); if (vmi_ops.allocate_page) { - paravirt_ops.alloc_pt = vmi_allocate_pt; - paravirt_ops.alloc_pd = vmi_allocate_pd; - paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone; + pv_mmu_ops.alloc_pt = vmi_allocate_pt; + pv_mmu_ops.alloc_pd = vmi_allocate_pd; + pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone; } vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); if (vmi_ops.release_page) { - paravirt_ops.release_pt = vmi_release_pt; - paravirt_ops.release_pd = vmi_release_pd; + pv_mmu_ops.release_pt = vmi_release_pt; + pv_mmu_ops.release_pd = vmi_release_pd; } /* Set linear is needed in all cases */ vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); #ifdef CONFIG_HIGHPTE if (vmi_ops.set_linear_mapping) - paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; + pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; #endif /* @@ -863,17 +870,17 @@ static inline int __init activate_vmi(void) * the backend. They are performance critical anyway, so requiring * a patch is not a big problem. */ - paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0; - paravirt_ops.iret = (void *)0xbadbab0; + pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; + pv_cpu_ops.iret = (void *)0xbadbab0; #ifdef CONFIG_SMP - para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); + para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); #endif #ifdef CONFIG_X86_LOCAL_APIC - para_fill(apic_read, APICRead); - para_fill(apic_write, APICWrite); - para_fill(apic_write_atomic, APICWrite); + para_fill(pv_apic_ops.apic_read, APICRead); + para_fill(pv_apic_ops.apic_write, APICWrite); + para_fill(pv_apic_ops.apic_write_atomic, APICWrite); #endif /* @@ -891,15 +898,15 @@ static inline int __init activate_vmi(void) vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); vmi_timer_ops.cancel_alarm = vmi_get_function(VMI_CALL_CancelAlarm); - paravirt_ops.time_init = vmi_time_init; - paravirt_ops.get_wallclock = vmi_get_wallclock; - paravirt_ops.set_wallclock = vmi_set_wallclock; + pv_time_ops.time_init = vmi_time_init; + pv_time_ops.get_wallclock = vmi_get_wallclock; + pv_time_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - paravirt_ops.setup_boot_clock = vmi_time_bsp_init; - paravirt_ops.setup_secondary_clock = vmi_time_ap_init; + pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; + pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; #endif - paravirt_ops.sched_clock = vmi_sched_clock; - paravirt_ops.get_cpu_khz = vmi_cpu_khz; + pv_time_ops.sched_clock = vmi_sched_clock; + pv_time_ops.get_cpu_khz = vmi_cpu_khz; /* We have true wallclock functions; disable CMOS clock sync */ no_sync_cmos_clock = 1; @@ -908,7 +915,7 @@ static inline int __init activate_vmi(void) disable_vmi_timer = 1; } - para_fill(safe_halt, Halt); + para_fill(pv_irq_ops.safe_halt, Halt); /* * Alternative instruction rewriting doesn't happen soon enough diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index e4e37d4f4c5..c7d19471261 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -748,24 +748,12 @@ struct kmem_cache *pmd_cache; void __init pgtable_cache_init(void) { - size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); - - if (PTRS_PER_PMD > 1) { + if (PTRS_PER_PMD > 1) pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - PTRS_PER_PMD*sizeof(pmd_t), - SLAB_PANIC, - pmd_ctor); - if (!SHARED_KERNEL_PMD) { - /* If we're in PAE mode and have a non-shared - kernel pmd, then the pgd size must be a - page size. This is because the pgd_list - links through the page structure, so there - can only be one pgd per page for this to - work. */ - pgd_size = PAGE_SIZE; - } - } + PTRS_PER_PMD*sizeof(pmd_t), + PTRS_PER_PMD*sizeof(pmd_t), + SLAB_PANIC, + pmd_ctor); } /* diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 265f7dd3234..94c39aaf695 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -51,11 +51,25 @@ EXPORT_SYMBOL_GPL(hypercall_page); -DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); - DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); -DEFINE_PER_CPU(unsigned long, xen_cr3); + +/* + * Note about cr3 (pagetable base) values: + * + * xen_cr3 contains the current logical cr3 value; it contains the + * last set cr3. This may not be the current effective cr3, because + * its update may be being lazily deferred. However, a vcpu looking + * at its own cr3 can use this value knowing that it everything will + * be self-consistent. + * + * xen_current_cr3 contains the actual vcpu cr3; it is set once the + * hypercall to set the vcpu cr3 is complete (so it may be a little + * out of date, but it will never be set early). If one vcpu is + * looking at another vcpu's cr3 value, it should use this variable. + */ +DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ +DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); @@ -99,7 +113,7 @@ static void __init xen_vcpu_setup(int cpu) info.mfn = virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); - printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", + printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", cpu, vcpup, info.mfn, info.offset); /* Check to see if the hypervisor will put the vcpu_info @@ -123,7 +137,7 @@ static void __init xen_vcpu_setup(int cpu) static void __init xen_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", - paravirt_ops.name); + pv_info.name); printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); } @@ -248,29 +262,10 @@ static void xen_halt(void) xen_safe_halt(); } -static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) +static void xen_leave_lazy(void) { - BUG_ON(preemptible()); - - switch (mode) { - case PARAVIRT_LAZY_NONE: - BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE); - break; - - case PARAVIRT_LAZY_MMU: - case PARAVIRT_LAZY_CPU: - BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE); - break; - - case PARAVIRT_LAZY_FLUSH: - /* flush if necessary, but don't change state */ - if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE) - xen_mc_flush(); - return; - } - + paravirt_leave_lazy(paravirt_get_lazy_mode()); xen_mc_flush(); - x86_write_percpu(xen_lazy_mode, mode); } static unsigned long xen_store_tr(void) @@ -357,7 +352,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) * loaded properly. This will go away as soon as Xen has been * modified to not save/restore %gs for normal hypercalls. */ - if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) loadsegment(gs, 0); } @@ -631,32 +626,36 @@ static unsigned long xen_read_cr3(void) return x86_read_percpu(xen_cr3); } +static void set_current_cr3(void *v) +{ + x86_write_percpu(xen_current_cr3, (unsigned long)v); +} + static void xen_write_cr3(unsigned long cr3) { + struct mmuext_op *op; + struct multicall_space mcs; + unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); + BUG_ON(preemptible()); - if (cr3 == x86_read_percpu(xen_cr3)) { - /* just a simple tlb flush */ - xen_flush_tlb(); - return; - } + mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ x86_write_percpu(xen_cr3, cr3); + op = mcs.args; + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = mfn; - { - struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); - - op = mcs.args; - op->cmd = MMUEXT_NEW_BASEPTR; - op->arg1.mfn = mfn; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + /* Update xen_update_cr3 once the batch has actually + been submitted. */ + xen_mc_callback(set_current_cr3, (void *)cr3); - xen_mc_issue(PARAVIRT_LAZY_CPU); - } + xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ } /* Early in boot, while setting up the initial pagetable, assume @@ -667,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); } +static void pin_pagetable_pfn(unsigned level, unsigned long pfn) +{ + struct mmuext_op op; + op.cmd = level; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + /* This needs to make sure the new pte page is pinned iff its being attached to a pinned pagetable. */ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) @@ -676,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) if (PagePinned(virt_to_page(mm->pgd))) { SetPagePinned(page); - if (!PageHighMem(page)) + if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); - else + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + } else /* make sure there are no stray mappings of this page */ kmap_flush_unused(); @@ -691,8 +700,10 @@ static void xen_release_pt(u32 pfn) struct page *page = pfn_to_page(pfn); if (PagePinned(page)) { - if (!PageHighMem(page)) + if (!PageHighMem(page)) { + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + } } } @@ -737,7 +748,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; /* special set_pte for pagetable initialization */ - paravirt_ops.set_pte = xen_set_pte_init; + pv_mmu_ops.set_pte = xen_set_pte_init; init_mm.pgd = base; /* @@ -784,8 +795,8 @@ static __init void xen_pagetable_setup_done(pgd_t *base) { /* This will work as long as patching hasn't happened yet (which it hasn't) */ - paravirt_ops.alloc_pt = xen_alloc_pt; - paravirt_ops.set_pte = xen_set_pte; + pv_mmu_ops.alloc_pt = xen_alloc_pt; + pv_mmu_ops.set_pte = xen_set_pte; if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* @@ -807,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base) /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ { - struct mmuext_op op; + unsigned level; + #ifdef CONFIG_X86_PAE - op.cmd = MMUEXT_PIN_L3_TABLE; + level = MMUEXT_PIN_L3_TABLE; #else - op.cmd = MMUEXT_PIN_L3_TABLE; + level = MMUEXT_PIN_L2_TABLE; #endif - op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); + + pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); } } @@ -832,12 +843,12 @@ void __init xen_setup_vcpu_info_placement(void) if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); - paravirt_ops.save_fl = xen_save_fl_direct; - paravirt_ops.restore_fl = xen_restore_fl_direct; - paravirt_ops.irq_disable = xen_irq_disable_direct; - paravirt_ops.irq_enable = xen_irq_enable_direct; - paravirt_ops.read_cr2 = xen_read_cr2_direct; - paravirt_ops.iret = xen_iret_direct; + pv_irq_ops.save_fl = xen_save_fl_direct; + pv_irq_ops.restore_fl = xen_restore_fl_direct; + pv_irq_ops.irq_disable = xen_irq_disable_direct; + pv_irq_ops.irq_enable = xen_irq_enable_direct; + pv_mmu_ops.read_cr2 = xen_read_cr2_direct; + pv_cpu_ops.iret = xen_iret_direct; } } @@ -849,8 +860,8 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, start = end = reloc = NULL; -#define SITE(x) \ - case PARAVIRT_PATCH(x): \ +#define SITE(op, x) \ + case PARAVIRT_PATCH(op.x): \ if (have_vcpu_info_placement) { \ start = (char *)xen_##x##_direct; \ end = xen_##x##_direct_end; \ @@ -859,10 +870,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, goto patch_site switch (type) { - SITE(irq_enable); - SITE(irq_disable); - SITE(save_fl); - SITE(restore_fl); + SITE(pv_irq_ops, irq_enable); + SITE(pv_irq_ops, irq_disable); + SITE(pv_irq_ops, save_fl); + SITE(pv_irq_ops, restore_fl); #undef SITE patch_site: @@ -894,26 +905,32 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, return ret; } -static const struct paravirt_ops xen_paravirt_ops __initdata = { +static const struct pv_info xen_info __initdata = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, .name = "Xen", - .banner = xen_banner, +}; +static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, + .banner = xen_banner, .memory_setup = xen_memory_setup, .arch_setup = xen_arch_setup, - .init_IRQ = xen_init_IRQ, .post_allocator_init = xen_mark_init_mm_pinned, +}; +static const struct pv_time_ops xen_time_ops __initdata = { .time_init = xen_time_init, + .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, .get_cpu_khz = xen_cpu_khz, .sched_clock = xen_sched_clock, +}; +static const struct pv_cpu_ops xen_cpu_ops __initdata = { .cpuid = xen_cpuid, .set_debugreg = xen_set_debugreg, @@ -924,22 +941,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, - .read_cr2 = xen_read_cr2, - .write_cr2 = xen_write_cr2, - - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3, - .read_cr4 = native_read_cr4, .read_cr4_safe = native_read_cr4_safe, .write_cr4 = xen_write_cr4, - .save_fl = xen_save_fl, - .restore_fl = xen_restore_fl, - .irq_disable = xen_irq_disable, - .irq_enable = xen_irq_enable, - .safe_halt = xen_safe_halt, - .halt = xen_halt, .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, @@ -968,6 +973,23 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .set_iopl_mask = xen_set_iopl_mask, .io_delay = xen_io_delay, + .lazy_mode = { + .enter = paravirt_enter_lazy_cpu, + .leave = xen_leave_lazy, + }, +}; + +static const struct pv_irq_ops xen_irq_ops __initdata = { + .init_IRQ = xen_init_IRQ, + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, +}; + +static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = xen_apic_write, .apic_write_atomic = xen_apic_write, @@ -976,6 +998,17 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, #endif +}; + +static const struct pv_mmu_ops xen_mmu_ops __initdata = { + .pagetable_setup_start = xen_pagetable_setup_start, + .pagetable_setup_done = xen_pagetable_setup_done, + + .read_cr2 = xen_read_cr2, + .write_cr2 = xen_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3, .flush_tlb_user = xen_flush_tlb, .flush_tlb_kernel = xen_flush_tlb, @@ -985,9 +1018,6 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, - .pagetable_setup_start = xen_pagetable_setup_start, - .pagetable_setup_done = xen_pagetable_setup_done, - .alloc_pt = xen_alloc_pt_init, .release_pt = xen_release_pt, .alloc_pd = paravirt_nop, @@ -1023,7 +1053,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .dup_mmap = xen_dup_mmap, .exit_mmap = xen_exit_mmap, - .set_lazy_mode = xen_set_lazy_mode, + .lazy_mode = { + .enter = paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy, + }, }; #ifdef CONFIG_SMP @@ -1079,6 +1112,17 @@ static const struct machine_ops __initdata xen_machine_ops = { }; +static void __init xen_reserve_top(void) +{ + unsigned long top = HYPERVISOR_VIRT_START; + struct xen_platform_parameters pp; + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) + top = pp.virt_start; + + reserve_top_address(-top + 2 * PAGE_SIZE); +} + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -1090,7 +1134,14 @@ asmlinkage void __init xen_start_kernel(void) BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); /* Install Xen paravirt ops */ - paravirt_ops = xen_paravirt_ops; + pv_info = xen_info; + pv_init_ops = xen_init_ops; + pv_time_ops = xen_time_ops; + pv_cpu_ops = xen_cpu_ops; + pv_irq_ops = xen_irq_ops; + pv_apic_ops = xen_apic_ops; + pv_mmu_ops = xen_mmu_ops; + machine_ops = xen_machine_ops; #ifdef CONFIG_SMP @@ -1112,6 +1163,7 @@ asmlinkage void __init xen_start_kernel(void) /* keep using Xen gdt for now; no urgent need to change it */ x86_write_percpu(xen_cr3, __pa(pgd)); + x86_write_percpu(xen_current_cr3, __pa(pgd)); #ifdef CONFIG_SMP /* Don't do the full vcpu_info placement stuff until we have a @@ -1123,12 +1175,12 @@ asmlinkage void __init xen_start_kernel(void) xen_setup_vcpu_info_placement(); #endif - paravirt_ops.kernel_rpl = 1; + pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) - paravirt_ops.kernel_rpl = 0; + pv_info.kernel_rpl = 0; /* set the limit of our address space */ - reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); + xen_reserve_top(); /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0bb7f001910..b2e32f9d007 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -154,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { if (mm == current->mm || mm == &init_mm) { - if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { struct multicall_space mcs; mcs = xen_mc_entry(0); @@ -303,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd) } #endif /* CONFIG_X86_PAE */ - +enum pt_level { + PT_PGD, + PT_PUD, + PT_PMD, + PT_PTE +}; /* (Yet another) pagetable walker. This one is intended for pinning a @@ -315,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd) FIXADDR_TOP. But the important bit is that we don't pin beyond there, because then we start getting into Xen's ptes. */ -static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), +static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), unsigned long limit) { pgd_t *pgd = pgd_base; @@ -340,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), pud = pud_offset(pgd, 0); if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pud), 0); + flush |= (*func)(virt_to_page(pud), PT_PUD); for (; addr != pud_limit; pud++, addr = pud_next) { pmd_t *pmd; @@ -359,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), pmd = pmd_offset(pud, 0); if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pmd), 0); + flush |= (*func)(virt_to_page(pmd), PT_PMD); for (; addr != pmd_limit; pmd++) { addr += (PAGE_SIZE * PTRS_PER_PTE); @@ -371,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), if (pmd_none(*pmd)) continue; - flush |= (*func)(pmd_page(*pmd), 0); + flush |= (*func)(pmd_page(*pmd), PT_PTE); } } } - flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); + flush |= (*func)(virt_to_page(pgd_base), PT_PGD); return flush; } -static int pin_page(struct page *page, unsigned flags) +static spinlock_t *lock_pte(struct page *page) +{ + spinlock_t *ptl = NULL; + +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + ptl = __pte_lockptr(page); + spin_lock(ptl); +#endif + + return ptl; +} + +static void do_unlock(void *v) +{ + spinlock_t *ptl = v; + spin_unlock(ptl); +} + +static void xen_do_pin(unsigned level, unsigned long pfn) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + mcs = __xen_mc_entry(sizeof(*op)); + op = mcs.args; + op->cmd = level; + op->arg1.mfn = pfn_to_mfn(pfn); + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); +} + +static int pin_page(struct page *page, enum pt_level level) { unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); int flush; @@ -396,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags) void *pt = lowmem_page_address(page); unsigned long pfn = page_to_pfn(page); struct multicall_space mcs = __xen_mc_entry(0); + spinlock_t *ptl; flush = 0; + ptl = NULL; + if (level == PT_PTE) + ptl = lock_pte(page); + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), - flags); + level == PT_PGD ? UVMF_TLB_FLUSH : 0); + + if (level == PT_PTE) + xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); + + if (ptl) { + /* Queue a deferred unlock for when this batch + is completed. */ + xen_mc_callback(do_unlock, ptl); + } } return flush; @@ -412,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags) read-only, and can be pinned. */ void xen_pgd_pin(pgd_t *pgd) { - struct multicall_space mcs; - struct mmuext_op *op; + unsigned level; xen_mc_batch(); @@ -424,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_batch(); } - mcs = __xen_mc_entry(sizeof(*op)); - op = mcs.args; - #ifdef CONFIG_X86_PAE - op->cmd = MMUEXT_PIN_L3_TABLE; + level = MMUEXT_PIN_L3_TABLE; #else - op->cmd = MMUEXT_PIN_L2_TABLE; + level = MMUEXT_PIN_L2_TABLE; #endif - op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_do_pin(level, PFN_DOWN(__pa(pgd))); xen_mc_issue(0); } @@ -441,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd) /* The init_mm pagetable is really pinned as soon as its created, but that's before we have page structures to store the bits. So do all the book-keeping now. */ -static __init int mark_pinned(struct page *page, unsigned flags) +static __init int mark_pinned(struct page *page, enum pt_level level) { SetPagePinned(page); return 0; @@ -452,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void) pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); } -static int unpin_page(struct page *page, unsigned flags) +static int unpin_page(struct page *page, enum pt_level level) { unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); if (pgfl && !PageHighMem(page)) { void *pt = lowmem_page_address(page); unsigned long pfn = page_to_pfn(page); - struct multicall_space mcs = __xen_mc_entry(0); + spinlock_t *ptl = NULL; + struct multicall_space mcs; + + if (level == PT_PTE) { + ptl = lock_pte(page); + + xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); + } + + mcs = __xen_mc_entry(0); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL), - flags); + level == PT_PGD ? UVMF_TLB_FLUSH : 0); + + if (ptl) { + /* unlock when batch completed */ + xen_mc_callback(do_unlock, ptl); + } } return 0; /* never need to flush on unpin */ @@ -472,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags) /* Release a pagetables pages back as normal RW */ static void xen_pgd_unpin(pgd_t *pgd) { - struct mmuext_op *op; - struct multicall_space mcs; - xen_mc_batch(); - mcs = __xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_UNPIN_TABLE; - op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); pgd_walk(pgd, unpin_page, TASK_SIZE); @@ -514,20 +564,43 @@ static void drop_other_mm_ref(void *info) if (__get_cpu_var(cpu_tlbstate).active_mm == mm) leave_mm(smp_processor_id()); + + /* If this cpu still has a stale cr3 reference, then make sure + it has been flushed. */ + if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) { + load_cr3(swapper_pg_dir); + arch_flush_lazy_cpu_mode(); + } } static void drop_mm_ref(struct mm_struct *mm) { + cpumask_t mask; + unsigned cpu; + if (current->active_mm == mm) { if (current->mm == mm) load_cr3(swapper_pg_dir); else leave_mm(smp_processor_id()); + arch_flush_lazy_cpu_mode(); } - if (!cpus_empty(mm->cpu_vm_mask)) - xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, - mm, 1); + /* Get the "official" set of cpus referring to our pagetable. */ + mask = mm->cpu_vm_mask; + + /* It's possible that a vcpu may have a stale reference to our + cr3, because its in lazy mode, and it hasn't yet flushed + its set of pending hypercalls yet. In this case, we can + look at its actual current cr3 value, and force it to flush + if needed. */ + for_each_online_cpu(cpu) { + if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) + cpu_set(cpu, mask); + } + + if (!cpus_empty(mask)) + xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); } #else static void drop_mm_ref(struct mm_struct *mm) @@ -562,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm) /* pgd may not be pinned in the error exit path of execve */ if (PagePinned(virt_to_page(mm->pgd))) xen_pgd_unpin(mm->pgd); + spin_unlock(&mm->page_table_lock); } diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index c837e8e463d..5e6f36f6d87 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -26,13 +26,22 @@ #include "multicalls.h" +#define MC_DEBUG 1 + #define MC_BATCH 32 #define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) struct mc_buffer { struct multicall_entry entries[MC_BATCH]; +#if MC_DEBUG + struct multicall_entry debug[MC_BATCH]; +#endif u64 args[MC_ARGS]; - unsigned mcidx, argidx; + struct callback { + void (*fn)(void *); + void *data; + } callbacks[MC_BATCH]; + unsigned mcidx, argidx, cbidx; }; static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); @@ -43,6 +52,7 @@ void xen_mc_flush(void) struct mc_buffer *b = &__get_cpu_var(mc_buffer); int ret = 0; unsigned long flags; + int i; BUG_ON(preemptible()); @@ -51,13 +61,31 @@ void xen_mc_flush(void) local_irq_save(flags); if (b->mcidx) { - int i; +#if MC_DEBUG + memcpy(b->debug, b->entries, + b->mcidx * sizeof(struct multicall_entry)); +#endif if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) BUG(); for (i = 0; i < b->mcidx; i++) if (b->entries[i].result < 0) ret++; + +#if MC_DEBUG + if (ret) { + printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", + ret, smp_processor_id()); + for(i = 0; i < b->mcidx; i++) { + printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", + i+1, b->mcidx, + b->debug[i].op, + b->debug[i].args[0], + b->entries[i].result); + } + } +#endif + b->mcidx = 0; b->argidx = 0; } else @@ -65,6 +93,13 @@ void xen_mc_flush(void) local_irq_restore(flags); + for(i = 0; i < b->cbidx; i++) { + struct callback *cb = &b->callbacks[i]; + + (*cb->fn)(cb->data); + } + b->cbidx = 0; + BUG_ON(ret); } @@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args) return ret; } + +void xen_mc_callback(void (*fn)(void *), void *data) +{ + struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct callback *cb; + + if (b->cbidx == MC_BATCH) + xen_mc_flush(); + + cb = &b->callbacks[b->cbidx++]; + cb->fn = fn; + cb->data = data; +} diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index e6f7530b156..8bae996d99a 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -35,11 +35,14 @@ void xen_mc_flush(void); /* Issue a multicall if we're not in a lazy mode */ static inline void xen_mc_issue(unsigned mode) { - if ((xen_get_lazy_mode() & mode) == 0) + if ((paravirt_get_lazy_mode() & mode) == 0) xen_mc_flush(); /* restore flags saved in xen_mc_batch */ local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); } +/* Set up a callback to be called when the current batch is flushed */ +void xen_mc_callback(void (*fn)(void *), void *data); + #endif /* _XEN_MULTICALLS_H */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 6c058585459..c1b131bcdcb 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -371,7 +371,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, int wait) { struct call_data_struct data; - int cpus; + int cpus, cpu; + bool yield; /* Holding any lock stops cpus from going down. */ spin_lock(&call_lock); @@ -400,9 +401,14 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), /* Send a message to other CPUs and wait for them to respond */ xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); - /* Make sure other vcpus get a chance to run. - XXX too severe? Maybe we should check the other CPU's states? */ - HYPERVISOR_sched_op(SCHEDOP_yield, 0); + /* Make sure other vcpus get a chance to run if they need to. */ + yield = false; + for_each_cpu_mask(cpu, mask) + if (xen_vcpu_stolen(cpu)) + yield = true; + + if (yield) + HYPERVISOR_sched_op(SCHEDOP_yield, 0); /* Wait for response */ while (atomic_read(&data.started) != cpus || diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index dfd6db69ead..d083ff5ef08 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) } while (get64(&state->state_entry_time) != state_time); } +/* return true when a vcpu could run but has no real cpu to run on */ +bool xen_vcpu_stolen(int vcpu) +{ + return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; +} + static void setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index b9aaea45f07..b02a909bfd4 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps); DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); DECLARE_PER_CPU(unsigned long, xen_cr3); +DECLARE_PER_CPU(unsigned long, xen_current_cr3); extern struct start_info *xen_start_info; extern struct shared_info *HYPERVISOR_shared_info; @@ -27,14 +28,9 @@ unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); unsigned long long xen_sched_clock(void); -void xen_mark_init_mm_pinned(void); - -DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); +bool xen_vcpu_stolen(int vcpu); -static inline unsigned xen_get_lazy_mode(void) -{ - return x86_read_percpu(xen_lazy_mode); -} +void xen_mark_init_mm_pinned(void); void __init xen_fill_possible_map(void); |