diff options
Diffstat (limited to 'arch')
50 files changed, 840 insertions, 367 deletions
diff --git a/arch/cris/kernel/sys_cris.c b/arch/cris/kernel/sys_cris.c index 8b9984197ed..d124066e172 100644 --- a/arch/cris/kernel/sys_cris.c +++ b/arch/cris/kernel/sys_cris.c @@ -40,8 +40,11 @@ asmlinkage int sys_pipe(unsigned long __user * fildes) error = do_pipe(fd); unlock_kernel(); if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) + if (copy_to_user(fildes, fd, 2*sizeof(int))) { + sys_close(fd[0]); + sys_close(fd[1]); error = -EFAULT; + } } return error; } diff --git a/arch/m32r/kernel/sys_m32r.c b/arch/m32r/kernel/sys_m32r.c index 6d7a80fdad4..319c79720b8 100644 --- a/arch/m32r/kernel/sys_m32r.c +++ b/arch/m32r/kernel/sys_m32r.c @@ -90,8 +90,11 @@ sys_pipe(unsigned long r0, unsigned long r1, unsigned long r2, error = do_pipe(fd); if (!error) { - if (copy_to_user((void __user *)r0, fd, 2*sizeof(int))) + if (copy_to_user((void __user *)r0, fd, 2*sizeof(int))) { + sys_close(fd[0]); + sys_close(fd[1]); error = -EFAULT; + } } return error; } diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index fd4858e2dd6..75b8340b254 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -468,15 +468,26 @@ static inline void access_error040(struct frame *fp) * (if do_page_fault didn't fix the mapping, * the writeback won't do good) */ +disable_wb: #ifdef DEBUG printk(".. disabling wb2\n"); #endif if (fp->un.fmt7.wb2a == fp->un.fmt7.faddr) fp->un.fmt7.wb2s &= ~WBV_040; + if (fp->un.fmt7.wb3a == fp->un.fmt7.faddr) + fp->un.fmt7.wb3s &= ~WBV_040; } - } else if (send_fault_sig(&fp->ptregs) > 0) { - printk("68040 access error, ssw=%x\n", ssw); - trap_c(fp); + } else { + /* In case of a bus error we either kill the process or expect + * the kernel to catch the fault, which then is also responsible + * for cleaning up the mess. + */ + current->thread.signo = SIGBUS; + current->thread.faddr = fp->un.fmt7.faddr; + if (send_fault_sig(&fp->ptregs) >= 0) + printk("68040 bus error (ssw=%x, faddr=%lx)\n", ssw, + fp->un.fmt7.faddr); + goto disable_wb; } do_040writebacks(fp); diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index 735a49b4b93..ad3e3bacae3 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -48,9 +48,6 @@ struct mac_booter_data mac_bi_data; int mac_bisize = sizeof mac_bi_data; -struct mac_hw_present mac_hw_present; -EXPORT_SYMBOL(mac_hw_present); - /* New m68k bootinfo stuff and videobase */ extern int m68k_num_memory; @@ -817,27 +814,6 @@ void __init mac_identify(void) m68k_ramdisk.addr, m68k_ramdisk.size); #endif - /* - * TODO: set the various fields in macintosh_config->hw_present here! - */ - switch (macintosh_config->scsi_type) { - case MAC_SCSI_OLD: - MACHW_SET(MAC_SCSI_80); - break; - case MAC_SCSI_QUADRA: - case MAC_SCSI_QUADRA2: - case MAC_SCSI_QUADRA3: - MACHW_SET(MAC_SCSI_96); - if ((macintosh_config->ident == MAC_MODEL_Q900) || - (macintosh_config->ident == MAC_MODEL_Q950)) - MACHW_SET(MAC_SCSI_96_2); - break; - default: - printk(KERN_WARNING "config.c: wtf: unknown scsi, using 53c80\n"); - MACHW_SET(MAC_SCSI_80); - break; - } - iop_init(); via_init(); oss_init(); diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c index 6d9884a6884..712d89a28c4 100644 --- a/arch/powerpc/kvm/booke_guest.c +++ b/arch/powerpc/kvm/booke_guest.c @@ -49,6 +49,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "inst_emu", VCPU_STAT(emulated_inst_exits) }, { "dec", VCPU_STAT(dec_exits) }, { "ext_intr", VCPU_STAT(ext_intr_exits) }, + { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { NULL } }; @@ -338,6 +339,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } break; + case BOOKE_INTERRUPT_FP_UNAVAIL: + kvmppc_queue_exception(vcpu, exit_nr); + r = RESUME_GUEST; + break; + case BOOKE_INTERRUPT_DATA_STORAGE: vcpu->arch.dear = vcpu->arch.fault_dear; vcpu->arch.esr = vcpu->arch.fault_esr; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index bad40bd2d3a..777e0f34e0e 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -36,13 +36,12 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { - /* XXX implement me */ - return 0; + return !!(v->arch.pending_exceptions); } int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - return 1; + return !(v->arch.msr & MSR_WE); } @@ -214,6 +213,11 @@ static void kvmppc_decrementer_func(unsigned long data) struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_DECREMENTER); + + if (waitqueue_active(&vcpu->wq)) { + wake_up_interruptible(&vcpu->wq); + vcpu->stat.halt_wakeup++; + } } int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) @@ -339,6 +343,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) int r; sigset_t sigsaved; + vcpu_load(vcpu); + if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -363,12 +369,20 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); + vcpu_put(vcpu); + return r; } int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_EXTERNAL); + + if (waitqueue_active(&vcpu->wq)) { + wake_up_interruptible(&vcpu->wq); + vcpu->stat.halt_wakeup++; + } + return 0; } diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 4bb023f4c86..f1d2cdc5331 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_SMP) += locks.o endif obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o +obj-$(CONFIG_HAS_IOMEM) += devres.o diff --git a/arch/powerpc/lib/devres.c b/arch/powerpc/lib/devres.c new file mode 100644 index 00000000000..292115d98ea --- /dev/null +++ b/arch/powerpc/lib/devres.c @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2008 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/device.h> /* devres_*(), devm_ioremap_release() */ +#include <linux/io.h> /* ioremap_flags() */ +#include <linux/module.h> /* EXPORT_SYMBOL() */ + +/** + * devm_ioremap_prot - Managed ioremap_flags() + * @dev: Generic device to remap IO address for + * @offset: BUS offset to map + * @size: Size of map + * @flags: Page flags + * + * Managed ioremap_prot(). Map is automatically unmapped on driver + * detach. + */ +void __iomem *devm_ioremap_prot(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + void __iomem **ptr, *addr; + + ptr = devres_alloc(devm_ioremap_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + addr = ioremap_flags(offset, size, flags); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else + devres_free(ptr); + + return addr; +} +EXPORT_SYMBOL(devm_ioremap_prot); diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c index bec3803f061..417eca79df6 100644 --- a/arch/powerpc/platforms/pseries/scanlog.c +++ b/arch/powerpc/platforms/pseries/scanlog.c @@ -55,11 +55,6 @@ static ssize_t scanlog_read(struct file *file, char __user *buf, dp = PDE(inode); data = (unsigned int *)dp->data; - if (!data) { - printk(KERN_ERR "scanlog: read failed no data\n"); - return -EIO; - } - if (count > RTAS_DATA_BUF_SIZE) count = RTAS_DATA_BUF_SIZE; @@ -146,11 +141,6 @@ static int scanlog_open(struct inode * inode, struct file * file) struct proc_dir_entry *dp = PDE(inode); unsigned int *data = (unsigned int *)dp->data; - if (!data) { - printk(KERN_ERR "scanlog: open failed no data\n"); - return -EIO; - } - if (data[0] != 0) { /* This imperfect test stops a second copy of the * data (or a reset while data is being copied) @@ -168,10 +158,6 @@ static int scanlog_release(struct inode * inode, struct file * file) struct proc_dir_entry *dp = PDE(inode); unsigned int *data = (unsigned int *)dp->data; - if (!data) { - printk(KERN_ERR "scanlog: release failed no data\n"); - return -EIO; - } data[0] = 0; return 0; @@ -200,12 +186,11 @@ static int __init scanlog_init(void) if (!data) goto err; - ent = proc_create("ppc64/rtas/scan-log-dump", S_IRUSR, NULL, - &scanlog_fops); + ent = proc_create_data("ppc64/rtas/scan-log-dump", S_IRUSR, NULL, + &scanlog_fops, data); if (!ent) goto err; - ent->data = data; proc_ppc64_scan_log_dump = ent; return 0; diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 29a7940f284..1d035082e78 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -430,6 +430,13 @@ config CMM_IUCV Select this option to enable the special message interface to the cooperative memory management. +config PAGE_STATES + bool "Unused page notification" + help + This enables the notification of unused pages to the + hypervisor. The ESSA instruction is used to do the states + changes between a page that has content and the unused state. + config VIRT_TIMER bool "Virtual CPU timer support" help diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 743d54f0b8d..d003a6e16af 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -121,7 +121,7 @@ sys32_ptrace_wrapper: lgfr %r3,%r3 # long llgtr %r4,%r4 # long llgfr %r5,%r5 # long - jg sys_ptrace # branch to system call + jg compat_sys_ptrace # branch to system call .globl sys32_alarm_wrapper sys32_alarm_wrapper: diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index bdbb3bcd78a..708cf9cf9a3 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -279,8 +279,6 @@ sysc_do_restart: st %r2,SP_R2(%r15) # store return value (change R2 on stack) sysc_return: - tm SP_PSW+1(%r15),0x01 # returning to user ? - bno BASED(sysc_restore) tm __TI_flags+3(%r9),_TIF_WORK_SVC bnz BASED(sysc_work) # there is work to do (signals etc.) sysc_restore: @@ -312,6 +310,8 @@ sysc_work_loop: # One of the work bits is on. Find out which one. # sysc_work: + tm SP_PSW+1(%r15),0x01 # returning to user ? + bno BASED(sysc_restore) tm __TI_flags+3(%r9),_TIF_MCCK_PENDING bo BASED(sysc_mcck_pending) tm __TI_flags+3(%r9),_TIF_NEED_RESCHED @@ -602,12 +602,6 @@ io_no_vtime: la %r2,SP_PTREGS(%r15) # address of register-save area basr %r14,%r1 # branch to standard irq handler io_return: - tm SP_PSW+1(%r15),0x01 # returning to user ? -#ifdef CONFIG_PREEMPT - bno BASED(io_preempt) # no -> check for preemptive scheduling -#else - bno BASED(io_restore) # no-> skip resched & signal -#endif tm __TI_flags+3(%r9),_TIF_WORK_INT bnz BASED(io_work) # there is work to do (signals etc.) io_restore: @@ -629,10 +623,18 @@ io_restore_trace_psw: .long 0, io_restore_trace + 0x80000000 #endif -#ifdef CONFIG_PREEMPT -io_preempt: +# +# switch to kernel stack, then check the TIF bits +# +io_work: + tm SP_PSW+1(%r15),0x01 # returning to user ? +#ifndef CONFIG_PREEMPT + bno BASED(io_restore) # no-> skip resched & signal +#else + bnz BASED(io_work_user) # no -> check for preemptive scheduling + # check for preemptive scheduling icm %r0,15,__TI_precount(%r9) - bnz BASED(io_restore) + bnz BASED(io_restore) # preemption disabled l %r1,SP_R15(%r15) s %r1,BASED(.Lc_spsize) mvc SP_PTREGS(__PT_SIZE,%r1),SP_PTREGS(%r15) @@ -646,10 +648,7 @@ io_resume_loop: br %r1 # call schedule #endif -# -# switch to kernel stack, then check the TIF bits -# -io_work: +io_work_user: l %r1,__LC_KERNEL_STACK s %r1,BASED(.Lc_spsize) mvc SP_PTREGS(__PT_SIZE,%r1),SP_PTREGS(%r15) diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index 5a4a7bcd2bb..fee10177dbf 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -271,8 +271,6 @@ sysc_noemu: stg %r2,SP_R2(%r15) # store return value (change R2 on stack) sysc_return: - tm SP_PSW+1(%r15),0x01 # returning to user ? - jno sysc_restore tm __TI_flags+7(%r9),_TIF_WORK_SVC jnz sysc_work # there is work to do (signals etc.) sysc_restore: @@ -304,6 +302,8 @@ sysc_work_loop: # One of the work bits is on. Find out which one. # sysc_work: + tm SP_PSW+1(%r15),0x01 # returning to user ? + jno sysc_restore tm __TI_flags+7(%r9),_TIF_MCCK_PENDING jo sysc_mcck_pending tm __TI_flags+7(%r9),_TIF_NEED_RESCHED @@ -585,12 +585,6 @@ io_no_vtime: la %r2,SP_PTREGS(%r15) # address of register-save area brasl %r14,do_IRQ # call standard irq handler io_return: - tm SP_PSW+1(%r15),0x01 # returning to user ? -#ifdef CONFIG_PREEMPT - jno io_preempt # no -> check for preemptive scheduling -#else - jno io_restore # no-> skip resched & signal -#endif tm __TI_flags+7(%r9),_TIF_WORK_INT jnz io_work # there is work to do (signals etc.) io_restore: @@ -612,10 +606,41 @@ io_restore_trace_psw: .quad 0, io_restore_trace #endif -#ifdef CONFIG_PREEMPT -io_preempt: +# +# There is work todo, we need to check if we return to userspace, then +# check, if we are in SIE, if yes leave it +# +io_work: + tm SP_PSW+1(%r15),0x01 # returning to user ? +#ifndef CONFIG_PREEMPT +#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) + jnz io_work_user # yes -> no need to check for SIE + la %r1, BASED(sie_opcode) # we return to kernel here + lg %r2, SP_PSW+8(%r15) + clc 0(2,%r1), 0(%r2) # is current instruction = SIE? + jne io_restore # no-> return to kernel + lg %r1, SP_PSW+8(%r15) # yes-> add 4 bytes to leave SIE + aghi %r1, 4 + stg %r1, SP_PSW+8(%r15) + j io_restore # return to kernel +#else + jno io_restore # no-> skip resched & signal +#endif +#else + jnz io_work_user # yes -> do resched & signal +#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) + la %r1, BASED(sie_opcode) + lg %r2, SP_PSW+8(%r15) + clc 0(2,%r1), 0(%r2) # is current instruction = SIE? + jne 0f # no -> leave PSW alone + lg %r1, SP_PSW+8(%r15) # yes-> add 4 bytes to leave SIE + aghi %r1, 4 + stg %r1, SP_PSW+8(%r15) +0: +#endif + # check for preemptive scheduling icm %r0,15,__TI_precount(%r9) - jnz io_restore + jnz io_restore # preemption is disabled # switch to kernel stack lg %r1,SP_R15(%r15) aghi %r1,-SP_SIZE @@ -629,10 +654,7 @@ io_resume_loop: jg preempt_schedule_irq #endif -# -# switch to kernel stack, then check TIF bits -# -io_work: +io_work_user: lg %r1,__LC_KERNEL_STACK aghi %r1,-SP_SIZE mvc SP_PTREGS(__PT_SIZE,%r1),SP_PTREGS(%r15) @@ -653,6 +675,11 @@ io_work_loop: j io_restore io_work_done: +#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) +sie_opcode: + .long 0xb2140000 +#endif + # # _TIF_MCCK_PENDING is set, call handler # diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 7f427016374..35827b9bd4d 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -292,8 +292,7 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data) return 0; } -static int -do_ptrace_normal(struct task_struct *child, long request, long addr, long data) +long arch_ptrace(struct task_struct *child, long request, long addr, long data) { ptrace_area parea; int copied, ret; @@ -529,35 +528,19 @@ poke_user_emu31(struct task_struct *child, addr_t addr, addr_t data) return 0; } -static int -do_ptrace_emu31(struct task_struct *child, long request, long addr, long data) +long compat_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t caddr, compat_ulong_t cdata) { - unsigned int tmp; /* 4 bytes !! */ + unsigned long addr = caddr; + unsigned long data = cdata; ptrace_area_emu31 parea; int copied, ret; switch (request) { - case PTRACE_PEEKTEXT: - case PTRACE_PEEKDATA: - /* read word at location addr. */ - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); - if (copied != sizeof(tmp)) - return -EIO; - return put_user(tmp, (unsigned int __force __user *) data); - case PTRACE_PEEKUSR: /* read the word at location addr in the USER area. */ return peek_user_emu31(child, addr, data); - case PTRACE_POKETEXT: - case PTRACE_POKEDATA: - /* write the word at location addr. */ - tmp = data; - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 1); - if (copied != sizeof(tmp)) - return -EIO; - return 0; - case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ return poke_user_emu31(child, addr, data); @@ -587,82 +570,11 @@ do_ptrace_emu31(struct task_struct *child, long request, long addr, long data) copied += sizeof(unsigned int); } return 0; - case PTRACE_GETEVENTMSG: - return put_user((__u32) child->ptrace_message, - (unsigned int __force __user *) data); - case PTRACE_GETSIGINFO: - if (child->last_siginfo == NULL) - return -EINVAL; - return copy_siginfo_to_user32((compat_siginfo_t - __force __user *) data, - child->last_siginfo); - case PTRACE_SETSIGINFO: - if (child->last_siginfo == NULL) - return -EINVAL; - return copy_siginfo_from_user32(child->last_siginfo, - (compat_siginfo_t - __force __user *) data); } - return ptrace_request(child, request, addr, data); + return compat_ptrace_request(child, request, addr, data); } #endif -long arch_ptrace(struct task_struct *child, long request, long addr, long data) -{ - switch (request) { - case PTRACE_SYSCALL: - /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: - /* restart after signal. */ - if (!valid_signal(data)) - return -EIO; - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - child->exit_code = data; - /* make sure the single step bit is not set. */ - user_disable_single_step(child); - wake_up_process(child); - return 0; - - case PTRACE_KILL: - /* - * make the child exit. Best I can do is send it a sigkill. - * perhaps it should be put in the status that it wants to - * exit. - */ - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - return 0; - child->exit_code = SIGKILL; - /* make sure the single step bit is not set. */ - user_disable_single_step(child); - wake_up_process(child); - return 0; - - case PTRACE_SINGLESTEP: - /* set the trap flag. */ - if (!valid_signal(data)) - return -EIO; - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - child->exit_code = data; - user_enable_single_step(child); - /* give it a chance to run. */ - wake_up_process(child); - return 0; - - /* Do requests that differ for 31/64 bit */ - default: -#ifdef CONFIG_COMPAT - if (test_thread_flag(TIF_31BIT)) - return do_ptrace_emu31(child, request, addr, data); -#endif - return do_ptrace_normal(child, request, addr, data); - } - /* Not reached. */ - return -EIO; -} - asmlinkage void syscall_trace(struct pt_regs *regs, int entryexit) { diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 1761b74d639..e051cad1f1e 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select S390_SWITCH_AMODE - select PREEMPT ---help--- Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 349581a2610..47a0b642174 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -105,6 +105,9 @@ static intercept_handler_t instruction_handlers[256] = { static int handle_noop(struct kvm_vcpu *vcpu) { switch (vcpu->arch.sie_block->icptcode) { + case 0x0: + vcpu->stat.exit_null++; + break; case 0x10: vcpu->stat.exit_external_request++; break; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 98d1e73e01f..0ac36a649eb 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -31,6 +31,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "userspace_handled", VCPU_STAT(exit_userspace) }, + { "exit_null", VCPU_STAT(exit_null) }, { "exit_validity", VCPU_STAT(exit_validity) }, { "exit_stop_request", VCPU_STAT(exit_stop_request) }, { "exit_external_request", VCPU_STAT(exit_external_request) }, @@ -221,10 +222,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->arch.guest_fpregs.fpc &= FPC_VALID_MASK; restore_fp_regs(&vcpu->arch.guest_fpregs); restore_access_regs(vcpu->arch.guest_acrs); - - if (signal_pending(current)) - atomic_set_mask(CPUSTAT_STOP_INT, - &vcpu->arch.sie_block->cpuflags); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index fb988a48a75..2a745813454 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -5,3 +5,4 @@ obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_PAGE_STATES) += page-states.o diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index fa31de6ae97..29f3a63806b 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -126,6 +126,9 @@ void __init mem_init(void) /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); + /* Setup guest page hinting */ + cmma_init(); + /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c new file mode 100644 index 00000000000..fc0ad73ffd9 --- /dev/null +++ b/arch/s390/mm/page-states.c @@ -0,0 +1,79 @@ +/* + * arch/s390/mm/page-states.c + * + * Copyright IBM Corp. 2008 + * + * Guest page hinting for unused pages. + * + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/init.h> + +#define ESSA_SET_STABLE 1 +#define ESSA_SET_UNUSED 2 + +static int cmma_flag; + +static int __init cmma(char *str) +{ + char *parm; + parm = strstrip(str); + if (strcmp(parm, "yes") == 0 || strcmp(parm, "on") == 0) { + cmma_flag = 1; + return 1; + } + cmma_flag = 0; + if (strcmp(parm, "no") == 0 || strcmp(parm, "off") == 0) + return 1; + return 0; +} + +__setup("cmma=", cmma); + +void __init cmma_init(void) +{ + register unsigned long tmp asm("0") = 0; + register int rc asm("1") = -EOPNOTSUPP; + + if (!cmma_flag) + return; + asm volatile( + " .insn rrf,0xb9ab0000,%1,%1,0,0\n" + "0: la %0,0\n" + "1:\n" + EX_TABLE(0b,1b) + : "+&d" (rc), "+&d" (tmp)); + if (rc) + cmma_flag = 0; +} + +void arch_free_page(struct page *page, int order) +{ + int i, rc; + + if (!cmma_flag) + return; + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" ((page_to_pfn(page) + i) << PAGE_SHIFT), + "i" (ESSA_SET_UNUSED)); +} + +void arch_alloc_page(struct page *page, int order) +{ + int i, rc; + + if (!cmma_flag) + return; + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" ((page_to_pfn(page) + i) << PAGE_SHIFT), + "i" (ESSA_SET_STABLE)); +} diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index 10b86e1cc65..5047490fc29 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -191,9 +191,9 @@ void line_flush_chars(struct tty_struct *tty) line_flush_buffer(tty); } -void line_put_char(struct tty_struct *tty, unsigned char ch) +int line_put_char(struct tty_struct *tty, unsigned char ch) { - line_write(tty, &ch, sizeof(ch)); + return line_write(tty, &ch, sizeof(ch)); } int line_write(struct tty_struct *tty, const unsigned char *buf, int len) diff --git a/arch/um/include/line.h b/arch/um/include/line.h index 1223f2c844b..979b73e6352 100644 --- a/arch/um/include/line.h +++ b/arch/um/include/line.h @@ -71,7 +71,7 @@ extern int line_setup(struct line *lines, unsigned int sizeof_lines, char *init, char **error_out); extern int line_write(struct tty_struct *tty, const unsigned char *buf, int len); -extern void line_put_char(struct tty_struct *tty, unsigned char ch); +extern int line_put_char(struct tty_struct *tty, unsigned char ch); extern void line_set_termios(struct tty_struct *tty, struct ktermios * old); extern int line_chars_in_buffer(struct tty_struct *tty); extern void line_flush_buffer(struct tty_struct *tty); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c3f880902d6..bbcafaa160c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -18,6 +18,7 @@ config X86_64 ### Arch settings config X86 def_bool y + select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE select HAVE_KPROBES @@ -1661,6 +1662,7 @@ config GEODE_MFGPT_TIMER config OLPC bool "One Laptop Per Child support" + depends on MGEODE_LX default n help Add support for detecting the unique features of the OLPC diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index d01ea42187e..edaadea90aa 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -191,7 +191,7 @@ static void read_ehdr(FILE *fp) die("Cannot read ELF header: %s\n", strerror(errno)); } - if (memcmp(ehdr.e_ident, ELFMAG, 4) != 0) { + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) { die("No ELF magic\n"); } if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) { diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index bbdacb398d4..5e618c3b472 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -83,9 +83,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o -ifdef CONFIG_INPUT_PCSPKR -obj-y += pcspeaker.o -endif +obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 7335959b6af..fd5ca97a2ad 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -10,5 +10,5 @@ endif $(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin $(obj)/realmode/wakeup.bin: FORCE - $(Q)$(MAKE) $(build)=$(obj)/realmode $@ + $(Q)$(MAKE) $(build)=$(obj)/realmode diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile index 092900854ac..1c31cc0e9de 100644 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ b/arch/x86/kernel/acpi/realmode/Makefile @@ -6,7 +6,8 @@ # for more details. # -targets := wakeup.bin wakeup.elf +always := wakeup.bin +targets := wakeup.elf wakeup.lds wakeup-y += wakeup.o wakemain.o video-mode.o copy.o @@ -48,7 +49,7 @@ LDFLAGS_wakeup.elf := -T CPPFLAGS_wakeup.lds += -P -C -$(obj)/wakeup.elf: $(src)/wakeup.lds $(WAKEUP_OBJS) FORCE +$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE $(call if_changed,ld) OBJCOPYFLAGS_wakeup.bin := -O binary diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ddee04043ae..4bc1be5d547 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -133,6 +133,7 @@ static int kvm_register_clock(void) return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); } +#ifdef CONFIG_X86_LOCAL_APIC static void kvm_setup_secondary_clock(void) { /* @@ -143,6 +144,7 @@ static void kvm_setup_secondary_clock(void) /* ok, done with our trickery, call native */ setup_secondary_APIC_clock(); } +#endif /* * After the clock is registered, the host will keep writing to the @@ -177,7 +179,9 @@ void __init kvmclock_init(void) pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; +#ifdef CONFIG_X86_LOCAL_APIC pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; +#endif machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 3e2c54dc8b2..404683b94e7 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -794,6 +794,11 @@ void __init find_smp_config(void) ACPI-based MP Configuration -------------------------------------------------------------------------- */ +/* + * Keep this outside and initialized to 0, for !CONFIG_ACPI builds: + */ +int es7000_plat; + #ifdef CONFIG_ACPI #ifdef CONFIG_X86_IO_APIC @@ -909,8 +914,6 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) MP_intsrc_info(&intsrc); } -int es7000_plat; - void __init mp_config_acpi_legacy_irqs(void) { struct mpc_config_intsrc intsrc; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 07c6d42ab5f..f6be7d5f82f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -149,7 +149,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), - DMI_MATCH(DMI_BOARD_NAME, "0WF810"), }, }, { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c0c68c18a78..cc6f5eb20b2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -95,7 +95,7 @@ void __init setup_per_cpu_areas(void) /* Copy section for each CPU (we discard the original) */ size = PERCPU_ENOUGH_ROOM; - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); for_each_possible_cpu(i) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 84241a256dc..6b087ab6cd8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -299,7 +299,7 @@ static void __cpuinit smp_callin(void) /* * Activate a secondary processor. */ -void __cpuinit start_secondary(void *unused) +static void __cpuinit start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too @@ -1306,7 +1306,7 @@ static void remove_siblinginfo(int cpu) cpu_clear(cpu, cpu_sibling_setup_map); } -int additional_cpus __initdata = -1; +static int additional_cpus __initdata = -1; static __init int setup_additional_cpus(char *s) { diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4c943eabacc..3324d90038e 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -288,6 +288,8 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) * mode 1 is one shot, mode 2 is period, otherwise del timer */ switch (ps->channels[0].mode) { case 1: + /* FIXME: enhance mode 4 precision */ + case 4: create_pit_timer(&ps->pit_timer, val, 0); break; case 2: diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2ad6f548167..36c5406b181 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -79,36 +79,6 @@ static int dbg = 1; } #endif -#define PT64_PT_BITS 9 -#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) -#define PT32_PT_BITS 10 -#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) - -#define PT_WRITABLE_SHIFT 1 - -#define PT_PRESENT_MASK (1ULL << 0) -#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) -#define PT_USER_MASK (1ULL << 2) -#define PT_PWT_MASK (1ULL << 3) -#define PT_PCD_MASK (1ULL << 4) -#define PT_ACCESSED_MASK (1ULL << 5) -#define PT_DIRTY_MASK (1ULL << 6) -#define PT_PAGE_SIZE_MASK (1ULL << 7) -#define PT_PAT_MASK (1ULL << 7) -#define PT_GLOBAL_MASK (1ULL << 8) -#define PT64_NX_SHIFT 63 -#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) - -#define PT_PAT_SHIFT 7 -#define PT_DIR_PAT_SHIFT 12 -#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) - -#define PT32_DIR_PSE36_SIZE 4 -#define PT32_DIR_PSE36_SHIFT 13 -#define PT32_DIR_PSE36_MASK \ - (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) - - #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 @@ -154,10 +124,6 @@ static int dbg = 1; #define PFERR_USER_MASK (1U << 2) #define PFERR_FETCH_MASK (1U << 4) -#define PT64_ROOT_LEVEL 4 -#define PT32_ROOT_LEVEL 2 -#define PT32E_ROOT_LEVEL 3 - #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 @@ -186,6 +152,12 @@ static struct kmem_cache *mmu_page_header_cache; static u64 __read_mostly shadow_trap_nonpresent_pte; static u64 __read_mostly shadow_notrap_nonpresent_pte; +static u64 __read_mostly shadow_base_present_pte; +static u64 __read_mostly shadow_nx_mask; +static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ +static u64 __read_mostly shadow_user_mask; +static u64 __read_mostly shadow_accessed_mask; +static u64 __read_mostly shadow_dirty_mask; void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) { @@ -194,6 +166,23 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) } EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); +void kvm_mmu_set_base_ptes(u64 base_pte) +{ + shadow_base_present_pte = base_pte; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); + +void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, + u64 dirty_mask, u64 nx_mask, u64 x_mask) +{ + shadow_user_mask = user_mask; + shadow_accessed_mask = accessed_mask; + shadow_dirty_mask = dirty_mask; + shadow_nx_mask = nx_mask; + shadow_x_mask = x_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); + static int is_write_protection(struct kvm_vcpu *vcpu) { return vcpu->arch.cr0 & X86_CR0_WP; @@ -232,7 +221,7 @@ static int is_writeble_pte(unsigned long pte) static int is_dirty_pte(unsigned long pte) { - return pte & PT_DIRTY_MASK; + return pte & shadow_dirty_mask; } static int is_rmap_pte(u64 pte) @@ -387,7 +376,6 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); *write_count += 1; - WARN_ON(*write_count > KVM_PAGES_PER_HPAGE); } static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) @@ -547,7 +535,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) return; sp = page_header(__pa(spte)); pfn = spte_to_pfn(*spte); - if (*spte & PT_ACCESSED_MASK) + if (*spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (is_writeble_pte(*spte)) kvm_release_pfn_dirty(pfn); @@ -1073,17 +1061,17 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, * whether the guest actually used the pte (in order to detect * demand paging). */ - spte = PT_PRESENT_MASK | PT_DIRTY_MASK; + spte = shadow_base_present_pte | shadow_dirty_mask; if (!speculative) pte_access |= PT_ACCESSED_MASK; if (!dirty) pte_access &= ~ACC_WRITE_MASK; - if (!(pte_access & ACC_EXEC_MASK)) - spte |= PT64_NX_MASK; - - spte |= PT_PRESENT_MASK; + if (pte_access & ACC_EXEC_MASK) + spte |= shadow_x_mask; + else + spte |= shadow_nx_mask; if (pte_access & ACC_USER_MASK) - spte |= PT_USER_MASK; + spte |= shadow_user_mask; if (largepage) spte |= PT_PAGE_SIZE_MASK; @@ -1188,8 +1176,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return -ENOMEM; } - table[index] = __pa(new_table->spt) | PT_PRESENT_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; + table[index] = __pa(new_table->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask; } table_addr = table[index] & PT64_BASE_ADDR_MASK; } @@ -1244,7 +1233,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; spin_lock(&vcpu->kvm->mmu_lock); -#ifdef CONFIG_X86_64 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -1256,7 +1244,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); return; } -#endif for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -1282,7 +1269,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; -#ifdef CONFIG_X86_64 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -1297,7 +1283,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = root; return; } -#endif metaphysical = !is_paging(vcpu); if (tdp_enabled) metaphysical = 1; @@ -1377,7 +1362,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, - largepage, gfn, pfn, TDP_ROOT_LEVEL); + largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); spin_unlock(&vcpu->kvm->mmu_lock); return r; @@ -1484,7 +1469,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->page_fault = tdp_page_fault; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; - context->shadow_root_level = TDP_ROOT_LEVEL; + context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; if (!is_paging(vcpu)) { @@ -1633,7 +1618,7 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) { u64 *spte = vcpu->arch.last_pte_updated; - return !!(spte && (*spte & PT_ACCESSED_MASK)); + return !!(spte && (*spte & shadow_accessed_mask)); } static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e64e9f56a65..1730757bbc7 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -3,11 +3,38 @@ #include <linux/kvm_host.h> -#ifdef CONFIG_X86_64 -#define TDP_ROOT_LEVEL PT64_ROOT_LEVEL -#else -#define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL -#endif +#define PT64_PT_BITS 9 +#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) +#define PT32_PT_BITS 10 +#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) + +#define PT_WRITABLE_SHIFT 1 + +#define PT_PRESENT_MASK (1ULL << 0) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define PT_PWT_MASK (1ULL << 3) +#define PT_PCD_MASK (1ULL << 4) +#define PT_ACCESSED_MASK (1ULL << 5) +#define PT_DIRTY_MASK (1ULL << 6) +#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAT_MASK (1ULL << 7) +#define PT_GLOBAL_MASK (1ULL << 8) +#define PT64_NX_SHIFT 63 +#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) + +#define PT_PAT_SHIFT 7 +#define PT_DIR_PAT_SHIFT 12 +#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) + +#define PT32_DIR_PSE36_SIZE 4 +#define PT32_DIR_PSE36_SHIFT 13 +#define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) + +#define PT64_ROOT_LEVEL 4 +#define PT32_ROOT_LEVEL 2 +#define PT32E_ROOT_LEVEL 3 static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 89e0be2c10d..ab22615eee8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1863,6 +1863,15 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } +static int get_npt_level(void) +{ +#ifdef CONFIG_X86_64 + return PT64_ROOT_LEVEL; +#else + return PT32E_ROOT_LEVEL; +#endif +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -1920,6 +1929,7 @@ static struct kvm_x86_ops svm_x86_ops = { .inject_pending_vectors = do_interrupt_requests, .set_tss_addr = svm_set_tss_addr, + .get_tdp_level = get_npt_level, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8e5d6645b90..bfe4db11989 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -42,6 +42,9 @@ module_param(enable_vpid, bool, 0); static int flexpriority_enabled = 1; module_param(flexpriority_enabled, bool, 0); +static int enable_ept = 1; +module_param(enable_ept, bool, 0); + struct vmcs { u32 revision_id; u32 abort; @@ -84,7 +87,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static int init_rmode_tss(struct kvm *kvm); +static int init_rmode(struct kvm *kvm); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -107,6 +110,11 @@ static struct vmcs_config { u32 vmentry_ctrl; } vmcs_config; +struct vmx_capability { + u32 ept; + u32 vpid; +} vmx_capability; + #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ .selector = GUEST_##seg##_SELECTOR, \ @@ -214,6 +222,32 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); } +static inline int cpu_has_vmx_invept_individual_addr(void) +{ + return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT)); +} + +static inline int cpu_has_vmx_invept_context(void) +{ + return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT)); +} + +static inline int cpu_has_vmx_invept_global(void) +{ + return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT)); +} + +static inline int cpu_has_vmx_ept(void) +{ + return (vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_EPT); +} + +static inline int vm_need_ept(void) +{ + return (cpu_has_vmx_ept() && enable_ept); +} + static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { return ((cpu_has_vmx_virtualize_apic_accesses()) && @@ -250,6 +284,18 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva) : : "a"(&operand), "c"(ext) : "cc", "memory"); } +static inline void __invept(int ext, u64 eptp, gpa_t gpa) +{ + struct { + u64 eptp, gpa; + } operand = {eptp, gpa}; + + asm volatile (ASM_VMX_INVEPT + /* CF==1 or ZF==1 --> rc = -1 */ + "; ja 1f ; ud2 ; 1:\n" + : : "a" (&operand), "c" (ext) : "cc", "memory"); +} + static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -301,6 +347,33 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); } +static inline void ept_sync_global(void) +{ + if (cpu_has_vmx_invept_global()) + __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); +} + +static inline void ept_sync_context(u64 eptp) +{ + if (vm_need_ept()) { + if (cpu_has_vmx_invept_context()) + __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); + else + ept_sync_global(); + } +} + +static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) +{ + if (vm_need_ept()) { + if (cpu_has_vmx_invept_individual_addr()) + __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, + eptp, gpa); + else + ept_sync_context(eptp); + } +} + static unsigned long vmcs_readl(unsigned long field) { unsigned long value; @@ -388,6 +461,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb |= 1u << 1; if (vcpu->arch.rmode.active) eb = ~0; + if (vm_need_ept()) + eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -985,7 +1060,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) { u32 vmx_msr_low, vmx_msr_high; - u32 min, opt; + u32 min, opt, min2, opt2; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; @@ -1003,6 +1078,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | #endif + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING; @@ -1018,11 +1095,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) ~CPU_BASED_CR8_STORE_EXITING; #endif if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { - min = 0; - opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + min2 = 0; + opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_ENABLE_VPID; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, + SECONDARY_EXEC_ENABLE_VPID | + SECONDARY_EXEC_ENABLE_EPT; + if (adjust_vmx_controls(min2, opt2, + MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) return -EIO; } @@ -1031,6 +1110,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif + if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { + /* CR3 accesses don't need to cause VM Exits when EPT enabled */ + min &= ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, + &_cpu_based_exec_control) < 0) + return -EIO; + rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, + vmx_capability.ept, vmx_capability.vpid); + } min = 0; #ifdef CONFIG_X86_64 @@ -1256,7 +1345,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); kvm_mmu_reset_context(vcpu); - init_rmode_tss(vcpu->kvm); + init_rmode(vcpu->kvm); } #ifdef CONFIG_X86_64 @@ -1304,8 +1393,64 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; } +static void ept_load_pdptrs(struct kvm_vcpu *vcpu) +{ + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { + printk(KERN_ERR "EPT: Fail to load pdptrs!\n"); + return; + } + vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); + } +} + +static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); + +static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, + unsigned long cr0, + struct kvm_vcpu *vcpu) +{ + if (!(cr0 & X86_CR0_PG)) { + /* From paging/starting to nonpaging */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_config.cpu_based_exec_ctrl | + (CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING)); + vcpu->arch.cr0 = cr0; + vmx_set_cr4(vcpu, vcpu->arch.cr4); + *hw_cr0 |= X86_CR0_PE | X86_CR0_PG; + *hw_cr0 &= ~X86_CR0_WP; + } else if (!is_paging(vcpu)) { + /* From nonpaging to paging */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_config.cpu_based_exec_ctrl & + ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING)); + vcpu->arch.cr0 = cr0; + vmx_set_cr4(vcpu, vcpu->arch.cr4); + if (!(vcpu->arch.cr0 & X86_CR0_WP)) + *hw_cr0 &= ~X86_CR0_WP; + } +} + +static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, + struct kvm_vcpu *vcpu) +{ + if (!is_paging(vcpu)) { + *hw_cr4 &= ~X86_CR4_PAE; + *hw_cr4 |= X86_CR4_PSE; + } else if (!(vcpu->arch.cr4 & X86_CR4_PAE)) + *hw_cr4 &= ~X86_CR4_PAE; +} + static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | + KVM_VM_CR0_ALWAYS_ON; + vmx_fpu_deactivate(vcpu); if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) @@ -1323,29 +1468,61 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif + if (vm_need_ept()) + ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, - (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); + vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) vmx_fpu_activate(vcpu); } +static u64 construct_eptp(unsigned long root_hpa) +{ + u64 eptp; + + /* TODO write the value reading from MSR */ + eptp = VMX_EPT_DEFAULT_MT | + VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; + eptp |= (root_hpa & PAGE_MASK); + + return eptp; +} + static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + unsigned long guest_cr3; + u64 eptp; + + guest_cr3 = cr3; + if (vm_need_ept()) { + eptp = construct_eptp(cr3); + vmcs_write64(EPT_POINTER, eptp); + ept_sync_context(eptp); + ept_load_pdptrs(vcpu); + guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : + VMX_EPT_IDENTITY_PAGETABLE_ADDR; + } + vmx_flush_tlb(vcpu); - vmcs_writel(GUEST_CR3, cr3); + vmcs_writel(GUEST_CR3, guest_cr3); if (vcpu->arch.cr0 & X86_CR0_PE) vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - vmcs_writel(CR4_READ_SHADOW, cr4); - vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ? - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); + unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ? + KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + vcpu->arch.cr4 = cr4; + if (vm_need_ept()) + ept_update_paging_mode_cr4(&hw_cr4, vcpu); + + vmcs_writel(CR4_READ_SHADOW, cr4); + vmcs_writel(GUEST_CR4, hw_cr4); } static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -1530,6 +1707,41 @@ out: return ret; } +static int init_rmode_identity_map(struct kvm *kvm) +{ + int i, r, ret; + pfn_t identity_map_pfn; + u32 tmp; + + if (!vm_need_ept()) + return 1; + if (unlikely(!kvm->arch.ept_identity_pagetable)) { + printk(KERN_ERR "EPT: identity-mapping pagetable " + "haven't been allocated!\n"); + return 0; + } + if (likely(kvm->arch.ept_identity_pagetable_done)) + return 1; + ret = 0; + identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT; + r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); + if (r < 0) + goto out; + /* Set up identity-mapping pagetable for EPT in real mode */ + for (i = 0; i < PT32_ENT_PER_PAGE; i++) { + tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); + r = kvm_write_guest_page(kvm, identity_map_pfn, + &tmp, i * sizeof(tmp), sizeof(tmp)); + if (r < 0) + goto out; + } + kvm->arch.ept_identity_pagetable_done = true; + ret = 1; +out: + return ret; +} + static void seg_setup(int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -1564,6 +1776,31 @@ out: return r; } +static int alloc_identity_pagetable(struct kvm *kvm) +{ + struct kvm_userspace_memory_region kvm_userspace_mem; + int r = 0; + + down_write(&kvm->slots_lock); + if (kvm->arch.ept_identity_pagetable) + goto out; + kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; + kvm_userspace_mem.flags = 0; + kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + kvm_userspace_mem.memory_size = PAGE_SIZE; + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + if (r) + goto out; + + down_read(¤t->mm->mmap_sem); + kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, + VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); + up_read(¤t->mm->mmap_sem); +out: + up_write(&kvm->slots_lock); + return r; +} + static void allocate_vpid(struct vcpu_vmx *vmx) { int vpid; @@ -1638,6 +1875,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) CPU_BASED_CR8_LOAD_EXITING; #endif } + if (!vm_need_ept()) + exec_control |= CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_CR3_LOAD_EXITING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); if (cpu_has_secondary_exec_ctrls()) { @@ -1647,6 +1887,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; + if (!vm_need_ept()) + exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -1722,6 +1964,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) return 0; } +static int init_rmode(struct kvm *kvm) +{ + if (!init_rmode_tss(kvm)) + return 0; + if (!init_rmode_identity_map(kvm)) + return 0; + return 1; +} + static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1729,7 +1980,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) int ret; down_read(&vcpu->kvm->slots_lock); - if (!init_rmode_tss(vmx->vcpu.kvm)) { + if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; } @@ -1994,6 +2245,9 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { + /* EPT won't cause page fault directly */ + if (vm_need_ept()) + BUG(); cr2 = vmcs_readl(EXIT_QUALIFICATION); KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, (u32)((u64)cr2 >> 32), handler); @@ -2323,6 +2577,64 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return kvm_task_switch(vcpu, tss_selector, reason); } +static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification; + enum emulation_result er; + gpa_t gpa; + unsigned long hva; + int gla_validity; + int r; + + exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + + if (exit_qualification & (1 << 6)) { + printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); + return -ENOTSUPP; + } + + gla_validity = (exit_qualification >> 7) & 0x3; + if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { + printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); + printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", + (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), + (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); + printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", + (long unsigned int)exit_qualification); + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + kvm_run->hw.hardware_exit_reason = 0; + return -ENOTSUPP; + } + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!kvm_is_error_hva(hva)) { + r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); + if (r < 0) { + printk(KERN_ERR "EPT: Not enough memory!\n"); + return -ENOMEM; + } + return 1; + } else { + /* must be MMIO */ + er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + + if (er == EMULATE_FAIL) { + printk(KERN_ERR + "EPT: Fail to handle EPT violation vmexit!er is %d\n", + er); + printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", + (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), + (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); + printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", + (long unsigned int)exit_qualification); + return -ENOTSUPP; + } else if (er == EMULATE_DO_MMIO) + return 0; + } + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -2346,6 +2658,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, }; static const int kvm_vmx_max_exit_handlers = @@ -2364,6 +2677,13 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); + /* Access CR3 don't cause VMExit in paging mode, so we need + * to sync with guest real CR3. */ + if (vm_need_ept() && is_paging(vcpu)) { + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + ept_load_pdptrs(vcpu); + } + if (unlikely(vmx->fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason @@ -2372,7 +2692,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } if ((vectoring_info & VECTORING_INFO_VALID_MASK) && - exit_reason != EXIT_REASON_EXCEPTION_NMI) + (exit_reason != EXIT_REASON_EXCEPTION_NMI && + exit_reason != EXIT_REASON_EPT_VIOLATION)) printk(KERN_WARNING "%s: unexpected, valid vectoring info and " "exit reason is 0x%x\n", __func__, exit_reason); if (exit_reason < kvm_vmx_max_exit_handlers @@ -2674,6 +2995,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return ERR_PTR(-ENOMEM); allocate_vpid(vmx); + if (id == 0 && vm_need_ept()) { + kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | + VMX_EPT_WRITABLE_MASK | + VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); + kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK, + VMX_EPT_FAKE_DIRTY_MASK, 0ull, + VMX_EPT_EXECUTABLE_MASK); + kvm_enable_tdp(); + } err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) @@ -2706,6 +3036,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (alloc_apic_access_page(kvm) != 0) goto free_vmcs; + if (vm_need_ept()) + if (alloc_identity_pagetable(kvm) != 0) + goto free_vmcs; + return &vmx->vcpu; free_vmcs: @@ -2735,6 +3069,11 @@ static void __init vmx_check_processor_compat(void *rtn) } } +static int get_ept_level(void) +{ + return VMX_EPT_DEFAULT_GAW + 1; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -2791,6 +3130,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .inject_pending_vectors = do_interrupt_requests, .set_tss_addr = vmx_set_tss_addr, + .get_tdp_level = get_ept_level, }; static int __init vmx_init(void) @@ -2843,9 +3183,14 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); + if (cpu_has_vmx_ept()) + bypass_guest_pf = 0; + if (bypass_guest_pf) kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); + ept_sync_global(); + return 0; out2: diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 5dff4606b98..79d94c610df 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -35,6 +35,8 @@ #define CPU_BASED_MWAIT_EXITING 0x00000400 #define CPU_BASED_RDPMC_EXITING 0x00000800 #define CPU_BASED_RDTSC_EXITING 0x00001000 +#define CPU_BASED_CR3_LOAD_EXITING 0x00008000 +#define CPU_BASED_CR3_STORE_EXITING 0x00010000 #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 #define CPU_BASED_TPR_SHADOW 0x00200000 @@ -49,6 +51,7 @@ * Definitions of Secondary Processor-Based VM-Execution Controls. */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 +#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 @@ -100,10 +103,22 @@ enum vmcs_field { VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, APIC_ACCESS_ADDR = 0x00002014, APIC_ACCESS_ADDR_HIGH = 0x00002015, + EPT_POINTER = 0x0000201a, + EPT_POINTER_HIGH = 0x0000201b, + GUEST_PHYSICAL_ADDRESS = 0x00002400, + GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, VMCS_LINK_POINTER_HIGH = 0x00002801, GUEST_IA32_DEBUGCTL = 0x00002802, GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + GUEST_PDPTR0 = 0x0000280a, + GUEST_PDPTR0_HIGH = 0x0000280b, + GUEST_PDPTR1 = 0x0000280c, + GUEST_PDPTR1_HIGH = 0x0000280d, + GUEST_PDPTR2 = 0x0000280e, + GUEST_PDPTR2_HIGH = 0x0000280f, + GUEST_PDPTR3 = 0x00002810, + GUEST_PDPTR3_HIGH = 0x00002811, PIN_BASED_VM_EXEC_CONTROL = 0x00004000, CPU_BASED_VM_EXEC_CONTROL = 0x00004002, EXCEPTION_BITMAP = 0x00004004, @@ -226,6 +241,8 @@ enum vmcs_field { #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_EPT_VIOLATION 48 +#define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 /* @@ -316,15 +333,36 @@ enum vmcs_field { #define MSR_IA32_VMX_CR4_FIXED1 0x489 #define MSR_IA32_VMX_VMCS_ENUM 0x48a #define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b +#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c #define MSR_IA32_FEATURE_CONTROL 0x3a #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 +#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 #define VMX_NR_VPIDS (1 << 16) #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 #define VMX_VPID_EXTENT_ALL_CONTEXT 2 +#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 +#define VMX_EPT_EXTENT_CONTEXT 1 +#define VMX_EPT_EXTENT_GLOBAL 2 +#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) +#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) +#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) +#define VMX_EPT_DEFAULT_GAW 3 +#define VMX_EPT_MAX_GAW 0x4 +#define VMX_EPT_MT_EPTE_SHIFT 3 +#define VMX_EPT_GAW_EPTP_SHIFT 3 +#define VMX_EPT_DEFAULT_MT 0x6ull +#define VMX_EPT_READABLE_MASK 0x1ull +#define VMX_EPT_WRITABLE_MASK 0x2ull +#define VMX_EPT_EXECUTABLE_MASK 0x4ull +#define VMX_EPT_FAKE_ACCESSED_MASK (1ull << 62) +#define VMX_EPT_FAKE_DIRTY_MASK (1ull << 63) + +#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ce556372a4..21338bdb28f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2417,6 +2417,9 @@ int kvm_arch_init(void *opaque) kvm_x86_ops = ops; kvm_mmu_set_nonpresent_ptes(0ull, 0ull); + kvm_mmu_set_base_ptes(PT_PRESENT_MASK); + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, + PT_DIRTY_MASK, PT64_NX_MASK, 0); return 0; out: @@ -3019,6 +3022,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvm_x86_ops->decache_regs(vcpu); + vcpu->arch.exception.pending = false; + vcpu_put(vcpu); return 0; @@ -3481,7 +3486,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { - cseg_desc.type &= ~(1 << 8); //clear the B flag + cseg_desc.type &= ~(1 << 1); //clear the B flag save_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc); } @@ -3507,7 +3512,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } if (reason != TASK_SWITCH_IRET) { - nseg_desc.type |= (1 << 8); + nseg_desc.type |= (1 << 1); save_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc); } @@ -3698,10 +3703,19 @@ void fx_init(struct kvm_vcpu *vcpu) { unsigned after_mxcsr_mask; + /* + * Touch the fpu the first time in non atomic context as if + * this is the first fpu instruction the exception handler + * will fire before the instruction returns and it'll have to + * allocate ram with GFP_KERNEL. + */ + if (!used_math()) + fx_save(&vcpu->arch.host_fx_image); + /* Initialize guest FPU by resetting ours and saving into guest's */ preempt_disable(); fx_save(&vcpu->arch.host_fx_image); - fpu_init(); + fx_finit(); fx_save(&vcpu->arch.guest_fx_image); fx_restore(&vcpu->arch.host_fx_image); preempt_enable(); @@ -3906,6 +3920,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_free_physmem(kvm); if (kvm->arch.apic_access_page) put_page(kvm->arch.apic_access_page); + if (kvm->arch.ept_identity_pagetable) + put_page(kvm->arch.ept_identity_pagetable); kfree(kvm); } diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 2ca08386f99..f2a696d6a24 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1761,6 +1761,7 @@ twobyte_insn: case 6: /* lmsw */ realmode_lmsw(ctxt->vcpu, (u16)c->src.val, &ctxt->eflags); + c->dst.type = OP_NONE; break; case 7: /* invlpg*/ emulate_invlpg(ctxt->vcpu, memop); diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 18378850e25..914ccf98368 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -476,29 +476,3 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif - -#ifndef CONFIG_HAVE_ARCH_PARSE_SRAT -/* - * XXX FIXME: Make SLIT table parsing available to 32-bit NUMA - * - * These stub functions are needed to compile 32-bit NUMA when SRAT is - * not set. There are functions in srat_64.c for parsing this table - * and it may be possible to make them common functions. - */ -void acpi_numa_slit_init (struct acpi_table_slit *slit) -{ - printk(KERN_INFO "ACPI: No support for parsing SLIT table\n"); -} - -void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa) -{ -} - -void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma) -{ -} - -void acpi_numa_arch_fixup(void) -{ -} -#endif /* CONFIG_HAVE_ARCH_PARSE_SRAT */ diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 9ee007be914..369cf065b6a 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -172,10 +172,3 @@ void reserve_top_address(unsigned long reserve) __FIXADDR_TOP = -reserve - PAGE_SIZE; __VMALLOC_RESERVE += reserve; } - -int pmd_bad(pmd_t pmd) -{ - WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd)); - - return pmd_bad_v1(pmd); -} diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32 index 7fa519868d7..89ec35d00ef 100644 --- a/arch/x86/pci/Makefile_32 +++ b/arch/x86/pci/Makefile_32 @@ -6,11 +6,19 @@ obj-$(CONFIG_PCI_DIRECT) += direct.o obj-$(CONFIG_PCI_OLPC) += olpc.o pci-y := fixup.o + +# Do not change the ordering here. There is a nasty init function +# ordering dependency which breaks when you move acpi.o below +# legacy/irq.o pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o -pci-$(CONFIG_X86_VISWS) += visws.o fixup.o -pci-$(CONFIG_X86_NUMAQ) += numa.o irq.o +# Careful: VISWS and NUMAQ overrule the pci-y above. The colons are +# therefor correct. This needs a proper fix by distangling the code. +pci-$(CONFIG_X86_VISWS) := visws.o fixup.o +pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o + +# Necessary for NUMAQ as well pci-$(CONFIG_NUMA) += mp_bus_to_node.o obj-y += $(pci-y) common.o early.o diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 1a9c0c6a1a1..d95de2f199c 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -6,45 +6,6 @@ #include <asm/numa.h> #include "pci.h" -static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d) -{ - pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; - printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident); - return 0; -} - -static struct dmi_system_id acpi_pciprobe_dmi_table[] __devinitdata = { -/* - * Systems where PCI IO resource ISA alignment can be skipped - * when the ISA enable bit in the bridge control is not set - */ - { - .callback = can_skip_ioresource_align, - .ident = "IBM System x3800", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "IBM"), - DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), - }, - }, - { - .callback = can_skip_ioresource_align, - .ident = "IBM System x3850", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "IBM"), - DMI_MATCH(DMI_PRODUCT_NAME, "x3850"), - }, - }, - { - .callback = can_skip_ioresource_align, - .ident = "IBM System x3950", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "IBM"), - DMI_MATCH(DMI_PRODUCT_NAME, "x3950"), - }, - }, - {} -}; - struct pci_root_info { char *name; unsigned int res_num; @@ -196,8 +157,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do int pxm; #endif - dmi_check_system(acpi_pciprobe_dmi_table); - if (domain && !pci_domains_supported) { printk(KERN_WARNING "PCI: Multiple domains not supported " "(dom %d, bus %d)\n", domain, busnum); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 2a4d751818b..8545c8a9d10 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -77,17 +77,48 @@ int pcibios_scanned; */ DEFINE_SPINLOCK(pci_config_lock); -static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) +static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d) { - struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; - - if (rom_r->parent) - return; - if (rom_r->start) - /* we deal with BIOS assigned ROM later */ - return; - if (!(pci_probe & PCI_ASSIGN_ROMS)) - rom_r->start = rom_r->end = rom_r->flags = 0; + pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; + printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident); + return 0; +} + +static struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitdata = { +/* + * Systems where PCI IO resource ISA alignment can be skipped + * when the ISA enable bit in the bridge control is not set + */ + { + .callback = can_skip_ioresource_align, + .ident = "IBM System x3800", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), + }, + }, + { + .callback = can_skip_ioresource_align, + .ident = "IBM System x3850", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "x3850"), + }, + }, + { + .callback = can_skip_ioresource_align, + .ident = "IBM System x3950", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "x3950"), + }, + }, + {} +}; + +void __init dmi_check_skip_isa_align(void) +{ + dmi_check_system(can_skip_pciprobe_dmi_table); } /* @@ -97,11 +128,7 @@ static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) void __devinit pcibios_fixup_bus(struct pci_bus *b) { - struct pci_dev *dev; - pci_read_bridge_bases(b); - list_for_each_entry(dev, &b->devices, bus_list) - pcibios_fixup_device_resources(dev); } /* @@ -318,13 +345,16 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { {} }; +void __init dmi_check_pciprobe(void) +{ + dmi_check_system(pciprobe_dmi_table); +} + struct pci_bus * __devinit pcibios_scan_root(int busnum) { struct pci_bus *bus = NULL; struct pci_sysdata *sd; - dmi_check_system(pciprobe_dmi_table); - while ((bus = pci_find_next_bus(bus)) != NULL) { if (bus->number == busnum) { /* Already scanned */ @@ -462,6 +492,9 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "routeirq")) { pci_routeirq = 1; return NULL; + } else if (!strcmp(str, "skip_isa_align")) { + pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; + return NULL; } return str; } @@ -489,7 +522,7 @@ void pcibios_disable_device (struct pci_dev *dev) pcibios_disable_irq(dev); } -struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) +struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) { struct pci_bus *bus = NULL; struct pci_sysdata *sd; @@ -512,7 +545,7 @@ struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) return bus; } -struct pci_bus *pci_scan_bus_with_sysdata(int busno) +struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno) { return pci_scan_bus_on_node(busno, &pci_root_ops, -1); } diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index b60b2abd480..ff3a6a33634 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -502,7 +502,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015, */ static void fam10h_pci_cfg_space_size(struct pci_dev *dev) { - dev->cfg_size = pci_cfg_space_size_ext(dev, 0); + dev->cfg_size = pci_cfg_space_size_ext(dev); } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1200, fam10h_pci_cfg_space_size); diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index dd30c6076b5..e70b9c57b88 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -33,6 +33,10 @@ static __init int pci_access_init(void) printk(KERN_ERR "PCI: Fatal: No config space access function found\n"); + dmi_check_pciprobe(); + + dmi_check_skip_isa_align(); + return 0; } arch_initcall(pci_access_init); diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index c58805a92db..f3972b12c60 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -38,6 +38,9 @@ enum pci_bf_sort_state { pci_dmi_bf, }; +extern void __init dmi_check_pciprobe(void); +extern void __init dmi_check_skip_isa_align(void); + /* pci-i386.c */ extern unsigned int pcibios_max_latency; diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 4dceeb1fc5e..cf058fecfce 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -162,7 +162,7 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) Elf32_Shdr *shdr; int i; - BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 || !elf_check_arch_ia32(ehdr) || ehdr->e_type != ET_DYN); diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c index 4db42bff8c6..69527688f79 100644 --- a/arch/x86/video/fbdev.c +++ b/arch/x86/video/fbdev.c @@ -1,5 +1,4 @@ /* - * * Copyright (C) 2007 Antonino Daplas <adaplas@gmail.com> * * This file is subject to the terms and conditions of the GNU General Public @@ -29,3 +28,4 @@ int fb_is_primary_device(struct fb_info *info) return retval; } EXPORT_SYMBOL(fb_is_primary_device); +MODULE_LICENSE("GPL"); |