aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-10-16 15:36:00 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2008-10-16 15:36:00 -0700
commit08d19f51f05a68ce89a289320ce4ed96e757df72 (patch)
tree31c5d718d0aeaff5083fe533cd6e1f9fbbe846bb /arch/x86
parent1c95e1b69073cff5ff179e592fa1a1e182c78a17 (diff)
parent2381ad241d0bea1253a37f314b270848067640bb (diff)
Merge branch 'kvm-updates/2.6.28' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'kvm-updates/2.6.28' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (134 commits) KVM: ia64: Add intel iommu support for guests. KVM: ia64: add directed mmio range support for kvm guests KVM: ia64: Make pmt table be able to hold physical mmio entries. KVM: Move irqchip_in_kernel() from ioapic.h to irq.h KVM: Separate irq ack notification out of arch/x86/kvm/irq.c KVM: Change is_mmio_pfn to kvm_is_mmio_pfn, and make it common for all archs KVM: Move device assignment logic to common code KVM: Device Assignment: Move vtd.c from arch/x86/kvm/ to virt/kvm/ KVM: VMX: enable invlpg exiting if EPT is disabled KVM: x86: Silence various LAPIC-related host kernel messages KVM: Device Assignment: Map mmio pages into VT-d page table KVM: PIC: enhance IPI avoidance KVM: MMU: add "oos_shadow" parameter to disable oos KVM: MMU: speed up mmu_unsync_walk KVM: MMU: out of sync shadow core KVM: MMU: mmu_convert_notrap helper KVM: MMU: awareness of new kvm_mmu_zap_page behaviour KVM: MMU: mmu_parent_walk KVM: x86: trap invlpg KVM: MMU: sync roots on mmu reload ...
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/kernel/kvmclock.c30
-rw-r--r--arch/x86/kernel/pvclock.c12
-rw-r--r--arch/x86/kvm/Makefile5
-rw-r--r--arch/x86/kvm/i8254.c81
-rw-r--r--arch/x86/kvm/i8254.h7
-rw-r--r--arch/x86/kvm/i8259.c53
-rw-r--r--arch/x86/kvm/irq.c3
-rw-r--r--arch/x86/kvm/irq.h6
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h32
-rw-r--r--arch/x86/kvm/lapic.c43
-rw-r--r--arch/x86/kvm/mmu.c680
-rw-r--r--arch/x86/kvm/paging_tmpl.h249
-rw-r--r--arch/x86/kvm/svm.c156
-rw-r--r--arch/x86/kvm/vmx.c712
-rw-r--r--arch/x86/kvm/vmx.h3
-rw-r--r--arch/x86/kvm/x86.c552
-rw-r--r--arch/x86/kvm/x86.h22
-rw-r--r--arch/x86/kvm/x86_emulate.c170
-rw-r--r--arch/x86/xen/time.c11
19 files changed, 1906 insertions, 921 deletions
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d02def06ca9..774ac499156 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
return ret;
}
+/*
+ * If we don't do that, there is the possibility that the guest
+ * will calibrate under heavy load - thus, getting a lower lpj -
+ * and execute the delays themselves without load. This is wrong,
+ * because no delay loop can finish beforehand.
+ * Any heuristics is subject to fail, because ultimately, a large
+ * poll of guests can be running and trouble each other. So we preset
+ * lpj here
+ */
+static unsigned long kvm_get_tsc_khz(void)
+{
+ return preset_lpj;
+}
+
+static void kvm_get_preset_lpj(void)
+{
+ struct pvclock_vcpu_time_info *src;
+ unsigned long khz;
+ u64 lpj;
+
+ src = &per_cpu(hv_clock, 0);
+ khz = pvclock_tsc_khz(src);
+
+ lpj = ((u64)khz * 1000);
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+}
+
static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_read,
@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
pv_time_ops.get_wallclock = kvm_get_wallclock;
pv_time_ops.set_wallclock = kvm_set_wallclock;
pv_time_ops.sched_clock = kvm_clock_read;
+ pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
#ifdef CONFIG_X86_LOCAL_APIC
pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
#endif
@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
+ kvm_get_preset_lpj();
clocksource_register(&kvm_clock);
}
}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 05fbe9a0325..4f9c55f3a7c 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
return dst->version;
}
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+{
+ u64 pv_tsc_khz = 1000000ULL << 32;
+
+ do_div(pv_tsc_khz, src->tsc_to_system_mul);
+ if (src->tsc_shift < 0)
+ pv_tsc_khz <<= -src->tsc_shift;
+ else
+ pv_tsc_khz >>= src->tsc_shift;
+ return pv_tsc_khz;
+}
+
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
struct pvclock_shadow_time shadow;
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d0e940bb6f4..c02343594b4 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,10 +3,13 @@
#
common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o)
+ coalesced_mmio.o irq_comm.o)
ifeq ($(CONFIG_KVM_TRACE),y)
common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
endif
+ifeq ($(CONFIG_DMAR),y)
+common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+endif
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c0f7872a912..634132a9a51 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
if (!atomic_inc_and_test(&pt->pending))
set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
- if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
- vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ if (vcpu0 && waitqueue_active(&vcpu0->wq))
wake_up_interruptible(&vcpu0->wq);
- }
pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
pt->scheduled = ktime_to_ns(pt->timer.expires);
+ if (pt->period)
+ ps->channels[0].count_load_time = pt->timer.expires;
return (pt->period == 0 ? 0 : 1);
}
@@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
{
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
- if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
+ if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
return atomic_read(&pit->pit_state.pit_timer.pending);
-
return 0;
}
+static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+ struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
+ irq_ack_notifier);
+ spin_lock(&ps->inject_lock);
+ if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+ atomic_inc(&ps->pit_timer.pending);
+ ps->irq_ack = 1;
+ spin_unlock(&ps->inject_lock);
+}
+
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
struct kvm_kpit_state *ps;
@@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt)
hrtimer_cancel(&pt->timer);
}
-static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
{
+ struct kvm_kpit_timer *pt = &ps->pit_timer;
s64 interval;
interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
pt->period = (is_period == 0) ? 0 : interval;
pt->timer.function = pit_timer_fn;
atomic_set(&pt->pending, 0);
+ ps->irq_ack = 1;
hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);
@@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
- create_pit_timer(&ps->pit_timer, val, 0);
+ create_pit_timer(ps, val, 0);
break;
case 2:
case 3:
- create_pit_timer(&ps->pit_timer, val, 1);
+ create_pit_timer(ps, val, 1);
break;
default:
destroy_pit_timer(&ps->pit_timer);
@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
mutex_unlock(&pit->pit_state.lock);
atomic_set(&pit->pit_state.pit_timer.pending, 0);
- pit->pit_state.inject_pending = 1;
+ pit->pit_state.irq_ack = 1;
}
struct kvm_pit *kvm_create_pit(struct kvm *kvm)
@@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
mutex_init(&pit->pit_state.lock);
mutex_lock(&pit->pit_state.lock);
+ spin_lock_init(&pit->pit_state.inject_lock);
/* Initialize PIO device */
pit->dev.read = pit_ioport_read;
@@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
pit_state->pit = pit;
hrtimer_init(&pit_state->pit_timer.timer,
CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ pit_state->irq_ack_notifier.gsi = 0;
+ pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+ kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
mutex_unlock(&pit->pit_state.lock);
kvm_pit_reset(pit);
@@ -578,10 +595,8 @@ void kvm_free_pit(struct kvm *kvm)
static void __inject_pit_timer_intr(struct kvm *kvm)
{
mutex_lock(&kvm->lock);
- kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
- kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
- kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
- kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+ kvm_set_irq(kvm, 0, 1);
+ kvm_set_irq(kvm, 0, 0);
mutex_unlock(&kvm->lock);
}
@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
struct kvm_kpit_state *ps;
if (vcpu && pit) {
+ int inject = 0;
ps = &pit->pit_state;
- /* Try to inject pending interrupts when:
- * 1. Pending exists
- * 2. Last interrupt was accepted or waited for too long time*/
- if (atomic_read(&ps->pit_timer.pending) &&
- (ps->inject_pending ||
- (jiffies - ps->last_injected_time
- >= KVM_MAX_PIT_INTR_INTERVAL))) {
- ps->inject_pending = 0;
- __inject_pit_timer_intr(kvm);
- ps->last_injected_time = jiffies;
- }
- }
-}
-
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
-{
- struct kvm_arch *arch = &vcpu->kvm->arch;
- struct kvm_kpit_state *ps;
-
- if (vcpu && arch->vpit) {
- ps = &arch->vpit->pit_state;
- if (atomic_read(&ps->pit_timer.pending) &&
- (((arch->vpic->pics[0].imr & 1) == 0 &&
- arch->vpic->pics[0].irq_base == vec) ||
- (arch->vioapic->redirtbl[0].fields.vector == vec &&
- arch->vioapic->redirtbl[0].fields.mask != 1))) {
- ps->inject_pending = 1;
- atomic_dec(&ps->pit_timer.pending);
- ps->channels[0].count_load_time = ktime_get();
+ /* Try to inject pending interrupts when
+ * last one has been acked.
+ */
+ spin_lock(&ps->inject_lock);
+ if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
+ ps->irq_ack = 0;
+ inject = 1;
}
+ spin_unlock(&ps->inject_lock);
+ if (inject)
+ __inject_pit_timer_intr(kvm);
}
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index db25c2a6c8c..e436d4983aa 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -8,7 +8,6 @@ struct kvm_kpit_timer {
int irq;
s64 period; /* unit: ns */
s64 scheduled;
- ktime_t last_update;
atomic_t pending;
};
@@ -34,8 +33,9 @@ struct kvm_kpit_state {
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
- bool inject_pending; /* if inject pending interrupts */
- unsigned long last_injected_time;
+ spinlock_t inject_lock;
+ unsigned long irq_ack;
+ struct kvm_irq_ack_notifier irq_ack_notifier;
};
struct kvm_pit {
@@ -54,7 +54,6 @@ struct kvm_pit {
#define KVM_PIT_CHANNEL_MASK 0x3
void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
struct kvm_pit *kvm_create_pit(struct kvm *kvm);
void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index c31164e8aa4..17e41e165f1 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,6 +30,19 @@
#include <linux/kvm_host.h>
+static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
+{
+ s->isr &= ~(1 << irq);
+ s->isr_ack |= (1 << irq);
+}
+
+void kvm_pic_clear_isr_ack(struct kvm *kvm)
+{
+ struct kvm_pic *s = pic_irqchip(kvm);
+ s->pics[0].isr_ack = 0xff;
+ s->pics[1].isr_ack = 0xff;
+}
+
/*
* set irq level. If an edge is detected, then the IRR is set to 1
*/
@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
*/
static inline void pic_intack(struct kvm_kpic_state *s, int irq)
{
+ s->isr |= 1 << irq;
if (s->auto_eoi) {
if (s->rotate_on_auto_eoi)
s->priority_add = (irq + 1) & 7;
- } else
- s->isr |= (1 << irq);
+ pic_clear_isr(s, irq);
+ }
/*
* We don't clear a level sensitive interrupt here
*/
@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
s->irr &= ~(1 << irq);
}
-int kvm_pic_read_irq(struct kvm_pic *s)
+int kvm_pic_read_irq(struct kvm *kvm)
{
int irq, irq2, intno;
+ struct kvm_pic *s = pic_irqchip(kvm);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
+ kvm_notify_acked_irq(kvm, irq);
return intno;
}
void kvm_pic_reset(struct kvm_kpic_state *s)
{
+ int irq, irqbase;
+ struct kvm *kvm = s->pics_state->irq_request_opaque;
+ struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
+
+ if (s == &s->pics_state->pics[0])
+ irqbase = 0;
+ else
+ irqbase = 8;
+
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
+ if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
+ if (s->irr & (1 << irq) || s->isr & (1 << irq))
+ kvm_notify_acked_irq(kvm, irq+irqbase);
+ }
s->last_irr = 0;
s->irr = 0;
s->imr = 0;
s->isr = 0;
+ s->isr_ack = 0xff;
s->priority_add = 0;
s->irq_base = 0;
s->read_reg_select = 0;
@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
priority = get_priority(s, s->isr);
if (priority != 8) {
irq = (priority + s->priority_add) & 7;
- s->isr &= ~(1 << irq);
+ pic_clear_isr(s, irq);
if (cmd == 5)
s->priority_add = (irq + 1) & 7;
pic_update_irq(s->pics_state);
@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
break;
case 3:
irq = val & 7;
- s->isr &= ~(1 << irq);
+ pic_clear_isr(s, irq);
pic_update_irq(s->pics_state);
break;
case 6:
@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
break;
case 7:
irq = val & 7;
- s->isr &= ~(1 << irq);
s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
pic_update_irq(s->pics_state);
break;
default:
@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
s->pics_state->pics[0].irr &= ~(1 << 2);
}
s->irr &= ~(1 << ret);
- s->isr &= ~(1 << ret);
+ pic_clear_isr(s, ret);
if (addr1 >> 7 || ret != 2)
pic_update_irq(s->pics_state);
} else {
@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level)
{
struct kvm *kvm = opaque;
struct kvm_vcpu *vcpu = kvm->vcpus[0];
+ struct kvm_pic *s = pic_irqchip(kvm);
+ int irq = pic_get_irq(&s->pics[0]);
- pic_irqchip(kvm)->output = level;
- if (vcpu)
+ s->output = level;
+ if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
+ s->pics[0].isr_ack &= ~(1 << irq);
kvm_vcpu_kick(vcpu);
+ }
}
struct kvm_pic *kvm_create_pic(struct kvm *kvm)
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 76d736b5f66..c019b8edcdb 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
if (kvm_apic_accept_pic_intr(v)) {
s = pic_irqchip(v->kvm);
s->output = 0; /* PIC */
- vector = kvm_pic_read_irq(s);
+ vector = kvm_pic_read_irq(v->kvm);
}
}
return vector;
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
{
kvm_apic_timer_intr_post(vcpu, vec);
- kvm_pit_timer_intr_post(vcpu, vec);
/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7ca47cbb48b..f17c8f5bbf3 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -42,6 +42,7 @@ struct kvm_kpic_state {
u8 irr; /* interrupt request register */
u8 imr; /* interrupt mask register */
u8 isr; /* interrupt service register */
+ u8 isr_ack; /* interrupt ack detection */
u8 priority_add; /* highest irq priority */
u8 irq_base;
u8 read_reg_select;
@@ -63,12 +64,13 @@ struct kvm_pic {
void *irq_request_opaque;
int output; /* intr from master PIC */
struct kvm_io_device dev;
+ void (*ack_notifier)(void *opaque, int irq);
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
+int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
+void kvm_pic_clear_isr_ack(struct kvm *kvm);
static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644
index 00000000000..1ff819dce7d
--- /dev/null
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -0,0 +1,32 @@
+#ifndef ASM_KVM_CACHE_REGS_H
+#define ASM_KVM_CACHE_REGS_H
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->cache_reg(vcpu, reg);
+
+ return vcpu->arch.regs[reg];
+}
+
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val)
+{
+ vcpu->arch.regs[reg] = val;
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
+{
+ return kvm_register_read(vcpu, VCPU_REGS_RIP);
+}
+
+static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+}
+
+#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73f43de69f6..6571926bfd3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,6 +32,7 @@
#include <asm/current.h>
#include <asm/apicdef.h>
#include <asm/atomic.h>
+#include "kvm_cache_regs.h"
#include "irq.h"
#define PRId64 "d"
@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
} else
apic_clear_vector(vector, apic->regs + APIC_TMR);
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
- kvm_vcpu_kick(vcpu);
- else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- if (waitqueue_active(&vcpu->wq))
- wake_up_interruptible(&vcpu->wq);
- }
+ kvm_vcpu_kick(vcpu);
result = (orig_irr == 0);
break;
@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
kvm_vcpu_kick(vcpu);
} else {
- printk(KERN_DEBUG
- "Ignoring de-assert INIT to vcpu %d\n",
- vcpu->vcpu_id);
+ apic_debug("Ignoring de-assert INIT to vcpu %d\n",
+ vcpu->vcpu_id);
}
-
break;
case APIC_DM_STARTUP:
- printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
- vcpu->vcpu_id, vector);
+ apic_debug("SIPI to vcpu %d vector 0x%02x\n",
+ vcpu->vcpu_id, vector);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
vcpu->arch.sipi_vector = vector;
vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
- if (waitqueue_active(&vcpu->wq))
- wake_up_interruptible(&vcpu->wq);
+ kvm_vcpu_kick(vcpu);
}
break;
@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
static void apic_set_eoi(struct kvm_lapic *apic)
{
int vector = apic_find_highest_isr(apic);
-
+ int trigger_mode;
/*
* Not every write EOI will has corresponding ISR,
* one example is when Kernel check timer on setup_IO_APIC
@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic)
apic_update_ppr(apic);
if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
- kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+ kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
}
static void apic_send_ipi(struct kvm_lapic *apic)
@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
struct kvm_run *run = vcpu->run;
set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
- kvm_x86_ops->cache_regs(vcpu);
- run->tpr_access.rip = vcpu->arch.rip;
+ run->tpr_access.rip = kvm_rip_read(vcpu);
run->tpr_access.is_write = write;
}
@@ -683,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
* Refer SDM 8.4.1
*/
if (len != 4 || alignment) {
- if (printk_ratelimit())
- printk(KERN_ERR "apic write: bad size=%d %lx\n",
- len, (long)address);
+ /* Don't shout loud, $infamous_os would cause only noise. */
+ apic_debug("apic write: bad size=%d %lx\n",
+ len, (long)address);
return;
}
@@ -947,10 +941,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
if(!atomic_inc_and_test(&apic->timer.pending))
set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
- if (waitqueue_active(q)) {
- apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ if (waitqueue_active(q))
wake_up_interruptible(q);
- }
+
if (apic_lvtt_period(apic)) {
result = 1;
apic->timer.dev.expires = ktime_add_ns(
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3da2508eb22..99c239c5c0a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,6 +70,9 @@ static int dbg = 0;
module_param(dbg, bool, 0644);
#endif
+static int oos_shadow = 1;
+module_param(oos_shadow, bool, 0644);
+
#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644);
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-struct kvm_pv_mmu_op_buffer {
- void *ptr;
- unsigned len;
- unsigned processed;
- char buf[512] __aligned(sizeof(long));
-};
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
struct kvm_rmap_desc {
u64 *shadow_ptes[RMAP_EXT];
struct kvm_rmap_desc *more;
};
+struct kvm_shadow_walk {
+ int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
+ u64 addr, u64 *spte, int level);
+};
+
+struct kvm_unsync_walk {
+ int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
+};
+
+typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
{
struct vm_area_struct *vma;
unsigned long addr;
+ int ret = 0;
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr))
- return 0;
+ return ret;
+ down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, addr);
if (vma && is_vm_hugetlb_page(vma))
- return 1;
+ ret = 1;
+ up_read(&current->mm->mmap_sem);
- return 0;
+ return ret;
}
static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
if (write_protected)
kvm_flush_remote_tlbs(kvm);
-
- account_shadowed(kvm, gfn);
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -859,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
BUG();
}
+
+static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ mmu_parent_walk_fn fn)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ struct kvm_mmu_page *parent_sp;
+ int i;
+
+ if (!sp->multimapped && sp->parent_pte) {
+ parent_sp = page_header(__pa(sp->parent_pte));
+ fn(vcpu, parent_sp);
+ mmu_parent_walk(vcpu, parent_sp, fn);
+ return;
+ }
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
+ fn(vcpu, parent_sp);
+ mmu_parent_walk(vcpu, parent_sp, fn);
+ }
+}
+
+static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+{
+ unsigned int index;
+ struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+ index = spte - sp->spt;
+ __set_bit(index, sp->unsync_child_bitmap);
+ sp->unsync_children = 1;
+}
+
+static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ int i;
+
+ if (!sp->parent_pte)
+ return;
+
+ if (!sp->multimapped) {
+ kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+ return;
+ }
+
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
+ }
+}
+
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ sp->unsync_children = 1;
+ kvm_mmu_update_parents_unsync(sp);
+ return 1;
+}
+
+static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+ kvm_mmu_update_parents_unsync(sp);
+}
+
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
@@ -868,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
sp->spt[i] = shadow_trap_nonpresent_pte;
}
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ return 1;
+}
+
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+}
+
+#define for_each_unsync_children(bitmap, idx) \
+ for (idx = find_first_bit(bitmap, 512); \
+ idx < 512; \
+ idx = find_next_bit(bitmap, 512, idx+1))
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_unsync_walk *walker)
+{
+ int i, ret;
+
+ if (!sp->unsync_children)
+ return 0;
+
+ for_each_unsync_children(sp->unsync_child_bitmap, i) {
+ u64 ent = sp->spt[i];
+
+ if (is_shadow_present_pte(ent)) {
+ struct kvm_mmu_page *child;
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+ if (child->unsync_children) {
+ ret = mmu_unsync_walk(child, walker);
+ if (ret)
+ return ret;
+ __clear_bit(i, sp->unsync_child_bitmap);
+ }
+
+ if (child->unsync) {
+ ret = walker->entry(child, walker);
+ __clear_bit(i, sp->unsync_child_bitmap);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
+ sp->unsync_children = 0;
+
+ return 0;
+}
+
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
{
unsigned index;
@@ -888,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
return NULL;
}
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ WARN_ON(!sp->unsync);
+ sp->unsync = 0;
+ --kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ return 1;
+ }
+
+ rmap_write_protect(vcpu->kvm, sp->gfn);
+ if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ return 1;
+ }
+
+ kvm_mmu_flush_tlb(vcpu);
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ return 0;
+}
+
+struct sync_walker {
+ struct kvm_vcpu *vcpu;
+ struct kvm_unsync_walk walker;
+};
+
+static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+ struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
+ walker);
+ struct kvm_vcpu *vcpu = sync_walk->vcpu;
+
+ kvm_sync_page(vcpu, sp);
+ return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ struct sync_walker walker = {
+ .walker = { .entry = mmu_sync_fn, },
+ .vcpu = vcpu,
+ };
+
+ while (mmu_unsync_walk(sp, &walker.walker))
+ cond_resched_lock(&vcpu->kvm->mmu_lock);
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
@@ -901,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
unsigned quadrant;
struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node;
+ struct hlist_node *node, *tmp;
role.word = 0;
role.glevels = vcpu->arch.mmu.root_level;
@@ -917,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn, role.word);
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && sp->role.word == role.word) {
+ hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
+ if (sp->gfn == gfn) {
+ if (sp->unsync)
+ if (kvm_sync_page(vcpu, sp))
+ continue;
+