--- zzzz-none-000/linux-3.10.107/virt/kvm/kvm_main.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/virt/kvm/kvm_main.c 2021-02-04 17:41:59.000000000 +0000 @@ -16,7 +16,7 @@ * */ -#include "iodev.h" +#include #include #include @@ -58,6 +58,7 @@ #include "coalesced_mmio.h" #include "async_pf.h" +#include "vfio.h" #define CREATE_TRACE_POINTS #include @@ -65,17 +66,30 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +/* Architectures should define their poll value according to the halt latency */ +static unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; +module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR); + +/* Default doubles per-vcpu halt_poll_ns. */ +static unsigned int halt_poll_ns_grow = 2; +module_param(halt_poll_ns_grow, int, S_IRUGO); + +/* Default resets per-vcpu halt_poll_ns . */ +static unsigned int halt_poll_ns_shrink; +module_param(halt_poll_ns_shrink, int, S_IRUGO); + /* * Ordering of locks: * - * kvm->lock --> kvm->slots_lock --> kvm->irq_lock + * kvm->lock --> kvm->slots_lock --> kvm->irq_lock */ -DEFINE_RAW_SPINLOCK(kvm_lock); +DEFINE_SPINLOCK(kvm_lock); +static DEFINE_RAW_SPINLOCK(kvm_count_lock); LIST_HEAD(vm_list); static cpumask_var_t cpus_hardware_enabled; -static int kvm_usage_count = 0; +static int kvm_usage_count; static atomic_t hardware_enable_failed; struct kmem_cache *kvm_vcpu_cache; @@ -84,10 +98,11 @@ static __read_mostly struct preempt_ops kvm_preempt_ops; struct dentry *kvm_debugfs_dir; +EXPORT_SYMBOL_GPL(kvm_debugfs_dir); static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_KVM_COMPAT static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); #endif @@ -96,35 +111,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -bool kvm_rebooting; +static void kvm_release_pfn_dirty(pfn_t pfn); +static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); + +__visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); static bool largepages_enabled = true; -bool kvm_is_mmio_pfn(pfn_t pfn) +bool kvm_is_reserved_pfn(pfn_t pfn) { - if (pfn_valid(pfn)) { - int reserved; - struct page *tail = pfn_to_page(pfn); - struct page *head = compound_head(tail); - reserved = PageReserved(head); - if (head != tail) { - /* - * "head" is not a dangling pointer - * (compound_head takes care of that) - * but the hugepage may have been splitted - * from under us (and we may not hold a - * reference count on the head page so it can - * be reused before we run PageReferenced), so - * we've to check PageTail before returning - * what we just read. - */ - smp_rmb(); - if (PageTail(tail)) - return reserved; - } - return PageReserved(tail); - } + if (pfn_valid(pfn)) + return PageReserved(pfn_to_page(pfn)); return true; } @@ -138,20 +136,13 @@ if (mutex_lock_killable(&vcpu->mutex)) return -EINTR; - if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { - /* The thread running this VCPU changed. */ - struct pid *oldpid = vcpu->pid; - struct pid *newpid = get_task_pid(current, PIDTYPE_PID); - rcu_assign_pointer(vcpu->pid, newpid); - synchronize_rcu(); - put_pid(oldpid); - } cpu = get_cpu(); preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); return 0; } +EXPORT_SYMBOL_GPL(vcpu_load); void vcpu_put(struct kvm_vcpu *vcpu) { @@ -161,12 +152,13 @@ preempt_enable(); mutex_unlock(&vcpu->mutex); } +EXPORT_SYMBOL_GPL(vcpu_put); static void ack_flush(void *_completed) { } -static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { int i, cpu, me; cpumask_var_t cpus; @@ -198,29 +190,32 @@ return called; } +#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL void kvm_flush_remote_tlbs(struct kvm *kvm) { long dirty_count = kvm->tlbs_dirty; smp_mb(); - if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) + if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } +EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); +#endif void kvm_reload_remote_mmus(struct kvm *kvm) { - make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); + kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); } void kvm_make_mclock_inprogress_request(struct kvm *kvm) { - make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); + kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); } void kvm_make_scan_ioapic_request(struct kvm *kvm) { - make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); + kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); } int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) @@ -233,9 +228,13 @@ vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; + vcpu->halt_poll_ns = 0; init_waitqueue_head(&vcpu->wq); kvm_async_pf_vcpu_init(vcpu); + vcpu->pre_pcpu = -1; + INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); + page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) { r = -ENOMEM; @@ -308,6 +307,9 @@ kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); + + kvm_arch_mmu_notifier_invalidate_page(kvm, address); + srcu_read_unlock(&kvm->srcu, idx); } @@ -381,7 +383,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long address) + unsigned long start, + unsigned long end) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; @@ -389,7 +392,7 @@ idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - young = kvm_age_hva(kvm, address); + young = kvm_age_hva(kvm, start, end); if (young) kvm_flush_remote_tlbs(kvm); @@ -399,6 +402,36 @@ return young; } +static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int young, idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + /* + * Even though we do not flush TLB, this will still adversely + * affect performance on pre-Haswell Intel EPT, where there is + * no EPT Access Bit to clear so that we have to tear down EPT + * tables instead. If we find this unacceptable, we can always + * add a parameter to kvm_age_hva so that it effectively doesn't + * do anything on clear_young. + * + * Also note that currently we never issue secondary TLB flushes + * from clear_young, leaving this job up to the regular system + * cadence. If we find this inaccurate, we might come up with a + * more sophisticated heuristic later. + */ + young = kvm_age_hva(kvm, start, end); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + return young; +} + static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address) @@ -431,6 +464,7 @@ .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .clear_young = kvm_mmu_notifier_clear_young, .test_young = kvm_mmu_notifier_test_young, .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, @@ -451,13 +485,60 @@ #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ -static void kvm_init_memslots_id(struct kvm *kvm) +static struct kvm_memslots *kvm_alloc_memslots(void) { int i; - struct kvm_memslots *slots = kvm->memslots; + struct kvm_memslots *slots; + slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); + if (!slots) + return NULL; + + /* + * Init kvm generation close to the maximum to easily test the + * code of handling generation number wrap-around. + */ + slots->generation = -150; for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) slots->id_to_index[i] = slots->memslots[i].id = i; + + return slots; +} + +static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) +{ + if (!memslot->dirty_bitmap) + return; + + kvfree(memslot->dirty_bitmap); + memslot->dirty_bitmap = NULL; +} + +/* + * Free any memory in @free but not in @dont. + */ +static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + if (!dont || free->dirty_bitmap != dont->dirty_bitmap) + kvm_destroy_dirty_bitmap(free); + + kvm_arch_free_memslot(kvm, free, dont); + + free->npages = 0; +} + +static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) +{ + struct kvm_memory_slot *memslot; + + if (!slots) + return; + + kvm_for_each_memslot(memslot, slots) + kvm_free_memslot(kvm, memslot, NULL); + + kvfree(slots); } static struct kvm *kvm_create_vm(unsigned long type) @@ -480,26 +561,29 @@ r = kvm_arch_init_vm(kvm, type); if (r) - goto out_err_nodisable; + goto out_err_no_disable; r = hardware_enable_all(); if (r) - goto out_err_nodisable; + goto out_err_no_disable; -#ifdef CONFIG_HAVE_KVM_IRQCHIP - INIT_HLIST_HEAD(&kvm->mask_notifier_list); +#ifdef CONFIG_HAVE_KVM_IRQFD INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); #endif BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); r = -ENOMEM; - kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); - if (!kvm->memslots) - goto out_err_nosrcu; - kvm_init_memslots_id(kvm); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + kvm->memslots[i] = kvm_alloc_memslots(); + if (!kvm->memslots[i]) + goto out_err_no_srcu; + } + if (init_srcu_struct(&kvm->srcu)) - goto out_err_nosrcu; + goto out_err_no_srcu; + if (init_srcu_struct(&kvm->irq_srcu)) + goto out_err_no_irq_srcu; for (i = 0; i < KVM_NR_BUSES; i++) { kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); @@ -511,20 +595,25 @@ if (r) goto out_err; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); + + preempt_notifier_inc(); return kvm; out_err: + cleanup_srcu_struct(&kvm->irq_srcu); +out_err_no_irq_srcu: cleanup_srcu_struct(&kvm->srcu); -out_err_nosrcu: +out_err_no_srcu: hardware_disable_all(); -out_err_nodisable: +out_err_no_disable: for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm->buses[i]); - kfree(kvm->memslots); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) + kvm_free_memslots(kvm, kvm->memslots[i]); kvm_arch_free_vm(kvm); mmdrop(current->mm); return ERR_PTR(r); @@ -542,48 +631,6 @@ return kzalloc(size, GFP_KERNEL); } -void kvm_kvfree(const void *addr) -{ - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); -} - -static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) -{ - if (!memslot->dirty_bitmap) - return; - - kvm_kvfree(memslot->dirty_bitmap); - memslot->dirty_bitmap = NULL; -} - -/* - * Free any memory in @free but not in @dont. - */ -static void kvm_free_physmem_slot(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) -{ - if (!dont || free->dirty_bitmap != dont->dirty_bitmap) - kvm_destroy_dirty_bitmap(free); - - kvm_arch_free_memslot(free, dont); - - free->npages = 0; -} - -void kvm_free_physmem(struct kvm *kvm) -{ - struct kvm_memslots *slots = kvm->memslots; - struct kvm_memory_slot *memslot; - - kvm_for_each_memslot(memslot, slots) - kvm_free_physmem_slot(memslot, NULL); - - kfree(kvm->memslots); -} - static void kvm_destroy_devices(struct kvm *kvm) { struct list_head *node, *tmp; @@ -603,9 +650,9 @@ struct mm_struct *mm = kvm->mm; kvm_arch_sync_events(kvm); - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_del(&kvm->vm_list); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { if (kvm->buses[i]) @@ -620,9 +667,12 @@ #endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); - kvm_free_physmem(kvm); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) + kvm_free_memslots(kvm, kvm->memslots[i]); + cleanup_srcu_struct(&kvm->irq_srcu); cleanup_srcu_struct(&kvm->srcu); kvm_arch_free_vm(kvm); + preempt_notifier_dec(); hardware_disable_all(); mmdrop(mm); } @@ -657,68 +707,75 @@ */ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { -#ifndef CONFIG_S390 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); if (!memslot->dirty_bitmap) return -ENOMEM; -#endif /* !CONFIG_S390 */ - return 0; -} - -static int cmp_memslot(const void *slot1, const void *slot2) -{ - struct kvm_memory_slot *s1, *s2; - - s1 = (struct kvm_memory_slot *)slot1; - s2 = (struct kvm_memory_slot *)slot2; - - if (s1->npages < s2->npages) - return 1; - if (s1->npages > s2->npages) - return -1; - return 0; } /* - * Sort the memslots base on its size, so the larger slots - * will get better fit. + * Insert memslot and re-sort memslots based on their GFN, + * so binary search could be used to lookup GFN. + * Sorting algorithm takes advantage of having initially + * sorted array and known changed memslot position. */ -static void sort_memslots(struct kvm_memslots *slots) +static void update_memslots(struct kvm_memslots *slots, + struct kvm_memory_slot *new) { - int i; - - sort(slots->memslots, KVM_MEM_SLOTS_NUM, - sizeof(struct kvm_memory_slot), cmp_memslot, NULL); - - for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) - slots->id_to_index[slots->memslots[i].id] = i; -} - -void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new, - u64 last_generation) -{ - if (new) { - int id = new->id; - struct kvm_memory_slot *old = id_to_memslot(slots, id); - unsigned long npages = old->npages; + int id = new->id; + int i = slots->id_to_index[id]; + struct kvm_memory_slot *mslots = slots->memslots; + + WARN_ON(mslots[i].id != id); + if (!new->npages) { + WARN_ON(!mslots[i].npages); + if (mslots[i].npages) + slots->used_slots--; + } else { + if (!mslots[i].npages) + slots->used_slots++; + } - *old = *new; - if (new->npages != npages) - sort_memslots(slots); + while (i < KVM_MEM_SLOTS_NUM - 1 && + new->base_gfn <= mslots[i + 1].base_gfn) { + if (!mslots[i + 1].npages) + break; + mslots[i] = mslots[i + 1]; + slots->id_to_index[mslots[i].id] = i; + i++; } - slots->generation = last_generation + 1; + /* + * The ">=" is needed when creating a slot with base_gfn == 0, + * so that it moves before all those with base_gfn == npages == 0. + * + * On the other hand, if new->npages is zero, the above loop has + * already left i pointing to the beginning of the empty part of + * mslots, and the ">=" would move the hole backwards in this + * case---which is wrong. So skip the loop when deleting a slot. + */ + if (new->npages) { + while (i > 0 && + new->base_gfn >= mslots[i - 1].base_gfn) { + mslots[i] = mslots[i - 1]; + slots->id_to_index[mslots[i].id] = i; + i--; + } + } else + WARN_ON_ONCE(i != slots->used_slots); + + mslots[i] = *new; + slots->id_to_index[mslots[i].id] = i; } -static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) +static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; -#ifdef KVM_CAP_READONLY_MEM +#ifdef __KVM_HAVE_READONLY_MEM valid_flags |= KVM_MEM_READONLY; #endif @@ -729,14 +786,30 @@ } static struct kvm_memslots *install_new_memslots(struct kvm *kvm, - struct kvm_memslots *slots, struct kvm_memory_slot *new) + int as_id, struct kvm_memslots *slots) { - struct kvm_memslots *old_memslots = kvm->memslots; + struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); + + /* + * Set the low bit in the generation, which disables SPTE caching + * until the end of synchronize_srcu_expedited. + */ + WARN_ON(old_memslots->generation & 1); + slots->generation = old_memslots->generation + 1; - update_memslots(slots, new, kvm->memslots->generation); - rcu_assign_pointer(kvm->memslots, slots); + rcu_assign_pointer(kvm->memslots[as_id], slots); synchronize_srcu_expedited(&kvm->srcu); - return old_memslots; + + /* + * Increment the new memslot generation a second time. This prevents + * vm exits that race with memslot updates from caching a memslot + * generation that will (potentially) be valid forever. + */ + slots->generation++; + + kvm_arch_memslots_updated(kvm, slots); + + return old_memslots; } /* @@ -745,10 +818,10 @@ * * Discontiguous memory is allowed, mostly for framebuffers. * - * Must be called holding mmap_sem for write. + * Must be called holding kvm->slots_lock for write. */ int __kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region *mem) { int r; gfn_t base_gfn; @@ -756,6 +829,7 @@ struct kvm_memory_slot *slot; struct kvm_memory_slot old, new; struct kvm_memslots *slots = NULL, *old_memslots; + int as_id, id; enum kvm_mr_change change; r = check_memory_region_flags(mem); @@ -763,42 +837,40 @@ goto out; r = -EINVAL; + as_id = mem->slot >> 16; + id = (u16)mem->slot; + /* General sanity checks */ if (mem->memory_size & (PAGE_SIZE - 1)) goto out; if (mem->guest_phys_addr & (PAGE_SIZE - 1)) goto out; /* We can read the guest memory with __xxx_user() later on. */ - if ((mem->slot < KVM_USER_MEM_SLOTS) && + if ((id < KVM_USER_MEM_SLOTS) && ((mem->userspace_addr & (PAGE_SIZE - 1)) || !access_ok(VERIFY_WRITE, (void __user *)(unsigned long)mem->userspace_addr, mem->memory_size))) goto out; - if (mem->slot >= KVM_MEM_SLOTS_NUM) + if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) goto out; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) goto out; - slot = id_to_memslot(kvm->memslots, mem->slot); + slot = id_to_memslot(__kvm_memslots(kvm, as_id), id); base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; npages = mem->memory_size >> PAGE_SHIFT; - r = -EINVAL; if (npages > KVM_MEM_MAX_NR_PAGES) goto out; - if (!npages) - mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; - new = old = *slot; - new.id = mem->slot; + new.id = id; new.base_gfn = base_gfn; new.npages = npages; new.flags = mem->flags; - r = -EINVAL; if (npages) { if (!old.npages) change = KVM_MR_CREATE; @@ -817,17 +889,21 @@ goto out; } } - } else if (old.npages) { + } else { + if (!old.npages) + goto out; + change = KVM_MR_DELETE; - } else /* Modify a non-existent slot: disallowed. */ - goto out; + new.base_gfn = 0; + new.flags = 0; + } if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { /* Check for overlaps */ r = -EEXIST; - kvm_for_each_memslot(slot, kvm->memslots) { + kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) { if ((slot->id >= KVM_USER_MEM_SLOTS) || - (slot->id == mem->slot)) + (slot->id == id)) continue; if (!((base_gfn + npages <= slot->base_gfn) || (base_gfn >= slot->base_gfn + slot->npages))) @@ -843,7 +919,7 @@ if (change == KVM_MR_CREATE) { new.userspace_addr = mem->userspace_addr; - if (kvm_arch_create_memslot(&new, npages)) + if (kvm_arch_create_memslot(kvm, &new, npages)) goto out_free; } @@ -853,16 +929,16 @@ goto out_free; } + slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); + if (!slots) + goto out_free; + memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); + if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { - r = -ENOMEM; - slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), - GFP_KERNEL); - if (!slots) - goto out_free; - slot = id_to_memslot(slots, mem->slot); + slot = id_to_memslot(slots, id); slot->flags |= KVM_MEMSLOT_INVALID; - old_memslots = install_new_memslots(kvm, slots, NULL); + old_memslots = install_new_memslots(kvm, as_id, slots); /* slot was deleted or moved, clear iommu mapping */ kvm_iommu_unmap_pages(kvm, &old); @@ -870,10 +946,16 @@ * or moved, memslot will be created. * * validation of sp->gfn happens in: - * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) - * - kvm_is_visible_gfn (mmu_check_roots) + * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) + * - kvm_is_visible_gfn (mmu_check_roots) */ kvm_arch_flush_shadow_memslot(kvm, slot); + + /* + * We can re-use the old_memslots from above, the only difference + * from the currently installed memslots is the invalid flag. This + * will get overwritten by update_memslots anyway. + */ slots = old_memslots; } @@ -881,19 +963,20 @@ if (r) goto out_slots; - r = -ENOMEM; - /* - * We can re-use the old_memslots from above, the only difference - * from the currently installed memslots is the invalid flag. This - * will get overwritten by update_memslots anyway. - */ - if (!slots) { - slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), - GFP_KERNEL); - if (!slots) - goto out_free; + /* actual memory is freed via old in kvm_free_memslot below */ + if (change == KVM_MR_DELETE) { + new.dirty_bitmap = NULL; + memset(&new.arch, 0, sizeof(new.arch)); } + update_memslots(slots, &new); + old_memslots = install_new_memslots(kvm, as_id, slots); + + kvm_arch_commit_memory_region(kvm, mem, &old, &new, change); + + kvm_free_memslot(kvm, &old, &new); + kvfree(old_memslots); + /* * IOMMU mapping: New slots need to be mapped. Old slots need to be * un-mapped and re-mapped if their base changes. Since base change @@ -905,36 +988,22 @@ */ if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { r = kvm_iommu_map_pages(kvm, &new); - if (r) - goto out_slots; - } - - /* actual memory is freed via old in kvm_free_physmem_slot below */ - if (change == KVM_MR_DELETE) { - new.dirty_bitmap = NULL; - memset(&new.arch, 0, sizeof(new.arch)); + return r; } - old_memslots = install_new_memslots(kvm, slots, &new); - - kvm_arch_commit_memory_region(kvm, mem, &old, change); - - kvm_free_physmem_slot(&old, &new); - kfree(old_memslots); - return 0; out_slots: - kfree(slots); + kvfree(slots); out_free: - kvm_free_physmem_slot(&new, &old); + kvm_free_memslot(kvm, &new, &old); out: return r; } EXPORT_SYMBOL_GPL(__kvm_set_memory_region); int kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region *mem) { int r; @@ -945,27 +1014,32 @@ } EXPORT_SYMBOL_GPL(kvm_set_memory_region); -int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) +static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem) { - if (mem->slot >= KVM_USER_MEM_SLOTS) + if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; + return kvm_set_memory_region(kvm, mem); } int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, int *is_dirty) { + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int r, i; + int r, i, as_id, id; unsigned long n; unsigned long any = 0; r = -EINVAL; - if (log->slot >= KVM_USER_MEM_SLOTS) + as_id = log->slot >> 16; + id = (u16)log->slot; + if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) goto out; - memslot = id_to_memslot(kvm->memslots, log->slot); + slots = __kvm_memslots(kvm, as_id); + memslot = id_to_memslot(slots, id); r = -ENOENT; if (!memslot->dirty_bitmap) goto out; @@ -986,6 +1060,93 @@ out: return r; } +EXPORT_SYMBOL_GPL(kvm_get_dirty_log); + +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT +/** + * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages + * are dirty write protect them for next write. + * @kvm: pointer to kvm instance + * @log: slot id and address to which we copy the log + * @is_dirty: flag set if any page is dirty + * + * We need to keep it in mind that VCPU threads can write to the bitmap + * concurrently. So, to avoid losing track of dirty pages we keep the + * following order: + * + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Copy the snapshot to the userspace. + * 4. Upon return caller flushes TLB's if needed. + * + * Between 2 and 4, the guest may write to the page using the remaining TLB + * entry. This is not a problem because the page is reported dirty using + * the snapshot taken before and step 4 ensures that writes done after + * exiting to userspace will be logged for the next call. + * + */ +int kvm_get_dirty_log_protect(struct kvm *kvm, + struct kvm_dirty_log *log, bool *is_dirty) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int r, i, as_id, id; + unsigned long n; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + + r = -EINVAL; + as_id = log->slot >> 16; + id = (u16)log->slot; + if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + goto out; + + slots = __kvm_memslots(kvm, as_id); + memslot = id_to_memslot(slots, id); + + dirty_bitmap = memslot->dirty_bitmap; + r = -ENOENT; + if (!dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + memset(dirty_bitmap_buffer, 0, n); + + spin_lock(&kvm->mmu_lock); + *is_dirty = false; + for (i = 0; i < n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; + + if (!dirty_bitmap[i]) + continue; + + *is_dirty = true; + + mask = xchg(&dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; + + if (mask) { + offset = i * BITS_PER_LONG; + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, + offset, mask); + } + } + + spin_unlock(&kvm->mmu_lock); + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) + goto out; + + r = 0; +out: + return r; +} +EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); +#endif bool kvm_largepages_enabled(void) { @@ -1004,6 +1165,11 @@ } EXPORT_SYMBOL_GPL(gfn_to_memslot); +struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn); +} + int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); @@ -1067,7 +1233,7 @@ } unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, - gfn_t gfn) + gfn_t gfn) { return gfn_to_hva_many(slot, gfn, NULL); } @@ -1079,23 +1245,39 @@ } EXPORT_SYMBOL_GPL(gfn_to_hva); +unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); + /* - * The hva returned by this function is only allowed to be read. - * It should pair with kvm_read_hva() or kvm_read_hva_atomic(). + * If writable is set to false, the hva returned by this function is only + * allowed to be read. */ -static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn) +unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, + gfn_t gfn, bool *writable) { - return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false); + unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); + + if (!kvm_is_error_hva(hva) && writable) + *writable = !memslot_is_readonly(slot); + + return hva; } -static int kvm_read_hva(void *data, void __user *hva, int len) +unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) { - return __copy_from_user(data, hva, len); + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + + return gfn_to_hva_memslot_prot(slot, gfn, writable); } -static int kvm_read_hva_atomic(void *data, void __user *hva, int len) +unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable) { - return __copy_from_user_inatomic(data, hva, len); + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + + return gfn_to_hva_memslot_prot(slot, gfn, writable); } static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, @@ -1172,8 +1354,9 @@ addr, write_fault, page); up_read(¤t->mm->mmap_sem); } else - npages = get_user_pages_fast(addr, 1, write_fault, - page); + npages = __get_user_pages_unlocked(current, current->mm, addr, 1, + write_fault, 0, page, + FOLL_TOUCH|FOLL_HWPOISON); if (npages != 1) return npages; @@ -1253,7 +1436,7 @@ else if ((vma->vm_flags & VM_PFNMAP)) { pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - BUG_ON(!kvm_is_mmio_pfn(pfn)); + BUG_ON(!kvm_is_reserved_pfn(pfn)); } else { if (async && vma_is_valid(vma, write_fault)) *async = true; @@ -1264,9 +1447,8 @@ return pfn; } -static pfn_t -__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, - bool *async, bool write_fault, bool *writable) +pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, + bool *async, bool write_fault, bool *writable) { unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); @@ -1285,65 +1467,59 @@ return hva_to_pfn(addr, atomic, async, write_fault, writable); } +EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); -static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, - bool write_fault, bool *writable) +pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, + bool *writable) { - struct kvm_memory_slot *slot; - - if (async) - *async = false; - - slot = gfn_to_memslot(kvm, gfn); - - return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault, - writable); + return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, + write_fault, writable); } +EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); -pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) +pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); + return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); +EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); -pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, - bool write_fault, bool *writable) +pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); + return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_pfn_async); +EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); -pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); + return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); } -EXPORT_SYMBOL_GPL(gfn_to_pfn); +EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); -pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, - bool *writable) +pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); + return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); } -EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); +EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); -pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); + return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); } +EXPORT_SYMBOL_GPL(gfn_to_pfn); -pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) +pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); + return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); } -EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); +EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); -int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, - int nr_pages) +int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, + struct page **pages, int nr_pages) { unsigned long addr; gfn_t entry; - addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); + addr = gfn_to_hva_many(slot, gfn, &entry); if (kvm_is_error_hva(addr)) return -1; @@ -1359,7 +1535,7 @@ if (is_error_noslot_pfn(pfn)) return KVM_ERR_PTR_BAD_PAGE; - if (kvm_is_mmio_pfn(pfn)) { + if (kvm_is_reserved_pfn(pfn)) { WARN_ON(1); return KVM_ERR_PTR_BAD_PAGE; } @@ -1375,9 +1551,18 @@ return kvm_pfn_to_page(pfn); } - EXPORT_SYMBOL_GPL(gfn_to_page); +struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + pfn_t pfn; + + pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); + + return kvm_pfn_to_page(pfn); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); + void kvm_release_page_clean(struct page *page) { WARN_ON(is_error_page(page)); @@ -1388,7 +1573,7 @@ void kvm_release_pfn_clean(pfn_t pfn) { - if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn)) + if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) put_page(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); @@ -1401,23 +1586,17 @@ } EXPORT_SYMBOL_GPL(kvm_release_page_dirty); -void kvm_release_pfn_dirty(pfn_t pfn) +static void kvm_release_pfn_dirty(pfn_t pfn) { kvm_set_pfn_dirty(pfn); kvm_release_pfn_clean(pfn); } -EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); - -void kvm_set_page_dirty(struct page *page) -{ - kvm_set_pfn_dirty(page_to_pfn(page)); -} -EXPORT_SYMBOL_GPL(kvm_set_page_dirty); void kvm_set_pfn_dirty(pfn_t pfn) { - if (!kvm_is_mmio_pfn(pfn)) { + if (!kvm_is_reserved_pfn(pfn)) { struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) SetPageDirty(page); } @@ -1426,14 +1605,14 @@ void kvm_set_pfn_accessed(pfn_t pfn) { - if (!kvm_is_mmio_pfn(pfn)) + if (!kvm_is_reserved_pfn(pfn)) mark_page_accessed(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); void kvm_get_pfn(pfn_t pfn) { - if (!kvm_is_mmio_pfn(pfn)) + if (!kvm_is_reserved_pfn(pfn)) get_page(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_get_pfn); @@ -1446,22 +1625,39 @@ return len; } -int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, - int len) +static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, + void *data, int offset, int len) { int r; unsigned long addr; - addr = gfn_to_hva_read(kvm, gfn); + addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; - r = kvm_read_hva(data, (void __user *)addr + offset, len); + r = __copy_from_user(data, (void __user *)addr + offset, len); if (r) return -EFAULT; return 0; } + +int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, + int len) +{ + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + + return __kvm_read_guest_page(slot, gfn, data, offset, len); +} EXPORT_SYMBOL_GPL(kvm_read_guest_page); +int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, + int offset, int len) +{ + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + + return __kvm_read_guest_page(slot, gfn, data, offset, len); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); + int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { gfn_t gfn = gpa >> PAGE_SHIFT; @@ -1482,43 +1678,99 @@ } EXPORT_SYMBOL_GPL(kvm_read_guest); -int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, - unsigned long len) +int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) { - int r; - unsigned long addr; gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); - addr = gfn_to_hva_read(kvm, gfn); +static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, + void *data, int offset, unsigned long len) +{ + int r; + unsigned long addr; + + addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; pagefault_disable(); - r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len); + r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); pagefault_enable(); if (r) return -EFAULT; return 0; } -EXPORT_SYMBOL(kvm_read_guest_atomic); -int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, - int offset, int len) +int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + int offset = offset_in_page(gpa); + + return __kvm_read_guest_atomic(slot, gfn, data, offset, len); +} +EXPORT_SYMBOL_GPL(kvm_read_guest_atomic); + +int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, + void *data, unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + int offset = offset_in_page(gpa); + + return __kvm_read_guest_atomic(slot, gfn, data, offset, len); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); + +static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, + const void *data, int offset, int len) { int r; unsigned long addr; - addr = gfn_to_hva(kvm, gfn); + addr = gfn_to_hva_memslot(memslot, gfn); if (kvm_is_error_hva(addr)) return -EFAULT; r = __copy_to_user((void __user *)addr + offset, data, len); if (r) return -EFAULT; - mark_page_dirty(kvm, gfn); + mark_page_dirty_in_slot(memslot, gfn); return 0; } + +int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, + const void *data, int offset, int len) +{ + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + + return __kvm_write_guest_page(slot, gfn, data, offset, len); +} EXPORT_SYMBOL_GPL(kvm_write_guest_page); +int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, + const void *data, int offset, int len) +{ + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + + return __kvm_write_guest_page(slot, gfn, data, offset, len); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); + int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) { @@ -1538,6 +1790,28 @@ } return 0; } +EXPORT_SYMBOL_GPL(kvm_write_guest); + +int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, + unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) @@ -1596,7 +1870,7 @@ r = __copy_to_user((void __user *)ghc->hva, data, len); if (r) return -EFAULT; - mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + mark_page_dirty_in_slot(ghc->memslot, ghc->gpa >> PAGE_SHIFT); return 0; } @@ -1629,8 +1903,9 @@ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { - return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, - offset, len); + const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); + + return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); } EXPORT_SYMBOL_GPL(kvm_clear_guest_page); @@ -1641,7 +1916,7 @@ int offset = offset_in_page(gpa); int ret; - while ((seg = next_segment(len, offset)) != 0) { + while ((seg = next_segment(len, offset)) != 0) { ret = kvm_clear_guest_page(kvm, gfn, offset, seg); if (ret < 0) return ret; @@ -1653,8 +1928,8 @@ } EXPORT_SYMBOL_GPL(kvm_clear_guest); -void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, - gfn_t gfn) +static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, + gfn_t gfn) { if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; @@ -1668,7 +1943,63 @@ struct kvm_memory_slot *memslot; memslot = gfn_to_memslot(kvm, gfn); - mark_page_dirty_in_slot(kvm, memslot, gfn); + mark_page_dirty_in_slot(memslot, gfn); +} +EXPORT_SYMBOL_GPL(mark_page_dirty); + +void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_memory_slot *memslot; + + memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + mark_page_dirty_in_slot(memslot, gfn); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); + +static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) +{ + int old, val; + + old = val = vcpu->halt_poll_ns; + /* 10us base */ + if (val == 0 && halt_poll_ns_grow) + val = 10000; + else + val *= halt_poll_ns_grow; + + if (val > halt_poll_ns) + val = halt_poll_ns; + + vcpu->halt_poll_ns = val; + trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); +} + +static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) +{ + int old, val; + + old = val = vcpu->halt_poll_ns; + if (halt_poll_ns_shrink == 0) + val = 0; + else + val /= halt_poll_ns_shrink; + + vcpu->halt_poll_ns = val; + trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); +} + +static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) +{ + if (kvm_arch_vcpu_runnable(vcpu)) { + kvm_make_request(KVM_REQ_UNHALT, vcpu); + return -EINTR; + } + if (kvm_cpu_has_pending_timer(vcpu)) + return -EINTR; + if (signal_pending(current)) + return -EINTR; + + return 0; } /* @@ -1676,25 +2007,64 @@ */ void kvm_vcpu_block(struct kvm_vcpu *vcpu) { + ktime_t start, cur; DEFINE_WAIT(wait); + bool waited = false; + u64 block_ns; + + start = cur = ktime_get(); + if (vcpu->halt_poll_ns) { + ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); + + ++vcpu->stat.halt_attempted_poll; + do { + /* + * This sets KVM_REQ_UNHALT if an interrupt + * arrives. + */ + if (kvm_vcpu_check_block(vcpu) < 0) { + ++vcpu->stat.halt_successful_poll; + goto out; + } + cur = ktime_get(); + } while (single_task_running() && ktime_before(cur, stop)); + } + + kvm_arch_vcpu_blocking(vcpu); for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); - if (kvm_arch_vcpu_runnable(vcpu)) { - kvm_make_request(KVM_REQ_UNHALT, vcpu); - break; - } - if (kvm_cpu_has_pending_timer(vcpu)) - break; - if (signal_pending(current)) + if (kvm_vcpu_check_block(vcpu) < 0) break; + waited = true; schedule(); } finish_wait(&vcpu->wq, &wait); + cur = ktime_get(); + + kvm_arch_vcpu_unblocking(vcpu); +out: + block_ns = ktime_to_ns(cur) - ktime_to_ns(start); + + if (halt_poll_ns) { + if (block_ns <= vcpu->halt_poll_ns) + ; + /* we had a long block, shrink polling */ + else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) + shrink_halt_poll_ns(vcpu); + /* we had a short halt and our poll time is too small */ + else if (vcpu->halt_poll_ns < halt_poll_ns && + block_ns < halt_poll_ns) + grow_halt_poll_ns(vcpu); + } else + vcpu->halt_poll_ns = 0; + + trace_kvm_vcpu_wakeup(block_ns, waited); } +EXPORT_SYMBOL_GPL(kvm_vcpu_block); #ifndef CONFIG_S390 /* @@ -1721,31 +2091,19 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_kick); #endif /* !CONFIG_S390 */ -void kvm_resched(struct kvm_vcpu *vcpu) -{ - if (!need_resched()) - return; - cond_resched(); -} -EXPORT_SYMBOL_GPL(kvm_resched); - -bool kvm_vcpu_yield_to(struct kvm_vcpu *target) +int kvm_vcpu_yield_to(struct kvm_vcpu *target) { struct pid *pid; struct task_struct *task = NULL; - bool ret = false; + int ret = 0; rcu_read_lock(); pid = rcu_dereference(target->pid); if (pid) - task = get_pid_task(target->pid, PIDTYPE_PID); + task = get_pid_task(pid, PIDTYPE_PID); rcu_read_unlock(); if (!task) return ret; - if (task->flags & PF_VCPU) { - put_task_struct(task); - return ret; - } ret = yield_to(task, 1); put_task_struct(task); @@ -1753,7 +2111,6 @@ } EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); -#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT /* * Helper that checks whether a VCPU is eligible for directed yield. * Most eligible candidate to yield is decided by following heuristics: @@ -1776,20 +2133,22 @@ * locking does not harm. It may result in trying to yield to same VCPU, fail * and continue with next VCPU and so on. */ -bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) +static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) { +#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT bool eligible; eligible = !vcpu->spin_loop.in_spin_loop || - (vcpu->spin_loop.in_spin_loop && - vcpu->spin_loop.dy_eligible); + vcpu->spin_loop.dy_eligible; if (vcpu->spin_loop.in_spin_loop) kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); return eligible; -} +#else + return true; #endif +} void kvm_vcpu_on_spin(struct kvm_vcpu *me) { @@ -1820,7 +2179,7 @@ continue; if (vcpu == me) continue; - if (waitqueue_active(&vcpu->wq)) + if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) continue; @@ -1886,7 +2245,7 @@ static struct file_operations kvm_vcpu_fops = { .release = kvm_vcpu_release, .unlocked_ioctl = kvm_vcpu_ioctl, -#ifdef CONFIG_COMPAT +#ifdef CONFIG_KVM_COMPAT .compat_ioctl = kvm_vcpu_compat_ioctl, #endif .mmap = kvm_vcpu_mmap, @@ -1898,7 +2257,7 @@ */ static int create_vcpu_fd(struct kvm_vcpu *vcpu) { - return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); + return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); } /* @@ -1949,6 +2308,11 @@ } kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; + + /* + * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus + * before kvm->online_vcpu's incremented value. + */ smp_wmb(); atomic_inc(&kvm->online_vcpus); @@ -1994,7 +2358,7 @@ * Special cases: vcpu ioctls that are asynchronous to vcpu execution, * so vcpu_load() would break it. */ - if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) + if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT) return kvm_arch_vcpu_ioctl(filp, ioctl, arg); #endif @@ -2007,6 +2371,16 @@ r = -EINVAL; if (arg) goto out; + if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { + /* The thread running this VCPU changed. */ + struct pid *oldpid = vcpu->pid; + struct pid *newpid = get_task_pid(current, PIDTYPE_PID); + + rcu_assign_pointer(vcpu->pid, newpid); + if (oldpid) + synchronize_rcu(); + put_pid(oldpid); + } r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; @@ -2072,7 +2446,7 @@ if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &mp_state, sizeof mp_state)) + if (copy_to_user(argp, &mp_state, sizeof(mp_state))) goto out; r = 0; break; @@ -2081,7 +2455,7 @@ struct kvm_mp_state mp_state; r = -EFAULT; - if (copy_from_user(&mp_state, argp, sizeof mp_state)) + if (copy_from_user(&mp_state, argp, sizeof(mp_state))) goto out; r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); break; @@ -2090,13 +2464,13 @@ struct kvm_translation tr; r = -EFAULT; - if (copy_from_user(&tr, argp, sizeof tr)) + if (copy_from_user(&tr, argp, sizeof(tr))) goto out; r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &tr, sizeof tr)) + if (copy_to_user(argp, &tr, sizeof(tr))) goto out; r = 0; break; @@ -2105,7 +2479,7 @@ struct kvm_guest_debug dbg; r = -EFAULT; - if (copy_from_user(&dbg, argp, sizeof dbg)) + if (copy_from_user(&dbg, argp, sizeof(dbg))) goto out; r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); break; @@ -2119,14 +2493,14 @@ if (argp) { r = -EFAULT; if (copy_from_user(&kvm_sigmask, argp, - sizeof kvm_sigmask)) + sizeof(kvm_sigmask))) goto out; r = -EINVAL; - if (kvm_sigmask.len != sizeof sigset) + if (kvm_sigmask.len != sizeof(sigset)) goto out; r = -EFAULT; if (copy_from_user(&sigset, sigmask_arg->sigset, - sizeof sigset)) + sizeof(sigset))) goto out; p = &sigset; } @@ -2167,7 +2541,7 @@ return r; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_KVM_COMPAT static long kvm_vcpu_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2188,14 +2562,14 @@ if (argp) { r = -EFAULT; if (copy_from_user(&kvm_sigmask, argp, - sizeof kvm_sigmask)) + sizeof(kvm_sigmask))) goto out; r = -EINVAL; - if (kvm_sigmask.len != sizeof csigset) + if (kvm_sigmask.len != sizeof(csigset)) goto out; r = -EFAULT; if (copy_from_user(&csigset, sigmask_arg->sigset, - sizeof csigset)) + sizeof(csigset))) goto out; sigset_from_compat(&sigset, &csigset); r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); @@ -2259,7 +2633,7 @@ static const struct file_operations kvm_device_fops = { .unlocked_ioctl = kvm_device_ioctl, -#ifdef CONFIG_COMPAT +#ifdef CONFIG_KVM_COMPAT .compat_ioctl = kvm_device_ioctl, #endif .release = kvm_device_release, @@ -2273,6 +2647,35 @@ return filp->private_data; } +static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { +#ifdef CONFIG_KVM_MPIC + [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, + [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, +#endif + +#ifdef CONFIG_KVM_XICS + [KVM_DEV_TYPE_XICS] = &kvm_xics_ops, +#endif +}; + +int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) +{ + if (type >= ARRAY_SIZE(kvm_device_ops_table)) + return -ENOSPC; + + if (kvm_device_ops_table[type] != NULL) + return -EEXIST; + + kvm_device_ops_table[type] = ops; + return 0; +} + +void kvm_unregister_device_ops(u32 type) +{ + if (kvm_device_ops_table[type] != NULL) + kvm_device_ops_table[type] = NULL; +} + static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_create_device *cd) { @@ -2281,21 +2684,12 @@ bool test = cd->flags & KVM_CREATE_DEVICE_TEST; int ret; - switch (cd->type) { -#ifdef CONFIG_KVM_MPIC - case KVM_DEV_TYPE_FSL_MPIC_20: - case KVM_DEV_TYPE_FSL_MPIC_42: - ops = &kvm_mpic_ops; - break; -#endif -#ifdef CONFIG_KVM_XICS - case KVM_DEV_TYPE_XICS: - ops = &kvm_xics_ops; - break; -#endif - default: + if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) + return -ENODEV; + + ops = kvm_device_ops_table[cd->type]; + if (ops == NULL) return -ENODEV; - } if (test) return 0; @@ -2313,7 +2707,7 @@ return ret; } - ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR); + ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); if (ret < 0) { ops->destroy(dev); return ret; @@ -2325,6 +2719,37 @@ return 0; } +static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) +{ + switch (arg) { + case KVM_CAP_USER_MEMORY: + case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: + case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: + case KVM_CAP_INTERNAL_ERROR_DATA: +#ifdef CONFIG_HAVE_KVM_MSI + case KVM_CAP_SIGNAL_MSI: +#endif +#ifdef CONFIG_HAVE_KVM_IRQFD + case KVM_CAP_IRQFD: + case KVM_CAP_IRQFD_RESAMPLE: +#endif + case KVM_CAP_IOEVENTFD_ANY_LENGTH: + case KVM_CAP_CHECK_EXTENSION_VM: + return 1; +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING + case KVM_CAP_IRQ_ROUTING: + return KVM_MAX_IRQ_ROUTES; +#endif +#if KVM_ADDRESS_SPACE_NUM > 1 + case KVM_CAP_MULTI_ADDRESS_SPACE: + return KVM_ADDRESS_SPACE_NUM; +#endif + default: + break; + } + return kvm_vm_ioctl_check_extension(kvm, arg); +} + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2343,7 +2768,7 @@ r = -EFAULT; if (copy_from_user(&kvm_userspace_mem, argp, - sizeof kvm_userspace_mem)) + sizeof(kvm_userspace_mem))) goto out; r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); @@ -2353,7 +2778,7 @@ struct kvm_dirty_log log; r = -EFAULT; - if (copy_from_user(&log, argp, sizeof log)) + if (copy_from_user(&log, argp, sizeof(log))) goto out; r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; @@ -2361,16 +2786,18 @@ #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET case KVM_REGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; + r = -EFAULT; - if (copy_from_user(&zone, argp, sizeof zone)) + if (copy_from_user(&zone, argp, sizeof(zone))) goto out; r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); break; } case KVM_UNREGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; + r = -EFAULT; - if (copy_from_user(&zone, argp, sizeof zone)) + if (copy_from_user(&zone, argp, sizeof(zone))) goto out; r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); break; @@ -2380,7 +2807,7 @@ struct kvm_irqfd data; r = -EFAULT; - if (copy_from_user(&data, argp, sizeof data)) + if (copy_from_user(&data, argp, sizeof(data))) goto out; r = kvm_irqfd(kvm, &data); break; @@ -2389,28 +2816,17 @@ struct kvm_ioeventfd data; r = -EFAULT; - if (copy_from_user(&data, argp, sizeof data)) + if (copy_from_user(&data, argp, sizeof(data))) goto out; r = kvm_ioeventfd(kvm, &data); break; } -#ifdef CONFIG_KVM_APIC_ARCHITECTURE - case KVM_SET_BOOT_CPU_ID: - r = 0; - mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus) != 0) - r = -EBUSY; - else - kvm->bsp_vcpu_id = arg; - mutex_unlock(&kvm->lock); - break; -#endif #ifdef CONFIG_HAVE_KVM_MSI case KVM_SIGNAL_MSI: { struct kvm_msi msi; r = -EFAULT; - if (copy_from_user(&msi, argp, sizeof msi)) + if (copy_from_user(&msi, argp, sizeof(msi))) goto out; r = kvm_send_userspace_msi(kvm, &msi); break; @@ -2422,7 +2838,7 @@ struct kvm_irq_level irq_event; r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) + if (copy_from_user(&irq_event, argp, sizeof(irq_event))) goto out; r = kvm_vm_ioctl_irq_line(kvm, &irq_event, @@ -2432,7 +2848,7 @@ r = -EFAULT; if (ioctl == KVM_IRQ_LINE_STATUS) { - if (copy_to_user(argp, &irq_event, sizeof irq_event)) + if (copy_to_user(argp, &irq_event, sizeof(irq_event))) goto out; } @@ -2465,7 +2881,7 @@ goto out_free_irq_routing; r = kvm_set_irq_routing(kvm, entries, routing.nr, routing.flags); - out_free_irq_routing: +out_free_irq_routing: vfree(entries); break; } @@ -2488,16 +2904,17 @@ r = 0; break; } + case KVM_CHECK_EXTENSION: + r = kvm_vm_ioctl_check_extension_generic(kvm, arg); + break; default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); - if (r == -ENOTTY) - r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); } out: return r; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_KVM_COMPAT struct compat_kvm_dirty_log { __u32 slot; __u32 padding1; @@ -2541,44 +2958,12 @@ } #endif -static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *page[1]; - unsigned long addr; - int npages; - gfn_t gfn = vmf->pgoff; - struct kvm *kvm = vma->vm_file->private_data; - - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return VM_FAULT_SIGBUS; - - npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, - NULL); - if (unlikely(npages != 1)) - return VM_FAULT_SIGBUS; - - vmf->page = page[0]; - return 0; -} - -static const struct vm_operations_struct kvm_vm_vm_ops = { - .fault = kvm_vm_fault, -}; - -static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) -{ - vma->vm_ops = &kvm_vm_vm_ops; - return 0; -} - static struct file_operations kvm_vm_fops = { .release = kvm_vm_release, .unlocked_ioctl = kvm_vm_ioctl, -#ifdef CONFIG_COMPAT +#ifdef CONFIG_KVM_COMPAT .compat_ioctl = kvm_vm_compat_ioctl, #endif - .mmap = kvm_vm_mmap, .llseek = noop_llseek, }; @@ -2597,40 +2982,13 @@ return r; } #endif - r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); + r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC); if (r < 0) kvm_put_kvm(kvm); return r; } -static long kvm_dev_ioctl_check_extension_generic(long arg) -{ - switch (arg) { - case KVM_CAP_USER_MEMORY: - case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: - case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: -#ifdef CONFIG_KVM_APIC_ARCHITECTURE - case KVM_CAP_SET_BOOT_CPU_ID: -#endif - case KVM_CAP_INTERNAL_ERROR_DATA: -#ifdef CONFIG_HAVE_KVM_MSI - case KVM_CAP_SIGNAL_MSI: -#endif -#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING - case KVM_CAP_IRQFD_RESAMPLE: -#endif - return 1; -#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING - case KVM_CAP_IRQ_ROUTING: - return KVM_MAX_IRQ_ROUTES; -#endif - default: - break; - } - return kvm_dev_ioctl_check_extension(arg); -} - static long kvm_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2638,7 +2996,6 @@ switch (ioctl) { case KVM_GET_API_VERSION: - r = -EINVAL; if (arg) goto out; r = KVM_API_VERSION; @@ -2647,10 +3004,9 @@ r = kvm_dev_ioctl_create_vm(arg); break; case KVM_CHECK_EXTENSION: - r = kvm_dev_ioctl_check_extension_generic(arg); + r = kvm_vm_ioctl_check_extension_generic(NULL, arg); break; case KVM_GET_VCPU_MMAP_SIZE: - r = -EINVAL; if (arg) goto out; r = PAGE_SIZE; /* struct kvm_run */ @@ -2695,21 +3051,21 @@ cpumask_set_cpu(cpu, cpus_hardware_enabled); - r = kvm_arch_hardware_enable(NULL); + r = kvm_arch_hardware_enable(); if (r) { cpumask_clear_cpu(cpu, cpus_hardware_enabled); atomic_inc(&hardware_enable_failed); - printk(KERN_INFO "kvm: enabling virtualization on " - "CPU%d failed\n", cpu); + pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); } } -static void hardware_enable(void *junk) +static void hardware_enable(void) { - raw_spin_lock(&kvm_lock); - hardware_enable_nolock(junk); - raw_spin_unlock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); + if (kvm_usage_count) + hardware_enable_nolock(NULL); + raw_spin_unlock(&kvm_count_lock); } static void hardware_disable_nolock(void *junk) @@ -2719,14 +3075,15 @@ if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) return; cpumask_clear_cpu(cpu, cpus_hardware_enabled); - kvm_arch_hardware_disable(NULL); + kvm_arch_hardware_disable(); } -static void hardware_disable(void *junk) +static void hardware_disable(void) { - raw_spin_lock(&kvm_lock); - hardware_disable_nolock(junk); - raw_spin_unlock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); + if (kvm_usage_count) + hardware_disable_nolock(NULL); + raw_spin_unlock(&kvm_count_lock); } static void hardware_disable_all_nolock(void) @@ -2740,16 +3097,16 @@ static void hardware_disable_all(void) { - raw_spin_lock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); hardware_disable_all_nolock(); - raw_spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_count_lock); } static int hardware_enable_all(void) { int r = 0; - raw_spin_lock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); kvm_usage_count++; if (kvm_usage_count == 1) { @@ -2762,7 +3119,7 @@ } } - raw_spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_count_lock); return r; } @@ -2770,22 +3127,13 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, void *v) { - int cpu = (long)v; - - if (!kvm_usage_count) - return NOTIFY_OK; - val &= ~CPU_TASKS_FROZEN; switch (val) { case CPU_DYING: - printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", - cpu); - hardware_disable(NULL); + hardware_disable(); break; case CPU_STARTING: - printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", - cpu); - hardware_enable(NULL); + hardware_enable(); break; } return NOTIFY_OK; @@ -2800,7 +3148,7 @@ * * And Intel TXT required VMX off for all cpu when system shutdown. */ - printk(KERN_INFO "kvm: exiting hardware virtualization\n"); + pr_info("kvm: exiting hardware virtualization\n"); kvm_rebooting = true; on_each_cpu(hardware_disable_nolock, NULL, 1); return NOTIFY_OK; @@ -2823,18 +3171,36 @@ kfree(bus); } -static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) +static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, + const struct kvm_io_range *r2) { - const struct kvm_io_range *r1 = p1; - const struct kvm_io_range *r2 = p2; + gpa_t addr1 = r1->addr; + gpa_t addr2 = r2->addr; - if (r1->addr < r2->addr) + if (addr1 < addr2) return -1; - if (r1->addr + r1->len > r2->addr + r2->len) + + /* If r2->len == 0, match the exact address. If r2->len != 0, + * accept any overlapping write. Any order is acceptable for + * overlapping ranges, because kvm_io_bus_get_first_dev ensures + * we process all of them. + */ + if (r2->len) { + addr1 += r1->len; + addr2 += r2->len; + } + + if (addr1 > addr2) return 1; + return 0; } +static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) +{ + return kvm_io_bus_cmp(p1, p2); +} + static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, gpa_t addr, int len) { @@ -2868,72 +3234,124 @@ off = range - bus->range; - while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0) + while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) off--; return off; } +static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, + struct kvm_io_range *range, const void *val) +{ + int idx; + + idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); + if (idx < 0) + return -EOPNOTSUPP; + + while (idx < bus->dev_count && + kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { + if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, + range->len, val)) + return idx; + idx++; + } + + return -EOPNOTSUPP; +} + /* kvm_io_bus_write - called under kvm->slots_lock */ -int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, +int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { - int idx; struct kvm_io_bus *bus; struct kvm_io_range range; + int r; range = (struct kvm_io_range) { .addr = addr, .len = len, }; - bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); if (!bus) return -ENOMEM; - idx = kvm_io_bus_get_first_dev(bus, addr, len); + r = __kvm_io_bus_write(vcpu, bus, &range, val); + return r < 0 ? r : 0; +} + +/* kvm_io_bus_write_cookie - called under kvm->slots_lock */ +int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, + gpa_t addr, int len, const void *val, long cookie) +{ + struct kvm_io_bus *bus; + struct kvm_io_range range; + + range = (struct kvm_io_range) { + .addr = addr, + .len = len, + }; + + bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + if (!bus) + return -ENOMEM; + + /* First try the device referenced by cookie. */ + if ((cookie >= 0) && (cookie < bus->dev_count) && + (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) + if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, + val)) + return cookie; + + /* + * cookie contained garbage; fall back to search and return the + * correct cookie value. + */ + return __kvm_io_bus_write(vcpu, bus, &range, val); +} + +static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, + struct kvm_io_range *range, void *val) +{ + int idx; + + idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); if (idx < 0) return -EOPNOTSUPP; while (idx < bus->dev_count && - kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) { - if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val)) - return 0; + kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { + if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, + range->len, val)) + return idx; idx++; } return -EOPNOTSUPP; } +EXPORT_SYMBOL_GPL(kvm_io_bus_write); /* kvm_io_bus_read - called under kvm->slots_lock */ -int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, +int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val) { - int idx; struct kvm_io_bus *bus; struct kvm_io_range range; + int r; range = (struct kvm_io_range) { .addr = addr, .len = len, }; - bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); if (!bus) return -ENOMEM; - idx = kvm_io_bus_get_first_dev(bus, addr, len); - if (idx < 0) - return -EOPNOTSUPP; - - while (idx < bus->dev_count && - kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) { - if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val)) - return 0; - idx++; - } - - return -EOPNOTSUPP; + r = __kvm_io_bus_read(vcpu, bus, &range, val); + return r < 0 ? r : 0; } + /* Caller must hold slots_lock. */ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev) @@ -2948,7 +3366,7 @@ if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; - new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * + new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) * sizeof(struct kvm_io_range)), GFP_KERNEL); if (!new_bus) return -ENOMEM; @@ -2981,7 +3399,7 @@ if (i == bus->dev_count) return; - new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) * + new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * sizeof(struct kvm_io_range)), GFP_KERNEL); if (!new_bus) { pr_err("kvm: failed to shrink bus, removing it completely\n"); @@ -3010,10 +3428,10 @@ struct kvm *kvm; *val = 0; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) *val += *(u32 *)((void *)kvm + offset); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return 0; } @@ -3027,12 +3445,12 @@ int i; *val = 0; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) *val += *(u32 *)((void *)vcpu + offset); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return 0; } @@ -3045,7 +3463,7 @@ static int kvm_init_debug(void) { - int r = -EFAULT; + int r = -EEXIST; struct kvm_stats_debugfs_item *p; kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); @@ -3087,7 +3505,7 @@ static void kvm_resume(void) { if (kvm_usage_count) { - WARN_ON(raw_spin_is_locked(&kvm_lock)); + WARN_ON(raw_spin_is_locked(&kvm_count_lock)); hardware_enable_nolock(NULL); } } @@ -3106,9 +3524,12 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu) { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + if (vcpu->preempted) vcpu->preempted = false; + kvm_arch_sched_in(vcpu, cpu); + kvm_arch_vcpu_load(vcpu, cpu); } @@ -3185,7 +3606,7 @@ r = misc_register(&kvm_dev); if (r) { - printk(KERN_ERR "kvm: misc device register failed\n"); + pr_err("kvm: misc device register failed\n"); goto out_unreg; } @@ -3196,14 +3617,18 @@ r = kvm_init_debug(); if (r) { - printk(KERN_ERR "kvm: create debugfs files failed\n"); + pr_err("kvm: create debugfs files failed\n"); goto out_undebugfs; } + r = kvm_vfio_ops_init(); + WARN_ON(r); + return 0; out_undebugfs: unregister_syscore_ops(&kvm_syscore_ops); + misc_deregister(&kvm_dev); out_unreg: kvm_async_pf_deinit(); out_free: @@ -3239,5 +3664,6 @@ kvm_arch_exit(); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); + kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit);