--- zzzz-none-000/linux-3.10.107/arch/x86/kvm/paging_tmpl.h 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/arch/x86/kvm/paging_tmpl.h 2021-02-04 17:41:59.000000000 +0000 @@ -23,6 +23,13 @@ * so the code in this file is compiled twice, once per pte size. */ +/* + * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro + * uses for EPT without A/D paging type. + */ +extern u64 __pure __using_nonexistent_pte_bit(void) + __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); + #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -32,6 +39,10 @@ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #define CMPXCHG cmpxchg @@ -49,7 +60,26 @@ #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define CMPXCHG cmpxchg +#elif PTTYPE == PTTYPE_EPT + #define pt_element_t u64 + #define guest_walker guest_walkerEPT + #define FNAME(name) ept_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK 0 + #define PT_GUEST_DIRTY_MASK 0 + #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() + #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() + #define CMPXCHG cmpxchg64 + #define PT_MAX_FULL_LEVELS 4 #else #error Invalid PTTYPE value #endif @@ -69,6 +99,7 @@ pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; + bool pte_writable[PT_MAX_FULL_LEVELS]; unsigned pt_access; unsigned pte_access; gfn_t gfn; @@ -80,6 +111,32 @@ return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } +static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) +{ + unsigned mask; + + /* dirty bit is not supported, so no need to track it */ + if (!PT_GUEST_DIRTY_MASK) + return; + + BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); + + mask = (unsigned)~ACC_WRITE_MASK; + /* Allow write access to dirty gptes */ + mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & + PT_WRITABLE_MASK; + *access &= mask; +} + +static inline int FNAME(is_present_gpte)(unsigned long pte) +{ +#if PTTYPE != PTTYPE_EPT + return is_present_gpte(pte); +#else + return pte & 7; +#endif +} + static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) @@ -103,6 +160,42 @@ return (ret != orig_pte); } +static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) +{ + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!FNAME(is_present_gpte)(gpte)) + goto no_present; + + /* if accessed bit is not supported prefetch non accessed gpte */ + if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu->kvm, spte); + return true; +} + +static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned access; +#if PTTYPE == PTTYPE_EPT + access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | + ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | + ACC_USER_MASK; +#else + access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; + access &= ~(gpte >> PT64_NX_SHIFT); +#endif + + return access; +} + static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, @@ -114,28 +207,49 @@ gfn_t table_gfn; int ret; + /* dirty/accessed bits are not supported, so no need to update them */ + if (!PT_GUEST_DIRTY_MASK) + return 0; + for (level = walker->max_level; level >= walker->level; --level) { pte = orig_pte = walker->ptes[level - 1]; table_gfn = walker->table_gfn[level - 1]; ptep_user = walker->ptep_user[level - 1]; index = offset_in_page(ptep_user) / sizeof(pt_element_t); - if (!(pte & PT_ACCESSED_MASK)) { + if (!(pte & PT_GUEST_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - pte |= PT_ACCESSED_MASK; + pte |= PT_GUEST_ACCESSED_MASK; } - if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { + if (level == walker->level && write_fault && + !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - pte |= PT_DIRTY_MASK; + pte |= PT_GUEST_DIRTY_MASK; } if (pte == orig_pte) continue; + /* + * If the slot is read-only, simply do not process the accessed + * and dirty bits. This is the correct thing to do if the slot + * is ROM, and page tables in read-as-ROM/write-as-MMIO slots + * are only supported if the accessed and dirty bits are already + * set in the ROM (so that MMIO writes are never needed). + * + * Note that NPT does not allow this at all and faults, since + * it always wants nested page table entries for the guest + * page tables to be writable. And EPT works but will simply + * overwrite the read-only memory to set the accessed and dirty + * bits. + */ + if (unlikely(!walker->pte_writable[level - 1])) + continue; + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte); if (ret) return ret; - mark_page_dirty(vcpu->kvm, table_gfn); - walker->ptes[level] = pte; + kvm_vcpu_mark_page_dirty(vcpu, table_gfn); + walker->ptes[level - 1] = pte; } return 0; } @@ -170,16 +284,15 @@ if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) + if (!FNAME(is_present_gpte)(pte)) goto error; --walker->level; } #endif walker->max_level = walker->level; - ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); + ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); - accessed_dirty = PT_ACCESSED_MASK; + accessed_dirty = PT_GUEST_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; ++walker->level; @@ -199,12 +312,26 @@ walker->pte_gpa[walker->level - 1] = pte_gpa; real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), - PFERR_USER_MASK|PFERR_WRITE_MASK); + PFERR_USER_MASK|PFERR_WRITE_MASK, + &walker->fault); + + /* + * FIXME: This can happen if emulation (for of an INS/OUTS + * instruction) triggers a nested page fault. The exit + * qualification / exit info field will incorrectly have + * "guest page access" as the nested page fault's cause, + * instead of "guest page structure access". To fix this, + * the x86_exception struct should be augmented with enough + * information to fix the exit_qualification or exit_info_1 + * fields. + */ if (unlikely(real_gfn == UNMAPPED_GVA)) - goto error; + return 0; + real_gfn = gpa_to_gfn(real_gfn); - host_addr = gfn_to_hva(vcpu->kvm, real_gfn); + host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn, + &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; @@ -215,22 +342,21 @@ trace_kvm_mmu_paging_element(pte, walker->level); - if (unlikely(!is_present_gpte(pte))) + if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, - walker->level))) { + if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) { errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } accessed_dirty &= pte; - pte_access = pt_access & gpte_access(vcpu, pte); + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - if (unlikely(permission_fault(mmu, pte_access, access))) { + if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { errcode |= PFERR_PRESENT_MASK; goto error; } @@ -241,20 +367,22 @@ if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access); + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault); if (real_gpa == UNMAPPED_GVA) return 0; walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - protect_clean_gpte(&pte_access, pte); + FNAME(protect_clean_gpte)(&pte_access, pte); else /* - * On a write fault, fold the dirty bit into accessed_dirty by - * shifting it one place right. + * On a write fault, fold the dirty bit into accessed_dirty. + * For modes without A/D bits support accessed_dirty will be + * always clear. */ - accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); + accessed_dirty &= pte >> + (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); @@ -279,6 +407,25 @@ walker->fault.vector = PF_VECTOR; walker->fault.error_code_valid = true; walker->fault.error_code = errcode; + +#if PTTYPE == PTTYPE_EPT + /* + * Use PFERR_RSVD_MASK in error_code to to tell if EPT + * misconfiguration requires to be injected. The detection is + * done by is_rsvd_bits_set() above. + * + * We set up the value of exit_qualification to inject: + * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation + * [5:3] - Calculated by the page walk of the guest EPT page tables + * [7:8] - Derived from [7:8] of real exit_qualification + * + * The other bits are set to 0. + */ + if (!(errcode & PFERR_RSVD_MASK)) { + vcpu->arch.exit_qualification &= 0x187; + vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3; + } +#endif walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; @@ -293,6 +440,7 @@ access); } +#if PTTYPE != PTTYPE_EPT static int FNAME(walk_addr_nested)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gva_t addr, u32 access) @@ -300,6 +448,7 @@ return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, addr, access); } +#endif static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -309,14 +458,14 @@ gfn_t gfn; pfn_t pfn; - if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access & gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_error_pfn(pfn)) @@ -353,11 +502,11 @@ base_gpa = pte_gpa & ~mask; index = (pte_gpa - base_gpa) / sizeof(pt_element_t); - r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, + r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa, gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); curr_pte = gw->prefetch_ptes[index]; } else - r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, + r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &curr_pte, sizeof(curr_pte)); return r || curr_pte != gw->ptes[level - 1]; @@ -449,7 +598,7 @@ goto out_gpte_changed; if (sp) - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } for (; @@ -469,7 +618,7 @@ sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, true, direct_access, it.sptep); - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } clear_sp_write_flooding_count(it.sptep); @@ -549,15 +698,24 @@ int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - int force_pt_level; + bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, addr, error_code, - mmu_is_nested(vcpu)); + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu)); + if (likely(r != RET_MMIO_PF_INVALID)) + return r; + + /* + * page fault with PFEC.RSVD = 1 is caused by shadow + * page fault, should not be used to walk guest page + * table. + */ + error_code &= ~PFERR_RSVD_MASK; + }; r = mmu_topup_memory_caches(vcpu); if (r) @@ -584,15 +742,14 @@ is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (walker.level >= PT_DIRECTORY_LEVEL) - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) - || is_self_change_mapping; - else - force_pt_level = 1; - if (!force_pt_level) { - level = min(walker.level, mapping_level(vcpu, walker.gfn)); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } + if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { + level = mapping_level(vcpu, walker.gfn, &force_pt_level); + if (likely(!force_pt_level)) { + level = min(walker.level, level); + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + } + } else + force_pt_level = true; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -701,8 +858,8 @@ if (!rmap_can_add(vcpu)) break; - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) break; FNAME(update_pte)(vcpu, sp, sptep, &gpte); @@ -732,6 +889,7 @@ return gpa; } +#if PTTYPE != PTTYPE_EPT static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -750,6 +908,7 @@ return gpa; } +#endif /* * Using the cached information from sp->gfns is safe because: @@ -786,21 +945,22 @@ pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) return -EINVAL; - if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; continue; } gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); + pte_access &= FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); - if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, + &nr_present)) continue; if (gfn != sp->gfns[i]) { @@ -834,3 +994,7 @@ #undef gpte_to_gfn #undef gpte_to_gfn_lvl #undef CMPXCHG +#undef PT_GUEST_ACCESSED_MASK +#undef PT_GUEST_DIRTY_MASK +#undef PT_GUEST_DIRTY_SHIFT +#undef PT_GUEST_ACCESSED_SHIFT