--- zzzz-none-000/linux-3.10.107/drivers/iommu/amd_iommu_v2.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/drivers/iommu/amd_iommu_v2.c 2021-02-04 17:41:59.000000000 +0000 @@ -1,6 +1,6 @@ /* * Copyright (C) 2010-2012 Advanced Micro Devices, Inc. - * Author: Joerg Roedel + * Author: Joerg Roedel * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -31,7 +31,7 @@ #include "amd_iommu_proto.h" MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Joerg Roedel "); +MODULE_AUTHOR("Joerg Roedel "); #define MAX_DEVICES 0x10000 #define PRI_QUEUE_SIZE 512 @@ -45,17 +45,23 @@ struct pasid_state { struct list_head list; /* For global state-list */ atomic_t count; /* Reference count */ - struct task_struct *task; /* Task bound to this PASID */ + unsigned mmu_notifier_count; /* Counting nested mmu_notifier + calls */ struct mm_struct *mm; /* mm_struct for the faults */ - struct mmu_notifier mn; /* mmu_otifier handle */ + struct mmu_notifier mn; /* mmu_notifier handle */ struct pri_queue pri[PRI_QUEUE_SIZE]; /* PRI tag states */ struct device_state *device_state; /* Link to our device_state */ int pasid; /* PASID index */ - spinlock_t lock; /* Protect pri_queues */ + bool invalid; /* Used during setup and + teardown of the pasid */ + spinlock_t lock; /* Protect pri_queues and + mmu_notifer_count */ wait_queue_head_t wq; /* To wait for count == 0 */ }; struct device_state { + struct list_head list; + u16 devid; atomic_t count; struct pci_dev *pdev; struct pasid_state **states; @@ -81,25 +87,12 @@ u16 flags; }; -static struct device_state **state_table; +static LIST_HEAD(state_list); static spinlock_t state_lock; -/* List and lock for all pasid_states */ -static LIST_HEAD(pasid_state_list); -static DEFINE_SPINLOCK(ps_lock); - static struct workqueue_struct *iommu_wq; -/* - * Empty page table - Used between - * mmu_notifier_invalidate_range_start and - * mmu_notifier_invalidate_range_end - */ -static u64 *empty_page_table; - static void free_pasid_states(struct device_state *dev_state); -static void unbind_pasid(struct device_state *dev_state, int pasid); -static int task_exit(struct notifier_block *nb, unsigned long e, void *data); static u16 device_id(struct pci_dev *pdev) { @@ -111,13 +104,25 @@ return devid; } +static struct device_state *__get_device_state(u16 devid) +{ + struct device_state *dev_state; + + list_for_each_entry(dev_state, &state_list, list) { + if (dev_state->devid == devid) + return dev_state; + } + + return NULL; +} + static struct device_state *get_device_state(u16 devid) { struct device_state *dev_state; unsigned long flags; spin_lock_irqsave(&state_lock, flags); - dev_state = state_table[devid]; + dev_state = __get_device_state(devid); if (dev_state != NULL) atomic_inc(&dev_state->count); spin_unlock_irqrestore(&state_lock, flags); @@ -127,11 +132,19 @@ static void free_device_state(struct device_state *dev_state) { + struct iommu_group *group; + /* * First detach device from domain - No more PRI requests will arrive * from that device after it is unbound from the IOMMUv2 domain. */ - iommu_detach_device(dev_state->domain, &dev_state->pdev->dev); + group = iommu_group_get(&dev_state->pdev->dev); + if (WARN_ON(!group)) + return; + + iommu_detach_group(dev_state->domain, group); + + iommu_group_put(group); /* Everything is down now, free the IOMMUv2 domain */ iommu_domain_free(dev_state->domain); @@ -146,41 +159,6 @@ wake_up(&dev_state->wq); } -static void put_device_state_wait(struct device_state *dev_state) -{ - DEFINE_WAIT(wait); - - prepare_to_wait(&dev_state->wq, &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_dec_and_test(&dev_state->count)) - schedule(); - finish_wait(&dev_state->wq, &wait); - - free_device_state(dev_state); -} - -static struct notifier_block profile_nb = { - .notifier_call = task_exit, -}; - -static void link_pasid_state(struct pasid_state *pasid_state) -{ - spin_lock(&ps_lock); - list_add_tail(&pasid_state->list, &pasid_state_list); - spin_unlock(&ps_lock); -} - -static void __unlink_pasid_state(struct pasid_state *pasid_state) -{ - list_del(&pasid_state->list); -} - -static void unlink_pasid_state(struct pasid_state *pasid_state) -{ - spin_lock(&ps_lock); - __unlink_pasid_state(pasid_state); - spin_unlock(&ps_lock); -} - /* Must be called under dev_state->lock */ static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state, int pasid, bool alloc) @@ -290,56 +268,37 @@ static void put_pasid_state(struct pasid_state *pasid_state) { - if (atomic_dec_and_test(&pasid_state->count)) { - put_device_state(pasid_state->device_state); + if (atomic_dec_and_test(&pasid_state->count)) wake_up(&pasid_state->wq); - } } static void put_pasid_state_wait(struct pasid_state *pasid_state) { - DEFINE_WAIT(wait); - - prepare_to_wait(&pasid_state->wq, &wait, TASK_UNINTERRUPTIBLE); - - if (atomic_dec_and_test(&pasid_state->count)) - put_device_state(pasid_state->device_state); - else - schedule(); - - finish_wait(&pasid_state->wq, &wait); - mmput(pasid_state->mm); + atomic_dec(&pasid_state->count); + wait_event(pasid_state->wq, !atomic_read(&pasid_state->count)); free_pasid_state(pasid_state); } -static void __unbind_pasid(struct pasid_state *pasid_state) +static void unbind_pasid(struct pasid_state *pasid_state) { struct iommu_domain *domain; domain = pasid_state->device_state->domain; + /* + * Mark pasid_state as invalid, no more faults will we added to the + * work queue after this is visible everywhere. + */ + pasid_state->invalid = true; + + /* Make sure this is visible */ + smp_wmb(); + + /* After this the device/pasid can't access the mm anymore */ amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid); - clear_pasid_state(pasid_state->device_state, pasid_state->pasid); /* Make sure no more pending faults are in the queue */ flush_workqueue(iommu_wq); - - mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); - - put_pasid_state(pasid_state); /* Reference taken in bind() function */ -} - -static void unbind_pasid(struct device_state *dev_state, int pasid) -{ - struct pasid_state *pasid_state; - - pasid_state = get_pasid_state(dev_state, pasid); - if (pasid_state == NULL) - return; - - unlink_pasid_state(pasid_state); - __unbind_pasid(pasid_state); - put_pasid_state_wait(pasid_state); /* Reference taken in this function */ } static void free_pasid_states_level1(struct pasid_state **tbl) @@ -379,15 +338,26 @@ continue; put_pasid_state(pasid_state); - unbind_pasid(dev_state, i); + + /* + * This will call the mn_release function and + * unbind the PASID + */ + mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); + + put_pasid_state_wait(pasid_state); /* Reference taken in + amd_iommu_bind_pasid */ + + /* Drop reference taken in amd_iommu_bind_pasid */ + put_device_state(dev_state); } if (dev_state->pasid_levels == 2) free_pasid_states_level2(dev_state->states); else if (dev_state->pasid_levels == 1) free_pasid_states_level1(dev_state->states); - else if (dev_state->pasid_levels != 0) - BUG(); + else + BUG_ON(dev_state->pasid_levels != 0); free_page((unsigned long)dev_state->states); } @@ -411,21 +381,15 @@ static int mn_clear_flush_young(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long address) + unsigned long start, + unsigned long end) { - __mn_flush_page(mn, address); + for (; start < end; start += PAGE_SIZE) + __mn_flush_page(mn, start); return 0; } -static void mn_change_pte(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address, - pte_t pte) -{ - __mn_flush_page(mn, address); -} - static void mn_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address) @@ -433,9 +397,9 @@ __mn_flush_page(mn, address); } -static void mn_invalidate_range_start(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void mn_invalidate_range(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) { struct pasid_state *pasid_state; struct device_state *dev_state; @@ -443,30 +407,36 @@ pasid_state = mn_to_state(mn); dev_state = pasid_state->device_state; - amd_iommu_domain_set_gcr3(dev_state->domain, pasid_state->pasid, - __pa(empty_page_table)); + if ((start ^ (end - 1)) < PAGE_SIZE) + amd_iommu_flush_page(dev_state->domain, pasid_state->pasid, + start); + else + amd_iommu_flush_tlb(dev_state->domain, pasid_state->pasid); } -static void mn_invalidate_range_end(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct pasid_state *pasid_state; struct device_state *dev_state; + bool run_inv_ctx_cb; - pasid_state = mn_to_state(mn); - dev_state = pasid_state->device_state; + might_sleep(); + + pasid_state = mn_to_state(mn); + dev_state = pasid_state->device_state; + run_inv_ctx_cb = !pasid_state->invalid; - amd_iommu_domain_set_gcr3(dev_state->domain, pasid_state->pasid, - __pa(pasid_state->mm->pgd)); + if (run_inv_ctx_cb && dev_state->inv_ctx_cb) + dev_state->inv_ctx_cb(dev_state->pdev, pasid_state->pasid); + + unbind_pasid(pasid_state); } static struct mmu_notifier_ops iommu_mn = { + .release = mn_release, .clear_flush_young = mn_clear_flush_young, - .change_pte = mn_change_pte, .invalidate_page = mn_invalidate_page, - .invalidate_range_start = mn_invalidate_range_start, - .invalidate_range_end = mn_invalidate_range_end, + .invalidate_range = mn_invalidate_range, }; static void set_pri_tag_status(struct pasid_state *pasid_state, @@ -496,43 +466,90 @@ spin_unlock_irqrestore(&pasid_state->lock, flags); } +static void handle_fault_error(struct fault *fault) +{ + int status; + + if (!fault->dev_state->inv_ppr_cb) { + set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + return; + } + + status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, + fault->pasid, + fault->address, + fault->flags); + switch (status) { + case AMD_IOMMU_INV_PRI_RSP_SUCCESS: + set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); + break; + case AMD_IOMMU_INV_PRI_RSP_INVALID: + set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + break; + case AMD_IOMMU_INV_PRI_RSP_FAIL: + set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); + break; + default: + BUG(); + } +} + +static bool access_error(struct vm_area_struct *vma, struct fault *fault) +{ + unsigned long requested = 0; + + if (fault->flags & PPR_FAULT_EXEC) + requested |= VM_EXEC; + + if (fault->flags & PPR_FAULT_READ) + requested |= VM_READ; + + if (fault->flags & PPR_FAULT_WRITE) + requested |= VM_WRITE; + + return (requested & ~vma->vm_flags) != 0; +} + static void do_fault(struct work_struct *work) { struct fault *fault = container_of(work, struct fault, work); - int npages, write; - struct page *page; + struct mm_struct *mm; + struct vm_area_struct *vma; + u64 address; + int ret, write; write = !!(fault->flags & PPR_FAULT_WRITE); - npages = get_user_pages(fault->state->task, fault->state->mm, - fault->address, 1, write, 0, &page, NULL); + mm = fault->state->mm; + address = fault->address; - if (npages == 1) { - put_page(page); - } else if (fault->dev_state->inv_ppr_cb) { - int status; - - status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, - fault->pasid, - fault->address, - fault->flags); - switch (status) { - case AMD_IOMMU_INV_PRI_RSP_SUCCESS: - set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); - break; - case AMD_IOMMU_INV_PRI_RSP_INVALID: - set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); - break; - case AMD_IOMMU_INV_PRI_RSP_FAIL: - set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); - break; - default: - BUG(); - } - } else { - set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + down_read(&mm->mmap_sem); + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) { + /* failed to get a vma in the right range */ + up_read(&mm->mmap_sem); + handle_fault_error(fault); + goto out; + } + + /* Check if we have the right permissions on the vma */ + if (access_error(vma, fault)) { + up_read(&mm->mmap_sem); + handle_fault_error(fault); + goto out; + } + + ret = handle_mm_fault(mm, vma, address, write); + if (ret & VM_FAULT_ERROR) { + /* failed to service fault */ + up_read(&mm->mmap_sem); + handle_fault_error(fault); + goto out; } + up_read(&mm->mmap_sem); + +out: finish_pri_tag(fault->dev_state, fault->state, fault->tag); put_pasid_state(fault->state); @@ -561,7 +578,7 @@ goto out; pasid_state = get_pasid_state(dev_state, iommu_fault->pasid); - if (pasid_state == NULL) { + if (pasid_state == NULL || pasid_state->invalid) { /* We know the device but not the PASID -> send INVALID */ amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid, PPR_INVALID, tag); @@ -586,6 +603,7 @@ fault->state = pasid_state; fault->tag = tag; fault->finish = finish; + fault->pasid = iommu_fault->pasid; fault->flags = iommu_fault->flags; INIT_WORK(&fault->work, do_fault); @@ -594,6 +612,10 @@ ret = NOTIFY_OK; out_drop_state: + + if (ret != NOTIFY_OK && pasid_state) + put_pasid_state(pasid_state); + put_device_state(dev_state); out: @@ -604,58 +626,12 @@ .notifier_call = ppr_notifier, }; -static int task_exit(struct notifier_block *nb, unsigned long e, void *data) -{ - struct pasid_state *pasid_state; - struct task_struct *task; - - task = data; - - /* - * Using this notifier is a hack - but there is no other choice - * at the moment. What I really want is a sleeping notifier that - * is called when an MM goes down. But such a notifier doesn't - * exist yet. The notifier needs to sleep because it has to make - * sure that the device does not use the PASID and the address - * space anymore before it is destroyed. This includes waiting - * for pending PRI requests to pass the workqueue. The - * MMU-Notifiers would be a good fit, but they use RCU and so - * they are not allowed to sleep. Lets see how we can solve this - * in a more intelligent way in the future. - */ -again: - spin_lock(&ps_lock); - list_for_each_entry(pasid_state, &pasid_state_list, list) { - struct device_state *dev_state; - int pasid; - - if (pasid_state->task != task) - continue; - - /* Drop Lock and unbind */ - spin_unlock(&ps_lock); - - dev_state = pasid_state->device_state; - pasid = pasid_state->pasid; - - if (pasid_state->device_state->inv_ctx_cb) - dev_state->inv_ctx_cb(dev_state->pdev, pasid); - - unbind_pasid(dev_state, pasid); - - /* Task may be in the list multiple times */ - goto again; - } - spin_unlock(&ps_lock); - - return NOTIFY_OK; -} - int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, struct task_struct *task) { struct pasid_state *pasid_state; struct device_state *dev_state; + struct mm_struct *mm; u16 devid; int ret; @@ -679,20 +655,23 @@ if (pasid_state == NULL) goto out; + atomic_set(&pasid_state->count, 1); init_waitqueue_head(&pasid_state->wq); spin_lock_init(&pasid_state->lock); - pasid_state->task = task; - pasid_state->mm = get_task_mm(task); + mm = get_task_mm(task); + pasid_state->mm = mm; pasid_state->device_state = dev_state; pasid_state->pasid = pasid; + pasid_state->invalid = true; /* Mark as valid only if we are + done with setting up the pasid */ pasid_state->mn.ops = &iommu_mn; if (pasid_state->mm == NULL) goto out_free; - mmu_notifier_register(&pasid_state->mn, pasid_state->mm); + mmu_notifier_register(&pasid_state->mn, mm); ret = set_pasid_state(dev_state, pasid_state, pasid); if (ret) @@ -703,7 +682,15 @@ if (ret) goto out_clear_state; - link_pasid_state(pasid_state); + /* Now we are ready to handle faults */ + pasid_state->invalid = false; + + /* + * Drop the reference to the mm_struct here. We rely on the + * mmu_notifier release call-back to inform us when the mm + * is going away. + */ + mmput(mm); return 0; @@ -711,9 +698,10 @@ clear_pasid_state(dev_state, pasid); out_unregister: - mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); + mmu_notifier_unregister(&pasid_state->mn, mm); out_free: + mmput(mm); free_pasid_state(pasid_state); out: @@ -725,6 +713,7 @@ void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid) { + struct pasid_state *pasid_state; struct device_state *dev_state; u16 devid; @@ -741,9 +730,31 @@ if (pasid < 0 || pasid >= dev_state->max_pasids) goto out; - unbind_pasid(dev_state, pasid); + pasid_state = get_pasid_state(dev_state, pasid); + if (pasid_state == NULL) + goto out; + /* + * Drop reference taken here. We are safe because we still hold + * the reference taken in the amd_iommu_bind_pasid function. + */ + put_pasid_state(pasid_state); + + /* Clear the pasid state so that the pasid can be re-used */ + clear_pasid_state(dev_state, pasid_state->pasid); + /* + * Call mmu_notifier_unregister to drop our reference + * to pasid_state->mm + */ + mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); + + put_pasid_state_wait(pasid_state); /* Reference taken in + amd_iommu_bind_pasid */ out: + /* Drop reference taken in this function */ + put_device_state(dev_state); + + /* Drop reference taken in amd_iommu_bind_pasid */ put_device_state(dev_state); } EXPORT_SYMBOL(amd_iommu_unbind_pasid); @@ -751,6 +762,7 @@ int amd_iommu_init_device(struct pci_dev *pdev, int pasids) { struct device_state *dev_state; + struct iommu_group *group; unsigned long flags; int ret, tmp; u16 devid; @@ -771,7 +783,8 @@ spin_lock_init(&dev_state->lock); init_waitqueue_head(&dev_state->wq); - dev_state->pdev = pdev; + dev_state->pdev = pdev; + dev_state->devid = devid; tmp = pasids; for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9) @@ -795,24 +808,35 @@ if (ret) goto out_free_domain; - ret = iommu_attach_device(dev_state->domain, &pdev->dev); - if (ret != 0) + group = iommu_group_get(&pdev->dev); + if (!group) { + ret = -EINVAL; goto out_free_domain; + } + + ret = iommu_attach_group(dev_state->domain, group); + if (ret != 0) + goto out_drop_group; + + iommu_group_put(group); spin_lock_irqsave(&state_lock, flags); - if (state_table[devid] != NULL) { + if (__get_device_state(devid) != NULL) { spin_unlock_irqrestore(&state_lock, flags); ret = -EBUSY; goto out_free_domain; } - state_table[devid] = dev_state; + list_add_tail(&dev_state->list, &state_list); spin_unlock_irqrestore(&state_lock, flags); return 0; +out_drop_group: + iommu_group_put(group); + out_free_domain: iommu_domain_free(dev_state->domain); @@ -839,20 +863,26 @@ spin_lock_irqsave(&state_lock, flags); - dev_state = state_table[devid]; + dev_state = __get_device_state(devid); if (dev_state == NULL) { spin_unlock_irqrestore(&state_lock, flags); return; } - state_table[devid] = NULL; + list_del(&dev_state->list); spin_unlock_irqrestore(&state_lock, flags); /* Get rid of any remaining pasid states */ free_pasid_states(dev_state); - put_device_state_wait(dev_state); + put_device_state(dev_state); + /* + * Wait until the last reference is dropped before freeing + * the device state. + */ + wait_event(dev_state->wq, !atomic_read(&dev_state->count)); + free_device_state(dev_state); } EXPORT_SYMBOL(amd_iommu_free_device); @@ -872,7 +902,7 @@ spin_lock_irqsave(&state_lock, flags); ret = -EINVAL; - dev_state = state_table[devid]; + dev_state = __get_device_state(devid); if (dev_state == NULL) goto out_unlock; @@ -903,7 +933,7 @@ spin_lock_irqsave(&state_lock, flags); ret = -EINVAL; - dev_state = state_table[devid]; + dev_state = __get_device_state(devid); if (dev_state == NULL) goto out_unlock; @@ -920,10 +950,9 @@ static int __init amd_iommu_v2_init(void) { - size_t state_table_size; int ret; - pr_info("AMD IOMMUv2 driver by Joerg Roedel \n"); + pr_info("AMD IOMMUv2 driver by Joerg Roedel \n"); if (!amd_iommu_v2_supported()) { pr_info("AMD IOMMUv2 functionality not available on this system\n"); @@ -936,46 +965,27 @@ spin_lock_init(&state_lock); - state_table_size = MAX_DEVICES * sizeof(struct device_state *); - state_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(state_table_size)); - if (state_table == NULL) - return -ENOMEM; - ret = -ENOMEM; iommu_wq = create_workqueue("amd_iommu_v2"); if (iommu_wq == NULL) - goto out_free; - - ret = -ENOMEM; - empty_page_table = (u64 *)get_zeroed_page(GFP_KERNEL); - if (empty_page_table == NULL) - goto out_destroy_wq; + goto out; amd_iommu_register_ppr_notifier(&ppr_nb); - profile_event_register(PROFILE_TASK_EXIT, &profile_nb); return 0; -out_destroy_wq: - destroy_workqueue(iommu_wq); - -out_free: - free_pages((unsigned long)state_table, get_order(state_table_size)); - +out: return ret; } static void __exit amd_iommu_v2_exit(void) { struct device_state *dev_state; - size_t state_table_size; int i; if (!amd_iommu_v2_supported()) return; - profile_event_unregister(PROFILE_TASK_EXIT, &profile_nb); amd_iommu_unregister_ppr_notifier(&ppr_nb); flush_workqueue(iommu_wq); @@ -997,11 +1007,6 @@ } destroy_workqueue(iommu_wq); - - state_table_size = MAX_DEVICES * sizeof(struct device_state *); - free_pages((unsigned long)state_table, get_order(state_table_size)); - - free_page((unsigned long)empty_page_table); } module_init(amd_iommu_v2_init);