--- zzzz-none-000/linux-4.4.271/arch/arm/mm/dma-mapping.c 2021-06-03 06:22:09.000000000 +0000 +++ hawkeye-5590-750/linux-4.4.271/arch/arm/mm/dma-mapping.c 2023-04-19 10:22:27.000000000 +0000 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include "dma.h" #include "mm.h" @@ -225,7 +227,8 @@ return mask; } -static void __dma_clear_buffer(struct page *page, size_t size) +static void __dma_clear_buffer(struct page *page, size_t size, + struct dma_attrs *attrs) { /* * Ensure that the allocated pages are zeroed, and that any data @@ -236,7 +239,8 @@ phys_addr_t end = base + size; while (size > 0) { void *ptr = kmap_atomic(page); - memset(ptr, 0, PAGE_SIZE); + if (!dma_get_attr(DMA_ATTR_SKIP_ZEROING, attrs)) + memset(ptr, 0, PAGE_SIZE); dmac_flush_range(ptr, ptr + PAGE_SIZE); kunmap_atomic(ptr); page++; @@ -245,7 +249,8 @@ outer_flush_range(base, end); } else { void *ptr = page_address(page); - memset(ptr, 0, size); + if (!dma_get_attr(DMA_ATTR_SKIP_ZEROING, attrs)) + memset(ptr, 0, size); dmac_flush_range(ptr, ptr + size); outer_flush_range(__pa(ptr), __pa(ptr) + size); } @@ -271,7 +276,7 @@ for (p = page + (size >> PAGE_SHIFT), e = page + (1 << order); p < e; p++) __free_page(p); - __dma_clear_buffer(page, size); + __dma_clear_buffer(page, size, NULL); return page; } @@ -293,7 +298,8 @@ static void *__alloc_from_contiguous(struct device *dev, size_t size, pgprot_t prot, struct page **ret_page, - const void *caller, bool want_vaddr); + const void *caller, bool want_vaddr, + struct dma_attrs *attrs); static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp, pgprot_t prot, struct page **ret_page, @@ -361,7 +367,7 @@ if (dev_get_cma_area(NULL)) ptr = __alloc_from_contiguous(NULL, atomic_pool_size, prot, - &page, atomic_pool_init, true); + &page, atomic_pool_init, true, NULL); else ptr = __alloc_remap_buffer(NULL, atomic_pool_size, gfp, prot, &page, atomic_pool_init, true); @@ -530,7 +536,8 @@ static void *__alloc_from_contiguous(struct device *dev, size_t size, pgprot_t prot, struct page **ret_page, - const void *caller, bool want_vaddr) + const void *caller, bool want_vaddr, + struct dma_attrs *attrs) { unsigned long order = get_order(size); size_t count = size >> PAGE_SHIFT; @@ -541,7 +548,12 @@ if (!page) return NULL; - __dma_clear_buffer(page, size); + /* + * skip completely if we neither need to zero nor sync. 
+ */ + if (!(dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs) && + dma_get_attr(DMA_ATTR_SKIP_ZEROING, attrs))) + __dma_clear_buffer(page, size, attrs); if (!want_vaddr) goto out; @@ -591,7 +603,7 @@ #define __get_dma_pgprot(attrs, prot) __pgprot(0) #define __alloc_remap_buffer(dev, size, gfp, prot, ret, c, wv) NULL #define __alloc_from_pool(size, ret_page) NULL -#define __alloc_from_contiguous(dev, size, prot, ret, c, wv) NULL +#define __alloc_from_contiguous(dev, size, prot, ret, c, wv, a) NULL #define __free_from_pool(cpu_addr, size) 0 #define __free_from_contiguous(dev, page, cpu_addr, size, wv) do { } while (0) #define __dma_free_remap(cpu_addr, size) do { } while (0) @@ -653,7 +665,7 @@ addr = __alloc_simple_buffer(dev, size, gfp, &page); else if (dev_get_cma_area(dev) && (gfp & __GFP_DIRECT_RECLAIM)) addr = __alloc_from_contiguous(dev, size, prot, &page, - caller, want_vaddr); + caller, want_vaddr, attrs); else if (is_coherent) addr = __alloc_simple_buffer(dev, size, gfp, &page); else if (!gfpflags_allow_blocking(gfp)) @@ -929,7 +941,7 @@ int arm_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, struct dma_attrs *attrs) { - struct dma_map_ops *ops = get_dma_ops(dev); + const struct dma_map_ops *ops = get_dma_ops(dev); struct scatterlist *s; int i, j; @@ -963,7 +975,7 @@ void arm_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, struct dma_attrs *attrs) { - struct dma_map_ops *ops = get_dma_ops(dev); + const struct dma_map_ops *ops = get_dma_ops(dev); struct scatterlist *s; int i; @@ -982,7 +994,7 @@ void arm_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir) { - struct dma_map_ops *ops = get_dma_ops(dev); + const struct dma_map_ops *ops = get_dma_ops(dev); struct scatterlist *s; int i; @@ -1001,7 +1013,7 @@ void arm_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir) { - struct dma_map_ops *ops = get_dma_ops(dev); + const struct dma_map_ops *ops = get_dma_ops(dev); struct scatterlist *s; int i; @@ -1045,6 +1057,18 @@ /* IOMMU */ +#include +#include + +struct iommu_dma_notifier_data { + struct list_head list; + struct device *dev; + u64 dma_base; + u64 size; +}; +static LIST_HEAD(iommu_dma_masters); +static DEFINE_MUTEX(iommu_dma_notifier_lock); + static int extend_iommu_mapping(struct dma_iommu_mapping *mapping); static inline dma_addr_t __alloc_iova(struct dma_iommu_mapping *mapping, @@ -1140,6 +1164,9 @@ spin_unlock_irqrestore(&mapping->lock, flags); } +/* We'll try 2M, 1M, 64K, and finally 4K; array must end with 0! */ +static const int iommu_order_array[] = { 9, 8, 4, 0 }; + static struct page **__iommu_alloc_buffer(struct device *dev, size_t size, gfp_t gfp, struct dma_attrs *attrs) { @@ -1147,6 +1174,7 @@ int count = size >> PAGE_SHIFT; int array_size = count * sizeof(struct page *); int i = 0; + int order_idx = 0; if (array_size <= PAGE_SIZE) pages = kzalloc(array_size, GFP_KERNEL); @@ -1164,7 +1192,7 @@ if (!page) goto error; - __dma_clear_buffer(page, size); + __dma_clear_buffer(page, size, NULL); for (i = 0; i < count; i++) pages[i] = page + i; @@ -1172,6 +1200,10 @@ return pages; } + /* Go straight to 4K chunks if caller says it's OK. 
*/ + if (dma_get_attr(DMA_ATTR_ALLOC_SINGLE_PAGES, attrs)) + order_idx = ARRAY_SIZE(iommu_order_array) - 1; + /* * IOMMU can map any pages, so himem can also be used here */ @@ -1180,22 +1212,24 @@ while (count) { int j, order; - for (order = __fls(count); order > 0; --order) { - /* - * We do not want OOM killer to be invoked as long - * as we can fall back to single pages, so we force - * __GFP_NORETRY for orders higher than zero. - */ - pages[i] = alloc_pages(gfp | __GFP_NORETRY, order); - if (pages[i]) - break; + order = iommu_order_array[order_idx]; + + /* Drop down when we get small */ + if (__fls(count) < order) { + order_idx++; + continue; } - if (!pages[i]) { - /* - * Fall back to single page allocation. - * Might invoke OOM killer as last resort. - */ + if (order) { + /* See if it's easy to allocate a high-order chunk */ + pages[i] = alloc_pages(gfp | __GFP_NORETRY, order); + + /* Go down a notch at first sign of pressure */ + if (!pages[i]) { + order_idx++; + continue; + } + } else { pages[i] = alloc_pages(gfp, 0); if (!pages[i]) goto error; @@ -1208,7 +1242,7 @@ pages[i + j] = pages[i] + j; } - __dma_clear_buffer(pages[i], PAGE_SIZE << order); + __dma_clear_buffer(pages[i], PAGE_SIZE << order, NULL); i += 1 << order; count -= 1 << order; } @@ -1504,10 +1538,10 @@ prot = IOMMU_READ | IOMMU_WRITE; break; case DMA_TO_DEVICE: - prot = IOMMU_READ; + prot = IOMMU_READ | IOMMU_WRITE; break; case DMA_FROM_DEVICE: - prot = IOMMU_WRITE; + prot = IOMMU_WRITE | IOMMU_READ; break; default: prot = 0; @@ -1642,7 +1676,31 @@ int arm_iommu_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, struct dma_attrs *attrs) { - return __iommu_map_sg(dev, sg, nents, dir, attrs, false); + struct scatterlist *s; + int i; + size_t ret; + struct dma_iommu_mapping *mapping = dev->archdata.mapping; + unsigned int total_length = 0, current_offset = 0; + dma_addr_t iova; + int prot = __dma_direction_to_prot(dir); + + for_each_sg(sg, s, nents, i) + total_length += s->length; + + iova = __alloc_iova(mapping, total_length); + ret = iommu_map_sg(mapping->domain, iova, sg, nents, prot); + if (ret != total_length) { + __free_iova(mapping, iova, total_length); + return 0; + } + + for_each_sg(sg, s, nents, i) { + s->dma_address = iova + current_offset; + s->dma_length = total_length - current_offset; + current_offset += s->length; + } + + return nents; } static void __iommu_unmap_sg(struct device *dev, struct scatterlist *sg, @@ -1692,7 +1750,15 @@ void arm_iommu_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, struct dma_attrs *attrs) { - __iommu_unmap_sg(dev, sg, nents, dir, attrs, false); + struct dma_iommu_mapping *mapping = dev->archdata.mapping; + unsigned int total_length = sg_dma_len(sg); + dma_addr_t iova = sg_dma_address(sg); + + total_length = PAGE_ALIGN((iova & ~PAGE_MASK) + total_length); + iova &= PAGE_MASK; + + iommu_unmap(mapping->domain, iova, total_length); + __free_iova(mapping, iova, total_length); } /** @@ -1867,7 +1933,7 @@ __dma_page_cpu_to_dev(page, offset, size, dir); } -struct dma_map_ops iommu_ops = { +const struct dma_map_ops iommu_ops = { .alloc = arm_iommu_alloc_attrs, .free = arm_iommu_free_attrs, .mmap = arm_iommu_mmap_attrs, @@ -1886,7 +1952,7 @@ .set_dma_mask = arm_dma_set_mask, }; -struct dma_map_ops iommu_coherent_ops = { +const struct dma_map_ops iommu_coherent_ops = { .alloc = arm_iommu_alloc_attrs, .free = arm_iommu_free_attrs, .mmap = arm_iommu_mmap_attrs, @@ -1915,13 +1981,19 @@ * 
arm_iommu_attach_device function. */ struct dma_iommu_mapping * -arm_iommu_create_mapping(struct bus_type *bus, dma_addr_t base, u64 size) +arm_iommu_create_mapping(struct device *dev, struct bus_type *bus, + dma_addr_t base, u64 size) { unsigned int bits = size >> PAGE_SHIFT; unsigned int bitmap_size = BITS_TO_LONGS(bits) * sizeof(long); struct dma_iommu_mapping *mapping; int extensions = 1; int err = -ENOMEM; + struct iommu_domain *domain; + + domain = iommu_get_domain_for_dev(dev); + if (domain && domain->handler_token) + return (struct dma_iommu_mapping *)domain->handler_token; /* currently only 32-bit DMA address space is supported */ if (size > DMA_BIT_MASK(32) + 1) @@ -1953,12 +2025,17 @@ mapping->extensions = extensions; mapping->base = base; mapping->bits = BITS_PER_BYTE * bitmap_size; + mapping->size = size; spin_lock_init(&mapping->lock); - mapping->domain = iommu_domain_alloc(bus); - if (!mapping->domain) - goto err4; + if (domain) { + mapping->domain = domain; + } else { + mapping->domain = iommu_domain_alloc(bus); + if (!mapping->domain) + goto err4; + } kref_init(&mapping->kref); return mapping; @@ -2011,17 +2088,41 @@ } EXPORT_SYMBOL_GPL(arm_iommu_release_mapping); +/* fast mapping is always true for now */ +static bool fast; + static int __arm_iommu_attach_device(struct device *dev, struct dma_iommu_mapping *mapping) { + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); int err; - err = iommu_attach_device(mapping->domain, dev); - if (err) - return err; - - kref_get(&mapping->kref); - to_dma_iommu_mapping(dev) = mapping; + if (fast) { + if (!domain->handler_token) { + /** + * detach the existing and attach to fast mapping + * domain + */ + iommu_detach_device(mapping->domain, dev); + err = fast_smmu_attach_device(dev, mapping); + if (err) + return err; + } else { + dev->archdata.dma_ops = &fast_smmu_dma_ops; + dev->archdata.mapping = domain->handler_token; + mapping = domain->handler_token; + kref_get(&mapping->kref); + } + } else { + /** + * device is already attached to the default domain. So just + * take the reference for the mapping structure + */ + kref_get(&mapping->kref); + to_dma_iommu_mapping(dev) = mapping; + if (!domain->handler_token) + domain->handler_token = mapping; + } pr_debug("Attached IOMMU controller to %s device.\n", dev_name(dev)); return 0; @@ -2045,10 +2146,13 @@ { int err; - err = __arm_iommu_attach_device(dev, mapping); + err = iommu_attach_device(mapping->domain, dev); if (err) return err; + kref_get(&mapping->kref); + to_dma_iommu_mapping(dev) = mapping; + /* TODO: Should this be removed when fast DMA mapping is enabled? */ set_dma_ops(dev, &iommu_ops); return 0; } @@ -2085,31 +2189,64 @@ } EXPORT_SYMBOL_GPL(arm_iommu_detach_device); -static struct dma_map_ops *arm_get_iommu_dma_map_ops(bool coherent) +static const struct dma_map_ops *arm_get_iommu_dma_map_ops(struct device *dev, + bool coherent) { return coherent ? 
&iommu_coherent_ops : &iommu_ops; } +static void queue_iommu_attach(struct device *dev, u64 dma_base, u64 size) +{ + struct iommu_dma_notifier_data *iommudata; + + iommudata = kzalloc(sizeof(*iommudata), GFP_KERNEL); + if (!iommudata) + return; + + iommudata->dev = dev; + iommudata->dma_base = dma_base; + iommudata->size = size; + + mutex_lock(&iommu_dma_notifier_lock); + list_add(&iommudata->list, &iommu_dma_masters); + mutex_unlock(&iommu_dma_notifier_lock); +} + static bool arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size, - struct iommu_ops *iommu) + const struct iommu_ops *iommu) { struct dma_iommu_mapping *mapping; + struct iommu_group *group; if (!iommu) return false; - mapping = arm_iommu_create_mapping(dev->bus, dma_base, size); - if (IS_ERR(mapping)) { - pr_warn("Failed to create %llu-byte IOMMU mapping for device %s\n", - size, dev_name(dev)); - return false; - } + /** + * By this time, device may not added to IOMMU core, so we don't have + * the IOMMU group. So queue the IOMMU attach device which will be + * triggered after the IOMMU add_device once the device issues + * the BUS_NOTIFY_ADD_DEVICE event. + */ + group = iommu_group_get(dev); + if (group) { + iommu_group_put(group); + + mapping = arm_iommu_create_mapping(dev, dev->bus, dma_base, + size); + if (IS_ERR(mapping)) { + pr_warn("Failed to create %llu-byte IOMMU mapping for device %s\n", + size, dev_name(dev)); + return false; + } - if (__arm_iommu_attach_device(dev, mapping)) { - pr_warn("Failed to attached device %s to IOMMU_mapping\n", - dev_name(dev)); - arm_iommu_release_mapping(mapping); - return false; + if (__arm_iommu_attach_device(dev, mapping)) { + pr_warn("Failed to attached device %s to IOMMU_mapping\n", + dev_name(dev)); + arm_iommu_release_mapping(mapping); + return false; + } + } else { + queue_iommu_attach(dev, dma_base, size); } return true; @@ -2126,10 +2263,88 @@ arm_iommu_release_mapping(mapping); } +static int __iommu_attach_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct iommu_dma_notifier_data *master, *tmp; + struct dma_iommu_mapping *mapping; + + if (action != BUS_NOTIFY_ADD_DEVICE) + return 0; + + mutex_lock(&iommu_dma_notifier_lock); + list_for_each_entry_safe(master, tmp, &iommu_dma_masters, list) { + + mapping = arm_iommu_create_mapping(master->dev, + master->dev->bus, + master->dma_base, + master->size); + if (IS_ERR(mapping)) { + pr_warn("Failed to create %llu-byte IOMMU mapping for device %s\n", + master->size, + dev_name(master->dev)); + goto err; + } + if (__arm_iommu_attach_device(master->dev, mapping)) { + pr_warn("Failed to attached device %s to IOMMU_mapping\n", + dev_name(master->dev)); + arm_iommu_release_mapping(mapping); + } +err: + list_del(&master->list); + kfree(master); + } + mutex_unlock(&iommu_dma_notifier_lock); + return 0; +} + +static int register_iommu_dma_ops_notifier(struct bus_type *bus) +{ + struct notifier_block *nb = kzalloc(sizeof(*nb), GFP_KERNEL); + int ret; + + if (!nb) + return -ENOMEM; + /* + * The device must be attached to a domain before the driver probe + * routine gets a chance to start allocating DMA buffers. However, + * the IOMMU driver also needs a chance to configure the iommu_group + * via its add_device callback first, so we need to make the attach + * happen between those two points. Since the IOMMU core uses a bus + * notifier with default priority for add_device, do the same but + * with a lower priority to ensure the appropriate ordering. 
+ */ + nb->notifier_call = __iommu_attach_notifier; + nb->priority = -100; + + ret = bus_register_notifier(bus, nb); + if (ret) { + pr_warn("Failed to register DMA domain notifier; IOMMU DMA ops unavailable on bus '%s'\n", + bus->name); + kfree(nb); + } + return ret; +} + +static int __init __iommu_dma_init(void) +{ + int ret; + + ret = register_iommu_dma_ops_notifier(&platform_bus_type); + if (!ret) + ret = register_iommu_dma_ops_notifier(&amba_bustype); + + /* handle devices queued before this arch_initcall */ + if (!ret) + __iommu_attach_notifier(NULL, BUS_NOTIFY_ADD_DEVICE, NULL); + return ret; +} +postcore_initcall_sync(__iommu_dma_init); + #else static bool arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size, - struct iommu_ops *iommu) + const struct iommu_ops *iommu) { return false; } @@ -2140,24 +2355,26 @@ #endif /* CONFIG_ARM_DMA_USE_IOMMU */ -static struct dma_map_ops *arm_get_dma_map_ops(bool coherent) +static struct dma_map_ops *arm_get_dma_map_ops(struct device *dev, + bool coherent) { return coherent ? &arm_coherent_dma_ops : &arm_dma_ops; } void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - struct iommu_ops *iommu, bool coherent) + const struct iommu_ops *iommu, bool coherent) { - struct dma_map_ops *dma_ops; + const struct dma_map_ops *dma_ops; dev->archdata.dma_coherent = coherent; if (arm_setup_iommu_dma_ops(dev, dma_base, size, iommu)) - dma_ops = arm_get_iommu_dma_map_ops(coherent); + dma_ops = arm_get_iommu_dma_map_ops(dev, coherent); else - dma_ops = arm_get_dma_map_ops(coherent); + dma_ops = arm_get_dma_map_ops(dev, coherent); set_dma_ops(dev, dma_ops); } +EXPORT_SYMBOL(arch_setup_dma_ops); void arch_teardown_dma_ops(struct device *dev) {