--- zzzz-none-000/linux-4.4.60/arch/arm/mm/dma-mapping.c 2017-04-08 07:53:53.000000000 +0000
+++ scorpion-7490-727/linux-4.4.60/arch/arm/mm/dma-mapping.c 2021-02-04 17:41:59.000000000 +0000
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -38,6 +39,7 @@
 #include
 #include
 #include
+#include
 
 #include "dma.h"
 #include "mm.h"
@@ -225,7 +227,8 @@
 	return mask;
 }
 
-static void __dma_clear_buffer(struct page *page, size_t size)
+static void __dma_clear_buffer(struct page *page, size_t size,
+				struct dma_attrs *attrs)
 {
 	/*
 	 * Ensure that the allocated pages are zeroed, and that any data
@@ -236,7 +239,8 @@
 		phys_addr_t end = base + size;
 		while (size > 0) {
 			void *ptr = kmap_atomic(page);
-			memset(ptr, 0, PAGE_SIZE);
+			if (!dma_get_attr(DMA_ATTR_SKIP_ZEROING, attrs))
+				memset(ptr, 0, PAGE_SIZE);
 			dmac_flush_range(ptr, ptr + PAGE_SIZE);
 			kunmap_atomic(ptr);
 			page++;
@@ -245,7 +249,8 @@
 		outer_flush_range(base, end);
 	} else {
 		void *ptr = page_address(page);
-		memset(ptr, 0, size);
+		if (!dma_get_attr(DMA_ATTR_SKIP_ZEROING, attrs))
+			memset(ptr, 0, size);
 		dmac_flush_range(ptr, ptr + size);
 		outer_flush_range(__pa(ptr), __pa(ptr) + size);
 	}
@@ -271,7 +276,7 @@
 	for (p = page + (size >> PAGE_SHIFT), e = page + (1 << order); p < e; p++)
 		__free_page(p);
 
-	__dma_clear_buffer(page, size);
+	__dma_clear_buffer(page, size, NULL);
 
 	return page;
 }
@@ -293,7 +298,8 @@
 static void *__alloc_from_contiguous(struct device *dev, size_t size,
 				     pgprot_t prot, struct page **ret_page,
-				     const void *caller, bool want_vaddr);
+				     const void *caller, bool want_vaddr,
+				     struct dma_attrs *attrs);
 
 static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp,
 				 pgprot_t prot, struct page **ret_page,
 				 const void *caller, bool want_vaddr);
@@ -361,7 +367,7 @@
 
 	if (dev_get_cma_area(NULL))
 		ptr = __alloc_from_contiguous(NULL, atomic_pool_size, prot,
-					      &page, atomic_pool_init, true);
+					      &page, atomic_pool_init, true, NULL);
 	else
 		ptr = __alloc_remap_buffer(NULL, atomic_pool_size, gfp, prot,
 					   &page, atomic_pool_init, true);
@@ -530,7 +536,8 @@
 
 static void *__alloc_from_contiguous(struct device *dev, size_t size,
 				     pgprot_t prot, struct page **ret_page,
-				     const void *caller, bool want_vaddr)
+				     const void *caller, bool want_vaddr,
+				     struct dma_attrs *attrs)
 {
 	unsigned long order = get_order(size);
 	size_t count = size >> PAGE_SHIFT;
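The hunks above only plumb a struct dma_attrs * down to __dma_clear_buffer() and __alloc_from_contiguous(). A minimal sketch of how a caller on this 4.4-era tree could request an uncleared, unsynced buffer is shown below. DMA_ATTR_SKIP_CPU_SYNC is an upstream attribute; DMA_ATTR_SKIP_ZEROING is assumed to be defined by this vendor tree, and the helper name is purely illustrative.

#include <linux/dma-attrs.h>
#include <linux/dma-mapping.h>

/* Hypothetical helper: allocate a buffer the caller will fully overwrite. */
static void *example_alloc_uncleared(struct device *dev, size_t size,
				     dma_addr_t *handle)
{
	DEFINE_DMA_ATTRS(attrs);

	/* Vendor attribute (assumed): do not memset the pages. */
	dma_set_attr(DMA_ATTR_SKIP_ZEROING, &attrs);
	/* Upstream attribute: skip the CPU cache maintenance as well. */
	dma_set_attr(DMA_ATTR_SKIP_CPU_SYNC, &attrs);

	return dma_alloc_attrs(dev, size, handle, GFP_KERNEL, &attrs);
}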
@@ -541,7 +548,12 @@
 	if (!page)
 		return NULL;
 
-	__dma_clear_buffer(page, size);
+	/*
+	 * skip completely if we neither need to zero nor sync.
+	 */
+	if (!(dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs) &&
+	      dma_get_attr(DMA_ATTR_SKIP_ZEROING, attrs)))
+		__dma_clear_buffer(page, size, attrs);
 
 	if (!want_vaddr)
 		goto out;
@@ -591,7 +603,7 @@
 #define __get_dma_pgprot(attrs, prot)				__pgprot(0)
 #define __alloc_remap_buffer(dev, size, gfp, prot, ret, c, wv)	NULL
 #define __alloc_from_pool(size, ret_page)			NULL
-#define __alloc_from_contiguous(dev, size, prot, ret, c, wv)	NULL
+#define __alloc_from_contiguous(dev, size, prot, ret, c, wv, a)	NULL
 #define __free_from_pool(cpu_addr, size)			0
 #define __free_from_contiguous(dev, page, cpu_addr, size, wv)	do { } while (0)
 #define __dma_free_remap(cpu_addr, size)			do { } while (0)
@@ -653,7 +665,7 @@
 		addr = __alloc_simple_buffer(dev, size, gfp, &page);
 	else if (dev_get_cma_area(dev) && (gfp & __GFP_DIRECT_RECLAIM))
 		addr = __alloc_from_contiguous(dev, size, prot, &page,
-					       caller, want_vaddr);
+					       caller, want_vaddr, attrs);
 	else if (is_coherent)
 		addr = __alloc_simple_buffer(dev, size, gfp, &page);
 	else if (!gfpflags_allow_blocking(gfp))
@@ -911,7 +923,7 @@
 int arm_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 		enum dma_data_direction dir, struct dma_attrs *attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	struct scatterlist *s;
 	int i, j;
 
@@ -945,7 +957,7 @@
 void arm_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
 		enum dma_data_direction dir, struct dma_attrs *attrs)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	struct scatterlist *s;
 	int i;
 
@@ -964,7 +976,7 @@
 void arm_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 			int nents, enum dma_data_direction dir)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	struct scatterlist *s;
 	int i;
 
@@ -983,7 +995,7 @@
 void arm_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 			int nents, enum dma_data_direction dir)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 	struct scatterlist *s;
 	int i;
 
@@ -1122,6 +1134,9 @@
 	spin_unlock_irqrestore(&mapping->lock, flags);
 }
 
+/* We'll try 2M, 1M, 64K, and finally 4K; array must end with 0! */
+static const int iommu_order_array[] = { 9, 8, 4, 0 };
+
 static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
 					  gfp_t gfp, struct dma_attrs *attrs)
 {
@@ -1129,6 +1144,7 @@
 	int count = size >> PAGE_SHIFT;
 	int array_size = count * sizeof(struct page *);
 	int i = 0;
+	int order_idx = 0;
 
 	if (array_size <= PAGE_SIZE)
 		pages = kzalloc(array_size, GFP_KERNEL);
@@ -1146,7 +1162,7 @@
 		if (!page)
 			goto error;
 
-		__dma_clear_buffer(page, size);
+		__dma_clear_buffer(page, size, NULL);
 
 		for (i = 0; i < count; i++)
 			pages[i] = page + i;
@@ -1154,6 +1170,10 @@
 		return pages;
 	}
 
+	/* Go straight to 4K chunks if caller says it's OK. */
+	if (dma_get_attr(DMA_ATTR_ALLOC_SINGLE_PAGES, attrs))
+		order_idx = ARRAY_SIZE(iommu_order_array) - 1;
+
 	/*
 	 * IOMMU can map any pages, so himem can also be used here
 	 */
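The order array and the DMA_ATTR_ALLOC_SINGLE_PAGES shortcut above feed the allocation loop that the next hunk rewrites. A minimal sketch of the size-based order selection is shown below; the names are illustrative, not kernel APIs, and the fallback on allocation failure is handled inside the real loop itself.

#include <linux/bitops.h>
#include <linux/kernel.h>

/* Illustrative sketch of the order walk used by __iommu_alloc_buffer(). */
static int example_pick_order(unsigned long remaining_pages,
			      bool single_pages_only)
{
	static const int orders[] = { 9, 8, 4, 0 };	/* 2M, 1M, 64K, 4K */
	int idx = single_pages_only ? ARRAY_SIZE(orders) - 1 : 0;

	/* Drop down while the remaining count cannot fill the current order. */
	while (orders[idx] && __fls(remaining_pages) < orders[idx])
		idx++;

	return orders[idx];
}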
@@ -1162,22 +1182,24 @@
 	while (count) {
 		int j, order;
 
-		for (order = __fls(count); order > 0; --order) {
-			/*
-			 * We do not want OOM killer to be invoked as long
-			 * as we can fall back to single pages, so we force
-			 * __GFP_NORETRY for orders higher than zero.
-			 */
-			pages[i] = alloc_pages(gfp | __GFP_NORETRY, order);
-			if (pages[i])
-				break;
+		order = iommu_order_array[order_idx];
+
+		/* Drop down when we get small */
+		if (__fls(count) < order) {
+			order_idx++;
+			continue;
 		}
 
-		if (!pages[i]) {
-			/*
-			 * Fall back to single page allocation.
-			 * Might invoke OOM killer as last resort.
-			 */
+		if (order) {
+			/* See if it's easy to allocate a high-order chunk */
+			pages[i] = alloc_pages(gfp | __GFP_NORETRY, order);
+
+			/* Go down a notch at first sign of pressure */
+			if (!pages[i]) {
+				order_idx++;
+				continue;
+			}
+		} else {
 			pages[i] = alloc_pages(gfp, 0);
 			if (!pages[i])
 				goto error;
@@ -1190,7 +1212,7 @@
 				pages[i + j] = pages[i] + j;
 		}
 
-		__dma_clear_buffer(pages[i], PAGE_SIZE << order);
+		__dma_clear_buffer(pages[i], PAGE_SIZE << order, NULL);
 		i += 1 << order;
 		count -= 1 << order;
 	}
@@ -1486,10 +1508,10 @@
 		prot = IOMMU_READ | IOMMU_WRITE;
 		break;
 	case DMA_TO_DEVICE:
-		prot = IOMMU_READ;
+		prot = IOMMU_READ | IOMMU_WRITE;
 		break;
 	case DMA_FROM_DEVICE:
-		prot = IOMMU_WRITE;
+		prot = IOMMU_WRITE | IOMMU_READ;
 		break;
 	default:
 		prot = 0;
@@ -1849,7 +1871,7 @@
 	__dma_page_cpu_to_dev(page, offset, size, dir);
 }
 
-struct dma_map_ops iommu_ops = {
+const struct dma_map_ops iommu_ops = {
 	.alloc		= arm_iommu_alloc_attrs,
 	.free		= arm_iommu_free_attrs,
 	.mmap		= arm_iommu_mmap_attrs,
@@ -1868,7 +1890,7 @@
 	.set_dma_mask	= arm_dma_set_mask,
 };
 
-struct dma_map_ops iommu_coherent_ops = {
+const struct dma_map_ops iommu_coherent_ops = {
 	.alloc		= arm_iommu_alloc_attrs,
 	.free		= arm_iommu_free_attrs,
 	.mmap		= arm_iommu_mmap_attrs,
@@ -1897,13 +1919,19 @@
  * arm_iommu_attach_device function.
  */
 struct dma_iommu_mapping *
-arm_iommu_create_mapping(struct bus_type *bus, dma_addr_t base, u64 size)
+arm_iommu_create_mapping(struct device *dev, struct bus_type *bus,
+				dma_addr_t base, u64 size)
 {
 	unsigned int bits = size >> PAGE_SHIFT;
 	unsigned int bitmap_size = BITS_TO_LONGS(bits) * sizeof(long);
 	struct dma_iommu_mapping *mapping;
 	int extensions = 1;
 	int err = -ENOMEM;
+	struct iommu_domain *domain;
+
+	domain = iommu_get_domain_for_dev(dev);
+	if (domain->handler_token)
+		return (struct dma_iommu_mapping *)domain->handler_token;
 
 	/* currently only 32-bit DMA address space is supported */
 	if (size > DMA_BIT_MASK(32) + 1)
@@ -1935,6 +1963,7 @@
 	mapping->extensions = extensions;
 	mapping->base = base;
 	mapping->bits = BITS_PER_BYTE * bitmap_size;
+	mapping->size = size;
 
 	spin_lock_init(&mapping->lock);
 
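With the signature change above, callers now pass the device so that an already-created fast-mapping domain can be reused via domain->handler_token. A master driver might use the new entry point roughly as follows; the IOVA base and size are arbitrary example values and the function name is hypothetical.

#include <linux/err.h>
#include <linux/sizes.h>
#include <asm/dma-iommu.h>

/* Hypothetical caller of the changed arm_iommu_create_mapping() signature. */
static int example_setup_iommu(struct device *dev)
{
	struct dma_iommu_mapping *mapping;

	/* 1 GB of IOVA space at 0x40000000; both values are placeholders. */
	mapping = arm_iommu_create_mapping(dev, dev->bus, 0x40000000, SZ_1G);
	if (IS_ERR(mapping))
		return PTR_ERR(mapping);

	return arm_iommu_attach_device(dev, mapping);
}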
@@ -1993,17 +2022,37 @@
 }
 EXPORT_SYMBOL_GPL(arm_iommu_release_mapping);
 
+/* Fast mapping is always enabled for now. */
+static bool fast = true;
+
 static int __arm_iommu_attach_device(struct device *dev,
 				     struct dma_iommu_mapping *mapping)
 {
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 	int err;
 
-	err = iommu_attach_device(mapping->domain, dev);
-	if (err)
-		return err;
-
-	kref_get(&mapping->kref);
-	to_dma_iommu_mapping(dev) = mapping;
+	if (fast && !domain->handler_token) {
+		/* Detach the existing mapping and attach the fast-mapping domain. */
+		iommu_detach_device(mapping->domain, dev);
+		err = fast_smmu_attach_device(dev, mapping);
+		if (err)
+			return err;
+		err = iommu_dma_init_domain(domain, mapping->base,
+						mapping->size);
+		if (err)
+			/* TODO: detach the fast-mapping domain on failure? */
+			return err;
+		dev->archdata.dma_ops = &fast_smmu_dma_ops;
+	} else if (domain->handler_token) {
+		dev->archdata.dma_ops = &fast_smmu_dma_ops;
+		dev->archdata.mapping = domain->handler_token;
+		mapping = domain->handler_token;
+		kref_get(&mapping->kref);
+	} else {
+		err = iommu_attach_device(mapping->domain, dev);
+		if (err)
+			return err;
+	}
 
 	pr_debug("Attached IOMMU controller to %s device.\n", dev_name(dev));
 	return 0;
@@ -2031,6 +2080,7 @@
 	if (err)
 		return err;
 
+	/* TODO: Should this be removed when fast DMA mapping is enabled? */
 	set_dma_ops(dev, &iommu_ops);
 	return 0;
 }
@@ -2067,20 +2117,27 @@
 }
 EXPORT_SYMBOL_GPL(arm_iommu_detach_device);
 
-static struct dma_map_ops *arm_get_iommu_dma_map_ops(bool coherent)
+static const struct dma_map_ops *arm_get_iommu_dma_map_ops(struct device *dev,
+				bool coherent)
 {
+	/* If fast DMA mapping is enabled, dma_ops was already set to
+	 * fast_smmu_dma_ops in __arm_iommu_attach_device(), so reuse it.
+	 */
+	if (fast)
+		return dev->archdata.dma_ops;
+
 	return coherent ? &iommu_coherent_ops : &iommu_ops;
 }
 
 static bool arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size,
-				    struct iommu_ops *iommu)
+				    const struct iommu_ops *iommu)
 {
 	struct dma_iommu_mapping *mapping;
 
 	if (!iommu)
 		return false;
 
-	mapping = arm_iommu_create_mapping(dev->bus, dma_base, size);
+	mapping = arm_iommu_create_mapping(dev, dev->bus, dma_base, size);
 	if (IS_ERR(mapping)) {
 		pr_warn("Failed to create %llu-byte IOMMU mapping for device %s\n",
 				size, dev_name(dev));
@@ -2111,7 +2168,7 @@
 #else
 
 static bool arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size,
-				    struct iommu_ops *iommu)
+				    const struct iommu_ops *iommu)
 {
 	return false;
 }
@@ -2122,24 +2179,26 @@
 
 #endif	/* CONFIG_ARM_DMA_USE_IOMMU */
 
-static struct dma_map_ops *arm_get_dma_map_ops(bool coherent)
+static struct dma_map_ops *arm_get_dma_map_ops(struct device *dev,
+				bool coherent)
 {
 	return coherent ? &arm_coherent_dma_ops : &arm_dma_ops;
 }
 
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			struct iommu_ops *iommu, bool coherent)
+			const struct iommu_ops *iommu, bool coherent)
 {
-	struct dma_map_ops *dma_ops;
+	const struct dma_map_ops *dma_ops;
 
 	dev->archdata.dma_coherent = coherent;
 	if (arm_setup_iommu_dma_ops(dev, dma_base, size, iommu))
-		dma_ops = arm_get_iommu_dma_map_ops(coherent);
+		dma_ops = arm_get_iommu_dma_map_ops(dev, coherent);
 	else
-		dma_ops = arm_get_dma_map_ops(coherent);
+		dma_ops = arm_get_dma_map_ops(dev, coherent);
 
 	set_dma_ops(dev, dma_ops);
 }
+EXPORT_SYMBOL(arch_setup_dma_ops);
 
 void arch_teardown_dma_ops(struct device *dev)
 {
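arch_setup_dma_ops() is normally invoked from the bus/OF code while a device is being created; the EXPORT_SYMBOL() added above additionally allows a loadable module to call it for a device it owns. A minimal, purely illustrative sketch, assuming the prototype shown in this patch:

#include <linux/dma-mapping.h>
#include <linux/iommu.h>

/* Hypothetical module-side use of the newly exported symbol. */
static void example_setup_ops(struct device *dev, u64 iova_base, u64 iova_size,
			      const struct iommu_ops *iommu)
{
	/* coherent = false: select the streaming (non-coherent) dma_map_ops. */
	arch_setup_dma_ops(dev, iova_base, iova_size, iommu, false);
}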