--- zzzz-none-000/linux-5.4.213/net/core/skbuff.c	2022-09-15 10:04:56.000000000 +0000
+++ miami-7690-761/linux-5.4.213/net/core/skbuff.c	2024-05-29 11:20:02.000000000 +0000
@@ -60,6 +60,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -75,9 +76,38 @@
 #include
 #include
 #include
+#include

 #include "datagram.h"
+#include "skbuff_recycle.h"
+#include "skbuff_debug.h"
+
+struct kmem_cache *skb_data_cache;
+
+/*
+ * For the low memory profile, NSS_SKB_FIXED_SIZE_2K is enabled and
+ * CONFIG_SKB_RECYCLER is disabled. For the premium and enterprise profiles,
+ * CONFIG_SKB_RECYCLER is enabled and NSS_SKB_FIXED_SIZE_2K is disabled.
+ * Irrespective of whether NSS_SKB_FIXED_SIZE_2K is enabled or disabled,
+ * CONFIG_SKB_RECYCLER and __LP64__ determine the value of SKB_DATA_CACHE_SIZE.
+ */
+#if defined(CONFIG_SKB_RECYCLER)
+/*
+ * 2688 for 64-bit arch, 2624 for 32-bit arch
+ */
+#define SKB_DATA_CACHE_SIZE (SKB_DATA_ALIGN(SKB_RECYCLE_SIZE + NET_SKB_PAD) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+#else
+/*
+ * 2368 for 64-bit arch, 2176 for 32-bit arch
+ */
+#if defined(__LP64__)
+#define SKB_DATA_CACHE_SIZE ((SKB_DATA_ALIGN(1984 + NET_SKB_PAD)) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+#else
+#define SKB_DATA_CACHE_SIZE ((SKB_DATA_ALIGN(1856 + NET_SKB_PAD)) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+#endif
+#endif
+
 struct kmem_cache *skbuff_head_cache __ro_after_init;
 static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
 #ifdef CONFIG_SKB_EXTENSIONS
@@ -138,7 +168,12 @@
     * Try a regular allocation, when that fails and we're not entitled
     * to the reserves, fail.
     */
-    obj = kmalloc_node_track_caller(size,
+    if (size > SZ_2K && size <= SKB_DATA_CACHE_SIZE)
+        obj = kmem_cache_alloc_node(skb_data_cache,
+                        flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+                        node);
+    else
+        obj = kmalloc_node_track_caller(size,
                     flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
                     node);
     if (obj || !(gfp_pfmemalloc_allowed(flags)))
@@ -146,7 +181,10 @@

     /* Try again but now we are using pfmemalloc reserves */
     ret_pfmemalloc = true;
-    obj = kmalloc_node_track_caller(size, flags, node);
+    if (size > SZ_2K && size <= SKB_DATA_CACHE_SIZE)
+        obj = kmem_cache_alloc_node(skb_data_cache, flags, node);
+    else
+        obj = kmalloc_node_track_caller(size, flags, node);

 out:
     if (pfmemalloc)
@@ -197,6 +235,7 @@
     skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
     if (!skb)
         goto out;
+    skbuff_debugobj_init_and_activate(skb);
     prefetchw(skb);

     /* We do our best to align skb_shared_info on a separate cache
@@ -251,6 +290,7 @@
 out:
     return skb;
 nodata:
+    skbuff_debugobj_deactivate(skb);
     kmem_cache_free(cache, skb);
     skb = NULL;
     goto out;
@@ -310,6 +350,7 @@
     skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
     if (unlikely(!skb))
         return NULL;
+    skbuff_debugobj_init_and_activate(skb);

     memset(skb, 0, offsetof(struct sk_buff, tail));

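The SKB_DATA_CACHE_SIZE constants above, and the __kmalloc_reserve() change that routes head allocations in the range (SZ_2K, SKB_DATA_CACHE_SIZE] to the dedicated skb_data_cache slab, can be sanity-checked outside the kernel. The following standalone C sketch reproduces the 2368-byte figure quoted for the 64-bit, non-recycler case; SMP_CACHE_BYTES == 64, NET_SKB_PAD == 64 and sizeof(struct skb_shared_info) == 320 are assumptions about this particular build, not values taken from the patch.

#include <stdio.h>

#define SMP_CACHE_BYTES   64    /* assumed L1 cache line size */
#define NET_SKB_PAD       64    /* assumed default headroom padding */
#define SHINFO_SIZE       320   /* assumed sizeof(struct skb_shared_info) on 64-bit */
#define SKB_DATA_ALIGN(x) (((x) + (SMP_CACHE_BYTES - 1)) & ~(SMP_CACHE_BYTES - 1))

int main(void)
{
    unsigned long size = SKB_DATA_ALIGN(1984 + NET_SKB_PAD) +
                         SKB_DATA_ALIGN(SHINFO_SIZE);

    /* Should print 2368, matching the "2368 for 64-bit arch" comment above. */
    printf("SKB_DATA_CACHE_SIZE (no recycler, 64-bit): %lu\n", size);
    return 0;
}

With that figure, any skb head request larger than 2 KiB but no larger than SKB_DATA_CACHE_SIZE is served from the fixed-size slab instead of the generic kmalloc caches.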
@@ -411,7 +452,7 @@
 /**
  * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
  * @dev: network device to receive on
- * @len: length to allocate
+ * @length: length to allocate
  * @gfp_mask: get_free_pages mask, passed to alloc_skb
  *
  * Allocate a new &sk_buff and assign it a usage count of one. The
@@ -421,22 +462,48 @@
  *
  * %NULL is returned if there is no free memory.
  */
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
-                   gfp_t gfp_mask)
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
+                   unsigned int length, gfp_t gfp_mask)
 {
-    struct page_frag_cache *nc;
     struct sk_buff *skb;
+    unsigned int len = length;
+
+#ifdef CONFIG_SKB_RECYCLER
+    bool reset_skb = true;
+    skb = skb_recycler_alloc(dev, length, reset_skb);
+    if (likely(skb)) {
+        skb->recycled_for_ds = 0;
+        return skb;
+    }
+
+    len = SKB_RECYCLE_SIZE;
+    if (unlikely(length > SKB_RECYCLE_SIZE))
+        len = length;
+
+    skb = __alloc_skb(len + NET_SKB_PAD, gfp_mask,
+              SKB_ALLOC_RX, NUMA_NO_NODE);
+    if (!skb)
+        goto skb_fail;
+
+    goto skb_success;
+#else
+    struct page_frag_cache *nc;
     bool pfmemalloc;
+    bool page_frag_alloc_enable = true;
     void *data;

     len += NET_SKB_PAD;
+#ifdef CONFIG_ALLOC_SKB_PAGE_FRAG_DISABLE
+    page_frag_alloc_enable = false;
+#endif

     /* If requested length is either too small or too big,
      * we use kmalloc() for skb->head allocation.
      */
     if (len <= SKB_WITH_OVERHEAD(1024) ||
         len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
-        (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
+        (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA)) ||
+        !page_frag_alloc_enable) {
         skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
         if (!skb)
             goto skb_fail;
@@ -474,6 +541,7 @@
     if (pfmemalloc)
         skb->pfmemalloc = 1;
     skb->head_frag = 1;
+#endif

 skb_success:
     skb_reserve(skb, NET_SKB_PAD);
@@ -484,6 +552,134 @@
 }
 EXPORT_SYMBOL(__netdev_alloc_skb);

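The recycler branch above changes the allocation policy: the per-CPU recycle pool is tried first, and only on a miss does the code fall back to __alloc_skb(), rounding small requests up to SKB_RECYCLE_SIZE so the resulting buffer can be recycled later. A minimal userspace model of that policy is sketched below; pool_get() and slab_alloc() are stand-ins for skb_recycler_alloc() and __alloc_skb(), and the 2304-byte SKB_RECYCLE_SIZE is an assumed value (it lives in skbuff_recycle.h, not in this file).

#include <stdio.h>
#include <stdlib.h>

#define SKB_RECYCLE_SIZE 2304   /* assumed fixed recycle buffer size */

static void *pool_get(unsigned int len)
{
    /* The recycler can only satisfy requests that fit its fixed size;
     * a real pool may also simply be empty.
     */
    return len <= SKB_RECYCLE_SIZE ? malloc(SKB_RECYCLE_SIZE) : NULL;
}

static void *slab_alloc(unsigned int len)
{
    return malloc(len);
}

static void *alloc_rx_buf(unsigned int length)
{
    void *buf = pool_get(length);
    unsigned int len;

    if (buf)
        return buf;     /* fast path: reuse a recycled buffer */

    /* Slow path: allocate at least SKB_RECYCLE_SIZE so the buffer can be
     * recycled later, but honour larger requests as-is.
     */
    len = length > SKB_RECYCLE_SIZE ? length : SKB_RECYCLE_SIZE;
    return slab_alloc(len);
}

int main(void)
{
    void *a = alloc_rx_buf(1500);   /* served by the pool model */
    void *b = alloc_rx_buf(9000);   /* too big, falls back to the allocator */

    printf("1500-byte request: %p, 9000-byte request: %p\n", a, b);
    free(a);
    free(b);
    return 0;
}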
+#ifdef CONFIG_SKB_RECYCLER
+/* __netdev_alloc_skb_no_skb_reset - allocate an skbuff for rx on a specific device
+ * @dev: network device to receive on
+ * @length: length to allocate
+ * @gfp_mask: get_free_pages mask, passed from the wifi driver
+ *
+ * Allocate a new &sk_buff and assign it a usage count of one. The
+ * buffer has NET_SKB_PAD headroom built in. Users should allocate
+ * the headroom they think they need without accounting for the
+ * built in space. The built in space is used for optimisations.
+ *
+ * Currently only the DS path uses __netdev_alloc_skb_no_skb_reset; it
+ * invokes skb_recycler_alloc() with reset_skb set to false. The recycler
+ * pool therefore does not reset the skb structure when it hands a
+ * DS-used buffer back to the DS module, which improves performance.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__netdev_alloc_skb_no_skb_reset(struct net_device *dev,
+                        unsigned int length, gfp_t gfp_mask)
+{
+    struct sk_buff *skb;
+    unsigned int len = length;
+    bool reset_skb = false;
+
+#ifdef CONFIG_SKB_RECYCLER
+    skb = skb_recycler_alloc(dev, length, reset_skb);
+    if (likely(skb)) {
+        /* SKBs in the recycler are from various unknown sources.
+         * Their truesize is unknown. We should set truesize
+         * as the needed buffer size before using it.
+         */
+        skb->truesize = SKB_TRUESIZE(SKB_DATA_ALIGN(len + NET_SKB_PAD));
+        skb->fast_recycled = 0;
+        skb->fast_qdisc = 0;
+        return skb;
+    }
+
+    len = SKB_RECYCLE_SIZE;
+    if (unlikely(length > SKB_RECYCLE_SIZE))
+        len = length;
+
+    skb = __alloc_skb(len + NET_SKB_PAD, gfp_mask,
+              SKB_ALLOC_RX, NUMA_NO_NODE);
+    if (!skb)
+        goto skb_fail;
+
+    /* Set truesize as the needed buffer size
+     * rather than the allocated size by __alloc_skb().
+     */
+    if (length + NET_SKB_PAD < SKB_WITH_OVERHEAD(PAGE_SIZE))
+        skb->truesize = SKB_TRUESIZE(SKB_DATA_ALIGN(length + NET_SKB_PAD));
+
+    goto skb_success;
+#else
+    struct page_frag_cache *nc;
+    bool pfmemalloc;
+    bool page_frag_alloc_enable = true;
+    void *data;
+
+    len += NET_SKB_PAD;
+#ifdef CONFIG_ALLOC_SKB_PAGE_FRAG_DISABLE
+    page_frag_alloc_enable = false;
+#endif
+    /* If requested length is either too small or too big,
+     * we use kmalloc() for skb->head allocation.
+     */
+    if (len <= SKB_WITH_OVERHEAD(1024) ||
+        len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
+        (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA)) ||
+        !page_frag_alloc_enable) {
+        skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+        if (!skb)
+            goto skb_fail;
+        goto skb_success;
+    }
+
+    len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+    len = SKB_DATA_ALIGN(len);
+
+    if (sk_memalloc_socks())
+        gfp_mask |= __GFP_MEMALLOC;
+
+    if (in_irq() || irqs_disabled()) {
+        nc = this_cpu_ptr(&netdev_alloc_cache);
+        data = page_frag_alloc(nc, len, gfp_mask);
+        pfmemalloc = nc->pfmemalloc;
+    } else {
+        local_bh_disable();
+        nc = this_cpu_ptr(&napi_alloc_cache.page);
+        data = page_frag_alloc(nc, len, gfp_mask);
+        pfmemalloc = nc->pfmemalloc;
+        local_bh_enable();
+    }
+
+    if (unlikely(!data))
+        return NULL;
+
+    skb = __build_skb(data, len);
+    if (unlikely(!skb)) {
+        skb_free_frag(data);
+        return NULL;
+    }
+
+    /* use OR instead of assignment to avoid clearing of bits in mask */
+    if (pfmemalloc)
+        skb->pfmemalloc = 1;
+    skb->head_frag = 1;
+#endif
+
+skb_success:
+    skb_reserve(skb, NET_SKB_PAD);
+    skb->dev = dev;
+
+skb_fail:
+    return skb;
+}
+EXPORT_SYMBOL(__netdev_alloc_skb_no_skb_reset);
+#else
+struct sk_buff *__netdev_alloc_skb_no_skb_reset(struct net_device *dev,
+                        unsigned int length, gfp_t gfp_mask)
+{
+    return __netdev_alloc_skb(dev, length, gfp_mask);
+}
+EXPORT_SYMBOL(__netdev_alloc_skb_no_skb_reset);
+#endif
+
 /**
  * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
  * @napi: napi instance this buffer was allocated for
@@ -549,6 +745,22 @@
 }
 EXPORT_SYMBOL(__napi_alloc_skb);

+struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev,
+                        unsigned int length, gfp_t gfp)
+{
+    struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp);
+
+#ifdef CONFIG_ETHERNET_PACKET_MANGLE
+    if (dev && (dev->priv_flags & IFF_NO_IP_ALIGN))
+        return skb;
+#endif
+
+    if (NET_IP_ALIGN && skb)
+        skb_reserve(skb, NET_IP_ALIGN);
+    return skb;
+}
+EXPORT_SYMBOL(__netdev_alloc_skb_ip_align);
+
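__netdev_alloc_skb_ip_align() reserves NET_IP_ALIGN bytes (unless IFF_NO_IP_ALIGN is set) so that the IP header following a 14-byte Ethernet header ends up on a 4-byte boundary. The small userspace sketch below illustrates the offset arithmetic; the constants are the usual defaults and are assumed here rather than taken from this tree.

#include <stdio.h>

#define NET_IP_ALIGN  2     /* default value on most architectures */
#define ETH_HLEN      14    /* Ethernet header length */

int main(void)
{
    unsigned int data_off = 0;

    printf("no align:   IP header at offset %u (mod 4 = %u)\n",
           data_off + ETH_HLEN, (data_off + ETH_HLEN) % 4);

    data_off += NET_IP_ALIGN;   /* what skb_reserve(skb, NET_IP_ALIGN) does */
    printf("with align: IP header at offset %u (mod 4 = %u)\n",
           data_off + ETH_HLEN, (data_off + ETH_HLEN) % 4);
    return 0;
}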
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
              int size, unsigned int truesize)
 {
@@ -600,7 +812,7 @@
     kfree(head);
 }

-static void skb_release_data(struct sk_buff *skb)
+void skb_release_data(struct sk_buff *skb)
 {
     struct skb_shared_info *shinfo = skb_shinfo(skb);
     int i;
@@ -623,12 +835,13 @@
 /*
  * Free an skbuff by memory without cleaning the state.
  */
-static void kfree_skbmem(struct sk_buff *skb)
+void kfree_skbmem(struct sk_buff *skb)
 {
     struct sk_buff_fclones *fclones;

     switch (skb->fclone) {
     case SKB_FCLONE_UNAVAILABLE:
+        skbuff_debugobj_deactivate(skb);
         kmem_cache_free(skbuff_head_cache, skb);
         return;

@@ -649,7 +862,9 @@
     }
     if (!refcount_dec_and_test(&fclones->fclone_ref))
         return;
+
 fastpath:
+    skbuff_debugobj_deactivate(&fclones->skb1);
     kmem_cache_free(skbuff_fclone_cache, fclones);
 }

@@ -664,6 +879,9 @@
     nf_conntrack_put(skb_nfct(skb));
 #endif
     skb_ext_put(skb);
+#if defined(CONFIG_AVM_PA_GENERIC_CT)
+    generic_ct_put(SKB_GENERIC_CT(skb));
+#endif
 }

 /* Free everything but the sk_buff shell. */
@@ -843,12 +1061,85 @@
     if (!skb_unref(skb))
         return;

+    prefetch(&skb->destructor);
+
+    /* Tian: Not sure if we still need the block below, since
+     * skb_unref() already does the work in 5.4.
+     */
+
+    /*
+    if (likely(atomic_read(&skb->users) == 1))
+        smp_rmb();
+    else if (likely(!atomic_dec_and_test(&skb->users)))
+        return;
+    */
+
+    /* If possible we'd like to recycle any skb rather than just free it,
+     * but in order to do that we need to release any head state too.
+     * We don't want to do this later because we'll be in a pre-emption
+     * disabled state.
+     */
+    skb_release_head_state(skb);
+
+    /* Can we recycle this skb? If we can then it will be much faster
+     * for us to recycle this one later than to allocate a new one
+     * from scratch.
+     */
+    if (likely(skb->head) && likely(skb_recycler_consume(skb)))
+        return;
+
     trace_consume_skb(skb);
-    __kfree_skb(skb);
+
+    /* We're not recycling so now we need to do the rest of what we would
+     * have done in __kfree_skb (above and beyond the skb_release_head_state
+     * that we already did).
+     */
+    if (likely(skb->head))
+        skb_release_data(skb);
+
+    kfree_skbmem(skb);
 }
 EXPORT_SYMBOL(consume_skb);

 /**
+ * consume_skb_list_fast - free a list of skbs
+ * @skb_list: head of the buffer list
+ *
+ * Add the given list of SKBs to the per-CPU recycle list. The assumption is
+ * that these buffers were originally allocated from the skb recycler and were
+ * transmitted through a controlled fast-xmit path, which removes the need for
+ * additional checks before recycling the buffers back to the pool.
+ */
+void consume_skb_list_fast(struct sk_buff_head *skb_list)
+{
+    struct sk_buff *skb = NULL;
+
+    if (likely(skb_recycler_consume_list_fast(skb_list))) {
+        return;
+    }
+
+    while ((skb = skb_dequeue(skb_list)) != NULL) {
+        /*
+         * Check if release head state is needed
+         */
+        skb_release_head_state(skb);
+
+        trace_consume_skb(skb);
+
+        /*
+         * We're not recycling so now we need to do the rest of what we would
+         * have done in __kfree_skb (above and beyond the skb_release_head_state
+         * that we already did).
+         */
+        if (likely(skb->head))
+            skb_release_data(skb);
+
+        kfree_skbmem(skb);
+    }
+}
+EXPORT_SYMBOL(consume_skb_list_fast);
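consume_skb_list_fast() is meant for completion paths that only ever see buffers coming from the recycler fast-xmit path. A hedged sketch of how a driver TX-completion handler might batch buffers for it is shown below; struct my_ring and my_fetch_completed_skb() are hypothetical driver helpers, not APIs from this patch.

#include <linux/skbuff.h>

struct my_ring;                                                 /* hypothetical */
struct sk_buff *my_fetch_completed_skb(struct my_ring *ring);   /* hypothetical */
void consume_skb_list_fast(struct sk_buff_head *skb_list);      /* added above */

static void my_tx_complete(struct my_ring *ring)
{
    struct sk_buff_head list;
    struct sk_buff *skb;

    __skb_queue_head_init(&list);

    /* Collect every completed buffer into one local list. */
    while ((skb = my_fetch_completed_skb(ring)) != NULL)
        __skb_queue_tail(&list, skb);

    /* Hand the whole batch back: the recycler takes it in one shot,
     * or each skb falls through the normal release path.
     */
    consume_skb_list_fast(&list);
}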
+
+/**
  * consume_stateless_skb - free an skbuff, assuming it is stateless
  * @skb: buffer to free
  *
@@ -953,6 +1244,16 @@
     memcpy(&new->headers_start, &old->headers_start,
            offsetof(struct sk_buff, headers_end) -
            offsetof(struct sk_buff, headers_start));
+
+    /* Clear the skb recycler flags here to make sure any skb whose size
+     * has been altered is not put back into the recycler pool.
+     */
+    new->fast_xmit = 0;
+    new->is_from_recycler = 0;
+    new->fast_recycled = 0;
+    new->recycled_for_ds = 0;
+    new->fast_qdisc = 0;
+    new->int_pri = 0;
     CHECK_SKB_FIELD(protocol);
     CHECK_SKB_FIELD(csum);
     CHECK_SKB_FIELD(hash);
@@ -980,7 +1281,6 @@
 #ifdef CONFIG_NET_SCHED
     CHECK_SKB_FIELD(tc_index);
 #endif
-
 }

 /*
@@ -994,6 +1294,12 @@
     n->next = n->prev = NULL;
     n->sk = NULL;
     __copy_skb_header(n, skb);
+    /* Not to be copied by __copy_skb_header(). __copy_skb_header() is used
+     * during segmentation. Copies created by that function may not inherit
+     * the same pkt_info because avm_pa cannot tell them apart.
+     */
+    if (IS_ENABLED(CONFIG_AVM_PA))
+        memcpy(AVM_PKT_INFO(n), AVM_PKT_INFO(skb), sizeof(struct avm_pa_pkt_info));

     C(len);
     C(data_len);
@@ -1457,6 +1763,7 @@
         n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
         if (!n)
             return NULL;
+        skbuff_debugobj_init_and_activate(n);

         n->fclone = SKB_FCLONE_UNAVAILABLE;
     }
@@ -1484,6 +1791,12 @@
 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
 {
     __copy_skb_header(new, old);
+    /* Not to be copied by __copy_skb_header(). __copy_skb_header() is used
+     * during segmentation. Copies created by that function may not inherit
+     * the same pkt_info because avm_pa cannot tell them apart.
+     */
+    if (IS_ENABLED(CONFIG_AVM_PA))
+        memcpy(AVM_PKT_INFO(new), AVM_PKT_INFO(old), sizeof(struct avm_pa_pkt_info));

     skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
     skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
@@ -1696,6 +2009,16 @@
     if (!skb->sk || skb->destructor == sock_edemux)
         skb->truesize += size - osize;

+    /* Clear the skb recycler flags here to make sure any skb whose size
+     * has been expanded is not put back into the recycler.
+     */
+    skb->fast_xmit = 0;
+    skb->is_from_recycler = 0;
+    skb->fast_recycled = 0;
+    skb->recycled_for_ds = 0;
+    skb->fast_qdisc = 0;
+    skb->int_pri = 0;
+
     return 0;

 nofrags:
@@ -4173,6 +4496,11 @@

 void __init skb_init(void)
 {
+    skb_data_cache = kmem_cache_create_usercopy("skb_data_cache",
+                            SKB_DATA_CACHE_SIZE,
+                            0, SLAB_PANIC, 0, SKB_DATA_CACHE_SIZE,
+                            NULL);
+
     skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
                           sizeof(struct sk_buff),
                           0,
@@ -4186,6 +4514,10 @@
                           SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                           NULL);
     skb_extensions_init();
+    skb_recycler_init();
+#if defined(CONFIG_AVM_PA_GENERIC_CT)
+    generic_ct_init();
+#endif
 }

 static int
@@ -5039,6 +5371,7 @@
 {
     if (head_stolen) {
         skb_release_head_state(skb);
+        skbuff_debugobj_deactivate(skb);
         kmem_cache_free(skbuff_head_cache, skb);
     } else {
         __kfree_skb(skb);
@@ -5150,7 +5483,10 @@
     skb->ignore_df = 0;
     skb_dst_drop(skb);
     skb_ext_reset(skb);
-    nf_reset_ct(skb);
+    /* TMA/MQU 20170411: Is this the right thing for namespace
+     * changes? We think so. See JZ-30001.
+     */
+    nf_reset_no_generic_ct(skb);
     nf_reset_trace(skb);

 #ifdef CONFIG_NET_SWITCHDEV
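skb_init() above creates skb_data_cache with kmem_cache_create_usercopy(), whitelisting the whole object for user copies (useroffset 0, usersize equal to the object size). The module sketch below shows the same pattern in isolation; the cache name and the 2368-byte object size are illustrative assumptions, not values exported by this patch.

#include <linux/module.h>
#include <linux/slab.h>

#define MY_OBJ_SIZE 2368    /* assumed, mirrors SKB_DATA_CACHE_SIZE on 64-bit */

static struct kmem_cache *my_cache;

static int __init my_cache_init(void)
{
    /* Whole object usercopy-whitelisted: useroffset 0, usersize == size. */
    my_cache = kmem_cache_create_usercopy("my_data_cache", MY_OBJ_SIZE, 0,
                                          SLAB_HWCACHE_ALIGN, 0, MY_OBJ_SIZE,
                                          NULL);
    return my_cache ? 0 : -ENOMEM;
}

static void __exit my_cache_exit(void)
{
    kmem_cache_destroy(my_cache);
}

module_init(my_cache_init);
module_exit(my_cache_exit);
MODULE_LICENSE("GPL");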