--- zzzz-none-000/linux-4.4.271/net/core/skbuff.c 2021-06-03 06:22:09.000000000 +0000 +++ hawkeye-5590-750/linux-4.4.271/net/core/skbuff.c 2023-04-19 10:22:30.000000000 +0000 @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -77,6 +78,20 @@ #include #include +#include "skbuff_recycle.h" +#include "skbuff_debug.h" + +#if defined(CONFIG_SLUB_AVM_ALLOC_LIST) +#include +#include +#include +#include +#include +#include +#endif + +#include + struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; @@ -166,6 +181,7 @@ gfp_mask & ~__GFP_DMA, node); if (!skb) goto out; + skbuff_debugobj_init_and_activate(skb); /* * Only clear those fields we need to clear, not those that we will @@ -218,6 +234,7 @@ skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); if (!skb) goto out; + skbuff_debugobj_init_and_activate(skb); prefetchw(skb); /* We do our best to align skb_shared_info on a separate cache @@ -275,6 +292,7 @@ out: return skb; nodata: + skbuff_debugobj_deactivate(skb); kmem_cache_free(cache, skb); skb = NULL; goto out; @@ -309,6 +327,7 @@ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); if (!skb) return NULL; + skbuff_debugobj_init_and_activate(skb); size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); @@ -398,7 +417,7 @@ /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on - * @len: length to allocate + * @length: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb * * Allocate a new &sk_buff and assign it a usage count of one. The @@ -408,23 +427,60 @@ * * %NULL is returned if there is no free memory. 
*/ -struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, - gfp_t gfp_mask) +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, + unsigned int length, gfp_t gfp_mask) { +#ifndef CONFIG_SKB_RECYCLER struct page_frag_cache *nc; unsigned long flags; - struct sk_buff *skb; bool pfmemalloc; + bool page_frag_alloc_enable = true; void *data; +#endif + + struct sk_buff *skb; + unsigned int len = length; + +#ifdef CONFIG_SKB_RECYCLER + skb = skb_recycler_alloc(dev, length); + if (likely(skb)) { + /* SKBs in the recycler are from various unknown sources. + * Their truesize is unknown. We should set truesize + * as the needed buffer size before using it. + */ + skb->truesize = SKB_TRUESIZE(SKB_DATA_ALIGN(len + NET_SKB_PAD)); + return skb; + } + len = SKB_RECYCLE_SIZE; + if (unlikely(length > SKB_RECYCLE_SIZE)) + len = length; + + skb = __alloc_skb(len + NET_SKB_PAD, gfp_mask, + SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; + + /* Set truesize as the needed buffer size + * rather than the allocated size by __alloc_skb(). + */ + if (length + NET_SKB_PAD < SKB_WITH_OVERHEAD(PAGE_SIZE)) + skb->truesize = SKB_TRUESIZE(SKB_DATA_ALIGN(length + NET_SKB_PAD)); + + goto skb_success; +#else len += NET_SKB_PAD; +#ifdef CONFIG_ALLOC_SKB_PAGE_FRAG_DISABLE + page_frag_alloc_enable = false; +#endif /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. 
*/ if (len <= SKB_WITH_OVERHEAD(1024) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || - (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA)) || + !page_frag_alloc_enable) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) goto skb_fail; @@ -458,6 +514,7 @@ if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; +#endif skb_success: skb_reserve(skb, NET_SKB_PAD); @@ -533,6 +590,22 @@ } EXPORT_SYMBOL(__napi_alloc_skb); +struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev, + unsigned int length, gfp_t gfp) +{ + struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp); + +#ifdef CONFIG_ETHERNET_PACKET_MANGLE + if (dev && (dev->priv_flags & IFF_NO_IP_ALIGN)) + return skb; +#endif + + if (NET_IP_ALIGN && skb) + skb_reserve(skb, NET_IP_ALIGN); + return skb; +} +EXPORT_SYMBOL(__netdev_alloc_skb_ip_align); + void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize) { @@ -584,7 +657,7 @@ kfree(head); } -static void skb_release_data(struct sk_buff *skb) +void skb_release_data(struct sk_buff *skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); int i; @@ -618,12 +691,13 @@ /* * Free an skbuff by memory without cleaning the state. 
*/ -static void kfree_skbmem(struct sk_buff *skb) +void kfree_skbmem(struct sk_buff *skb) { struct sk_buff_fclones *fclones; switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: + skbuff_debugobj_deactivate(skb); kmem_cache_free(skbuff_head_cache, skb); return; @@ -644,7 +718,9 @@ } if (!atomic_dec_and_test(&fclones->fclone_ref)) return; + fastpath: + skbuff_debugobj_deactivate(&fclones->skb1); kmem_cache_free(skbuff_fclone_cache, fclones); } @@ -661,6 +737,9 @@ #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb->nfct); #endif +#if IS_ENABLED(CONFIG_AVM_PA_GENERIC_CT) + generic_ct_put(SKB_GENERIC_CT(skb)); +#endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); #endif @@ -753,12 +832,38 @@ { if (unlikely(!skb)) return; + + prefetch(&skb->destructor); + if (likely(atomic_read(&skb->users) == 1)) smp_rmb(); else if (likely(!atomic_dec_and_test(&skb->users))) return; + + /* If possible we'd like to recycle any skb rather than just free it, + * but in order to do that we need to release any head state too. + * We don't want to do this later because we'll be in a pre-emption + * disabled state. + */ + skb_release_head_state(skb); + + /* Can we recycle this skb? If we can then it will be much faster + * for us to recycle this one later than to allocate a new one + * from scratch. + */ + if (likely(skb->head) && likely(skb_recycler_consume(skb))) + return; + trace_consume_skb(skb); - __kfree_skb(skb); + + /* We're not recycling so now we need to do the rest of what we would + * have done in __kfree_skb (above and beyond the skb_release_head_state + * that we already did). 
+ */ + if (likely(skb->head)) + skb_release_data(skb); + + kfree_skbmem(skb); } EXPORT_SYMBOL(consume_skb); @@ -774,6 +879,9 @@ new->tstamp = old->tstamp; /* We do not copy old->sk */ new->dev = old->dev; +#if IS_ENABLED(CONFIG_AVM_NET_SKB_INPUT_DEV) + new->input_dev = old->input_dev; +#endif memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); #ifdef CONFIG_XFRM @@ -833,6 +941,12 @@ n->next = n->prev = NULL; n->sk = NULL; __copy_skb_header(n, skb); + /* Not to be copied by __copy_skb_header(). __copy_skb_header() is used + * during segmentation. Copies created by that function may not inherit + * the same pkt_info because avm_pa cannot tell them apart. + */ + if (IS_ENABLED(CONFIG_AVM_PA)) + memcpy(AVM_PKT_INFO(n), AVM_PKT_INFO(skb), sizeof(struct avm_pa_pkt_info)); C(len); C(data_len); @@ -971,6 +1085,7 @@ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); if (!n) return NULL; + skbuff_debugobj_init_and_activate(n); kmemcheck_annotate_bitfield(n, flags1); n->fclone = SKB_FCLONE_UNAVAILABLE; @@ -998,6 +1113,12 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { __copy_skb_header(new, old); + /* Not to be copied by __copy_skb_header(). __copy_skb_header() is used + * during segmentation. Copies created by that function may not inherit + * the same pkt_info because avm_pa cannot tell them apart. 
+ */ + if (IS_ENABLED(CONFIG_AVM_PA)) + memcpy(AVM_PKT_INFO(new), AVM_PKT_INFO(old), sizeof(struct avm_pa_pkt_info)); skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; @@ -3402,6 +3523,10 @@ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + skb_recycler_init(); +#if IS_ENABLED(CONFIG_AVM_PA_GENERIC_CT) + generic_ct_init(); +#endif } static int @@ -4214,6 +4339,7 @@ { if (head_stolen) { skb_release_head_state(skb); + skbuff_debugobj_deactivate(skb); kmem_cache_free(skbuff_head_cache, skb); } else { __kfree_skb(skb); @@ -4322,7 +4448,10 @@ skb_dst_drop(skb); skb_sender_cpu_clear(skb); secpath_reset(skb); - nf_reset(skb); + /* TMA/MQU 20170411: Is this the right thing for namespace + * changes? We think so. See JZ-30001. + */ + nf_reset_no_generic_ct(skb); nf_reset_trace(skb); if (!xnet) @@ -4607,3 +4736,445 @@ return NULL; } EXPORT_SYMBOL(alloc_skb_with_frags);
+
+#if defined(CONFIG_SLUB_AVM_ALLOC_LIST)
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
+
+struct _skb_id_countlist {
+	unsigned long id;
+	unsigned long count;
+};
+
+enum _format_type {
+	is_proto = 0,
+	is_symbol,
+	is_skblen,
+	is_netdev,
+	is_slab,
+};
+
+#define SKB_COUNT_ID_ENTRIES 32
+struct _skb_class {
+	struct _skb_id_countlist countlist[SKB_COUNT_ID_ENTRIES];
+	void (*skb_class_cb)(struct sk_buff *skb, struct _skb_id_countlist *countlist);
+	const char *name;
+	enum _format_type type;
+};
+
+static char devname[SKB_COUNT_ID_ENTRIES][IFNAMSIZ];
+
+/**
+ * @brief map the name of a net_device to a small id
+ * @param dev device taken from a pending skb, may be NULL or no longer valid
+ *
+ * The pointer and the name are checked before use and the name is copied
+ * into devname[], so that it can be printed without touching dev again.
+ *
+ * @return 0 if dev is NULL or its name is empty, otherwise the 1-based
+ * index of the name in devname[]; -EINVAL if the pointer or the name is
+ * invalid, -ENOMEM if devname[] is full
+ */
+static int devname_to_id(struct net_device *dev)
+{
+	char devname_tmp[IFNAMSIZ];
+	unsigned int i;
+	const char *p;
+
+	if (!dev)
+		return 0;
+
+	p = &dev->name[0];
+	if (virt_addr_valid(p) == 0)
+		return -EINVAL;
+	if (virt_addr_valid(p + IFNAMSIZ - 1) == 0)
+		return -EINVAL;
+
+	/* support only kmalloc-allocated devices, otherwise some cases cause a misunderstood DBE */
+	if (!PageSlab(virt_to_head_page(p)))
+		return -EINVAL;
+
+	/* Copy the name up to its terminating NUL; never read beyond the
+	 * IFNAMSIZ bytes that were validated above.
+	 */
+	for (i = 0; i < IFNAMSIZ; i++) {
+		char c = p[i];
+
+		if (c == 0)
+			break;
+		if (!isascii(c))
+			return -EINVAL;
+		devname_tmp[i] = c;
+	}
+	/* no terminating NUL within the name */
+	if (i == IFNAMSIZ)
+		return -EINVAL;
+	/* an empty name must not be stored, devname[n][0] == 0 marks a free slot */
+	if (i == 0)
+		return 0;
+	devname_tmp[i] = 0;
+
+	for (i = 0; i < ARRAY_SIZE(devname); i++) {
+		if (devname[i][0] == 0)
+			break;
+		if (strncmp(devname[i], devname_tmp, IFNAMSIZ) == 0)
+			/* entry found */
+			return i + 1;
+	}
+	if (i < ARRAY_SIZE(devname)) {
+		/* append; devname_tmp is NUL-terminated, strncpy() zero-pads the slot */
+		strncpy(devname[i], devname_tmp, IFNAMSIZ);
+		return i + 1;
+	}
+	return -ENOMEM;
+}
+
+/**
+ * @brief count one skb for id in a fixed-size count list
+ * @param id key to count
+ * @param countlist array of SKB_COUNT_ID_ENTRIES entries
+ *
+ * The first entry that already holds id or is still unused (count == 0)
+ * is taken.
+ *
+ * @return 0 on success, -ENOMEM if all entries are used by other ids
+ */
+static int count_skb_id(unsigned long id, struct _skb_id_countlist *countlist)
+{
+	unsigned int entry;
+
+	for (entry = 0; entry < SKB_COUNT_ID_ENTRIES; entry++) {
+		if (countlist[entry].id == id ||
+		    countlist[entry].count == 0) {
+			countlist[entry].id = id;
+			countlist[entry].count++;
+			return 0;
+		}
+	}
+	return -ENOMEM;
+}
+
+/**
+ * @brief count all skb with same protocol
+ */
+static void skb_class_list_cb_protocol(struct sk_buff *skb, struct _skb_id_countlist *countlist)
+{
+	count_skb_id(skb->protocol, countlist);
+}
+
+/**
+ * @brief count all skb with same netdev
+ * the device name is copied by devname_to_id(), no reference is taken
+ */
+static void skb_class_list_cb_netdev(struct sk_buff *skb, struct _skb_id_countlist *countlist)
+{
+	unsigned long id = devname_to_id(skb->dev);
+
+	count_skb_id(id, countlist);
+}
+
+/**
+ * @brief count all skb's with same destructor
+ */
+static void skb_class_list_cb_destructor(struct sk_buff *skb, struct _skb_id_countlist *countlist)
+{
+	count_skb_id((unsigned long)skb->destructor, countlist);
+}
+
+/**
+ * @brief count all skb with same vlan_proto
+ */
+static void skb_class_list_cb_vlan_proto(struct sk_buff *skb, struct _skb_id_countlist *countlist)
+{
+	count_skb_id(skb->vlan_proto, countlist);
+}
+
+/**
+ * @brief count all skb with valid sk or sk == null
+ * careful try to get slab-cachepool-name-pointer as id if sk == slab
+ *
+ */
+static void skb_class_list_cb_socket(struct sk_buff *skb, struct _skb_id_countlist *countlist)
+{
+	struct kmem_cache *s;
+	struct page *page;
+	void *sk = READ_ONCE(skb->sk);
+
+	if (sk == 0) {
+		count_skb_id(0, countlist);
+		return;
+	}
+	if (!virt_addr_valid(sk)) {
+		count_skb_id(-EINVAL, countlist);
+		return;
+	}
+	page = virt_to_head_page(sk);
+	if (virt_addr_valid(page) && PageSlab(page)) {
+		s = page->slab_cache;
+
+		if (virt_addr_valid(s) && virt_addr_valid(s->name)) {
+			count_skb_id((unsigned long)s->name, countlist);
+			return;
+		}
+	}
+	count_skb_id(-EINVAL, countlist);
+}
+
+/**
+ * @brief count all skb with skb_iif
+ */
+static void skb_class_list_cb_iif(struct sk_buff *skb, struct _skb_id_countlist *countlist)
+{
+	count_skb_id(skb->skb_iif, countlist);
+}
+
+#define PACKET_LEN_AREA (ETH_FRAME_LEN + ETH_FCS_LEN)
+#define PACKET_LEN_OFFSET SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
+/**
+ * @brief count all skb len (areas)
+ */
+static void skb_class_list_cb_len(struct sk_buff *skb, struct _skb_id_countlist *countlist)
+{
+	unsigned int len = skb->len;
+
+	if (len >= PACKET_LEN_OFFSET) {
+		len -= PACKET_LEN_OFFSET;
+		count_skb_id((len / PACKET_LEN_AREA) + 1, countlist);
+		return;
+	}
+	count_skb_id(0, countlist);
+}
+
+#ifdef CONFIG_AVM_PA
+/**
+ * @brief count all skb with pktinfo.ingress_pid_handle
+ */
+static void skb_class_list_cb_avm_pa_ingress_pid_handle(struct sk_buff *skb, struct _skb_id_countlist *countlist)
+{
+	count_skb_id(AVM_PKT_INFO(skb)->ingress_pid_handle, countlist);
+}
+
+/**
+ * @brief count all skb with pktinfo.egress_pid_handle
+ */
+static void skb_class_list_cb_avm_pa_egress_pid_handle(struct sk_buff *skb, struct _skb_id_countlist *countlist)
+{
+	count_skb_id(AVM_PKT_INFO(skb)->egress_pid_handle, countlist);
+}
+#endif
+
+static struct _skb_class skb_class_list[] = {
+	{ .skb_class_cb = skb_class_list_cb_protocol, .name = "protocol", .type = is_proto },
+	{ .skb_class_cb = skb_class_list_cb_vlan_proto, .name = "vlan_proto", .type = is_proto },
+	{ .skb_class_cb = skb_class_list_cb_netdev, .name = "netdev", .type = is_netdev },
+	{ .skb_class_cb = skb_class_list_cb_socket, .name = "socket", .type = is_slab},
+	{ .skb_class_cb = skb_class_list_cb_iif, .name = "skb_iif", .type = is_proto},
+	{ .skb_class_cb = skb_class_list_cb_len, .name = "len", .type = is_skblen},
+#ifdef CONFIG_AVM_PA
+	{ .skb_class_cb = skb_class_list_cb_avm_pa_ingress_pid_handle, .name = "avm_pa_ingress_pid", .type = is_proto},
+	{ .skb_class_cb = skb_class_list_cb_avm_pa_egress_pid_handle, .name = "avm_pa_egress_pid ", .type = is_proto},
+#endif
+	{ .skb_class_cb = skb_class_list_cb_destructor, .name = "destructor", .type = is_symbol },
+};
+
+atomic_t busy_skb_pending_statistic;
+
+/**
+ * @brief reset all counters and the table of device names
+ */
+static void skb_class_list_clean(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(skb_class_list); i++) {
+		memset(&skb_class_list[i].countlist, 0, sizeof(skb_class_list[i].countlist));
+	}
+
+	memset(&devname, 0, sizeof(devname));
+}
+
+/**
+ * @brief callback for all pending skb's
+ */
+static int sk_buff_pointer_cb(void *ref, void *p)
+{
+	unsigned long *sum_skbs = (unsigned long *)ref;
+	unsigned int i;
+	struct sk_buff *skb = (struct sk_buff *)p;
+
+	for (i = 0; i < ARRAY_SIZE(skb_class_list); i++) {
+		struct _skb_class *pscl = &skb_class_list[i];
+
+		pscl->skb_class_cb(skb, pscl->countlist);
+	}
+	*sum_skbs += 1;
+#if 0
+	if (skb->protocol)
+		pr_err("%s: (ref=%p) %p: ts=%llu netdev:%s destructor=%pS protocol=%x vlan_proto=%x mac=%pM avm_pa: pid=%x hsession=%x\n",
+		       __func__,
+		       ref,
+		       skb,
+		       skb_get_ktime(skb).tv64,
+		       skb->dev ? netdev_name(skb->dev) : "?",
+		       skb->destructor,
+		       skb->protocol,
+		       skb->vlan_proto,
+		       skb_mac_header(skb),
+		       AVM_PKT_INFO(skb)->ingress_pid_handle,
+		       AVM_PKT_INFO(skb)->session_handle);
+#endif
+	return 0;
+}
+
+/**
+ * @brief show pending skbs-statistic on oom or /proc/avm/skb_pending
+ */
+static void display_skb_class_counts(struct seq_file *seq, unsigned long sum_skbs,
+				     struct kmem_cache *s, unsigned int threshcount)
+{
+	unsigned int i, entry;
+	unsigned long len_idx, len_start;
+	char txt[64];
+
+	if (threshcount) {
+		snprintf(txt, sizeof(txt), " - show all counters more/equal %u", threshcount);
+	} else
+		txt[0] = 0;
+
+	sseq_printf(seq, "%s: pending sk_buffs: %lu (%5lu KiB)%s\n",
+		    s->name, sum_skbs, (sum_skbs * s->object_size) >> 10, txt);
+
+	for (i = 0; i < ARRAY_SIZE(skb_class_list); i++) {
+		struct _skb_class *pscl = &skb_class_list[i];
+
+		for (entry = 0; entry < SKB_COUNT_ID_ENTRIES; entry++) {
+			if (pscl->countlist[entry].count == 0)
+				break;
+			if (pscl->countlist[entry].count < threshcount)
+				continue;
+			switch (pscl->type) {
+			case is_netdev:
+				sseq_printf(seq, "%s: %-18s: %6lu\n", pscl->name,
+					    pscl->countlist[entry].id == 0 ? "no-dev" :
+					    pscl->countlist[entry].id <= ARRAY_SIZE(devname) ? devname[pscl->countlist[entry].id - 1] :
+					    pscl->countlist[entry].id == (unsigned long)-ENOMEM ? "devlist-full" :
+					    pscl->countlist[entry].id == (unsigned long)-EINVAL ? "dev-freed" : "dev-?",
+					    pscl->countlist[entry].count);
+				break;
+			case is_slab:
+				sseq_printf(seq, "%s: %-18s: %6lu\n", pscl->name,
+					    (pscl->countlist[entry].id == 0 ? "(null)" :
+					     pscl->countlist[entry].id == (unsigned long)-EINVAL ? "inval address" :
+					     virt_addr_valid(pscl->countlist[entry].id) ? (char *)pscl->countlist[entry].id :
+					     "unknown"),
+					    pscl->countlist[entry].count);
+				break;
+			case is_symbol:
+				sseq_printf(seq, "%s: %-48pS: %6lu\n", pscl->name,
+					    (void *)pscl->countlist[entry].id,
+					    pscl->countlist[entry].count);
+				break;
+			case is_proto:
+				sseq_printf(seq, "%s: 0x%04lx: %6lu\n", pscl->name,
+					    pscl->countlist[entry].id,
+					    pscl->countlist[entry].count);
+				break;
+			case is_skblen:
+				len_idx = pscl->countlist[entry].id;
+				if (len_idx == 0) {
+					/* PACKET_LEN_OFFSET is a size_t, %u needs an unsigned int */
+					sseq_printf(seq, "%s: %6u -%6u bytes: %6lu\n", pscl->name,
+						    0U, (unsigned int)(PACKET_LEN_OFFSET - 1),
+						    pscl->countlist[entry].count);
+					break;
+				}
+				len_idx--;
+				len_start = PACKET_LEN_OFFSET + (len_idx * PACKET_LEN_AREA);
+				sseq_printf(seq, "%s: %6lu -%6lu bytes: %6lu\n", pscl->name,
+					    len_start, len_start + PACKET_LEN_AREA - 1,
+					    pscl->countlist[entry].count);
+				break;
+			}
+		}
+		if (pscl->countlist[SKB_COUNT_ID_ENTRIES - 1].count)
+			sseq_printf(seq, "... (not all %s counted)\n",
+				    pscl->type == is_netdev ? "netdevs" :
+				    pscl->type == is_symbol ? "symbols" :
+				    pscl->type == is_slab ? "sockets" :
+				    pscl->type == is_proto ? "protocols" : "id");
+	}
+}
+
+#define SK_BUFF_THRESH_COUNT 50000
+/**
+ * @brief classify and print all pending skbs of both skb slab caches
+ * @param seq output for sseq_printf(); if NULL only caches with at least
+ *            SK_BUFF_THRESH_COUNT active objects are examined
+ * @param priv optional pointer to an unsigned int threshold, entries with
+ *             a smaller count are not printed
+ *
+ * Returns immediately if another invocation is still running.
+ */
+static void avm_proc_skb_pending_statistic(struct seq_file *seq, void *priv)
+{
+	struct kmem_cache *cachetab[] = {skbuff_head_cache, skbuff_fclone_cache};
+	unsigned int i, active_objs;
+	unsigned int *ptreshsize = priv;
+
+	if (atomic_add_return(1, &busy_skb_pending_statistic) != 1) {
+		return;
+	}
+	for (i = 0; i < ARRAY_SIZE(cachetab); i++) {
+		unsigned long sum_skbs = 0;
+		struct kmem_cache *s = cachetab[i];
+
+		active_objs = kmem_cache_active_objects(s);
+		if (active_objs >= SK_BUFF_THRESH_COUNT || seq) {
+			kmem_cache_list_all_objects(s, &sum_skbs, sk_buff_pointer_cb);
+			if (!seq)
+				pr_err("mem-error: suspiciously much %s sk_buff's %u\n",
+				       s->name, active_objs);
+			if (sum_skbs)
+				display_skb_class_counts(seq, sum_skbs, s,
+							 ptreshsize ? *ptreshsize : 0);
+			skb_class_list_clean();
+		}
+	}
+	atomic_set(&busy_skb_pending_statistic, 0);
+}
+
+/**
+ * @brief notifier callback (see skb_oom_nb), prints the pending skb statistic
+ * @param _data passed on as seq_file (output) to the statistic
+ *
+ * Only entries counted at least SK_BUFF_THRESH_COUNT / SKB_COUNT_ID_ENTRIES
+ * times are printed.
+ */
+static int skb_oom_notify(struct notifier_block *block,
+			  unsigned long event, void *_data)
+{
+	struct seq_file *seq = _data;
+	unsigned int threshcount = SK_BUFF_THRESH_COUNT / SKB_COUNT_ID_ENTRIES;
+
+	avm_proc_skb_pending_statistic(seq, &threshcount);
+	return NOTIFY_OK;
+}
+
+static struct notifier_block skb_oom_nb = {
+	.notifier_call = skb_oom_notify,
+	.priority = 1,
+};
+
+/**
+ * @brief register avm/skb_pending via add_simple_proc_file() and the notifier skb_oom_nb
+ */
+static __init int init_skb_oom(void)
+{
+	add_simple_proc_file("avm/skb_pending", NULL,
+			     avm_proc_skb_pending_statistic, NULL);
+
+	avm_oom_info_chain_register(&skb_oom_nb);
+	return 0;
+}
+late_initcall(init_skb_oom);
+#endif