--- zzzz-none-000/linux-3.10.107/net/core/skbuff.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/net/core/skbuff.c 2021-02-04 17:41:59.000000000 +0000 @@ -62,19 +62,29 @@ #include #include #include +#include +#include #include #include #include #include +#include #include #include #include #include +#include +#include + +#include "skbuff_recycle.h" +#include "skbuff_debug.h" struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; +int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; +EXPORT_SYMBOL(sysctl_max_skb_frags); /** * skb_panic - private function for out-of-line support @@ -160,6 +170,7 @@ gfp_mask & ~__GFP_DMA, node); if (!skb) goto out; + skbuff_debugobj_init_and_activate(skb); /* * Only clear those fields we need to clear, not those that we will @@ -171,9 +182,7 @@ skb->truesize = sizeof(struct sk_buff); atomic_set(&skb->users, 1); -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = ~0U; -#endif + skb->mac_header = (typeof(skb->mac_header))~0U; out: return skb; } @@ -214,6 +223,7 @@ skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); if (!skb) goto out; + skbuff_debugobj_init_and_activate(skb); prefetchw(skb); /* We do our best to align skb_shared_info on a separate cache @@ -247,10 +257,8 @@ skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = ~0U; - skb->transport_header = ~0U; -#endif + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); @@ -259,20 +267,21 @@ kmemcheck_annotate_variable(shinfo->destructor_arg); if (flags & SKB_ALLOC_FCLONE) { - struct sk_buff *child = skb + 1; - atomic_t *fclone_ref = (atomic_t *) (child + 1); + struct sk_buff_fclones *fclones; - kmemcheck_annotate_bitfield(child, flags1); - kmemcheck_annotate_bitfield(child, flags2); + fclones = container_of(skb, struct sk_buff_fclones, skb1); + + kmemcheck_annotate_bitfield(&fclones->skb2, flags1); skb->fclone = SKB_FCLONE_ORIG; - atomic_set(fclone_ref, 1); + atomic_set(&fclones->fclone_ref, 1); - child->fclone = SKB_FCLONE_UNAVAILABLE; - child->pfmemalloc = pfmemalloc; + fclones->skb2.fclone = SKB_FCLONE_CLONE; + fclones->skb2.pfmemalloc = pfmemalloc; } out: return skb; nodata: + skbuff_debugobj_deactivate(skb); kmem_cache_free(cache, skb); skb = NULL; goto out; @@ -280,12 +289,14 @@ EXPORT_SYMBOL(__alloc_skb); /** - * build_skb - build a network buffer + * __build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of fragment, or 0 if head was kmalloced + * @frag_size: size of data, or 0 if head was kmalloced * * Allocate a new &sk_buff. Caller provides space holding head and - * skb_shared_info. @data must have been allocated by kmalloc() + * skb_shared_info. @data must have been allocated by kmalloc() only if + * @frag_size is 0, otherwise data should come from the page allocator + * or vmalloc() * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. * Notes : @@ -296,7 +307,7 @@ * before giving packet to stack. * RX rings only contains data buffers, not full skbs. */ -struct sk_buff *build_skb(void *data, unsigned int frag_size) +struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct skb_shared_info *shinfo; struct sk_buff *skb; @@ -305,21 +316,19 @@ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); if (!skb) return NULL; + skbuff_debugobj_init_and_activate(skb); size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); memset(skb, 0, offsetof(struct sk_buff, tail)); skb->truesize = SKB_TRUESIZE(size); - skb->head_frag = frag_size != 0; atomic_set(&skb->users, 1); skb->head = data; skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = ~0U; - skb->transport_header = ~0U; -#endif + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); @@ -329,58 +338,37 @@ return skb; } + +/* build_skb() is wrapper over __build_skb(), that specifically + * takes care of skb->head and skb->pfmemalloc + * This means that if @frag_size is not zero, then @data must be backed + * by a page fragment, not kmalloc() or vmalloc() + */ +struct sk_buff *build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb = __build_skb(data, frag_size); + + if (skb && frag_size) { + skb->head_frag = 1; + if (page_is_pfmemalloc(virt_to_head_page(data))) + skb->pfmemalloc = 1; + } + return skb; +} EXPORT_SYMBOL(build_skb); -struct netdev_alloc_cache { - struct page_frag frag; - /* we maintain a pagecount bias, so that we dont dirty cache line - * containing page->_count every time we allocate a fragment. - */ - unsigned int pagecnt_bias; -}; -static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); +static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); +static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache); static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct netdev_alloc_cache *nc; - void *data = NULL; - int order; + struct page_frag_cache *nc; unsigned long flags; + void *data; local_irq_save(flags); - nc = &__get_cpu_var(netdev_alloc_cache); - if (unlikely(!nc->frag.page)) { -refill: - for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) { - gfp_t gfp = gfp_mask; - - if (order) - gfp |= __GFP_COMP | __GFP_NOWARN; - nc->frag.page = alloc_pages(gfp, order); - if (likely(nc->frag.page)) - break; - if (--order < 0) - goto end; - } - nc->frag.size = PAGE_SIZE << order; -recycle: - atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS); - nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; - nc->frag.offset = 0; - } - - if (nc->frag.offset + fragsz > nc->frag.size) { - /* avoid unnecessary locked operations if possible */ - if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) || - atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count)) - goto recycle; - goto refill; - } - - data = page_address(nc->frag.page) + nc->frag.offset; - nc->frag.offset += fragsz; - nc->pagecnt_bias--; -end: + nc = this_cpu_ptr(&netdev_alloc_cache); + data = __alloc_page_frag(nc, fragsz, gfp_mask); local_irq_restore(flags); return data; } @@ -398,6 +386,19 @@ } EXPORT_SYMBOL(netdev_alloc_frag); +static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +{ + struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + return __alloc_page_frag(nc, fragsz, gfp_mask); +} + +void *napi_alloc_frag(unsigned int fragsz) +{ + return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); +} +EXPORT_SYMBOL(napi_alloc_frag); + /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on @@ -405,7 +406,7 @@ * @gfp_mask: get_free_pages mask, passed to alloc_skb * * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has unspecified headroom built in. Users should allocate + * buffer has NET_SKB_PAD headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * @@ -414,35 +415,172 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, gfp_t gfp_mask) { - struct sk_buff *skb = NULL; - unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + struct sk_buff *skb; + unsigned int len = length; +#ifndef CONFIG_SKB_RECYCLER + struct page_frag_cache *nc; + unsigned long flags; + bool pfmemalloc; + bool page_frag_alloc_enable = true; + void *data; +#endif - if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { - void *data; +#ifdef CONFIG_SKB_RECYCLER + skb = skb_recycler_alloc(dev, length); + if (likely(skb)) { + /* SKBs in the recycler are from various unknown sources. + * Their truesize is unknown. We should set truesize + * as the needed buffer size before using it. + */ + skb->truesize = SKB_TRUESIZE(SKB_DATA_ALIGN(len + NET_SKB_PAD)); + return skb; + } - if (sk_memalloc_socks()) - gfp_mask |= __GFP_MEMALLOC; + len = SKB_RECYCLE_SIZE; + if (unlikely(length > SKB_RECYCLE_SIZE)) + len = length; - data = __netdev_alloc_frag(fragsz, gfp_mask); + skb = __alloc_skb(len + NET_SKB_PAD, gfp_mask, + SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; - if (likely(data)) { - skb = build_skb(data, fragsz); - if (unlikely(!skb)) - put_page(virt_to_head_page(data)); - } - } else { - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, - SKB_ALLOC_RX, NUMA_NO_NODE); + /* Set truesize as the needed buffer size + * rather than the allocated size by __alloc_skb(). + */ + if (length + NET_SKB_PAD < SKB_WITH_OVERHEAD(PAGE_SIZE)) + skb->truesize = SKB_TRUESIZE(SKB_DATA_ALIGN(length + NET_SKB_PAD)); + + goto skb_success; +#else + len += NET_SKB_PAD; + +#ifdef CONFIG_ALLOC_SKB_PAGE_FRAG_DISABLE + page_frag_alloc_enable = false; +#endif + if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA)) || + !page_frag_alloc_enable) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; } - if (likely(skb)) { - skb_reserve(skb, NET_SKB_PAD); - skb->dev = dev; + + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); + + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; + + local_irq_save(flags); + + nc = this_cpu_ptr(&netdev_alloc_cache); + data = __alloc_page_frag(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; + + local_irq_restore(flags); + + if (unlikely(!data)) + return NULL; + + skb = __build_skb(data, len); + if (unlikely(!skb)) { + skb_free_frag(data); + return NULL; } + + /* use OR instead of assignment to avoid clearing of bits in mask */ + if (pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; +#endif + +skb_success: + skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + +skb_fail: return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); +/** + * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * @napi: napi instance this buffer was allocated for + * @len: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages + * + * Allocate a new sk_buff for use in NAPI receive. This buffer will + * attempt to allocate the head from a special reserved region used + * only for NAPI Rx allocation. By doing this we can save several + * CPU cycles by avoiding having to disable and re-enable IRQs. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, + gfp_t gfp_mask) +{ + struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct sk_buff *skb; + void *data; + + len += NET_SKB_PAD + NET_IP_ALIGN; + + if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; + } + + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); + + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; + + data = __alloc_page_frag(nc, len, gfp_mask); + if (unlikely(!data)) + return NULL; + + skb = __build_skb(data, len); + if (unlikely(!skb)) { + skb_free_frag(data); + return NULL; + } + + /* use OR instead of assignment to avoid clearing of bits in mask */ + if (nc->pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; + +skb_success: + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + skb->dev = napi->dev; + +skb_fail: + return skb; +} +EXPORT_SYMBOL(__napi_alloc_skb); + +struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev, + unsigned int length, gfp_t gfp) +{ + struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp); + +#ifdef CONFIG_ETHERNET_PACKET_MANGLE + if (dev && (dev->priv_flags & IFF_NO_IP_ALIGN)) + return skb; +#endif + + if (NET_IP_ALIGN && skb) + skb_reserve(skb, NET_IP_ALIGN); + return skb; +} +EXPORT_SYMBOL(__netdev_alloc_skb_ip_align); + void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize) { @@ -453,6 +591,18 @@ } EXPORT_SYMBOL(skb_add_rx_frag); +void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, + unsigned int truesize) +{ + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + skb_frag_size_add(frag, size); + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; +} +EXPORT_SYMBOL(skb_coalesce_rx_frag); + static void skb_drop_list(struct sk_buff **listp) { kfree_skb_list(*listp); @@ -474,74 +624,79 @@ static void skb_free_head(struct sk_buff *skb) { + unsigned char *head = skb->head; + if (skb->head_frag) - put_page(virt_to_head_page(skb->head)); + skb_free_frag(head); else - kfree(skb->head); + kfree(head); } -static void skb_release_data(struct sk_buff *skb) +void skb_release_data(struct sk_buff *skb) { - if (!skb->cloned || - !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, - &skb_shinfo(skb)->dataref)) { - if (skb_shinfo(skb)->nr_frags) { - int i; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - skb_frag_unref(skb, i); - } + struct skb_shared_info *shinfo = skb_shinfo(skb); + int i; - /* - * If skb buf is from userspace, we need to notify the caller - * the lower device DMA has done; - */ - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; + if (skb->cloned && + atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, + &shinfo->dataref)) + return; - uarg = skb_shinfo(skb)->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, true); - } + for (i = 0; i < shinfo->nr_frags; i++) + __skb_frag_unref(&shinfo->frags[i]); - if (skb_has_frag_list(skb)) - skb_drop_fraglist(skb); + /* + * If skb buf is from userspace, we need to notify the caller + * the lower device DMA has done; + */ + if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) { + struct ubuf_info *uarg; - skb_free_head(skb); + uarg = shinfo->destructor_arg; + if (uarg->callback) + uarg->callback(uarg, true); } + + if (shinfo->frag_list) + kfree_skb_list(shinfo->frag_list); + + skb_free_head(skb); } /* * Free an skbuff by memory without cleaning the state. */ -static void kfree_skbmem(struct sk_buff *skb) +void kfree_skbmem(struct sk_buff *skb) { - struct sk_buff *other; - atomic_t *fclone_ref; + struct sk_buff_fclones *fclones; switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: + skbuff_debugobj_deactivate(skb); kmem_cache_free(skbuff_head_cache, skb); - break; + return; case SKB_FCLONE_ORIG: - fclone_ref = (atomic_t *) (skb + 2); - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, skb); - break; + fclones = container_of(skb, struct sk_buff_fclones, skb1); - case SKB_FCLONE_CLONE: - fclone_ref = (atomic_t *) (skb + 1); - other = skb - 1; - - /* The clone portion is available for - * fast-cloning again. + /* We usually free the clone (TX completion) before original skb + * This test would have no chance to be true for the clone, + * while here, branch prediction will be good. */ - skb->fclone = SKB_FCLONE_UNAVAILABLE; + if (atomic_read(&fclones->fclone_ref) == 1) + goto fastpath; + break; - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, other); + default: /* SKB_FCLONE_CLONE */ + fclones = container_of(skb, struct sk_buff_fclones, skb2); break; } + if (!atomic_dec_and_test(&fclones->fclone_ref)) + return; + +fastpath: + skbuff_debugobj_deactivate(&fclones->skb1); + kmem_cache_free(skbuff_fclone_cache, fclones); } static void skb_release_head_state(struct sk_buff *skb) @@ -557,15 +712,11 @@ #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb->nfct); #endif -#ifdef CONFIG_BRIDGE_NETFILTER - nf_bridge_put(skb->nf_bridge); -#endif -/* XXX: IS this still necessary? - JHS */ -#ifdef CONFIG_NET_SCHED - skb->tc_index = 0; -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = 0; +#if IS_ENABLED(CONFIG_AVM_GENERIC_CONNTRACK) + generic_ct_put(skb->generic_ct); #endif +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nf_bridge_put(skb->nf_bridge); #endif } @@ -656,62 +807,105 @@ { if (unlikely(!skb)) return; + + prefetch(&skb->destructor); + if (likely(atomic_read(&skb->users) == 1)) smp_rmb(); else if (likely(!atomic_dec_and_test(&skb->users))) return; + + /* If possible we'd like to recycle any skb rather than just free it, + * but in order to do that we need to release any head state too. + * We don't want to do this later because we'll be in a pre-emption + * disabled state. + */ + skb_release_head_state(skb); + + /* Can we recycle this skb? If we can then it will be much faster + * for us to recycle this one later than to allocate a new one + * from scratch. + */ + if (likely(skb->head) && likely(skb_recycler_consume(skb))) + return; + trace_consume_skb(skb); - __kfree_skb(skb); + + /* We're not recycling so now we need to do the rest of what we would + * have done in __kfree_skb (above and beyond the skb_release_head_state + * that we already did). + */ + if (likely(skb->head)) + skb_release_data(skb); + + kfree_skbmem(skb); } EXPORT_SYMBOL(consume_skb); +/* Make sure a field is enclosed inside headers_start/headers_end section */ +#define CHECK_SKB_FIELD(field) \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ + offsetof(struct sk_buff, headers_start)); \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) > \ + offsetof(struct sk_buff, headers_end)); \ + static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { new->tstamp = old->tstamp; + /* We do not copy old->sk */ new->dev = old->dev; - new->transport_header = old->transport_header; - new->network_header = old->network_header; - new->mac_header = old->mac_header; - new->inner_transport_header = old->inner_transport_header; - new->inner_network_header = old->inner_network_header; - new->inner_mac_header = old->inner_mac_header; +#if IS_ENABLED(CONFIG_AVM_NET_SKB_INPUT_DEV) + new->input_dev = old->input_dev; +#endif + memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); - new->rxhash = old->rxhash; - new->ooo_okay = old->ooo_okay; - new->l4_rxhash = old->l4_rxhash; - new->no_fcs = old->no_fcs; - new->encapsulation = old->encapsulation; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif - memcpy(new->cb, old->cb, sizeof(old->cb)); - new->csum = old->csum; - new->local_df = old->local_df; - new->pkt_type = old->pkt_type; - new->ip_summed = old->ip_summed; - skb_copy_queue_mapping(new, old); - new->priority = old->priority; -#if IS_ENABLED(CONFIG_IP_VS) - new->ipvs_property = old->ipvs_property; + __nf_copy(new, old, false); + + /* Note : this field could be in headers_start/headers_end section + * It is not yet because we do not want to have a 16 bit hole + */ + new->queue_mapping = old->queue_mapping; + + memcpy(&new->headers_start, &old->headers_start, + offsetof(struct sk_buff, headers_end) - + offsetof(struct sk_buff, headers_start)); + CHECK_SKB_FIELD(protocol); + CHECK_SKB_FIELD(csum); + CHECK_SKB_FIELD(hash); + CHECK_SKB_FIELD(priority); + CHECK_SKB_FIELD(skb_iif); + CHECK_SKB_FIELD(vlan_proto); + CHECK_SKB_FIELD(vlan_tci); + CHECK_SKB_FIELD(transport_header); + CHECK_SKB_FIELD(network_header); + CHECK_SKB_FIELD(mac_header); + CHECK_SKB_FIELD(inner_protocol); + CHECK_SKB_FIELD(inner_transport_header); + CHECK_SKB_FIELD(inner_network_header); + CHECK_SKB_FIELD(inner_mac_header); + CHECK_SKB_FIELD(mark); +#ifdef CONFIG_NETWORK_SECMARK + CHECK_SKB_FIELD(secmark); +#endif +#ifdef CONFIG_NET_RX_BUSY_POLL + CHECK_SKB_FIELD(napi_id); #endif - new->pfmemalloc = old->pfmemalloc; - new->protocol = old->protocol; - new->mark = old->mark; - new->skb_iif = old->skb_iif; - __nf_copy(new, old); -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) - new->nf_trace = old->nf_trace; +#ifdef CONFIG_XPS + CHECK_SKB_FIELD(sender_cpu); #endif #ifdef CONFIG_NET_SCHED - new->tc_index = old->tc_index; + CHECK_SKB_FIELD(tc_index); #ifdef CONFIG_NET_CLS_ACT - new->tc_verd = old->tc_verd; + CHECK_SKB_FIELD(tc_verd); #endif #endif - new->vlan_proto = old->vlan_proto; - new->vlan_tci = old->vlan_tci; +#ifdef CONFIG_AVM_PA + CHECK_SKB_FIELD(avm_pa); +#endif - skb_copy_secmark(new, old); } /* @@ -794,7 +988,7 @@ page = alloc_page(gfp_mask); if (!page) { while (head) { - struct page *next = (struct page *)head->private; + struct page *next = (struct page *)page_private(head); put_page(head); head = next; } @@ -804,7 +998,7 @@ memcpy(page_address(page), vaddr + f->page_offset, skb_frag_size(f)); kunmap_atomic(vaddr); - page->private = (unsigned long)head; + set_page_private(page, (unsigned long)head); head = page; } @@ -818,7 +1012,7 @@ for (i = num_frags - 1; i >= 0; i--) { __skb_fill_page_desc(skb, i, head, 0, skb_shinfo(skb)->frags[i].size); - head = (struct page *)head->private; + head = (struct page *)page_private(head); } skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; @@ -842,17 +1036,18 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { + struct sk_buff_fclones *fclones = container_of(skb, + struct sk_buff_fclones, + skb1); struct sk_buff *n; if (skb_orphan_frags(skb, gfp_mask)) return NULL; - n = skb + 1; if (skb->fclone == SKB_FCLONE_ORIG && - n->fclone == SKB_FCLONE_UNAVAILABLE) { - atomic_t *fclone_ref = (atomic_t *) (n + 1); - n->fclone = SKB_FCLONE_CLONE; - atomic_inc(fclone_ref); + atomic_read(&fclones->fclone_ref) == 1) { + n = &fclones->skb2; + atomic_set(&fclones->fclone_ref, 2); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -860,9 +1055,9 @@ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); if (!n) return NULL; + skbuff_debugobj_init_and_activate(n); kmemcheck_annotate_bitfield(n, flags1); - kmemcheck_annotate_bitfield(n, flags2); n->fclone = SKB_FCLONE_UNAVAILABLE; } @@ -872,6 +1067,9 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off) { + /* Only adjust this if it actually is csum_start rather than csum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb->csum_start += off; /* {transport,network,mac}_header and tail are relative to skb->head */ skb->transport_header += off; skb->network_header += off; @@ -884,18 +1082,8 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { -#ifndef NET_SKBUFF_DATA_USES_OFFSET - /* - * Shift between the two data areas in bytes - */ - unsigned long offset = new->data - old->data; -#endif - __copy_skb_header(new, old); -#ifndef NET_SKBUFF_DATA_USES_OFFSET - skb_headers_offset_update(new, offset); -#endif skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; @@ -949,10 +1137,13 @@ EXPORT_SYMBOL(skb_copy); /** - * __pskb_copy - create copy of an sk_buff with private head. + * __pskb_copy_fclone - create copy of an sk_buff with private head. * @skb: buffer to copy * @headroom: headroom of new skb * @gfp_mask: allocation priority + * @fclone: if true allocate the copy of the skb from the fclone + * cache instead of the head cache; it is recommended to set this + * to true for the cases where the copy will likely be cloned * * Make a copy of both an &sk_buff and part of its data, located * in header. Fragmented data remain shared. This is used when @@ -962,11 +1153,12 @@ * The returned buffer has a reference count of 1. */ -struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) +struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, + gfp_t gfp_mask, bool fclone) { unsigned int size = skb_headlen(skb) + headroom; - struct sk_buff *n = __alloc_skb(size, gfp_mask, - skb_alloc_rx_flag(skb), NUMA_NO_NODE); + int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); + struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); if (!n) goto out; @@ -1006,7 +1198,7 @@ out: return n; } -EXPORT_SYMBOL(__pskb_copy); +EXPORT_SYMBOL(__pskb_copy_fclone); /** * pskb_expand_head - reallocate header of &sk_buff @@ -1015,8 +1207,8 @@ * @ntail: room to add at tail * @gfp_mask: allocation priority * - * Expands (or creates identical copy, if &nhead and &ntail are zero) - * header of skb. &sk_buff itself is not changed. &sk_buff MUST have + * Expands (or creates identical copy, if @nhead and @ntail are zero) + * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have * reference count of 1. Returns zero in the case of success or error, * if expansion failed. In the last case, &sk_buff is not changed. * @@ -1087,10 +1279,7 @@ skb->end = skb->head + size; #endif skb->tail += off; - skb_headers_offset_update(skb, off); - /* Only adjust this if it actually is csum_start rather than csum */ - if (skb->ip_summed == CHECKSUM_PARTIAL) - skb->csum_start += nhead; + skb_headers_offset_update(skb, nhead); skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; @@ -1155,7 +1344,6 @@ NUMA_NO_NODE); int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; - int off; if (!n) return NULL; @@ -1179,12 +1367,7 @@ copy_skb_header(n, skb); - off = newheadroom - oldheadroom; - if (n->ip_summed == CHECKSUM_PARTIAL) - n->csum_start += off; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb_headers_offset_update(n, off); -#endif + skb_headers_offset_update(n, newheadroom - oldheadroom); return n; } @@ -1237,6 +1420,29 @@ EXPORT_SYMBOL(skb_pad); /** + * pskb_put - add data to the tail of a potentially fragmented buffer + * @skb: start of the buffer to use + * @tail: tail fragment of the buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the potentially + * fragmented buffer. @tail must be the last fragment of @skb -- or + * @skb itself. If this would exceed the total buffer size the kernel + * will panic. A pointer to the first byte of the extra data is + * returned. + */ + +unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) +{ + if (tail != skb) { + skb->data_len += len; + skb->len += len; + } + return skb_put(tail, len); +} +EXPORT_SYMBOL_GPL(pskb_put); + +/** * skb_put - add data to a buffer * @skb: buffer to use * @len: amount of data to add @@ -1766,15 +1972,39 @@ return false; } +ssize_t skb_socket_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + + /* Drop the socket lock, otherwise we have reverse + * locking dependencies between sk_lock and i_mutex + * here as compared to sendfile(). We enter here + * with the socket lock held, and splice_to_pipe() will + * grab the pipe inode lock. For sendfile() emulation, + * we call into ->sendpage() with the i_mutex lock held + * and networking will grab the socket lock. + */ + release_sock(sk); + ret = splice_to_pipe(pipe, spd); + lock_sock(sk); + + return ret; +} + /* * Map data from the skb to a pipe. Should handle both the linear part, * the fragments, and the frag list. It does NOT handle frag lists within * the frag list, if such a thing exists. We'd probably need to recurse to * handle that cleanly. */ -int skb_splice_bits(struct sk_buff *skb, unsigned int offset, +int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, - unsigned int flags) + unsigned int flags, + ssize_t (*splice_cb)(struct sock *, + struct pipe_inode_info *, + struct splice_pipe_desc *)) { struct partial_page partial[MAX_SKB_FRAGS]; struct page *pages[MAX_SKB_FRAGS]; @@ -1787,7 +2017,6 @@ .spd_release = sock_spd_release, }; struct sk_buff *frag_iter; - struct sock *sk = skb->sk; int ret = 0; /* @@ -1810,23 +2039,12 @@ } done: - if (spd.nr_pages) { - /* - * Drop the socket lock, otherwise we have reverse - * locking dependencies between sk_lock and i_mutex - * here as compared to sendfile(). We enter here - * with the socket lock held, and splice_to_pipe() will - * grab the pipe inode lock. For sendfile() emulation, - * we call into ->sendpage() with the i_mutex lock held - * and networking will grab the socket lock. - */ - release_sock(sk); - ret = splice_to_pipe(pipe, &spd); - lock_sock(sk); - } + if (spd.nr_pages) + ret = splice_cb(sk, pipe, &spd); return ret; } +EXPORT_SYMBOL_GPL(skb_splice_bits); /** * skb_store_bits - store bits from kernel buffer to skb @@ -1913,9 +2131,8 @@ EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. */ - -__wsum skb_checksum(const struct sk_buff *skb, int offset, - int len, __wsum csum) +__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, + __wsum csum, const struct skb_checksum_ops *ops) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -1926,7 +2143,7 @@ if (copy > 0) { if (copy > len) copy = len; - csum = csum_partial(skb->data + offset, copy, csum); + csum = ops->update(skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; @@ -1947,10 +2164,10 @@ if (copy > len) copy = len; vaddr = kmap_atomic(skb_frag_page(frag)); - csum2 = csum_partial(vaddr + frag->page_offset + - offset - start, copy, 0); + csum2 = ops->update(vaddr + frag->page_offset + + offset - start, copy, 0); kunmap_atomic(vaddr); - csum = csum_block_add(csum, csum2, pos); + csum = ops->combine(csum, csum2, pos, copy); if (!(len -= copy)) return csum; offset += copy; @@ -1969,9 +2186,9 @@ __wsum csum2; if (copy > len) copy = len; - csum2 = skb_checksum(frag_iter, offset - start, - copy, 0); - csum = csum_block_add(csum, csum2, pos); + csum2 = __skb_checksum(frag_iter, offset - start, + copy, 0, ops); + csum = ops->combine(csum, csum2, pos, copy); if ((len -= copy) == 0) return csum; offset += copy; @@ -1983,6 +2200,18 @@ return csum; } +EXPORT_SYMBOL(__skb_checksum); + +__wsum skb_checksum(const struct sk_buff *skb, int offset, + int len, __wsum csum) +{ + const struct skb_checksum_ops ops = { + .update = csum_partial_ext, + .combine = csum_block_add_ext, + }; + + return __skb_checksum(skb, offset, len, csum, &ops); +} EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. */ @@ -2064,6 +2293,104 @@ } EXPORT_SYMBOL(skb_copy_and_csum_bits); + /** + * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() + * @from: source buffer + * + * Calculates the amount of linear headroom needed in the 'to' skb passed + * into skb_zerocopy(). + */ +unsigned int +skb_zerocopy_headlen(const struct sk_buff *from) +{ + unsigned int hlen = 0; + + if (!from->head_frag || + skb_headlen(from) < L1_CACHE_BYTES || + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + hlen = skb_headlen(from); + + if (skb_has_frag_list(from)) + hlen = from->len; + + return hlen; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); + +/** + * skb_zerocopy - Zero copy skb to skb + * @to: destination buffer + * @from: source buffer + * @len: number of bytes to copy from source buffer + * @hlen: size of linear headroom in destination buffer + * + * Copies up to `len` bytes from `from` to `to` by creating references + * to the frags in the source buffer. + * + * The `hlen` as calculated by skb_zerocopy_headlen() specifies the + * headroom in the `to` buffer. + * + * Return value: + * 0: everything is OK + * -ENOMEM: couldn't orphan frags of @from due to lack of memory + * -EFAULT: skb_copy_bits() found some problem with skb geometry + */ +int +skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) +{ + int i, j = 0; + int plen = 0; /* length of skb->head fragment */ + int ret; + struct page *page; + unsigned int offset; + + BUG_ON(!from->head_frag && !hlen); + + /* dont bother with small payloads */ + if (len <= skb_tailroom(to)) + return skb_copy_bits(from, 0, skb_put(to, len), len); + + if (hlen) { + ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); + if (unlikely(ret)) + return ret; + len -= hlen; + } else { + plen = min_t(int, skb_headlen(from), len); + if (plen) { + page = virt_to_head_page(from->head); + offset = from->data - (unsigned char *)page_address(page); + __skb_fill_page_desc(to, 0, page, offset, plen); + get_page(page); + j = 1; + len -= plen; + } + } + + to->truesize += len + plen; + to->len += len + plen; + to->data_len += len + plen; + + if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { + skb_tx_error(from); + return -ENOMEM; + } + + for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { + if (!len) + break; + skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; + skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); + len -= skb_shinfo(to)->frags[j].size; + skb_frag_ref(to, j); + j++; + } + skb_shinfo(to)->nr_frags = j; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_zerocopy); + void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) { __wsum csum; @@ -2502,19 +2829,19 @@ * @data: destination pointer for data to be returned * @st: state variable * - * Reads a block of skb data at &consumed relative to the + * Reads a block of skb data at @consumed relative to the * lower offset specified to skb_prepare_seq_read(). Assigns - * the head of the data block to &data and returns the length + * the head of the data block to @data and returns the length * of the block or 0 if the end of the skb data or the upper * offset has been reached. * * The caller is not required to consume all of the data - * returned, i.e. &consumed is typically set to the number + * returned, i.e. @consumed is typically set to the number * of bytes already consumed and the next call to * skb_seq_read() will return the remaining part of the block. * * Note 1: The size of each block of data returned can be arbitrary, - * this limitation is the cost for zerocopy seqeuental + * this limitation is the cost for zerocopy sequential * reads of potentially non linear data. * * Note 2: Fragment lists within fragments are not implemented @@ -2527,8 +2854,13 @@ unsigned int block_limit, abs_offset = consumed + st->lower_offset; skb_frag_t *frag; - if (unlikely(abs_offset >= st->upper_offset)) + if (unlikely(abs_offset >= st->upper_offset)) { + if (st->frag_data) { + kunmap_atomic(st->frag_data); + st->frag_data = NULL; + } return 0; + } next_skb: block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; @@ -2617,7 +2949,6 @@ * @from: search offset * @to: search limit * @config: textsearch configuration - * @state: uninitialized textsearch state variable * * Finds a pattern in the skb data according to the specified * textsearch configuration. Use textsearch_next() to retrieve @@ -2625,17 +2956,17 @@ * to the first occurrence or UINT_MAX if no match was found. */ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, - unsigned int to, struct ts_config *config, - struct ts_state *state) + unsigned int to, struct ts_config *config) { + struct ts_state state; unsigned int ret; config->get_next_block = skb_ts_get_next_block; config->finish = skb_ts_finish; - skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); + skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); - ret = textsearch_find(config, state); + ret = textsearch_find(config, &state); return (ret <= to - from ? ret : UINT_MAX); } EXPORT_SYMBOL(skb_find_text); @@ -2643,7 +2974,7 @@ /** * skb_append_datato_frags - append the user data to a skb * @sk: sock structure - * @skb: skb structure to be appened with user data. + * @skb: skb structure to be appended with user data. * @getfrag: call back function to be used for getting the user data * @from: pointer to user message iov * @length: length of the iov message @@ -2698,6 +3029,24 @@ } EXPORT_SYMBOL(skb_append_datato_frags); +int skb_append_pagefrags(struct sk_buff *skb, struct page *page, + int offset, size_t size) +{ + int i = skb_shinfo(skb)->nr_frags; + + if (skb_can_coalesce(skb, i, page, offset)) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); + } else if (i < MAX_SKB_FRAGS) { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, size); + } else { + return -EMSGSIZE; + } + + return 0; +} +EXPORT_SYMBOL_GPL(skb_append_pagefrags); + /** * skb_pull_rcsum - pull skb and update receive checksum * @skb: buffer to update @@ -2722,67 +3071,96 @@ /** * skb_segment - Perform protocol segmentation on skb. - * @skb: buffer to segment + * @head_skb: buffer to segment * @features: features for the output path (see dev->features) * * This function performs segmentation on the given skb. It returns * a pointer to the first in a list of new skbs for the segments. * In case of error it returns ERR_PTR(err). */ -struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) +struct sk_buff *skb_segment(struct sk_buff *head_skb, + netdev_features_t features) { struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; - struct sk_buff *fskb = skb_shinfo(skb)->frag_list; - unsigned int mss = skb_shinfo(skb)->gso_size; - unsigned int doffset = skb->data - skb_mac_header(skb); + struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; + skb_frag_t *frag = skb_shinfo(head_skb)->frags; + unsigned int mss = skb_shinfo(head_skb)->gso_size; + unsigned int doffset = head_skb->data - skb_mac_header(head_skb); + struct sk_buff *frag_skb = head_skb; unsigned int offset = doffset; - unsigned int tnl_hlen = skb_tnl_header_len(skb); + unsigned int tnl_hlen = skb_tnl_header_len(head_skb); unsigned int headroom; unsigned int len; __be16 proto; bool csum; int sg = !!(features & NETIF_F_SG); - int nfrags = skb_shinfo(skb)->nr_frags; + int nfrags = skb_shinfo(head_skb)->nr_frags; int err = -ENOMEM; int i = 0; int pos; + int dummy; - proto = skb_network_protocol(skb); + __skb_push(head_skb, doffset); + proto = skb_network_protocol(head_skb, &dummy); if (unlikely(!proto)) return ERR_PTR(-EINVAL); - csum = !!can_checksum_protocol(features, proto); - __skb_push(skb, doffset); - headroom = skb_headroom(skb); - pos = skb_headlen(skb); + csum = !head_skb->encap_hdr_csum && + !!can_checksum_protocol(features, proto); + + headroom = skb_headroom(head_skb); + pos = skb_headlen(head_skb); do { struct sk_buff *nskb; - skb_frag_t *frag; + skb_frag_t *nskb_frag; int hsize; int size; - len = skb->len - offset; + len = head_skb->len - offset; if (len > mss) len = mss; - hsize = skb_headlen(skb) - offset; + hsize = skb_headlen(head_skb) - offset; if (hsize < 0) hsize = 0; if (hsize > len || !sg) hsize = len; - if (!hsize && i >= nfrags) { - BUG_ON(fskb->len != len); + if (!hsize && i >= nfrags && skb_headlen(list_skb) && + (skb_headlen(list_skb) == len || sg)) { + BUG_ON(skb_headlen(list_skb) > len); + + i = 0; + nfrags = skb_shinfo(list_skb)->nr_frags; + frag = skb_shinfo(list_skb)->frags; + frag_skb = list_skb; + pos += skb_headlen(list_skb); + + while (pos < offset + len) { + BUG_ON(i >= nfrags); + + size = skb_frag_size(frag); + if (pos + size > offset + len) + break; - pos += len; - nskb = skb_clone(fskb, GFP_ATOMIC); - fskb = fskb->next; + i++; + pos += size; + frag++; + } + + nskb = skb_clone(list_skb, GFP_ATOMIC); + list_skb = list_skb->next; if (unlikely(!nskb)) goto err; + if (unlikely(pskb_trim(nskb, len))) { + kfree_skb(nskb); + goto err; + } + hsize = skb_end_offset(nskb); if (skb_cow_head(nskb, doffset + headroom)) { kfree_skb(nskb); @@ -2794,7 +3172,7 @@ __skb_push(nskb, doffset); } else { nskb = __alloc_skb(hsize + doffset + headroom, - GFP_ATOMIC, skb_alloc_rx_flag(skb), + GFP_ATOMIC, skb_alloc_rx_flag(head_skb), NUMA_NO_NODE); if (unlikely(!nskb)) @@ -2810,82 +3188,82 @@ segs = nskb; tail = nskb; - __copy_skb_header(nskb, skb); + __copy_skb_header(nskb, head_skb); - /* nskb and skb might have different headroom */ - if (nskb->ip_summed == CHECKSUM_PARTIAL) - nskb->csum_start += skb_headroom(nskb) - headroom; - - skb_reset_mac_header(nskb); - skb_set_network_header(nskb, skb->mac_len); - nskb->transport_header = (nskb->network_header + - skb_network_header_len(skb)); + skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); skb_reset_mac_len(nskb); - skb_copy_from_linear_data_offset(skb, -tnl_hlen, + skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, nskb->data - tnl_hlen, doffset + tnl_hlen); - if (fskb != skb_shinfo(skb)->frag_list) + if (nskb->len == len + doffset) goto perform_csum_check; - if (!sg) { + if (!sg && !nskb->remcsum_offload) { nskb->ip_summed = CHECKSUM_NONE; - nskb->csum = skb_copy_and_csum_bits(skb, offset, + nskb->csum = skb_copy_and_csum_bits(head_skb, offset, skb_put(nskb, len), len, 0); + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; continue; } - frag = skb_shinfo(nskb)->frags; + nskb_frag = skb_shinfo(nskb)->frags; - skb_copy_from_linear_data_offset(skb, offset, + skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); - skb_shinfo(nskb)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; + skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags & + SKBTX_SHARED_FRAG; + + while (pos < offset + len) { + if (i >= nfrags) { + BUG_ON(skb_headlen(list_skb)); + + i = 0; + nfrags = skb_shinfo(list_skb)->nr_frags; + frag = skb_shinfo(list_skb)->frags; + frag_skb = list_skb; - while (pos < offset + len && i < nfrags) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + BUG_ON(!nfrags); + + list_skb = list_skb->next; + } + + if (unlikely(skb_shinfo(nskb)->nr_frags >= + MAX_SKB_FRAGS)) { + net_warn_ratelimited( + "skb_segment: too many frags: %u %u\n", + pos, mss); goto err; - *frag = skb_shinfo(skb)->frags[i]; - __skb_frag_ref(frag); - size = skb_frag_size(frag); + } + + if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) + goto err; + + *nskb_frag = *frag; + __skb_frag_ref(nskb_frag); + size = skb_frag_size(nskb_frag); if (pos < offset) { - frag->page_offset += offset - pos; - skb_frag_size_sub(frag, offset - pos); + nskb_frag->page_offset += offset - pos; + skb_frag_size_sub(nskb_frag, offset - pos); } skb_shinfo(nskb)->nr_frags++; if (pos + size <= offset + len) { i++; + frag++; pos += size; } else { - skb_frag_size_sub(frag, pos + size - (offset + len)); + skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); goto skip_fraglist; } - frag++; - } - - if (pos < offset + len) { - struct sk_buff *fskb2 = fskb; - - BUG_ON(pos + fskb->len != offset + len); - - pos += fskb->len; - fskb = fskb->next; - - if (fskb2->next) { - fskb2 = skb_clone(fskb2, GFP_ATOMIC); - if (!fskb2) - goto err; - } else - skb_get(fskb2); - - SKB_FRAG_ASSERT(nskb); - skb_shinfo(nskb)->frag_list = fskb2; + nskb_frag++; } skip_fraglist: @@ -2894,52 +3272,63 @@ nskb->truesize += nskb->data_len; perform_csum_check: - if (!csum) { + if (!csum && !nskb->remcsum_offload) { nskb->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; } - } while ((offset += len) < skb->len); + } while ((offset += len) < head_skb->len); + /* Some callers want to get the end of the list. + * Put it in segs->prev to avoid walking the list. + * (see validate_xmit_skb_list() for example) + */ + segs->prev = tail; + + /* Following permits correct backpressure, for protocols + * using skb_set_owner_w(). + * Idea is to tranfert ownership from head_skb to last segment. + */ + if (head_skb->destructor == sock_wfree) { + swap(tail->truesize, head_skb->truesize); + swap(tail->destructor, head_skb->destructor); + swap(tail->sk, head_skb->sk); + } return segs; err: - while ((skb = segs)) { - segs = skb->next; - kfree_skb(skb); - } + kfree_skb_list(segs); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(skb_segment); int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - struct sk_buff *p = *head; - struct sk_buff *nskb; - struct skb_shared_info *skbinfo = skb_shinfo(skb); - struct skb_shared_info *pinfo = skb_shinfo(p); - unsigned int headroom; - unsigned int len = skb_gro_len(skb); + struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); + unsigned int len = skb_gro_len(skb); + struct sk_buff *lp, *p = *head; unsigned int delta_truesize; - if (p->len + len >= 65536) + if (unlikely(p->len + len >= 65536)) return -E2BIG; - if (pinfo->frag_list) - goto merge; - else if (headlen <= offset) { + lp = NAPI_GRO_CB(p)->last; + pinfo = skb_shinfo(lp); + + if (headlen <= offset) { skb_frag_t *frag; skb_frag_t *frag2; int i = skbinfo->nr_frags; int nr_frags = pinfo->nr_frags + i; - offset -= headlen; - if (nr_frags > MAX_SKB_FRAGS) - return -E2BIG; + goto merge; + offset -= headlen; pinfo->nr_frags = nr_frags; skbinfo->nr_frags = 0; @@ -2970,7 +3359,7 @@ unsigned int first_offset; if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) - return -E2BIG; + goto merge; first_offset = skb->data - (unsigned char *)page_address(page) + @@ -2988,43 +3377,7 @@ delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; goto done; - } else if (skb_gro_len(p) != pinfo->gso_size) - return -E2BIG; - - headroom = skb_headroom(p); - nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); - if (unlikely(!nskb)) - return -ENOMEM; - - __copy_skb_header(nskb, p); - nskb->mac_len = p->mac_len; - - skb_reserve(nskb, headroom); - __skb_put(nskb, skb_gro_offset(p)); - - skb_set_mac_header(nskb, skb_mac_header(p) - p->data); - skb_set_network_header(nskb, skb_network_offset(p)); - skb_set_transport_header(nskb, skb_transport_offset(p)); - - __skb_pull(p, skb_gro_offset(p)); - memcpy(skb_mac_header(nskb), skb_mac_header(p), - p->data - skb_mac_header(p)); - - skb_shinfo(nskb)->frag_list = p; - skb_shinfo(nskb)->gso_size = pinfo->gso_size; - pinfo->gso_size = 0; - skb_header_release(p); - NAPI_GRO_CB(nskb)->last = p; - - nskb->data_len += p->len; - nskb->truesize += p->truesize; - nskb->len += p->len; - - *head = nskb; - nskb->next = p->next; - p->next = NULL; - - p = nskb; + } merge: delta_truesize = skb->truesize; @@ -3040,20 +3393,27 @@ __skb_pull(skb, offset); - NAPI_GRO_CB(p)->last->next = skb; + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; NAPI_GRO_CB(p)->last = skb; - skb_header_release(skb); + __skb_header_release(skb); + lp = p; done: NAPI_GRO_CB(p)->count++; p->data_len += len; p->truesize += delta_truesize; p->len += len; - + if (lp != p) { + lp->data_len += len; + lp->truesize += delta_truesize; + lp->len += len; + } NAPI_GRO_CB(skb)->same_flow = 1; return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive); void __init skb_init(void) { @@ -3063,11 +3423,14 @@ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", - (2*sizeof(struct sk_buff)) + - sizeof(atomic_t), + sizeof(struct sk_buff_fclones), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); +#if IS_ENABLED(CONFIG_AVM_GENERIC_CONNTRACK) + generic_ct_init(); +#endif + skb_recycler_init(); } /** @@ -3140,6 +3503,32 @@ return elt; } +/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given + * sglist without mark the sg which contain last skb data as the end. + * So the caller can mannipulate sg list as will when padding new data after + * the first call without calling sg_unmark_end to expend sg list. + * + * Scenario to use skb_to_sgvec_nomark: + * 1. sg_init_table + * 2. skb_to_sgvec_nomark(payload1) + * 3. skb_to_sgvec_nomark(payload2) + * + * This is equivalent to: + * 1. sg_init_table + * 2. skb_to_sgvec(payload1) + * 3. sg_unmark_end + * 4. skb_to_sgvec(payload2) + * + * When mapping mutilple payload conditionally, skb_to_sgvec_nomark + * is more preferable. + */ +int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, + int offset, int len) +{ + return __skb_to_sgvec(skb, sg, offset, len); +} +EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); + int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) { int nsg = __skb_to_sgvec(skb, sg, offset, len); @@ -3272,8 +3661,6 @@ */ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) { - int len = skb->len; - if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned int)sk->sk_rcvbuf) return -ENOMEM; @@ -3288,55 +3675,172 @@ skb_queue_tail(&sk->sk_error_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, len); + sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(sock_queue_err_skb); -void skb_tstamp_tx(struct sk_buff *orig_skb, - struct skb_shared_hwtstamps *hwtstamps) +struct sk_buff *sock_dequeue_err_skb(struct sock *sk) { - struct sock *sk = orig_skb->sk; - struct sock_exterr_skb *serr; - struct sk_buff *skb; - int err; + struct sk_buff_head *q = &sk->sk_error_queue; + struct sk_buff *skb, *skb_next; + unsigned long flags; + int err = 0; - if (!sk) - return; + spin_lock_irqsave(&q->lock, flags); + skb = __skb_dequeue(q); + if (skb && (skb_next = skb_peek(q))) + err = SKB_EXT_ERR(skb_next)->ee.ee_errno; + spin_unlock_irqrestore(&q->lock, flags); - if (hwtstamps) { - *skb_hwtstamps(orig_skb) = - *hwtstamps; - } else { - /* - * no hardware time stamps available, - * so keep the shared tx_flags and only - * store software time stamp - */ - orig_skb->tstamp = ktime_get_real(); + sk->sk_err = err; + if (err) + sk->sk_error_report(sk); + + return skb; +} +EXPORT_SYMBOL(sock_dequeue_err_skb); + +/** + * skb_clone_sk - create clone of skb, and take reference to socket + * @skb: the skb to clone + * + * This function creates a clone of a buffer that holds a reference on + * sk_refcnt. Buffers created via this function are meant to be + * returned using sock_queue_err_skb, or free via kfree_skb. + * + * When passing buffers allocated with this function to sock_queue_err_skb + * it is necessary to wrap the call with sock_hold/sock_put in order to + * prevent the socket from being released prior to being enqueued on + * the sk_error_queue. + */ +struct sk_buff *skb_clone_sk(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct sk_buff *clone; + + if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt)) + return NULL; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return NULL; } - skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!skb) - return; + clone->sk = sk; + clone->destructor = sock_efree; + + return clone; +} +EXPORT_SYMBOL(skb_clone_sk); + +static void __skb_complete_tx_timestamp(struct sk_buff *skb, + struct sock *sk, + int tstype) +{ + struct sock_exterr_skb *serr; + int err; serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + serr->ee.ee_info = tstype; + if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { + serr->ee.ee_data = skb_shinfo(skb)->tskey; + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) + serr->ee.ee_data -= sk->sk_tskey; + } err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); } + +static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) +{ + bool ret; + + if (likely(sysctl_tstamp_allow_data || tsonly)) + return true; + + read_lock_bh(&sk->sk_callback_lock); + ret = sk->sk_socket && sk->sk_socket->file && + file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); + read_unlock_bh(&sk->sk_callback_lock); + return ret; +} + +void skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = skb->sk; + + if (!skb_may_tx_timestamp(sk, false)) + return; + + /* Take a reference to prevent skb_orphan() from freeing the socket, + * but only if the socket refcount is not zero. + */ + if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) { + *skb_hwtstamps(skb) = *hwtstamps; + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); + sock_put(sk); + } +} +EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); + +void __skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, int tstype) +{ + struct sk_buff *skb; + bool tsonly; + + if (!sk) + return; + + tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + if (!skb_may_tx_timestamp(sk, tsonly)) + return; + + if (tsonly) + skb = alloc_skb(0, GFP_ATOMIC); + else + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + + if (tsonly) { + skb_shinfo(skb)->tx_flags = skb_shinfo(orig_skb)->tx_flags; + skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; + } + + if (hwtstamps) + *skb_hwtstamps(skb) = *hwtstamps; + else + skb->tstamp = ktime_get_real(); + + __skb_complete_tx_timestamp(skb, sk, tstype); +} +EXPORT_SYMBOL_GPL(__skb_tstamp_tx); + +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, + SCM_TSTAMP_SND); +} EXPORT_SYMBOL_GPL(skb_tstamp_tx); void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) { struct sock *sk = skb->sk; struct sock_exterr_skb *serr; - int err; + int err = 1; skb->wifi_acked_valid = 1; skb->wifi_acked = acked; @@ -3346,13 +3850,18 @@ serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; - err = sock_queue_err_skb(sk, skb); + /* Take a reference to prevent skb_orphan() from freeing the socket, + * but only if the socket refcount is not zero. + */ + if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) { + err = sock_queue_err_skb(sk, skb); + sock_put(sk); + } if (err) kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); - /** * skb_partial_csum_set - set up and verify partial csum values for packet * @skb: the skb to set @@ -3381,6 +3890,324 @@ } EXPORT_SYMBOL_GPL(skb_partial_csum_set); +static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, + unsigned int max) +{ + if (skb_headlen(skb) >= len) + return 0; + + /* If we need to pullup then pullup to the max, so we + * won't need to do it again. + */ + if (max > skb->len) + max = skb->len; + + if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) + return -ENOMEM; + + if (skb_headlen(skb) < len) + return -EPROTO; + + return 0; +} + +#define MAX_TCP_HDR_LEN (15 * 4) + +static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, + typeof(IPPROTO_IP) proto, + unsigned int off) +{ + switch (proto) { + int err; + + case IPPROTO_TCP: + err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), + off + MAX_TCP_HDR_LEN); + if (!err && !skb_partial_csum_set(skb, off, + offsetof(struct tcphdr, + check))) + err = -EPROTO; + return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; + + case IPPROTO_UDP: + err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), + off + sizeof(struct udphdr)); + if (!err && !skb_partial_csum_set(skb, off, + offsetof(struct udphdr, + check))) + err = -EPROTO; + return err ? ERR_PTR(err) : &udp_hdr(skb)->check; + } + + return ERR_PTR(-EPROTO); +} + +/* This value should be large enough to cover a tagged ethernet header plus + * maximally sized IP and TCP or UDP headers. + */ +#define MAX_IP_HDR_LEN 128 + +static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) +{ + unsigned int off; + bool fragment; + __sum16 *csum; + int err; + + fragment = false; + + err = skb_maybe_pull_tail(skb, + sizeof(struct iphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; + + if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF)) + fragment = true; + + off = ip_hdrlen(skb); + + err = -EPROTO; + + if (fragment) + goto out; + + csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); + if (IS_ERR(csum)) + return PTR_ERR(csum); + + if (recalculate) + *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - off, + ip_hdr(skb)->protocol, 0); + err = 0; + +out: + return err; +} + +/* This value should be large enough to cover a tagged ethernet header plus + * an IPv6 header, all options, and a maximal TCP or UDP header. + */ +#define MAX_IPV6_HDR_LEN 256 + +#define OPT_HDR(type, skb, off) \ + (type *)(skb_network_header(skb) + (off)) + +static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) +{ + int err; + u8 nexthdr; + unsigned int off; + unsigned int len; + bool fragment; + bool done; + __sum16 *csum; + + fragment = false; + done = false; + + off = sizeof(struct ipv6hdr); + + err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + nexthdr = ipv6_hdr(skb)->nexthdr; + + len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); + while (off <= len && !done) { + switch (nexthdr) { + case IPPROTO_DSTOPTS: + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: { + struct ipv6_opt_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct ipv6_opt_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); + nexthdr = hp->nexthdr; + off += ipv6_optlen(hp); + break; + } + case IPPROTO_AH: { + struct ip_auth_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct ip_auth_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct ip_auth_hdr, skb, off); + nexthdr = hp->nexthdr; + off += ipv6_authlen(hp); + break; + } + case IPPROTO_FRAGMENT: { + struct frag_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct frag_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct frag_hdr, skb, off); + + if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) + fragment = true; + + nexthdr = hp->nexthdr; + off += sizeof(struct frag_hdr); + break; + } + default: + done = true; + break; + } + } + + err = -EPROTO; + + if (!done || fragment) + goto out; + + csum = skb_checksum_setup_ip(skb, nexthdr, off); + if (IS_ERR(csum)) + return PTR_ERR(csum); + + if (recalculate) + *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - off, nexthdr, 0); + err = 0; + +out: + return err; +} + +/** + * skb_checksum_setup - set up partial checksum offset + * @skb: the skb to set up + * @recalculate: if true the pseudo-header checksum will be recalculated + */ +int skb_checksum_setup(struct sk_buff *skb, bool recalculate) +{ + int err; + + switch (skb->protocol) { + case htons(ETH_P_IP): + err = skb_checksum_setup_ipv4(skb, recalculate); + break; + + case htons(ETH_P_IPV6): + err = skb_checksum_setup_ipv6(skb, recalculate); + break; + + default: + err = -EPROTO; + break; + } + + return err; +} +EXPORT_SYMBOL(skb_checksum_setup); + +/** + * skb_checksum_maybe_trim - maybe trims the given skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * + * Checks whether the given skb has data beyond the given transport length. + * If so, returns a cloned skb trimmed to this transport length. + * Otherwise returns the provided skb. Returns NULL in error cases + * (e.g. transport_len exceeds skb length or out-of-memory). + * + * Caller needs to set the skb transport header and free any returned skb if it + * differs from the provided skb. + */ +static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, + unsigned int transport_len) +{ + struct sk_buff *skb_chk; + unsigned int len = skb_transport_offset(skb) + transport_len; + int ret; + + if (skb->len < len) + return NULL; + else if (skb->len == len) + return skb; + + skb_chk = skb_clone(skb, GFP_ATOMIC); + if (!skb_chk) + return NULL; + + ret = pskb_trim_rcsum(skb_chk, len); + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} + +/** + * skb_checksum_trimmed - validate checksum of an skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * @skb_chkf: checksum function to use + * + * Applies the given checksum function skb_chkf to the provided skb. + * Returns a checked and maybe trimmed skb. Returns NULL on error. + * + * If the skb has data beyond the given transport length, then a + * trimmed & cloned skb is checked and returned. + * + * Caller needs to set the skb transport header and free any returned skb if it + * differs from the provided skb. + */ +struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, + unsigned int transport_len, + __sum16(*skb_chkf)(struct sk_buff *skb)) +{ + struct sk_buff *skb_chk; + unsigned int offset = skb_transport_offset(skb); + __sum16 ret; + + skb_chk = skb_checksum_maybe_trim(skb, transport_len); + if (!skb_chk) + goto err; + + if (!pskb_may_pull(skb_chk, offset)) + goto err; + + skb_pull_rcsum(skb_chk, offset); + ret = skb_chkf(skb_chk); + skb_push_rcsum(skb_chk, offset); + + if (ret) + goto err; + + return skb_chk; + +err: + if (skb_chk && skb_chk != skb) + kfree_skb(skb_chk); + + return NULL; + +} +EXPORT_SYMBOL(skb_checksum_trimmed); + void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", @@ -3392,6 +4219,7 @@ { if (head_stolen) { skb_release_head_state(skb); + skbuff_debugobj_deactivate(skb); kmem_cache_free(skbuff_head_cache, skb); } else { __kfree_skb(skb); @@ -3417,7 +4245,8 @@ return false; if (len <= skb_tailroom(to)) { - BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); + if (len) + BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); *delta_truesize = 0; return true; } @@ -3478,6 +4307,41 @@ EXPORT_SYMBOL(skb_try_coalesce); /** + * skb_scrub_packet - scrub an skb + * + * @skb: buffer to clean + * @xnet: packet is crossing netns + * + * skb_scrub_packet can be used after encapsulating or decapsulting a packet + * into/from a tunnel. Some information have to be cleared during these + * operations. + * skb_scrub_packet can also be used to clean a skb before injecting it in + * another namespace (@xnet == true). We have to clear all information in the + * skb that could impact namespace isolation. + */ +void skb_scrub_packet(struct sk_buff *skb, bool xnet) +{ + skb->tstamp.tv64 = 0; + skb->pkt_type = PACKET_HOST; + skb->skb_iif = 0; + skb->ignore_df = 0; + skb_dst_drop(skb); + skb_sender_cpu_clear(skb); + secpath_reset(skb); + /* TMA/MQU 20170411: Is this the right thing for namespace + * changes? We think so. See JZ-30001. */ + nf_reset_no_generic_ct(skb); + nf_reset_trace(skb); + + if (!xnet) + return; + + skb_orphan(skb); + skb->mark = 0; +} +EXPORT_SYMBOL_GPL(skb_scrub_packet); + +/** * skb_gso_transport_seglen - Return length of individual segments of a gso packet * * @skb: GSO skb @@ -3490,14 +4354,260 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); + unsigned int thlen = 0; - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - return tcp_hdrlen(skb) + shinfo->gso_size; - + if (skb->encapsulation) { + thlen = skb_inner_transport_header(skb) - + skb_transport_header(skb); + + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + thlen += inner_tcp_hdrlen(skb); + } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + thlen = tcp_hdrlen(skb); + } /* UFO sets gso_size to the size of the fragmentation * payload, i.e. the size of the L4 (UDP) header is already * accounted for. */ - return shinfo->gso_size; + return thlen + shinfo->gso_size; } EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); + +static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) +{ + if (skb_cow(skb, skb_headroom(skb)) < 0) { + kfree_skb(skb); + return NULL; + } + + memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN, + 2 * ETH_ALEN); + skb->mac_header += VLAN_HLEN; + return skb; +} + +struct sk_buff *skb_vlan_untag(struct sk_buff *skb) +{ + struct vlan_hdr *vhdr; + u16 vlan_tci; + + if (unlikely(skb_vlan_tag_present(skb))) { + /* vlan_tci is already set-up so leave this for another time */ + return skb; + } + + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + goto err_free; + + if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) + goto err_free; + + vhdr = (struct vlan_hdr *)skb->data; + vlan_tci = ntohs(vhdr->h_vlan_TCI); + __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); + + skb_pull_rcsum(skb, VLAN_HLEN); + vlan_set_encap_proto(skb, vhdr); + + skb = skb_reorder_vlan_header(skb); + if (unlikely(!skb)) + goto err_free; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_reset_mac_len(skb); + + return skb; + +err_free: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(skb_vlan_untag); + +int skb_ensure_writable(struct sk_buff *skb, int write_len) +{ + if (!pskb_may_pull(skb, write_len)) + return -ENOMEM; + + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) + return 0; + + return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_ensure_writable); + +/* remove VLAN header from packet and update csum accordingly. */ +static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +{ + struct vlan_hdr *vhdr; + unsigned int offset = skb->data - skb_mac_header(skb); + int err; + + __skb_push(skb, offset); + err = skb_ensure_writable(skb, VLAN_ETH_HLEN); + if (unlikely(err)) + goto pull; + + skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); + + vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); + *vlan_tci = ntohs(vhdr->h_vlan_TCI); + + memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); + __skb_pull(skb, VLAN_HLEN); + + vlan_set_encap_proto(skb, vhdr); + skb->mac_header += VLAN_HLEN; + + if (skb_network_offset(skb) < ETH_HLEN) + skb_set_network_header(skb, ETH_HLEN); + + skb_reset_mac_len(skb); +pull: + __skb_pull(skb, offset); + + return err; +} + +int skb_vlan_pop(struct sk_buff *skb) +{ + u16 vlan_tci; + __be16 vlan_proto; + int err; + + if (likely(skb_vlan_tag_present(skb))) { + skb->vlan_tci = 0; + } else { + if (unlikely((skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD)) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + err = __skb_vlan_pop(skb, &vlan_tci); + if (err) + return err; + } + /* move next vlan tag to hw accel tag */ + if (likely((skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD)) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + vlan_proto = skb->protocol; + err = __skb_vlan_pop(skb, &vlan_tci); + if (unlikely(err)) + return err; + + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + return 0; +} +EXPORT_SYMBOL(skb_vlan_pop); + +int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) +{ + if (skb_vlan_tag_present(skb)) { + unsigned int offset = skb->data - skb_mac_header(skb); + int err; + + /* __vlan_insert_tag expect skb->data pointing to mac header. + * So change skb->data before calling it and change back to + * original position later + */ + __skb_push(skb, offset); + err = __vlan_insert_tag(skb, skb->vlan_proto, + skb_vlan_tag_get(skb)); + if (err) { + __skb_pull(skb, offset); + return err; + } + + skb->protocol = skb->vlan_proto; + skb->mac_len += VLAN_HLEN; + + skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); + __skb_pull(skb, offset); + } + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + return 0; +} +EXPORT_SYMBOL(skb_vlan_push); + +/** + * alloc_skb_with_frags - allocate skb with page frags + * + * @header_len: size of linear part + * @data_len: needed length in frags + * @max_page_order: max page order desired. + * @errcode: pointer to error code if any + * @gfp_mask: allocation mask + * + * This can be used to allocate a paged skb, given a maximal order for frags. + */ +struct sk_buff *alloc_skb_with_frags(unsigned long header_len, + unsigned long data_len, + int max_page_order, + int *errcode, + gfp_t gfp_mask) +{ + int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + unsigned long chunk; + struct sk_buff *skb; + struct page *page; + gfp_t gfp_head; + int i; + + *errcode = -EMSGSIZE; + /* Note this test could be relaxed, if we succeed to allocate + * high order pages... + */ + if (npages > MAX_SKB_FRAGS) + return NULL; + + gfp_head = gfp_mask; + if (gfp_head & __GFP_DIRECT_RECLAIM) + gfp_head |= __GFP_REPEAT; + + *errcode = -ENOBUFS; + skb = alloc_skb(header_len, gfp_head); + if (!skb) + return NULL; + + skb->truesize += npages << PAGE_SHIFT; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | + __GFP_NOWARN | + __GFP_NORETRY, + order); + if (page) + goto fill_page; + /* Do not retry other high order allocations */ + order = 1; + max_page_order = 0; + } + order--; + } + page = alloc_page(gfp_mask); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; + } + return skb; + +failure: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(alloc_skb_with_frags);