--- zzzz-none-000/linux-3.10.107/net/ipv4/ip_fragment.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/net/ipv4/ip_fragment.c 2021-02-04 17:41:59.000000000 +0000 @@ -48,6 +48,7 @@ #include #include #include +#include /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c @@ -55,6 +56,7 @@ */ static int sysctl_ipfrag_max_dist __read_mostly = 64; +static const char ip_frag_cache_name[] = "ip4-frags"; struct ipfrag_skb_cb { @@ -74,23 +76,20 @@ __be16 id; u8 protocol; u8 ecn; /* RFC3168 support */ + u16 max_df_size; /* largest frag with DF set seen */ int iif; + int vif; /* L3 master device index */ unsigned int rid; struct inet_peer *peer; }; -static inline u8 ip4_frag_ecn(u8 tos) +static u8 ip4_frag_ecn(u8 tos) { return 1 << (tos & INET_ECN_MASK); } static struct inet_frags ip4_frags; -int ip_frag_nqueues(struct net *net) -{ - return net->ipv4.frags.nqueues; -} - int ip_frag_mem(struct net *net) { return sum_frag_mem_limit(&net->ipv4.frags); @@ -102,56 +101,61 @@ struct ip4_create_arg { struct iphdr *iph; u32 user; + int vif; }; static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) { + net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); return jhash_3words((__force u32)id << 16 | prot, (__force u32)saddr, (__force u32)daddr, - ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); + ip4_frags.rnd); } -static unsigned int ip4_hashfn(struct inet_frag_queue *q) +static unsigned int ip4_hashfn(const struct inet_frag_queue *q) { - struct ipq *ipq; + const struct ipq *ipq; ipq = container_of(q, struct ipq, q); return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); } -static bool ip4_frag_match(struct inet_frag_queue *q, void *a) +static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) { - struct ipq *qp; - struct ip4_create_arg *arg = a; + const struct ipq *qp; + const struct ip4_create_arg *arg = a; qp = container_of(q, struct ipq, q); return qp->id == arg->iph->id && qp->saddr == arg->iph->saddr && qp->daddr == arg->iph->daddr && qp->protocol == arg->iph->protocol && - qp->user == arg->user; + qp->user == arg->user && + qp->vif == arg->vif; } -static void ip4_frag_init(struct inet_frag_queue *q, void *a) +static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, frags); struct net *net = container_of(ipv4, struct net, ipv4); - struct ip4_create_arg *arg = a; + const struct ip4_create_arg *arg = a; qp->protocol = arg->iph->protocol; qp->id = arg->iph->id; qp->ecn = ip4_frag_ecn(arg->iph->tos); qp->saddr = arg->iph->saddr; qp->daddr = arg->iph->daddr; + qp->vif = arg->vif; qp->user = arg->user; qp->peer = sysctl_ipfrag_max_dist ? - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; + inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : + NULL; } -static __inline__ void ip4_frag_free(struct inet_frag_queue *q) +static void ip4_frag_free(struct inet_frag_queue *q) { struct ipq *qp; @@ -163,7 +167,7 @@ /* Destruction primitives. */ -static __inline__ void ipq_put(struct ipq *ipq) +static void ipq_put(struct ipq *ipq) { inet_frag_put(&ipq->q, &ip4_frags); } @@ -176,16 +180,13 @@ inet_frag_kill(&ipq->q, &ip4_frags); } -/* Memory limiting on fragments. Evictor trashes the oldest - * fragment queue until we are back under the threshold. - */ -static void ip_evictor(struct net *net) +static bool frag_expire_skip_icmp(u32 user) { - int evicted; - - evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false); - if (evicted) - IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); + return user == IP_DEFRAG_AF_PACKET || + ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN, + __IP_DEFRAG_CONNTRACK_IN_END) || + ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN, + __IP_DEFRAG_CONNTRACK_BRIDGE_IN); } /* @@ -201,19 +202,22 @@ spin_lock(&qp->q.lock); - if (qp->q.last_in & INET_FRAG_COMPLETE) + if (qp->q.flags & INET_FRAG_COMPLETE) goto out; ipq_kill(qp); - - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); - if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { + if (!inet_frag_evicting(&qp->q)) { struct sk_buff *head = qp->q.fragments; const struct iphdr *iph; int err; + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); + + if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) + goto out; + rcu_read_lock(); head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) @@ -226,16 +230,13 @@ if (err) goto out_rcu_unlock; - /* - * Only an end host needs to send an ICMP + /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ - if (qp->user == IP_DEFRAG_AF_PACKET || - (qp->user == IP_DEFRAG_CONNTRACK_IN && - skb_rtable(head)->rt_type != RTN_LOCAL)) + if (frag_expire_skip_icmp(qp->user) && + (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out_rcu_unlock; - /* Send an ICMP "Fragment Reassembly Timeout" message. */ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); out_rcu_unlock: @@ -249,7 +250,8 @@ /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ -static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) +static struct ipq *ip_find(struct net *net, struct iphdr *iph, + u32 user, int vif) { struct inet_frag_queue *q; struct ip4_create_arg arg; @@ -257,8 +259,8 @@ arg.iph = iph; arg.user = user; + arg.vif = vif; - read_lock(&ip4_frags.lock); hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); @@ -270,7 +272,7 @@ } /* Is the fragment too far ahead to be part of ipq? */ -static inline int ip_frag_too_far(struct ipq *qp) +static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; unsigned int max = sysctl_ipfrag_max_dist; @@ -315,9 +317,9 @@ kfree_skb(fp); fp = xp; } while (fp); - sub_frag_mem_limit(&qp->q, sum_truesize); + sub_frag_mem_limit(qp->q.net, sum_truesize); - qp->q.last_in = 0; + qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; qp->q.fragments = NULL; @@ -333,12 +335,13 @@ { struct sk_buff *prev, *next; struct net_device *dev; + unsigned int fragsize; int flags, offset; int ihl, end; int err = -ENOENT; u8 ecn; - if (qp->q.last_in & INET_FRAG_COMPLETE) + if (qp->q.flags & INET_FRAG_COMPLETE) goto err; if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && @@ -365,9 +368,9 @@ * or have different end, the segment is corrupted. */ if (end < qp->q.len || - ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) + ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len)) goto err; - qp->q.last_in |= INET_FRAG_LAST_IN; + qp->q.flags |= INET_FRAG_LAST_IN; qp->q.len = end; } else { if (end&7) { @@ -377,7 +380,7 @@ } if (end > qp->q.len) { /* Some bits beyond end -> corruption. */ - if (qp->q.last_in & INET_FRAG_LAST_IN) + if (qp->q.flags & INET_FRAG_LAST_IN) goto err; qp->q.len = end; } @@ -460,7 +463,7 @@ qp->q.fragments = next; qp->q.meat -= free_it->len; - sub_frag_mem_limit(&qp->q, free_it->truesize); + sub_frag_mem_limit(qp->q.net, free_it->truesize); kfree_skb(free_it); } } @@ -484,15 +487,20 @@ qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; - add_frag_mem_limit(&qp->q, skb->truesize); + add_frag_mem_limit(qp->q.net, skb->truesize); if (offset == 0) - qp->q.last_in |= INET_FRAG_FIRST_IN; + qp->q.flags |= INET_FRAG_FIRST_IN; + + fragsize = skb->len + ihl; + + if (fragsize > qp->q.max_size) + qp->q.max_size = fragsize; if (ip_hdr(skb)->frag_off & htons(IP_DF) && - skb->len + ihl > qp->q.max_size) - qp->q.max_size = skb->len + ihl; + fragsize > qp->max_df_size) + qp->max_df_size = fragsize; - if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) { unsigned long orefdst = skb->_skb_refdst; @@ -503,7 +511,6 @@ } skb_dst_drop(skb); - inet_frag_lru_move(&qp->q); return -EINPROGRESS; err: @@ -523,7 +530,6 @@ int len; int ihlen; int err; - int sum_truesize; u8 ecn; ipq_kill(qp); @@ -552,7 +558,7 @@ qp->q.fragments = head; } - WARN_ON(head == NULL); + WARN_ON(!head); WARN_ON(FRAG_CB(head)->offset != 0); /* Allocate a new buffer for the datagram. */ @@ -574,7 +580,8 @@ struct sk_buff *clone; int i, plen = 0; - if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) + clone = alloc_skb(0, GFP_ATOMIC); + if (!clone) goto out_nomem; clone->next = head->next; head->next = clone; @@ -587,47 +594,47 @@ head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - add_frag_mem_limit(&qp->q, clone->truesize); + add_frag_mem_limit(qp->q.net, clone->truesize); } + skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - sum_truesize = head->truesize; - for (fp = head->next; fp;) { - bool headstolen; - int delta; - struct sk_buff *next = fp->next; - - sum_truesize += fp->truesize; + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); - - if (skb_try_coalesce(head, fp, &headstolen, &delta)) { - kfree_skb_partial(fp, headstolen); - } else { - if (!skb_shinfo(head)->frag_list) - skb_shinfo(head)->frag_list = fp; - head->data_len += fp->len; - head->len += fp->len; - head->truesize += fp->truesize; - } - fp = next; + head->truesize += fp->truesize; } - sub_frag_mem_limit(&qp->q, sum_truesize); + sub_frag_mem_limit(qp->q.net, head->truesize); head->next = NULL; head->dev = dev; head->tstamp = qp->q.stamp; - IPCB(head)->frag_max_size = qp->q.max_size; + IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(head); - /* max_size != 0 implies at least one fragment had IP_DF set */ - iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0; iph->tot_len = htons(len); iph->tos |= ecn; + /* When we set IP_DF on a refragmented skb we must also force a + * call to ip_fragment to avoid forwarding a DF-skb of size s while + * original sender only sent fragments of size f (where f < s). + * + * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest + * frag seen to avoid sending tiny DF-fragments in case skb was built + * from one very small df-fragment and one large non-df frag. + */ + if (qp->max_df_size == qp->q.max_size) { + IPCB(head)->flags |= IPSKB_FRAG_PMTU; + iph->frag_off = htons(IP_DF); + } else { + iph->frag_off = 0; + } + ip_send_check(iph); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); @@ -636,8 +643,7 @@ return 0; out_nomem: - LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"), - qp); + net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp); err = -ENOMEM; goto out_fail; out_oversize: @@ -648,22 +654,18 @@ } /* Process an incoming IP datagram fragment. */ -int ip_defrag(struct sk_buff *skb, u32 user) +int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { + struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; + int vif = l3mdev_master_ifindex_rcu(dev); struct ipq *qp; - struct net *net; - net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); - - if (!net->ipv4.frags.high_thresh) - goto fail; - - /* Start by cleaning up the memory. */ - ip_evictor(net); + skb_orphan(skb); /* Lookup (or create) queue header */ - if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { + qp = ip_find(net, ip_hdr(skb), user, vif); + if (qp) { int ret; spin_lock(&qp->q.lock); @@ -675,14 +677,13 @@ return ret; } -fail: IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -ENOMEM; } EXPORT_SYMBOL(ip_defrag); -struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) +struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct iphdr iph; int netoff; @@ -711,9 +712,9 @@ if (pskb_trim_rcsum(skb, netoff + len)) return skb; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - if (ip_defrag(skb, user)) + if (ip_defrag(net, skb, user)) return NULL; - skb->rxhash = 0; + skb_clear_hash(skb); } } return skb; @@ -729,14 +730,17 @@ .data = &init_net.ipv4.frags.high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", .data = &init_net.ipv4.frags.low_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.ipv4.frags.high_thresh }, { .procname = "ipfrag_time", @@ -748,10 +752,12 @@ { } }; +/* secret interval has been deprecated */ +static int ip4_frags_secret_interval_unused; static struct ctl_table ip4_frags_ctl_table[] = { { .procname = "ipfrag_secret_interval", - .data = &ip4_frags.secret_interval, + .data = &ip4_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -775,11 +781,14 @@ table = ip4_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); - if (table == NULL) + if (!table) goto err_alloc; table[0].data = &net->ipv4.frags.high_thresh; + table[0].extra1 = &net->ipv4.frags.low_thresh; + table[0].extra2 = &init_net.ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; + table[1].extra2 = &net->ipv4.frags.high_thresh; table[2].data = &net->ipv4.frags.timeout; /* Don't export sysctls to unprivileged users */ @@ -788,7 +797,7 @@ } hdr = register_net_sysctl(net, "net/ipv4", table); - if (hdr == NULL) + if (!hdr) goto err_reg; net->ipv4.frags_hdr = hdr; @@ -810,27 +819,29 @@ kfree(table); } -static void ip4_frags_ctl_register(void) +static void __init ip4_frags_ctl_register(void) { register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } #else -static inline int ip4_frags_ns_ctl_register(struct net *net) +static int ip4_frags_ns_ctl_register(struct net *net) { return 0; } -static inline void ip4_frags_ns_ctl_unregister(struct net *net) +static void ip4_frags_ns_ctl_unregister(struct net *net) { } -static inline void ip4_frags_ctl_register(void) +static void __init ip4_frags_ctl_register(void) { } #endif static int __net_init ipv4_frags_init_net(struct net *net) { + int res; + /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -854,9 +865,13 @@ */ net->ipv4.frags.timeout = IP_FRAG_TIME; - inet_frags_init_net(&net->ipv4.frags); - - return ip4_frags_ns_ctl_register(net); + res = inet_frags_init_net(&net->ipv4.frags); + if (res) + return res; + res = ip4_frags_ns_ctl_register(net); + if (res) + inet_frags_uninit_net(&net->ipv4.frags); + return res; } static void __net_exit ipv4_frags_exit_net(struct net *net) @@ -881,6 +896,7 @@ ip4_frags.qsize = sizeof(struct ipq); ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; - ip4_frags.secret_interval = 10 * 60 * HZ; - inet_frags_init(&ip4_frags); + ip4_frags.frags_cache_name = ip_frag_cache_name; + if (inet_frags_init(&ip4_frags)) + panic("IP: failed to allocate ip4_frags cache\n"); }