--- zzzz-none-000/linux-3.10.107/net/ipv4/udp.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/net/ipv4/udp.c 2021-02-04 17:41:59.000000000 +0000 @@ -90,6 +90,7 @@ #include #include #include +#include #include #include #include @@ -103,12 +104,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include "udp_impl.h" struct udp_table udp_table __read_mostly; @@ -141,7 +144,7 @@ struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); - sk_nulls_for_each(sk2, node, &hslot->head) + sk_nulls_for_each(sk2, node, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (bitmap || udp_sk(sk2)->udp_port_hash == num) && @@ -149,14 +152,13 @@ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || - !uid_eq(uid, sock_i_uid(sk2))) && - (*saddr_comp)(sk, sk2)) { - if (bitmap) - __set_bit(udp_sk(sk2)->udp_port_hash >> log, - bitmap); - else + !uid_eq(uid, sock_i_uid(sk2))) && + saddr_comp(sk, sk2)) { + if (!bitmap) return 1; + __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); } + } return 0; } @@ -165,10 +167,10 @@ * can insert/delete a socket with local_port == num */ static int udp_lib_lport_inuse2(struct net *net, __u16 num, - struct udp_hslot *hslot2, - struct sock *sk, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2)) + struct udp_hslot *hslot2, + struct sock *sk, + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2)) { struct sock *sk2; struct hlist_nulls_node *node; @@ -176,7 +178,7 @@ int res = 0; spin_lock(&hslot2->lock); - udp_portaddr_for_each_entry(sk2, node, &hslot2->head) + udp_portaddr_for_each_entry(sk2, node, &hslot2->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (udp_sk(sk2)->udp_port_hash == num) && @@ -184,11 +186,12 @@ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || - !uid_eq(uid, sock_i_uid(sk2))) && - (*saddr_comp)(sk, sk2)) { + !uid_eq(uid, sock_i_uid(sk2))) && + saddr_comp(sk, sk2)) { res = 1; break; } + } spin_unlock(&hslot2->lock); return res; } @@ -203,8 +206,8 @@ * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2), + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2), unsigned int hash2_nulladdr) { struct udp_hslot *hslot, *hslot2; @@ -218,11 +221,11 @@ unsigned short first, last; DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); - inet_get_local_port_range(&low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rand = net_random(); - first = (((u64)rand * remaining) >> 32) + low; + rand = prandom_u32(); + first = reciprocal_scale(rand, remaining) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE */ @@ -244,7 +247,7 @@ do { if (low <= snum && snum <= high && !test_bit(snum >> udptable->log, bitmap) && - !inet_is_reserved_local_port(snum)) + !inet_is_local_reserved_port(net, snum)) goto found; snum += rand; } while (snum != first); @@ -315,8 +318,8 @@ inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); } -static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, - unsigned int port) +static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, + unsigned int port) { return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; } @@ -333,38 +336,46 @@ return udp_lib_get_port(sk, snum, 
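/*
 * [Illustrative aside, not part of the patch] The port-picking hunk above
 * swaps the open-coded multiply-shift for reciprocal_scale(). Both map a
 * uniform 32-bit random value into [0, ep_ro) without a division. A minimal
 * userspace sketch of the same arithmetic (the kernel helper lives in
 * include/linux/kernel.h):
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Equivalent of the kernel's reciprocal_scale(): scale val into [0, ep_ro). */
static uint32_t reciprocal_scale_demo(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

int main(void)
{
	uint32_t low = 32768, high = 60999;     /* typical local port range */
	uint32_t remaining = (high - low) + 1;
	uint32_t rand_val = (uint32_t)random(); /* stand-in for prandom_u32() */
	uint32_t first = reciprocal_scale_demo(rand_val, remaining) + low;

	printf("first candidate port: %u\n", first);
	return 0;
}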
ipv4_rcv_saddr_equal, hash2_nulladdr); } -static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, - unsigned short hnum, - __be16 sport, __be32 daddr, __be16 dport, int dif) -{ - int score = -1; - - if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && - !ipv6_only_sock(sk)) { - struct inet_sock *inet = inet_sk(sk); - - score = (sk->sk_family == PF_INET ? 2 : 1); - if (inet->inet_rcv_saddr) { - if (inet->inet_rcv_saddr != daddr) - return -1; - score += 4; - } - if (inet->inet_daddr) { - if (inet->inet_daddr != saddr) - return -1; - score += 4; - } - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score += 4; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score += 4; - } +static inline int compute_score(struct sock *sk, struct net *net, + __be32 saddr, unsigned short hnum, __be16 sport, + __be32 daddr, __be16 dport, int dif) +{ + int score; + struct inet_sock *inet; + + if (!net_eq(sock_net(sk), net) || + udp_sk(sk)->udp_port_hash != hnum || + ipv6_only_sock(sk)) + return -1; + + score = (sk->sk_family == PF_INET) ? 2 : 1; + inet = inet_sk(sk); + + if (inet->inet_rcv_saddr) { + if (inet->inet_rcv_saddr != daddr) + return -1; + score += 4; + } + + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) + return -1; + score += 4; } + + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score += 4; + } + + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 4; + } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; return score; } @@ -375,36 +386,56 @@ __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif) { - int score = -1; + int score; + struct inet_sock *inet; - if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) { - struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net) || + ipv6_only_sock(sk)) + return -1; - if (inet->inet_rcv_saddr != daddr) + inet = inet_sk(sk); + + if (inet->inet_rcv_saddr != daddr || + inet->inet_num != hnum) + return -1; + + score = (sk->sk_family == PF_INET) ? 2 : 1; + + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) return -1; - if (inet->inet_num != hnum) + score += 4; + } + + if (inet->inet_dport) { + if (inet->inet_dport != sport) return -1; + score += 4; + } - score = (sk->sk_family == PF_INET ? 
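/*
 * [Illustrative aside, not part of the patch] The rewritten compute_score()
 * and compute_score2() keep the original weights: 1 or 2 points for the
 * address family, +4 for each exact match on local address, remote address,
 * remote port and bound device, and (new in this patch) +1 when the
 * socket's last-used CPU is the CPU taking the packet. A self-contained
 * sketch of that scoring, assuming a simplified hypothetical socket struct:
 */
#include <stdbool.h>
#include <stdint.h>

struct demo_sock {                /* hypothetical stand-in for struct sock */
	uint32_t rcv_saddr, daddr;
	uint16_t dport;
	int bound_dev_if;
	int incoming_cpu;
	bool is_inet;                 /* PF_INET vs. dual-stack PF_INET6 */
};

static int demo_compute_score(const struct demo_sock *sk, uint32_t saddr,
			      uint16_t sport, uint32_t daddr, int dif, int cpu)
{
	int score = sk->is_inet ? 2 : 1;

	if (sk->rcv_saddr) {
		if (sk->rcv_saddr != daddr)
			return -1;            /* bound elsewhere: no match */
		score += 4;
	}
	if (sk->daddr) {
		if (sk->daddr != saddr)
			return -1;
		score += 4;
	}
	if (sk->dport) {
		if (sk->dport != sport)
			return -1;
		score += 4;
	}
	if (sk->bound_dev_if) {
		if (sk->bound_dev_if != dif)
			return -1;
		score += 4;
	}
	if (sk->incoming_cpu == cpu)
		score++;                      /* soft CPU-affinity tiebreaker */
	return score;
}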
2 : 1); - if (inet->inet_daddr) { - if (inet->inet_daddr != saddr) - return -1; - score += 4; - } - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score += 4; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score += 4; - } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 4; } + + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; + return score; } +static u32 udp_ehashfn(const struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) +{ + static u32 udp_ehash_secret __read_mostly; + + net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret)); + + return __inet_ehashfn(laddr, lport, faddr, fport, + udp_ehash_secret + net_hash_mix(net)); +} /* called with read_rcu_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, @@ -428,13 +459,13 @@ badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - hash = inet_ehashfn(net, daddr, hnum, - saddr, htons(sport)); + hash = udp_ehashfn(net, daddr, hnum, + saddr, sport); matches = 1; } } else if (score == badness && reuseport) { matches++; - if (((u64)hash * matches) >> 32 == 0) + if (reciprocal_scale(hash, matches) == 0) result = sk; hash = next_pseudo_random32(hash); } @@ -509,13 +540,13 @@ badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - hash = inet_ehashfn(net, daddr, hnum, - saddr, htons(sport)); + hash = udp_ehashfn(net, daddr, hnum, + saddr, sport); matches = 1; } } else if (score == badness && reuseport) { matches++; - if (((u64)hash * matches) >> 32 == 0) + if (reciprocal_scale(hash, matches) == 0) result = sk; hash = next_pseudo_random32(hash); } @@ -549,12 +580,15 @@ struct sock *sk; const struct iphdr *iph = ip_hdr(skb); - if (unlikely(sk = skb_steal_sock(skb))) - return sk; - else - return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, - iph->daddr, dport, inet_iif(skb), - udptable); + sk = __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + udptable); + +#ifdef CONFIG_AVM_PA + if (sk) + avm_pa_add_local_session(skb, sk); +#endif + return sk; } struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, @@ -564,34 +598,24 @@ } EXPORT_SYMBOL_GPL(udp4_lib_lookup); -static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, - __be16 loc_port, __be32 loc_addr, - __be16 rmt_port, __be32 rmt_addr, - int dif) +static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, + __be16 loc_port, __be32 loc_addr, + __be16 rmt_port, __be32 rmt_addr, + int dif, unsigned short hnum) { - struct hlist_nulls_node *node; - struct sock *s = sk; - unsigned short hnum = ntohs(loc_port); - - sk_nulls_for_each_from(s, node) { - struct inet_sock *inet = inet_sk(s); + struct inet_sock *inet = inet_sk(sk); - if (!net_eq(sock_net(s), net) || - udp_sk(s)->udp_port_hash != hnum || - (inet->inet_daddr && inet->inet_daddr != rmt_addr) || - (inet->inet_dport != rmt_port && inet->inet_dport) || - (inet->inet_rcv_saddr && - inet->inet_rcv_saddr != loc_addr) || - ipv6_only_sock(s) || - (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) - continue; - if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) - continue; - goto found; - } - s = NULL; -found: - return s; + if (!net_eq(sock_net(sk), net) || + udp_sk(sk)->udp_port_hash != hnum || + (inet->inet_daddr && inet->inet_daddr != rmt_addr) || + (inet->inet_dport != rmt_port && inet->inet_dport) || + (inet->inet_rcv_saddr && 
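/*
 * [Illustrative aside, not part of the patch] With SO_REUSEPORT, sockets
 * that tie on the best score share the load. The lookup loops above keep a
 * running count of ties and replace the current winner with probability
 * 1/matches - classic reservoir sampling driven by a per-flow hash, so one
 * flow always lands on the same socket. A userspace sketch of the selection
 * step (the xorshift below is only a stand-in for next_pseudo_random32()):
 */
#include <stddef.h>
#include <stdint.h>

static uint32_t demo_reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

static uint32_t demo_next_prandom(uint32_t x)
{
	x ^= x << 13; x ^= x >> 17; x ^= x << 5;
	return x;
}

/* Pick one index out of n equally-scored sockets for a given flow hash. */
static size_t demo_reuseport_select(uint32_t flow_hash, size_t n)
{
	size_t winner = 0;
	size_t matches = 1;
	uint32_t hash = flow_hash;

	for (size_t i = 1; i < n; i++) {
		matches++;
		if (demo_reciprocal_scale(hash, (uint32_t)matches) == 0)
			winner = i;    /* replace with probability 1/matches */
		hash = demo_next_prandom(hash);
	}
	return winner;
}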
inet->inet_rcv_saddr != loc_addr) || + ipv6_only_sock(sk) || + (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + return false; + if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif)) + return false; + return true; } /* @@ -619,7 +643,7 @@ sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, udptable); - if (sk == NULL) { + if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; /* No socket for error */ } @@ -657,7 +681,7 @@ break; case ICMP_REDIRECT: ipv4_sk_redirect(skb, sk); - break; + goto out; } /* @@ -703,16 +727,15 @@ * @src: source IP address * @dst: destination IP address */ -static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) +void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { struct udphdr *uh = udp_hdr(skb); - struct sk_buff *frags = skb_shinfo(skb)->frag_list; int offset = skb_transport_offset(skb); int len = skb->len - offset; int hlen = len; __wsum csum = 0; - if (!frags) { + if (!skb_has_frag_list(skb)) { /* * Only one fragment on the socket. */ @@ -721,15 +744,17 @@ uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); } else { + struct sk_buff *frags; + /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together */ - do { + skb_walk_frags(skb, frags) { csum = csum_add(csum, frags->csum); hlen -= frags->len; - } while ((frags = frags->next)); + } csum = skb_checksum(skb, offset, hlen, csum); skb->ip_summed = CHECKSUM_NONE; @@ -739,6 +764,44 @@ uh->check = CSUM_MANGLED_0; } } +EXPORT_SYMBOL_GPL(udp4_hwcsum); + +/* Function to set UDP checksum for an IPv4 UDP packet. This is intended + * for the simple case like when setting the checksum for a UDP tunnel. + */ +void udp_set_csum(bool nocheck, struct sk_buff *skb, + __be32 saddr, __be32 daddr, int len) +{ + struct udphdr *uh = udp_hdr(skb); + + if (nocheck) + uh->check = 0; + else if (skb_is_gso(skb)) + uh->check = ~udp_v4_check(len, saddr, daddr, 0); + else if (skb_dst(skb) && skb_dst(skb)->dev && + (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) { + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~udp_v4_check(len, saddr, daddr, 0); + } else { + __wsum csum; + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + uh->check = 0; + csum = skb_checksum(skb, 0, len, 0); + uh->check = udp_v4_check(len, saddr, daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } +} +EXPORT_SYMBOL(udp_set_csum); static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) { @@ -763,7 +826,7 @@ if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); - else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ + else if (sk->sk_no_check_tx && !skb_is_gso(skb)) { /* UDP csum off */ skb->ip_summed = CHECKSUM_NONE; goto send; @@ -820,8 +883,7 @@ } EXPORT_SYMBOL(udp_push_pending_frames); -int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) +int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); @@ -853,6 +915,8 @@ ipc.opt = NULL; ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; @@ -878,7 +942,7 @@ * Get and verify the address. 
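/*
 * [Illustrative aside, not part of the patch] udp_set_csum() above picks
 * between three strategies: no checksum (IPv4 tunnels may opt out),
 * CHECKSUM_PARTIAL when the device can finish the sum in hardware, and a
 * full software sum otherwise. The software path is the classic RFC 768
 * one's-complement sum over a pseudo header plus the UDP header and
 * payload. A portable sketch of that computation:
 */
#include <stddef.h>
#include <stdint.h>

/* Accumulate big-endian 16-bit words; an odd trailing byte is zero-padded. */
static uint32_t sum16(const uint8_t *p, size_t len, uint32_t sum)
{
	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;
	return sum;
}

/* saddr/daddr in network byte order; udp points at the UDP header with its
 * check field zeroed; len covers header plus data. The returned value is
 * stored back into the header in network byte order.
 */
static uint16_t demo_udp4_checksum(const uint8_t saddr[4],
				   const uint8_t daddr[4],
				   const uint8_t *udp, size_t len)
{
	uint32_t sum;
	uint16_t check;

	sum = sum16(saddr, 4, 0);          /* pseudo header: addresses */
	sum = sum16(daddr, 4, sum);
	sum += 17;                         /* zero byte + IPPROTO_UDP */
	sum += (uint32_t)len;              /* pseudo header: UDP length */
	sum = sum16(udp, len, sum);        /* header (check = 0) + payload */

	while (sum >> 16)                  /* fold carries, as csum_fold() */
		sum = (sum & 0xffff) + (sum >> 16);
	check = (uint16_t)~sum;
	return check ? check : 0xffff;     /* 0 is reserved: CSUM_MANGLED_0 */
}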
*/ if (msg->msg_name) { - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { @@ -907,9 +971,12 @@ sock_tx_timestamp(sk, &ipc.tx_flags); if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc); - if (err) + err = ip_cmsg_send(sock_net(sk), msg, &ipc, + sk->sk_family == AF_INET6); + if (unlikely(err)) { + kfree(ipc.opt); return err; + } if (ipc.opt) free = 1; connected = 0; @@ -936,7 +1003,7 @@ faddr = ipc.opt->opt.faddr; connected = 0; } - tos = RT_TOS(inet->tos); + tos = get_rttos(&ipc, inet); if (sock_flag(sk, SOCK_LOCALROUTE) || (msg->msg_flags & MSG_DONTROUTE) || (ipc.opt && ipc.opt->opt.is_strictroute)) { @@ -956,15 +1023,23 @@ if (connected) rt = (struct rtable *)sk_dst_check(sk, 0); - if (rt == NULL) { + if (!rt) { struct net *net = sock_net(sk); + __u8 flow_flags = inet_sk_flowi_flags(sk); fl4 = &fl4_stack; + flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP, + flow_flags, faddr, saddr, dport, inet->inet_sport); + if (!saddr && ipc.oif) { + err = l3mdev_get_saddr(net, ipc.oif, fl4); + if (err < 0) + goto out; + } + security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { @@ -993,7 +1068,7 @@ /* Lockless fast path for the non-corking case. */ if (!corkreq) { - skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen, + skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, msg->msg_flags); err = PTR_ERR(skb); @@ -1008,7 +1083,7 @@ /* ... which is an evident application bug. --ANK */ release_sock(sk); - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("cork app bug 2\n")); + net_dbg_ratelimited("cork app bug 2\n"); err = -EINVAL; goto out; } @@ -1024,7 +1099,7 @@ do_append_data: up->len += ulen; - err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen, + err = ip_append_data(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) @@ -1080,7 +1155,7 @@ * sendpage interface can't pass. * This will succeed only when the socket is connected. */ - ret = udp_sendmsg(NULL, sk, &msg, 0); + ret = udp_sendmsg(sk, &msg, 0); if (ret < 0) return ret; } @@ -1090,7 +1165,7 @@ if (unlikely(!up->pending)) { release_sock(sk); - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("udp cork app bug 3\n")); + net_dbg_ratelimited("udp cork app bug 3\n"); return -EINVAL; } @@ -1116,7 +1191,6 @@ return ret; } - /** * first_packet_length - return length of first packet in receive queue * @sk: socket @@ -1198,11 +1272,11 @@ * return it, otherwise we block. 
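/*
 * [Illustrative aside, not part of the patch] ipc.tos is initialised to -1
 * above and get_rttos() only overrides the socket-wide inet->tos when the
 * application supplied an IP_TOS ancillary message, so a per-packet TOS now
 * reaches route selection. A userspace sendmsg() sketch exercising that
 * path (demo_send_with_tos is a hypothetical helper name):
 */
#include <string.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t demo_send_with_tos(int fd, const void *buf, size_t len,
				  const struct sockaddr_in *dst, int tos)
{
	char cbuf[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_name = (void *)dst,
		.msg_namelen = sizeof(*dst),
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	/* Attach a per-datagram TOS; consumed by ip_cmsg_send() above. */
	cmsg->cmsg_level = IPPROTO_IP;
	cmsg->cmsg_type = IP_TOS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &tos, sizeof(int));

	return sendmsg(fd, &msg, 0);
}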
*/ -int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len) +int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; unsigned int ulen, copied; int peeked, off = 0; @@ -1240,12 +1314,11 @@ } if (checksum_valid || skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), + msg, copied); else { - err = skb_copy_and_csum_datagram_iovec(skb, - sizeof(struct udphdr), - msg->msg_iov); + err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), + msg); if (err == -EINVAL) goto csum_copy_err; @@ -1276,7 +1349,7 @@ *addr_len = sizeof(*sin); } if (inet->cmsg_flags) - ip_cmsg_recv(msg, skb); + ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr), off); err = copied; if (flags & MSG_TRUNC) @@ -1301,7 +1374,6 @@ goto try_again; } - int udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -1399,8 +1471,11 @@ { int rc; - if (inet_sk(sk)->inet_daddr) + if (inet_sk(sk)->inet_daddr) { sock_rps_save_rxhash(sk, skb); + sk_mark_napi_id(sk, skb); + sk_incoming_cpu_update(sk); + } rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) { @@ -1465,9 +1540,13 @@ /* if we're overly short, let UDP handle it */ encap_rcv = ACCESS_ONCE(up->encap_rcv); - if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { + if (encap_rcv) { int ret; + /* Verify checksum before giving to encap */ + if (udp_lib_checksum_complete(skb)) + goto csum_error; + ret = encap_rcv(sk, skb); if (ret <= 0) { UDP_INC_STATS_BH(sock_net(sk), @@ -1497,8 +1576,8 @@ * provided by the application." */ if (up->pcrlen == 0) { /* full coverage was set */ - LIMIT_NETDEBUG(KERN_WARNING "UDPLite: partial coverage %d while full coverage %d requested\n", - UDP_SKB_CB(skb)->cscov, skb->len); + net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n", + UDP_SKB_CB(skb)->cscov, skb->len); goto drop; } /* The next case involves violating the min. coverage requested @@ -1508,8 +1587,8 @@ * Therefore the above ...()->partial_cov statement is essential. */ if (UDP_SKB_CB(skb)->cscov < up->pcrlen) { - LIMIT_NETDEBUG(KERN_WARNING "UDPLite: coverage %d too small, need min %d\n", - UDP_SKB_CB(skb)->cscov, up->pcrlen); + net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n", + UDP_SKB_CB(skb)->cscov, up->pcrlen); goto drop; } } @@ -1518,13 +1597,15 @@ udp_lib_checksum_complete(skb)) goto csum_error; - - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) + if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + is_udplite); goto drop; + } rc = 0; - ipv4_pktinfo_prepare(skb); + ipv4_pktinfo_prepare(sk, skb); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) rc = __udp_queue_rcv_skb(sk, skb); @@ -1545,7 +1626,6 @@ return -1; } - static void flush_stack(struct sock **stack, unsigned int count, struct sk_buff *skb, unsigned int final) { @@ -1553,9 +1633,13 @@ struct sk_buff *skb1 = NULL; struct sock *sk; +#ifdef CONFIG_AVM_PA + avm_pa_add_local_session(skb, stack[0]); +#endif + for (i = 0; i < count; i++) { sk = stack[i]; - if (likely(skb1 == NULL)) + if (likely(!skb1)) skb1 = (i == final) ? 
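/*
 * [Illustrative aside, not part of the patch] The UDP-Lite diagnostics
 * above fire when a received packet's checksum coverage (cscov) is smaller
 * than what the receiver demanded via UDPLITE_RECV_CSCOV. Applications opt
 * in from userspace like this (RFC 3828 semantics; coverage values below 8
 * are rounded up to cover the header):
 */
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef IPPROTO_UDPLITE
#define IPPROTO_UDPLITE 136
#endif
#ifndef UDPLITE_SEND_CSCOV
#define UDPLITE_SEND_CSCOV 10   /* sender checksum coverage */
#endif
#ifndef UDPLITE_RECV_CSCOV
#define UDPLITE_RECV_CSCOV 11   /* minimum acceptable coverage on rx */
#endif

static int demo_udplite_socket(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDPLITE);
	int send_cov = 20;   /* checksum only the first 20 bytes */
	int recv_cov = 20;   /* drop packets covering less than 20 bytes */

	if (fd < 0)
		return -1;
	setsockopt(fd, IPPROTO_UDPLITE, UDPLITE_SEND_CSCOV,
		   &send_cov, sizeof(send_cov));
	setsockopt(fd, IPPROTO_UDPLITE, UDPLITE_RECV_CSCOV,
		   &recv_cov, sizeof(recv_cov));
	return fd;
}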
skb : skb_clone(skb, GFP_ATOMIC); if (!skb1) { @@ -1568,11 +1652,25 @@ if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) skb1 = NULL; + + sock_put(sk); } if (unlikely(skb1)) kfree_skb(skb1); } +/* For TCP sockets, sk_rx_dst is protected by socket lock + * For UDP, we use xchg() to guard against concurrent changes. + */ +static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +{ + struct dst_entry *old; + + dst_hold(dst); + old = xchg(&sk->sk_rx_dst, dst); + dst_release(old); +} + /* * Multicasts and broadcasts go to each listener. * @@ -1581,46 +1679,61 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, - struct udp_table *udptable) + struct udp_table *udptable, + int proto) { struct sock *sk, *stack[256 / sizeof(struct sock *)]; - struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); - int dif; - unsigned int i, count = 0; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(uh->dest); + struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); + int dif = skb->dev->ifindex; + unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); + unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + bool inner_flushed = false; + + if (use_hash2) { + hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & + udp_table.mask; + hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask; +start_lookup: + hslot = &udp_table.hash2[hash2]; + offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); + } spin_lock(&hslot->lock); - sk = sk_nulls_head(&hslot->head); - dif = skb->dev->ifindex; - sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); - while (sk) { - stack[count++] = sk; - sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, - daddr, uh->source, saddr, dif); - if (unlikely(count == ARRAY_SIZE(stack))) { - if (!sk) - break; - flush_stack(stack, count, skb, ~0); - count = 0; + sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { + if (__udp_is_mcast_sock(net, sk, + uh->dest, daddr, + uh->source, saddr, + dif, hnum)) { + if (unlikely(count == ARRAY_SIZE(stack))) { + flush_stack(stack, count, skb, ~0); + inner_flushed = true; + count = 0; + } + stack[count++] = sk; + sock_hold(sk); } } - /* - * before releasing chain lock, we must take a reference on sockets - */ - for (i = 0; i < count; i++) - sock_hold(stack[i]); spin_unlock(&hslot->lock); + /* Also lookup *:port if we are using hash2 and haven't done so yet. 
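/*
 * [Illustrative aside, not part of the patch] udp_sk_rx_dst_set() above
 * publishes a new cached route without taking the socket lock: hold a
 * reference on the new entry first, atomically exchange the pointer, then
 * drop the reference the old entry held. The same hold/xchg/release shape
 * in portable C11, with a hypothetical refcounted object standing in for
 * dst_entry:
 */
#include <stdatomic.h>
#include <stdlib.h>

struct demo_dst {
	atomic_int refcnt;
};

static void demo_dst_hold(struct demo_dst *d)
{
	atomic_fetch_add(&d->refcnt, 1);
}

static void demo_dst_release(struct demo_dst *d)
{
	if (d && atomic_fetch_sub(&d->refcnt, 1) == 1)
		free(d);                        /* last reference gone */
}

static void demo_rx_dst_set(_Atomic(struct demo_dst *) *slot,
			    struct demo_dst *dst)
{
	struct demo_dst *old;

	demo_dst_hold(dst);                     /* reference for the cache */
	old = atomic_exchange(slot, dst);       /* publish; fetch previous */
	demo_dst_release(old);                  /* drop the old cache ref */
}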
*/ + if (use_hash2 && hash2 != hash2_any) { + hash2 = hash2_any; + goto start_lookup; + } + /* * do the slow work with no lock held */ if (count) { flush_stack(stack, count, skb, count - 1); - - for (i = 0; i < count; i++) - sock_put(stack[i]); } else { - kfree_skb(skb); + if (!inner_flushed) + UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); + consume_skb(skb); } return 0; } @@ -1633,7 +1746,6 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) { - const struct iphdr *iph; int err; UDP_SKB_CB(skb)->partial_cov = 0; @@ -1645,22 +1757,8 @@ return err; } - iph = ip_hdr(skb); - if (uh->check == 0) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, - proto, skb->csum)) - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - if (!skb_csum_unnecessary(skb)) - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb->len, proto, 0); - /* Probably, we should checksum udp header (it should be in cache - * in any case) and data in tiny packets (< rx copybreak). - */ - - return 0; + return skb_checksum_init_zero_check(skb, proto, uh->check, + inet_compute_pseudo); } /* @@ -1701,14 +1799,37 @@ if (udp4_csum_init(skb, uh, proto)) goto csum_error; + sk = skb_steal_sock(skb); + if (sk) { + struct dst_entry *dst = skb_dst(skb); + int ret; + + if (unlikely(sk->sk_rx_dst != dst)) + udp_sk_rx_dst_set(sk, dst); + + ret = udp_queue_rcv_skb(sk, skb); + sock_put(sk); + /* a return value > 0 means to resubmit the input, but + * it wants the return to be -protocol, or 0 + */ + if (ret > 0) + return -ret; + return 0; + } + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(net, skb, uh, - saddr, daddr, udptable); + saddr, daddr, udptable, proto); sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); + if (sk) { + int ret; + + if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + inet_compute_pseudo); - if (sk != NULL) { - int ret = udp_queue_rcv_skb(sk, skb); + ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); /* a return value > 0 means to resubmit the input, but @@ -1738,11 +1859,11 @@ return 0; short_packet: - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", - proto == IPPROTO_UDPLITE ? "Lite" : "", - &saddr, ntohs(uh->source), - ulen, skb->len, - &daddr, ntohs(uh->dest)); + net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", + proto == IPPROTO_UDPLITE ? "Lite" : "", + &saddr, ntohs(uh->source), + ulen, skb->len, + &daddr, ntohs(uh->dest)); goto drop; csum_error: @@ -1750,10 +1871,10 @@ * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", - proto == IPPROTO_UDPLITE ? "Lite" : "", - &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), - ulen); + net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", + proto == IPPROTO_UDPLITE ? "Lite" : "", + &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), + ulen); UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); @@ -1761,6 +1882,164 @@ return 0; } +/* We can only early demux multicast if there is a single matching socket. 
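/*
 * [Illustrative aside, not part of the patch] When the primary port-only
 * chain is long (more than ten sockets), the multicast delivery above
 * switches to the secondary hash keyed on (local address, port) and must
 * then scan two buckets: one for the packet's destination address and one
 * for wildcard INADDR_ANY binds. A sketch of that double lookup over a
 * hypothetical bucket array (the mixer below only stands in for
 * udp4_portaddr_hash()):
 */
#include <stdint.h>

struct demo_hslot { int count; /* sockets would hang off here */ };

static uint32_t demo_portaddr_hash(uint32_t addr, uint16_t port)
{
	return (addr * 2654435761u) ^ port;    /* Fibonacci-style mixer */
}

static void demo_scan_bucket(struct demo_hslot *slot)
{
	/* ...match the sockets in this bucket against the packet... */
	(void)slot;
}

static void demo_mcast_deliver(struct demo_hslot *hash2, uint32_t mask,
			       uint32_t daddr, uint16_t hnum)
{
	uint32_t h_addr = demo_portaddr_hash(daddr, hnum) & mask;
	uint32_t h_any  = demo_portaddr_hash(0 /* INADDR_ANY */, hnum) & mask;

	demo_scan_bucket(&hash2[h_addr]);
	if (h_any != h_addr)                   /* avoid scanning twice */
		demo_scan_bucket(&hash2[h_any]);
}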
+ * If more than one socket found returns NULL + */ +static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, + __be16 loc_port, __be32 loc_addr, + __be16 rmt_port, __be32 rmt_addr, + int dif) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(loc_port); + unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask); + struct udp_hslot *hslot = &udp_table.hash[slot]; + + /* Do not bother scanning a too big list */ + if (hslot->count > 10) + return NULL; + + rcu_read_lock(); +begin: + count = 0; + result = NULL; + sk_nulls_for_each_rcu(sk, node, &hslot->head) { + if (__udp_is_mcast_sock(net, sk, + loc_port, loc_addr, + rmt_port, rmt_addr, + dif, hnum)) { + result = sk; + ++count; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto begin; + + if (result) { + if (count != 1 || + unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; + else if (unlikely(!__udp_is_mcast_sock(net, result, + loc_port, loc_addr, + rmt_port, rmt_addr, + dif, hnum))) { + sock_put(result); + result = NULL; + } + } + rcu_read_unlock(); + return result; +} + +/* For unicast we should only early demux connected sockets or we can + * break forwarding setups. The chains here can be long so only check + * if the first socket is an exact match and if not move on. + */ +static struct sock *__udp4_lib_demux_lookup(struct net *net, + __be16 loc_port, __be32 loc_addr, + __be16 rmt_port, __be32 rmt_addr, + int dif) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(loc_port); + unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); + unsigned int slot2 = hash2 & udp_table.mask; + struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; + INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); + const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); + + rcu_read_lock(); + result = NULL; + udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + if (INET_MATCH(sk, net, acookie, + rmt_addr, loc_addr, ports, dif)) + result = sk; + /* Only check first socket in chain */ + break; + } + + if (result) { + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; + else if (unlikely(!INET_MATCH(sk, net, acookie, + rmt_addr, loc_addr, + ports, dif))) { + sock_put(result); + result = NULL; + } + } + rcu_read_unlock(); + return result; +} + +void udp_v4_early_demux(struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + const struct iphdr *iph; + const struct udphdr *uh; + struct sock *sk; + struct dst_entry *dst; + int dif = skb->dev->ifindex; + int ours; + + /* validate the packet */ + if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) + return; + + iph = ip_hdr(skb); + uh = udp_hdr(skb); + + if (skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + + if (!in_dev) + return; + + /* we are supposed to accept bcast packets */ + if (skb->pkt_type == PACKET_MULTICAST) { + ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, + iph->protocol); + if (!ours) + return; + } + + sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, + uh->source, iph->saddr, dif); + } else if (skb->pkt_type == PACKET_HOST) { + sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, + uh->source, iph->saddr, dif); + } else 
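/*
 * [Illustrative aside, not part of the patch] Both early-demux lookups
 * below run under RCU only, so a candidate socket can be freed and recycled
 * while being inspected. The guard is atomic_inc_not_zero(): take a
 * reference only if the count is still non-zero, then re-check the match
 * keys and back off (sock_put) on any mismatch. The acquire step, stated in
 * portable C11:
 */
#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference only if the object is still live (refcnt > 0). */
static bool demo_inc_not_zero(atomic_int *refcnt)
{
	int old = atomic_load(refcnt);

	while (old != 0) {
		if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
			return true;   /* reference taken */
		/* old was reloaded by the failed CAS; retry */
	}
	return false;                  /* object already on its way out */
}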
{ + return; + } + + if (!sk) + return; + + skb->sk = sk; + skb->destructor = sock_efree; + dst = READ_ONCE(sk->sk_rx_dst); + + if (dst) + dst = dst_check(dst, 0); + if (dst) { + /* DST_NOCACHE can not be used without taking a reference */ + if (dst->flags & DST_NOCACHE) { + if (likely(atomic_inc_not_zero(&dst->__refcnt))) + skb_dst_set(skb, dst); + } else { + skb_dst_set_noref(skb, dst); + } + } +} + int udp_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP); @@ -1788,7 +2067,7 @@ int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); - int val; + int val, valbool; int err = 0; int is_udplite = IS_UDPLITE(sk); @@ -1798,6 +2077,8 @@ if (get_user(val, (int __user *)optval)) return -EFAULT; + valbool = val ? 1 : 0; + switch (optname) { case UDP_CORK: if (val != 0) { @@ -1805,7 +2086,7 @@ } else { up->corkflag = 0; lock_sock(sk); - (*push_pending_frames)(sk); + push_pending_frames(sk); release_sock(sk); } break; @@ -1827,6 +2108,14 @@ } break; + case UDP_NO_CHECK6_TX: + up->no_check6_tx = valbool; + break; + + case UDP_NO_CHECK6_RX: + up->no_check6_rx = valbool; + break; + /* * UDP-Lite's partial checksum coverage (RFC 3828). */ @@ -1909,6 +2198,14 @@ val = up->encap_type; break; + case UDP_NO_CHECK6_TX: + val = up->no_check6_tx; + break; + + case UDP_NO_CHECK6_RX: + val = up->no_check6_rx; + break; + /* The following two cannot be changed on UDP sockets, the return is * always 0 (which corresponds to the full checksum coverage of UDP). */ case UDPLITE_SEND_CSCOV: @@ -1966,6 +2263,8 @@ unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; + sock_rps_record_flow(sk); + /* Check for false positives due to checksum errors */ if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk)) @@ -2141,7 +2440,7 @@ /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, - int bucket, int *len) + int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; @@ -2150,7 +2449,7 @@ __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), @@ -2158,23 +2457,22 @@ from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops), len); + atomic_read(&sp->sk_drops)); } int udp4_seq_show(struct seq_file *seq, void *v) { + seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) - seq_printf(seq, "%-127s\n", - " sl local_address rem_address st tx_queue " + seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct udp_iter_state *state = seq->private; - int len; - udp4_format_sock(v, seq, state->bucket, &len); - seq_printf(seq, "%*s\n", 127 - len, ""); + udp4_format_sock(v, seq, state->bucket); } + seq_pad(seq, '\n'); return 0; } @@ -2268,6 +2566,16 @@ } } +u32 udp_flow_hashrnd(void) +{ + static u32 hashrnd __read_mostly; + + net_get_random_once(&hashrnd, sizeof(hashrnd)); + + return hashrnd; +} +EXPORT_SYMBOL(udp_flow_hashrnd); + void __init udp_init(void) { unsigned long limit; @@ -2282,135 +2590,3 @@ sysctl_udp_rmem_min = SK_MEM_QUANTUM; sysctl_udp_wmem_min = SK_MEM_QUANTUM; } - -int 
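/*
 * [Illustrative aside, not part of the patch] The new UDP_NO_CHECK6_TX and
 * UDP_NO_CHECK6_RX options added above let tunnel endpoints opt out of UDP
 * checksums over IPv6 (RFC 6935/6936). Userspace usage, with the option
 * values as believed to be defined in linux/udp.h (guarded here in case
 * the toolchain headers predate them):
 */
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef UDP_NO_CHECK6_TX
#define UDP_NO_CHECK6_TX 101   /* send zero-checksum UDP/IPv6 datagrams */
#endif
#ifndef UDP_NO_CHECK6_RX
#define UDP_NO_CHECK6_RX 102   /* accept zero-checksum UDP/IPv6 datagrams */
#endif

static int demo_udp6_nocheck_socket(void)
{
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
	int one = 1;

	if (fd < 0)
		return -1;
	setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_TX, &one, sizeof(one));
	setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_RX, &one, sizeof(one));
	return fd;
}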
udp4_ufo_send_check(struct sk_buff *skb) -{ - if (!pskb_may_pull(skb, sizeof(struct udphdr))) - return -EINVAL; - - if (likely(!skb->encapsulation)) { - const struct iphdr *iph; - struct udphdr *uh; - - iph = ip_hdr(skb); - uh = udp_hdr(skb); - - uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, - IPPROTO_UDP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; - } - return 0; -} - -static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - int mac_len = skb->mac_len; - int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - __be16 protocol = skb->protocol; - netdev_features_t enc_features; - int outer_hlen; - - if (unlikely(!pskb_may_pull(skb, tnl_hlen))) - goto out; - - skb->encapsulation = 0; - __skb_pull(skb, tnl_hlen); - skb_reset_mac_header(skb); - skb_set_network_header(skb, skb_inner_network_offset(skb)); - skb->mac_len = skb_inner_network_offset(skb); - skb->protocol = htons(ETH_P_TEB); - - /* segment inner packet. */ - enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); - segs = skb_mac_gso_segment(skb, enc_features); - if (!segs || IS_ERR(segs)) - goto out; - - outer_hlen = skb_tnl_header_len(skb); - skb = segs; - do { - struct udphdr *uh; - int udp_offset = outer_hlen - tnl_hlen; - - skb->mac_len = mac_len; - - skb_push(skb, outer_hlen); - skb_reset_mac_header(skb); - skb_set_network_header(skb, mac_len); - skb_set_transport_header(skb, udp_offset); - uh = udp_hdr(skb); - uh->len = htons(skb->len - udp_offset); - - /* csum segment if tunnel sets skb with csum. */ - if (unlikely(uh->check)) { - struct iphdr *iph = ip_hdr(skb); - - uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - udp_offset, - IPPROTO_UDP, 0); - uh->check = csum_fold(skb_checksum(skb, udp_offset, - skb->len - udp_offset, 0)); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - } - skb->ip_summed = CHECKSUM_NONE; - skb->protocol = protocol; - } while ((skb = skb->next)); -out: - return segs; -} - -struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - unsigned int mss; - mss = skb_shinfo(skb)->gso_size; - if (unlikely(skb->len <= mss)) - goto out; - - if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { - /* Packet is from an untrusted source, reset gso_segs. */ - int type = skb_shinfo(skb)->gso_type; - - if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | - SKB_GSO_UDP_TUNNEL | - SKB_GSO_GRE) || - !(type & (SKB_GSO_UDP)))) - goto out; - - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); - - segs = NULL; - goto out; - } - - /* Fragment the skb. IP headers of the fragments are updated in - * inet_gso_segment() - */ - if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) - segs = skb_udp_tunnel_segment(skb, features); - else { - int offset; - __wsum csum; - - /* Do software UFO. Complete and fill in the UDP checksum as - * HW cannot do checksum of UDP packets sent as multiple - * IP fragments. - */ - offset = skb_checksum_start_offset(skb); - csum = skb_checksum(skb, offset, skb->len - offset, 0); - offset += skb->csum_offset; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); - skb->ip_summed = CHECKSUM_NONE; - - segs = skb_segment(skb, features); - } -out: - return segs; -}
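/*
 * [Illustrative aside, not part of the patch] The block removed above is
 * the software UFO path (it moved into net/ipv4/udp_offload.c upstream):
 * an oversized UDP datagram from an untrusted source gets its segment
 * count recomputed as ceil(len / mss) before the stack fragments it. That
 * bound, as a standalone helper:
 */
#include <stddef.h>

/* Equivalent of DIV_ROUND_UP(skb->len, mss) in the removed code. */
static size_t demo_gso_segs(size_t len, size_t mss)
{
	return (len + mss - 1) / mss;
}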