--- zzzz-none-000/linux-3.10.107/net/ipv4/inet_hashtables.c	2017-06-27 09:49:32.000000000 +0000
+++ scorpion-7490-727/linux-3.10.107/net/ipv4/inet_hashtables.c	2021-02-04 17:41:59.000000000 +0000
@@ -18,12 +18,42 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/wait.h>
+#include <linux/vmalloc.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
 #include <net/secure_seq.h>
 #include <net/ip.h>
 
+static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
+			const __u16 lport, const __be32 faddr,
+			const __be16 fport)
+{
+	static u32 inet_ehash_secret __read_mostly;
+
+	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
+
+	return __inet_ehashfn(laddr, lport, faddr, fport,
+			      inet_ehash_secret + net_hash_mix(net));
+}
+
+/* This function handles inet_sock, but also timewait and request sockets
+ * for IPv4/IPv6.
+ */
+u32 sk_ehashfn(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6 &&
+	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+		return inet6_ehashfn(sock_net(sk),
+				     &sk->sk_v6_rcv_saddr, sk->sk_num,
+				     &sk->sk_v6_daddr, sk->sk_dport);
+#endif
+	return inet_ehashfn(sock_net(sk),
+			    sk->sk_rcv_saddr, sk->sk_num,
+			    sk->sk_daddr, sk->sk_dport);
+}
+
 /*
  * Allocate and initialize a new local port bind bucket.
  * The bindhash mutex for snum's hash chain must be held here.
  */
@@ -35,8 +65,8 @@
 {
 	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
 
-	if (tb != NULL) {
-		write_pnet(&tb->ib_net, hold_net(net));
+	if (tb) {
+		write_pnet(&tb->ib_net, net);
 		tb->port = snum;
 		tb->fastreuse = 0;
 		tb->fastreuseport = 0;
@@ -54,7 +84,6 @@
 {
 	if (hlist_empty(&tb->owners)) {
 		__hlist_del(&tb->node);
-		release_net(ib_net(tb));
 		kmem_cache_free(cachep, tb);
 	}
 }
@@ -62,10 +91,6 @@
 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 		    const unsigned short snum)
 {
-	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-
-	atomic_inc(&hashinfo->bsockets);
-
 	inet_sk(sk)->inet_num = snum;
 	sk_add_bind_node(sk, &tb->owners);
 	tb->num_owners++;
@@ -83,8 +108,6 @@
 	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
 	struct inet_bind_bucket *tb;
 
-	atomic_dec(&hashinfo->bsockets);
-
 	spin_lock(&head->lock);
 	tb = inet_csk(sk)->icsk_bind_hash;
 	__sk_del_bind_node(sk);
@@ -103,7 +126,7 @@
 }
 EXPORT_SYMBOL(inet_put_port);
 
-int __inet_inherit_port(struct sock *sk, struct sock *child)
+int __inet_inherit_port(const struct sock *sk, struct sock *child)
 {
 	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
 	unsigned short port = inet_sk(child)->inet_num;
@@ -114,6 +137,10 @@
 
 	spin_lock(&head->lock);
 	tb = inet_csk(sk)->icsk_bind_hash;
+	if (unlikely(!tb)) {
+		spin_unlock(&head->lock);
+		return -ENOENT;
+	}
 	if (tb->port != port) {
 		/* NOTE: using tproxy and redirecting skbs to a proxy
 		 * on a different listener port breaks the assumption
@@ -162,6 +189,8 @@
 				return -1;
 			score += 4;
 		}
+		if (sk->sk_incoming_cpu == raw_smp_processor_id())
+			score++;
 	}
 	return score;
 }
@@ -204,7 +233,7 @@
 			}
 		} else if (score == hiscore && reuseport) {
 			matches++;
-			if (((u64)phash * matches) >> 32 == 0)
+			if (reciprocal_scale(phash, matches) == 0)
 				result = sk;
 			phash = next_pseudo_random32(phash);
 		}
@@ -230,13 +259,34 @@
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
 
+/* All sockets share common refcount, but have different destructors */
+void sock_gen_put(struct sock *sk)
+{
+	if (!atomic_dec_and_test(&sk->sk_refcnt))
+		return;
+
+	if (sk->sk_state == TCP_TIME_WAIT)
+		inet_twsk_free(inet_twsk(sk));
+	else if (sk->sk_state == TCP_NEW_SYN_RECV)
+		reqsk_free(inet_reqsk(sk));
+	else
+		sk_free(sk);
+}
+EXPORT_SYMBOL_GPL(sock_gen_put);
+
+void sock_edemux(struct sk_buff *skb)
+{
+	sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_edemux);
+
 struct sock *__inet_lookup_established(struct net *net,
 				  struct inet_hashinfo *hashinfo,
 				  const __be32 saddr, const __be16 sport,
 				  const __be32 daddr, const u16 hnum,
 				  const int dif)
 {
-	INET_ADDR_COOKIE(acookie, saddr, daddr)
+	INET_ADDR_COOKIE(acookie, saddr, daddr);
 	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
 	struct sock *sk;
 	const struct hlist_nulls_node *node;
@@ -255,13 +305,13 @@
 		if (likely(INET_MATCH(sk, net, acookie,
 				      saddr, daddr, ports, dif))) {
 			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
-				goto begintw;
+				goto out;
 			if (unlikely(!INET_MATCH(sk, net, acookie,
 						 saddr, daddr, ports, dif))) {
-				sock_put(sk);
+				sock_gen_put(sk);
 				goto begin;
 			}
-			goto out;
+			goto found;
 		}
 	}
 	/*
@@ -271,37 +321,9 @@
 	 */
 	if (get_nulls_value(node) != slot)
 		goto begin;
-
-begintw:
-	/* Must check for a TIME_WAIT'er before going to listener hash. */
-	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
-		if (sk->sk_hash != hash)
-			continue;
-		if (likely(INET_TW_MATCH(sk, net, acookie,
-					 saddr, daddr, ports,
-					 dif))) {
-			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
-				sk = NULL;
-				goto out;
-			}
-			if (unlikely(!INET_TW_MATCH(sk, net, acookie,
-						    saddr, daddr, ports,
-						    dif))) {
-				inet_twsk_put(inet_twsk(sk));
-				goto begintw;
-			}
-			goto out;
-		}
-	}
-	/*
-	 * if the nulls value we got at the end of this lookup is
-	 * not the expected one, we must restart lookup.
-	 * We probably met an item that was moved to another chain.
-	 */
-	if (get_nulls_value(node) != slot)
-		goto begintw;
-	sk = NULL;
 out:
+	sk = NULL;
+found:
 	rcu_read_unlock();
 	return sk;
 }
@@ -317,7 +339,7 @@
 	__be32 daddr = inet->inet_rcv_saddr;
 	__be32 saddr = inet->inet_daddr;
 	int dif = sk->sk_bound_dev_if;
-	INET_ADDR_COOKIE(acookie, saddr, daddr)
+	INET_ADDR_COOKIE(acookie, saddr, daddr);
 	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
 	struct net *net = sock_net(sk);
 	unsigned int hash = inet_ehashfn(net, daddr, lport,
@@ -326,60 +348,45 @@
 	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
 	const struct hlist_nulls_node *node;
-	struct inet_timewait_sock *tw;
-	int twrefcnt = 0;
+	struct inet_timewait_sock *tw = NULL;
 
 	spin_lock(lock);
 
-	/* Check TIME-WAIT sockets first. */
-	sk_nulls_for_each(sk2, node, &head->twchain) {
-		if (sk2->sk_hash != hash)
-			continue;
-
-		if (likely(INET_TW_MATCH(sk2, net, acookie,
-					 saddr, daddr, ports, dif))) {
-			tw = inet_twsk(sk2);
-			if (twsk_unique(sk, sk2, twp))
-				goto unique;
-			else
-				goto not_unique;
-		}
-	}
-	tw = NULL;
-
-	/* And established part... */
 	sk_nulls_for_each(sk2, node, &head->chain) {
 		if (sk2->sk_hash != hash)
 			continue;
+
 		if (likely(INET_MATCH(sk2, net, acookie,
-				      saddr, daddr, ports, dif)))
+				      saddr, daddr, ports, dif))) {
+			if (sk2->sk_state == TCP_TIME_WAIT) {
+				tw = inet_twsk(sk2);
+				if (twsk_unique(sk, sk2, twp))
+					break;
+			}
 			goto not_unique;
+		}
 	}
 
-unique:
 	/* Must record num and sport now. Otherwise we will see
-	 * in hash table socket with a funny identity. */
+	 * in hash table socket with a funny identity.
+	 */
 	inet->inet_num = lport;
 	inet->inet_sport = htons(lport);
 	sk->sk_hash = hash;
 	WARN_ON(!sk_unhashed(sk));
 	__sk_nulls_add_node_rcu(sk, &head->chain);
 	if (tw) {
-		twrefcnt = inet_twsk_unhash(tw);
+		sk_nulls_del_node_init_rcu((struct sock *)tw);
 		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
 	}
 	spin_unlock(lock);
-	if (twrefcnt)
-		inet_twsk_put(tw);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 
 	if (twp) {
 		*twp = tw;
 	} else if (tw) {
 		/* Silly. Should hash-dance instead... */
-		inet_twsk_deschedule(tw, death_row);
-
-		inet_twsk_put(tw);
+		inet_twsk_deschedule_put(tw);
 	}
 
 	return 0;
@@ -388,51 +395,69 @@
 	return -EADDRNOTAVAIL;
 }
 
-static inline u32 inet_sk_port_offset(const struct sock *sk)
+static u32 inet_sk_port_offset(const struct sock *sk)
 {
 	const struct inet_sock *inet = inet_sk(sk);
+
 	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
 					  inet->inet_daddr,
 					  inet->inet_dport);
 }
 
-int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
+/* insert a socket into ehash, and eventually remove another one
+ * (The another one can be a SYN_RECV or TIMEWAIT
+ */
+bool inet_ehash_insert(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 	struct hlist_nulls_head *list;
-	spinlock_t *lock;
 	struct inet_ehash_bucket *head;
-	int twrefcnt = 0;
+	spinlock_t *lock;
+	bool ret = true;
 
-	WARN_ON(!sk_unhashed(sk));
+	WARN_ON_ONCE(!sk_unhashed(sk));
 
-	sk->sk_hash = inet_sk_ehashfn(sk);
+	sk->sk_hash = sk_ehashfn(sk);
 	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
 	list = &head->chain;
 	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
 	spin_lock(lock);
-	__sk_nulls_add_node_rcu(sk, list);
-	if (tw) {
-		WARN_ON(sk->sk_hash != tw->tw_hash);
-		twrefcnt = inet_twsk_unhash(tw);
+	if (osk) {
+		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
+		ret = sk_nulls_del_node_init_rcu(osk);
 	}
+	if (ret)
+		__sk_nulls_add_node_rcu(sk, list);
 	spin_unlock(lock);
-	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-	return twrefcnt;
+	return ret;
 }
-EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
 
-static void __inet_hash(struct sock *sk)
+bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
+{
+	bool ok = inet_ehash_insert(sk, osk);
+
+	if (ok) {
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	} else {
+		percpu_counter_inc(sk->sk_prot->orphan_count);
+		sk->sk_state = TCP_CLOSE;
+		sock_set_flag(sk, SOCK_DEAD);
+		inet_csk_destroy_sock(sk);
+	}
+	return ok;
+}
+EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
+
+void __inet_hash(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 	struct inet_listen_hashbucket *ilb;
 
 	if (sk->sk_state != TCP_LISTEN) {
-		__inet_hash_nolisten(sk, NULL);
+		inet_ehash_nolisten(sk, osk);
 		return;
 	}
-
 	WARN_ON(!sk_unhashed(sk));
 	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
 
@@ -441,12 +466,13 @@
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	spin_unlock(&ilb->lock);
 }
+EXPORT_SYMBOL(__inet_hash);
 
 void inet_hash(struct sock *sk)
 {
 	if (sk->sk_state != TCP_CLOSE) {
 		local_bh_disable();
-		__inet_hash(sk);
+		__inet_hash(sk, NULL);
 		local_bh_enable();
 	}
 }
@@ -467,7 +493,7 @@
 	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
 	spin_lock_bh(lock);
-	done =__sk_nulls_del_node_init_rcu(sk);
+	done = __sk_nulls_del_node_init_rcu(sk);
 	if (done)
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	spin_unlock_bh(lock);
@@ -477,8 +503,7 @@
 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		struct sock *sk, u32 port_offset,
 		int (*check_established)(struct inet_timewait_death_row *,
-			struct sock *, __u16, struct inet_timewait_sock **),
-		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
+			struct sock *, __u16, struct inet_timewait_sock **))
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	const unsigned short snum = inet_sk(sk)->inet_num;
@@ -486,7 +511,6 @@
 	struct inet_bind_bucket *tb;
 	int ret;
 	struct net *net = sock_net(sk);
-	int twrefcnt = 1;
 
 	if (!snum) {
 		int i, remaining, low, high, port;
@@ -494,13 +518,19 @@
 		u32 offset = hint + port_offset;
 		struct inet_timewait_sock *tw = NULL;
 
-		inet_get_local_port_range(&low, &high);
+		inet_get_local_port_range(net, &low, &high);
 		remaining = (high - low) + 1;
 
+		/* By starting with offset being an even number,
+		 * we tend to leave about 50% of ports for other uses,
+		 * like bind(0).
+		 */
+		offset &= ~1;
+
 		local_bh_disable();
-		for (i = 1; i <= remaining; i++) {
+		for (i = 0; i < remaining; i++) {
 			port = low + (i + offset) % remaining;
-			if (inet_is_reserved_local_port(port))
+			if (inet_is_local_reserved_port(net, port))
 				continue;
 			head = &hinfo->bhash[inet_bhashfn(net, port,
 					hinfo->bhash_size)];
@@ -542,25 +572,20 @@
 		return -EADDRNOTAVAIL;
 
 ok:
-		hint += i;
+		hint += (i + 2) & ~1;
 
 		/* Head lock still held and bh's disabled */
 		inet_bind_hash(sk, tb, port);
 		if (sk_unhashed(sk)) {
 			inet_sk(sk)->inet_sport = htons(port);
-			twrefcnt += hash(sk, tw);
+			inet_ehash_nolisten(sk, (struct sock *)tw);
 		}
 		if (tw)
-			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
+			inet_twsk_bind_unhash(tw, hinfo);
 		spin_unlock(&head->lock);
 
-		if (tw) {
-			inet_twsk_deschedule(tw, death_row);
-			while (twrefcnt) {
-				twrefcnt--;
-				inet_twsk_put(tw);
-			}
-		}
+		if (tw)
+			inet_twsk_deschedule_put(tw);
 
 		ret = 0;
 		goto out;
@@ -570,7 +595,7 @@
 	tb = inet_csk(sk)->icsk_bind_hash;
 	spin_lock_bh(&head->lock);
 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		hash(sk, NULL);
+		inet_ehash_nolisten(sk, NULL);
 		spin_unlock_bh(&head->lock);
 		return 0;
 	} else {
@@ -589,8 +614,12 @@
 int inet_hash_connect(struct inet_timewait_death_row *death_row,
 		      struct sock *sk)
 {
-	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
-			__inet_check_established, __inet_hash_nolisten);
+	u32 port_offset = 0;
+
+	if (!inet_sk(sk)->inet_num)
+		port_offset = inet_sk_port_offset(sk);
+	return __inet_hash_connect(death_row, sk, port_offset,
+				   __inet_check_established);
}
 EXPORT_SYMBOL_GPL(inet_hash_connect);
 
@@ -598,7 +627,6 @@
 {
 	int i;
 
-	atomic_set(&h->bsockets, 0);
 	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
 		spin_lock_init(&h->listening_hash[i].lock);
 		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
@@ -606,3 +634,32 @@
 	}
 }
 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
+
+int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
+{
+	unsigned int locksz = sizeof(spinlock_t);
+	unsigned int i, nblocks = 1;
+
+	if (locksz != 0) {
+		/* allocate 2 cache lines or at least one spinlock per cpu */
+		nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
+		nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());
+
+		/* no more locks than number of hash buckets */
+		nblocks = min(nblocks, hashinfo->ehash_mask + 1);
+
+		hashinfo->ehash_locks = kmalloc_array(nblocks, locksz,
+						      GFP_KERNEL | __GFP_NOWARN);
+		if (!hashinfo->ehash_locks)
+			hashinfo->ehash_locks = vmalloc(nblocks * locksz);
+
+		if (!hashinfo->ehash_locks)
+			return -ENOMEM;
+
+		for (i = 0; i < nblocks; i++)
+			spin_lock_init(&hashinfo->ehash_locks[i]);
+	}
+	hashinfo->ehash_locks_mask = nblocks - 1;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
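Note on the SO_REUSEPORT hunk above: the open-coded ((u64)phash * matches) >> 32 is replaced by reciprocal_scale(), the kernel's division-free way of mapping a 32-bit hash onto [0, matches). A minimal userspace sketch of the same fixed-point trick follows; the test harness is illustrative and not part of the patch, but reciprocal_scale() itself matches the helper from <linux/kernel.h>.

/* Userspace model of the kernel's reciprocal_scale(): take the high 32
 * bits of the 64-bit product, i.e. (val * ep_ro) / 2^32, which lands
 * uniformly in [0, ep_ro) without a division.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

int main(void)
{
	/* With 'matches' equally good sockets seen so far, the current
	 * candidate is kept when the scaled hash is 0, i.e. with
	 * probability roughly 1/matches -- a streaming uniform pick.
	 */
	uint32_t phash = 0x9e3779b9; /* illustrative hash value */
	uint32_t matches;

	for (matches = 1; matches <= 4; matches++)
		printf("matches=%u keep=%d\n", matches,
		       reciprocal_scale(phash, matches) == 0);
	return 0;
}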
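Note on the __inet_hash_connect() hunks: the search now starts from an even offset (offset &= ~1) and advances the saved hint by an even amount (hint += (i + 2) & ~1), so connect() tends to consume even ephemeral ports first and leaves the odd ones mostly free for bind(0), as the in-patch comment says. A standalone sketch of the resulting probe order; the port range, hint, and per-destination offset below are made-up demo values.

/* Userspace model of the even-offset ephemeral port walk. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int low = 32768, high = 32775;		/* tiny demo range */
	int remaining = (high - low) + 1;
	uint32_t hint = 0, port_offset = 7;	/* made-up values */
	uint32_t offset = hint + port_offset;
	int i, port;

	offset &= ~1U;	/* start on an even slot */

	for (i = 0; i < remaining; i++) {
		port = low + (int)((i + offset) % remaining);
		printf("probe %d: port %d\n", i, port);
		if (i == 2) {			/* pretend the 3rd probe succeeded */
			hint += (i + 2) & ~1U;	/* keep the next start even */
			break;
		}
	}
	printf("next hint: %u\n", hint);
	return 0;
}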
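Note on inet_ehash_locks_alloc() at the end of the patch: the ehash spinlock array is sized to a power of two, roughly two cache lines' worth of locks per possible CPU, capped at one lock per hash bucket, and the resulting mask is stored in ehash_locks_mask. A quick userspace model of that arithmetic; the cache-line size, spinlock size, CPU count, and bucket count are example values, not taken from the patch.

/* Userspace model of the lock-array sizing in inet_ehash_locks_alloc(). */
#include <stdio.h>

static unsigned int roundup_pow_of_two(unsigned int n)
{
	unsigned int p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned int l1_cache_bytes = 64;	/* example L1_CACHE_BYTES */
	unsigned int locksz = 4;		/* example sizeof(spinlock_t) */
	unsigned int ncpus = 8;			/* example num_possible_cpus() */
	unsigned int ehash_mask = 65535;	/* example: 65536 buckets */
	unsigned int nblocks;

	nblocks = 2 * l1_cache_bytes / locksz;	/* two cache lines of locks */
	if (nblocks < 1)
		nblocks = 1;
	nblocks = roundup_pow_of_two(nblocks * ncpus);
	if (nblocks > ehash_mask + 1)		/* no more locks than buckets */
		nblocks = ehash_mask + 1;

	printf("nblocks = %u, ehash_locks_mask = %u\n", nblocks, nblocks - 1);
	return 0;
}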