--- zzzz-none-000/linux-3.10.107/net/ipv4/tcp.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/net/ipv4/tcp.c 2021-02-04 17:41:59.000000000 +0000 @@ -252,6 +252,7 @@ #include #include #include +#include #include #include #include @@ -274,22 +275,27 @@ #include #include #include -#include #include #include #include +#include +#include int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; int sysctl_tcp_min_tso_segs __read_mostly = 2; +int sysctl_tcp_autocorking __read_mostly = 1; + struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); +long sysctl_tcp_mem[3] __read_mostly; int sysctl_tcp_wmem[3] __read_mostly; int sysctl_tcp_rmem[3] __read_mostly; +EXPORT_SYMBOL(sysctl_tcp_mem); EXPORT_SYMBOL(sysctl_tcp_rmem); EXPORT_SYMBOL(sysctl_tcp_wmem); @@ -376,13 +382,14 @@ struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - skb_queue_head_init(&tp->out_of_order_queue); + __skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; - tp->mdev = TCP_TIMEOUT_INIT; + tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + tp->rtt_min[0].rtt = ~0U; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -397,10 +404,11 @@ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; + u64_stats_init(&tp->syncp); tp->reordering = sysctl_tcp_reordering; tcp_enable_early_retrans(tp); - icsk->icsk_ca_ops = &tcp_init_congestion_ops; + tcp_assign_congestion_control(sk); tp->tsoffset = 0; @@ -411,10 +419,6 @@ icsk->icsk_sync_mss = tcp_sync_mss; - /* Presumed zeroed, in order of appearance: - * cookie_in_always, cookie_out_never, - * s_data_constant, s_data_in, s_data_out - */ sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; @@ -425,6 +429,17 @@ } EXPORT_SYMBOL(tcp_init_sock); +static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb) +{ + if (sk->sk_tsflags) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + sock_tx_timestamp(sk, &shinfo->tx_flags); + if (shinfo->tx_flags & SKBTX_ANY_TSTAMP) + shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + } +} + /* * Wait for a TCP event. * @@ -437,9 +452,14 @@ unsigned int mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); + int state; + + sock_rps_record_flow(sk); sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == TCP_LISTEN) + + state = sk_state_load(sk); + if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events @@ -476,14 +496,14 @@ * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ - if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) + if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= POLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLIN | POLLRDNORM | POLLRDHUP; /* Connected or passive Fast Open socket? */ - if (sk->sk_state != TCP_SYN_SENT && - (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) { + if (state != TCP_SYN_SENT && + (state != TCP_SYN_RECV || tp->fastopen_rsk)) { int target = sock_rcvlowat(sk, 0, INT_MAX); if (tp->urg_seq == tp->copied_seq && @@ -491,25 +511,23 @@ tp->urg_data) target++; - /* Potential race condition. If read of tp below will - * escape above sk->sk_state, we can be illegally awaken - * in SYN_* states. */ if (tp->rcv_nxt - tp->copied_seq >= target) mask |= POLLIN | POLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - set_bit(SOCK_ASYNC_NOSPACE, - &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after * wspace test but before the flags are set, - * IO signal will be lost. + * IO signal will be lost. Memory barrier + * pairs with the input side. */ - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + smp_mb__after_atomic(); + if (sk_stream_is_writeable(sk)) mask |= POLLOUT | POLLWRNORM; } } else @@ -520,7 +538,7 @@ } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); - if (sk->sk_err) + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) mask |= POLLERR; return mask; @@ -595,7 +613,7 @@ return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } -static inline void skb_entail(struct sock *sk, struct sk_buff *skb) +static void skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -604,12 +622,14 @@ tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; tcb->sacked = 0; - skb_header_release(skb); + __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; + + tcp_slow_start_after_idle_check(sk); } static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) @@ -618,19 +638,58 @@ tp->snd_up = tp->write_seq; } -static inline void tcp_push(struct sock *sk, int flags, int mss_now, - int nonagle) +/* If a not yet filled skb is pushed, do not send it if + * we have data packets in Qdisc or NIC queues : + * Because TX completion will happen shortly, it gives a chance + * to coalesce future sendmsg() payload into this skb, without + * need for a timer, and with no latency trade off. + * As packets containing data payload have a bigger truesize + * than pure acks (dataless) packets, the last checks prevent + * autocorking if we only have an ACK in Qdisc/NIC queues, + * or if TX completion was delayed after we processed ACK packet. + */ +static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, + int size_goal) { - if (tcp_send_head(sk)) { - struct tcp_sock *tp = tcp_sk(sk); + return skb->len < size_goal && + sysctl_tcp_autocorking && + skb != tcp_write_queue_head(sk) && + atomic_read(&sk->sk_wmem_alloc) > skb->truesize; +} + +static void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle, int size_goal) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (!tcp_send_head(sk)) + return; + + skb = tcp_write_queue_tail(sk); + if (!(flags & MSG_MORE) || forced_push(tp)) + tcp_mark_push(tp, skb); + + tcp_mark_urg(tp, flags); - if (!(flags & MSG_MORE) || forced_push(tp)) - tcp_mark_push(tp, tcp_write_queue_tail(sk)); + if (tcp_should_autocork(sk, skb, size_goal)) { - tcp_mark_urg(tp, flags); - __tcp_push_pending_frames(sk, mss_now, - (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); + /* avoid atomic op if TSQ_THROTTLED bit is already set */ + if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); + set_bit(TSQ_THROTTLED, &tp->tsq_flags); + } + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED. + */ + if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize) + return; } + + if (flags & MSG_MORE) + nonagle = TCP_NAGLE_CORK; + + __tcp_push_pending_frames(sk, mss_now, nonagle); } static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, @@ -639,8 +698,9 @@ struct tcp_splice_state *tss = rd_desc->arg.data; int ret; - ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len), - tss->flags); + ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, + min(rd_desc->count, len), tss->flags, + skb_socket_splice); if (ret > 0) rd_desc->count -= ret; return ret; @@ -729,7 +789,7 @@ */ if (!skb_queue_empty(&sk->sk_receive_queue)) break; - sk_wait_data(sk, &timeo); + sk_wait_data(sk, &timeo, NULL); if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; @@ -759,16 +819,28 @@ } EXPORT_SYMBOL(tcp_splice_read); -struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, + bool force_schedule) { struct sk_buff *skb; /* The TCP header must be at least 32-bit aligned. */ size = ALIGN(size, 4); + if (unlikely(tcp_under_memory_pressure(sk))) + sk_mem_reclaim_partial(sk); + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); - if (skb) { - if (sk_wmem_schedule(sk, skb->truesize)) { + if (likely(skb)) { + bool mem_scheduled; + + if (force_schedule) { + mem_scheduled = true; + sk_forced_mem_schedule(sk, skb->truesize); + } else { + mem_scheduled = sk_wmem_schedule(sk, skb->truesize); + } + if (likely(mem_scheduled)) { skb_reserve(skb, sk->sk_prot->max_header); /* * Make sure that we have exactly size bytes @@ -789,47 +861,25 @@ int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); - u32 xmit_size_goal, old_size_goal; + u32 new_size_goal, size_goal; - xmit_size_goal = mss_now; + if (!large_allowed || !sk_can_gso(sk)) + return mss_now; - if (large_allowed && sk_can_gso(sk)) { - u32 gso_size, hlen; - - /* Maybe we should/could use sk->sk_prot->max_header here ? */ - hlen = inet_csk(sk)->icsk_af_ops->net_header_len + - inet_csk(sk)->icsk_ext_hdr_len + - tp->tcp_header_len; - - /* Goal is to send at least one packet per ms, - * not one big TSO packet every 100 ms. - * This preserves ACK clocking and is consistent - * with tcp_tso_should_defer() heuristic. - */ - gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); - gso_size = max_t(u32, gso_size, - sysctl_tcp_min_tso_segs * mss_now); + /* Note : tcp_tso_autosize() will eventually split this later */ + new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER; + new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); - xmit_size_goal = min_t(u32, gso_size, - sk->sk_gso_max_size - 1 - hlen); - - xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); - - /* We try hard to avoid divides here */ - old_size_goal = tp->xmit_size_goal_segs * mss_now; - - if (likely(old_size_goal <= xmit_size_goal && - old_size_goal + mss_now > xmit_size_goal)) { - xmit_size_goal = old_size_goal; - } else { - tp->xmit_size_goal_segs = - min_t(u16, xmit_size_goal / mss_now, - sk->sk_gso_max_segs); - xmit_size_goal = tp->xmit_size_goal_segs * mss_now; - } + /* We try hard to avoid divides here */ + size_goal = tp->gso_segs * mss_now; + if (unlikely(new_size_goal < size_goal || + new_size_goal >= size_goal + mss_now)) { + tp->gso_segs = min_t(u16, new_size_goal / mss_now, + sk->sk_gso_max_segs); + size_goal = tp->gso_segs * mss_now; } - return max(xmit_size_goal, mss_now); + return max(size_goal, mss_now); } static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) @@ -857,11 +907,12 @@ */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) goto out_err; } - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; @@ -880,7 +931,8 @@ if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); if (!skb) goto wait_for_memory; @@ -893,7 +945,7 @@ i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= MAX_SKB_FRAGS) { + if (!can_coalesce && i >= sysctl_max_skb_frags) { tcp_mark_push(tp, skb); goto new_segment; } @@ -916,15 +968,18 @@ skb->ip_summed = CHECKSUM_PARTIAL; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; + tcp_skb_pcount_set(skb, 0); if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; copied += copy; offset += copy; - if (!(size -= copy)) + size -= copy; + if (!size) { + tcp_tx_timestamp(sk, skb); goto out; + } if (skb->len < size_goal || (flags & MSG_OOB)) continue; @@ -939,9 +994,11 @@ wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + tcp_push(sk, flags & ~MSG_MORE, mss_now, + TCP_NAGLE_PUSH, size_goal); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -949,13 +1006,16 @@ out: if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) - tcp_push(sk, flags, mss_now, tp->nonagle); + tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); return copied; do_error: if (copied) goto out; out_err: + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); return sk_stream_error(sk, flags, err); } @@ -1001,7 +1061,7 @@ void tcp_free_fastopen_req(struct tcp_sock *tp) { - if (tp->fastopen_req != NULL) { + if (tp->fastopen_req) { kfree(tp->fastopen_req); tp->fastopen_req = NULL; } @@ -1015,12 +1075,12 @@ if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) return -EOPNOTSUPP; - if (tp->fastopen_req != NULL) + if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), sk->sk_allocation); - if (unlikely(tp->fastopen_req == NULL)) + if (unlikely(!tp->fastopen_req)) return -ENOBUFS; tp->fastopen_req->data = msg; tp->fastopen_req->size = size; @@ -1033,14 +1093,12 @@ return err; } -int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t size) +int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { - struct iovec *iov; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int iovlen, flags, err, copied = 0; - int mss_now = 0, size_goal, copied_syn = 0, offset = 0; + int flags, err, copied = 0; + int mss_now = 0, size_goal, copied_syn = 0; bool sg; long timeo; @@ -1053,7 +1111,6 @@ goto out; else if (err) goto out_err; - offset = copied_syn; } timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); @@ -1064,7 +1121,8 @@ */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) goto do_error; } @@ -1082,13 +1140,11 @@ } /* This should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); /* Ok commence sending. */ - iovlen = msg->msg_iovlen; - iov = msg->msg_iov; copied = 0; err = -EPIPE; @@ -1097,153 +1153,141 @@ sg = !!(sk->sk_route_caps & NETIF_F_SG); - while (--iovlen >= 0) { - size_t seglen = iov->iov_len; - unsigned char __user *from = iov->iov_base; - - iov++; - if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */ - if (offset >= seglen) { - offset -= seglen; - continue; - } - seglen -= offset; - from += offset; - offset = 0; + while (msg_data_left(msg)) { + int copy = 0; + int max = size_goal; + + skb = tcp_write_queue_tail(sk); + if (tcp_send_head(sk)) { + if (skb->ip_summed == CHECKSUM_NONE) + max = mss_now; + copy = max - skb->len; } - while (seglen > 0) { - int copy = 0; - int max = size_goal; - - skb = tcp_write_queue_tail(sk); - if (tcp_send_head(sk)) { - if (skb->ip_summed == CHECKSUM_NONE) - max = mss_now; - copy = max - skb->len; - } - - if (copy <= 0) { + if (copy <= 0) { new_segment: - /* Allocate new segment. If the interface is SG, - * allocate skb fitting to single page. - */ - if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; + /* Allocate new segment. If the interface is SG, + * allocate skb fitting to single page. + */ + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; - skb = sk_stream_alloc_skb(sk, - select_size(sk, sg), - sk->sk_allocation); - if (!skb) - goto wait_for_memory; + skb = sk_stream_alloc_skb(sk, + select_size(sk, sg), + sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); + if (!skb) + goto wait_for_memory; - /* - * All packets are restored as if they have - * already been sent. - */ - if (tp->repair) - TCP_SKB_CB(skb)->when = tcp_time_stamp; + /* + * Check whether we can use HW checksum. + */ + if (sk->sk_route_caps & NETIF_F_ALL_CSUM) + skb->ip_summed = CHECKSUM_PARTIAL; - /* - * Check whether we can use HW checksum. - */ - if (sk->sk_route_caps & NETIF_F_ALL_CSUM) - skb->ip_summed = CHECKSUM_PARTIAL; + skb_entail(sk, skb); + copy = size_goal; + max = size_goal; - skb_entail(sk, skb); - copy = size_goal; - max = size_goal; - } - - /* Try to append data to the end of skb. */ - if (copy > seglen) - copy = seglen; - - /* Where to copy to? */ - if (skb_availroom(skb) > 0) { - /* We have some space in skb head. Superb! */ - copy = min_t(int, copy, skb_availroom(skb)); - err = skb_add_data_nocache(sk, skb, from, copy); - if (err) - goto do_fault; - } else { - bool merge = true; - int i = skb_shinfo(skb)->nr_frags; - struct page_frag *pfrag = sk_page_frag(sk); - - if (!sk_page_frag_refill(sk, pfrag)) - goto wait_for_memory; - - if (!skb_can_coalesce(skb, i, pfrag->page, - pfrag->offset)) { - if (i == MAX_SKB_FRAGS || !sg) { - tcp_mark_push(tp, skb); - goto new_segment; - } - merge = false; - } + /* All packets are restored as if they have + * already been sent. skb_mstamp isn't set to + * avoid wrong rtt estimation. + */ + if (tp->repair) + TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; + } - copy = min_t(int, copy, pfrag->size - pfrag->offset); + /* Try to append data to the end of skb. */ + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); + + /* Where to copy to? */ + if (skb_availroom(skb) > 0) { + /* We have some space in skb head. Superb! */ + copy = min_t(int, copy, skb_availroom(skb)); + err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); + if (err) + goto do_fault; + } else { + bool merge = true; + int i = skb_shinfo(skb)->nr_frags; + struct page_frag *pfrag = sk_page_frag(sk); - if (!sk_wmem_schedule(sk, copy)) - goto wait_for_memory; + if (!sk_page_frag_refill(sk, pfrag)) + goto wait_for_memory; - err = skb_copy_to_page_nocache(sk, from, skb, - pfrag->page, - pfrag->offset, - copy); - if (err) - goto do_error; - - /* Update the skb. */ - if (merge) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); - } else { - skb_fill_page_desc(skb, i, pfrag->page, - pfrag->offset, copy); - get_page(pfrag->page); + if (!skb_can_coalesce(skb, i, pfrag->page, + pfrag->offset)) { + if (i >= sysctl_max_skb_frags || !sg) { + tcp_mark_push(tp, skb); + goto new_segment; } - pfrag->offset += copy; + merge = false; } - if (!copied) - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; + copy = min_t(int, copy, pfrag->size - pfrag->offset); - tp->write_seq += copy; - TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_memory; - from += copy; - copied += copy; - if ((seglen -= copy) == 0 && iovlen == 0) - goto out; + err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, + pfrag->page, + pfrag->offset, + copy); + if (err) + goto do_error; - if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) - continue; + /* Update the skb. */ + if (merge) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + } else { + skb_fill_page_desc(skb, i, pfrag->page, + pfrag->offset, copy); + get_page(pfrag->page); + } + pfrag->offset += copy; + } + + if (!copied) + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; - if (forced_push(tp)) { - tcp_mark_push(tp, skb); - __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); - } else if (skb == tcp_send_head(sk)) - tcp_push_one(sk, mss_now); + tp->write_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; + tcp_skb_pcount_set(skb, 0); + + copied += copy; + if (!msg_data_left(msg)) { + tcp_tx_timestamp(sk, skb); + goto out; + } + + if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) continue; + if (forced_push(tp)) { + tcp_mark_push(tp, skb); + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); + } else if (skb == tcp_send_head(sk)) + tcp_push_one(sk, mss_now); + continue; + wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: - if (copied) - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + if (copied) + tcp_push(sk, flags & ~MSG_MORE, mss_now, + TCP_NAGLE_PUSH, size_goal); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) - goto do_error; + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) + goto do_error; - mss_now = tcp_send_mss(sk, &size_goal, flags); - } + mss_now = tcp_send_mss(sk, &size_goal, flags); } out: if (copied) - tcp_push(sk, flags, mss_now, tp->nonagle); + tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); out_nopush: release_sock(sk); return copied + copied_syn; @@ -1263,6 +1307,9 @@ goto out; out_err: err = sk_stream_error(sk, flags, err); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); release_sock(sk); return err; } @@ -1297,7 +1344,7 @@ if (len > 0) { if (!(flags & MSG_TRUNC)) - err = memcpy_toiovec(msg->msg_iov, &c, 1); + err = memcpy_to_msg(msg, &c, 1); len = 1; } else msg->msg_flags |= MSG_TRUNC; @@ -1325,7 +1372,7 @@ /* XXX -- need to support SO_PEEK_OFF */ skb_queue_walk(&sk->sk_write_queue, skb) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len); + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; @@ -1341,7 +1388,7 @@ * calculation of whether or not we must ACK for the sake of * a window update. */ -void tcp_cleanup_rbuf(struct sock *sk, int copied) +static void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; @@ -1417,39 +1464,6 @@ tp->ucopy.memory = 0; } -#ifdef CONFIG_NET_DMA -static void tcp_service_net_dma(struct sock *sk, bool wait) -{ - dma_cookie_t done, used; - dma_cookie_t last_issued; - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->ucopy.dma_chan) - return; - - last_issued = tp->ucopy.dma_cookie; - dma_async_issue_pending(tp->ucopy.dma_chan); - - do { - if (dma_async_is_tx_complete(tp->ucopy.dma_chan, - last_issued, &done, - &used) == DMA_SUCCESS) { - /* Safe to free early-copied skbs now */ - __skb_queue_purge(&sk->sk_async_wait_queue); - break; - } else { - struct sk_buff *skb; - while ((skb = skb_peek(&sk->sk_async_wait_queue)) && - (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_SUCCESS)) { - __skb_dequeue(&sk->sk_async_wait_queue); - kfree_skb(skb); - } - } - } while (wait); -} -#endif - static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1457,9 +1471,9 @@ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; - if (tcp_hdr(skb)->syn) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) offset--; - if (offset < skb->len || tcp_hdr(skb)->fin) { + if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; } @@ -1467,7 +1481,7 @@ * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); } return NULL; } @@ -1532,12 +1546,12 @@ if (offset + 1 != skb->len) continue; } - if (tcp_hdr(skb)->fin) { - sk_eat_skb(sk, skb, false); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { + sk_eat_skb(sk, skb); ++seq; break; } - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); if (!desc->count) break; tp->copied_seq = seq; @@ -1563,8 +1577,8 @@ * Probably, code can be easily improved even more. */ -int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int nonblock, int flags, int *addr_len) +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len) { struct tcp_sock *tp = tcp_sk(sk); int copied = 0; @@ -1575,10 +1589,16 @@ int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; - bool copied_early = false; - struct sk_buff *skb; + struct sk_buff *skb, *last; u32 urg_hole = 0; + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && + (sk->sk_state == TCP_ESTABLISHED)) + sk_busy_loop(sk, nonblock); + lock_sock(sk); err = -ENOTCONN; @@ -1614,28 +1634,6 @@ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); -#ifdef CONFIG_NET_DMA - tp->ucopy.dma_chan = NULL; - preempt_disable(); - skb = skb_peek_tail(&sk->sk_receive_queue); - { - int available = 0; - - if (skb) - available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); - if ((available < target) && - (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && - !sysctl_tcp_low_latency && - net_dma_find_channel()) { - preempt_enable_no_resched(); - tp->ucopy.pinned_list = - dma_pin_iovec_pages(msg->msg_iov, len); - } else { - preempt_enable_no_resched(); - } - } -#endif - do { u32 offset; @@ -1651,7 +1649,9 @@ /* Next get a buffer. */ + last = skb_peek_tail(&sk->sk_receive_queue); skb_queue_walk(&sk->sk_receive_queue, skb) { + last = skb; /* Now that we have two receive queues this * shouldn't happen. */ @@ -1662,11 +1662,11 @@ break; offset = *seq - TCP_SKB_CB(skb)->seq; - if (tcp_hdr(skb)->syn) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) offset--; if (offset < skb->len) goto found_ok_skb; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", @@ -1726,7 +1726,7 @@ if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { user_recv = current; tp->ucopy.task = user_recv; - tp->ucopy.iov = msg->msg_iov; + tp->ucopy.msg = msg; } tp->ucopy.len = len; @@ -1766,34 +1766,21 @@ /* __ Set realtime policy in scheduler __ */ } -#ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) { - if (tp->rcv_wnd == 0 && - !skb_queue_empty(&sk->sk_async_wait_queue)) { - tcp_service_net_dma(sk, true); - tcp_cleanup_rbuf(sk, copied); - } else - dma_async_issue_pending(tp->ucopy.dma_chan); - } -#endif if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); lock_sock(sk); - } else - sk_wait_data(sk, &timeo); - -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, false); /* Don't block */ - tp->ucopy.wakeup = 0; -#endif + } else { + sk_wait_data(sk, &timeo, last); + } if (user_recv) { int chunk; /* __ Restore normal policy in scheduler __ */ - if ((chunk = len - tp->ucopy.len) != 0) { + chunk = len - tp->ucopy.len; + if (chunk != 0) { NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); len -= chunk; copied += chunk; @@ -1804,7 +1791,8 @@ do_prequeue: tcp_prequeue_process(sk); - if ((chunk = len - tp->ucopy.len) != 0) { + chunk = len - tp->ucopy.len; + if (chunk != 0) { NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; @@ -1845,43 +1833,12 @@ } if (!(flags & MSG_TRUNC)) { -#ifdef CONFIG_NET_DMA - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan) { - tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( - tp->ucopy.dma_chan, skb, offset, - msg->msg_iov, used, - tp->ucopy.pinned_list); - - if (tp->ucopy.dma_cookie < 0) { - - pr_alert("%s: dma_cookie < 0\n", - __func__); - - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } - - dma_async_issue_pending(tp->ucopy.dma_chan); - - if ((offset + used) == skb->len) - copied_early = true; - - } else -#endif - { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } + err = skb_copy_datagram_msg(skb, offset, msg, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; } } @@ -1899,21 +1856,17 @@ if (used + offset < skb->len) continue; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. */ ++*seq; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); break; } while (len > 0); @@ -1936,16 +1889,6 @@ tp->ucopy.len = 0; } -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, true); /* Wait for queue to drain */ - tp->ucopy.dma_chan = NULL; - - if (tp->ucopy.pinned_list) { - dma_unpin_iovec_pages(tp->ucopy.pinned_list); - tp->ucopy.pinned_list = NULL; - } -#endif - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ @@ -1997,7 +1940,7 @@ /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk->sk_state = state; + sk_state_store(sk, state); #ifdef STATE_TRACE SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); @@ -2014,18 +1957,19 @@ static const unsigned char new_state[16] = { /* current state: new state: action: */ - /* (Invalid) */ TCP_CLOSE, - /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, - /* TCP_SYN_SENT */ TCP_CLOSE, - /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, - /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, - /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, - /* TCP_TIME_WAIT */ TCP_CLOSE, - /* TCP_CLOSE */ TCP_CLOSE, - /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, - /* TCP_LAST_ACK */ TCP_LAST_ACK, - /* TCP_LISTEN */ TCP_CLOSE, - /* TCP_CLOSING */ TCP_CLOSING, + [0 /* (Invalid) */] = TCP_CLOSE, + [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_SYN_SENT] = TCP_CLOSE, + [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, + [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, + [TCP_TIME_WAIT] = TCP_CLOSE, + [TCP_CLOSE] = TCP_CLOSE, + [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, + [TCP_LAST_ACK] = TCP_LAST_ACK, + [TCP_LISTEN] = TCP_CLOSE, + [TCP_CLOSING] = TCP_CLOSING, + [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int tcp_close_state(struct sock *sk) @@ -2100,8 +2044,10 @@ * reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - - tcp_hdr(skb)->fin; + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + len--; data_was_unread += len; __kfree_skb(skb); } @@ -2190,7 +2136,7 @@ /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could * keep a socket open forever with no application left this end. - * We use a 3 minute timeout (about the same as BSD) then kill + * We use a 1 minute timeout (about the same as BSD) then kill * our end. If they send after that then tough - BUT: long enough * that we won't make the old 4*rto = almost no time - whoops * reset mistake. @@ -2236,7 +2182,7 @@ * aborted (e.g., closed with unread data) before 3WHS * finishes. */ - if (req != NULL) + if (req) reqsk_fastopen_remove(sk, req, false); inet_csk_destroy_sock(sk); } @@ -2289,9 +2235,6 @@ __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); -#ifdef CONFIG_NET_DMA - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif inet->inet_dport = 0; @@ -2300,8 +2243,9 @@ sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); - tp->srtt = 0; - if ((tp->write_seq += tp->max_window + 2) == 0) + tp->srtt_us = 0; + tp->write_seq += tp->max_window + 2; + if (tp->write_seq == 0) tp->write_seq = 1; icsk->icsk_backoff = 0; tp->snd_cwnd = 2; @@ -2324,13 +2268,6 @@ } EXPORT_SYMBOL(tcp_disconnect); -void tcp_sock_destruct(struct sock *sk) -{ - inet_sock_destruct(sk); - - kfree(inet_csk(sk)->icsk_accept_queue.fastopenq); -} - static inline bool tcp_can_repair_sock(const struct sock *sk) { return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && @@ -2580,6 +2517,13 @@ icsk->icsk_syn_retries = val; break; + case TCP_SAVE_SYN: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->save_syn = val; + break; + case TCP_LINGER2: if (val < 0) tp->linger2 = -1; @@ -2631,7 +2575,7 @@ break; #endif case TCP_USER_TIMEOUT: - /* Cap the max timeout in ms TCP will retry/retrans + /* Cap the max time in ms TCP will retry or probe the window * before giving up and aborting (ETIMEDOUT) a connection. */ if (val < 0) @@ -2642,10 +2586,13 @@ case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | - TCPF_LISTEN))) - err = fastopen_init_queue(sk, val); - else + TCPF_LISTEN))) { + tcp_fastopen_init_key_once(true); + + fastopen_queue_tune(sk, val); + } else { err = -EINVAL; + } break; case TCP_TIMESTAMP: if (!tp->repair) @@ -2653,6 +2600,10 @@ else tp->tsoffset = val - tcp_time_stamp; break; + case TCP_NOTSENT_LOWAT: + tp->notsent_lowat = val; + sk->sk_write_space(sk); + break; default: err = -ENOPROTOOPT; break; @@ -2687,15 +2638,21 @@ #endif /* Return information about state of tcp endpoint in API format. */ -void tcp_get_info(const struct sock *sk, struct tcp_info *info) +void tcp_get_info(struct sock *sk, struct tcp_info *info) { - const struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; + unsigned int start; + u64 rate64; + u32 rate; memset(info, 0, sizeof(*info)); + if (sk->sk_type != SOCK_STREAM) + return; + + info->tcpi_state = sk_state_load(sk); - info->tcpi_state = sk->sk_state; info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; @@ -2723,7 +2680,7 @@ info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; - if (sk->sk_state == TCP_LISTEN) { + if (info->tcpi_state == TCP_LISTEN) { info->tcpi_unacked = sk->sk_ack_backlog; info->tcpi_sacked = sk->sk_max_ack_backlog; } else { @@ -2740,8 +2697,8 @@ info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; - info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; - info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; + info->tcpi_rtt = tp->srtt_us >> 3; + info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_snd_cwnd = tp->snd_cwnd; info->tcpi_advmss = tp->advmss; @@ -2751,6 +2708,22 @@ info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; + + rate = READ_ONCE(sk->sk_pacing_rate); + rate64 = rate != ~0U ? rate : ~0ULL; + put_unaligned(rate64, &info->tcpi_pacing_rate); + + rate = READ_ONCE(sk->sk_max_pacing_rate); + rate64 = rate != ~0U ? rate : ~0ULL; + put_unaligned(rate64, &info->tcpi_max_pacing_rate); + + do { + start = u64_stats_fetch_begin_irq(&tp->syncp); + put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked); + put_unaligned(tp->bytes_received, &info->tcpi_bytes_received); + } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); + info->tcpi_segs_out = tp->segs_out; + info->tcpi_segs_in = tp->segs_in; } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2822,6 +2795,26 @@ return -EFAULT; return 0; } + case TCP_CC_INFO: { + const struct tcp_congestion_ops *ca_ops; + union tcp_cc_info info; + size_t sz = 0; + int attr; + + if (get_user(len, optlen)) + return -EFAULT; + + ca_ops = icsk->icsk_ca_ops; + if (ca_ops && ca_ops->get_info) + sz = ca_ops->get_info(sk, ~0U, &attr, &info); + + len = min_t(unsigned int, len, sz); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &info, len)) + return -EFAULT; + return 0; + } case TCP_QUICKACK: val = !icsk->icsk_ack.pingpong; break; @@ -2866,9 +2859,53 @@ case TCP_USER_TIMEOUT: val = jiffies_to_msecs(icsk->icsk_user_timeout); break; + + case TCP_FASTOPEN: + val = icsk->icsk_accept_queue.fastopenq.max_qlen; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp + tp->tsoffset; break; + case TCP_NOTSENT_LOWAT: + val = tp->notsent_lowat; + break; + case TCP_SAVE_SYN: + val = tp->save_syn; + break; + case TCP_SAVED_SYN: { + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + if (tp->saved_syn) { + if (len < tp->saved_syn[0]) { + if (put_user(tp->saved_syn[0], optlen)) { + release_sock(sk); + return -EFAULT; + } + release_sock(sk); + return -EINVAL; + } + len = tp->saved_syn[0]; + if (put_user(len, optlen)) { + release_sock(sk); + return -EFAULT; + } + if (copy_to_user(optval, tp->saved_syn + 1, len)) { + release_sock(sk); + return -EFAULT; + } + tcp_saved_syn_free(tp); + release_sock(sk); + } else { + release_sock(sk); + len = 0; + if (put_user(len, optlen)) + return -EFAULT; + } + return 0; + } default: return -ENOPROTOOPT; } @@ -2904,341 +2941,43 @@ EXPORT_SYMBOL(compat_tcp_getsockopt); #endif -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - unsigned int sum_truesize = 0; - struct tcphdr *th; - unsigned int thlen; - unsigned int seq; - __be32 delta; - unsigned int oldlen; - unsigned int mss; - struct sk_buff *gso_skb = skb; - __sum16 newcheck; - bool ooo_okay, copy_destructor; - - if (!pskb_may_pull(skb, sizeof(*th))) - goto out; - - th = tcp_hdr(skb); - thlen = th->doff * 4; - if (thlen < sizeof(*th)) - goto out; - - if (!pskb_may_pull(skb, thlen)) - goto out; - - oldlen = (u16)~skb->len; - __skb_pull(skb, thlen); - - mss = skb_shinfo(skb)->gso_size; - if (unlikely(skb->len <= mss)) - goto out; - - if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { - /* Packet is from an untrusted source, reset gso_segs. */ - int type = skb_shinfo(skb)->gso_type; - - if (unlikely(type & - ~(SKB_GSO_TCPV4 | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_TCPV6 | - SKB_GSO_GRE | - SKB_GSO_UDP_TUNNEL | - 0) || - !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) - goto out; - - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); - - segs = NULL; - goto out; - } - - copy_destructor = gso_skb->destructor == tcp_wfree; - ooo_okay = gso_skb->ooo_okay; - /* All segments but the first should have ooo_okay cleared */ - skb->ooo_okay = 0; - - segs = skb_segment(skb, features); - if (IS_ERR(segs)) - goto out; - - /* Only first segment might have ooo_okay set */ - segs->ooo_okay = ooo_okay; - - delta = htonl(oldlen + (thlen + mss)); - - skb = segs; - th = tcp_hdr(skb); - seq = ntohl(th->seq); - - newcheck = ~csum_fold((__force __wsum)((__force u32)th->check + - (__force u32)delta)); - - do { - th->fin = th->psh = 0; - th->check = newcheck; - - if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = - csum_fold(csum_partial(skb_transport_header(skb), - thlen, skb->csum)); - - seq += mss; - if (copy_destructor) { - skb->destructor = gso_skb->destructor; - skb->sk = gso_skb->sk; - sum_truesize += skb->truesize; - } - skb = skb->next; - th = tcp_hdr(skb); - - th->seq = htonl(seq); - th->cwr = 0; - } while (skb->next); - - /* Following permits TCP Small Queues to work well with GSO : - * The callback to TCP stack will be called at the time last frag - * is freed at TX completion, and not right now when gso_skb - * is freed by GSO engine - */ - if (copy_destructor) { - swap(gso_skb->sk, skb->sk); - swap(gso_skb->destructor, skb->destructor); - sum_truesize += skb->truesize; - atomic_add(sum_truesize - gso_skb->truesize, - &skb->sk->sk_wmem_alloc); - } - - delta = htonl(oldlen + (skb->tail - skb->transport_header) + - skb->data_len); - th->check = ~csum_fold((__force __wsum)((__force u32)th->check + - (__force u32)delta)); - if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = csum_fold(csum_partial(skb_transport_header(skb), - thlen, skb->csum)); - -out: - return segs; -} -EXPORT_SYMBOL(tcp_tso_segment); - -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) -{ - struct sk_buff **pp = NULL; - struct sk_buff *p; - struct tcphdr *th; - struct tcphdr *th2; - unsigned int len; - unsigned int thlen; - __be32 flags; - unsigned int mss = 1; - unsigned int hlen; - unsigned int off; - int flush = 1; - int i; - - off = skb_gro_offset(skb); - hlen = off + sizeof(*th); - th = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - th = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!th)) - goto out; - } - - thlen = th->doff * 4; - if (thlen < sizeof(*th)) - goto out; - - hlen = off + thlen; - if (skb_gro_header_hard(skb, hlen)) { - th = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!th)) - goto out; - } - - skb_gro_pull(skb, thlen); - - len = skb_gro_len(skb); - flags = tcp_flag_word(th); - - for (; (p = *head); head = &p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - th2 = tcp_hdr(p); - - if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - - goto found; - } - - goto out_check_final; - -found: - flush = NAPI_GRO_CB(p)->flush; - flush |= (__force int)(flags & TCP_FLAG_CWR); - flush |= (__force int)((flags ^ tcp_flag_word(th2)) & - ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); - flush |= (__force int)(th->ack_seq ^ th2->ack_seq); - for (i = sizeof(*th); i < thlen; i += 4) - flush |= *(u32 *)((u8 *)th + i) ^ - *(u32 *)((u8 *)th2 + i); - - mss = skb_shinfo(p)->gso_size; - - flush |= (len - 1) >= mss; - flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); - - if (flush || skb_gro_receive(head, skb)) { - mss = 1; - goto out_check_final; - } - - p = *head; - th2 = tcp_hdr(p); - tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); - -out_check_final: - flush = len < mss; - flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | - TCP_FLAG_RST | TCP_FLAG_SYN | - TCP_FLAG_FIN)); - - if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) - pp = head; - -out: - NAPI_GRO_CB(skb)->flush |= flush; - - return pp; -} -EXPORT_SYMBOL(tcp_gro_receive); - -int tcp_gro_complete(struct sk_buff *skb) -{ - struct tcphdr *th = tcp_hdr(skb); - - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; - - skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - - if (th->cwr) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; - - return 0; -} -EXPORT_SYMBOL(tcp_gro_complete); - #ifdef CONFIG_TCP_MD5SIG -static unsigned long tcp_md5sig_users; -static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool; -static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); +static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool); +static DEFINE_MUTEX(tcp_md5sig_mutex); +static bool tcp_md5sig_pool_populated = false; -static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool) +static void __tcp_alloc_md5sig_pool(void) { int cpu; for_each_possible_cpu(cpu) { - struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu); - - if (p->md5_desc.tfm) - crypto_free_hash(p->md5_desc.tfm); - } - free_percpu(pool); -} - -void tcp_free_md5sig_pool(void) -{ - struct tcp_md5sig_pool __percpu *pool = NULL; + if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) { + struct crypto_hash *hash; - spin_lock_bh(&tcp_md5sig_pool_lock); - if (--tcp_md5sig_users == 0) { - pool = tcp_md5sig_pool; - tcp_md5sig_pool = NULL; + hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR_OR_NULL(hash)) + return; + per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash; + } } - spin_unlock_bh(&tcp_md5sig_pool_lock); - if (pool) - __tcp_free_md5sig_pool(pool); + /* before setting tcp_md5sig_pool_populated, we must commit all writes + * to memory. See smp_rmb() in tcp_get_md5sig_pool() + */ + smp_wmb(); + tcp_md5sig_pool_populated = true; } -EXPORT_SYMBOL(tcp_free_md5sig_pool); -static struct tcp_md5sig_pool __percpu * -__tcp_alloc_md5sig_pool(struct sock *sk) +bool tcp_alloc_md5sig_pool(void) { - int cpu; - struct tcp_md5sig_pool __percpu *pool; + if (unlikely(!tcp_md5sig_pool_populated)) { + mutex_lock(&tcp_md5sig_mutex); - pool = alloc_percpu(struct tcp_md5sig_pool); - if (!pool) - return NULL; - - for_each_possible_cpu(cpu) { - struct crypto_hash *hash; + if (!tcp_md5sig_pool_populated) + __tcp_alloc_md5sig_pool(); - hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR_OR_NULL(hash)) - goto out_free; - - per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash; + mutex_unlock(&tcp_md5sig_mutex); } - return pool; -out_free: - __tcp_free_md5sig_pool(pool); - return NULL; -} - -struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk) -{ - struct tcp_md5sig_pool __percpu *pool; - bool alloc = false; - -retry: - spin_lock_bh(&tcp_md5sig_pool_lock); - pool = tcp_md5sig_pool; - if (tcp_md5sig_users++ == 0) { - alloc = true; - spin_unlock_bh(&tcp_md5sig_pool_lock); - } else if (!pool) { - tcp_md5sig_users--; - spin_unlock_bh(&tcp_md5sig_pool_lock); - cpu_relax(); - goto retry; - } else - spin_unlock_bh(&tcp_md5sig_pool_lock); - - if (alloc) { - /* we cannot hold spinlock here because this may sleep. */ - struct tcp_md5sig_pool __percpu *p; - - p = __tcp_alloc_md5sig_pool(sk); - spin_lock_bh(&tcp_md5sig_pool_lock); - if (!p) { - tcp_md5sig_users--; - spin_unlock_bh(&tcp_md5sig_pool_lock); - return NULL; - } - pool = tcp_md5sig_pool; - if (pool) { - /* oops, it has already been assigned. */ - spin_unlock_bh(&tcp_md5sig_pool_lock); - __tcp_free_md5sig_pool(p); - } else { - tcp_md5sig_pool = pool = p; - spin_unlock_bh(&tcp_md5sig_pool_lock); - } - } - return pool; + return tcp_md5sig_pool_populated; } EXPORT_SYMBOL(tcp_alloc_md5sig_pool); @@ -3252,31 +2991,18 @@ */ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) { - struct tcp_md5sig_pool __percpu *p; - local_bh_disable(); - spin_lock(&tcp_md5sig_pool_lock); - p = tcp_md5sig_pool; - if (p) - tcp_md5sig_users++; - spin_unlock(&tcp_md5sig_pool_lock); - - if (p) - return this_cpu_ptr(p); - + if (tcp_md5sig_pool_populated) { + /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */ + smp_rmb(); + return this_cpu_ptr(&tcp_md5sig_pool); + } local_bh_enable(); return NULL; } EXPORT_SYMBOL(tcp_get_md5sig_pool); -void tcp_put_md5sig_pool(void) -{ - local_bh_enable(); - tcp_free_md5sig_pool(); -} -EXPORT_SYMBOL(tcp_put_md5sig_pool); - int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, const struct tcphdr *th) { @@ -3352,7 +3078,7 @@ tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); - if (req != NULL) + if (req) reqsk_fastopen_remove(sk, req, false); sk->sk_shutdown = SHUTDOWN_MASK; @@ -3382,26 +3108,27 @@ } __setup("thash_entries=", set_thash_entries); -void tcp_init_mem(struct net *net) +static void __init tcp_init_mem(void) { - unsigned long limit = nr_free_buffer_pages() / 8; + unsigned long limit = nr_free_buffer_pages() / 16; + limit = max(limit, 128UL); - net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; - net->ipv4.sysctl_tcp_mem[1] = limit; - net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; + sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ + sysctl_tcp_mem[1] = limit; /* 6.25 % */ + sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } void __init tcp_init(void) { - struct sk_buff *skb = NULL; unsigned long limit; int max_rshare, max_wshare, cnt; unsigned int i; - BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); + BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); + sock_skb_cb_check_size(sizeof(struct tcp_skb_cb)); - percpu_counter_init(&tcp_sockets_allocated, 0); - percpu_counter_init(&tcp_orphan_count, 0); + percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); + percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, @@ -3422,10 +3149,9 @@ &tcp_hashinfo.ehash_mask, 0, thash_entries ? 0 : 512 * 1024); - for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { + for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); - INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); - } + if (inet_ehash_locks_alloc(&tcp_hashinfo)) panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = @@ -3451,7 +3177,7 @@ sysctl_tcp_max_orphans = cnt / 2; sysctl_max_syn_backlog = max(128, cnt / 256); - tcp_init_mem(&init_net); + tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); max_wshare = min(4UL*1024*1024, limit); @@ -3469,8 +3195,6 @@ tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); tcp_metrics_init(); - - tcp_register_congestion_control(&tcp_reno); - + BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); }