--- zzzz-none-000/linux-3.10.107/net/core/sock.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/net/core/sock.c 2021-02-04 17:41:59.000000000 +0000 @@ -93,6 +93,7 @@ #include #include +#include #include #include #include @@ -130,6 +131,7 @@ #include #include #include +#include #include @@ -139,6 +141,8 @@ #include #endif +#include + static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); @@ -163,7 +167,7 @@ /** * sk_capable - Socket global capability test * @sk: Socket to use a capability on or through - * @cap: The global capbility to use + * @cap: The global capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in all user @@ -180,7 +184,7 @@ * @sk: Socket to use a capability on or through * @cap: The capability to use * - * Test to see if the opener of the socket had when the socke was created + * Test to see if the opener of the socket had when the socket was created * and the current process has the capability @cap over the network namespace * the socket is a member of. */ @@ -322,6 +326,8 @@ int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); +int sysctl_tstamp_allow_data __read_mostly = 1; + struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; EXPORT_SYMBOL_GPL(memalloc_socks); @@ -349,15 +355,12 @@ /* * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward - * progress of swapping. However, if SOCK_MEMALLOC is cleared while - * it has rmem allocations there is a risk that the user of the - * socket cannot make forward progress due to exceeding the rmem - * limits. By rights, sk_clear_memalloc() should only be called - * on sockets being torn down but warn and reset the accounting if - * that assumption breaks. + * progress of swapping. SOCK_MEMALLOC may be cleared while + * it has rmem allocations due to the last swapfile being deactivated + * but there is a risk that the socket is unusable due to exceeding + * the rmem limits. Reclaim the reserves and obey rmem limits again. */ - if (WARN_ON(sk->sk_forward_alloc)) - sk_mem_reclaim(sk); + sk_mem_reclaim(sk); } EXPORT_SYMBOL_GPL(sk_clear_memalloc); @@ -419,11 +422,23 @@ } } +static bool sock_needs_netstamp(const struct sock *sk) +{ + switch (sk->sk_family) { + case AF_UNSPEC: + case AF_UNIX: + return false; + default: + return true; + } +} + static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { sk->sk_flags &= ~flags; - if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP)) + if (sock_needs_netstamp(sk) && + !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); } } @@ -432,7 +447,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; - int skb_len; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; @@ -454,25 +468,18 @@ skb->dev = NULL; skb_set_owner_r(skb, sk); - /* Cache the SKB length before we tack it onto the receive - * queue. Once it is added it no longer belongs to us and - * may be freed by other threads of control pulling packets - * from the queue. - */ - skb_len = skb->len; - /* we escape from rcu protected region, make sure we dont leak * a norefcounted dst */ skb_dst_force(skb); spin_lock_irqsave(&list->lock, flags); - skb->dropcount = atomic_read(&sk->sk_drops); + sock_skb_set_dropcount(sk, skb); __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); + sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(sock_queue_rcv_skb); @@ -486,7 +493,7 @@ skb->dev = NULL; - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } @@ -519,12 +526,6 @@ } EXPORT_SYMBOL(sk_receive_skb); -void sk_reset_txq(struct sock *sk) -{ - sk_tx_queue_clear(sk); -} -EXPORT_SYMBOL(sk_reset_txq); - struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); @@ -660,6 +661,25 @@ sock_reset_flag(sk, bit); } +bool sk_mc_loop(struct sock *sk) +{ + if (dev_recursion_level()) + return false; + if (!sk) + return true; + switch (sk->sk_family) { + case AF_INET: + return inet_sk(sk)->mc_loop; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return inet6_sk(sk)->mc_loop; +#endif + } + WARN_ON(1); + return true; +} +EXPORT_SYMBOL(sk_mc_loop); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -725,7 +745,7 @@ val = min_t(u32, val, sysctl_wmem_max); set_sndbuf: sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF); + sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; @@ -761,7 +781,7 @@ * returning the value we actually used in getsockopt * is the most desirable behavior. */ - sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF); + sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); break; case SO_RCVBUFFORCE: @@ -785,7 +805,7 @@ break; case SO_NO_CHECK: - sk->sk_no_check = valbool; + sk->sk_no_check_tx = valbool; break; case SO_PRIORITY: @@ -849,24 +869,27 @@ ret = -EINVAL; break; } - sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE, - val & SOF_TIMESTAMPING_TX_HARDWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE, - val & SOF_TIMESTAMPING_TX_SOFTWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE, - val & SOF_TIMESTAMPING_RX_HARDWARE); + + if (val & SOF_TIMESTAMPING_OPT_ID && + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) { + if (sk->sk_state != TCP_ESTABLISHED) { + ret = -EINVAL; + break; + } + sk->sk_tskey = tcp_sk(sk)->snd_una; + } else { + sk->sk_tskey = 0; + } + } + sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, - val & SOF_TIMESTAMPING_SOFTWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, - val & SOF_TIMESTAMPING_SYS_HARDWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE, - val & SOF_TIMESTAMPING_RAW_HARDWARE); break; case SO_RCVLOWAT: @@ -896,6 +919,19 @@ } break; + case SO_ATTACH_BPF: + ret = -EINVAL; + if (optlen == sizeof(u32)) { + u32 ufd; + + ret = -EFAULT; + if (copy_from_user(&ufd, optval, sizeof(ufd))) + break; + + ret = sk_attach_bpf(ufd, sk); + } + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; @@ -920,8 +956,6 @@ sk->sk_mark = val; break; - /* We implement the SO_SNDLOWAT etc to - not be settable (1003.1g 5.3) */ case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; @@ -945,6 +979,30 @@ sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: + /* allow unprivileged users to decrease the value */ + if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) + ret = -EPERM; + else { + if (val < 0) + ret = -EINVAL; + else + sk->sk_ll_usec = val; + } + break; +#endif + + case SO_MAX_PACING_RATE: + sk->sk_max_pacing_rate = val; + sk->sk_pacing_rate = min(sk->sk_pacing_rate, + sk->sk_max_pacing_rate); + break; + + case SO_INCOMING_CPU: + sk->sk_incoming_cpu = val; + break; + default: ret = -ENOPROTOOPT; break; @@ -955,8 +1013,8 @@ EXPORT_SYMBOL(sock_setsockopt); -void cred_to_ucred(struct pid *pid, const struct cred *cred, - struct ucred *ucred) +static void cred_to_ucred(struct pid *pid, const struct cred *cred, + struct ucred *ucred) { ucred->pid = pid_vnr(pid); ucred->uid = ucred->gid = -1; @@ -967,7 +1025,6 @@ ucred->gid = from_kgid_munged(current_ns, cred->egid); } } -EXPORT_SYMBOL_GPL(cred_to_ucred); int sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) @@ -1046,7 +1103,7 @@ break; case SO_NO_CHECK: - v.val = sk->sk_no_check; + v.val = sk->sk_no_check_tx; break; case SO_PRIORITY: @@ -1073,21 +1130,7 @@ break; case SO_TIMESTAMPING: - v.val = 0; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) - v.val |= SOF_TIMESTAMPING_TX_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_TX_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE)) - v.val |= SOF_TIMESTAMPING_RX_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_RX_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE)) - v.val |= SOF_TIMESTAMPING_SYS_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) - v.val |= SOF_TIMESTAMPING_RAW_HARDWARE; + v.val = sk->sk_tsflags; break; case SO_RCVTIMEO: @@ -1198,11 +1241,32 @@ v.val = sock_flag(sk, SOCK_FILTER_LOCKED); break; + case SO_BPF_EXTENSIONS: + v.val = bpf_tell_extensions(); + break; + case SO_SELECT_ERR_QUEUE: v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); break; +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: + v.val = sk->sk_ll_usec; + break; +#endif + + case SO_MAX_PACING_RATE: + v.val = sk->sk_max_pacing_rate; + break; + + case SO_INCOMING_CPU: + v.val = sk->sk_incoming_cpu; + break; + default: + /* We implement the SO_SNDLOWAT etc to not be settable + * (1003.1g 7). + */ return -ENOPROTOOPT; } @@ -1328,19 +1392,7 @@ module_put(owner); } -#if IS_ENABLED(CONFIG_NET_CLS_CGROUP) -void sock_update_classid(struct sock *sk) -{ - u32 classid; - - classid = task_cls_classid(current); - if (classid != sk->sk_classid) - sk->sk_classid = classid; -} -EXPORT_SYMBOL(sock_update_classid); -#endif - -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) void sock_update_netprioidx(struct sock *sk) { if (in_interrupt()) @@ -1357,9 +1409,10 @@ * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance + * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, - struct proto *prot) + struct proto *prot, int kern) { struct sock *sk; @@ -1372,7 +1425,10 @@ */ sk->sk_prot = sk->sk_prot_creator = prot; sock_lock_init(sk); - sock_net_set(sk, get_net(net)); + sk->sk_net_refcnt = kern ? 0 : 1; + if (likely(sk->sk_net_refcnt)) + get_net(net); + sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); @@ -1383,7 +1439,7 @@ } EXPORT_SYMBOL(sk_alloc); -static void __sk_free(struct sock *sk) +void sk_destruct(struct sock *sk) { struct sk_filter *filter; @@ -1411,10 +1467,21 @@ if (sk->sk_peer_cred) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); - put_net(sock_net(sk)); + if (likely(sk->sk_net_refcnt)) + put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); } +static void __sk_free(struct sock *sk) +{ +#ifdef CONFIG_SOCK_DIAG + if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) + sock_diag_broadcast_destroy(sk); + else +#endif + sk_destruct(sk); +} + void sk_free(struct sock *sk) { /* @@ -1427,26 +1494,6 @@ } EXPORT_SYMBOL(sk_free); -/* - * Last sock_put should drop reference to sk->sk_net. It has already - * been dropped in sk_change_net. Taking reference to stopping namespace - * is not an option. - * Take reference to a socket to remove it from hash _alive_ and after that - * destroy it in the context of init_net. - */ -void sk_release_kernel(struct sock *sk) -{ - if (sk == NULL || sk->sk_socket == NULL) - return; - - sock_hold(sk); - sock_release(sk->sk_socket); - release_net(sock_net(sk)); - sock_net_set(sk, get_net(&init_net)); - sock_put(sk); -} -EXPORT_SYMBOL(sk_release_kernel); - static void sk_update_clone(const struct sock *sk, struct sock *newsk) { if (mem_cgroup_sockets_enabled && sk->sk_cgrp) @@ -1463,6 +1510,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct sock *newsk; + bool is_charged = true; newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); if (newsk != NULL) { @@ -1471,7 +1519,8 @@ sock_copy(newsk, sk); /* SANITY */ - get_net(sock_net(newsk)); + if (likely(newsk->sk_net_refcnt)) + get_net(sock_net(newsk)); sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); @@ -1486,11 +1535,7 @@ atomic_set(&newsk->sk_omem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&newsk->sk_async_wait_queue); -#endif - spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); lockdep_set_class_and_name(&newsk->sk_callback_lock, af_callback_keys + newsk->sk_family, @@ -1507,9 +1552,19 @@ filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) - sk_filter_charge(newsk, filter); + /* though it's an empty new sock, the charging may fail + * if sysctl_optmem_max was changed between creation of + * original socket and cloning + */ + is_charged = sk_filter_charge(newsk, filter); - if (unlikely(xfrm_sk_clone_policy(newsk))) { + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { + /* We need to make sure that we don't uncharge the new + * socket if we couldn't charge it in the first place + * as otherwise we uncharge the parent's filter. + */ + if (!is_charged) + RCU_INIT_POINTER(newsk->sk_filter, NULL); /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; @@ -1522,6 +1577,8 @@ newsk->sk_err = 0; newsk->sk_err_soft = 0; newsk->sk_priority = 0; + newsk->sk_incoming_cpu = raw_smp_processor_id(); + atomic64_set(&newsk->sk_cookie, 0); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) @@ -1549,7 +1606,8 @@ if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); - if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) + if (sock_needs_netstamp(sk) && + newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); } out: @@ -1559,7 +1617,9 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { - __sk_dst_set(sk, dst); + u32 max_segs = 1; + + sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; @@ -1570,9 +1630,10 @@ } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = dst->dev->gso_max_size; - sk->sk_gso_max_segs = dst->dev->gso_max_segs; + max_segs = max_t(u32, dst->dev->gso_max_segs, 1); } } + sk->sk_gso_max_segs = max_segs; } EXPORT_SYMBOL_GPL(sk_setup_caps); @@ -1607,6 +1668,52 @@ } EXPORT_SYMBOL(sock_wfree); +void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; +#ifdef CONFIG_AVM_SK_TC_INDEX +#ifdef CONFIG_NET_SCHED + skb->tc_index = sk->sk_tc_index; +#endif +#endif +#ifdef CONFIG_INET + if (unlikely(!sk_fullsock(sk))) { + skb->destructor = sock_edemux; + sock_hold(sk); + return; + } +#endif + skb->destructor = sock_wfree; + skb_set_hash_from_sk(skb, sk); + /* + * We used to take a refcount on sk, but following operation + * is enough to guarantee sk_free() wont free this sock until + * all in-flight packets are completed + */ + atomic_add(skb->truesize, &sk->sk_wmem_alloc); +} +EXPORT_SYMBOL(skb_set_owner_w); + +void skb_orphan_partial(struct sk_buff *skb) +{ + /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, + * so we do not completely orphan skb, but transfert all + * accounted bytes but one, to avoid unexpected reorders. + */ + if (skb->destructor == sock_wfree +#ifdef CONFIG_INET + || skb->destructor == tcp_wfree +#endif + ) { + atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc); + skb->truesize = 1; + } else { + skb_orphan(skb); + } +} +EXPORT_SYMBOL(skb_orphan_partial); + /* * Read buffer destructor automatically called from kfree_skb. */ @@ -1620,18 +1727,15 @@ } EXPORT_SYMBOL(sock_rfree); -void sock_edemux(struct sk_buff *skb) +/* + * Buffer destructor for skbs that are not used directly in read or write + * path, e.g. for error handler skbs. Automatically called from kfree_skb. + */ +void sock_efree(struct sk_buff *skb) { - struct sock *sk = skb->sk; - -#ifdef CONFIG_INET - if (sk->sk_state == TCP_TIME_WAIT) - inet_twsk_put(inet_twsk(sk)); - else -#endif - sock_put(sk); + sock_put(skb->sk); } -EXPORT_SYMBOL(sock_edemux); +EXPORT_SYMBOL(sock_efree); kuid_t sock_i_uid(struct sock *sk) { @@ -1673,22 +1777,6 @@ EXPORT_SYMBOL(sock_wmalloc); /* - * Allocate a skb from the socket's receive buffer. - */ -struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, - gfp_t priority) -{ - if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { - struct sk_buff *skb = alloc_skb(size, priority); - if (skb) { - skb_set_owner_r(skb, sk); - return skb; - } - } - return NULL; -} - -/* * Allocate a memory block from the socket's option memory buffer. */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) @@ -1709,16 +1797,34 @@ } EXPORT_SYMBOL(sock_kmalloc); -/* - * Free an option memory block. +/* Free an option memory block. Note, we actually want the inline + * here as this allows gcc to detect the nullify and fold away the + * condition entirely. */ -void sock_kfree_s(struct sock *sk, void *mem, int size) +static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, + const bool nullify) { - kfree(mem); + if (WARN_ON_ONCE(!mem)) + return; + if (nullify) + kzfree(mem); + else + kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); } + +void sock_kfree_s(struct sock *sk, void *mem, int size) +{ + __sock_kfree_s(sk, mem, size, false); +} EXPORT_SYMBOL(sock_kfree_s); +void sock_kzfree_s(struct sock *sk, void *mem, int size) +{ + __sock_kfree_s(sk, mem, size, true); +} +EXPORT_SYMBOL(sock_kzfree_s); + /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. I think, these locks should be removed for datagram sockets. */ @@ -1726,7 +1832,7 @@ { DEFINE_WAIT(wait); - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; @@ -1753,24 +1859,14 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, - int *errcode) + int *errcode, int max_page_order) { struct sk_buff *skb; - gfp_t gfp_mask; long timeo; int err; - int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; - - err = -EMSGSIZE; - if (npages > MAX_SKB_FRAGS) - goto failure; - - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; timeo = sock_sndtimeo(sk, noblock); - while (1) { + for (;;) { err = sock_error(sk); if (err != 0) goto failure; @@ -1779,43 +1875,10 @@ if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, gfp_mask); - if (skb) { - int i; - - /* No pages, we're done... */ - if (!data_len) - break; - - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - __skb_fill_page_desc(skb, i, - page, 0, - (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len)); - data_len -= PAGE_SIZE; - } + if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) + break; - /* Full success... */ - break; - } - err = -ENOBUFS; - goto failure; - } - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) @@ -1824,8 +1887,10 @@ goto interrupted; timeo = sock_wait_for_wmem(sk, timeo); } - - skb_set_owner_w(skb, sk); + skb = alloc_skb_with_frags(header_len, data_len, max_page_order, + errcode, sk->sk_allocation); + if (skb) + skb_set_owner_w(skb, sk); return skb; interrupted: @@ -1839,42 +1904,86 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } EXPORT_SYMBOL(sock_alloc_send_skb); +int sock_cmsg_send(struct sock *sk, struct msghdr *msg, + struct sockcm_cookie *sockc) +{ + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + switch (cmsg->cmsg_type) { + case SO_MARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->mark = *(u32 *)CMSG_DATA(cmsg); + break; + default: + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL(sock_cmsg_send); + /* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) -bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +/** + * skb_page_frag_refill - check that a page_frag contains enough room + * @sz: minimum size of the fragment we want to get + * @pfrag: pointer to page_frag + * @gfp: priority for memory allocation + * + * Note: While this allocator tries to use high order pages, there is + * no guarantee that allocations succeed. Therefore, @sz MUST be + * less or equal than PAGE_SIZE. + */ +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { - int order; - if (pfrag->page) { if (atomic_read(&pfrag->page->_count) == 1) { pfrag->offset = 0; return true; } - if (pfrag->offset < pfrag->size) + if (pfrag->offset + sz <= pfrag->size) return true; put_page(pfrag->page); } - /* We restrict high order allocations to users that can afford to wait */ - order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0; - - do { - gfp_t gfp = sk->sk_allocation; - - if (order) - gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; - pfrag->page = alloc_pages(gfp, order); + pfrag->offset = 0; + if (SKB_FRAG_PAGE_ORDER) { + /* Avoid direct reclaim but allow kswapd to wake */ + pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | __GFP_NOWARN | + __GFP_NORETRY, + SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { - pfrag->offset = 0; - pfrag->size = PAGE_SIZE << order; + pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; return true; } - } while (--order >= 0); + } + pfrag->page = alloc_page(gfp); + if (likely(pfrag->page)) { + pfrag->size = PAGE_SIZE; + return true; + } + return false; +} +EXPORT_SYMBOL(skb_page_frag_refill); + +bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ + if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) + return true; sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); @@ -1943,21 +2052,22 @@ * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on * @timeo: for how long + * @skb: last skb seen on sk_receive_queue * * Now socket state including sk->sk_err is changed only under lock, * hence we may omit checks after joining wait queue. * We check receive queue before schedule() only as optimization; * it is very likely that release_sock() added new data. */ -int sk_wait_data(struct sock *sk, long *timeo) +int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { int rc; DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); - rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); return rc; } @@ -2052,7 +2162,7 @@ EXPORT_SYMBOL(__sk_mem_schedule); /** - * __sk_reclaim - reclaim memory_allocated + * __sk_mem_reclaim - reclaim memory_allocated * @sk: socket * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) */ @@ -2146,15 +2256,14 @@ } EXPORT_SYMBOL(sock_no_getsockopt); -int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, - size_t len) +int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg); -int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, - size_t len, int flags) +int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, + int flags) { return -EOPNOTSUPP; } @@ -2208,7 +2317,7 @@ rcu_read_unlock(); } -static void sock_def_readable(struct sock *sk, int len) +static void sock_def_readable(struct sock *sk) { struct socket_wq *wq; @@ -2246,7 +2355,6 @@ static void sock_def_destruct(struct sock *sk) { - kfree(sk->sk_protinfo); } void sk_send_sigurg(struct sock *sk) @@ -2277,9 +2385,6 @@ skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&sk->sk_async_wait_queue); -#endif sk->sk_send_head = NULL; @@ -2300,7 +2405,6 @@ } else sk->sk_wq = NULL; - spin_lock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, @@ -2325,7 +2429,14 @@ sk->sk_stamp = ktime_set(-1L, 0); +#ifdef CONFIG_NET_RX_BUSY_POLL + sk->sk_napi_id = 0; + sk->sk_ll_usec = sysctl_net_busy_read; +#endif + + sk->sk_max_pacing_rate = ~0U; sk->sk_pacing_rate = ~0U; + sk->sk_incoming_cpu = -1; /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) @@ -2452,11 +2563,48 @@ * time stamping, but time stamping might have been on * already because of the other one */ - if (!(previous_flags & SK_FLAGS_TIMESTAMP)) + if (sock_needs_netstamp(sk) && + !(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } +int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, + int level, int type) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb; + int copied, err; + + err = -EAGAIN; + skb = sock_dequeue_err_skb(sk); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_msg(skb, 0, msg, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + +out_free_skb: + kfree_skb(skb); +out: + return err; +} +EXPORT_SYMBOL(sock_recv_errqueue); + /* * Get a socket option on an socket. * @@ -2487,14 +2635,14 @@ EXPORT_SYMBOL(compat_sock_common_getsockopt); #endif -int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; int addr_len = 0; int err; - err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, + err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; @@ -2666,6 +2814,40 @@ } #endif +static void req_prot_cleanup(struct request_sock_ops *rsk_prot) +{ + if (!rsk_prot) + return; + kfree(rsk_prot->slab_name); + rsk_prot->slab_name = NULL; + kmem_cache_destroy(rsk_prot->slab); + rsk_prot->slab = NULL; +} + +static int req_prot_init(const struct proto *prot) +{ + struct request_sock_ops *rsk_prot = prot->rsk_prot; + + if (!rsk_prot) + return 0; + + rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", + prot->name); + if (!rsk_prot->slab_name) + return -ENOMEM; + + rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, + rsk_prot->obj_size, 0, + prot->slab_flags, NULL); + + if (!rsk_prot->slab) { + pr_crit("%s: Can't create request sock SLAB cache!\n", + prot->name); + return -ENOMEM; + } + return 0; +} + int proto_register(struct proto *prot, int alloc_slab) { if (alloc_slab) { @@ -2679,21 +2861,8 @@ goto out; } - if (prot->rsk_prot != NULL) { - prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); - if (prot->rsk_prot->slab_name == NULL) - goto out_free_sock_slab; - - prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, - prot->rsk_prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); - - if (prot->rsk_prot->slab == NULL) { - pr_crit("%s: Can't create request sock SLAB cache!\n", - prot->name); - goto out_free_request_sock_slab_name; - } - } + if (req_prot_init(prot)) + goto out_free_request_sock_slab; if (prot->twsk_prot != NULL) { prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); @@ -2705,8 +2874,7 @@ kmem_cache_create(prot->twsk_prot->twsk_slab_name, prot->twsk_prot->twsk_obj_size, 0, - SLAB_HWCACHE_ALIGN | - prot->slab_flags, + prot->slab_flags, NULL); if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab_name; @@ -2722,14 +2890,8 @@ out_free_timewait_sock_slab_name: kfree(prot->twsk_prot->twsk_slab_name); out_free_request_sock_slab: - if (prot->rsk_prot && prot->rsk_prot->slab) { - kmem_cache_destroy(prot->rsk_prot->slab); - prot->rsk_prot->slab = NULL; - } -out_free_request_sock_slab_name: - if (prot->rsk_prot) - kfree(prot->rsk_prot->slab_name); -out_free_sock_slab: + req_prot_cleanup(prot->rsk_prot); + kmem_cache_destroy(prot->slab); prot->slab = NULL; out: @@ -2744,16 +2906,10 @@ list_del(&prot->node); mutex_unlock(&proto_list_mutex); - if (prot->slab != NULL) { - kmem_cache_destroy(prot->slab); - prot->slab = NULL; - } + kmem_cache_destroy(prot->slab); + prot->slab = NULL; - if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) { - kmem_cache_destroy(prot->rsk_prot->slab); - kfree(prot->rsk_prot->slab_name); - prot->rsk_prot->slab = NULL; - } + req_prot_cleanup(prot->rsk_prot); if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { kmem_cache_destroy(prot->twsk_prot->twsk_slab); @@ -2891,6 +3047,8 @@ static int __init proto_init(void) { + if (IS_ENABLED(CONFIG_PROC_STRIPPED)) + return 0; return register_pernet_subsys(&proto_net_ops); }