--- zzzz-none-000/linux-3.10.107/net/netlink/af_netlink.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/net/netlink/af_netlink.c 2021-02-04 17:41:59.000000000 +0000 @@ -57,7 +57,11 @@ #include #include #include +#include +#include #include +#include +#include #include #include @@ -72,20 +76,22 @@ }; /* state bits */ -#define NETLINK_CONGESTED 0x0 +#define NETLINK_S_CONGESTED 0x0 /* flags */ -#define NETLINK_KERNEL_SOCKET 0x1 -#define NETLINK_RECV_PKTINFO 0x2 -#define NETLINK_BROADCAST_SEND_ERROR 0x4 -#define NETLINK_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_KERNEL_SOCKET 0x1 +#define NETLINK_F_RECV_PKTINFO 0x2 +#define NETLINK_F_BROADCAST_SEND_ERROR 0x4 +#define NETLINK_F_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_LISTEN_ALL_NSID 0x10 +#define NETLINK_F_CAP_ACK 0x20 static inline int netlink_is_kernel(struct sock *sk) { - return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; + return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET; } -struct netlink_table *nl_table; +struct netlink_table *nl_table __read_mostly; EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); @@ -93,6 +99,14 @@ static int netlink_dump(struct sock *sk); static void netlink_skb_destructor(struct sk_buff *skb); +/* nl_table locking explained: + * Lookup and traversal are protected with an RCU read-side lock. Insertion + * and removal are protected with per bucket lock while using RCU list + * modification primitives and may run in parallel to RCU protected lookups. + * Destruction of the Netlink socket may only occur *after* nl_table_lock has + * been acquired * either during or after the socket has been removed from + * the list and after an RCU grace period. + */ DEFINE_RWLOCK(nl_table_lock); EXPORT_SYMBOL_GPL(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); @@ -101,650 +115,200 @@ static ATOMIC_NOTIFIER_HEAD(netlink_chain); -static inline u32 netlink_group_mask(u32 group) -{ - return group ? 
1 << (group - 1) : 0; -} - -static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u32 portid) -{ - return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; -} - -static void netlink_overrun(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { - if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) { - sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); - } - } - atomic_inc(&sk->sk_drops); -} - -static void netlink_rcv_wake(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (skb_queue_empty(&sk->sk_receive_queue)) - clear_bit(NETLINK_CONGESTED, &nlk->state); - if (!test_bit(NETLINK_CONGESTED, &nlk->state)) - wake_up_interruptible(&nlk->wait); -} - -#ifdef CONFIG_NETLINK_MMAP -static bool netlink_skb_is_mmaped(const struct sk_buff *skb) -{ - return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; -} - -static bool netlink_rx_is_mmaped(struct sock *sk) -{ - return nlk_sk(sk)->rx_ring.pg_vec != NULL; -} - -static bool netlink_tx_is_mmaped(struct sock *sk) -{ - return nlk_sk(sk)->tx_ring.pg_vec != NULL; -} - -static __pure struct page *pgvec_to_page(const void *addr) -{ - if (is_vmalloc_addr(addr)) - return vmalloc_to_page(addr); - else - return virt_to_page(addr); -} +static DEFINE_SPINLOCK(netlink_tap_lock); +static struct list_head netlink_tap_all __read_mostly; -static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len) -{ - unsigned int i; - - for (i = 0; i < len; i++) { - if (pg_vec[i] != NULL) { - if (is_vmalloc_addr(pg_vec[i])) - vfree(pg_vec[i]); - else - free_pages((unsigned long)pg_vec[i], order); - } - } - kfree(pg_vec); -} +static const struct rhashtable_params netlink_rhashtable_params; -static void *alloc_one_pg_vec_page(unsigned long order) +static inline u32 netlink_group_mask(u32 group) { - void *buffer; - gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | - __GFP_NOWARN | __GFP_NORETRY; - - buffer = (void *)__get_free_pages(gfp_flags, order); - if (buffer != NULL) - return buffer; - - buffer = vzalloc((1 << order) * PAGE_SIZE); - if (buffer != NULL) - return buffer; - - gfp_flags &= ~__GFP_NORETRY; - return (void *)__get_free_pages(gfp_flags, order); + return group ? 1 << (group - 1) : 0; } -static void **alloc_pg_vec(struct netlink_sock *nlk, - struct nl_mmap_req *req, unsigned int order) +static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, + gfp_t gfp_mask) { - unsigned int block_nr = req->nm_block_nr; - unsigned int i; - void **pg_vec, *ptr; + unsigned int len = skb_end_offset(skb); + struct sk_buff *new; - pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL); - if (pg_vec == NULL) + new = alloc_skb(len, gfp_mask); + if (new == NULL) return NULL; - for (i = 0; i < block_nr; i++) { - pg_vec[i] = ptr = alloc_one_pg_vec_page(order); - if (pg_vec[i] == NULL) - goto err1; - } - - return pg_vec; -err1: - free_pg_vec(pg_vec, order, block_nr); - return NULL; -} - - -static void -__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec, - unsigned int order) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct sk_buff_head *queue; - struct netlink_ring *ring; - - queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; - ring = tx_ring ? 
&nlk->tx_ring : &nlk->rx_ring; - - spin_lock_bh(&queue->lock); + NETLINK_CB(new).portid = NETLINK_CB(skb).portid; + NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; + NETLINK_CB(new).creds = NETLINK_CB(skb).creds; - ring->frame_max = req->nm_frame_nr - 1; - ring->head = 0; - ring->frame_size = req->nm_frame_size; - ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; - - swap(ring->pg_vec_len, req->nm_block_nr); - swap(ring->pg_vec_order, order); - swap(ring->pg_vec, pg_vec); - - __skb_queue_purge(queue); - spin_unlock_bh(&queue->lock); - - WARN_ON(atomic_read(&nlk->mapped)); - - if (pg_vec) - free_pg_vec(pg_vec, order, req->nm_block_nr); + memcpy(skb_put(new, len), skb->data, len); + return new; } -static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, - bool tx_ring) +int netlink_add_tap(struct netlink_tap *nt) { - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - void **pg_vec = NULL; - unsigned int order = 0; - - ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; - - if (atomic_read(&nlk->mapped)) - return -EBUSY; - if (atomic_read(&ring->pending)) - return -EBUSY; - - if (req->nm_block_nr) { - if (ring->pg_vec != NULL) - return -EBUSY; - - if ((int)req->nm_block_size <= 0) - return -EINVAL; - if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE)) - return -EINVAL; - if (req->nm_frame_size < NL_MMAP_HDRLEN) - return -EINVAL; - if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT)) - return -EINVAL; - - ring->frames_per_block = req->nm_block_size / - req->nm_frame_size; - if (ring->frames_per_block == 0) - return -EINVAL; - if (ring->frames_per_block * req->nm_block_nr != - req->nm_frame_nr) - return -EINVAL; - - order = get_order(req->nm_block_size); - pg_vec = alloc_pg_vec(nlk, req, order); - if (pg_vec == NULL) - return -ENOMEM; - } else { - if (req->nm_frame_nr) - return -EINVAL; - } - - mutex_lock(&nlk->pg_vec_lock); - if (atomic_read(&nlk->mapped) == 0) { - __netlink_set_ring(sk, req, tx_ring, pg_vec, order); - mutex_unlock(&nlk->pg_vec_lock); - return 0; - } - - mutex_unlock(&nlk->pg_vec_lock); - - if (pg_vec) - free_pg_vec(pg_vec, order, req->nm_block_nr); - - return -EBUSY; -} - -static void netlink_mm_open(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct socket *sock = file->private_data; - struct sock *sk = sock->sk; + if (unlikely(nt->dev->type != ARPHRD_NETLINK)) + return -EINVAL; - if (sk) - atomic_inc(&nlk_sk(sk)->mapped); -} + spin_lock(&netlink_tap_lock); + list_add_rcu(&nt->list, &netlink_tap_all); + spin_unlock(&netlink_tap_lock); -static void netlink_mm_close(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct socket *sock = file->private_data; - struct sock *sk = sock->sk; + __module_get(nt->module); - if (sk) - atomic_dec(&nlk_sk(sk)->mapped); + return 0; } +EXPORT_SYMBOL_GPL(netlink_add_tap); -static const struct vm_operations_struct netlink_mmap_ops = { - .open = netlink_mm_open, - .close = netlink_mm_close, -}; - -static int netlink_mmap(struct file *file, struct socket *sock, - struct vm_area_struct *vma) +static int __netlink_remove_tap(struct netlink_tap *nt) { - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - unsigned long start, size, expected; - unsigned int i; - int err = -EINVAL; + bool found = false; + struct netlink_tap *tmp; - if (vma->vm_pgoff) - return -EINVAL; + spin_lock(&netlink_tap_lock); - mutex_lock(&nlk->pg_vec_lock); - - expected = 0; - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { - if 
(ring->pg_vec == NULL) - continue; - expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE; - } - - if (expected == 0) - goto out; - - size = vma->vm_end - vma->vm_start; - if (size != expected) - goto out; - - start = vma->vm_start; - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { - if (ring->pg_vec == NULL) - continue; - - for (i = 0; i < ring->pg_vec_len; i++) { - struct page *page; - void *kaddr = ring->pg_vec[i]; - unsigned int pg_num; - - for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) { - page = pgvec_to_page(kaddr); - err = vm_insert_page(vma, start, page); - if (err < 0) - goto out; - start += PAGE_SIZE; - kaddr += PAGE_SIZE; - } + list_for_each_entry(tmp, &netlink_tap_all, list) { + if (nt == tmp) { + list_del_rcu(&nt->list); + found = true; + goto out; } } - atomic_inc(&nlk->mapped); - vma->vm_ops = &netlink_mmap_ops; - err = 0; + pr_warn("__netlink_remove_tap: %p not found\n", nt); out: - mutex_unlock(&nlk->pg_vec_lock); - return err; -} - -static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len) -{ -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 - struct page *p_start, *p_end; + spin_unlock(&netlink_tap_lock); - /* First page is flushed through netlink_{get,set}_status */ - p_start = pgvec_to_page(hdr + PAGE_SIZE); - p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1); - while (p_start <= p_end) { - flush_dcache_page(p_start); - p_start++; - } -#endif -} + if (found) + module_put(nt->module); -static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr) -{ - smp_rmb(); - flush_dcache_page(pgvec_to_page(hdr)); - return hdr->nm_status; + return found ? 0 : -ENODEV; } -static void netlink_set_status(struct nl_mmap_hdr *hdr, - enum nl_mmap_status status) +int netlink_remove_tap(struct netlink_tap *nt) { - smp_mb(); - hdr->nm_status = status; - flush_dcache_page(pgvec_to_page(hdr)); -} - -static struct nl_mmap_hdr * -__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos) -{ - unsigned int pg_vec_pos, frame_off; - - pg_vec_pos = pos / ring->frames_per_block; - frame_off = pos % ring->frames_per_block; - - return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size); -} - -static struct nl_mmap_hdr * -netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos, - enum nl_mmap_status status) -{ - struct nl_mmap_hdr *hdr; - - hdr = __netlink_lookup_frame(ring, pos); - if (netlink_get_status(hdr) != status) - return NULL; - - return hdr; -} - -static struct nl_mmap_hdr * -netlink_current_frame(const struct netlink_ring *ring, - enum nl_mmap_status status) -{ - return netlink_lookup_frame(ring, ring->head, status); -} + int ret; -static struct nl_mmap_hdr * -netlink_previous_frame(const struct netlink_ring *ring, - enum nl_mmap_status status) -{ - unsigned int prev; + ret = __netlink_remove_tap(nt); + synchronize_net(); - prev = ring->head ? ring->head - 1 : ring->frame_max; - return netlink_lookup_frame(ring, prev, status); + return ret; } +EXPORT_SYMBOL_GPL(netlink_remove_tap); -static void netlink_increment_head(struct netlink_ring *ring) +static bool netlink_filter_tap(const struct sk_buff *skb) { - ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; -} + struct sock *sk = skb->sk; -static void netlink_forward_ring(struct netlink_ring *ring) -{ - unsigned int head = ring->head, pos = head; - const struct nl_mmap_hdr *hdr; + /* We take the more conservative approach and + * whitelist socket protocols that may pass. 
+ */ + switch (sk->sk_protocol) { + case NETLINK_ROUTE: + case NETLINK_USERSOCK: + case NETLINK_SOCK_DIAG: + case NETLINK_NFLOG: + case NETLINK_XFRM: + case NETLINK_FIB_LOOKUP: + case NETLINK_NETFILTER: + case NETLINK_GENERIC: + return true; + } - do { - hdr = __netlink_lookup_frame(ring, pos); - if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) - break; - if (hdr->nm_status != NL_MMAP_STATUS_SKIP) - break; - netlink_increment_head(ring); - } while (ring->head != head); + return false; } -static bool netlink_dump_space(struct netlink_sock *nlk) +static int __netlink_deliver_tap_skb(struct sk_buff *skb, + struct net_device *dev) { - struct netlink_ring *ring = &nlk->rx_ring; - struct nl_mmap_hdr *hdr; - unsigned int n; + struct sk_buff *nskb; + struct sock *sk = skb->sk; + int ret = -ENOMEM; - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) - return false; + dev_hold(dev); - n = ring->head + ring->frame_max / 2; - if (n > ring->frame_max) - n -= ring->frame_max; - - hdr = __netlink_lookup_frame(ring, n); + if (is_vmalloc_addr(skb->head)) + nskb = netlink_to_full_skb(skb, GFP_ATOMIC); + else + nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + nskb->dev = dev; + nskb->protocol = htons((u16) sk->sk_protocol); + nskb->pkt_type = netlink_is_kernel(sk) ? + PACKET_KERNEL : PACKET_USER; + skb_reset_network_header(nskb); + ret = dev_queue_xmit(nskb); + if (unlikely(ret > 0)) + ret = net_xmit_errno(ret); + } - return hdr->nm_status == NL_MMAP_STATUS_UNUSED; + dev_put(dev); + return ret; } -static unsigned int netlink_poll(struct file *file, struct socket *sock, - poll_table *wait) +static void __netlink_deliver_tap(struct sk_buff *skb) { - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - unsigned int mask; - int err; - - if (nlk->rx_ring.pg_vec != NULL) { - /* Memory mapped sockets don't call recvmsg(), so flow control - * for dumps is performed here. A dump is allowed to continue - * if at least half the ring is unused. 
- */ - while (nlk->cb != NULL && netlink_dump_space(nlk)) { - err = netlink_dump(sk); - if (err < 0) { - sk->sk_err = -err; - sk->sk_error_report(sk); - break; - } - } - netlink_rcv_wake(sk); - } - - mask = datagram_poll(file, sock, wait); + int ret; + struct netlink_tap *tmp; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (nlk->rx_ring.pg_vec) { - netlink_forward_ring(&nlk->rx_ring); - if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED)) - mask |= POLLIN | POLLRDNORM; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); + if (!netlink_filter_tap(skb)) + return; - spin_lock_bh(&sk->sk_write_queue.lock); - if (nlk->tx_ring.pg_vec) { - if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED)) - mask |= POLLOUT | POLLWRNORM; + list_for_each_entry_rcu(tmp, &netlink_tap_all, list) { + ret = __netlink_deliver_tap_skb(skb, tmp->dev); + if (unlikely(ret)) + break; } - spin_unlock_bh(&sk->sk_write_queue.lock); - - return mask; } -static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb) -{ - return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN); -} - -static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk, - struct netlink_ring *ring, - struct nl_mmap_hdr *hdr) -{ - unsigned int size; - void *data; - - size = ring->frame_size - NL_MMAP_HDRLEN; - data = (void *)hdr + NL_MMAP_HDRLEN; - - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - skb->len = 0; - - skb->destructor = netlink_skb_destructor; - NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED; - NETLINK_CB(skb).sk = sk; -} - -static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg, - u32 dst_portid, u32 dst_group, - struct sock_iocb *siocb) +static void netlink_deliver_tap(struct sk_buff *skb) { - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - struct nl_mmap_hdr *hdr; - struct sk_buff *skb; - unsigned int maxlen; - int err = 0, len = 0; - - mutex_lock(&nlk->pg_vec_lock); - - ring = &nlk->tx_ring; - maxlen = ring->frame_size - NL_MMAP_HDRLEN; - - do { - unsigned int nm_len; - - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID); - if (hdr == NULL) { - if (!(msg->msg_flags & MSG_DONTWAIT) && - atomic_read(&nlk->tx_ring.pending)) - schedule(); - continue; - } - - nm_len = ACCESS_ONCE(hdr->nm_len); - if (nm_len > maxlen) { - err = -EINVAL; - goto out; - } - - netlink_frame_flush_dcache(hdr, nm_len); - - skb = alloc_skb(nm_len, GFP_KERNEL); - if (skb == NULL) { - err = -ENOBUFS; - goto out; - } - __skb_put(skb, nm_len); - memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len); - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); - - netlink_increment_head(ring); - - NETLINK_CB(skb).portid = nlk->portid; - NETLINK_CB(skb).dst_group = dst_group; - NETLINK_CB(skb).creds = siocb->scm->creds; - - err = security_netlink_send(sk, skb); - if (err) { - kfree_skb(skb); - goto out; - } - - if (unlikely(dst_group)) { - atomic_inc(&skb->users); - netlink_broadcast(sk, skb, dst_portid, dst_group, - GFP_KERNEL); - } - err = netlink_unicast(sk, skb, dst_portid, - msg->msg_flags & MSG_DONTWAIT); - if (err < 0) - goto out; - len += err; + rcu_read_lock(); - } while (hdr != NULL || - (!(msg->msg_flags & MSG_DONTWAIT) && - atomic_read(&nlk->tx_ring.pending))); + if (unlikely(!list_empty(&netlink_tap_all))) + __netlink_deliver_tap(skb); - if (len > 0) - err = len; -out: - mutex_unlock(&nlk->pg_vec_lock); - return err; + rcu_read_unlock(); } -static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb) +static void 
netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, + struct sk_buff *skb) { - struct nl_mmap_hdr *hdr; - - hdr = netlink_mmap_hdr(skb); - hdr->nm_len = skb->len; - hdr->nm_group = NETLINK_CB(skb).dst_group; - hdr->nm_pid = NETLINK_CB(skb).creds.pid; - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); - netlink_frame_flush_dcache(hdr, hdr->nm_len); - netlink_set_status(hdr, NL_MMAP_STATUS_VALID); - - NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED; - kfree_skb(skb); + if (!(netlink_is_kernel(dst) && netlink_is_kernel(src))) + netlink_deliver_tap(skb); } -static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) +static void netlink_overrun(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring = &nlk->rx_ring; - struct nl_mmap_hdr *hdr; - spin_lock_bh(&sk->sk_receive_queue.lock); - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) { - spin_unlock_bh(&sk->sk_receive_queue.lock); - kfree_skb(skb); - netlink_overrun(sk); - return; + if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)) { + if (!test_and_set_bit(NETLINK_S_CONGESTED, + &nlk_sk(sk)->state)) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + } } - netlink_increment_head(ring); - __skb_queue_tail(&sk->sk_receive_queue, skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); - - hdr->nm_len = skb->len; - hdr->nm_group = NETLINK_CB(skb).dst_group; - hdr->nm_pid = NETLINK_CB(skb).creds.pid; - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); - netlink_set_status(hdr, NL_MMAP_STATUS_COPY); + atomic_inc(&sk->sk_drops); } -#else /* CONFIG_NETLINK_MMAP */ -#define netlink_skb_is_mmaped(skb) false -#define netlink_rx_is_mmaped(sk) false -#define netlink_tx_is_mmaped(sk) false -#define netlink_mmap sock_no_mmap -#define netlink_poll datagram_poll -#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb) 0 -#endif /* CONFIG_NETLINK_MMAP */ - -static void netlink_destroy_callback(struct netlink_callback *cb) +static void netlink_rcv_wake(struct sock *sk) { - kfree_skb(cb->skb); - kfree(cb); -} + struct netlink_sock *nlk = nlk_sk(sk); -static void netlink_consume_callback(struct netlink_callback *cb) -{ - consume_skb(cb->skb); - kfree(cb); + if (skb_queue_empty(&sk->sk_receive_queue)) + clear_bit(NETLINK_S_CONGESTED, &nlk->state); + if (!test_bit(NETLINK_S_CONGESTED, &nlk->state)) + wake_up_interruptible(&nlk->wait); } static void netlink_skb_destructor(struct sk_buff *skb) { -#ifdef CONFIG_NETLINK_MMAP - struct nl_mmap_hdr *hdr; - struct netlink_ring *ring; - struct sock *sk; - - /* If a packet from the kernel to userspace was freed because of an - * error without being delivered to userspace, the kernel must reset - * the status. In the direction userspace to kernel, the status is - * always reset here after the packet was processed and freed. 
- */ - if (netlink_skb_is_mmaped(skb)) { - hdr = netlink_mmap_hdr(skb); - sk = NETLINK_CB(skb).sk; - - if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) { - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); - ring = &nlk_sk(sk)->tx_ring; - } else { - if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { - hdr->nm_len = 0; - netlink_set_status(hdr, NL_MMAP_STATUS_VALID); - } - ring = &nlk_sk(sk)->rx_ring; - } - - WARN_ON(atomic_read(&ring->pending) == 0); - atomic_dec(&ring->pending); - sock_put(sk); + if (is_vmalloc_addr(skb->head)) { + if (!skb->cloned || + !atomic_dec_return(&(skb_shinfo(skb)->dataref))) + vfree(skb->head); skb->head = NULL; } -#endif if (skb->sk != NULL) sock_rfree(skb); } @@ -762,27 +326,14 @@ { struct netlink_sock *nlk = nlk_sk(sk); - if (nlk->cb) { - if (nlk->cb->done) - nlk->cb->done(nlk->cb); - - module_put(nlk->cb->module); - netlink_destroy_callback(nlk->cb); + if (nlk->cb_running) { + if (nlk->cb.done) + nlk->cb.done(&nlk->cb); + module_put(nlk->cb.module); + kfree_skb(nlk->cb.skb); } skb_queue_purge(&sk->sk_receive_queue); -#ifdef CONFIG_NETLINK_MMAP - if (1) { - struct nl_mmap_req req; - - memset(&req, 0, sizeof(req)); - if (nlk->rx_ring.pg_vec) - __netlink_set_ring(sk, &req, false, NULL, 0); - memset(&req, 0, sizeof(req)); - if (nlk->tx_ring.pg_vec) - __netlink_set_ring(sk, &req, true, NULL, 0); - } -#endif /* CONFIG_NETLINK_MMAP */ if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); @@ -794,6 +345,14 @@ WARN_ON(nlk_sk(sk)->groups); } +static void netlink_sock_destruct_work(struct work_struct *work) +{ + struct netlink_sock *nlk = container_of(work, struct netlink_sock, + work); + + sk_free(&nlk->sk); +} + /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves @@ -849,98 +408,66 @@ wake_up(&nl_table_wait); } -static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) +struct netlink_compare_arg { - struct nl_portid_hash *hash = &nl_table[protocol].hash; - struct hlist_head *head; - struct sock *sk; + possible_net_t pnet; + u32 portid; +}; - read_lock(&nl_table_lock); - head = nl_portid_hashfn(hash, portid); - sk_for_each(sk, head) { - if (net_eq(sock_net(sk), net) && (nlk_sk(sk)->portid == portid)) { - sock_hold(sk); - goto found; - } - } - sk = NULL; -found: - read_unlock(&nl_table_lock); - return sk; -} +/* Doing sizeof directly may yield 4 extra bytes on 64-bit. 
*/ +#define netlink_compare_arg_len \ + (offsetof(struct netlink_compare_arg, portid) + sizeof(u32)) -static struct hlist_head *nl_portid_hash_zalloc(size_t size) +static inline int netlink_compare(struct rhashtable_compare_arg *arg, + const void *ptr) { - if (size <= PAGE_SIZE) - return kzalloc(size, GFP_ATOMIC); - else - return (struct hlist_head *) - __get_free_pages(GFP_ATOMIC | __GFP_ZERO, - get_order(size)); + const struct netlink_compare_arg *x = arg->key; + const struct netlink_sock *nlk = ptr; + + return nlk->portid != x->portid || + !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet)); } -static void nl_portid_hash_free(struct hlist_head *table, size_t size) +static void netlink_compare_arg_init(struct netlink_compare_arg *arg, + struct net *net, u32 portid) { - if (size <= PAGE_SIZE) - kfree(table); - else - free_pages((unsigned long)table, get_order(size)); + memset(arg, 0, sizeof(*arg)); + write_pnet(&arg->pnet, net); + arg->portid = portid; } -static int nl_portid_hash_rehash(struct nl_portid_hash *hash, int grow) +static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid, + struct net *net) { - unsigned int omask, mask, shift; - size_t osize, size; - struct hlist_head *otable, *table; - int i; - - omask = mask = hash->mask; - osize = size = (mask + 1) * sizeof(*table); - shift = hash->shift; + struct netlink_compare_arg arg; - if (grow) { - if (++shift > hash->max_shift) - return 0; - mask = mask * 2 + 1; - size *= 2; - } - - table = nl_portid_hash_zalloc(size); - if (!table) - return 0; - - otable = hash->table; - hash->table = table; - hash->mask = mask; - hash->shift = shift; - get_random_bytes(&hash->rnd, sizeof(hash->rnd)); - - for (i = 0; i <= omask; i++) { - struct sock *sk; - struct hlist_node *tmp; + netlink_compare_arg_init(&arg, net, portid); + return rhashtable_lookup_fast(&table->hash, &arg, + netlink_rhashtable_params); +} - sk_for_each_safe(sk, tmp, &otable[i]) - __sk_add_node(sk, nl_portid_hashfn(hash, nlk_sk(sk)->portid)); - } +static int __netlink_insert(struct netlink_table *table, struct sock *sk) +{ + struct netlink_compare_arg arg; - nl_portid_hash_free(otable, osize); - hash->rehash_time = jiffies + 10 * 60 * HZ; - return 1; + netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid); + return rhashtable_lookup_insert_key(&table->hash, &arg, + &nlk_sk(sk)->node, + netlink_rhashtable_params); } -static inline int nl_portid_hash_dilute(struct nl_portid_hash *hash, int len) +static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) { - int avg = hash->entries >> hash->shift; - - if (unlikely(avg > 1) && nl_portid_hash_rehash(hash, 1)) - return 1; + struct netlink_table *table = &nl_table[protocol]; + struct sock *sk; - if (unlikely(len > avg) && time_after(jiffies, hash->rehash_time)) { - nl_portid_hash_rehash(hash, 0); - return 1; - } + rcu_read_lock(); + sk = __netlink_lookup(table, portid, net); + if (sk) + sock_hold(sk); + rcu_read_unlock(); - return 0; + return sk; } static const struct proto_ops netlink_ops; @@ -969,52 +496,65 @@ * makes sure updates are visible before bind or setsockopt return. 
*/ } -static int netlink_insert(struct sock *sk, struct net *net, u32 portid) +static int netlink_insert(struct sock *sk, u32 portid) { - struct nl_portid_hash *hash = &nl_table[sk->sk_protocol].hash; - struct hlist_head *head; - int err = -EADDRINUSE; - struct sock *osk; - int len; + struct netlink_table *table = &nl_table[sk->sk_protocol]; + int err; - netlink_table_grab(); - head = nl_portid_hashfn(hash, portid); - len = 0; - sk_for_each(osk, head) { - if (net_eq(sock_net(osk), net) && (nlk_sk(osk)->portid == portid)) - break; - len++; - } - if (osk) - goto err; + lock_sock(sk); - err = -EBUSY; - if (nlk_sk(sk)->portid) + err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; + if (nlk_sk(sk)->bound) goto err; err = -ENOMEM; - if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX)) + if (BITS_PER_LONG > 32 && + unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX)) goto err; - if (len && nl_portid_hash_dilute(hash, len)) - head = nl_portid_hashfn(hash, portid); - hash->entries++; nlk_sk(sk)->portid = portid; - sk_add_node(sk, head); - err = 0; + sock_hold(sk); + + err = __netlink_insert(table, sk); + if (err) { + /* In case the hashtable backend returns with -EBUSY + * from here, it must not escape to the caller. + */ + if (unlikely(err == -EBUSY)) + err = -EOVERFLOW; + if (err == -EEXIST) + err = -EADDRINUSE; + sock_put(sk); + goto err; + } + + /* We need to ensure that the socket is hashed and visible. */ + smp_wmb(); + nlk_sk(sk)->bound = portid; err: - netlink_table_ungrab(); + release_sock(sk); return err; } static void netlink_remove(struct sock *sk) { + struct netlink_table *table; + + table = &nl_table[sk->sk_protocol]; + if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node, + netlink_rhashtable_params)) { + WARN_ON(atomic_read(&sk->sk_refcnt) == 1); + __sock_put(sk); + } + netlink_table_grab(); - if (sk_del_node_init(sk)) - nl_table[sk->sk_protocol].hash.entries--; - if (nlk_sk(sk)->subscriptions) + if (nlk_sk(sk)->subscriptions) { __sk_del_bind_node(sk); + netlink_update_listeners(sk); + } + if (sk->sk_protocol == NETLINK_GENERIC) + atomic_inc(&genl_sk_destructing_cnt); netlink_table_ungrab(); } @@ -1025,14 +565,15 @@ }; static int __netlink_create(struct net *net, struct socket *sock, - struct mutex *cb_mutex, int protocol) + struct mutex *cb_mutex, int protocol, + int kern) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; - sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); + sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern); if (!sk) return -ENOMEM; @@ -1046,9 +587,6 @@ mutex_init(nlk->cb_mutex); } init_waitqueue_head(&nlk->wait); -#ifdef CONFIG_NETLINK_MMAP - mutex_init(&nlk->pg_vec_lock); -#endif sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; @@ -1061,7 +599,8 @@ struct module *module = NULL; struct mutex *cb_mutex; struct netlink_sock *nlk; - void (*bind)(int group); + int (*bind)(struct net *net, int group); + void (*unbind)(struct net *net, int group); int err = 0; sock->state = SS_UNCONNECTED; @@ -1087,12 +626,13 @@ err = -EPROTONOSUPPORT; cb_mutex = nl_table[protocol].cb_mutex; bind = nl_table[protocol].bind; + unbind = nl_table[protocol].unbind; netlink_unlock_table(); if (err < 0) goto out; - err = __netlink_create(net, sock, cb_mutex, protocol); + err = __netlink_create(net, sock, cb_mutex, protocol, kern); if (err < 0) goto out_module; @@ -1103,6 +643,7 @@ nlk = nlk_sk(sock->sk); nlk->module = module; nlk->netlink_bind = bind; + nlk->netlink_unbind = unbind; out: return err; @@ -1111,6 
+652,23 @@ goto out; } +static void deferred_put_nlk_sk(struct rcu_head *head) +{ + struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu); + struct sock *sk = &nlk->sk; + + if (!atomic_dec_and_test(&sk->sk_refcnt)) + return; + + if (nlk->cb_running && nlk->cb.done) { + INIT_WORK(&nlk->work, netlink_sock_destruct_work); + schedule_work(&nlk->work); + return; + } + + sk_free(sk); +} + static int netlink_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -1128,12 +686,26 @@ * will be purged. */ + /* must not acquire netlink_table_lock in any way again before unbind + * and notifying genetlink is done as otherwise it might deadlock + */ + if (nlk->netlink_unbind) { + int i; + + for (i = 0; i < nlk->ngroups; i++) + if (test_bit(i, nlk->groups)) + nlk->netlink_unbind(sock_net(sk), i + 1); + } + if (sk->sk_protocol == NETLINK_GENERIC && + atomic_dec_return(&genl_sk_destructing_cnt) == 0) + wake_up(&genl_sk_destructing_waitq); + sock->sk = NULL; wake_up_interruptible_all(&nlk->wait); skb_queue_purge(&sk->sk_write_queue); - if (nlk->portid) { + if (nlk->portid && nlk->bound) { struct netlink_notify n = { .net = sock_net(sk), .protocol = sk->sk_protocol, @@ -1145,8 +717,8 @@ module_put(nlk->module); - netlink_table_grab(); if (netlink_is_kernel(sk)) { + netlink_table_grab(); BUG_ON(nl_table[sk->sk_protocol].registered == 0); if (--nl_table[sk->sk_protocol].registered == 0) { struct listeners *old; @@ -1156,13 +728,12 @@ kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; + nl_table[sk->sk_protocol].unbind = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } - } else if (nlk->subscriptions) { - netlink_update_listeners(sk); + netlink_table_ungrab(); } - netlink_table_ungrab(); kfree(nlk->groups); nlk->groups = NULL; @@ -1170,7 +741,7 @@ local_bh_disable(); sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); local_bh_enable(); - sock_put(sk); + call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } @@ -1178,32 +749,29 @@ { struct sock *sk = sock->sk; struct net *net = sock_net(sk); - struct nl_portid_hash *hash = &nl_table[sk->sk_protocol].hash; - struct hlist_head *head; - struct sock *osk; + struct netlink_table *table = &nl_table[sk->sk_protocol]; s32 portid = task_tgid_vnr(current); int err; - static s32 rover = -4097; + s32 rover = -4096; + bool ok; retry: cond_resched(); - netlink_table_grab(); - head = nl_portid_hashfn(hash, portid); - sk_for_each(osk, head) { - if (!net_eq(sock_net(osk), net)) - continue; - if (nlk_sk(osk)->portid == portid) { - /* Bind collision, search negative portid values. */ - portid = rover--; - if (rover > -4097) - rover = -4097; - netlink_table_ungrab(); - goto retry; - } + rcu_read_lock(); + ok = !__netlink_lookup(table, portid, net); + rcu_read_unlock(); + if (!ok) { + /* Bind collision, search negative portid values. 
*/ + if (rover == -4096) + /* rover will be in range [S32_MIN, -4097] */ + rover = S32_MIN + prandom_u32_max(-4096 - S32_MIN); + else if (rover >= -4096) + rover = -4097; + portid = rover--; + goto retry; } - netlink_table_ungrab(); - err = netlink_insert(sk, net, portid); + err = netlink_insert(sk, portid); if (err == -EADDRINUSE) goto retry; @@ -1332,6 +900,20 @@ return err; } +static void netlink_undo_bind(int group, long unsigned int groups, + struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + int undo; + + if (!nlk->netlink_unbind) + return; + + for (undo = 0; undo < group; undo++) + if (test_bit(undo, &groups)) + nlk->netlink_unbind(sock_net(sk), undo + 1); +} + static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { @@ -1340,6 +922,8 @@ struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; + long unsigned int groups = nladdr->nl_groups; + bool bound; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; @@ -1348,7 +932,7 @@ return -EINVAL; /* Only superuser is allowed to listen multicasts */ - if (nladdr->nl_groups) { + if (groups) { if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); @@ -1356,37 +940,53 @@ return err; } - if (nlk->portid) { + bound = nlk->bound; + if (bound) { + /* Ensure nlk->portid is up-to-date. */ + smp_rmb(); + if (nladdr->nl_pid != nlk->portid) return -EINVAL; - } else { + } + + if (nlk->netlink_bind && groups) { + int group; + + for (group = 0; group < nlk->ngroups; group++) { + if (!test_bit(group, &groups)) + continue; + err = nlk->netlink_bind(net, group + 1); + if (!err) + continue; + netlink_undo_bind(group, groups, sk); + return err; + } + } + + /* No need for barriers here as we return to user-space without + * using any of the bound attributes. + */ + if (!bound) { err = nladdr->nl_pid ? - netlink_insert(sk, net, nladdr->nl_pid) : + netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); - if (err) + if (err) { + netlink_undo_bind(nlk->ngroups, groups, sk); return err; + } } - if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) + if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) return 0; netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + - hweight32(nladdr->nl_groups) - + hweight32(groups) - hweight32(nlk->groups[0])); - nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; + nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups; netlink_update_listeners(sk); netlink_table_ungrab(); - if (nlk->netlink_bind && nlk->groups[0]) { - int i; - - for (i=0; ingroups; i++) { - if (test_bit(i, nlk->groups)) - nlk->netlink_bind(i); - } - } - return 0; } @@ -1410,11 +1010,14 @@ if (addr->sa_family != AF_NETLINK) return -EINVAL; - /* Only superuser is allowed to send multicasts */ - if (nladdr->nl_groups && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) + if ((nladdr->nl_groups || nladdr->nl_pid) && + !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; - if (!nlk->portid) + /* No need for barriers here as we return to user-space without + * using any of the bound attributes. 
+ */ + if (!nlk->bound) err = netlink_autobind(sock); if (err == 0) { @@ -1482,6 +1085,31 @@ return sock; } +static struct sk_buff *netlink_alloc_large_skb(unsigned int size, + int broadcast) +{ + struct sk_buff *skb; + void *data; + + if (size <= NLMSG_GOODSIZE || broadcast) + return alloc_skb(size, GFP_KERNEL); + + size = SKB_DATA_ALIGN(size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + data = vmalloc(size); + if (data == NULL) + return NULL; + + skb = __build_skb(data, size); + if (skb == NULL) + vfree(data); + else + skb->destructor = netlink_skb_destructor; + + return skb; +} + /* * Attach a skb to a netlink socket. * The caller must hold a reference to the destination socket. On error, the @@ -1500,8 +1128,7 @@ nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_CONGESTED, &nlk->state)) && - !netlink_skb_is_mmaped(skb)) { + test_bit(NETLINK_S_CONGESTED, &nlk->state))) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) @@ -1515,7 +1142,7 @@ add_wait_queue(&nlk->wait, &wait); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_CONGESTED, &nlk->state)) && + test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); @@ -1537,15 +1164,10 @@ { int len = skb->len; -#ifdef CONFIG_NETLINK_MMAP - if (netlink_skb_is_mmaped(skb)) - netlink_queue_mmaped_skb(sk, skb); - else if (netlink_rx_is_mmaped(sk)) - netlink_ring_set_copied(sk, skb); - else -#endif /* CONFIG_NETLINK_MMAP */ - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, len); + netlink_deliver_tap(skb); + + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk); return len; } @@ -1565,27 +1187,7 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { - int delta; - WARN_ON(skb->sk != NULL); - if (netlink_skb_is_mmaped(skb)) - return skb; - - delta = skb->end - skb->tail; - if (delta * 2 < skb->truesize) - return skb; - - if (skb_shared(skb)) { - struct sk_buff *nskb = skb_clone(skb, allocation); - if (!nskb) - return skb; - consume_skb(skb); - skb = nskb; - } - - if (!pskb_expand_head(skb, 0, -delta, allocation)) - skb->truesize -= delta; - return skb; } @@ -1600,6 +1202,7 @@ ret = skb->len; netlink_skb_set_owner_r(skb, sk); NETLINK_CB(skb).sk = ssk; + netlink_deliver_tap_kernel(sk, ssk, skb); nlk->netlink_rcv(skb); consume_skb(skb); } else { @@ -1645,68 +1248,13 @@ } EXPORT_SYMBOL(netlink_unicast); -struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, - u32 dst_portid, gfp_t gfp_mask) +struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, + unsigned int ldiff, u32 dst_portid, + gfp_t gfp_mask) { -#ifdef CONFIG_NETLINK_MMAP - struct sock *sk = NULL; - struct sk_buff *skb; - struct netlink_ring *ring; - struct nl_mmap_hdr *hdr; - unsigned int maxlen; - - sk = netlink_getsockbyportid(ssk, dst_portid); - if (IS_ERR(sk)) - goto out; - - ring = &nlk_sk(sk)->rx_ring; - /* fast-path without atomic ops for common case: non-mmaped receiver */ - if (ring->pg_vec == NULL) - goto out_put; - - skb = alloc_skb_head(gfp_mask); - if (skb == NULL) - goto err1; - - spin_lock_bh(&sk->sk_receive_queue.lock); - /* check again under lock */ - if (ring->pg_vec == NULL) - goto out_free; - - maxlen = ring->frame_size - NL_MMAP_HDRLEN; - if (maxlen < size) - goto out_free; - - netlink_forward_ring(ring); - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) - goto err2; - 
netlink_ring_setup_skb(skb, sk, ring, hdr); - netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); - atomic_inc(&ring->pending); - netlink_increment_head(ring); - - spin_unlock_bh(&sk->sk_receive_queue.lock); - return skb; - -err2: - kfree_skb(skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); - netlink_overrun(sk); -err1: - sock_put(sk); - return NULL; - -out_free: - kfree_skb(skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); -out_put: - sock_put(sk); -out: -#endif return alloc_skb(size, gfp_mask); } -EXPORT_SYMBOL_GPL(netlink_alloc_skb); +EXPORT_SYMBOL_GPL(__netlink_alloc_skb); int netlink_has_listeners(struct sock *sk, unsigned int group) { @@ -1732,7 +1280,7 @@ struct netlink_sock *nlk = nlk_sk(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && - !test_bit(NETLINK_CONGESTED, &nlk->state)) { + !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); @@ -1755,25 +1303,34 @@ void *tx_data; }; -static int do_one_broadcast(struct sock *sk, - struct netlink_broadcast_data *p) +static void do_one_broadcast(struct sock *sk, + struct netlink_broadcast_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int val; if (p->exclude_sk == sk) - goto out; + return; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) - goto out; + return; - if (!net_eq(sock_net(sk), p->net)) - goto out; + if (!net_eq(sock_net(sk), p->net)) { + if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID)) + return; + + if (!peernet_has_id(sock_net(sk), p->net)) + return; + + if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, + CAP_NET_BROADCAST)) + return; + } if (p->failure) { netlink_overrun(sk); - goto out; + return; } sock_hold(sk); @@ -1793,27 +1350,34 @@ netlink_overrun(sk); /* Clone failed. Notify ALL listeners. 
*/ p->failure = 1; - if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; - } else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { + goto out; + } + if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if (sk_filter(sk, p->skb2)) { + goto out; + } + if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { + goto out; + } + NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); + NETLINK_CB(p->skb2).nsid_is_set = true; + val = netlink_broadcast_deliver(sk, p->skb2); + if (val < 0) { netlink_overrun(sk); - if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; p->skb2 = NULL; } - sock_put(sk); - out: - return 0; + sock_put(sk); } int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, @@ -1859,7 +1423,7 @@ consume_skb(info.skb2); if (info.delivered) { - if (info.congested && (allocation & __GFP_WAIT)) + if (info.congested && gfpflags_allow_blocking(allocation)) yield(); return 0; } @@ -1897,7 +1461,7 @@ !test_bit(p->group - 1, nlk->groups)) goto out; - if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) { + if (p->code == ENOBUFS && nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) { ret = 1; goto out; } @@ -1912,11 +1476,11 @@ * netlink_set_err - report error to broadcast listeners * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() * @portid: the PORTID of a process that we want to skip (if any) - * @groups: the broadcast group that will notice the error + * @group: the broadcast group that will notice the error * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the - * NETLINK_RECV_NO_ENOBUFS socket option. + * NETLINK_NO_ENOBUFS socket option. 
*/ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { @@ -1968,17 +1532,16 @@ if (level != SOL_NETLINK) return -ENOPROTOOPT; - if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING && - optlen >= sizeof(int) && + if (optlen >= sizeof(int) && get_user(val, (unsigned int __user *)optval)) return -EFAULT; switch (optname) { case NETLINK_PKTINFO: if (val) - nlk->flags |= NETLINK_RECV_PKTINFO; + nlk->flags |= NETLINK_F_RECV_PKTINFO; else - nlk->flags &= ~NETLINK_RECV_PKTINFO; + nlk->flags &= ~NETLINK_F_RECV_PKTINFO; err = 0; break; case NETLINK_ADD_MEMBERSHIP: @@ -1990,53 +1553,55 @@ return err; if (!val || val - 1 >= nlk->ngroups) return -EINVAL; + if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) { + err = nlk->netlink_bind(sock_net(sk), val); + if (err) + return err; + } netlink_table_grab(); netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP); netlink_table_ungrab(); - - if (nlk->netlink_bind) - nlk->netlink_bind(val); + if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) + nlk->netlink_unbind(sock_net(sk), val); err = 0; break; } case NETLINK_BROADCAST_ERROR: if (val) - nlk->flags |= NETLINK_BROADCAST_SEND_ERROR; + nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR; else - nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; + nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR; err = 0; break; case NETLINK_NO_ENOBUFS: if (val) { - nlk->flags |= NETLINK_RECV_NO_ENOBUFS; - clear_bit(NETLINK_CONGESTED, &nlk->state); + nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS; + clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } else { - nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; + nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS; } err = 0; break; -#ifdef CONFIG_NETLINK_MMAP - case NETLINK_RX_RING: - case NETLINK_TX_RING: { - struct nl_mmap_req req; - - /* Rings might consume more memory than queue limits, require - * CAP_NET_ADMIN. - */ - if (!capable(CAP_NET_ADMIN)) + case NETLINK_LISTEN_ALL_NSID: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; - if (optlen < sizeof(req)) - return -EINVAL; - if (copy_from_user(&req, optval, sizeof(req))) - return -EFAULT; - err = netlink_set_ring(sk, &req, - optname == NETLINK_TX_RING); + + if (val) + nlk->flags |= NETLINK_F_LISTEN_ALL_NSID; + else + nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; + err = 0; + break; + case NETLINK_CAP_ACK: + if (val) + nlk->flags |= NETLINK_F_CAP_ACK; + else + nlk->flags &= ~NETLINK_F_CAP_ACK; + err = 0; break; - } -#endif /* CONFIG_NETLINK_MMAP */ default: err = -ENOPROTOOPT; } @@ -2063,7 +1628,7 @@ if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0; + val = nlk->flags & NETLINK_F_RECV_PKTINFO ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2073,7 +1638,7 @@ if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0; + val = nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2083,7 +1648,39 @@ if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0; + val = nlk->flags & NETLINK_F_RECV_NO_ENOBUFS ? 
1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; + case NETLINK_LIST_MEMBERSHIPS: { + int pos, idx, shift; + + err = 0; + netlink_lock_table(); + for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { + if (len - pos < sizeof(u32)) + break; + + idx = pos / sizeof(unsigned long); + shift = (pos % sizeof(unsigned long)) * 8; + if (put_user((u32)(nlk->groups[idx] >> shift), + (u32 __user *)(optval + pos))) { + err = -EFAULT; + break; + } + } + if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen)) + err = -EFAULT; + netlink_unlock_table(); + break; + } + case NETLINK_CAP_ACK: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_F_CAP_ACK ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2103,13 +1700,21 @@ put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } -static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) +static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + if (!NETLINK_CB(skb).nsid_is_set) + return; + + put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), + &NETLINK_CB(skb).nsid); +} + +static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { - struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - struct sockaddr_nl *addr = msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); u32 dst_portid; u32 dst_group; struct sk_buff *skb; @@ -2120,10 +1725,7 @@ if (msg->msg_flags&MSG_OOB) return -EOPNOTSUPP; - if (NULL == siocb->scm) - siocb->scm = &scm; - - err = scm_send(sock, msg, siocb->scm, true); + err = scm_send(sock, msg, &scm, true); if (err < 0) return err; @@ -2143,34 +1745,30 @@ dst_group = nlk->dst_group; } - if (!nlk->portid) { + if (!nlk->bound) { err = netlink_autobind(sock); if (err) goto out; - } - - if (netlink_tx_is_mmaped(sk) && - msg->msg_iov->iov_base == NULL) { - err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, - siocb); - goto out; + } else { + /* Ensure nlk is hashed and visible. 
*/ + smp_rmb(); } err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; - skb = alloc_skb(len, GFP_KERNEL); + skb = netlink_alloc_large_skb(len, dst_group); if (skb == NULL) goto out; NETLINK_CB(skb).portid = nlk->portid; NETLINK_CB(skb).dst_group = dst_group; - NETLINK_CB(skb).creds = siocb->scm->creds; + NETLINK_CB(skb).creds = scm.creds; NETLINK_CB(skb).flags = netlink_skb_flags; err = -EFAULT; - if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + if (memcpy_from_msg(skb_put(skb, len), msg, len)) { kfree_skb(skb); goto out; } @@ -2188,15 +1786,13 @@ err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT); out: - scm_destroy(siocb->scm); + scm_destroy(&scm); return err; } -static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len, +static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { - struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct scm_cookie scm; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); @@ -2233,6 +1829,11 @@ } #endif + /* Record the max length of recvmsg() calls for future allocations */ + nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len); + nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len, + SKB_WITH_OVERHEAD(32768)); + copied = data_skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; @@ -2240,10 +1841,10 @@ } skb_reset_transport_header(data_skb); - err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { - struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).portid; @@ -2251,20 +1852,20 @@ msg->msg_namelen = sizeof(*addr); } - if (nlk->flags & NETLINK_RECV_PKTINFO) + if (nlk->flags & NETLINK_F_RECV_PKTINFO) netlink_cmsg_recv_pktinfo(msg, skb); + if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) + netlink_cmsg_listen_all_nsid(sk, msg, skb); - if (NULL == siocb->scm) { - memset(&scm, 0, sizeof(scm)); - siocb->scm = &scm; - } - siocb->scm->creds = *NETLINK_CREDS(skb); + memset(&scm, 0, sizeof(scm)); + scm.creds = *NETLINK_CREDS(skb); if (flags & MSG_TRUNC) copied = data_skb->len; skb_free_datagram(sk, skb); - if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { + if (nlk->cb_running && + atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { sk->sk_err = -ret; @@ -2272,13 +1873,13 @@ } } - scm_recv(sock, msg, siocb->scm, flags); + scm_recv(sock, msg, &scm, flags); out: netlink_rcv_wake(sk); return err ? : copied; } -static void netlink_data_ready(struct sock *sk, int len) +static void netlink_data_ready(struct sock *sk) { BUG(); } @@ -2308,17 +1909,10 @@ if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - /* - * We have to just have a reference on the net from sk, but don't - * get_net it. Besides, we cannot get and then put the net here. - * So we create one inside init_net and the move it to net. 
- */ - - if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0) + if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0) goto out_sock_release_nosk; sk = sock->sk; - sk_change_net(sk, net); if (!cfg || cfg->groups < 32) groups = 32; @@ -2333,11 +1927,11 @@ if (cfg && cfg->input) nlk_sk(sk)->netlink_rcv = cfg->input; - if (netlink_insert(sk, net, 0)) + if (netlink_insert(sk, 0)) goto out_sock_release; nlk = nlk_sk(sk); - nlk->flags |= NETLINK_KERNEL_SOCKET; + nlk->flags |= NETLINK_F_KERNEL_SOCKET; netlink_table_grab(); if (!nl_table[unit].registered) { @@ -2347,7 +1941,10 @@ nl_table[unit].module = module; if (cfg) { nl_table[unit].bind = cfg->bind; + nl_table[unit].unbind = cfg->unbind; nl_table[unit].flags = cfg->flags; + if (cfg->compare) + nl_table[unit].compare = cfg->compare; } nl_table[unit].registered = 1; } else { @@ -2371,7 +1968,10 @@ void netlink_kernel_release(struct sock *sk) { - sk_release_kernel(sk); + if (sk == NULL || sk->sk_socket == NULL) + return; + + sock_release(sk->sk_socket); } EXPORT_SYMBOL(netlink_kernel_release); @@ -2430,28 +2030,13 @@ netlink_update_socket_mc(nlk_sk(sk), group, 0); } -/** - * netlink_clear_multicast_users - kick off multicast listeners - * - * This function removes all listeners from the given group. - * @ksk: The kernel netlink socket, as returned by - * netlink_kernel_create(). - * @group: The multicast group to clear. - */ -void netlink_clear_multicast_users(struct sock *ksk, unsigned int group) -{ - netlink_table_grab(); - __netlink_clear_multicast_users(ksk, group); - netlink_table_ungrab(); -} - struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags) { struct nlmsghdr *nlh; int size = nlmsg_msg_size(len); - nlh = (struct nlmsghdr*)skb_put(skb, NLMSG_ALIGN(size)); + nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_ALIGN(size)); nlh->nlmsg_type = type; nlh->nlmsg_len = size; nlh->nlmsg_flags = flags; @@ -2474,25 +2059,53 @@ struct netlink_callback *cb; struct sk_buff *skb = NULL; struct nlmsghdr *nlh; + struct module *module; int len, err = -ENOBUFS; + int alloc_min_size; int alloc_size; mutex_lock(nlk->cb_mutex); - - cb = nlk->cb; - if (cb == NULL) { + if (!nlk->cb_running) { err = -EINVAL; goto errout_skb; } - alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); - - if (!netlink_rx_is_mmaped(sk) && - atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto errout_skb; - skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL); + + /* NLMSG_GOODSIZE is small to avoid high order allocations being + * required, but it makes sense to _attempt_ a 16K bytes allocation + * to reduce number of system calls on dump operations, if user + * ever provided a big enough buffer. + */ + cb = &nlk->cb; + alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); + + if (alloc_min_size < nlk->max_recvmsg_len) { + alloc_size = nlk->max_recvmsg_len; + skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, + (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) | + __GFP_NOWARN | __GFP_NORETRY); + } + if (!skb) { + alloc_size = alloc_min_size; + skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, + (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM)); + } if (!skb) goto errout_skb; + + /* Trim skb to allocated size. User is expected to provide buffer as + * large as max(min_dump_alloc, 16KiB (mac_recvmsg_len capped at + * netlink_recvmsg())). dump will pack as many smaller messages as + * could fit within the allocated skb. 
skb is typically allocated + * with larger space than required (could be as much as near 2x the + * requested size with align to next power of 2 approach). Allowing + * dump to use the excess space makes it difficult for a user to have a + * reasonable static buffer based on the expected largest dump of a + * single netdev. The outcome is MSG_TRUNC error. + */ + skb_reserve(skb, skb_tailroom(skb) - alloc_size); netlink_skb_set_owner_r(skb, sk); len = cb->dump(skb, cb); @@ -2522,11 +2135,13 @@ if (cb->done) cb->done(cb); - nlk->cb = NULL; - mutex_unlock(nlk->cb_mutex); - module_put(cb->module); - netlink_consume_callback(cb); + nlk->cb_running = false; + module = cb->module; + skb = cb->skb; + mutex_unlock(nlk->cb_mutex); + module_put(module); + consume_skb(skb); return 0; errout_skb: @@ -2544,59 +2159,42 @@ struct netlink_sock *nlk; int ret; - cb = kzalloc(sizeof(*cb), GFP_KERNEL); - if (cb == NULL) - return -ENOBUFS; - - /* Memory mapped dump requests need to be copied to avoid looping - * on the pending state in netlink_mmap_sendmsg() while the CB hold - * a reference to the skb. - */ - if (netlink_skb_is_mmaped(skb)) { - skb = skb_copy(skb, GFP_KERNEL); - if (skb == NULL) { - kfree(cb); - return -ENOBUFS; - } - } else - atomic_inc(&skb->users); - - cb->dump = control->dump; - cb->done = control->done; - cb->nlh = nlh; - cb->data = control->data; - cb->module = control->module; - cb->min_dump_alloc = control->min_dump_alloc; - cb->skb = skb; + atomic_inc(&skb->users); sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { - netlink_destroy_callback(cb); - return -ECONNREFUSED; + ret = -ECONNREFUSED; + goto error_free; } - nlk = nlk_sk(sk); + nlk = nlk_sk(sk); mutex_lock(nlk->cb_mutex); /* A dump is in progress... */ - if (nlk->cb) { - mutex_unlock(nlk->cb_mutex); - netlink_destroy_callback(cb); + if (nlk->cb_running) { ret = -EBUSY; - goto out; + goto error_unlock; } /* add reference of module which cb->dump belongs to */ - if (!try_module_get(cb->module)) { - mutex_unlock(nlk->cb_mutex); - netlink_destroy_callback(cb); + if (!try_module_get(control->module)) { ret = -EPROTONOSUPPORT; - goto out; + goto error_unlock; } - nlk->cb = cb; + cb = &nlk->cb; + memset(cb, 0, sizeof(*cb)); + cb->dump = control->dump; + cb->done = control->done; + cb->nlh = nlh; + cb->data = control->data; + cb->module = control->module; + cb->min_dump_alloc = control->min_dump_alloc; + cb->skb = skb; + + nlk->cb_running = true; + mutex_unlock(nlk->cb_mutex); ret = netlink_dump(sk); -out: sock_put(sk); if (ret) @@ -2606,6 +2204,13 @@ * signal not to send ACK even if it was requested. */ return -EINTR; + +error_unlock: + sock_put(sk); + mutex_unlock(nlk->cb_mutex); +error_free: + kfree_skb(skb); + return ret; } EXPORT_SYMBOL(__netlink_dump_start); @@ -2615,9 +2220,12 @@ struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); + struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); - /* error messages get the original request appened */ - if (err) + /* Error messages get the original request appened, unless the user + * requests to cap the error message. + */ + if (!(nlk->flags & NETLINK_F_CAP_ACK) && err) payload += nlmsg_len(nlh); skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), @@ -2640,7 +2248,7 @@ NLMSG_ERROR, payload, 0); errmsg = nlmsg_data(rep); errmsg->error = err; - memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh)); + memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? 
nlh->nlmsg_len : sizeof(*nlh)); netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); } EXPORT_SYMBOL(netlink_ack); @@ -2729,89 +2337,98 @@ #ifdef CONFIG_PROC_FS struct nl_seq_iter { struct seq_net_private p; + struct rhashtable_iter hti; int link; - int hash_idx; }; -static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) +static int netlink_walk_start(struct nl_seq_iter *iter) { - struct nl_seq_iter *iter = seq->private; - int i, j; - struct sock *s; - loff_t off = 0; - - for (i = 0; i < MAX_LINKS; i++) { - struct nl_portid_hash *hash = &nl_table[i].hash; + int err; - for (j = 0; j <= hash->mask; j++) { - sk_for_each(s, &hash->table[j]) { - if (sock_net(s) != seq_file_net(seq)) - continue; - if (off == pos) { - iter->link = i; - iter->hash_idx = j; - return s; - } - ++off; - } - } + err = rhashtable_walk_init(&nl_table[iter->link].hash, + &iter->hti, GFP_KERNEL); + if (err) { + iter->link = MAX_LINKS; + return err; } - return NULL; + + err = rhashtable_walk_start(&iter->hti); + return err == -EAGAIN ? 0 : err; } -static void *netlink_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(nl_table_lock) +static void netlink_walk_stop(struct nl_seq_iter *iter) { - read_lock(&nl_table_lock); - return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN; + rhashtable_walk_stop(&iter->hti); + rhashtable_walk_exit(&iter->hti); } -static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static void *__netlink_seq_next(struct seq_file *seq) { - struct sock *s; - struct nl_seq_iter *iter; - int i, j; - - ++*pos; - - if (v == SEQ_START_TOKEN) - return netlink_seq_socket_idx(seq, 0); + struct nl_seq_iter *iter = seq->private; + struct netlink_sock *nlk; - iter = seq->private; - s = v; do { - s = sk_next(s); - } while (s && sock_net(s) != seq_file_net(seq)); - if (s) - return s; + for (;;) { + int err; - i = iter->link; - j = iter->hash_idx + 1; + nlk = rhashtable_walk_next(&iter->hti); - do { - struct nl_portid_hash *hash = &nl_table[i].hash; + if (IS_ERR(nlk)) { + if (PTR_ERR(nlk) == -EAGAIN) + continue; - for (; j <= hash->mask; j++) { - s = sk_head(&hash->table[j]); - while (s && sock_net(s) != seq_file_net(seq)) - s = sk_next(s); - if (s) { - iter->link = i; - iter->hash_idx = j; - return s; + return nlk; } + + if (nlk) + break; + + netlink_walk_stop(iter); + if (++iter->link >= MAX_LINKS) + return NULL; + + err = netlink_walk_start(iter); + if (err) + return ERR_PTR(err); } + } while (sock_net(&nlk->sk) != seq_file_net(seq)); - j = 0; - } while (++i < MAX_LINKS); + return nlk; +} - return NULL; +static void *netlink_seq_start(struct seq_file *seq, loff_t *posp) +{ + struct nl_seq_iter *iter = seq->private; + void *obj = SEQ_START_TOKEN; + loff_t pos; + int err; + + iter->link = 0; + + err = netlink_walk_start(iter); + if (err) + return ERR_PTR(err); + + for (pos = *posp; pos && obj && !IS_ERR(obj); pos--) + obj = __netlink_seq_next(seq); + + return obj; +} + +static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return __netlink_seq_next(seq); } static void netlink_seq_stop(struct seq_file *seq, void *v) - __releases(nl_table_lock) { - read_unlock(&nl_table_lock); + struct nl_seq_iter *iter = seq->private; + + if (iter->link >= MAX_LINKS) + return; + + netlink_walk_stop(iter); } @@ -2825,14 +2442,14 @@ struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); - seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %pK %-8d %-8d %-8lu\n", + seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d 
%-8d %-8lu\n", s, s->sk_protocol, nlk->portid, nlk->groups ? (u32)nlk->groups[0] : 0, sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), - nlk->cb, + nlk->cb_running, atomic_read(&s->sk_refcnt), atomic_read(&s->sk_drops), sock_i_ino(s) @@ -2887,7 +2504,7 @@ .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll = netlink_poll, + .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -2895,7 +2512,7 @@ .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, - .mmap = netlink_mmap, + .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; @@ -2946,11 +2563,26 @@ .exit = netlink_net_exit, }; +static inline u32 netlink_hash(const void *data, u32 len, u32 seed) +{ + const struct netlink_sock *nlk = data; + struct netlink_compare_arg arg; + + netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid); + return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed); +} + +static const struct rhashtable_params netlink_rhashtable_params = { + .head_offset = offsetof(struct netlink_sock, node), + .key_len = netlink_compare_arg_len, + .obj_hashfn = netlink_hash, + .obj_cmpfn = netlink_compare, + .automatic_shrinking = true, +}; + static int __init netlink_proto_init(void) { int i; - unsigned long limit; - unsigned int order; int err = proto_register(&netlink_proto, 0); if (err != 0) @@ -2962,32 +2594,18 @@ if (!nl_table) goto panic; - if (totalram_pages >= (128 * 1024)) - limit = totalram_pages >> (21 - PAGE_SHIFT); - else - limit = totalram_pages >> (23 - PAGE_SHIFT); - - order = get_bitmask_order(limit) - 1 + PAGE_SHIFT; - limit = (1UL << order) / sizeof(struct hlist_head); - order = get_bitmask_order(min(limit, (unsigned long)UINT_MAX)) - 1; - for (i = 0; i < MAX_LINKS; i++) { - struct nl_portid_hash *hash = &nl_table[i].hash; - - hash->table = nl_portid_hash_zalloc(1 * sizeof(*hash->table)); - if (!hash->table) { - while (i-- > 0) - nl_portid_hash_free(nl_table[i].hash.table, - 1 * sizeof(*hash->table)); + if (rhashtable_init(&nl_table[i].hash, + &netlink_rhashtable_params) < 0) { + while (--i > 0) + rhashtable_destroy(&nl_table[i].hash); kfree(nl_table); goto panic; } - hash->max_shift = order; - hash->shift = 0; - hash->mask = 0; - hash->rehash_time = jiffies; } + INIT_LIST_HEAD(&netlink_tap_all); + netlink_add_usersock_entry(); sock_register(&netlink_family_ops);
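
The dump-side comments above amount to a contract with the reader: netlink_dump() remembers the largest buffer ever passed to recvmsg() (max_recvmsg_len), opportunistically tries an skb of that size (no direct reclaim, __GFP_NOWARN | __GFP_NORETRY), otherwise falls back to max(min_dump_alloc, NLMSG_GOODSIZE), and then trims the skb so the packed messages never exceed what the reader asked for; a reader whose buffer is smaller still gets MSG_TRUNC. A minimal userspace sketch of the usual countermeasure, probing the pending datagram with MSG_PEEK | MSG_TRUNC before the real read, follows; it is illustrative only, not part of this patch, and the helper name is made up:

#include <stdlib.h>
#include <sys/socket.h>
#include <linux/netlink.h>

/* Receive one netlink datagram, sizing the buffer from a
 * MSG_PEEK | MSG_TRUNC probe so a large dump chunk is never
 * truncated. Caller frees *bufp on success.
 */
static ssize_t recv_nl_datagram(int fd, void **bufp)
{
	struct sockaddr_nl sa;
	struct iovec iov = { .iov_base = NULL, .iov_len = 0 };
	struct msghdr msg = {
		.msg_name	= &sa,
		.msg_namelen	= sizeof(sa),
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
	};
	ssize_t len;

	/* With MSG_TRUNC, recvmsg() reports the real datagram length
	 * even though no bytes are copied by this probe. */
	len = recvmsg(fd, &msg, MSG_PEEK | MSG_TRUNC);
	if (len < 0)
		return len;

	*bufp = malloc(len);
	if (!*bufp)
		return -1;

	iov.iov_base = *bufp;
	iov.iov_len  = len;
	return recvmsg(fd, &msg, 0);	/* the actual read */
}

Consistently passing a buffer of 16 KiB or more also lets the kernel serve later dump chunks from the larger allocation path above, which cuts the number of recvmsg() calls per dump.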
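
The NETLINK_F_CAP_ACK flag tested in netlink_ack() above is controlled per socket: when it is set, the NLMSG_ERROR reply carries only the struct nlmsgerr block (whose embedded msg field holds just the nlmsghdr of the offending request) instead of echoing the request's full payload. On a kernel carrying this change, userspace opts in through the NETLINK_CAP_ACK socket option; a small hedged sketch follows (the fallback defines mirror the mainline uapi values and are only needed with older headers):

#include <sys/socket.h>
#include <linux/netlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK	270	/* from <linux/socket.h> */
#endif
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK	10	/* only recognized by kernels with this patch */
#endif

/* Request "capped" ACKs: error replies stay small even when the
 * request that triggered them had a large payload. */
static int enable_cap_ack(int fd)
{
	int one = 1;

	return setsockopt(fd, SOL_NETLINK, NETLINK_CAP_ACK,
			  &one, sizeof(one));
}

This mainly matters for batched or large requests, where each ACK would otherwise be roughly as large as the request it acknowledges.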