--- zzzz-none-000/linux-3.10.107/net/unix/af_unix.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/net/unix/af_unix.c 2021-02-04 17:41:59.000000000 +0000 @@ -80,6 +80,8 @@ * with BSD names. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -114,6 +116,7 @@ #include #include #include +#include struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_socket_table); @@ -137,12 +140,17 @@ #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { - memcpy(UNIXSID(skb), &scm->secid, sizeof(u32)); + UNIXCB(skb).secid = scm->secid; } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { - scm->secid = *UNIXSID(skb); + scm->secid = UNIXCB(skb).secid; +} + +static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) +{ + return (scm->secid == UNIXCB(skb).secid); } #else static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) @@ -150,6 +158,11 @@ static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } + +static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) +{ + return true; +} #endif /* CONFIG_SECURITY_NETWORK */ /* @@ -216,15 +229,16 @@ if (!sunaddr || sunaddr->sun_family != AF_UNIX) return -EINVAL; if (sunaddr->sun_path[0]) { - /* - * This may look like an off by one error but it is a bit more - * subtle. 108 is the longest valid AF_UNIX path for a binding. - * sun_path[108] doesn't as such exist. However in kernel space - * we are guaranteed that it is a valid memory location in our - * kernel address buffer. - */ + unsigned char value; + if(len == sizeof(*sunaddr)) len--; /*--- mbahr@avm: don't write over struct-boundary ! ---*/ + value = ((unsigned char *)sunaddr)[len]; ((char *)sunaddr)[len] = 0; len = strlen(sunaddr->sun_path)+1+sizeof(short); + if(value && (len == sizeof(*sunaddr))) { + /*--- mbahr@avm: check - UNIX_MAX_PATH include zero-termination! ---*/ + printk(KERN_ERR"%s: error sun_path exceeded '%s%c'\n", __func__, sunaddr->sun_path, value); + return -EINVAL; + } return len; } @@ -302,7 +316,7 @@ &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->path.dentry; - if (dentry && dentry->d_inode == i) { + if (dentry && d_real_inode(dentry) == i) { sock_hold(s); goto found; } @@ -425,9 +439,10 @@ return 0; } -static inline int unix_writable(struct sock *sk) +static int unix_writable(const struct sock *sk) { - return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; + return sk->sk_state != TCP_LISTEN && + (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; } static void unix_write_space(struct sock *sk) @@ -476,7 +491,7 @@ WARN_ON(!sk_unhashed(sk)); WARN_ON(sk->sk_socket); if (!sock_flag(sk, SOCK_DEAD)) { - printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk); + pr_info("Attempt to release alive unix socket: %p\n", sk); return; } @@ -488,7 +503,7 @@ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); #ifdef UNIX_REFCNT_DEBUG - printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk, + pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, atomic_long_read(&unix_nr_socks)); #endif } @@ -541,6 +556,7 @@ if (state == TCP_LISTEN) unix_release_sock(skb->sk, 1); /* passed fds are erased in the kfree_skb hook */ + UNIXCB(skb).consumed = skb->len; kfree_skb(skb); } @@ -627,30 +643,30 @@ poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); static int unix_shutdown(struct socket *, int); -static int unix_stream_sendmsg(struct kiocb *, struct socket *, - struct msghdr *, size_t); -static int unix_stream_recvmsg(struct kiocb *, struct socket *, - struct msghdr *, size_t, int); -static int unix_dgram_sendmsg(struct kiocb *, struct socket *, - struct msghdr *, size_t); -static int unix_dgram_recvmsg(struct kiocb *, struct socket *, - struct msghdr *, size_t, int); +static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); +static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); +static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset, + size_t size, int flags); +static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, + struct pipe_inode_info *, size_t size, + unsigned int flags); +static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); +static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); -static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, - struct msghdr *, size_t); -static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *, - struct msghdr *, size_t, int); +static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); +static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, + int); static int unix_set_peek_off(struct sock *sk, int val) { struct unix_sock *u = unix_sk(sk); - if (mutex_lock_interruptible(&u->readlock)) + if (mutex_lock_interruptible(&u->iolock)) return -EINTR; sk->sk_peek_off = val; - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); return 0; } @@ -674,7 +690,8 @@ .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, + .sendpage = unix_stream_sendpage, + .splice_read = unix_stream_splice_read, .set_peek_off = unix_set_peek_off, }; @@ -736,7 +753,7 @@ */ static struct lock_class_key af_unix_sk_receive_queue_lock_key; -static struct sock *unix_create1(struct net *net, struct socket *sock) +static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) { struct sock *sk = NULL; struct unix_sock *u; @@ -745,7 +762,7 @@ if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; - sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern); if (!sk) goto out; @@ -762,7 +779,8 @@ spin_lock_init(&u->lock); atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); - mutex_init(&u->readlock); /* single task reading lock */ + mutex_init(&u->iolock); /* single task reading lock */ + mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); unix_insert_socket(unix_sockets_unbound(sk), sk); @@ -805,7 +823,7 @@ return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock) ? 0 : -ENOMEM; + return unix_create1(net, sock, kern) ? 0 : -ENOMEM; } static int unix_release(struct socket *sock) @@ -831,7 +849,7 @@ int err; unsigned int retries = 0; - err = mutex_lock_interruptible(&u->readlock); + err = mutex_lock_interruptible(&u->bindlock); if (err) return err; @@ -878,7 +896,7 @@ spin_unlock(&unix_table_lock); err = 0; -out: mutex_unlock(&u->readlock); +out: mutex_unlock(&u->bindlock); return err; } @@ -895,7 +913,7 @@ err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path); if (err) goto fail; - inode = path.dentry->d_inode; + inode = d_real_inode(path.dentry); err = inode_permission(inode, MAY_WRITE); if (err) goto put_fail; @@ -956,7 +974,7 @@ */ err = security_path_mknod(&path, dentry, mode, 0); if (!err) { - err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); + err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0); if (!err) { res->mnt = mntget(path.mnt); res->dentry = dget(dentry); @@ -1004,7 +1022,7 @@ } } - err = mutex_lock_interruptible(&u->readlock); + err = mutex_lock_interruptible(&u->bindlock); if (err) goto out_put; @@ -1024,7 +1042,7 @@ if (sun_path[0]) { addr->hash = UNIX_HASH_SIZE; - hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1); + hash = d_real_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); u->path = path; list = &unix_socket_table[hash]; @@ -1048,7 +1066,7 @@ out_unlock: spin_unlock(&unix_table_lock); out_up: - mutex_unlock(&u->readlock); + mutex_unlock(&u->bindlock); out_put: if (err) path_put(&path); @@ -1212,7 +1230,7 @@ err = -ENOMEM; /* create new sock for complete connection */ - newsk = unix_create1(sock_net(sk), NULL); + newsk = unix_create1(sock_net(sk), NULL, 0); if (newsk == NULL) goto out; @@ -1326,7 +1344,7 @@ sk->sk_state = TCP_ESTABLISHED; sock_hold(newsk); - smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */ + smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ unix_peer(sk) = newsk; unix_state_unlock(sk); @@ -1336,7 +1354,7 @@ __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); - other->sk_data_ready(other, 0); + other->sk_data_ready(other); sock_put(other); return 0; @@ -1548,6 +1566,7 @@ UNIXCB(skb).uid = scm->creds.uid; UNIXCB(skb).gid = scm->creds.gid; UNIXCB(skb).fp = NULL; + unix_get_secdata(scm, skb); if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); @@ -1555,6 +1574,14 @@ return err; } +static bool unix_passcred_enabled(const struct socket *sock, + const struct sock *other) +{ + return test_bit(SOCK_PASSCRED, &sock->flags) || + !other->sk_socket || + test_bit(SOCK_PASSCRED, &other->sk_socket->flags); +} + /* * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket @@ -1565,41 +1592,65 @@ { if (UNIXCB(skb).pid) return; - if (test_bit(SOCK_PASSCRED, &sock->flags) || - !other->sk_socket || - test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) { + if (unix_passcred_enabled(sock, other)) { UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } } +static int maybe_init_creds(struct scm_cookie *scm, + struct socket *socket, + const struct sock *other) +{ + int err; + struct msghdr msg = { .msg_controllen = 0 }; + + err = scm_send(socket, &msg, scm, false); + if (err) + return err; + + if (unix_passcred_enabled(socket, other)) { + scm->pid = get_pid(task_tgid(current)); + current_uid_gid(&scm->creds.uid, &scm->creds.gid); + } + return err; +} + +static bool unix_skb_scm_eq(struct sk_buff *skb, + struct scm_cookie *scm) +{ + const struct unix_skb_parms *u = &UNIXCB(skb); + + return u->pid == scm->pid && + uid_eq(u->uid, scm->creds.uid) && + gid_eq(u->gid, scm->creds.gid) && + unix_secdata_eq(scm, skb); +} + /* * Send AF_UNIX data. */ -static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { - struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct unix_sock *u = unix_sk(sk); - struct sockaddr_un *sunaddr = msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); struct sock *other = NULL; int namelen = 0; /* fake GCC */ int err; unsigned int hash; struct sk_buff *skb; long timeo; - struct scm_cookie tmp_scm; + struct scm_cookie scm; int max_level; int data_len = 0; int sk_locked; - if (NULL == siocb->scm) - siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm, false); + err = scm_send(sock, msg, &scm, false); if (err < 0) return err; @@ -1628,26 +1679,30 @@ if (len > sk->sk_sndbuf - 32) goto out; - if (len > SKB_MAX_ALLOC) + if (len > SKB_MAX_ALLOC) { data_len = min_t(size_t, len - SKB_MAX_ALLOC, MAX_SKB_FRAGS * PAGE_SIZE); + data_len = PAGE_ALIGN(data_len); + + BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); + } skb = sock_alloc_send_pskb(sk, len - data_len, data_len, - msg->msg_flags & MSG_DONTWAIT, &err); + msg->msg_flags & MSG_DONTWAIT, &err, + PAGE_ALLOC_COSTLY_ORDER); if (skb == NULL) goto out; - err = unix_scm_to_skb(siocb->scm, skb, true); + err = unix_scm_to_skb(&scm, skb, true); if (err < 0) goto out_free; max_level = err + 1; - unix_get_secdata(siocb->scm, skb); skb_put(skb, len - data_len); skb->data_len = data_len; skb->len = len; - err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len); + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); if (err) goto out_free; @@ -1719,7 +1774,12 @@ goto out_unlock; } - if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + /* other == sk && unix_peer(other) != sk if + * - unix_peer(sk) == NULL, destination address bound to sk + * - unix_peer(sk) == sk by time of get but disconnected before lock + */ + if (other != sk && + unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); @@ -1758,9 +1818,9 @@ if (max_level > unix_sk(other)->recursion_level) unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); - other->sk_data_ready(other, len); + other->sk_data_ready(other); sock_put(other); - scm_destroy(siocb->scm); + scm_destroy(&scm); return len; out_unlock: @@ -1772,28 +1832,30 @@ out: if (other) sock_put(other); - scm_destroy(siocb->scm); + scm_destroy(&scm); return err; } +/* We use paged skbs for stream sockets, and limit occupancy to 32768 + * bytes, and a minimun of a full page. + */ +#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) -static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { - struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct sock *sk = sock->sk; struct sock *other = NULL; int err, size; struct sk_buff *skb; int sent = 0; - struct scm_cookie tmp_scm; + struct scm_cookie scm; bool fds_sent = false; int max_level; + int data_len; - if (NULL == siocb->scm) - siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm, false); + err = scm_send(sock, msg, &scm, false); if (err < 0) return err; @@ -1815,42 +1877,26 @@ goto pipe_err; while (sent < len) { - /* - * Optimisation for the fact that under 0.01% of X - * messages typically need breaking up. - */ - - size = len-sent; + size = len - sent; /* Keep two messages in the pipe so it schedules better */ - if (size > ((sk->sk_sndbuf >> 1) - 64)) - size = (sk->sk_sndbuf >> 1) - 64; + size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); - if (size > SKB_MAX_ALLOC) - size = SKB_MAX_ALLOC; + /* allow fallback to order-0 allocations */ + size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); - /* - * Grab a buffer - */ + data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); - skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, - &err); + data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); - if (skb == NULL) + skb = sock_alloc_send_pskb(sk, size - data_len, data_len, + msg->msg_flags & MSG_DONTWAIT, &err, + get_order(UNIX_SKB_FRAGS_SZ)); + if (!skb) goto out_err; - /* - * If you pass two values to the sock_alloc_send_skb - * it tries to grab the large buffer with GFP_NOFS - * (which can fail easily), and if it fails grab the - * fallback size buffer which is under a page and will - * succeed. [Alan] - */ - size = min_t(int, size, skb_tailroom(skb)); - - /* Only send the fds in the first buffer */ - err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); + err = unix_scm_to_skb(&scm, skb, !fds_sent); if (err < 0) { kfree_skb(skb); goto out_err; @@ -1858,7 +1904,10 @@ max_level = err + 1; fds_sent = true; - err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + skb_put(skb, size - data_len); + skb->data_len = data_len; + skb->len = size; + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) { kfree_skb(skb); goto out_err; @@ -1875,12 +1924,11 @@ if (max_level > unix_sk(other)->recursion_level) unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); - other->sk_data_ready(other, size); + other->sk_data_ready(other); sent += size; } - scm_destroy(siocb->scm); - siocb->scm = NULL; + scm_destroy(&scm); return sent; @@ -1892,13 +1940,128 @@ send_sig(SIGPIPE, current, 0); err = -EPIPE; out_err: - scm_destroy(siocb->scm); - siocb->scm = NULL; + scm_destroy(&scm); return sent ? : err; } -static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) +static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, + int offset, size_t size, int flags) +{ + int err; + bool send_sigpipe = false; + bool init_scm = true; + struct scm_cookie scm; + struct sock *other, *sk = socket->sk; + struct sk_buff *skb, *newskb = NULL, *tail = NULL; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + other = unix_peer(sk); + if (!other || sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + if (false) { +alloc_skb: + unix_state_unlock(other); + mutex_unlock(&unix_sk(other)->iolock); + newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, + &err, 0); + if (!newskb) + goto err; + } + + /* we must acquire iolock as we modify already present + * skbs in the sk_receive_queue and mess with skb->len + */ + err = mutex_lock_interruptible(&unix_sk(other)->iolock); + if (err) { + err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS; + goto err; + } + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + err = -EPIPE; + send_sigpipe = true; + goto err_unlock; + } + + unix_state_lock(other); + + if (sock_flag(other, SOCK_DEAD) || + other->sk_shutdown & RCV_SHUTDOWN) { + err = -EPIPE; + send_sigpipe = true; + goto err_state_unlock; + } + + if (init_scm) { + err = maybe_init_creds(&scm, socket, other); + if (err) + goto err_state_unlock; + init_scm = false; + } + + skb = skb_peek_tail(&other->sk_receive_queue); + if (tail && tail == skb) { + skb = newskb; + } else if (!skb || !unix_skb_scm_eq(skb, &scm)) { + if (newskb) { + skb = newskb; + } else { + tail = skb; + goto alloc_skb; + } + } else if (newskb) { + /* this is fast path, we don't necessarily need to + * call to kfree_skb even though with newskb == NULL + * this - does no harm + */ + consume_skb(newskb); + newskb = NULL; + } + + if (skb_append_pagefrags(skb, page, offset, size)) { + tail = skb; + goto alloc_skb; + } + + skb->len += size; + skb->data_len += size; + skb->truesize += size; + atomic_add(size, &sk->sk_wmem_alloc); + + if (newskb) { + err = unix_scm_to_skb(&scm, skb, false); + if (err) + goto err_state_unlock; + spin_lock(&other->sk_receive_queue.lock); + __skb_queue_tail(&other->sk_receive_queue, newskb); + spin_unlock(&other->sk_receive_queue.lock); + } + + unix_state_unlock(other); + mutex_unlock(&unix_sk(other)->iolock); + + other->sk_data_ready(other); + scm_destroy(&scm); + return size; + +err_state_unlock: + unix_state_unlock(other); +err_unlock: + mutex_unlock(&unix_sk(other)->iolock); +err: + kfree_skb(newskb); + if (send_sigpipe && !(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + if (!init_scm) + scm_destroy(&scm); + return err; +} + +static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { int err; struct sock *sk = sock->sk; @@ -1913,19 +2076,18 @@ if (msg->msg_namelen) msg->msg_namelen = 0; - return unix_dgram_sendmsg(kiocb, sock, msg, len); + return unix_dgram_sendmsg(sock, msg, len); } -static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, - int flags) +static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) { struct sock *sk = sock->sk; if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; - return unix_dgram_recvmsg(iocb, sock, msg, size, flags); + return unix_dgram_recvmsg(sock, msg, size, flags); } static void unix_copy_addr(struct msghdr *msg, struct sock *sk) @@ -1938,12 +2100,10 @@ } } -static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, - int flags) +static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) { - struct sock_iocb *siocb = kiocb_to_siocb(iocb); - struct scm_cookie tmp_scm; + struct scm_cookie scm; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int noblock = flags & MSG_DONTWAIT; @@ -1955,7 +2115,7 @@ if (flags&MSG_OOB) goto out; - err = mutex_lock_interruptible(&u->readlock); + err = mutex_lock_interruptible(&u->iolock); if (unlikely(err)) { /* recvmsg() in non blocking mode is supposed to return -EAGAIN * sk_rcvtimeo is not honored by mutex_lock_interruptible() @@ -1988,23 +2148,21 @@ else if (size < skb->len - skip) msg->msg_flags |= MSG_TRUNC; - err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size); + err = skb_copy_datagram_msg(skb, skip, msg, size); if (err) goto out_free; if (sock_flag(sk, SOCK_RCVTSTAMP)) __sock_recv_timestamp(msg, sk, skb); - if (!siocb->scm) { - siocb->scm = &tmp_scm; - memset(&tmp_scm, 0, sizeof(tmp_scm)); - } - scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - unix_set_secdata(siocb->scm, skb); + memset(&scm, 0, sizeof(scm)); + + scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); + unix_set_secdata(&scm, skb); if (!(flags & MSG_PEEK)) { if (UNIXCB(skb).fp) - unix_detach_fds(siocb->scm, skb); + unix_detach_fds(&scm, skb); sk_peek_offset_bwd(sk, skb->len); } else { @@ -2024,16 +2182,16 @@ sk_peek_offset_fwd(sk, size); if (UNIXCB(skb).fp) - siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp); + scm.fp = scm_fp_dup(UNIXCB(skb).fp); } err = (flags & MSG_TRUNC) ? skb->len - skip : size; - scm_recv(sock, msg, siocb->scm, flags); + scm_recv(sock, msg, &scm, flags); out_free: skb_free_datagram(sk, skb); out_unlock: - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); out: return err; } @@ -2042,8 +2200,10 @@ * Sleep until more data has arrived. But check for races.. */ static long unix_stream_data_wait(struct sock *sk, long timeo, - struct sk_buff *last) + struct sk_buff *last, unsigned int last_len, + bool freezable) { + struct sk_buff *tail; DEFINE_WAIT(wait); unix_state_lock(sk); @@ -2051,22 +2211,27 @@ for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (skb_peek_tail(&sk->sk_receive_queue) != last || + tail = skb_peek_tail(&sk->sk_receive_queue); + if (tail != last || + (tail && tail->len != last_len) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); - timeo = schedule_timeout(timeo); + if (freezable) + timeo = freezable_schedule_timeout(timeo); + else + timeo = schedule_timeout(timeo); unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) break; - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); @@ -2074,47 +2239,68 @@ return timeo; } -static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, - int flags) +static unsigned int unix_skb_len(const struct sk_buff *skb) +{ + return skb->len - UNIXCB(skb).consumed; +} + +struct unix_stream_read_state { + int (*recv_actor)(struct sk_buff *, int, int, + struct unix_stream_read_state *); + struct socket *socket; + struct msghdr *msg; + struct pipe_inode_info *pipe; + size_t size; + int flags; + unsigned int splice_flags; +}; + +static int unix_stream_read_generic(struct unix_stream_read_state *state, + bool freezable) { - struct sock_iocb *siocb = kiocb_to_siocb(iocb); - struct scm_cookie tmp_scm; + struct scm_cookie scm; + struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); - struct sockaddr_un *sunaddr = msg->msg_name; int copied = 0; + int flags = state->flags; int noblock = flags & MSG_DONTWAIT; - int check_creds = 0; + bool check_creds = false; int target; int err = 0; long timeo; int skip; + size_t size = state->size; + unsigned int last_len; - err = -EINVAL; - if (sk->sk_state != TCP_ESTABLISHED) + if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { + err = -EINVAL; goto out; + } - err = -EOPNOTSUPP; - if (flags&MSG_OOB) + if (unlikely(flags & MSG_OOB)) { + err = -EOPNOTSUPP; goto out; + } - target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); + memset(&scm, 0, sizeof(scm)); + /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ + mutex_lock(&u->iolock); - if (!siocb->scm) { - siocb->scm = &tmp_scm; - memset(&tmp_scm, 0, sizeof(tmp_scm)); - } - - mutex_lock(&u->readlock); + if (flags & MSG_PEEK) + skip = sk_peek_offset(sk, flags); + else + skip = 0; do { int chunk; + bool drop_skb; struct sk_buff *skb, *last; unix_state_lock(sk); @@ -2123,6 +2309,7 @@ goto unlock; } last = skb = skb_peek(&sk->sk_receive_queue); + last_len = last ? last->len : 0; again: if (skb == NULL) { unix_sk(sk)->recursion_level = 0; @@ -2140,29 +2327,33 @@ goto unlock; unix_state_unlock(sk); - err = -EAGAIN; - if (!timeo) + if (!timeo) { + err = -EAGAIN; break; - mutex_unlock(&u->readlock); + } - timeo = unix_stream_data_wait(sk, timeo, last); + mutex_unlock(&u->iolock); + + timeo = unix_stream_data_wait(sk, timeo, last, + last_len, freezable); if (signal_pending(current)) { err = sock_intr_errno(timeo); + scm_destroy(&scm); goto out; } - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); continue; - unlock: +unlock: unix_state_unlock(sk); break; } - skip = sk_peek_offset(sk, flags); - while (skip >= skb->len) { - skip -= skb->len; + while (skip >= unix_skb_len(skb)) { + skip -= unix_skb_len(skb); last = skb; + last_len = skb->len; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; @@ -2172,24 +2363,30 @@ if (check_creds) { /* Never glue messages from different writers */ - if ((UNIXCB(skb).pid != siocb->scm->pid) || - !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) || - !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid)) + if (!unix_skb_scm_eq(skb, &scm)) break; } else if (test_bit(SOCK_PASSCRED, &sock->flags)) { /* Copy credentials */ - scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - check_creds = 1; + scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); + unix_set_secdata(&scm, skb); + check_creds = true; } /* Copy address just once */ - if (sunaddr) { - unix_copy_addr(msg, skb->sk); + if (state->msg && state->msg->msg_name) { + DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, + state->msg->msg_name); + unix_copy_addr(state->msg, skb->sk); sunaddr = NULL; } - chunk = min_t(unsigned int, skb->len - skip, size); - if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) { + chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); + skb_get(skb); + chunk = state->recv_actor(skb, skip, chunk, state); + drop_skb = !unix_skb_len(skb); + /* skb is only safe to use if !drop_skb */ + consume_skb(skb); + if (chunk < 0) { if (copied == 0) copied = -EFAULT; break; @@ -2197,38 +2394,49 @@ copied += chunk; size -= chunk; + if (drop_skb) { + /* the skb was touched by a concurrent reader; + * we should not expect anything from this skb + * anymore and assume it invalid - we can be + * sure it was dropped from the socket queue + * + * let's report a short read + */ + err = 0; + break; + } + /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { - skb_pull(skb, chunk); + UNIXCB(skb).consumed += chunk; sk_peek_offset_bwd(sk, chunk); if (UNIXCB(skb).fp) - unix_detach_fds(siocb->scm, skb); + unix_detach_fds(&scm, skb); - if (skb->len) + if (unix_skb_len(skb)) break; skb_unlink(skb, &sk->sk_receive_queue); consume_skb(skb); - if (siocb->scm->fp) + if (scm.fp) break; } else { /* It is questionable, see note in unix_dgram_recvmsg. */ if (UNIXCB(skb).fp) - siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp); + scm.fp = scm_fp_dup(UNIXCB(skb).fp); - if (skip) { - sk_peek_offset_fwd(sk, chunk); - skip -= chunk; - } + sk_peek_offset_fwd(sk, chunk); if (UNIXCB(skb).fp) break; + skip = 0; last = skb; + last_len = skb->len; unix_state_lock(sk); skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb) @@ -2238,12 +2446,86 @@ } } while (size); - mutex_unlock(&u->readlock); - scm_recv(sock, msg, siocb->scm, flags); + mutex_unlock(&u->iolock); + if (state->msg) + scm_recv(sock, state->msg, &scm, flags); + else + scm_destroy(&scm); out: return copied ? : err; } +static int unix_stream_read_actor(struct sk_buff *skb, + int skip, int chunk, + struct unix_stream_read_state *state) +{ + int ret; + + ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, + state->msg, chunk); + return ret ?: chunk; +} + +static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_read_actor, + .socket = sock, + .msg = msg, + .size = size, + .flags = flags + }; + + return unix_stream_read_generic(&state, true); +} + +static ssize_t skb_unix_socket_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + struct unix_sock *u = unix_sk(sk); + + mutex_unlock(&u->iolock); + ret = splice_to_pipe(pipe, spd); + mutex_lock(&u->iolock); + + return ret; +} + +static int unix_stream_splice_actor(struct sk_buff *skb, + int skip, int chunk, + struct unix_stream_read_state *state) +{ + return skb_splice_bits(skb, state->socket->sk, + UNIXCB(skb).consumed + skip, + state->pipe, chunk, state->splice_flags, + skb_unix_socket_splice); +} + +static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t size, unsigned int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_splice_actor, + .socket = sock, + .pipe = pipe, + .size = size, + .splice_flags = flags, + }; + + if (unlikely(*ppos)) + return -ESPIPE; + + if (sock->file->f_flags & O_NONBLOCK || + flags & SPLICE_F_NONBLOCK) + state.flags = MSG_DONTWAIT; + + return unix_stream_read_generic(&state, false); +} + static int unix_shutdown(struct socket *sock, int mode) { struct sock *sk = sock->sk; @@ -2302,7 +2584,7 @@ if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { skb_queue_walk(&sk->sk_receive_queue, skb) - amount += skb->len; + amount += unix_skb_len(skb); } else { skb = skb_peek(&sk->sk_receive_queue); if (skb) @@ -2432,7 +2714,7 @@ if (writable) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } @@ -2627,8 +2909,7 @@ rc = proto_register(&unix_proto, 1); if (rc != 0) { - printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n", - __func__); + pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); goto out; }