--- zzzz-none-000/linux-3.10.107/net/9p/trans_rdma.c	2017-06-27 09:49:32.000000000 +0000
+++ scorpion-7490-727/linux-3.10.107/net/9p/trans_rdma.c	2021-02-04 17:41:59.000000000 +0000
@@ -57,9 +57,7 @@
 #define P9_RDMA_IRD		0
 #define P9_RDMA_ORD		0
 #define P9_RDMA_TIMEOUT		30000		/* 30 seconds */
-#define P9_RDMA_MAXSIZE		(4*4096)	/* Min SGE is 4, so we can
-						 * safely advertise a maxsize
-						 * of 64k */
+#define P9_RDMA_MAXSIZE		(1024*1024)	/* 1MB */
 
 /**
  * struct p9_trans_rdma - RDMA transport instance
@@ -75,7 +73,9 @@
  * @sq_depth: The depth of the Send Queue
  * @sq_sem: Semaphore for the SQ
  * @rq_depth: The depth of the Receive Queue.
- * @rq_count: Count of requests in the Receive Queue.
+ * @rq_sem: Semaphore for the RQ
+ * @excess_rc : Amount of posted Receive Contexts without a pending request.
+ *		See rdma_request()
  * @addr: The remote peer's address
  * @req_lock: Protects the active request list
  * @cm_done: Completion event for connection management tracking
@@ -94,13 +94,12 @@
 	struct ib_pd *pd;
 	struct ib_qp *qp;
 	struct ib_cq *cq;
-	struct ib_mr *dma_mr;
-	u32 lkey;
 	long timeout;
 	int sq_depth;
 	struct semaphore sq_sem;
 	int rq_depth;
-	atomic_t rq_count;
+	struct semaphore rq_sem;
+	atomic_t excess_rc;
 	struct sockaddr_in addr;
 	spinlock_t req_lock;
 
@@ -138,6 +137,7 @@
 	int sq_depth;
 	int rq_depth;
 	long timeout;
+	int privport;
 };
 
 /*
@@ -145,7 +145,10 @@
  */
 enum {
 	/* Options that take integer arguments */
-	Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, Opt_err,
+	Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout,
+	/* Options that take no argument */
+	Opt_privport,
+	Opt_err,
 };
 
 static match_table_t tokens = {
@@ -153,6 +156,7 @@
 	{Opt_sq_depth, "sq=%u"},
 	{Opt_rq_depth, "rq=%u"},
 	{Opt_timeout, "timeout=%u"},
+	{Opt_privport, "privport"},
 	{Opt_err, NULL},
 };
 
@@ -174,6 +178,7 @@
 	opts->sq_depth = P9_RDMA_SQ_DEPTH;
 	opts->rq_depth = P9_RDMA_RQ_DEPTH;
 	opts->timeout = P9_RDMA_TIMEOUT;
+	opts->privport = 0;
 
 	if (!params)
 		return 0;
@@ -192,11 +197,13 @@
 		if (!*p)
 			continue;
 		token = match_token(p, tokens, args);
-		r = match_int(&args[0], &option);
-		if (r < 0) {
-			p9_debug(P9_DEBUG_ERROR,
-				 "integer field, but no integer?\n");
-			continue;
+		if ((token != Opt_err) && (token != Opt_privport)) {
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
+				continue;
+			}
 		}
 		switch (token) {
 		case Opt_port:
@@ -211,6 +218,9 @@
 		case Opt_timeout:
 			opts->timeout = option;
 			break;
+		case Opt_privport:
+			opts->privport = 1;
+			break;
 		default:
 			continue;
 		}
@@ -296,9 +306,15 @@
 	if (!req)
 		goto err_out;
 
+	/* Check that we have not yet received a reply for this request.
+	 */
+	if (unlikely(req->rc)) {
+		pr_err("Duplicate reply for request %d", tag);
+		goto err_out;
+	}
+
 	req->rc = c->rc;
-	req->status = REQ_STATUS_RCVD;
-	p9_client_cb(client, req);
+	p9_client_cb(client, req, REQ_STATUS_RCVD);
 
 	return;
 
@@ -336,8 +352,8 @@
 
 		switch (c->wc_op) {
 		case IB_WC_RECV:
-			atomic_dec(&rdma->rq_count);
 			handle_recv(client, rdma, c, wc.status, wc.byte_len);
+			up(&rdma->rq_sem);
 			break;
 
 		case IB_WC_SEND:
@@ -364,9 +380,6 @@
 	if (!rdma)
 		return;
 
-	if (rdma->dma_mr && !IS_ERR(rdma->dma_mr))
-		ib_dereg_mr(rdma->dma_mr);
-
 	if (rdma->qp && !IS_ERR(rdma->qp))
 		ib_destroy_qp(rdma->qp);
 
@@ -397,7 +410,7 @@
 
 	sge.addr = c->busa;
 	sge.length = client->msize;
-	sge.lkey = rdma->lkey;
+	sge.lkey = rdma->pd->local_dma_lkey;
 
 	wr.next = NULL;
 	c->wc_op = IB_WC_RECV;
@@ -421,32 +434,33 @@
 	struct p9_rdma_context *c = NULL;
 	struct p9_rdma_context *rpl_context = NULL;
 
+	/* When an error occurs between posting the recv and the send,
+	 * there will be a receive context posted without a pending request.
+	 * Since there is no way to "un-post" it, we remember it and skip
+	 * post_recv() for the next request.
+	 * So here,
+	 * see if we are this `next request' and need to absorb an excess rc.
+	 * If yes, then drop and free our own, and do not recv_post().
+	 **/
+	if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
+		if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
+			/* Got one ! */
+			kfree(req->rc);
+			req->rc = NULL;
+			goto dont_need_post_recv;
+		} else {
+			/* We raced and lost. */
+			atomic_inc(&rdma->excess_rc);
+		}
+	}
+
 	/* Allocate an fcall for the reply */
 	rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS);
 	if (!rpl_context) {
 		err = -ENOMEM;
-		goto err_close;
-	}
-
-	/*
-	 * If the request has a buffer, steal it, otherwise
-	 * allocate a new one.  Typically, requests should already
-	 * have receive buffers allocated and just swap them around
-	 */
-	if (!req->rc) {
-		req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize,
-				  GFP_NOFS);
-		if (req->rc) {
-			req->rc->sdata = (char *) req->rc +
-						sizeof(struct p9_fcall);
-			req->rc->capacity = client->msize;
-		}
+		goto recv_error;
 	}
 	rpl_context->rc = req->rc;
-	if (!rpl_context->rc) {
-		err = -ENOMEM;
-		goto err_free2;
-	}
 
 	/*
 	 * Post a receive buffer for this request. We need to ensure
@@ -455,33 +469,39 @@
 	 * outstanding request, so we must keep a count to avoid
 	 * overflowing the RQ.
 	 */
-	if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) {
-		err = post_recv(client, rpl_context);
-		if (err)
-			goto err_free1;
-	} else
-		atomic_dec(&rdma->rq_count);
+	if (down_interruptible(&rdma->rq_sem)) {
+		err = -EINTR;
+		goto recv_error;
+	}
 
+	err = post_recv(client, rpl_context);
+	if (err) {
+		p9_debug(P9_DEBUG_FCALL, "POST RECV failed\n");
+		goto recv_error;
+	}
 	/* remove posted receive buffer from request structure */
 	req->rc = NULL;
 
+dont_need_post_recv:
 	/* Post the request */
 	c = kmalloc(sizeof *c, GFP_NOFS);
 	if (!c) {
 		err = -ENOMEM;
-		goto err_free1;
+		goto send_error;
 	}
 	c->req = req;
 
 	c->busa = ib_dma_map_single(rdma->cm_id->device,
 				    c->req->tc->sdata, c->req->tc->size,
 				    DMA_TO_DEVICE);
-	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
-		goto error;
+	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) {
+		err = -EIO;
+		goto send_error;
+	}
 
 	sge.addr = c->busa;
 	sge.length = c->req->tc->size;
-	sge.lkey = rdma->lkey;
+	sge.lkey = rdma->pd->local_dma_lkey;
 
 	wr.next = NULL;
 	c->wc_op = IB_WC_SEND;
@@ -491,22 +511,38 @@
 	wr.sg_list = &sge;
 	wr.num_sge = 1;
 
-	if (down_interruptible(&rdma->sq_sem))
-		goto error;
+	if (down_interruptible(&rdma->sq_sem)) {
+		err = -EINTR;
+		goto send_error;
+	}
 
-	return ib_post_send(rdma->qp, &wr, &bad_wr);
+	/* Mark request as `sent' *before* we actually send it,
+	 * because doing if after could erase the REQ_STATUS_RCVD
+	 * status in case of a very fast reply.
+	 */
+	req->status = REQ_STATUS_SENT;
+	err = ib_post_send(rdma->qp, &wr, &bad_wr);
+	if (err)
+		goto send_error;
 
- error:
+	/* Success */
+	return 0;
+
+ /* Handle errors that happened during or while preparing the send: */
+ send_error:
+	req->status = REQ_STATUS_ERROR;
 	kfree(c);
-	kfree(rpl_context->rc);
-	kfree(rpl_context);
-	p9_debug(P9_DEBUG_ERROR, "EIO\n");
-	return -EIO;
- err_free1:
-	kfree(rpl_context->rc);
- err_free2:
+	p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err);
+
+	/* Ach.
+	 *  We did recv_post(), but not send. We have one recv_post in excess.
+	 */
+	atomic_inc(&rdma->excess_rc);
+	return err;
+
+ /* Handle errors that happened during or while preparing post_recv(): */
+ recv_error:
 	kfree(rpl_context);
- err_close:
 	spin_lock_irqsave(&rdma->req_lock, flags);
 	if (rdma->state < P9_RDMA_CLOSING) {
 		rdma->state = P9_RDMA_CLOSING;
@@ -551,17 +587,47 @@
 	spin_lock_init(&rdma->req_lock);
 	init_completion(&rdma->cm_done);
 	sema_init(&rdma->sq_sem, rdma->sq_depth);
-	atomic_set(&rdma->rq_count, 0);
+	sema_init(&rdma->rq_sem, rdma->rq_depth);
+	atomic_set(&rdma->excess_rc, 0);
 
 	return rdma;
 }
 
-/* its not clear to me we can do anything after send has been posted */
 static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
 {
+	/* Nothing to do here.
+	 * We will take care of it (if we have to) in rdma_cancelled()
+	 */
 	return 1;
 }
 
+/* A request has been fully flushed without a reply.
+ * That means we have posted one buffer in excess.
+ */
+static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
+{
+	struct p9_trans_rdma *rdma = client->trans;
+	atomic_inc(&rdma->excess_rc);
+	return 0;
+}
+
+static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
+{
+	struct sockaddr_in cl = {
+		.sin_family = AF_INET,
+		.sin_addr.s_addr = htonl(INADDR_ANY),
+	};
+	int port, err = -EINVAL;
+
+	for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) {
+		cl.sin_port = htons((ushort)port);
+		err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl);
+		if (err != -EADDRINUSE)
+			break;
+	}
+	return err;
+}
+
 /**
  * trans_create_rdma - Transport method for creating atransport instance
  * @client: client instance
@@ -576,7 +642,7 @@
 	struct p9_trans_rdma *rdma;
 	struct rdma_conn_param conn_param;
 	struct ib_qp_init_attr qp_attr;
-	struct ib_device_attr devattr;
+	struct ib_cq_init_attr cq_attr = {};
 
 	/* Parse the transport specific mount options */
 	err = parse_opts(args, &opts);
@@ -589,14 +655,24 @@
 		return -ENOMEM;
 
 	/* Create the RDMA CM ID */
-	rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP,
-				     IB_QPT_RC);
+	rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client,
+				     RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(rdma->cm_id))
 		goto error;
 
 	/* Associate the client with the transport */
 	client->trans = rdma;
 
+	/* Bind to a privileged port if we need to */
+	if (opts.privport) {
+		err = p9_rdma_bind_privport(rdma);
+		if (err < 0) {
+			pr_err("%s (%d): problem binding to privport: %d\n",
+			       __func__, task_pid_nr(current), -err);
+			goto error;
+		}
+	}
+
 	/* Resolve the server's address */
 	rdma->addr.sin_family = AF_INET;
 	rdma->addr.sin_addr.s_addr = in_aton(addr);
@@ -618,15 +694,11 @@
 	if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
 		goto error;
 
-	/* Query the device attributes */
-	err = ib_query_device(rdma->cm_id->device, &devattr);
-	if (err)
-		goto error;
-
 	/* Create the Completion Queue */
+	cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1;
 	rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler,
 				cq_event_handler, client,
-				opts.sq_depth + opts.rq_depth + 1, 0);
+				&cq_attr);
 	if (IS_ERR(rdma->cq))
 		goto error;
 	ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);
@@ -636,17 +708,6 @@
 	if (IS_ERR(rdma->pd))
 		goto error;
 
-	/* Cache the DMA lkey in the transport */
-	rdma->dma_mr = NULL;
-	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
-		rdma->lkey = rdma->cm_id->device->local_dma_lkey;
-	else {
-		rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE);
-		if (IS_ERR(rdma->dma_mr))
-			goto error;
-		rdma->lkey = rdma->dma_mr->lkey;
-	}
-
 	/* Create the Queue Pair */
 	memset(&qp_attr, 0, sizeof qp_attr);
 	qp_attr.event_handler = qp_event_handler;
@@ -695,6 +756,7 @@
 	.close = rdma_close,
 	.request = rdma_request,
 	.cancel = rdma_cancel,
+	.cancelled = rdma_cancelled,
 };
 
 /**