--- zzzz-none-000/linux-3.10.107/net/rds/ib_send.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/net/rds/ib_send.c 2021-02-04 17:41:59.000000000 +0000 @@ -39,40 +39,6 @@ #include "rds.h" #include "ib.h" -static char *rds_ib_wc_status_strings[] = { -#define RDS_IB_WC_STATUS_STR(foo) \ - [IB_WC_##foo] = __stringify(IB_WC_##foo) - RDS_IB_WC_STATUS_STR(SUCCESS), - RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), - RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), - RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), - RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), - RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), - RDS_IB_WC_STATUS_STR(MW_BIND_ERR), - RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), - RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), - RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), - RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), - RDS_IB_WC_STATUS_STR(REM_OP_ERR), - RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), - RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), - RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), - RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), - RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), - RDS_IB_WC_STATUS_STR(INV_EECN_ERR), - RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), - RDS_IB_WC_STATUS_STR(FATAL_ERR), - RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), - RDS_IB_WC_STATUS_STR(GENERAL_ERR), -#undef RDS_IB_WC_STATUS_STR -}; - -char *rds_ib_wc_status_str(enum ib_wc_status status) -{ - return rds_str_array(rds_ib_wc_status_strings, - ARRAY_SIZE(rds_ib_wc_status_strings), status); -} - /* * Convert IB-specific error message to RDS error message and call core * completion handler. @@ -229,16 +195,16 @@ send->s_op = NULL; - send->s_wr.wr_id = i; + send->s_wr.wr_id = i | RDS_IB_SEND_OP; send->s_wr.sg_list = send->s_sge; send->s_wr.ex.imm_data = 0; sge = &send->s_sge[0]; sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; + sge->lkey = ic->i_pd->local_dma_lkey; - send->s_sge[1].lkey = ic->i_mr->lkey; + send->s_sge[1].lkey = ic->i_pd->local_dma_lkey; } } @@ -271,81 +237,73 @@ * unallocs the next free entry in the ring it doesn't alter which is * the next to be freed, which is what this is concerned with. */ -void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) +void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) { - struct rds_connection *conn = context; - struct rds_ib_connection *ic = conn->c_transport_data; struct rds_message *rm = NULL; - struct ib_wc wc; + struct rds_connection *conn = ic->conn; struct rds_ib_send_work *send; u32 completed; u32 oldest; u32 i = 0; - int ret; int nr_sig = 0; - rdsdebug("cq %p conn %p\n", cq, conn); - rds_ib_stats_inc(s_ib_tx_cq_call); - ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (ret) - rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - - while (ib_poll_cq(cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, - rds_ib_wc_status_str(wc.status), wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_ib_stats_inc(s_ib_tx_cq_event); - - if (wc.wr_id == RDS_IB_ACK_WR_ID) { - if (ic->i_ack_queued + HZ/2 < jiffies) - rds_ib_stats_inc(s_ib_tx_stalled); - rds_ib_ack_send_complete(ic); - continue; - } - - oldest = rds_ib_ring_oldest(&ic->i_send_ring); - - completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); - - for (i = 0; i < completed; i++) { - send = &ic->i_sends[oldest]; - if (send->s_wr.send_flags & IB_SEND_SIGNALED) - nr_sig++; - - rm = rds_ib_send_unmap_op(ic, send, wc.status); - - if (send->s_queued + HZ/2 < jiffies) - rds_ib_stats_inc(s_ib_tx_stalled); - - if (send->s_op) { - if (send->s_op == rm->m_final_op) { - /* If anyone waited for this message to get flushed out, wake - * them up now */ - rds_message_unmapped(rm); - } - rds_message_put(rm); - send->s_op = NULL; - } - oldest = (oldest + 1) % ic->i_send_ring.w_nr; - } + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + ib_wc_status_msg(wc->status), wc->byte_len, + be32_to_cpu(wc->ex.imm_data)); + rds_ib_stats_inc(s_ib_tx_cq_event); + + if (wc->wr_id == RDS_IB_ACK_WR_ID) { + if (time_after(jiffies, ic->i_ack_queued + HZ / 2)) + rds_ib_stats_inc(s_ib_tx_stalled); + rds_ib_ack_send_complete(ic); + return; + } - rds_ib_ring_free(&ic->i_send_ring, completed); - rds_ib_sub_signaled(ic, nr_sig); - nr_sig = 0; + oldest = rds_ib_ring_oldest(&ic->i_send_ring); + + completed = rds_ib_ring_completed(&ic->i_send_ring, + (wc->wr_id & ~RDS_IB_SEND_OP), + oldest); + + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; + + rm = rds_ib_send_unmap_op(ic, send, wc->status); + + if (time_after(jiffies, send->s_queued + HZ / 2)) + rds_ib_stats_inc(s_ib_tx_stalled); - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || - test_bit(0, &conn->c_map_queued)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); - - /* We expect errors as the qp is drained during shutdown */ - if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on %pI4 had status " - "%u (%s), disconnecting and reconnecting\n", - &conn->c_faddr, wc.status, - rds_ib_wc_status_str(wc.status)); + if (send->s_op) { + if (send->s_op == rm->m_final_op) { + /* If anyone waited for this message to get + * flushed out, wake them up now + */ + rds_message_unmapped(rm); + } + rds_message_put(rm); + send->s_op = NULL; } + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_ib_ring_free(&ic->i_send_ring, completed); + rds_ib_sub_signaled(ic, nr_sig); + nr_sig = 0; + + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + /* We expect errors as the qp is drained during shutdown */ + if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", + &conn->c_faddr, wc->status, + ib_wc_status_msg(wc->status)); } } @@ -409,7 +367,7 @@ posted = IB_GET_POST_CREDITS(oldval); avail = IB_GET_SEND_CREDITS(oldval); - rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n", + rdsdebug("wanted=%u credits=%u posted=%u\n", wanted, avail, posted); /* The last credit must be used to send a credit update. */ @@ -453,7 +411,7 @@ if (credits == 0) return; - rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n", + rdsdebug("credits=%u current=%u%s\n", credits, IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : ""); @@ -605,6 +563,8 @@ } rds_message_addref(rm); + rm->data.op_dmasg = 0; + rm->data.op_dmaoff = 0; ic->i_data_op = &rm->data; /* Finalize the header */ @@ -658,7 +618,7 @@ send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &ic->i_data_op->op_sg[sg]; + scat = &ic->i_data_op->op_sg[rm->data.op_dmasg]; i = 0; do { unsigned int len = 0; @@ -680,17 +640,20 @@ /* Set up the data, if present */ if (i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]) { - len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + len = min(RDS_FRAG_SIZE, + ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff); send->s_wr.num_sge = 2; - send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; + send->s_sge[1].addr = ib_sg_dma_address(dev, scat); + send->s_sge[1].addr += rm->data.op_dmaoff; send->s_sge[1].length = len; bytes_sent += len; - off += len; - if (off == ib_sg_dma_len(dev, scat)) { + rm->data.op_dmaoff += len; + if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) { scat++; - off = 0; + rm->data.op_dmasg++; + rm->data.op_dmaoff = 0; } } @@ -738,6 +701,11 @@ if (scat == &rm->data.op_sg[rm->data.op_count]) { prev->s_op = ic->i_data_op; prev->s_wr.send_flags |= IB_SEND_SOLICITED; + if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED)) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + prev->s_wr.send_flags |= IB_SEND_SIGNALED; + nr_sig++; + } ic->i_data_op = NULL; } @@ -809,23 +777,23 @@ send->s_queued = jiffies; if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { - send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; - send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare; - send->s_wr.wr.atomic.swap = op->op_m_cswp.swap; - send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask; - send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask; + send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; + send->s_atomic_wr.compare_add = op->op_m_cswp.compare; + send->s_atomic_wr.swap = op->op_m_cswp.swap; + send->s_atomic_wr.compare_add_mask = op->op_m_cswp.compare_mask; + send->s_atomic_wr.swap_mask = op->op_m_cswp.swap_mask; } else { /* FADD */ - send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; - send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add; - send->s_wr.wr.atomic.swap = 0; - send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask; - send->s_wr.wr.atomic.swap_mask = 0; + send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; + send->s_atomic_wr.compare_add = op->op_m_fadd.add; + send->s_atomic_wr.swap = 0; + send->s_atomic_wr.compare_add_mask = op->op_m_fadd.nocarry_mask; + send->s_atomic_wr.swap_mask = 0; } nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); - send->s_wr.num_sge = 1; - send->s_wr.next = NULL; - send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; - send->s_wr.wr.atomic.rkey = op->op_rkey; + send->s_atomic_wr.wr.num_sge = 1; + send->s_atomic_wr.wr.next = NULL; + send->s_atomic_wr.remote_addr = op->op_remote_addr; + send->s_atomic_wr.rkey = op->op_rkey; send->s_op = op; rds_message_addref(container_of(send->s_op, struct rds_message, atomic)); @@ -842,7 +810,7 @@ /* Convert our struct scatterlist to struct ib_sge */ send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); - send->s_sge[0].lkey = ic->i_mr->lkey; + send->s_sge[0].lkey = ic->i_pd->local_dma_lkey; rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, send->s_sge[0].addr, send->s_sge[0].length); @@ -850,11 +818,11 @@ if (nr_sig) atomic_add(nr_sig, &ic->i_signaled_sends); - failed_wr = &send->s_wr; - ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr); + failed_wr = &send->s_atomic_wr.wr; + ret = ib_post_send(ic->i_cm_id->qp, &send->s_atomic_wr.wr, &failed_wr); rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, - send, &send->s_wr, ret, failed_wr); - BUG_ON(failed_wr != &send->s_wr); + send, &send->s_atomic_wr, ret, failed_wr); + BUG_ON(failed_wr != &send->s_atomic_wr.wr); if (ret) { printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); @@ -863,9 +831,9 @@ goto out; } - if (unlikely(failed_wr != &send->s_wr)) { + if (unlikely(failed_wr != &send->s_atomic_wr.wr)) { printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret); - BUG_ON(failed_wr != &send->s_wr); + BUG_ON(failed_wr != &send->s_atomic_wr.wr); } out: @@ -936,27 +904,28 @@ nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; - send->s_wr.wr.rdma.remote_addr = remote_addr; - send->s_wr.wr.rdma.rkey = op->op_rkey; + send->s_rdma_wr.remote_addr = remote_addr; + send->s_rdma_wr.rkey = op->op_rkey; if (num_sge > max_sge) { - send->s_wr.num_sge = max_sge; + send->s_rdma_wr.wr.num_sge = max_sge; num_sge -= max_sge; } else { - send->s_wr.num_sge = num_sge; + send->s_rdma_wr.wr.num_sge = num_sge; } - send->s_wr.next = NULL; + send->s_rdma_wr.wr.next = NULL; if (prev) - prev->s_wr.next = &send->s_wr; + prev->s_rdma_wr.wr.next = &send->s_rdma_wr.wr; - for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { + for (j = 0; j < send->s_rdma_wr.wr.num_sge && + scat != &op->op_sg[op->op_count]; j++) { len = ib_sg_dma_len(ic->i_cm_id->device, scat); send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); send->s_sge[j].length = len; - send->s_sge[j].lkey = ic->i_mr->lkey; + send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; sent += len; rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); @@ -966,7 +935,9 @@ } rdsdebug("send %p wr %p num_sge %u next %p\n", send, - &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + &send->s_rdma_wr.wr, + send->s_rdma_wr.wr.num_sge, + send->s_rdma_wr.wr.next); prev = send; if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) @@ -987,11 +958,11 @@ if (nr_sig) atomic_add(nr_sig, &ic->i_signaled_sends); - failed_wr = &first->s_wr; - ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + failed_wr = &first->s_rdma_wr.wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr); rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, - first, &first->s_wr, ret, failed_wr); - BUG_ON(failed_wr != &first->s_wr); + first, &first->s_rdma_wr.wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_rdma_wr.wr); if (ret) { printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); @@ -1000,9 +971,9 @@ goto out; } - if (unlikely(failed_wr != &first->s_wr)) { + if (unlikely(failed_wr != &first->s_rdma_wr.wr)) { printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret); - BUG_ON(failed_wr != &first->s_wr); + BUG_ON(failed_wr != &first->s_rdma_wr.wr); }