xprtrdma: Fix occasional transport deadlock
Under high I/O workloads, I've noticed that an RPC/RDMA transport
occasionally deadlocks (IOPS goes to zero, and doesn't recover).
Diagnosis shows that the sendctx queue is empty, but when sendctxs
are returned to the queue, the xprt_write_space wake-up never
occurs. The wake-up logic in rpcrdma_sendctx_put_locked is racy.
I noticed that both EMPTY_SCQ and XPRT_WRITE_SPACE are implemented
via an atomic bit. Just one of those is sufficient. Removing
EMPTY_SCQ in favor of the generic bit mechanism makes the deadlock
un-reproducible.
Without EMPTY_SCQ, rpcrdma_buffer::rb_flags is no longer used and
is therefore removed.
Unfortunately this patch does not apply cleanly to stable. If
needed, someone will have to port it and test it.
Fixes: 2fad659209 ("xprtrdma: Wait on empty sendctx queue")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
This commit is contained in:

committed by
Anna Schumaker

parent
1310051c72
commit
05eb06d866
@@ -539,6 +539,33 @@ TRACE_EVENT(xprtrdma_marshal_failed,
|
||||
)
|
||||
);
|
||||
|
||||
/*
 * xprtrdma_prepsend_failed - trace event that takes an rpc_rqst and an
 * integer result code.
 *
 * @rqst: request from which the task PID (rq_task->tk_pid), the client ID
 *        (rq_task->tk_client->cl_clid) and the XID (rq_xid) are recorded
 * @ret:  integer code stored unchanged; presumably the error returned by
 *        the failed send-preparation step -- confirm against the caller
 *
 * The XID is passed through be32_to_cpu() before it is stored, so the
 * value printed is in host byte order. Each record is formatted as
 * "task:<task_id>@<client_id> xid=0x<xid>: ret=<ret>".
 *
 * NOTE(review): the "|" and "||||" lines interleaved below look like
 * residue from the page's diff table rather than C source -- confirm
 * against the real header before reusing this text.
 */
TRACE_EVENT(xprtrdma_prepsend_failed,
|
||||
TP_PROTO(const struct rpc_rqst *rqst,
|
||||
int ret
|
||||
),
|
||||
|
||||
TP_ARGS(rqst, ret),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(unsigned int, task_id)
|
||||
__field(unsigned int, client_id)
|
||||
__field(u32, xid)
|
||||
__field(int, ret)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->task_id = rqst->rq_task->tk_pid;
|
||||
__entry->client_id = rqst->rq_task->tk_client->cl_clid;
|
||||
__entry->xid = be32_to_cpu(rqst->rq_xid);
|
||||
__entry->ret = ret;
|
||||
),
|
||||
|
||||
TP_printk("task:%u@%u xid=0x%08x: ret=%d",
|
||||
__entry->task_id, __entry->client_id, __entry->xid,
|
||||
__entry->ret
|
||||
)
|
||||
);
|
||||
|
||||
TRACE_EVENT(xprtrdma_post_send,
|
||||
TP_PROTO(
|
||||
const struct rpcrdma_req *req,
|
||||
|
Reference in New Issue
Block a user