IB/ipath: Fix many locking issues when switching to error state

The send DMA hardware queue invalidated a number of prior assumptions about
when a send is complete, which led to completions being generated out of
order.  There were also a number of locking issues when switching the QP
to the error or reset states.  This patch fixes both problems and also
implements the IB_QPS_SQD state.

Signed-off-by: Ralph Campbell <ralph.campbell@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Этот коммит содержится в:
Ralph Campbell
2008-05-13 11:41:29 -07:00
коммит произвёл Roland Dreier
родитель 53dc1ca194
Коммит e509be898d
8 изменённых файлов: 554 добавлений и 304 удалений

Просмотреть файл

@@ -78,6 +78,7 @@ const u32 ib_ipath_rnr_table[32] = {
* ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
* @qp: the QP
*
* Called with the QP s_lock held and interrupts disabled.
* XXX Use a simple list for now. We might need a priority
* queue if we have lots of QPs waiting for RNR timeouts
* but that should be rare.
@@ -85,9 +86,9 @@ const u32 ib_ipath_rnr_table[32] = {
void ipath_insert_rnr_queue(struct ipath_qp *qp)
{
struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
unsigned long flags;
spin_lock_irqsave(&dev->pending_lock, flags);
/* We already did a spin_lock_irqsave(), so just use spin_lock */
spin_lock(&dev->pending_lock);
if (list_empty(&dev->rnrwait))
list_add(&qp->timerwait, &dev->rnrwait);
else {
@@ -109,7 +110,7 @@ void ipath_insert_rnr_queue(struct ipath_qp *qp)
nqp->s_rnr_timeout -= qp->s_rnr_timeout;
list_add(&qp->timerwait, l);
}
spin_unlock_irqrestore(&dev->pending_lock, flags);
spin_unlock(&dev->pending_lock);
}
/**
@@ -185,6 +186,11 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
}
spin_lock_irqsave(&rq->lock, flags);
if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
ret = 0;
goto unlock;
}
wq = rq->wq;
tail = wq->tail;
/* Validate tail before using it since it is user writable. */
@@ -192,9 +198,8 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
tail = 0;
do {
if (unlikely(tail == wq->head)) {
spin_unlock_irqrestore(&rq->lock, flags);
ret = 0;
goto bail;
goto unlock;
}
/* Make sure entry is read after head index is read. */
smp_rmb();
@@ -207,7 +212,7 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
wq->tail = tail;
ret = 1;
qp->r_wrid_valid = 1;
set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);
if (handler) {
u32 n;
@@ -234,8 +239,8 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
goto bail;
}
}
unlock:
spin_unlock_irqrestore(&rq->lock, flags);
bail:
return ret;
}
@@ -263,35 +268,59 @@ static void ipath_ruc_loopback(struct ipath_qp *sqp)
atomic64_t *maddr;
enum ib_wc_status send_status;
/*
* Note that we check the responder QP state after
* checking the requester's state.
*/
qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
if (!qp) {
dev->n_pkt_drops++;
return;
}
again:
spin_lock_irqsave(&sqp->s_lock, flags);
if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_SEND_OK) ||
sqp->s_rnr_timeout) {
spin_unlock_irqrestore(&sqp->s_lock, flags);
goto done;
}
/* Return if we are already busy processing a work request. */
if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
goto unlock;
/* Get the next send request. */
if (sqp->s_last == sqp->s_head) {
/* Send work queue is empty. */
spin_unlock_irqrestore(&sqp->s_lock, flags);
goto done;
sqp->s_flags |= IPATH_S_BUSY;
again:
if (sqp->s_last == sqp->s_head)
goto clr_busy;
wqe = get_swqe_ptr(sqp, sqp->s_last);
/* Return if it is not OK to start a new work request. */
if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND))
goto clr_busy;
/* We are in the error state, flush the work request. */
send_status = IB_WC_WR_FLUSH_ERR;
goto flush_send;
}
/*
* We can rely on the entry not changing without the s_lock
* being held until we update s_last.
* We increment s_cur to indicate s_last is in progress.
*/
wqe = get_swqe_ptr(sqp, sqp->s_last);
if (sqp->s_last == sqp->s_cur) {
if (++sqp->s_cur >= sqp->s_size)
sqp->s_cur = 0;
}
spin_unlock_irqrestore(&sqp->s_lock, flags);
if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
dev->n_pkt_drops++;
/*
* For RC, the requester would timeout and retry so
* shortcut the timeouts and just signal too many retries.
*/
if (sqp->ibqp.qp_type == IB_QPT_RC)
send_status = IB_WC_RETRY_EXC_ERR;
else
send_status = IB_WC_SUCCESS;
goto serr;
}
memset(&wc, 0, sizeof wc);
send_status = IB_WC_SUCCESS;
@@ -396,8 +425,7 @@ again:
sqp->s_len -= len;
}
if (wqe->wr.opcode == IB_WR_RDMA_WRITE ||
wqe->wr.opcode == IB_WR_RDMA_READ)
if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
goto send_comp;
if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
@@ -417,6 +445,8 @@ again:
wqe->wr.send_flags & IB_SEND_SOLICITED);
send_comp:
spin_lock_irqsave(&sqp->s_lock, flags);
flush_send:
sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
ipath_send_complete(sqp, wqe, send_status);
goto again;
@@ -437,11 +467,12 @@ rnr_nak:
sqp->s_rnr_retry--;
spin_lock_irqsave(&sqp->s_lock, flags);
if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK))
goto unlock;
goto clr_busy;
sqp->s_flags |= IPATH_S_WAITING;
dev->n_rnr_naks++;
sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer];
ipath_insert_rnr_queue(sqp);
goto unlock;
goto clr_busy;
inv_err:
send_status = IB_WC_REM_INV_REQ_ERR;
@@ -473,17 +504,19 @@ serr:
}
goto done;
}
clr_busy:
sqp->s_flags &= ~IPATH_S_BUSY;
unlock:
spin_unlock_irqrestore(&sqp->s_lock, flags);
done:
if (atomic_dec_and_test(&qp->refcount))
if (qp && atomic_dec_and_test(&qp->refcount))
wake_up(&qp->wait);
}
static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
{
if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) ||
qp->ibqp.qp_type == IB_QPT_SMI) {
qp->ibqp.qp_type == IB_QPT_SMI) {
unsigned long flags;
spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
@@ -501,26 +534,36 @@ static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
* @dev: the device we ran out of buffers on
*
* Called when we run out of PIO buffers.
* If we are now in the error state, return zero to flush the
* send work request.
*/
static void ipath_no_bufs_available(struct ipath_qp *qp,
static int ipath_no_bufs_available(struct ipath_qp *qp,
struct ipath_ibdev *dev)
{
unsigned long flags;
int ret = 1;
/*
* Note that as soon as want_buffer() is called and
* possibly before it returns, ipath_ib_piobufavail()
* could be called. If we are still in the tasklet function,
* tasklet_hi_schedule() will not call us until the next time
* tasklet_hi_schedule() is called.
* We leave the busy flag set so that another post send doesn't
* try to put the same QP on the piowait list again.
* could be called. Therefore, put QP on the piowait list before
* enabling the PIO avail interrupt.
*/
spin_lock_irqsave(&dev->pending_lock, flags);
list_add_tail(&qp->piowait, &dev->piowait);
spin_unlock_irqrestore(&dev->pending_lock, flags);
want_buffer(dev->dd, qp);
dev->n_piowait++;
spin_lock_irqsave(&qp->s_lock, flags);
if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
dev->n_piowait++;
qp->s_flags |= IPATH_S_WAITING;
qp->s_flags &= ~IPATH_S_BUSY;
spin_lock(&dev->pending_lock);
if (list_empty(&qp->piowait))
list_add_tail(&qp->piowait, &dev->piowait);
spin_unlock(&dev->pending_lock);
} else
ret = 0;
spin_unlock_irqrestore(&qp->s_lock, flags);
if (ret)
want_buffer(dev->dd, qp);
return ret;
}
/**
@@ -596,15 +639,13 @@ void ipath_do_send(unsigned long data)
struct ipath_qp *qp = (struct ipath_qp *)data;
struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
int (*make_req)(struct ipath_qp *qp);
if (test_and_set_bit(IPATH_S_BUSY, &qp->s_busy))
goto bail;
unsigned long flags;
if ((qp->ibqp.qp_type == IB_QPT_RC ||
qp->ibqp.qp_type == IB_QPT_UC) &&
qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
ipath_ruc_loopback(qp);
goto clear;
goto bail;
}
if (qp->ibqp.qp_type == IB_QPT_RC)
@@ -614,6 +655,19 @@ void ipath_do_send(unsigned long data)
else
make_req = ipath_make_ud_req;
spin_lock_irqsave(&qp->s_lock, flags);
/* Return if we are already busy processing a work request. */
if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) {
spin_unlock_irqrestore(&qp->s_lock, flags);
goto bail;
}
qp->s_flags |= IPATH_S_BUSY;
spin_unlock_irqrestore(&qp->s_lock, flags);
again:
/* Check for a constructed packet to be sent. */
if (qp->s_hdrwords != 0) {
@@ -623,8 +677,8 @@ again:
*/
if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
qp->s_cur_sge, qp->s_cur_size)) {
ipath_no_bufs_available(qp, dev);
goto bail;
if (ipath_no_bufs_available(qp, dev))
goto bail;
}
dev->n_unicast_xmit++;
/* Record that we sent the packet and s_hdr is empty. */
@@ -633,16 +687,20 @@ again:
if (make_req(qp))
goto again;
clear:
clear_bit(IPATH_S_BUSY, &qp->s_busy);
bail:;
}
/*
* This should be called with s_lock held.
*/
void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
enum ib_wc_status status)
{
unsigned long flags;
u32 last;
u32 old_last, last;
if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
return;
/* See ch. 11.2.4.1 and 10.7.3.1 */
if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
@@ -661,10 +719,14 @@ void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
status != IB_WC_SUCCESS);
}
spin_lock_irqsave(&qp->s_lock, flags);
last = qp->s_last;
old_last = last = qp->s_last;
if (++last >= qp->s_size)
last = 0;
qp->s_last = last;
spin_unlock_irqrestore(&qp->s_lock, flags);
if (qp->s_cur == old_last)
qp->s_cur = last;
if (qp->s_tail == old_last)
qp->s_tail = last;
if (qp->state == IB_QPS_SQD && last == qp->s_cur)
qp->s_draining = 0;
}