rxrpc: Fix lockup due to no error backoff after ack transmit error
If the network becomes (partially) unavailable, say by disabling IPv6, the
background ACK transmission routine can get stuck in a tight loop by
proposing immediate ACK retransmission each time a transmission fails.
Since we're in the call event processor, the retransmission is attempted
immediately without returning to the workqueue manager.
The condition should clear after a while when either the network comes back
or the call times out.
Fix this by:
(1) When re-proposing an ACK on failed Tx, don't schedule it immediately.
This will allow a certain amount of time to elapse before we try
again.
(2) Enforce a return to the workqueue manager after a certain number of
iterations of the call processing loop.
(3) Add a backoff delay that increases the delay on deferred ACKs by a
jiffy per failed transmission to a limit of HZ. The backoff delay is
cleared on a successful return from kernel_sendmsg().
(4) Cancel calls immediately if the opening sendmsg fails. The layer
above can arrange retransmission or rotate to another server.
Fixes: 248f219cb8 ("rxrpc: Rewrite the data and ack handling code")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:

committed by
David S. Miller

parent
284fb78ed7
commit
c7e86acfce
@@ -123,6 +123,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
|
||||
else
|
||||
ack_at = expiry;
|
||||
|
||||
ack_at += READ_ONCE(call->tx_backoff);
|
||||
ack_at += now;
|
||||
if (time_before(ack_at, call->ack_at)) {
|
||||
WRITE_ONCE(call->ack_at, ack_at);
|
||||
@@ -311,6 +312,7 @@ void rxrpc_process_call(struct work_struct *work)
|
||||
container_of(work, struct rxrpc_call, processor);
|
||||
rxrpc_serial_t *send_ack;
|
||||
unsigned long now, next, t;
|
||||
unsigned int iterations = 0;
|
||||
|
||||
rxrpc_see_call(call);
|
||||
|
||||
@@ -319,6 +321,11 @@ void rxrpc_process_call(struct work_struct *work)
|
||||
call->debug_id, rxrpc_call_states[call->state], call->events);
|
||||
|
||||
recheck_state:
|
||||
/* Limit the number of times we do this before returning to the manager */
|
||||
iterations++;
|
||||
if (iterations > 5)
|
||||
goto requeue;
|
||||
|
||||
if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
|
||||
rxrpc_send_abort_packet(call);
|
||||
goto recheck_state;
|
||||
@@ -447,13 +454,16 @@ recheck_state:
|
||||
rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart);
|
||||
|
||||
/* other events may have been raised since we started checking */
|
||||
if (call->events && call->state < RXRPC_CALL_COMPLETE) {
|
||||
__rxrpc_queue_call(call);
|
||||
goto out;
|
||||
}
|
||||
if (call->events && call->state < RXRPC_CALL_COMPLETE)
|
||||
goto requeue;
|
||||
|
||||
out_put:
|
||||
rxrpc_put_call(call, rxrpc_call_put);
|
||||
out:
|
||||
_leave("");
|
||||
return;
|
||||
|
||||
requeue:
|
||||
__rxrpc_queue_call(call);
|
||||
goto out;
|
||||
}
|
||||
|
Reference in New Issue
Block a user