i40e/i40evf: Detect and recover hung queue scenario

In VFs, there is a known issue which can cause writebacks
to not occur when interrupts are disabled and there are
less than 4 descriptors resulting in TX timeout. Timeout
can also occur due to lost interrupt.

The current implementation for detecting and recovering
from hung queues in the PF is problematic because it actually
actively encourages lost interrupts.  By triggering a SW
interrupt, interrupts are forced on.  If we are already in
napi_poll and an interrupt fires, napi_poll will not be
rescheduled and the interrupt is effectively lost; thereby
potentially *causing* hung queues.

This patch checks whether packets are being processed between
every watchdog cycle and determine potential hung queue and
fires triggers SW interrupt only for that particular queue.

Signed-off-by: Sudheer Mogilappagari <sudheer.mogilappagari@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
This commit is contained in:
Sudheer Mogilappagari
2017-12-18 05:17:25 -05:00
committed by Jeff Kirsher
parent d95cd48060
commit 07d44190a3
6 changed files with 115 additions and 99 deletions

View File

@@ -148,6 +148,59 @@ u32 i40evf_get_tx_pending(struct i40e_ring *ring, bool in_sw)
return 0;
}
/**
* i40evf_detect_recover_hung - Function to detect and recover hung_queues
* @vsi: pointer to vsi struct with tx queues
*
* VSI has netdev and netdev has TX queues. This function is to check each of
* those TX queues if they are hung, trigger recovery by issuing SW interrupt.
**/
void i40evf_detect_recover_hung(struct i40e_vsi *vsi)
{
struct i40e_ring *tx_ring = NULL;
struct net_device *netdev;
unsigned int i;
int packets;
if (!vsi)
return;
if (test_bit(__I40E_VSI_DOWN, vsi->state))
return;
netdev = vsi->netdev;
if (!netdev)
return;
if (!netif_carrier_ok(netdev))
return;
for (i = 0; i < vsi->back->num_active_queues; i++) {
tx_ring = &vsi->back->tx_rings[i];
if (tx_ring && tx_ring->desc) {
/* If packet counter has not changed the queue is
* likely stalled, so force an interrupt for this
* queue.
*
* prev_pkt_ctr would be negative if there was no
* pending work.
*/
packets = tx_ring->stats.packets & INT_MAX;
if (tx_ring->tx_stats.prev_pkt_ctr == packets) {
i40evf_force_wb(vsi, tx_ring->q_vector);
continue;
}
/* Memory barrier between read of packet count and call
* to i40evf_get_tx_pending()
*/
smp_rmb();
tx_ring->tx_stats.prev_pkt_ctr =
i40evf_get_tx_pending(tx_ring, false) ? packets : -1;
}
}
}
#define WB_STRIDE 4
/**
@@ -469,6 +522,7 @@ int i40evf_setup_tx_descriptors(struct i40e_ring *tx_ring)
tx_ring->next_to_use = 0;
tx_ring->next_to_clean = 0;
tx_ring->tx_stats.prev_pkt_ctr = -1;
return 0;
err:

View File

@@ -313,6 +313,7 @@ struct i40e_tx_queue_stats {
u64 tx_done_old;
u64 tx_linearize;
u64 tx_force_wb;
int prev_pkt_ctr;
u64 tx_lost_interrupt;
};
@@ -467,6 +468,7 @@ void i40evf_free_rx_resources(struct i40e_ring *rx_ring);
int i40evf_napi_poll(struct napi_struct *napi, int budget);
void i40evf_force_wb(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector);
u32 i40evf_get_tx_pending(struct i40e_ring *ring, bool in_sw);
void i40evf_detect_recover_hung(struct i40e_vsi *vsi);
int __i40evf_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
bool __i40evf_chk_linearize(struct sk_buff *skb);

View File

@@ -1716,6 +1716,8 @@ static void i40evf_watchdog_task(struct work_struct *work)
if (adapter->state == __I40EVF_RUNNING)
i40evf_request_stats(adapter);
watchdog_done:
if (adapter->state == __I40EVF_RUNNING)
i40evf_detect_recover_hung(&adapter->vsi);
clear_bit(__I40EVF_IN_CRITICAL_TASK, &adapter->crit_section);
restart_watchdog:
if (adapter->state == __I40EVF_REMOVE)