net: Revert devlink health changes.
This reverts the devlink health changes from 9/17/2019. Jiri wants things to be designed differently, and it was agreed that the easiest way to do this is to start from the beginning again. Commits reverted: cb5ccfbe73
880ee82f03
c7af343b4e
ff253fedab
6f9d56132e
fcd852c69d
8a66704a13
12bd0dcefe
aba25279c1
ce019faa70
b8c45a033a
And the follow-on build fix: 33a0efa4baecd689da9474ce0e8b673eb6931c60 Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
@@ -1,86 +0,0 @@
|
|||||||
The health mechanism is targeted for Real Time Alerting, in order to know when
|
|
||||||
something bad has happened to a PCI device
|
|
||||||
- Provide alert debug information
|
|
||||||
- Self healing
|
|
||||||
- If problem needs vendor support, provide a way to gather all needed debugging
|
|
||||||
information.
|
|
||||||
|
|
||||||
The main idea is to unify and centralize driver health reports in the
|
|
||||||
generic devlink instance and allow the user to set different
|
|
||||||
attributes of the health reporting and recovery procedures.
|
|
||||||
|
|
||||||
The devlink health reporter:
|
|
||||||
Device driver creates a "health reporter" per each error/health type.
|
|
||||||
Error/Health type can be a known/generic (eg pci error, fw error, rx/tx error)
|
|
||||||
or unknown (driver specific).
|
|
||||||
For each registered health reporter a driver can issue error/health reports
|
|
||||||
asynchronously. All health reports handling is done by devlink.
|
|
||||||
Device driver can provide specific callbacks for each "health reporter", e.g.
|
|
||||||
- Recovery procedures
|
|
||||||
- Diagnostics and object dump procedures
|
|
||||||
- OOB initial parameters
|
|
||||||
Different parts of the driver can register different types of health reporters
|
|
||||||
with different handlers.
|
|
||||||
|
|
||||||
Once an error is reported, devlink health will do the following actions:
|
|
||||||
* A log is being sent to the kernel trace events buffer
|
|
||||||
* Health status and statistics are being updated for the reporter instance
|
|
||||||
* Object dump is being taken and saved at the reporter instance (as long as
|
|
||||||
there is no other dump which is already stored)
|
|
||||||
* Auto recovery attempt is being done. Depends on:
|
|
||||||
- Auto-recovery configuration
|
|
||||||
- Grace period vs. time passed since last recover
|
|
||||||
|
|
||||||
The user interface:
|
|
||||||
User can access/change each reporter's parameters and driver specific callbacks
|
|
||||||
via devlink, e.g. per error type (per health reporter):
|
|
||||||
- Configure reporter's generic parameters (like: disable/enable auto recovery)
|
|
||||||
- Invoke recovery procedure
|
|
||||||
- Run diagnostics
|
|
||||||
- Object dump
|
|
||||||
|
|
||||||
The devlink health interface (via netlink):
|
|
||||||
DEVLINK_CMD_HEALTH_REPORTER_GET
|
|
||||||
Retrieves status and configuration info per DEV and reporter.
|
|
||||||
DEVLINK_CMD_HEALTH_REPORTER_SET
|
|
||||||
Allows reporter-related configuration setting.
|
|
||||||
DEVLINK_CMD_HEALTH_REPORTER_RECOVER
|
|
||||||
Triggers a reporter's recovery procedure.
|
|
||||||
DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE
|
|
||||||
Retrieves diagnostics data from a reporter on a device.
|
|
||||||
DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET
|
|
||||||
Retrieves the last stored dump. Devlink health
|
|
||||||
saves a single dump. If a dump is not already stored by the devlink
|
|
||||||
for this reporter, devlink generates a new dump.
|
|
||||||
dump output is defined by the reporter.
|
|
||||||
DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR
|
|
||||||
Clears the last saved dump file for the specified reporter.
|
|
||||||
|
|
||||||
|
|
||||||
netlink
|
|
||||||
+--------------------------+
|
|
||||||
| |
|
|
||||||
| + |
|
|
||||||
| | |
|
|
||||||
+--------------------------+
|
|
||||||
|request for ops
|
|
||||||
|(diagnose,
|
|
||||||
mlx5_core devlink |recover,
|
|
||||||
|dump)
|
|
||||||
+--------+ +--------------------------+
|
|
||||||
| | | reporter| |
|
|
||||||
| | | +---------v----------+ |
|
|
||||||
| | ops execution | | | |
|
|
||||||
| <----------------------------------+ | |
|
|
||||||
| | | | | |
|
|
||||||
| | | + ^------------------+ |
|
|
||||||
| | | | request for ops |
|
|
||||||
| | | | (recover, dump) |
|
|
||||||
| | | | |
|
|
||||||
| | | +-+------------------+ |
|
|
||||||
| | health report | | health handler | |
|
|
||||||
| +-------------------------------> | |
|
|
||||||
| | | +--------------------+ |
|
|
||||||
| | health reporter create | |
|
|
||||||
| +----------------------------> |
|
|
||||||
+--------+ +--------------------------+
|
|
@@ -22,7 +22,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
|
|||||||
#
|
#
|
||||||
mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
|
mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
|
||||||
en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
|
en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
|
||||||
en_selftest.o en/port.o en/monitor_stats.o en/reporter_tx.o
|
en_selftest.o en/port.o en/monitor_stats.o
|
||||||
|
|
||||||
#
|
#
|
||||||
# Netdev extra
|
# Netdev extra
|
||||||
|
@@ -388,7 +388,10 @@ struct mlx5e_txqsq {
|
|||||||
struct mlx5e_channel *channel;
|
struct mlx5e_channel *channel;
|
||||||
int txq_ix;
|
int txq_ix;
|
||||||
u32 rate_limit;
|
u32 rate_limit;
|
||||||
|
struct mlx5e_txqsq_recover {
|
||||||
struct work_struct recover_work;
|
struct work_struct recover_work;
|
||||||
|
u64 last_recover;
|
||||||
|
} recover;
|
||||||
} ____cacheline_aligned_in_smp;
|
} ____cacheline_aligned_in_smp;
|
||||||
|
|
||||||
struct mlx5e_dma_info {
|
struct mlx5e_dma_info {
|
||||||
@@ -679,13 +682,6 @@ struct mlx5e_rss_params {
|
|||||||
u8 hfunc;
|
u8 hfunc;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct mlx5e_modify_sq_param {
|
|
||||||
int curr_state;
|
|
||||||
int next_state;
|
|
||||||
int rl_update;
|
|
||||||
int rl_index;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct mlx5e_priv {
|
struct mlx5e_priv {
|
||||||
/* priv data path fields - start */
|
/* priv data path fields - start */
|
||||||
struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC];
|
struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC];
|
||||||
@@ -741,7 +737,6 @@ struct mlx5e_priv {
|
|||||||
#ifdef CONFIG_MLX5_EN_TLS
|
#ifdef CONFIG_MLX5_EN_TLS
|
||||||
struct mlx5e_tls *tls;
|
struct mlx5e_tls *tls;
|
||||||
#endif
|
#endif
|
||||||
struct devlink_health_reporter *tx_reporter;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct mlx5e_profile {
|
struct mlx5e_profile {
|
||||||
@@ -871,11 +866,6 @@ void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
|
|||||||
void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
|
void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
|
||||||
struct mlx5e_params *params);
|
struct mlx5e_params *params);
|
||||||
|
|
||||||
int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
|
|
||||||
struct mlx5e_modify_sq_param *p);
|
|
||||||
void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq);
|
|
||||||
void mlx5e_tx_disable_queue(struct netdev_queue *txq);
|
|
||||||
|
|
||||||
static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
|
static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
|
||||||
{
|
{
|
||||||
return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) &&
|
return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) &&
|
||||||
|
@@ -1,15 +0,0 @@
|
|||||||
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
|
|
||||||
/* Copyright (c) 2018 Mellanox Technologies. */
|
|
||||||
|
|
||||||
#ifndef __MLX5E_EN_REPORTER_H
|
|
||||||
#define __MLX5E_EN_REPORTER_H
|
|
||||||
|
|
||||||
#include <linux/mlx5/driver.h>
|
|
||||||
#include "en.h"
|
|
||||||
|
|
||||||
int mlx5e_tx_reporter_create(struct mlx5e_priv *priv);
|
|
||||||
void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv);
|
|
||||||
void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq);
|
|
||||||
void mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq);
|
|
||||||
|
|
||||||
#endif
|
|
@@ -1,356 +0,0 @@
|
|||||||
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
|
|
||||||
/* Copyright (c) 2018 Mellanox Technologies. */
|
|
||||||
|
|
||||||
#include <net/devlink.h>
|
|
||||||
#include "reporter.h"
|
|
||||||
#include "lib/eq.h"
|
|
||||||
|
|
||||||
#define MLX5E_TX_REPORTER_PER_SQ_MAX_LEN 256
|
|
||||||
|
|
||||||
struct mlx5e_tx_err_ctx {
|
|
||||||
int (*recover)(struct mlx5e_txqsq *sq);
|
|
||||||
struct mlx5e_txqsq *sq;
|
|
||||||
};
|
|
||||||
|
|
||||||
static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
|
|
||||||
{
|
|
||||||
unsigned long exp_time = jiffies + msecs_to_jiffies(2000);
|
|
||||||
|
|
||||||
while (time_before(jiffies, exp_time)) {
|
|
||||||
if (sq->cc == sq->pc)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
msleep(20);
|
|
||||||
}
|
|
||||||
|
|
||||||
netdev_err(sq->channel->netdev,
|
|
||||||
"Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
|
|
||||||
sq->sqn, sq->cc, sq->pc);
|
|
||||||
|
|
||||||
return -ETIMEDOUT;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
|
|
||||||
{
|
|
||||||
WARN_ONCE(sq->cc != sq->pc,
|
|
||||||
"SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
|
|
||||||
sq->sqn, sq->cc, sq->pc);
|
|
||||||
sq->cc = 0;
|
|
||||||
sq->dma_fifo_cc = 0;
|
|
||||||
sq->pc = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int mlx5e_sq_to_ready(struct mlx5e_txqsq *sq, int curr_state)
|
|
||||||
{
|
|
||||||
struct mlx5_core_dev *mdev = sq->channel->mdev;
|
|
||||||
struct net_device *dev = sq->channel->netdev;
|
|
||||||
struct mlx5e_modify_sq_param msp = {0};
|
|
||||||
int err;
|
|
||||||
|
|
||||||
msp.curr_state = curr_state;
|
|
||||||
msp.next_state = MLX5_SQC_STATE_RST;
|
|
||||||
|
|
||||||
err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
|
|
||||||
if (err) {
|
|
||||||
netdev_err(dev, "Failed to move sq 0x%x to reset\n", sq->sqn);
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
memset(&msp, 0, sizeof(msp));
|
|
||||||
msp.curr_state = MLX5_SQC_STATE_RST;
|
|
||||||
msp.next_state = MLX5_SQC_STATE_RDY;
|
|
||||||
|
|
||||||
err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
|
|
||||||
if (err) {
|
|
||||||
netdev_err(dev, "Failed to move sq 0x%x to ready\n", sq->sqn);
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int mlx5e_tx_reporter_err_cqe_recover(struct mlx5e_txqsq *sq)
|
|
||||||
{
|
|
||||||
struct mlx5_core_dev *mdev = sq->channel->mdev;
|
|
||||||
struct net_device *dev = sq->channel->netdev;
|
|
||||||
u8 state;
|
|
||||||
int err;
|
|
||||||
|
|
||||||
if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
|
|
||||||
if (err) {
|
|
||||||
netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
|
|
||||||
sq->sqn, err);
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (state != MLX5_RQC_STATE_ERR) {
|
|
||||||
netdev_err(dev, "SQ 0x%x not in ERROR state\n", sq->sqn);
|
|
||||||
return -EINVAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
mlx5e_tx_disable_queue(sq->txq);
|
|
||||||
|
|
||||||
err = mlx5e_wait_for_sq_flush(sq);
|
|
||||||
if (err)
|
|
||||||
return err;
|
|
||||||
|
|
||||||
/* At this point, no new packets will arrive from the stack as TXQ is
|
|
||||||
* marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
|
|
||||||
* pending WQEs. SQ can safely reset the SQ.
|
|
||||||
*/
|
|
||||||
|
|
||||||
err = mlx5e_sq_to_ready(sq, state);
|
|
||||||
if (err)
|
|
||||||
return err;
|
|
||||||
|
|
||||||
mlx5e_reset_txqsq_cc_pc(sq);
|
|
||||||
sq->stats->recover++;
|
|
||||||
mlx5e_activate_txqsq(sq);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq)
|
|
||||||
{
|
|
||||||
char err_str[MLX5E_TX_REPORTER_PER_SQ_MAX_LEN];
|
|
||||||
struct mlx5e_tx_err_ctx err_ctx = {0};
|
|
||||||
|
|
||||||
err_ctx.sq = sq;
|
|
||||||
err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover;
|
|
||||||
sprintf(err_str, "ERR CQE on SQ: 0x%x", sq->sqn);
|
|
||||||
|
|
||||||
devlink_health_report(sq->channel->priv->tx_reporter, err_str,
|
|
||||||
&err_ctx);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int mlx5e_tx_reporter_timeout_recover(struct mlx5e_txqsq *sq)
|
|
||||||
{
|
|
||||||
struct mlx5_eq_comp *eq = sq->cq.mcq.eq;
|
|
||||||
u32 eqe_count;
|
|
||||||
|
|
||||||
netdev_err(sq->channel->netdev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
|
|
||||||
eq->core.eqn, eq->core.cons_index, eq->core.irqn);
|
|
||||||
|
|
||||||
eqe_count = mlx5_eq_poll_irq_disabled(eq);
|
|
||||||
if (!eqe_count) {
|
|
||||||
clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
netdev_err(sq->channel->netdev, "Recover %d eqes on EQ 0x%x\n",
|
|
||||||
eqe_count, eq->core.eqn);
|
|
||||||
sq->channel->stats->eq_rearm++;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
void mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq)
|
|
||||||
{
|
|
||||||
struct mlx5e_tx_err_ctx err_ctx;
|
|
||||||
char err_str[MLX5E_TX_REPORTER_PER_SQ_MAX_LEN];
|
|
||||||
|
|
||||||
err_ctx.sq = sq;
|
|
||||||
err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
|
|
||||||
sprintf(err_str,
|
|
||||||
"TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n",
|
|
||||||
sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
|
|
||||||
jiffies_to_usecs(jiffies - sq->txq->trans_start));
|
|
||||||
devlink_health_report(sq->channel->priv->tx_reporter, err_str,
|
|
||||||
&err_ctx);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* state lock cannot be grabbed within this function.
|
|
||||||
* It can cause a dead lock or a read-after-free.
|
|
||||||
*/
|
|
||||||
int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_tx_err_ctx *err_ctx)
|
|
||||||
{
|
|
||||||
return err_ctx->recover(err_ctx->sq);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int mlx5e_tx_reporter_recover_all(struct mlx5e_priv *priv)
|
|
||||||
{
|
|
||||||
int err;
|
|
||||||
|
|
||||||
mutex_lock(&priv->state_lock);
|
|
||||||
mlx5e_close_locked(priv->netdev);
|
|
||||||
err = mlx5e_open_locked(priv->netdev);
|
|
||||||
mutex_unlock(&priv->state_lock);
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
|
|
||||||
void *context)
|
|
||||||
{
|
|
||||||
struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
|
|
||||||
struct mlx5e_tx_err_ctx *err_ctx = context;
|
|
||||||
|
|
||||||
return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) :
|
|
||||||
mlx5e_tx_reporter_recover_all(priv);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int
|
|
||||||
mlx5e_tx_reporter_build_diagnose_output(struct devlink_health_buffer *buffer,
|
|
||||||
u32 sqn, u8 state, u8 stopped)
|
|
||||||
{
|
|
||||||
int err, i;
|
|
||||||
int nest = 0;
|
|
||||||
char name[20];
|
|
||||||
|
|
||||||
err = devlink_health_buffer_nest_start(buffer,
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
|
|
||||||
if (err)
|
|
||||||
goto buffer_error;
|
|
||||||
nest++;
|
|
||||||
|
|
||||||
err = devlink_health_buffer_nest_start(buffer,
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
|
|
||||||
if (err)
|
|
||||||
goto buffer_error;
|
|
||||||
nest++;
|
|
||||||
|
|
||||||
sprintf(name, "SQ 0x%x", sqn);
|
|
||||||
err = devlink_health_buffer_put_object_name(buffer, name);
|
|
||||||
if (err)
|
|
||||||
goto buffer_error;
|
|
||||||
|
|
||||||
err = devlink_health_buffer_nest_start(buffer,
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
|
|
||||||
if (err)
|
|
||||||
goto buffer_error;
|
|
||||||
nest++;
|
|
||||||
|
|
||||||
err = devlink_health_buffer_nest_start(buffer,
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
|
|
||||||
if (err)
|
|
||||||
goto buffer_error;
|
|
||||||
nest++;
|
|
||||||
|
|
||||||
err = devlink_health_buffer_nest_start(buffer,
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
|
|
||||||
if (err)
|
|
||||||
goto buffer_error;
|
|
||||||
nest++;
|
|
||||||
|
|
||||||
err = devlink_health_buffer_put_object_name(buffer, "HW state");
|
|
||||||
if (err)
|
|
||||||
goto buffer_error;
|
|
||||||
|
|
||||||
err = devlink_health_buffer_nest_start(buffer,
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
|
|
||||||
if (err)
|
|
||||||
goto buffer_error;
|
|
||||||
nest++;
|
|
||||||
|
|
||||||
err = devlink_health_buffer_put_value_u8(buffer, state);
|
|
||||||
if (err)
|
|
||||||
goto buffer_error;
|
|
||||||
|
|
||||||
devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE */
|
|
||||||
nest--;
|
|
||||||
|
|
||||||
devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR */
|
|
||||||
nest--;
|
|
||||||
|
|
||||||
err = devlink_health_buffer_nest_start(buffer,
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
|
|
||||||
if (err)
|
|
||||||
goto buffer_error;
|
|
||||||
nest++;
|
|
||||||
|
|
||||||
err = devlink_health_buffer_put_object_name(buffer, "stopped");
|
|
||||||
if (err)
|
|
||||||
goto buffer_error;
|
|
||||||
|
|
||||||
err = devlink_health_buffer_nest_start(buffer,
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
|
|
||||||
if (err)
|
|
||||||
goto buffer_error;
|
|
||||||
nest++;
|
|
||||||
|
|
||||||
err = devlink_health_buffer_put_value_u8(buffer, stopped);
|
|
||||||
if (err)
|
|
||||||
goto buffer_error;
|
|
||||||
|
|
||||||
for (i = 0; i < nest; i++)
|
|
||||||
devlink_health_buffer_nest_end(buffer);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
buffer_error:
|
|
||||||
for (i = 0; i < nest; i++)
|
|
||||||
devlink_health_buffer_nest_cancel(buffer);
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
|
|
||||||
struct devlink_health_buffer **buffers_array,
|
|
||||||
unsigned int buffer_size,
|
|
||||||
unsigned int num_buffers)
|
|
||||||
{
|
|
||||||
struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
|
|
||||||
unsigned int buff = 0;
|
|
||||||
int i = 0, err = 0;
|
|
||||||
|
|
||||||
if (buffer_size < MLX5E_TX_REPORTER_PER_SQ_MAX_LEN)
|
|
||||||
return -ENOMEM;
|
|
||||||
|
|
||||||
mutex_lock(&priv->state_lock);
|
|
||||||
|
|
||||||
if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
|
|
||||||
mutex_unlock(&priv->state_lock);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
while (i < priv->channels.num * priv->channels.params.num_tc) {
|
|
||||||
struct mlx5e_txqsq *sq = priv->txq2sq[i];
|
|
||||||
u8 state;
|
|
||||||
|
|
||||||
err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
|
|
||||||
if (err)
|
|
||||||
break;
|
|
||||||
|
|
||||||
err = mlx5e_tx_reporter_build_diagnose_output(buffers_array[buff],
|
|
||||||
sq->sqn, state,
|
|
||||||
netif_xmit_stopped(sq->txq));
|
|
||||||
if (err) {
|
|
||||||
if (++buff == num_buffers)
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
mutex_unlock(&priv->state_lock);
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
|
|
||||||
.name = "TX",
|
|
||||||
.recover = mlx5e_tx_reporter_recover,
|
|
||||||
.diagnose_size = MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC *
|
|
||||||
MLX5E_TX_REPORTER_PER_SQ_MAX_LEN,
|
|
||||||
.diagnose = mlx5e_tx_reporter_diagnose,
|
|
||||||
.dump_size = 0,
|
|
||||||
.dump = NULL,
|
|
||||||
};
|
|
||||||
|
|
||||||
#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
|
|
||||||
int mlx5e_tx_reporter_create(struct mlx5e_priv *priv)
|
|
||||||
{
|
|
||||||
struct mlx5_core_dev *mdev = priv->mdev;
|
|
||||||
struct devlink *devlink = priv_to_devlink(mdev);
|
|
||||||
|
|
||||||
priv->tx_reporter =
|
|
||||||
devlink_health_reporter_create(devlink, &mlx5_tx_reporter_ops,
|
|
||||||
MLX5_REPORTER_TX_GRACEFUL_PERIOD,
|
|
||||||
true, priv);
|
|
||||||
return PTR_ERR_OR_ZERO(priv->tx_reporter);
|
|
||||||
}
|
|
||||||
|
|
||||||
void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv)
|
|
||||||
{
|
|
||||||
devlink_health_reporter_destroy(priv->tx_reporter);
|
|
||||||
}
|
|
@@ -51,7 +51,6 @@
|
|||||||
#include "en/xdp.h"
|
#include "en/xdp.h"
|
||||||
#include "lib/eq.h"
|
#include "lib/eq.h"
|
||||||
#include "en/monitor_stats.h"
|
#include "en/monitor_stats.h"
|
||||||
#include "en/reporter.h"
|
|
||||||
|
|
||||||
struct mlx5e_rq_param {
|
struct mlx5e_rq_param {
|
||||||
u32 rqc[MLX5_ST_SZ_DW(rqc)];
|
u32 rqc[MLX5_ST_SZ_DW(rqc)];
|
||||||
@@ -1161,7 +1160,7 @@ static int mlx5e_alloc_txqsq_db(struct mlx5e_txqsq *sq, int numa)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void mlx5e_tx_err_cqe_work(struct work_struct *recover_work);
|
static void mlx5e_sq_recover(struct work_struct *work);
|
||||||
static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
|
static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
|
||||||
int txq_ix,
|
int txq_ix,
|
||||||
struct mlx5e_params *params,
|
struct mlx5e_params *params,
|
||||||
@@ -1183,7 +1182,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
|
|||||||
sq->uar_map = mdev->mlx5e_res.bfreg.map;
|
sq->uar_map = mdev->mlx5e_res.bfreg.map;
|
||||||
sq->min_inline_mode = params->tx_min_inline_mode;
|
sq->min_inline_mode = params->tx_min_inline_mode;
|
||||||
sq->stats = &c->priv->channel_stats[c->ix].sq[tc];
|
sq->stats = &c->priv->channel_stats[c->ix].sq[tc];
|
||||||
INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work);
|
INIT_WORK(&sq->recover.recover_work, mlx5e_sq_recover);
|
||||||
if (MLX5_IPSEC_DEV(c->priv->mdev))
|
if (MLX5_IPSEC_DEV(c->priv->mdev))
|
||||||
set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
|
set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
|
||||||
if (mlx5_accel_is_tls_device(c->priv->mdev))
|
if (mlx5_accel_is_tls_device(c->priv->mdev))
|
||||||
@@ -1271,7 +1270,14 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev,
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
|
struct mlx5e_modify_sq_param {
|
||||||
|
int curr_state;
|
||||||
|
int next_state;
|
||||||
|
bool rl_update;
|
||||||
|
int rl_index;
|
||||||
|
};
|
||||||
|
|
||||||
|
static int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
|
||||||
struct mlx5e_modify_sq_param *p)
|
struct mlx5e_modify_sq_param *p)
|
||||||
{
|
{
|
||||||
void *in;
|
void *in;
|
||||||
@@ -1370,7 +1376,17 @@ err_free_txqsq:
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq)
|
static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
|
||||||
|
{
|
||||||
|
WARN_ONCE(sq->cc != sq->pc,
|
||||||
|
"SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
|
||||||
|
sq->sqn, sq->cc, sq->pc);
|
||||||
|
sq->cc = 0;
|
||||||
|
sq->dma_fifo_cc = 0;
|
||||||
|
sq->pc = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq)
|
||||||
{
|
{
|
||||||
sq->txq = netdev_get_tx_queue(sq->channel->netdev, sq->txq_ix);
|
sq->txq = netdev_get_tx_queue(sq->channel->netdev, sq->txq_ix);
|
||||||
clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
|
clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
|
||||||
@@ -1379,7 +1395,7 @@ void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq)
|
|||||||
netif_tx_start_queue(sq->txq);
|
netif_tx_start_queue(sq->txq);
|
||||||
}
|
}
|
||||||
|
|
||||||
void mlx5e_tx_disable_queue(struct netdev_queue *txq)
|
static inline void netif_tx_disable_queue(struct netdev_queue *txq)
|
||||||
{
|
{
|
||||||
__netif_tx_lock_bh(txq);
|
__netif_tx_lock_bh(txq);
|
||||||
netif_tx_stop_queue(txq);
|
netif_tx_stop_queue(txq);
|
||||||
@@ -1395,7 +1411,7 @@ static void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq)
|
|||||||
/* prevent netif_tx_wake_queue */
|
/* prevent netif_tx_wake_queue */
|
||||||
napi_synchronize(&c->napi);
|
napi_synchronize(&c->napi);
|
||||||
|
|
||||||
mlx5e_tx_disable_queue(sq->txq);
|
netif_tx_disable_queue(sq->txq);
|
||||||
|
|
||||||
/* last doorbell out, godspeed .. */
|
/* last doorbell out, godspeed .. */
|
||||||
if (mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1)) {
|
if (mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1)) {
|
||||||
@@ -1415,7 +1431,6 @@ static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq)
|
|||||||
struct mlx5_rate_limit rl = {0};
|
struct mlx5_rate_limit rl = {0};
|
||||||
|
|
||||||
cancel_work_sync(&sq->dim.work);
|
cancel_work_sync(&sq->dim.work);
|
||||||
cancel_work_sync(&sq->recover_work);
|
|
||||||
mlx5e_destroy_sq(mdev, sq->sqn);
|
mlx5e_destroy_sq(mdev, sq->sqn);
|
||||||
if (sq->rate_limit) {
|
if (sq->rate_limit) {
|
||||||
rl.rate = sq->rate_limit;
|
rl.rate = sq->rate_limit;
|
||||||
@@ -1425,15 +1440,105 @@ static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq)
|
|||||||
mlx5e_free_txqsq(sq);
|
mlx5e_free_txqsq(sq);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void mlx5e_tx_err_cqe_work(struct work_struct *recover_work)
|
static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
|
||||||
{
|
{
|
||||||
struct mlx5e_txqsq *sq = container_of(recover_work, struct mlx5e_txqsq,
|
unsigned long exp_time = jiffies + msecs_to_jiffies(2000);
|
||||||
recover_work);
|
|
||||||
|
|
||||||
if (!sq->channel->priv->tx_reporter)
|
while (time_before(jiffies, exp_time)) {
|
||||||
|
if (sq->cc == sq->pc)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
msleep(20);
|
||||||
|
}
|
||||||
|
|
||||||
|
netdev_err(sq->channel->netdev,
|
||||||
|
"Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
|
||||||
|
sq->sqn, sq->cc, sq->pc);
|
||||||
|
|
||||||
|
return -ETIMEDOUT;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int mlx5e_sq_to_ready(struct mlx5e_txqsq *sq, int curr_state)
|
||||||
|
{
|
||||||
|
struct mlx5_core_dev *mdev = sq->channel->mdev;
|
||||||
|
struct net_device *dev = sq->channel->netdev;
|
||||||
|
struct mlx5e_modify_sq_param msp = {0};
|
||||||
|
int err;
|
||||||
|
|
||||||
|
msp.curr_state = curr_state;
|
||||||
|
msp.next_state = MLX5_SQC_STATE_RST;
|
||||||
|
|
||||||
|
err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
|
||||||
|
if (err) {
|
||||||
|
netdev_err(dev, "Failed to move sq 0x%x to reset\n", sq->sqn);
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
memset(&msp, 0, sizeof(msp));
|
||||||
|
msp.curr_state = MLX5_SQC_STATE_RST;
|
||||||
|
msp.next_state = MLX5_SQC_STATE_RDY;
|
||||||
|
|
||||||
|
err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
|
||||||
|
if (err) {
|
||||||
|
netdev_err(dev, "Failed to move sq 0x%x to ready\n", sq->sqn);
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mlx5e_sq_recover(struct work_struct *work)
|
||||||
|
{
|
||||||
|
struct mlx5e_txqsq_recover *recover =
|
||||||
|
container_of(work, struct mlx5e_txqsq_recover,
|
||||||
|
recover_work);
|
||||||
|
struct mlx5e_txqsq *sq = container_of(recover, struct mlx5e_txqsq,
|
||||||
|
recover);
|
||||||
|
struct mlx5_core_dev *mdev = sq->channel->mdev;
|
||||||
|
struct net_device *dev = sq->channel->netdev;
|
||||||
|
u8 state;
|
||||||
|
int err;
|
||||||
|
|
||||||
|
err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
|
||||||
|
if (err) {
|
||||||
|
netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
|
||||||
|
sq->sqn, err);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (state != MLX5_RQC_STATE_ERR) {
|
||||||
|
netdev_err(dev, "SQ 0x%x not in ERROR state\n", sq->sqn);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
netif_tx_disable_queue(sq->txq);
|
||||||
|
|
||||||
|
if (mlx5e_wait_for_sq_flush(sq))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
mlx5e_tx_reporter_err_cqe(sq);
|
/* If the interval between two consecutive recovers per SQ is too
|
||||||
|
* short, don't recover to avoid infinite loop of ERR_CQE -> recover.
|
||||||
|
* If we reached this state, there is probably a bug that needs to be
|
||||||
|
* fixed. let's keep the queue close and let tx timeout cleanup.
|
||||||
|
*/
|
||||||
|
if (jiffies_to_msecs(jiffies - recover->last_recover) <
|
||||||
|
MLX5E_SQ_RECOVER_MIN_INTERVAL) {
|
||||||
|
netdev_err(dev, "Recover SQ 0x%x canceled, too many error CQEs\n",
|
||||||
|
sq->sqn);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* At this point, no new packets will arrive from the stack as TXQ is
|
||||||
|
* marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
|
||||||
|
* pending WQEs. SQ can safely reset the SQ.
|
||||||
|
*/
|
||||||
|
if (mlx5e_sq_to_ready(sq, state))
|
||||||
|
return;
|
||||||
|
|
||||||
|
mlx5e_reset_txqsq_cc_pc(sq);
|
||||||
|
sq->stats->recover++;
|
||||||
|
recover->last_recover = jiffies;
|
||||||
|
mlx5e_activate_txqsq(sq);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int mlx5e_open_icosq(struct mlx5e_channel *c,
|
static int mlx5e_open_icosq(struct mlx5e_channel *c,
|
||||||
@@ -3102,7 +3207,6 @@ static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv)
|
|||||||
{
|
{
|
||||||
int tc;
|
int tc;
|
||||||
|
|
||||||
mlx5e_tx_reporter_destroy(priv);
|
|
||||||
for (tc = 0; tc < priv->profile->max_tc; tc++)
|
for (tc = 0; tc < priv->profile->max_tc; tc++)
|
||||||
mlx5e_destroy_tis(priv->mdev, priv->tisn[tc]);
|
mlx5e_destroy_tis(priv->mdev, priv->tisn[tc]);
|
||||||
}
|
}
|
||||||
@@ -4074,14 +4178,31 @@ netdev_features_t mlx5e_features_check(struct sk_buff *skb,
|
|||||||
return features;
|
return features;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool mlx5e_tx_timeout_eq_recover(struct net_device *dev,
|
||||||
|
struct mlx5e_txqsq *sq)
|
||||||
|
{
|
||||||
|
struct mlx5_eq_comp *eq = sq->cq.mcq.eq;
|
||||||
|
u32 eqe_count;
|
||||||
|
|
||||||
|
netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
|
||||||
|
eq->core.eqn, eq->core.cons_index, eq->core.irqn);
|
||||||
|
|
||||||
|
eqe_count = mlx5_eq_poll_irq_disabled(eq);
|
||||||
|
if (!eqe_count)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
netdev_err(dev, "Recover %d eqes on EQ 0x%x\n", eqe_count, eq->core.eqn);
|
||||||
|
sq->channel->stats->eq_rearm++;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
static void mlx5e_tx_timeout_work(struct work_struct *work)
|
static void mlx5e_tx_timeout_work(struct work_struct *work)
|
||||||
{
|
{
|
||||||
struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
|
struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
|
||||||
tx_timeout_work);
|
tx_timeout_work);
|
||||||
int i;
|
struct net_device *dev = priv->netdev;
|
||||||
|
bool reopen_channels = false;
|
||||||
if (!priv->tx_reporter)
|
int i, err;
|
||||||
return;
|
|
||||||
|
|
||||||
rtnl_lock();
|
rtnl_lock();
|
||||||
mutex_lock(&priv->state_lock);
|
mutex_lock(&priv->state_lock);
|
||||||
@@ -4090,15 +4211,35 @@ static void mlx5e_tx_timeout_work(struct work_struct *work)
|
|||||||
goto unlock;
|
goto unlock;
|
||||||
|
|
||||||
for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; i++) {
|
for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; i++) {
|
||||||
struct netdev_queue *dev_queue =
|
struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, i);
|
||||||
netdev_get_tx_queue(priv->netdev, i);
|
|
||||||
struct mlx5e_txqsq *sq = priv->txq2sq[i];
|
struct mlx5e_txqsq *sq = priv->txq2sq[i];
|
||||||
|
|
||||||
if (!netif_xmit_stopped(dev_queue))
|
if (!netif_xmit_stopped(dev_queue))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
mlx5e_tx_reporter_timeout(sq);
|
netdev_err(dev,
|
||||||
|
"TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n",
|
||||||
|
i, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
|
||||||
|
jiffies_to_usecs(jiffies - dev_queue->trans_start));
|
||||||
|
|
||||||
|
/* If we recover a lost interrupt, most likely TX timeout will
|
||||||
|
* be resolved, skip reopening channels
|
||||||
|
*/
|
||||||
|
if (!mlx5e_tx_timeout_eq_recover(dev, sq)) {
|
||||||
|
clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
|
||||||
|
reopen_channels = true;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!reopen_channels)
|
||||||
|
goto unlock;
|
||||||
|
|
||||||
|
mlx5e_close_locked(dev);
|
||||||
|
err = mlx5e_open_locked(dev);
|
||||||
|
if (err)
|
||||||
|
netdev_err(priv->netdev,
|
||||||
|
"mlx5e_open_locked failed recovering from a tx_timeout, err(%d).\n",
|
||||||
|
err);
|
||||||
|
|
||||||
unlock:
|
unlock:
|
||||||
mutex_unlock(&priv->state_lock);
|
mutex_unlock(&priv->state_lock);
|
||||||
@@ -4767,7 +4908,6 @@ static int mlx5e_init_nic_tx(struct mlx5e_priv *priv)
|
|||||||
#ifdef CONFIG_MLX5_CORE_EN_DCB
|
#ifdef CONFIG_MLX5_CORE_EN_DCB
|
||||||
mlx5e_dcbnl_initialize(priv);
|
mlx5e_dcbnl_initialize(priv);
|
||||||
#endif
|
#endif
|
||||||
mlx5e_tx_reporter_create(priv);
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -514,7 +514,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
|
|||||||
mlx5e_dump_error_cqe(sq,
|
mlx5e_dump_error_cqe(sq,
|
||||||
(struct mlx5_err_cqe *)cqe);
|
(struct mlx5_err_cqe *)cqe);
|
||||||
queue_work(cq->channel->priv->wq,
|
queue_work(cq->channel->priv->wq,
|
||||||
&sq->recover_work);
|
&sq->recover.recover_work);
|
||||||
}
|
}
|
||||||
stats->cqe_err++;
|
stats->cqe_err++;
|
||||||
}
|
}
|
||||||
|
@@ -30,7 +30,6 @@ struct devlink {
|
|||||||
struct list_head param_list;
|
struct list_head param_list;
|
||||||
struct list_head region_list;
|
struct list_head region_list;
|
||||||
u32 snapshot_id;
|
u32 snapshot_id;
|
||||||
struct list_head reporter_list;
|
|
||||||
struct devlink_dpipe_headers *dpipe_headers;
|
struct devlink_dpipe_headers *dpipe_headers;
|
||||||
const struct devlink_ops *ops;
|
const struct devlink_ops *ops;
|
||||||
struct device *dev;
|
struct device *dev;
|
||||||
@@ -424,36 +423,6 @@ struct devlink_region;
|
|||||||
|
|
||||||
typedef void devlink_snapshot_data_dest_t(const void *data);
|
typedef void devlink_snapshot_data_dest_t(const void *data);
|
||||||
|
|
||||||
struct devlink_health_buffer;
|
|
||||||
struct devlink_health_reporter;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* struct devlink_health_reporter_ops - Reporter operations
|
|
||||||
* @name: reporter name
|
|
||||||
* dump_size: dump buffer size allocated by the devlink
|
|
||||||
* diagnose_size: diagnose buffer size allocated by the devlink
|
|
||||||
* recover: callback to recover from reported error
|
|
||||||
* if priv_ctx is NULL, run a full recover
|
|
||||||
* dump: callback to dump an object
|
|
||||||
* if priv_ctx is NULL, run a full dump
|
|
||||||
* diagnose: callback to diagnose the current status
|
|
||||||
*/
|
|
||||||
|
|
||||||
struct devlink_health_reporter_ops {
|
|
||||||
char *name;
|
|
||||||
unsigned int dump_size;
|
|
||||||
unsigned int diagnose_size;
|
|
||||||
int (*recover)(struct devlink_health_reporter *reporter,
|
|
||||||
void *priv_ctx);
|
|
||||||
int (*dump)(struct devlink_health_reporter *reporter,
|
|
||||||
struct devlink_health_buffer **buffers_array,
|
|
||||||
unsigned int buffer_size, unsigned int num_buffers,
|
|
||||||
void *priv_ctx);
|
|
||||||
int (*diagnose)(struct devlink_health_reporter *reporter,
|
|
||||||
struct devlink_health_buffer **buffers_array,
|
|
||||||
unsigned int buffer_size, unsigned int num_buffers);
|
|
||||||
};
|
|
||||||
|
|
||||||
struct devlink_ops {
|
struct devlink_ops {
|
||||||
int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack);
|
int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack);
|
||||||
int (*port_type_set)(struct devlink_port *devlink_port,
|
int (*port_type_set)(struct devlink_port *devlink_port,
|
||||||
@@ -615,34 +584,6 @@ int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
|
|||||||
u8 *data, u32 snapshot_id,
|
u8 *data, u32 snapshot_id,
|
||||||
devlink_snapshot_data_dest_t *data_destructor);
|
devlink_snapshot_data_dest_t *data_destructor);
|
||||||
|
|
||||||
int devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer,
|
|
||||||
int attrtype);
|
|
||||||
void devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer);
|
|
||||||
void devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer);
|
|
||||||
int devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer,
|
|
||||||
char *name);
|
|
||||||
int devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer,
|
|
||||||
u8 value);
|
|
||||||
int devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer,
|
|
||||||
u32 value);
|
|
||||||
int devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer,
|
|
||||||
u64 value);
|
|
||||||
int devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer,
|
|
||||||
char *name);
|
|
||||||
int devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer,
|
|
||||||
void *data, int len);
|
|
||||||
struct devlink_health_reporter *
|
|
||||||
devlink_health_reporter_create(struct devlink *devlink,
|
|
||||||
const struct devlink_health_reporter_ops *ops,
|
|
||||||
u64 graceful_period, bool auto_recover,
|
|
||||||
void *priv);
|
|
||||||
void
|
|
||||||
devlink_health_reporter_destroy(struct devlink_health_reporter *reporter);
|
|
||||||
|
|
||||||
void *
|
|
||||||
devlink_health_reporter_priv(struct devlink_health_reporter *reporter);
|
|
||||||
int devlink_health_report(struct devlink_health_reporter *reporter,
|
|
||||||
const char *msg, void *priv_ctx);
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
|
static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
|
||||||
@@ -903,91 +844,6 @@ devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int
|
|
||||||
devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer,
|
|
||||||
int attrtype)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void
|
|
||||||
devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void
|
|
||||||
devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline int
|
|
||||||
devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer,
|
|
||||||
char *name)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline int
|
|
||||||
devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer,
|
|
||||||
u8 value)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline int
|
|
||||||
devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer,
|
|
||||||
u32 value)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline int
|
|
||||||
devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer,
|
|
||||||
u64 value)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline int
|
|
||||||
devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer,
|
|
||||||
char *name)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline int
|
|
||||||
devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer,
|
|
||||||
void *data, int len)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline struct devlink_health_reporter *
|
|
||||||
devlink_health_reporter_create(struct devlink *devlink,
|
|
||||||
const struct devlink_health_reporter_ops *ops,
|
|
||||||
u64 graceful_period, bool auto_recover,
|
|
||||||
void *priv)
|
|
||||||
{
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void
|
|
||||||
devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void *
|
|
||||||
devlink_health_reporter_priv(struct devlink_health_reporter *reporter)
|
|
||||||
{
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline int
|
|
||||||
devlink_health_report(struct devlink_health_reporter *reporter,
|
|
||||||
const char *msg, void *priv_ctx)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif /* _NET_DEVLINK_H_ */
|
#endif /* _NET_DEVLINK_H_ */
|
||||||
|
@@ -46,65 +46,6 @@ TRACE_EVENT(devlink_hwmsg,
|
|||||||
(int) __entry->len, __get_dynamic_array(buf), __entry->len)
|
(int) __entry->len, __get_dynamic_array(buf), __entry->len)
|
||||||
);
|
);
|
||||||
|
|
||||||
TRACE_EVENT(devlink_health_report,
|
|
||||||
TP_PROTO(const struct devlink *devlink, const char *reporter_name,
|
|
||||||
const char *msg),
|
|
||||||
|
|
||||||
TP_ARGS(devlink, reporter_name, msg),
|
|
||||||
|
|
||||||
TP_STRUCT__entry(
|
|
||||||
__string(bus_name, devlink->dev->bus->name)
|
|
||||||
__string(dev_name, dev_name(devlink->dev))
|
|
||||||
__string(driver_name, devlink->dev->driver->name)
|
|
||||||
__string(reporter_name, msg)
|
|
||||||
__string(msg, msg)
|
|
||||||
),
|
|
||||||
|
|
||||||
TP_fast_assign(
|
|
||||||
__assign_str(bus_name, devlink->dev->bus->name);
|
|
||||||
__assign_str(dev_name, dev_name(devlink->dev));
|
|
||||||
__assign_str(driver_name, devlink->dev->driver->name);
|
|
||||||
__assign_str(reporter_name, reporter_name);
|
|
||||||
__assign_str(msg, msg);
|
|
||||||
),
|
|
||||||
|
|
||||||
TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: %s",
|
|
||||||
__get_str(bus_name), __get_str(dev_name),
|
|
||||||
__get_str(driver_name), __get_str(reporter_name),
|
|
||||||
__get_str(msg))
|
|
||||||
);
|
|
||||||
|
|
||||||
TRACE_EVENT(devlink_health_recover_aborted,
|
|
||||||
TP_PROTO(const struct devlink *devlink, const char *reporter_name,
|
|
||||||
bool health_state, u64 time_since_last_recover),
|
|
||||||
|
|
||||||
TP_ARGS(devlink, reporter_name, health_state, time_since_last_recover),
|
|
||||||
|
|
||||||
TP_STRUCT__entry(
|
|
||||||
__string(bus_name, devlink->dev->bus->name)
|
|
||||||
__string(dev_name, dev_name(devlink->dev))
|
|
||||||
__string(driver_name, devlink->dev->driver->name)
|
|
||||||
__string(reporter_name, reporter_name)
|
|
||||||
__field(bool, health_state)
|
|
||||||
__field(u64, time_since_last_recover)
|
|
||||||
),
|
|
||||||
|
|
||||||
TP_fast_assign(
|
|
||||||
__assign_str(bus_name, devlink->dev->bus->name);
|
|
||||||
__assign_str(dev_name, dev_name(devlink->dev));
|
|
||||||
__assign_str(driver_name, devlink->dev->driver->name);
|
|
||||||
__assign_str(reporter_name, reporter_name);
|
|
||||||
__entry->health_state = health_state;
|
|
||||||
__entry->time_since_last_recover = time_since_last_recover;
|
|
||||||
),
|
|
||||||
|
|
||||||
TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: health_state=%d time_since_last_recover = %llu recover aborted",
|
|
||||||
__get_str(bus_name), __get_str(dev_name),
|
|
||||||
__get_str(driver_name), __get_str(reporter_name),
|
|
||||||
__entry->health_state,
|
|
||||||
__entry->time_since_last_recover)
|
|
||||||
);
|
|
||||||
|
|
||||||
#endif /* _TRACE_DEVLINK_H */
|
#endif /* _TRACE_DEVLINK_H */
|
||||||
|
|
||||||
/* This part must be outside protection */
|
/* This part must be outside protection */
|
||||||
@@ -123,9 +64,6 @@ static inline void trace_devlink_hwmsg(const struct devlink *devlink,
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void trace_devlink_health(const char *msg)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
#endif /* _TRACE_DEVLINK_H */
|
#endif /* _TRACE_DEVLINK_H */
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@@ -89,13 +89,6 @@ enum devlink_command {
|
|||||||
DEVLINK_CMD_REGION_DEL,
|
DEVLINK_CMD_REGION_DEL,
|
||||||
DEVLINK_CMD_REGION_READ,
|
DEVLINK_CMD_REGION_READ,
|
||||||
|
|
||||||
DEVLINK_CMD_HEALTH_REPORTER_GET,
|
|
||||||
DEVLINK_CMD_HEALTH_REPORTER_SET,
|
|
||||||
DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
|
|
||||||
DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
|
|
||||||
DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
|
|
||||||
DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
|
|
||||||
|
|
||||||
/* add new commands above here */
|
/* add new commands above here */
|
||||||
__DEVLINK_CMD_MAX,
|
__DEVLINK_CMD_MAX,
|
||||||
DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
|
DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
|
||||||
@@ -292,24 +285,6 @@ enum devlink_attr {
|
|||||||
DEVLINK_ATTR_REGION_CHUNK_ADDR, /* u64 */
|
DEVLINK_ATTR_REGION_CHUNK_ADDR, /* u64 */
|
||||||
DEVLINK_ATTR_REGION_CHUNK_LEN, /* u64 */
|
DEVLINK_ATTR_REGION_CHUNK_LEN, /* u64 */
|
||||||
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT, /* nested */
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR, /* nested */
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME, /* string */
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE, /* nested */
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY, /* nested */
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, /* u8 */
|
|
||||||
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, /* dynamic */
|
|
||||||
|
|
||||||
DEVLINK_ATTR_HEALTH_REPORTER, /* nested */
|
|
||||||
DEVLINK_ATTR_HEALTH_REPORTER_NAME, /* string */
|
|
||||||
DEVLINK_ATTR_HEALTH_REPORTER_STATE, /* u8 */
|
|
||||||
DEVLINK_ATTR_HEALTH_REPORTER_ERR, /* u64 */
|
|
||||||
DEVLINK_ATTR_HEALTH_REPORTER_RECOVER, /* u64 */
|
|
||||||
DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL, /* u8 */
|
|
||||||
DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, /* u64 */
|
|
||||||
DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, /* u64 */
|
|
||||||
DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, /* u8 */
|
|
||||||
|
|
||||||
/* add new attributes above here, update the policy in devlink.c */
|
/* add new attributes above here, update the policy in devlink.c */
|
||||||
|
|
||||||
__DEVLINK_ATTR_MAX,
|
__DEVLINK_ATTR_MAX,
|
||||||
|
1058
net/core/devlink.c
1058
net/core/devlink.c
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user