net/mlx5_core: Add pci error handlers to mlx5_core driver

This patch implement the pci_error_handlers for mlx5_core which allow the
driver to recover from PCI error.

Once an error is detected in the PCI, the mlx5_pci_err_detected is called
and it:
1) Marks the device to be in 'Internal Error' state.
2) Dispatches an event to the mlx5_ib to flush all the outstanding cqes
with error.
3) Returns all the on going commands with error.
4) Unloads the driver.

Afterwards, the FW is reset and mlx5_pci_slot_reset is called and it
enables the device and restore it's pci state.

If the later succeeds, mlx5_pci_resume is called, and it loads the SW
stack.

Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Majd Dibbiny
2015-10-14 17:43:46 +03:00
committed by David S. Miller
parent fd76ee4da5
commit 89d44f0a6c
6 changed files with 451 additions and 12 deletions

View File

@@ -487,8 +487,26 @@ struct mlx5_priv {
spinlock_t ctx_lock;
};
enum mlx5_device_state {
MLX5_DEVICE_STATE_UP,
MLX5_DEVICE_STATE_INTERNAL_ERROR,
};
enum mlx5_interface_state {
MLX5_INTERFACE_STATE_DOWN,
MLX5_INTERFACE_STATE_UP,
};
enum mlx5_pci_status {
MLX5_PCI_STATUS_DISABLED,
MLX5_PCI_STATUS_ENABLED,
};
struct mlx5_core_dev {
struct pci_dev *pdev;
/* sync pci state */
struct mutex pci_status_mutex;
enum mlx5_pci_status pci_status;
u8 rev_id;
char board_id[MLX5_BOARD_ID_LEN];
struct mlx5_cmd cmd;
@@ -497,6 +515,10 @@ struct mlx5_core_dev {
u32 hca_caps_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
phys_addr_t iseg_base;
struct mlx5_init_seg __iomem *iseg;
enum mlx5_device_state state;
/* sync interface state */
struct mutex intf_state_mutex;
enum mlx5_interface_state interface_state;
void (*event) (struct mlx5_core_dev *dev,
enum mlx5_dev_event event,
unsigned long param);