{net,IB}/mlx5: Move Page fault EQ and ODP logic to RDMA
Use the new generic EQ API to move all ODP RDMA data structures and logic from the mlx5 core driver into the mlx5_ib driver. Signed-off-by: Saeed Mahameed <saeedm@mellanox.com> Reviewed-by: Leon Romanovsky <leonro@mellanox.com> Reviewed-by: Tariq Toukan <tariqt@mellanox.com> Acked-by: Jason Gunthorpe <jgg@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
This commit is contained in:

committed by
Leon Romanovsky

parent
7701707cb9
commit
d5d284b829
@@ -6040,6 +6040,11 @@ static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
|
||||
return mlx5_ib_odp_init_one(dev);
|
||||
}
|
||||
|
||||
/*
 * ODP stage teardown hook: hands the device to mlx5_ib_odp_cleanup_one(),
 * which releases whatever the ODP init stage set up for it.
 */
void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
{
	mlx5_ib_odp_cleanup_one(dev);
}
|
||||
|
||||
int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
|
||||
{
|
||||
if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
|
||||
@@ -6225,7 +6230,7 @@ static const struct mlx5_ib_profile pf_profile = {
|
||||
mlx5_ib_stage_dev_res_cleanup),
|
||||
STAGE_CREATE(MLX5_IB_STAGE_ODP,
|
||||
mlx5_ib_stage_odp_init,
|
||||
NULL),
|
||||
mlx5_ib_stage_odp_cleanup),
|
||||
STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
|
||||
mlx5_ib_stage_counters_init,
|
||||
mlx5_ib_stage_counters_cleanup),
|
||||
@@ -6395,9 +6400,6 @@ static struct mlx5_interface mlx5_ib_interface = {
|
||||
.add = mlx5_ib_add,
|
||||
.remove = mlx5_ib_remove,
|
||||
.event = mlx5_ib_event,
|
||||
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
|
||||
.pfault = mlx5_ib_pfault,
|
||||
#endif
|
||||
.protocol = MLX5_INTERFACE_PROTOCOL_IB,
|
||||
};
|
||||
|
||||
|
@@ -880,6 +880,15 @@ struct mlx5_ib_lb_state {
|
||||
bool enabled;
|
||||
};
|
||||
|
||||
struct mlx5_ib_pf_eq {
|
||||
struct mlx5_ib_dev *dev;
|
||||
struct mlx5_eq *core;
|
||||
struct work_struct work;
|
||||
spinlock_t lock; /* Pagefaults spinlock */
|
||||
struct workqueue_struct *wq;
|
||||
mempool_t *pool;
|
||||
};
|
||||
|
||||
struct mlx5_ib_dev {
|
||||
struct ib_device ib_dev;
|
||||
const struct uverbs_object_tree_def *driver_trees[7];
|
||||
@@ -902,6 +911,8 @@ struct mlx5_ib_dev {
|
||||
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
|
||||
struct ib_odp_caps odp_caps;
|
||||
u64 odp_max_size;
|
||||
struct mlx5_ib_pf_eq odp_pf_eq;
|
||||
|
||||
/*
|
||||
* Sleepable RCU that prevents destruction of MRs while they are still
|
||||
* being used by a page fault handler.
|
||||
@@ -1158,9 +1169,8 @@ struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
|
||||
|
||||
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
|
||||
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev);
|
||||
void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
|
||||
struct mlx5_pagefault *pfault);
|
||||
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
|
||||
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
|
||||
int __init mlx5_ib_odp_init(void);
|
||||
void mlx5_ib_odp_cleanup(void);
|
||||
void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
|
||||
@@ -1175,6 +1185,7 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
|
||||
}
|
||||
|
||||
static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
|
||||
static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
|
||||
static inline int mlx5_ib_odp_init(void) { return 0; }
|
||||
static inline void mlx5_ib_odp_cleanup(void) {}
|
||||
static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
|
||||
|
@@ -37,6 +37,46 @@
|
||||
#include "mlx5_ib.h"
|
||||
#include "cmd.h"
|
||||
|
||||
#include <linux/mlx5/eq.h>
|
||||
|
||||
/* Contains the details of a pagefault. */
|
||||
struct mlx5_pagefault {
|
||||
u32 bytes_committed;
|
||||
u32 token;
|
||||
u8 event_subtype;
|
||||
u8 type;
|
||||
union {
|
||||
/* Initiator or send message responder pagefault details. */
|
||||
struct {
|
||||
/* Received packet size, only valid for responders. */
|
||||
u32 packet_size;
|
||||
/*
|
||||
* Number of resource holding WQE, depends on type.
|
||||
*/
|
||||
u32 wq_num;
|
||||
/*
|
||||
* WQE index. Refers to either the send queue or
|
||||
* receive queue, according to event_subtype.
|
||||
*/
|
||||
u16 wqe_index;
|
||||
} wqe;
|
||||
/* RDMA responder pagefault details */
|
||||
struct {
|
||||
u32 r_key;
|
||||
/*
|
||||
* Received packet size, minimal size page fault
|
||||
* resolution required for forward progress.
|
||||
*/
|
||||
u32 packet_size;
|
||||
u32 rdma_op_len;
|
||||
u64 rdma_va;
|
||||
} rdma;
|
||||
};
|
||||
|
||||
struct mlx5_ib_pf_eq *eq;
|
||||
struct work_struct work;
|
||||
};
|
||||
|
||||
#define MAX_PREFETCH_LEN (4*1024*1024U)
|
||||
|
||||
/* Timeout in ms to wait for an active mmu notifier to complete when handling
|
||||
@@ -304,14 +344,20 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
|
||||
{
|
||||
int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
|
||||
pfault->wqe.wq_num : pfault->token;
|
||||
int ret = mlx5_core_page_fault_resume(dev->mdev,
|
||||
pfault->token,
|
||||
wq_num,
|
||||
pfault->type,
|
||||
error);
|
||||
if (ret)
|
||||
mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n",
|
||||
wq_num);
|
||||
u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = { };
|
||||
u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = { };
|
||||
int err;
|
||||
|
||||
MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
|
||||
MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
|
||||
MLX5_SET(page_fault_resume_in, in, token, pfault->token);
|
||||
MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
|
||||
MLX5_SET(page_fault_resume_in, in, error, !!error);
|
||||
|
||||
err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
|
||||
if (err)
|
||||
mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
|
||||
wq_num, err);
|
||||
}
|
||||
|
||||
static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
|
||||
@@ -1196,10 +1242,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
|
||||
}
|
||||
}
|
||||
|
||||
void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
|
||||
struct mlx5_pagefault *pfault)
|
||||
static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
|
||||
{
|
||||
struct mlx5_ib_dev *dev = context;
|
||||
u8 event_subtype = pfault->event_subtype;
|
||||
|
||||
switch (event_subtype) {
|
||||
@@ -1216,6 +1260,203 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
|
||||
}
|
||||
}
|
||||
|
||||
static void mlx5_ib_eqe_pf_action(struct work_struct *work)
|
||||
{
|
||||
struct mlx5_pagefault *pfault = container_of(work,
|
||||
struct mlx5_pagefault,
|
||||
work);
|
||||
struct mlx5_ib_pf_eq *eq = pfault->eq;
|
||||
|
||||
mlx5_ib_pfault(eq->dev, pfault);
|
||||
mempool_free(pfault, eq->pool);
|
||||
}
|
||||
|
||||
static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
|
||||
{
|
||||
struct mlx5_eqe_page_fault *pf_eqe;
|
||||
struct mlx5_pagefault *pfault;
|
||||
struct mlx5_eqe *eqe;
|
||||
int cc = 0;
|
||||
|
||||
while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
|
||||
pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
|
||||
if (!pfault) {
|
||||
schedule_work(&eq->work);
|
||||
break;
|
||||
}
|
||||
|
||||
pf_eqe = &eqe->data.page_fault;
|
||||
pfault->event_subtype = eqe->sub_type;
|
||||
pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
|
||||
|
||||
mlx5_ib_dbg(eq->dev,
|
||||
"PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
|
||||
eqe->sub_type, pfault->bytes_committed);
|
||||
|
||||
switch (eqe->sub_type) {
|
||||
case MLX5_PFAULT_SUBTYPE_RDMA:
|
||||
/* RDMA based event */
|
||||
pfault->type =
|
||||
be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
|
||||
pfault->token =
|
||||
be32_to_cpu(pf_eqe->rdma.pftype_token) &
|
||||
MLX5_24BIT_MASK;
|
||||
pfault->rdma.r_key =
|
||||
be32_to_cpu(pf_eqe->rdma.r_key);
|
||||
pfault->rdma.packet_size =
|
||||
be16_to_cpu(pf_eqe->rdma.packet_length);
|
||||
pfault->rdma.rdma_op_len =
|
||||
be32_to_cpu(pf_eqe->rdma.rdma_op_len);
|
||||
pfault->rdma.rdma_va =
|
||||
be64_to_cpu(pf_eqe->rdma.rdma_va);
|
||||
mlx5_ib_dbg(eq->dev,
|
||||
"PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
|
||||
pfault->type, pfault->token,
|
||||
pfault->rdma.r_key);
|
||||
mlx5_ib_dbg(eq->dev,
|
||||
"PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
|
||||
pfault->rdma.rdma_op_len,
|
||||
pfault->rdma.rdma_va);
|
||||
break;
|
||||
|
||||
case MLX5_PFAULT_SUBTYPE_WQE:
|
||||
/* WQE based event */
|
||||
pfault->type =
|
||||
(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
|
||||
pfault->token =
|
||||
be32_to_cpu(pf_eqe->wqe.token);
|
||||
pfault->wqe.wq_num =
|
||||
be32_to_cpu(pf_eqe->wqe.pftype_wq) &
|
||||
MLX5_24BIT_MASK;
|
||||
pfault->wqe.wqe_index =
|
||||
be16_to_cpu(pf_eqe->wqe.wqe_index);
|
||||
pfault->wqe.packet_size =
|
||||
be16_to_cpu(pf_eqe->wqe.packet_length);
|
||||
mlx5_ib_dbg(eq->dev,
|
||||
"PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
|
||||
pfault->type, pfault->token,
|
||||
pfault->wqe.wq_num,
|
||||
pfault->wqe.wqe_index);
|
||||
break;
|
||||
|
||||
default:
|
||||
mlx5_ib_warn(eq->dev,
|
||||
"Unsupported page fault event sub-type: 0x%02hhx\n",
|
||||
eqe->sub_type);
|
||||
/* Unsupported page faults should still be
|
||||
* resolved by the page fault handler
|
||||
*/
|
||||
}
|
||||
|
||||
pfault->eq = eq;
|
||||
INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
|
||||
queue_work(eq->wq, &pfault->work);
|
||||
|
||||
cc = mlx5_eq_update_cc(eq->core, ++cc);
|
||||
}
|
||||
|
||||
mlx5_eq_update_ci(eq->core, cc, 1);
|
||||
}
|
||||
|
||||
static irqreturn_t mlx5_ib_eq_pf_int(int irq, void *eq_ptr)
|
||||
{
|
||||
struct mlx5_ib_pf_eq *eq = eq_ptr;
|
||||
unsigned long flags;
|
||||
|
||||
if (spin_trylock_irqsave(&eq->lock, flags)) {
|
||||
mlx5_ib_eq_pf_process(eq);
|
||||
spin_unlock_irqrestore(&eq->lock, flags);
|
||||
} else {
|
||||
schedule_work(&eq->work);
|
||||
}
|
||||
|
||||
return IRQ_HANDLED;
|
||||
}
|
||||
|
||||
/* mempool_refill() was proposed but unfortunately wasn't accepted
|
||||
* http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
|
||||
* Cheap workaround.
|
||||
*/
|
||||
static void mempool_refill(mempool_t *pool)
|
||||
{
|
||||
while (pool->curr_nr < pool->min_nr)
|
||||
mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
|
||||
}
|
||||
|
||||
static void mlx5_ib_eq_pf_action(struct work_struct *work)
|
||||
{
|
||||
struct mlx5_ib_pf_eq *eq =
|
||||
container_of(work, struct mlx5_ib_pf_eq, work);
|
||||
|
||||
mempool_refill(eq->pool);
|
||||
|
||||
spin_lock_irq(&eq->lock);
|
||||
mlx5_ib_eq_pf_process(eq);
|
||||
spin_unlock_irq(&eq->lock);
|
||||
}
|
||||
|
||||
/* Sizing constants for the page fault event queue. */
enum {
	/* Number of entries requested for the page fault EQ. */
	MLX5_IB_NUM_PF_EQE = 0x1000,
	/* Minimum number of fault descriptors kept in the mempool. */
	MLX5_IB_NUM_PF_DRAIN = 64,
};
|
||||
|
||||
static int
|
||||
mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
|
||||
{
|
||||
struct mlx5_eq_param param = {};
|
||||
int err;
|
||||
|
||||
INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
|
||||
spin_lock_init(&eq->lock);
|
||||
eq->dev = dev;
|
||||
|
||||
eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
|
||||
sizeof(struct mlx5_pagefault));
|
||||
if (!eq->pool)
|
||||
return -ENOMEM;
|
||||
|
||||
eq->wq = alloc_workqueue("mlx5_ib_page_fault",
|
||||
WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
|
||||
MLX5_NUM_CMD_EQE);
|
||||
if (!eq->wq) {
|
||||
err = -ENOMEM;
|
||||
goto err_mempool;
|
||||
}
|
||||
|
||||
param = (struct mlx5_eq_param) {
|
||||
.index = MLX5_EQ_PFAULT_IDX,
|
||||
.mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
|
||||
.nent = MLX5_IB_NUM_PF_EQE,
|
||||
.context = eq,
|
||||
.handler = mlx5_ib_eq_pf_int
|
||||
};
|
||||
eq->core = mlx5_eq_create_generic(dev->mdev, "mlx5_ib_page_fault_eq", ¶m);
|
||||
if (IS_ERR(eq->core)) {
|
||||
err = PTR_ERR(eq->core);
|
||||
goto err_wq;
|
||||
}
|
||||
|
||||
return 0;
|
||||
err_wq:
|
||||
destroy_workqueue(eq->wq);
|
||||
err_mempool:
|
||||
mempool_destroy(eq->pool);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int
|
||||
mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
|
||||
cancel_work_sync(&eq->work);
|
||||
destroy_workqueue(eq->wq);
|
||||
mempool_destroy(eq->pool);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
|
||||
{
|
||||
if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
|
||||
@@ -1244,7 +1485,7 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
|
||||
|
||||
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
|
||||
{
|
||||
int ret;
|
||||
int ret = 0;
|
||||
|
||||
if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
|
||||
ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
|
||||
@@ -1254,7 +1495,20 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
if (!MLX5_CAP_GEN(dev->mdev, pg))
|
||||
return ret;
|
||||
|
||||
ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
|
||||
{
|
||||
if (!MLX5_CAP_GEN(dev->mdev, pg))
|
||||
return;
|
||||
|
||||
mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
|
||||
}
|
||||
|
||||
int mlx5_ib_odp_init(void)
|
||||
@@ -1264,4 +1518,3 @@ int mlx5_ib_odp_init(void)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user