Merge branch 'odp_fixes' into rdma.git for-next

Jason Gunthorpe says:

====================
This is a collection of general cleanups for ODP to clarify some of the
flows around umem creation and use of the interval tree.
====================

The branch is based on v5.3-rc5 due to dependencies

* odp_fixes:
  RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr
  RDMA/mlx5: Use ib_umem_start instead of umem.address
  RDMA/core: Make invalidate_range a device operation
  RDMA/odp: Use kvcalloc for the dma_list and page_list
  RDMA/odp: Check for overflow when computing the umem_odp end
  RDMA/odp: Provide ib_umem_odp_release() to undo the allocs
  RDMA/odp: Split creating a umem_odp from ib_umem_get
  RDMA/odp: Make the three ways to create a umem_odp clear
  RMDA/odp: Consolidate umem_odp initialization
  RDMA/odp: Make it clearer when a umem is an implicit ODP umem
  RDMA/odp: Iterate over the whole rbtree directly
  RDMA/odp: Use the common interval tree library instead of generic
  RDMA/mlx5: Fix MR npages calculation for IB_ACCESS_HUGETLB

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
此提交包含在:
Jason Gunthorpe
2019-08-21 14:10:36 -03:00
當前提交 868df536f5
共有 1452 個檔案被更改,包括 13706 行新增和 10317 行刪除

查看文件

@@ -184,7 +184,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
for (i = 0; i < nentries; i++, pklm++) {
pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
va = (offset + i) * MLX5_IMR_MTT_SIZE;
if (odp && odp->umem.address == va) {
if (odp && ib_umem_start(odp) == va) {
struct mlx5_ib_mr *mtt = odp->private;
pklm->key = cpu_to_be32(mtt->ibmr.lkey);
@@ -206,7 +206,7 @@ static void mr_leaf_free_action(struct work_struct *work)
mr->parent = NULL;
synchronize_srcu(&mr->dev->mr_srcu);
ib_umem_release(&odp->umem);
ib_umem_odp_release(odp);
if (imr->live)
mlx5_ib_update_xlt(imr, idx, 1, 0,
MLX5_IB_UPD_XLT_INDIRECT |
@@ -384,7 +384,7 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
}
static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
struct ib_umem *umem,
struct ib_umem_odp *umem_odp,
bool ksm, int access_flags)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
@@ -402,7 +402,7 @@ static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
mr->dev = dev;
mr->access_flags = access_flags;
mr->mmkey.iova = 0;
mr->umem = umem;
mr->umem = &umem_odp->umem;
if (ksm) {
err = mlx5_ib_update_xlt(mr, 0,
@@ -462,18 +462,17 @@ next_mr:
if (nentries)
nentries++;
} else {
odp = ib_alloc_odp_umem(odp_mr, addr,
MLX5_IMR_MTT_SIZE);
odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE);
if (IS_ERR(odp)) {
mutex_unlock(&odp_mr->umem_mutex);
return ERR_CAST(odp);
}
mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0,
mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0,
mr->access_flags);
if (IS_ERR(mtt)) {
mutex_unlock(&odp_mr->umem_mutex);
ib_umem_release(&odp->umem);
ib_umem_odp_release(odp);
return ERR_CAST(mtt);
}
@@ -495,7 +494,7 @@ next_mr:
addr += MLX5_IMR_MTT_SIZE;
if (unlikely(addr < io_virt + bcnt)) {
odp = odp_next(odp);
if (odp && odp->umem.address != addr)
if (odp && ib_umem_start(odp) != addr)
odp = NULL;
goto next_mr;
}
@@ -519,19 +518,19 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
int access_flags)
{
struct mlx5_ib_mr *imr;
struct ib_umem *umem;
struct ib_umem_odp *umem_odp;
umem = ib_umem_get(udata, 0, 0, access_flags, 0);
if (IS_ERR(umem))
return ERR_CAST(umem);
umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags);
if (IS_ERR(umem_odp))
return ERR_CAST(umem_odp);
imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags);
if (IS_ERR(imr)) {
ib_umem_release(umem);
ib_umem_odp_release(umem_odp);
return ERR_CAST(imr);
}
imr->umem = umem;
imr->umem = &umem_odp->umem;
init_waitqueue_head(&imr->q_leaf_free);
atomic_set(&imr->num_leaf_free, 0);
atomic_set(&imr->num_pending_prefetch, 0);
@@ -539,34 +538,31 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
return imr;
}
static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end,
void *cookie)
{
struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie;
if (mr->parent != imr)
return 0;
ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
ib_umem_end(umem_odp));
if (umem_odp->dying)
return 0;
WRITE_ONCE(umem_odp->dying, 1);
atomic_inc(&imr->num_leaf_free);
schedule_work(&umem_odp->work);
return 0;
}
void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
struct rb_node *node;
down_read(&per_mm->umem_rwsem);
rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX,
mr_leaf_free, true, imr);
for (node = rb_first_cached(&per_mm->umem_tree); node;
node = rb_next(node)) {
struct ib_umem_odp *umem_odp =
rb_entry(node, struct ib_umem_odp, interval_tree.rb);
struct mlx5_ib_mr *mr = umem_odp->private;
if (mr->parent != imr)
continue;
ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
ib_umem_end(umem_odp));
if (umem_odp->dying)
continue;
WRITE_ONCE(umem_odp->dying, 1);
atomic_inc(&imr->num_leaf_free);
schedule_work(&umem_odp->work);
}
up_read(&per_mm->umem_rwsem);
wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
@@ -579,7 +575,6 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
u32 flags)
{
int npages = 0, current_seq, page_shift, ret, np;
bool implicit = false;
struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
@@ -588,13 +583,12 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
struct ib_umem_odp *odp;
size_t size;
if (!odp_mr->page_list) {
if (odp_mr->is_implicit_odp) {
odp = implicit_mr_get_data(mr, io_virt, bcnt);
if (IS_ERR(odp))
return PTR_ERR(odp);
mr = odp->private;
implicit = true;
} else {
odp = odp_mr;
}
@@ -607,7 +601,7 @@ next_mr:
start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
access_mask = ODP_READ_ALLOWED_BIT;
if (prefetch && !downgrade && !mr->umem->writable) {
if (prefetch && !downgrade && !odp->umem.writable) {
/* prefetch with write-access must
* be supported by the MR
*/
@@ -615,7 +609,7 @@ next_mr:
goto out;
}
if (mr->umem->writable && !downgrade)
if (odp->umem.writable && !downgrade)
access_mask |= ODP_WRITE_ALLOWED_BIT;
current_seq = READ_ONCE(odp->notifiers_seq);
@@ -625,8 +619,8 @@ next_mr:
*/
smp_rmb();
ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size,
access_mask, current_seq);
ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask,
current_seq);
if (ret < 0)
goto out;
@@ -634,8 +628,7 @@ next_mr:
np = ret;
mutex_lock(&odp->umem_mutex);
if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem),
current_seq)) {
if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
/*
* No need to check whether the MTTs really belong to
* this MR, since ib_umem_odp_map_dma_pages already
@@ -668,7 +661,7 @@ next_mr:
io_virt += size;
next = odp_next(odp);
if (unlikely(!next || next->umem.address != io_virt)) {
if (unlikely(!next || ib_umem_start(next) != io_virt)) {
mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
io_virt, next);
return -EAGAIN;
@@ -682,19 +675,15 @@ next_mr:
out:
if (ret == -EAGAIN) {
if (implicit || !odp->dying) {
unsigned long timeout =
msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
if (!wait_for_completion_timeout(
&odp->notifier_completion,
timeout)) {
mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
current_seq, odp->notifiers_seq, odp->notifiers_count);
}
} else {
/* The MR is being killed, kill the QP as well. */
ret = -EFAULT;
if (!wait_for_completion_timeout(&odp->notifier_completion,
timeout)) {
mlx5_ib_warn(
dev,
"timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
current_seq, odp->notifiers_seq,
odp->notifiers_count);
}
}
@@ -1598,6 +1587,7 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
.advise_mr = mlx5_ib_advise_mr,
.invalidate_range = mlx5_ib_invalidate_range,
};
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)