From c43f1112c068f3b4b20a0a9d461c341d9caeb376 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 18 Jan 2017 14:10:33 +0200 Subject: IB/mlx5: Add additional checks before processing MADs Check the has_smi bit in vport context and class version of MADs before allowing MADs processing to take place. MAD_IFC SMI commands can be executed only if smi bit is set. Fixes: e126ba97dba9 ('mlx5: Add driver for Mellanox Connect-IB adapters') Signed-off-by: Maor Gottlieb Signed-off-by: Parvi Kaustubhi Reviewed-by: Eli Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 3a309f6a4a15..b8d69aeb1784 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -289,6 +289,7 @@ struct mlx5_port_caps { int gid_table_len; int pkey_table_len; u8 ext_port_cap; + bool has_smi; }; struct mlx5_cmd_mailbox { -- cgit v1.2.3 From 23a6964e3adb0796e1633562a574839b92360cb6 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Wed, 18 Jan 2017 15:25:10 +0200 Subject: IB/mlx5: Add port counter support for Receive WQs Counters weren't updated due to Receive WQs' traffic since the counter-id was not associated with the RQ. Added support for associating the q-counter-id with the Receive WQ. The attachment is done only when changing WQ's state from RESET to READY in modify-WQ command. FW support is required for the above, without this support Receive WQ counters will not count. Signed-off-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/qp.c | 12 +++++++++++- include/linux/mlx5/mlx5_ifc.h | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 5c7d655655bb..f395ee9d2fea 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2429,7 +2429,7 @@ static int modify_raw_packet_qp_rq(struct mlx5_ib_dev *dev, if (raw_qp_param->set_mask & MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID) { if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) { MLX5_SET64(modify_rq_in, in, modify_bitmask, - MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_MODIFY_RQ_COUNTER_SET_ID); + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID); MLX5_SET(rqc, rqc, counter_set_id, raw_qp_param->rq_q_ctr_id); } else pr_info_once("%s: RAW PACKET QP counters are not supported on current FW\n", @@ -4910,6 +4910,16 @@ int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state); MLX5_SET(rqc, rqc, state, wq_state); + if (curr_wq_state == IB_WQS_RESET && wq_state == IB_WQS_RDY) { + if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) { + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID); + MLX5_SET(rqc, rqc, counter_set_id, dev->port->q_cnts.set_id); + } else + pr_info_once("%s: Receive WQ counters are not supported on current FW\n", + dev->ib_dev.name); + } + err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in, inlen); kvfree(in); if (!err) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 37327f6ba9cb..2d197d8a7025 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4937,7 +4937,7 @@ struct mlx5_ifc_modify_rq_out_bits { enum { MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD = 1ULL << 1, - MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_MODIFY_RQ_COUNTER_SET_ID = 1ULL << 3, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID = 1ULL << 3, }; struct mlx5_ifc_modify_rq_in_bits { -- cgit v1.2.3 From 49780d42dfc9ec0f4090c32ca59688449da1a1cd Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Wed, 18 Jan 2017 16:58:10 +0200 Subject: IB/mlx5: Expose MR cache for mlx5_ib Allow other parts of mlx5_ib to use MR cache mechanism. * Add new functions mlx5_mr_cache_alloc and mlx5_mr_cache_free * Traditional MTT MKey buckets are limited by MAX_UMR_CACHE_ENTRY Additinal buckets may be added above. Signed-off-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 9 +++- drivers/infiniband/hw/mlx5/mr.c | 99 ++++++++++++++++++++++++++---------- include/linux/mlx5/driver.h | 3 +- 3 files changed, 82 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 13bef19649f2..efc44de3c7d7 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -541,6 +541,10 @@ struct mlx5_cache_ent { struct dentry *dir; char name[4]; u32 order; + u32 xlt; + u32 access_mode; + u32 page; + u32 size; u32 cur; u32 miss; @@ -555,6 +559,7 @@ struct mlx5_cache_ent { struct work_struct work; struct delayed_work dwork; int pending; + struct completion compl; }; struct mlx5_mr_cache { @@ -837,7 +842,9 @@ void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); -int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); + +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry); +void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 8cf2a67f9fb0..8f5b94d483e4 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -49,6 +49,7 @@ enum { static int clean_mr(struct mlx5_ib_mr *mr); static int use_umr(struct mlx5_ib_dev *dev, int order); +static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { @@ -149,6 +150,9 @@ static void reg_mr_callback(int status, void *context) if (err) pr_err("Error inserting to mkey tree. 0x%x\n", -err); write_unlock_irqrestore(&table->lock, flags); + + if (!completion_done(&ent->compl)) + complete(&ent->compl); } static int add_keys(struct mlx5_ib_dev *dev, int c, int num) @@ -157,7 +161,6 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) struct mlx5_cache_ent *ent = &cache->ent[c]; int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_ib_mr *mr; - int npages = 1 << ent->order; void *mkc; u32 *in; int err = 0; @@ -185,11 +188,11 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) MLX5_SET(mkc, mkc, free, 1); MLX5_SET(mkc, mkc, umr_en, 1); - MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, access_mode, ent->access_mode); MLX5_SET(mkc, mkc, qpn, 0xffffff); - MLX5_SET(mkc, mkc, translations_octword_size, (npages + 1) / 2); - MLX5_SET(mkc, mkc, log_page_size, 12); + MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt); + MLX5_SET(mkc, mkc, log_page_size, ent->page); spin_lock_irq(&ent->lock); ent->pending++; @@ -447,6 +450,42 @@ static void cache_work_func(struct work_struct *work) __cache_work_func(ent); } +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + struct mlx5_ib_mr *mr; + int err; + + if (entry < 0 || entry >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_err(dev, "cache entry %d is out of range\n", entry); + return NULL; + } + + ent = &cache->ent[entry]; + while (1) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + + err = add_keys(dev, entry, 1); + if (err) + return ERR_PTR(err); + + wait_for_completion(&ent->compl); + } else { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, + list); + list_del(&mr->list); + ent->cur--; + spin_unlock_irq(&ent->lock); + if (ent->cur < ent->limit) + queue_work(cache->wq, &ent->work); + return mr; + } + } +} + static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) { struct mlx5_mr_cache *cache = &dev->cache; @@ -456,12 +495,12 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) int i; c = order2idx(dev, order); - if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + if (c < 0 || c > MAX_UMR_CACHE_ENTRY) { mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); return NULL; } - for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) { + for (i = c; i < MAX_UMR_CACHE_ENTRY; i++) { ent = &cache->ent[i]; mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); @@ -488,7 +527,7 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) return mr; } -static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; @@ -500,6 +539,10 @@ static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c); return; } + + if (unreg_umr(dev, mr)) + return; + ent = &cache->ent[c]; spin_lock_irq(&ent->lock); list_add_tail(&mr->list, &ent->head); @@ -602,7 +645,6 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; - int limit; int err; int i; @@ -615,26 +657,33 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev); for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { - INIT_LIST_HEAD(&cache->ent[i].head); - spin_lock_init(&cache->ent[i].lock); - ent = &cache->ent[i]; INIT_LIST_HEAD(&ent->head); spin_lock_init(&ent->lock); ent->order = i + 2; ent->dev = dev; + ent->limit = 0; - if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) && - mlx5_core_is_pf(dev->mdev) && - use_umr(dev, ent->order)) - limit = dev->mdev->profile->mr_cache[i].limit; - else - limit = 0; - + init_completion(&ent->compl); INIT_WORK(&ent->work, cache_work_func); INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); - ent->limit = limit; queue_work(cache->wq, &ent->work); + + if (i > MAX_UMR_CACHE_ENTRY) + continue; + + if (!use_umr(dev, ent->order)) + continue; + + ent->page = PAGE_SHIFT; + ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) && + mlx5_core_is_pf(dev->mdev)) + ent->limit = dev->mdev->profile->mr_cache[i].limit; + else + ent->limit = 0; } err = mlx5_mr_cache_debugfs_init(dev); @@ -758,7 +807,7 @@ static int get_octo_len(u64 addr, u64 len, int page_size) static int use_umr(struct mlx5_ib_dev *dev, int order) { if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) - return order < MAX_MR_CACHE_ENTRIES + 2; + return order <= MAX_UMR_CACHE_ENTRY + 2; return order <= MLX5_MAX_UMR_SHIFT; } @@ -871,7 +920,7 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, MLX5_IB_UPD_XLT_ENABLE); if (err) { - free_cached_mr(dev, mr); + mlx5_mr_cache_free(dev, mr); return ERR_PTR(err); } @@ -1091,6 +1140,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, goto err_2; } mr->mmkey.type = MLX5_MKEY_MR; + mr->desc_size = sizeof(struct mlx5_mtt); mr->umem = umem; mr->dev = dev; mr->live = 1; @@ -1398,12 +1448,7 @@ static int clean_mr(struct mlx5_ib_mr *mr) return err; } } else { - err = unreg_umr(dev, mr); - if (err) { - mlx5_ib_warn(dev, "failed unregister\n"); - return err; - } - free_cached_mr(dev, mr); + mlx5_mr_cache_free(dev, mr); } if (!umred) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b8d69aeb1784..2534b8a0fd7b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1052,7 +1052,8 @@ enum { }; enum { - MAX_MR_CACHE_ENTRIES = 21, + MAX_UMR_CACHE_ENTRY = 20, + MAX_MR_CACHE_ENTRIES }; enum { -- cgit v1.2.3 From 81713d3788d2e6bc005f15ee1c59d0eb06050a6b Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Wed, 18 Jan 2017 16:58:11 +0200 Subject: IB/mlx5: Add implicit MR support Add implicit MR, covering entire user address space. The MR is implemented as an indirect KSM MR consisting of 1GB direct MRs. Pages and direct MRs are added/removed to MR by ODP. Signed-off-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 2 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 20 +- drivers/infiniband/hw/mlx5/mr.c | 33 ++- drivers/infiniband/hw/mlx5/odp.c | 505 ++++++++++++++++++++++++++++++++--- include/linux/mlx5/driver.h | 2 + 5 files changed, 513 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index fe37da2be26f..eb8719ca500e 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3583,6 +3583,8 @@ static int __init mlx5_ib_init(void) { int err; + mlx5_ib_odp_init(); + err = mlx5_register_interface(&mlx5_ib_interface); return err; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index efc44de3c7d7..3cd064b5f0bf 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -202,6 +202,7 @@ struct mlx5_ib_flow_db { #define MLX5_IB_UPD_XLT_ADDR BIT(3) #define MLX5_IB_UPD_XLT_PD BIT(4) #define MLX5_IB_UPD_XLT_ACCESS BIT(5) +#define MLX5_IB_UPD_XLT_INDIRECT BIT(6) /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. * @@ -503,6 +504,10 @@ struct mlx5_ib_mr { int live; void *descs_alloc; int access_flags; /* Needed for rereg MR */ + + struct mlx5_ib_mr *parent; + atomic_t num_leaf_free; + wait_queue_head_t q_leaf_free; }; struct mlx5_ib_mw { @@ -637,6 +642,7 @@ struct mlx5_ib_dev { * being used by a page fault handler. */ struct srcu_struct mr_srcu; + u32 null_mkey; #endif struct mlx5_ib_flow_db flow_db; /* protect resources needed as part of reset flow */ @@ -789,6 +795,9 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, int mlx5_ib_dealloc_mw(struct ib_mw *mw); int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, int page_shift, int flags); +struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + int access_flags); +void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_pd *pd, struct ib_udata *udata); @@ -868,6 +877,9 @@ int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, unsigned long end); +void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent); +void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, + size_t nentries, struct mlx5_ib_mr *mr, int flags); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) { @@ -875,9 +887,13 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) } static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } -static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} +static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} static inline int mlx5_ib_odp_init(void) { return 0; } -static inline void mlx5_ib_odp_cleanup(void) {} +static inline void mlx5_ib_odp_cleanup(void) {} +static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {} +static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, + size_t nentries, struct mlx5_ib_mr *mr, + int flags) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 8f5b94d483e4..3c1f483d003f 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -469,7 +469,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry) spin_unlock_irq(&ent->lock); err = add_keys(dev, entry, 1); - if (err) + if (err && err != -EAGAIN) return ERR_PTR(err); wait_for_completion(&ent->compl); @@ -669,8 +669,10 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); queue_work(cache->wq, &ent->work); - if (i > MAX_UMR_CACHE_ENTRY) + if (i > MAX_UMR_CACHE_ENTRY) { + mlx5_odp_init_mr_cache_entry(ent); continue; + } if (!use_umr(dev, ent->order)) continue; @@ -935,6 +937,10 @@ static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages, { struct mlx5_ib_dev *dev = mr->dev; struct ib_umem *umem = mr->umem; + if (flags & MLX5_IB_UPD_XLT_INDIRECT) { + mlx5_odp_populate_klm(xlt, idx, npages, mr, flags); + return npages; + } npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx); @@ -968,7 +974,9 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, struct mlx5_umr_wr wr; struct ib_sge sg; int err = 0; - int desc_size = sizeof(struct mlx5_mtt); + int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT) + ? sizeof(struct mlx5_klm) + : sizeof(struct mlx5_mtt); const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size; const int page_mask = page_align - 1; size_t pages_mapped = 0; @@ -1186,6 +1194,18 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", start, virt_addr, length, access_flags); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (!start && length == U64_MAX) { + if (!(access_flags & IB_ACCESS_ON_DEMAND) || + !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return ERR_PTR(-EINVAL); + + mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); + return &mr->ibmr; + } +#endif + err = mr_umem_get(pd, start, length, access_flags, &umem, &npages, &page_shift, &ncont, &order); @@ -1471,8 +1491,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) /* Wait for all running page-fault handlers to finish. */ synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ - mlx5_ib_invalidate_range(umem, ib_umem_start(umem), - ib_umem_end(umem)); + if (umem->odp_data->page_list) + mlx5_ib_invalidate_range(umem, ib_umem_start(umem), + ib_umem_end(umem)); + else + mlx5_ib_free_implicit_mr(mr); /* * We kill the umem before the MR for ODP, * so that there will not be any invalidations in diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index e5bc267aca73..d7b12f0750e2 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -34,6 +34,7 @@ #include #include "mlx5_ib.h" +#include "cmd.h" #define MAX_PREFETCH_LEN (4*1024*1024U) @@ -41,6 +42,140 @@ * a pagefault. */ #define MMU_NOTIFIER_TIMEOUT 1000 +#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT) +#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT) +#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS) +#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT) +#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1)) + +#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT + +static u64 mlx5_imr_ksm_entries; + +static int check_parent(struct ib_umem_odp *odp, + struct mlx5_ib_mr *parent) +{ + struct mlx5_ib_mr *mr = odp->private; + + return mr && mr->parent == parent; +} + +static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) +{ + struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; + struct ib_ucontext *ctx = odp->umem->context; + struct rb_node *rb; + + down_read(&ctx->umem_rwsem); + while (1) { + rb = rb_next(&odp->interval_tree.rb); + if (!rb) + goto not_found; + odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); + if (check_parent(odp, parent)) + goto end; + } +not_found: + odp = NULL; +end: + up_read(&ctx->umem_rwsem); + return odp; +} + +static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, + u64 start, u64 length, + struct mlx5_ib_mr *parent) +{ + struct ib_umem_odp *odp; + struct rb_node *rb; + + down_read(&ctx->umem_rwsem); + odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length); + if (!odp) + goto end; + + while (1) { + if (check_parent(odp, parent)) + goto end; + rb = rb_next(&odp->interval_tree.rb); + if (!rb) + goto not_found; + odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); + if (ib_umem_start(odp->umem) > start + length) + goto not_found; + } +not_found: + odp = NULL; +end: + up_read(&ctx->umem_rwsem); + return odp; +} + +void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, + size_t nentries, struct mlx5_ib_mr *mr, int flags) +{ + struct ib_pd *pd = mr->ibmr.pd; + struct ib_ucontext *ctx = pd->uobject->context; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem_odp *odp; + unsigned long va; + int i; + + if (flags & MLX5_IB_UPD_XLT_ZAP) { + for (i = 0; i < nentries; i++, pklm++) { + pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); + pklm->key = cpu_to_be32(dev->null_mkey); + pklm->va = 0; + } + return; + } + + odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE, + nentries * MLX5_IMR_MTT_SIZE, mr); + + for (i = 0; i < nentries; i++, pklm++) { + pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); + va = (offset + i) * MLX5_IMR_MTT_SIZE; + if (odp && odp->umem->address == va) { + struct mlx5_ib_mr *mtt = odp->private; + + pklm->key = cpu_to_be32(mtt->ibmr.lkey); + odp = odp_next(odp); + } else { + pklm->key = cpu_to_be32(dev->null_mkey); + } + mlx5_ib_dbg(dev, "[%d] va %lx key %x\n", + i, va, be32_to_cpu(pklm->key)); + } +} + +static void mr_leaf_free_action(struct work_struct *work) +{ + struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); + int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT; + struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; + + mr->parent = NULL; + synchronize_srcu(&mr->dev->mr_srcu); + + if (!READ_ONCE(odp->dying)) { + mr->parent = imr; + if (atomic_dec_and_test(&imr->num_leaf_free)) + wake_up(&imr->q_leaf_free); + return; + } + + ib_umem_release(odp->umem); + if (imr->live) + mlx5_ib_update_xlt(imr, idx, 1, 0, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ATOMIC); + mlx5_mr_cache_free(mr->dev, mr); + + if (atomic_dec_and_test(&imr->num_leaf_free)) + wake_up(&imr->q_leaf_free); +} + void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, unsigned long end) { @@ -111,6 +246,13 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, */ ib_umem_odp_unmap_dma_pages(umem, start, end); + + if (unlikely(!umem->npages && mr->parent && + !umem->odp_data->dying)) { + WRITE_ONCE(umem->odp_data->dying, 1); + atomic_inc(&mr->parent->num_leaf_free); + schedule_work(&umem->odp_data->work); + } } void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) @@ -147,6 +289,11 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; + if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && + MLX5_CAP_GEN(dev->mdev, null_mkey) && + MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT; + return; } @@ -184,6 +331,197 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, wq_num); } +static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, + struct ib_umem *umem, + bool ksm, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr; + int err; + + mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY : + MLX5_IMR_MTT_CACHE_ENTRY); + + if (IS_ERR(mr)) + return mr; + + mr->ibmr.pd = pd; + + mr->dev = dev; + mr->access_flags = access_flags; + mr->mmkey.iova = 0; + mr->umem = umem; + + if (ksm) { + err = mlx5_ib_update_xlt(mr, 0, + mlx5_imr_ksm_entries, + MLX5_KSM_PAGE_SHIFT, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE); + + } else { + err = mlx5_ib_update_xlt(mr, 0, + MLX5_IMR_MTT_ENTRIES, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE | + MLX5_IB_UPD_XLT_ATOMIC); + } + + if (err) + goto fail; + + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + + mr->live = 1; + + mlx5_ib_dbg(dev, "key %x dev %p mr %p\n", + mr->mmkey.key, dev->mdev, mr); + + return mr; + +fail: + mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); + mlx5_mr_cache_free(dev, mr); + + return ERR_PTR(err); +} + +static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, + u64 io_virt, size_t bcnt) +{ + struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context; + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); + struct ib_umem_odp *odp, *result = NULL; + u64 addr = io_virt & MLX5_IMR_MTT_MASK; + int nentries = 0, start_idx = 0, ret; + struct mlx5_ib_mr *mtt; + struct ib_umem *umem; + + mutex_lock(&mr->umem->odp_data->umem_mutex); + odp = odp_lookup(ctx, addr, 1, mr); + + mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", + io_virt, bcnt, addr, odp); + +next_mr: + if (likely(odp)) { + if (nentries) + nentries++; + } else { + umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); + if (IS_ERR(umem)) { + mutex_unlock(&mr->umem->odp_data->umem_mutex); + return ERR_CAST(umem); + } + + mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags); + if (IS_ERR(mtt)) { + mutex_unlock(&mr->umem->odp_data->umem_mutex); + ib_umem_release(umem); + return ERR_CAST(mtt); + } + + odp = umem->odp_data; + odp->private = mtt; + mtt->umem = umem; + mtt->mmkey.iova = addr; + mtt->parent = mr; + INIT_WORK(&odp->work, mr_leaf_free_action); + + if (!nentries) + start_idx = addr >> MLX5_IMR_MTT_SHIFT; + nentries++; + } + + odp->dying = 0; + + /* Return first odp if region not covered by single one */ + if (likely(!result)) + result = odp; + + addr += MLX5_IMR_MTT_SIZE; + if (unlikely(addr < io_virt + bcnt)) { + odp = odp_next(odp); + if (odp && odp->umem->address != addr) + odp = NULL; + goto next_mr; + } + + if (unlikely(nentries)) { + ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ATOMIC); + if (ret) { + mlx5_ib_err(dev, "Failed to update PAS\n"); + result = ERR_PTR(ret); + } + } + + mutex_unlock(&mr->umem->odp_data->umem_mutex); + return result; +} + +struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + int access_flags) +{ + struct ib_ucontext *ctx = pd->ibpd.uobject->context; + struct mlx5_ib_mr *imr; + struct ib_umem *umem; + + umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0); + if (IS_ERR(umem)) + return ERR_CAST(umem); + + imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags); + if (IS_ERR(imr)) { + ib_umem_release(umem); + return ERR_CAST(imr); + } + + imr->umem = umem; + init_waitqueue_head(&imr->q_leaf_free); + atomic_set(&imr->num_leaf_free, 0); + + return imr; +} + +static int mr_leaf_free(struct ib_umem *umem, u64 start, + u64 end, void *cookie) +{ + struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie; + + if (mr->parent != imr) + return 0; + + ib_umem_odp_unmap_dma_pages(umem, + ib_umem_start(umem), + ib_umem_end(umem)); + + if (umem->odp_data->dying) + return 0; + + WRITE_ONCE(umem->odp_data->dying, 1); + atomic_inc(&imr->num_leaf_free); + schedule_work(&umem->odp_data->work); + + return 0; +} + +void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) +{ + struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context; + + down_read(&ctx->umem_rwsem); + rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX, + mr_leaf_free, imr); + up_read(&ctx->umem_rwsem); + + wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); +} + /* * Handle a single data segment in a page-fault WQE or RDMA region. * @@ -195,47 +533,43 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, * -EFAULT when there's an error mapping the requested pages. The caller will * abort the page fault handling. */ -static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev, +static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 key, u64 io_virt, size_t bcnt, u32 *bytes_committed, u32 *bytes_mapped) { int srcu_key; - unsigned int current_seq; + unsigned int current_seq = 0; u64 start_idx; int npages = 0, ret = 0; struct mlx5_ib_mr *mr; u64 access_mask = ODP_READ_ALLOWED_BIT; + struct ib_umem_odp *odp; + int implicit = 0; + size_t size; - srcu_key = srcu_read_lock(&mib_dev->mr_srcu); - mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key); + srcu_key = srcu_read_lock(&dev->mr_srcu); + mr = mlx5_ib_odp_find_mr_lkey(dev, key); /* * If we didn't find the MR, it means the MR was closed while we were * handling the ODP event. In this case we return -EFAULT so that the * QP will be closed. */ if (!mr || !mr->ibmr.pd) { - pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n", - key); + mlx5_ib_dbg(dev, "Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n", + key); ret = -EFAULT; goto srcu_unlock; } if (!mr->umem->odp_data) { - pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", - key); + mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); if (bytes_mapped) *bytes_mapped += (bcnt - *bytes_committed); goto srcu_unlock; } - current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq); - /* - * Ensure the sequence number is valid for some time before we call - * gup. - */ - smp_rmb(); - /* * Avoid branches - this code will perform correctly * in all iterations (in iteration 2 and above, @@ -244,63 +578,109 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev, io_virt += *bytes_committed; bcnt -= *bytes_committed; + if (!mr->umem->odp_data->page_list) { + odp = implicit_mr_get_data(mr, io_virt, bcnt); + + if (IS_ERR(odp)) { + ret = PTR_ERR(odp); + goto srcu_unlock; + } + mr = odp->private; + implicit = 1; + + } else { + odp = mr->umem->odp_data; + } + +next_mr: + current_seq = READ_ONCE(odp->notifiers_seq); + /* + * Ensure the sequence number is valid for some time before we call + * gup. + */ + smp_rmb(); + + size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt); start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT; if (mr->umem->writable) access_mask |= ODP_WRITE_ALLOWED_BIT; - npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt, - access_mask, current_seq); - if (npages < 0) { - ret = npages; + + ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size, + access_mask, current_seq); + + if (ret < 0) goto srcu_unlock; - } - if (npages > 0) { - mutex_lock(&mr->umem->odp_data->umem_mutex); + if (ret > 0) { + int np = ret; + + mutex_lock(&odp->umem_mutex); if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { /* * No need to check whether the MTTs really belong to * this MR, since ib_umem_odp_map_dma_pages already * checks this. */ - ret = mlx5_ib_update_xlt(mr, start_idx, npages, + ret = mlx5_ib_update_xlt(mr, start_idx, np, PAGE_SHIFT, MLX5_IB_UPD_XLT_ATOMIC); } else { ret = -EAGAIN; } - mutex_unlock(&mr->umem->odp_data->umem_mutex); + mutex_unlock(&odp->umem_mutex); if (ret < 0) { if (ret != -EAGAIN) - pr_err("Failed to update mkey page tables\n"); + mlx5_ib_err(dev, "Failed to update mkey page tables\n"); goto srcu_unlock; } if (bytes_mapped) { - u32 new_mappings = npages * PAGE_SIZE - + u32 new_mappings = np * PAGE_SIZE - (io_virt - round_down(io_virt, PAGE_SIZE)); - *bytes_mapped += min_t(u32, new_mappings, bcnt); + *bytes_mapped += min_t(u32, new_mappings, size); } + + npages += np; + } + + bcnt -= size; + if (unlikely(bcnt)) { + struct ib_umem_odp *next; + + io_virt += size; + next = odp_next(odp); + if (unlikely(!next || next->umem->address != io_virt)) { + mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", + io_virt, next); + ret = -EAGAIN; + goto srcu_unlock_no_wait; + } + odp = next; + mr = odp->private; + goto next_mr; } srcu_unlock: if (ret == -EAGAIN) { - if (!mr->umem->odp_data->dying) { - struct ib_umem_odp *odp_data = mr->umem->odp_data; + if (implicit || !odp->dying) { unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); if (!wait_for_completion_timeout( - &odp_data->notifier_completion, + &odp->notifier_completion, timeout)) { - pr_warn("timeout waiting for mmu notifier completion\n"); + mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n", + current_seq, odp->notifiers_seq); } } else { /* The MR is being killed, kill the QP as well. */ ret = -EFAULT; } } - srcu_read_unlock(&mib_dev->mr_srcu, srcu_key); + +srcu_unlock_no_wait: + srcu_read_unlock(&dev->mr_srcu, srcu_key); *bytes_committed = 0; return ret ? ret : npages; } @@ -618,8 +998,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, goto resolve_page_fault; } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { if (ret != -ENOENT) - mlx5_ib_err(dev, "Error getting user pages for page fault. Error: %d\n", - ret); + mlx5_ib_err(dev, "PAGE FAULT error: %d. QP 0x%x. type: 0x%x\n", + ret, pfault->wqe.wq_num, pfault->type); goto resolve_page_fault; } @@ -627,7 +1007,7 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, resolve_page_fault: mlx5_ib_page_fault_resume(dev, pfault, resume_with_error); mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n", - pfault->token, resume_with_error, + pfault->wqe.wq_num, resume_with_error, pfault->type); free_page((unsigned long)buffer); } @@ -700,10 +1080,9 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, ret = pagefault_single_data_segment(dev, rkey, address, prefetch_len, &bytes_committed, NULL); - if (ret < 0) { + if (ret < 0 && ret != -EAGAIN) { mlx5_ib_warn(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", - ret, pfault->token, address, - prefetch_len); + ret, pfault->token, address, prefetch_len); } } } @@ -728,19 +1107,61 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, } } -int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) +void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) +{ + if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return; + + switch (ent->order - 2) { + case MLX5_IMR_MTT_CACHE_ENTRY: + ent->page = PAGE_SHIFT; + ent->xlt = MLX5_IMR_MTT_ENTRIES * + sizeof(struct mlx5_mtt) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + ent->limit = 0; + break; + + case MLX5_IMR_KSM_CACHE_ENTRY: + ent->page = MLX5_KSM_PAGE_SHIFT; + ent->xlt = mlx5_imr_ksm_entries * + sizeof(struct mlx5_klm) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM; + ent->limit = 0; + break; + } +} + +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) { int ret; - ret = init_srcu_struct(&ibdev->mr_srcu); + ret = init_srcu_struct(&dev->mr_srcu); if (ret) return ret; + if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { + ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); + if (ret) { + mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret); + return ret; + } + } + return 0; } -void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) +void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *dev) +{ + cleanup_srcu_struct(&dev->mr_srcu); +} + +int mlx5_ib_odp_init(void) { - cleanup_srcu_struct(&ibdev->mr_srcu); + mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - + MLX5_IMR_MTT_BITS); + + return 0; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2534b8a0fd7b..886ff2b00500 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1053,6 +1053,8 @@ enum { enum { MAX_UMR_CACHE_ENTRY = 20, + MLX5_IMR_MTT_CACHE_ENTRY, + MLX5_IMR_KSM_CACHE_ENTRY, MAX_MR_CACHE_ENTRIES }; -- cgit v1.2.3