From 557599b312a229e5a69611b542cfb51d404151d5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 1 Sep 2022 11:20:54 -0300 Subject: [PATCH 1/3] RDMA/core: Add UVERBS_ATTR_RAW_FD This uses the same passing protocol as UVERBS_ATTR_FD (eg len = 0 data_s64 = fd), except that the FD is not required to be a uverbs object and the core code does not covert the FD to an object handle automatically. Access to the int fd is provided by uverbs_get_raw_fd(). Link: https://lore.kernel.org/r/2-v1-bd147097458e+ede-umem_dmabuf_jgg@nvidia.com (cherry picked from commit 015bda8abd3a6a77656e60b36d499c43a2c0f0a1) Signed-off-by: Tushar Dave Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 8 ++++++++ include/rdma/uverbs_ioctl.h | 13 +++++++++++++ 2 files changed, 21 insertions(+) diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 990f0724acc6b..d9799706c58e9 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -337,6 +337,14 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, break; + case UVERBS_ATTR_TYPE_RAW_FD: + if (uattr->attr_data.reserved || uattr->len != 0 || + uattr->data_s64 < INT_MIN || uattr->data_s64 > INT_MAX) + return -EINVAL; + /* _uverbs_get_const_signed() is the accessor */ + e->ptr_attr.data = uattr->data_s64; + break; + case UVERBS_ATTR_TYPE_IDRS_ARRAY: return uverbs_process_idrs_array(pbundle, attr_uapi, &e->objs_arr_attr, uattr, diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 23bb404aba12c..9d45a5b203169 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -24,6 +24,7 @@ enum uverbs_attr_type { UVERBS_ATTR_TYPE_PTR_OUT, UVERBS_ATTR_TYPE_IDR, UVERBS_ATTR_TYPE_FD, + UVERBS_ATTR_TYPE_RAW_FD, UVERBS_ATTR_TYPE_ENUM_IN, UVERBS_ATTR_TYPE_IDRS_ARRAY, }; @@ -521,6 +522,11 @@ struct uapi_definition { .u.obj.access = _access, \ __VA_ARGS__ } }) +#define UVERBS_ATTR_RAW_FD(_attr_id, ...) \ + (&(const struct uverbs_attr_def){ \ + .id = (_attr_id), \ + .attr = { .type = UVERBS_ATTR_TYPE_RAW_FD, __VA_ARGS__ } }) + #define UVERBS_ATTR_PTR_IN(_attr_id, _type, ...) \ (&(const struct uverbs_attr_def){ \ .id = _attr_id, \ @@ -999,4 +1005,11 @@ _uverbs_get_const_unsigned(u64 *to, uverbs_get_const_default_unsigned(_to, _attrs_bundle, _idx, \ _default)) +static inline int +uverbs_get_raw_fd(int *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx) +{ + return uverbs_get_const_signed(to, attrs_bundle, idx); +} + #endif From f44f89c8fcf5a1fb2751d48d5f820fa4d0f46949 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 1 Sep 2022 11:20:55 -0300 Subject: [PATCH 2/3] RDMA/mlx5: Add support for dmabuf to devx umem This is modeled after the similar EFA enablement in commit 66f4817b5712 ("RDMA/efa: Add support for dmabuf memory regions"). Like EFA there is no support for revocation so we simply call the ib_umem_dmabuf_get_pinned() to obtain a umem instead of the normal ib_umem_get(). Everything else stays the same. Link: https://lore.kernel.org/r/3-v1-bd147097458e+ede-umem_dmabuf_jgg@nvidia.com (cherry picked from commit 9af859c58d0f169ead0ed95204cdd891b0ee623a) Signed-off-by: Tushar Dave Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 24 +++++++++++++++++++++--- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 28cf0aa61ada6..f50da0804c4b8 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -2198,9 +2198,25 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, if (err) return err; - obj->umem = ib_umem_get_peer(&dev->ib_dev, addr, size, access, 0); - if (IS_ERR(obj->umem)) - return PTR_ERR(obj->umem); + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_DMABUF_FD)) { + struct ib_umem_dmabuf *umem_dmabuf; + int dmabuf_fd; + + err = uverbs_get_raw_fd(&dmabuf_fd, attrs, + MLX5_IB_ATTR_DEVX_UMEM_REG_DMABUF_FD); + if (err) + return -EFAULT; + + umem_dmabuf = ib_umem_dmabuf_get_pinned( + &dev->ib_dev, addr, size, dmabuf_fd, access); + if (IS_ERR(umem_dmabuf)) + return PTR_ERR(umem_dmabuf); + obj->umem = &umem_dmabuf->umem; + } else { + obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access); + if (IS_ERR(obj->umem)) + return PTR_ERR(obj->umem); + } return 0; } @@ -2852,6 +2868,8 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_LEN, UVERBS_ATTR_TYPE(u64), UA_MANDATORY), + UVERBS_ATTR_RAW_FD(MLX5_IB_ATTR_DEVX_UMEM_REG_DMABUF_FD, + UA_OPTIONAL), UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, enum ib_access_flags), UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_PGSZ_BITMAP, diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index e539c84d63f1a..0af52cc0d4621 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -174,6 +174,7 @@ enum mlx5_ib_devx_umem_reg_attrs { MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, MLX5_IB_ATTR_DEVX_UMEM_REG_PGSZ_BITMAP, + MLX5_IB_ATTR_DEVX_UMEM_REG_DMABUF_FD, }; enum mlx5_ib_devx_umem_dereg_attrs { From 7bc3d815e92a343486cfea2e8d0a3d36db648be2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 1 Sep 2022 11:20:56 -0300 Subject: [PATCH 3/3] RDMA/mlx5: Enable ATS support for MRs and umems For mlx5 if ATS is enabled in the PCI config then the device will use ATS requests for only certain DMA operations. This has to be opted in by the SW side based on the mkey or umem settings. ATS slows down the PCI performance, so it should only be set in cases when it is needed. All of these cases revolve around optimizing PCI P2P transfers and avoiding bad cases where the bus just doesn't work. Link: https://lore.kernel.org/r/4-v1-bd147097458e+ede-umem_dmabuf_jgg@nvidia.com (cherry picked from commit 72b2f7608a59727e7c2e5b11cff2749c2c080fac) Signed-off-by: Tushar Dave Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 37 ++++++++++++++++------------ drivers/infiniband/hw/mlx5/mlx5_ib.h | 36 +++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/mr.c | 6 ++++- 3 files changed, 62 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index f50da0804c4b8..27efe40b8b527 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -2175,26 +2175,17 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)( static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, struct uverbs_attr_bundle *attrs, - struct devx_umem *obj) + struct devx_umem *obj, u32 access_flags) { u64 addr; size_t size; - u32 access; int err; if (uverbs_copy_from(&addr, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR) || uverbs_copy_from(&size, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_LEN)) return -EFAULT; - err = uverbs_get_flags32(&access, attrs, - MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, - IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ); - if (err) - return err; - - err = ib_check_mr_access(&dev->ib_dev, access); + err = ib_check_mr_access(&dev->ib_dev, access_flags); if (err) return err; @@ -2208,12 +2199,12 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, return -EFAULT; umem_dmabuf = ib_umem_dmabuf_get_pinned( - &dev->ib_dev, addr, size, dmabuf_fd, access); + &dev->ib_dev, addr, size, dmabuf_fd, access_flags); if (IS_ERR(umem_dmabuf)) return PTR_ERR(umem_dmabuf); obj->umem = &umem_dmabuf->umem; } else { - obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access); + obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access_flags); if (IS_ERR(obj->umem)) return PTR_ERR(obj->umem); } @@ -2255,7 +2246,8 @@ static unsigned int devx_umem_find_best_pgsize(struct ib_umem *umem, static int devx_umem_reg_cmd_alloc(struct mlx5_ib_dev *dev, struct uverbs_attr_bundle *attrs, struct devx_umem *obj, - struct devx_umem_reg_cmd *cmd) + struct devx_umem_reg_cmd *cmd, + int access) { unsigned long pgsz_bitmap; unsigned int page_size; @@ -2306,6 +2298,9 @@ static int devx_umem_reg_cmd_alloc(struct mlx5_ib_dev *dev, if (obj->umem->is_peer) MLX5_SET(umem, umem, ats, MLX5_CAP_GEN(dev->mdev, ats)); + if (mlx5_umem_needs_ats(dev, obj->umem, access)) + MLX5_SET(umem, umem, ats, 1); + mlx5_ib_populate_pas(obj->umem, page_size, mtt, (obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) | MLX5_IB_MTT_READ); @@ -2323,20 +2318,30 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)( struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + int access_flags; int err; if (!c->devx_uid) return -EINVAL; + err = uverbs_get_flags32(&access_flags, attrs, + MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_RELAXED_ORDERING); + if (err) + return err; + obj = kzalloc(sizeof(struct devx_umem), GFP_KERNEL); if (!obj) return -ENOMEM; - err = devx_umem_get(dev, &c->ibucontext, attrs, obj); + err = devx_umem_get(dev, &c->ibucontext, attrs, obj, access_flags); if (err) goto err_obj_free; - err = devx_umem_reg_cmd_alloc(dev, attrs, obj, &cmd); + err = devx_umem_reg_cmd_alloc(dev, attrs, obj, &cmd, access_flags); if (err) goto err_umem_release; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index bf20a388eabe1..6886cf90fab7d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1615,4 +1615,40 @@ static inline bool rt_supported(int ts_cap) return ts_cap == MLX5_TIMESTAMP_FORMAT_CAP_REAL_TIME || ts_cap == MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME; } + +/* + * PCI Peer to Peer is a trainwreck. If no switch is present then things + * sometimes work, depending on the pci_distance_p2p logic for excluding broken + * root complexes. However if a switch is present in the path, then things get + * really ugly depending on how the switch is setup. This table assumes that the + * root complex is strict and is validating that all req/reps are matches + * perfectly - so any scenario where it sees only half the transaction is a + * failure. + * + * CR/RR/DT ATS RO P2P + * 00X X X OK + * 010 X X fails (request is routed to root but root never sees comp) + * 011 0 X fails (request is routed to root but root never sees comp) + * 011 1 X OK + * 10X X 1 OK + * 101 X 0 fails (completion is routed to root but root didn't see req) + * 110 X 0 SLOW + * 111 0 0 SLOW + * 111 1 0 fails (completion is routed to root but root didn't see req) + * 111 1 1 OK + * + * Unfortunately we cannot reliably know if a switch is present or what the + * CR/RR/DT ACS settings are, as in a VM that is all hidden. Assume that + * CR/RR/DT is 111 if the ATS cap is enabled and follow the last three rows. + * + * For now assume if the umem is a dma_buf then it is P2P. + */ +static inline bool mlx5_umem_needs_ats(struct mlx5_ib_dev *dev, + struct ib_umem *umem, int access_flags) +{ + if (!MLX5_CAP_GEN(dev->mdev, ats) || !umem->is_dmabuf) + return false; + return access_flags & IB_ACCESS_RELAXED_ORDERING; +} + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index b938093ffa3fb..406076e353d79 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -960,7 +960,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, * cache then synchronously create an uncached one. */ if (!ent || ent->limit == 0 || - !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) { + !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags) || + mlx5_umem_needs_ats(dev, umem, access_flags)) { mutex_lock(&dev->slow_path_mutex); mr = reg_create(pd, umem, iova, access_flags, page_size, false); mutex_unlock(&dev->slow_path_mutex); @@ -1334,6 +1335,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, MLX5_SET(mkc, mkc, translations_octword_size, get_octo_len(iova, umem->length, mr->page_shift)); MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); + if (mlx5_umem_needs_ats(dev, umem, access_flags)) + MLX5_SET(mkc, mkc, ma_translation_mode, 1); + if (umem->is_peer) MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));