Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

efa: Add unsolicited RDMA write with immediate receive support #1459

Merged
merged 3 commits into from
May 12, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Prev Previous commit
Next Next commit
providers/efa: Add unsolicited RDMA write w/ imm. receive
Add EFA direct verbs support for creating QPs that do not require
explicit receive WRs to accept RDMA write with immediate. Correctly
handle completions that didn't consume a WR.
Add a related capability bit and update documentation accordingly.

Reviewed-by: Daniel Kinsbursky <[email protected]>
Reviewed-by: Daniel Kranzdorf <[email protected]>
Signed-off-by: Michael Margolin <[email protected]>
  • Loading branch information
mrgolin committed May 8, 2024
commit b56366c7108331f07c41a3d5b9cc210270a0506e
8 changes: 5 additions & 3 deletions providers/efa/efa_io_defs.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
/*
* Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
* Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved.
*/

#ifndef _EFA_IO_H_
Expand Down Expand Up @@ -208,7 +208,7 @@ struct efa_io_rx_desc {
struct efa_io_cdesc_common {
/*
* verbs-generated request ID, as provided in the completed tx or rx
* descriptor.
* descriptor.
*/
uint16_t req_id;

Expand All @@ -221,7 +221,8 @@ struct efa_io_cdesc_common {
* 3 : has_imm - indicates that immediate data is
* present - for RX completions only
* 6:4 : op_type - enum efa_io_send_op_type
* 7 : reserved31 - MBZ
* 7 : unsolicited - indicates that there is no
* matching request - for RDMA with imm. RX only
*/
uint8_t flags;

Expand Down Expand Up @@ -301,5 +302,6 @@ struct efa_io_rx_cdesc_ex {
#define EFA_IO_CDESC_COMMON_Q_TYPE_MASK GENMASK(2, 1)
#define EFA_IO_CDESC_COMMON_HAS_IMM_MASK BIT(3)
#define EFA_IO_CDESC_COMMON_OP_TYPE_MASK GENMASK(6, 4)
#define EFA_IO_CDESC_COMMON_UNSOLICITED_MASK BIT(7)

#endif /* _EFA_IO_H_ */
16 changes: 15 additions & 1 deletion providers/efa/efadv.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <stdio.h>
#include <sys/types.h>
#include <stdbool.h>

#include <infiniband/verbs.h>

Expand All @@ -24,10 +25,15 @@ struct ibv_qp *efadv_create_driver_qp(struct ibv_pd *ibvpd,
struct ibv_qp_init_attr *attr,
uint32_t driver_qp_type);

/* Flags accepted in efadv_qp_init_attr.flags */
enum {
/*
 * The RQ will not consume receive WRs for incoming RDMA write with
 * immediate; completions generated for such receives are reported as
 * unsolicited (see efadv_wc_is_unsolicited()).
 */
EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV = 1 << 0,
};

struct efadv_qp_init_attr {
uint64_t comp_mask;
uint32_t driver_qp_type;
uint8_t reserved[4];
uint16_t flags;
uint8_t reserved[2];
};

struct ibv_qp *efadv_create_qp_ex(struct ibv_context *ibvctx,
Expand All @@ -40,6 +46,7 @@ enum {
EFADV_DEVICE_ATTR_CAPS_RNR_RETRY = 1 << 1,
EFADV_DEVICE_ATTR_CAPS_CQ_WITH_SGID = 1 << 2,
EFADV_DEVICE_ATTR_CAPS_RDMA_WRITE = 1 << 3,
EFADV_DEVICE_ATTR_CAPS_UNSOLICITED_WRITE_RECV = 1 << 4,
};

struct efadv_device_attr {
Expand Down Expand Up @@ -70,10 +77,12 @@ int efadv_query_ah(struct ibv_ah *ibvah, struct efadv_ah_attr *attr,
/*
 * EFA device-specific CQ interface. The callback pointers are filled in
 * only when the CQ was created with the corresponding EFADV_WC_EX_* flag
 * in wc_flags; presumably they remain NULL otherwise — confirm against
 * the provider's CQ setup code before relying on that.
 */
struct efadv_cq {
uint64_t comp_mask;
/* read the source GID of the current completion (unknown-AH case) */
int (*wc_read_sgid)(struct efadv_cq *efadv_cq, union ibv_gid *sgid);
/* true if the current receive completion consumed no receive WR */
bool (*wc_is_unsolicited)(struct efadv_cq *efadv_cq);
};

/* wc_flags for efadv_cq_init_attr: optional WC fields to request */
enum {
/* if the source AH is unknown, require the source GID in the WC */
EFADV_WC_EX_WITH_SGID = 1 << 0,
/* enable efadv_wc_is_unsolicited() on this CQ */
EFADV_WC_EX_WITH_IS_UNSOLICITED = 1 << 1,
};

struct efadv_cq_init_attr {
Expand All @@ -94,6 +103,11 @@ static inline int efadv_wc_read_sgid(struct efadv_cq *efadv_cq,
return efadv_cq->wc_read_sgid(efadv_cq, sgid);
}

/*
 * Check whether the receive completion currently being iterated is
 * unsolicited, i.e. did not consume a receive work request (RDMA write
 * with immediate on a QP created with
 * EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV).
 *
 * Only valid on a CQ created with EFADV_WC_EX_WITH_IS_UNSOLICITED;
 * otherwise wc_is_unsolicited is never assigned by the provider and is
 * presumably NULL, so calling this would crash — confirm with the CQ
 * creation path.
 */
static inline bool efadv_wc_is_unsolicited(struct efadv_cq *efadv_cq)
{
return efadv_cq->wc_is_unsolicited(efadv_cq);
}

enum {
EFADV_MR_ATTR_VALIDITY_RECV_IC_ID = 1 << 0,
EFADV_MR_ATTR_VALIDITY_RDMA_READ_IC_ID = 1 << 1,
Expand Down
7 changes: 7 additions & 0 deletions providers/efa/man/efadv_create_cq.3.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,20 @@ struct efadv_cq_init_attr {
EFADV_WC_EX_WITH_SGID:
if source AH is unknown, require sgid in WC.

EFADV_WC_EX_WITH_IS_UNSOLICITED:
request the ability to check whether a receive WC is unsolicited.


# Completion iterator functions

*efadv_wc_read_sgid*
: Get the source GID field from the current completion.
If the AH is known, a negative error value is returned.

*efadv_wc_is_unsolicited*
: Check whether the current completion is an unsolicited receive completion, i.e. one that did not consume a receive work request.
This function is available if the CQ was created with EFADV_WC_EX_WITH_IS_UNSOLICITED.


# RETURN VALUE

Expand Down
9 changes: 8 additions & 1 deletion providers/efa/man/efadv_create_qp_ex.3.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ Compatibility is handled using the comp_mask and inlen fields.
struct efadv_qp_init_attr {
uint64_t comp_mask;
uint32_t driver_qp_type;
uint8_t reserved[4];
uint16_t flags;
uint8_t reserved[2];
};
```

Expand All @@ -60,6 +61,12 @@ struct efadv_qp_init_attr {
EFADV_QP_DRIVER_TYPE_SRD:
Create an SRD QP.

*flags*
: A bitwise OR of the values described below.

EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV:
Receive WRs will not be consumed for RDMA write with imm.

# RETURN VALUE

efadv_create_qp_ex() returns a pointer to the created QP, or NULL if the request fails.
Expand Down
6 changes: 6 additions & 0 deletions providers/efa/man/efadv_query_device.3.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ struct efadv_device_attr {
EFADV_DEVICE_ATTR_CAPS_RDMA_WRITE:
RDMA write is supported

EFADV_DEVICE_ATTR_CAPS_UNSOLICITED_WRITE_RECV:
Indicates that the device supports creating QPs that can receive unsolicited
RDMA writes with immediate. An RQ with this feature enabled will not consume any
work requests in order to receive an RDMA write with immediate, and a WC
generated for such a receive will be marked as unsolicited.

*max_rdma_size*
: Maximum RDMA transfer size in bytes.

Expand Down
63 changes: 51 additions & 12 deletions providers/efa/verbs.c
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,9 @@ int efadv_query_device(struct ibv_context *ibvctx,

if (EFA_DEV_CAP(ctx, CQ_WITH_SGID))
attr->device_caps |= EFADV_DEVICE_ATTR_CAPS_CQ_WITH_SGID;

if (EFA_DEV_CAP(ctx, UNSOLICITED_WRITE_RECV))
attr->device_caps |= EFADV_DEVICE_ATTR_CAPS_UNSOLICITED_WRITE_RECV;
}

if (vext_field_avail(typeof(*attr), max_rdma_size, inlen)) {
Expand Down Expand Up @@ -599,6 +602,13 @@ static int efa_wc_read_sgid(struct efadv_cq *efadv_cq, union ibv_gid *sgid)
return 0;
}

/*
 * efadv_cq->wc_is_unsolicited callback: report whether the completion
 * currently cached on the CQ iterator consumed no receive WR (an
 * unsolicited RDMA-write-with-immediate receive).
 */
static bool efa_wc_is_unsolicited(struct efadv_cq *efadv_cq)
{
struct efa_cq *cq = efadv_cq_to_efa_cq(efadv_cq);

/* "unsolicited" is bit 7 of the completion descriptor's flags byte */
return EFA_GET(&cq->cur_cqe->flags, EFA_IO_CDESC_COMMON_UNSOLICITED);
}

static void efa_process_cqe(struct efa_cq *cq, struct ibv_wc *wc,
struct efa_qp *qp)
{
Expand Down Expand Up @@ -653,7 +663,8 @@ static void efa_process_cqe(struct efa_cq *cq, struct ibv_wc *wc,
wc->wc_flags |= IBV_WC_WITH_IMM;
}

wc->wr_id = cq->cur_wq->wrid[wrid_idx];
wc->wr_id = !EFA_GET(&cqe->flags, EFA_IO_CDESC_COMMON_UNSOLICITED) ?
cq->cur_wq->wrid[wrid_idx] : 0;

rdma_tracepoint(rdma_core_efa, process_completion, cq->dev->name, wc->wr_id,
wc->status, wc->opcode, wc->src_qp, wc->qp_num, wc->slid,
Expand All @@ -679,7 +690,8 @@ static void efa_process_ex_cqe(struct efa_cq *cq, struct efa_qp *qp)
UINT32_MAX, UINT16_MAX, efa_wc_read_byte_len(ibvcqx));
} else {
cq->cur_wq = &qp->rq.wq;
ibvcqx->wr_id = cq->cur_wq->wrid[wrid_idx];
ibvcqx->wr_id = !EFA_GET(&cqe->flags, EFA_IO_CDESC_COMMON_UNSOLICITED) ?
cq->cur_wq->wrid[wrid_idx] : 0;
ibvcqx->status = to_ibv_status(cqe->status);

rdma_tracepoint(rdma_core_efa, process_completion, cq->dev->name, ibvcqx->wr_id,
Expand Down Expand Up @@ -722,7 +734,8 @@ static inline int efa_poll_sub_cq(struct efa_cq *cq, struct efa_sub_cq *sub_cq,
efa_process_ex_cqe(cq, *cur_qp);
} else {
efa_process_cqe(cq, wc, *cur_qp);
efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id);
if (!EFA_GET(&cq->cur_cqe->flags, EFA_IO_CDESC_COMMON_UNSOLICITED))
efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id);
}

return 0;
Expand Down Expand Up @@ -806,7 +819,8 @@ static int efa_next_poll(struct ibv_cq_ex *ibvcqx)
struct efa_cq *cq = to_efa_cq_ex(ibvcqx);
int ret;

efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id);
if (!EFA_GET(&cq->cur_cqe->flags, EFA_IO_CDESC_COMMON_UNSOLICITED))
efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id);
ret = efa_poll_sub_cqs(cq, NULL, true);

return ret;
Expand All @@ -817,7 +831,8 @@ static void efa_end_poll(struct ibv_cq_ex *ibvcqx)
struct efa_cq *cq = to_efa_cq_ex(ibvcqx);

if (cq->cur_cqe) {
efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id);
if (!EFA_GET(&cq->cur_cqe->flags, EFA_IO_CDESC_COMMON_UNSOLICITED))
efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id);
if (cq->db)
efa_update_cq_doorbell(cq, false);
}
Expand Down Expand Up @@ -856,6 +871,8 @@ static void efa_cq_fill_pfns(struct efa_cq *cq,

if (efa_attr && (efa_attr->wc_flags & EFADV_WC_EX_WITH_SGID))
cq->dv_cq.wc_read_sgid = efa_wc_read_sgid;
if (efa_attr && (efa_attr->wc_flags & EFADV_WC_EX_WITH_IS_UNSOLICITED))
cq->dv_cq.wc_is_unsolicited = efa_wc_is_unsolicited;
}

static void efa_sub_cq_initialize(struct efa_sub_cq *sub_cq, uint8_t *buf,
Expand Down Expand Up @@ -993,8 +1010,8 @@ struct ibv_cq_ex *efadv_create_cq(struct ibv_context *ibvctx,
struct efadv_cq_init_attr *efa_attr,
uint32_t inlen)
{
uint64_t supp_wc_flags = 0;
struct efa_context *ctx;
uint64_t supp_wc_flags;

if (!is_efa_dev(ibvctx->device)) {
verbs_err(verbs_get_ctx(ibvctx), "Not an EFA device\n");
Expand All @@ -1011,7 +1028,10 @@ struct ibv_cq_ex *efadv_create_cq(struct ibv_context *ibvctx,
}

ctx = to_efa_context(ibvctx);
supp_wc_flags = EFA_DEV_CAP(ctx, CQ_WITH_SGID) ? EFADV_WC_EX_WITH_SGID : 0;
if (EFA_DEV_CAP(ctx, CQ_WITH_SGID))
supp_wc_flags |= EFADV_WC_EX_WITH_SGID;
if (EFA_DEV_CAP(ctx, UNSOLICITED_WRITE_RECV))
supp_wc_flags |= EFADV_WC_EX_WITH_IS_UNSOLICITED;
if (!check_comp_mask(efa_attr->wc_flags, supp_wc_flags)) {
verbs_err(verbs_get_ctx(ibvctx),
"Invalid EFA wc_flags[%#lx]\n", efa_attr->wc_flags);
Expand Down Expand Up @@ -1322,16 +1342,20 @@ static int efa_check_qp_attr(struct efa_context *ctx,
struct ibv_qp_init_attr_ex *attr,
struct efadv_qp_init_attr *efa_attr)
{
uint64_t supp_send_ops_mask;
uint64_t supp_ud_send_ops_mask = IBV_QP_EX_WITH_SEND |
IBV_QP_EX_WITH_SEND_WITH_IMM;
IBV_QP_EX_WITH_SEND_WITH_IMM;
uint64_t supp_srd_send_ops_mask = IBV_QP_EX_WITH_SEND |
IBV_QP_EX_WITH_SEND_WITH_IMM;
uint64_t supp_send_ops_mask;
uint16_t supp_efa_flags = 0;

if (EFA_DEV_CAP(ctx, RDMA_READ))
supp_srd_send_ops_mask |= IBV_QP_EX_WITH_RDMA_READ;
if (EFA_DEV_CAP(ctx, RDMA_WRITE))
supp_srd_send_ops_mask |= IBV_QP_EX_WITH_RDMA_WRITE |
IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM;
if (EFA_DEV_CAP(ctx, UNSOLICITED_WRITE_RECV))
supp_efa_flags |= EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV;

#define EFA_CREATE_QP_SUPP_ATTR_MASK \
(IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS)
Expand All @@ -1342,6 +1366,13 @@ static int efa_check_qp_attr(struct efa_context *ctx,
return EOPNOTSUPP;
}

if (!check_comp_mask(efa_attr->flags, supp_efa_flags)) {
verbs_err(&ctx->ibvctx,
"Unsupported EFA flags[%#x] supported[%#x]\n",
efa_attr->flags, supp_efa_flags);
return EOPNOTSUPP;
}

if (!check_comp_mask(attr->comp_mask, EFA_CREATE_QP_SUPP_ATTR_MASK)) {
verbs_err(&ctx->ibvctx,
"Unsupported comp_mask[%#x] supported[%#x]\n",
Expand Down Expand Up @@ -1463,6 +1494,8 @@ static struct ibv_qp *create_qp(struct ibv_context *ibvctx,
sizeof(struct efa_io_tx_wqe);
if (attr->qp_type == IBV_QPT_DRIVER)
req.driver_qp_type = efa_attr->driver_qp_type;
if (efa_attr->flags & EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV)
req.flags |= EFA_CREATE_QP_WITH_UNSOLICITED_WRITE_RECV;

err = ibv_cmd_create_qp_ex(ibvctx, &qp->verbs_qp,
attr, &req.ibv_cmd, sizeof(req),
Expand Down Expand Up @@ -1520,6 +1553,7 @@ struct ibv_qp *efa_create_qp(struct ibv_pd *ibvpd,
struct ibv_qp_init_attr *attr)
{
struct ibv_qp_init_attr_ex attr_ex = {};
struct efadv_qp_init_attr efa_attr = {};
struct ibv_qp *ibvqp;

if (attr->qp_type != IBV_QPT_UD) {
Expand All @@ -1533,7 +1567,7 @@ struct ibv_qp *efa_create_qp(struct ibv_pd *ibvpd,
attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD;
attr_ex.pd = ibvpd;

ibvqp = create_qp(ibvpd->context, &attr_ex, NULL);
ibvqp = create_qp(ibvpd->context, &attr_ex, &efa_attr);
if (ibvqp)
memcpy(attr, &attr_ex, sizeof(*attr));

Expand All @@ -1543,13 +1577,15 @@ struct ibv_qp *efa_create_qp(struct ibv_pd *ibvpd,
struct ibv_qp *efa_create_qp_ex(struct ibv_context *ibvctx,
struct ibv_qp_init_attr_ex *attr_ex)
{
struct efadv_qp_init_attr efa_attr = {};

if (attr_ex->qp_type != IBV_QPT_UD) {
verbs_err(verbs_get_ctx(ibvctx), "Unsupported QP type\n");
errno = EOPNOTSUPP;
return NULL;
}

return create_qp(ibvctx, attr_ex, NULL);
return create_qp(ibvctx, attr_ex, &efa_attr);
}

struct ibv_qp *efadv_create_driver_qp(struct ibv_pd *ibvpd,
Expand Down Expand Up @@ -1590,6 +1626,8 @@ struct ibv_qp *efadv_create_qp_ex(struct ibv_context *ibvctx,
struct efadv_qp_init_attr *efa_attr,
uint32_t inlen)
{
struct efadv_qp_init_attr local_efa_attr = {};

if (!is_efa_dev(ibvctx->device)) {
verbs_err(verbs_get_ctx(ibvctx), "Not an EFA device\n");
errno = EOPNOTSUPP;
Expand All @@ -1607,7 +1645,8 @@ struct ibv_qp *efadv_create_qp_ex(struct ibv_context *ibvctx,
return NULL;
}

return create_qp(ibvctx, attr_ex, efa_attr);
memcpy(&local_efa_attr, efa_attr, min_t(uint32_t, inlen, sizeof(local_efa_attr)));
return create_qp(ibvctx, attr_ex, &local_efa_attr);
}

int efa_modify_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr,
Expand Down