mtl/ofi: Fix erroneous FI_PEEK/FI_CLAIM usage
The current iprobe/improbe implementations merely checks the return code on the posted receive operation to tell if there is a match or not. This commit moves the check to the probe's error callback instead. Per the semantics defined in libfabric, the peek operation is asynchronous and the results are to be fetched from the completion queue. If no message is found matching the tags specified in the peek request, then a completion queue error entry with err field set to FI_ENOMSG will be available. Signed-off-by: Raghu Raja <craghun@amazon.com>
Этот коммит содержится в:
родитель
9afcb8e2ab
Коммит
39f8a86b65
@ -991,10 +991,20 @@ __opal_attribute_always_inline__ static inline int
|
||||
ompi_mtl_ofi_probe_error_callback(struct fi_cq_err_entry *error,
|
||||
ompi_mtl_ofi_request_t *ofi_req)
|
||||
{
|
||||
ofi_req->status.MPI_ERROR = MPI_ERR_INTERN;
|
||||
ofi_req->completion_count--;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
/*
|
||||
* Receives posted with FI_PEEK and friends will get an error
|
||||
* completion with FI_ENOMSG. This just indicates the lack of a match for
|
||||
* the probe and is not an error case. All other error cases are
|
||||
* provider-internal errors and should be flagged as such.
|
||||
*/
|
||||
if (error->err == FI_ENOMSG)
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
ofi_req->status.MPI_ERROR = MPI_ERR_INTERN;
|
||||
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
__opal_attribute_always_inline__ static inline int
|
||||
@ -1039,7 +1049,6 @@ ompi_mtl_ofi_iprobe_generic(struct mca_mtl_base_module_t *mtl,
|
||||
/**
|
||||
* fi_trecvmsg with FI_PEEK:
|
||||
* Initiate a search for a match in the hardware or software queue.
|
||||
* The search can complete immediately with -ENOMSG.
|
||||
* If successful, libfabric will enqueue a context entry into the completion
|
||||
* queue to make the search nonblocking. This code will poll until the
|
||||
* entry is enqueued.
|
||||
@ -1060,13 +1069,7 @@ ompi_mtl_ofi_iprobe_generic(struct mca_mtl_base_module_t *mtl,
|
||||
ofi_req.match_state = 0;
|
||||
|
||||
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
|
||||
if (-FI_ENOMSG == ret) {
|
||||
/**
|
||||
* The search request completed but no matching message was found.
|
||||
*/
|
||||
*flag = 0;
|
||||
return OMPI_SUCCESS;
|
||||
} else if (OPAL_UNLIKELY(0 > ret)) {
|
||||
if (OPAL_UNLIKELY(0 > ret)) {
|
||||
MTL_OFI_LOG_FI_ERR(ret, "fi_trecvmsg failed");
|
||||
return ompi_mtl_ofi_get_error(ret);
|
||||
}
|
||||
@ -1136,7 +1139,6 @@ ompi_mtl_ofi_improbe_generic(struct mca_mtl_base_module_t *mtl,
|
||||
/**
|
||||
* fi_trecvmsg with FI_PEEK and FI_CLAIM:
|
||||
* Initiate a search for a match in the hardware or software queue.
|
||||
* The search can complete immediately with -ENOMSG.
|
||||
* If successful, libfabric will enqueue a context entry into the completion
|
||||
* queue to make the search nonblocking. This code will poll until the
|
||||
* entry is enqueued.
|
||||
@ -1158,14 +1160,7 @@ ompi_mtl_ofi_improbe_generic(struct mca_mtl_base_module_t *mtl,
|
||||
ofi_req->mask_bits = mask_bits;
|
||||
|
||||
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
|
||||
if (-FI_ENOMSG == ret) {
|
||||
/**
|
||||
* The search request completed but no matching message was found.
|
||||
*/
|
||||
*matched = 0;
|
||||
free(ofi_req);
|
||||
return OMPI_SUCCESS;
|
||||
} else if (OPAL_UNLIKELY(0 > ret)) {
|
||||
if (OPAL_UNLIKELY(0 > ret)) {
|
||||
MTL_OFI_LOG_FI_ERR(ret, "fi_trecvmsg failed");
|
||||
free(ofi_req);
|
||||
return ompi_mtl_ofi_get_error(ret);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user