1
1

mtl/ofi: Fix erroneous FI_PEEK/FI_CLAIM usage

The current iprobe/improbe implementations merely checks the return
code on the posted receive operation to tell if there is a match or
not. This commit moves the check to the probe's error callback
instead. Per the semantics defined in libfabric, the peek operation is
asynchronous and the results are to be fetched from the completion
queue. If no message is found matching the tags specified in the peek
request, then a completion queue error entry with err field set to
FI_ENOMSG will be available.

Signed-off-by: Raghu Raja <craghun@amazon.com>
Этот коммит содержится в:
Raghu Raja 2020-10-23 00:20:04 +00:00
родитель 9afcb8e2ab
Коммит 39f8a86b65

Просмотреть файл

@ -991,10 +991,20 @@ __opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_probe_error_callback(struct fi_cq_err_entry *error,
ompi_mtl_ofi_request_t *ofi_req)
{
ofi_req->status.MPI_ERROR = MPI_ERR_INTERN;
ofi_req->completion_count--;
return OMPI_SUCCESS;
/*
* Receives posted with FI_PEEK and friends will get an error
* completion with FI_ENOMSG. This just indicates the lack of a match for
* the probe and is not an error case. All other error cases are
* provider-internal errors and should be flagged as such.
*/
if (error->err == FI_ENOMSG)
return OMPI_SUCCESS;
ofi_req->status.MPI_ERROR = MPI_ERR_INTERN;
return OMPI_ERROR;
}
__opal_attribute_always_inline__ static inline int
@ -1039,7 +1049,6 @@ ompi_mtl_ofi_iprobe_generic(struct mca_mtl_base_module_t *mtl,
/**
* fi_trecvmsg with FI_PEEK:
* Initiate a search for a match in the hardware or software queue.
* The search can complete immediately with -ENOMSG.
* If successful, libfabric will enqueue a context entry into the completion
* queue to make the search nonblocking. This code will poll until the
* entry is enqueued.
@ -1060,13 +1069,7 @@ ompi_mtl_ofi_iprobe_generic(struct mca_mtl_base_module_t *mtl,
ofi_req.match_state = 0;
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
if (-FI_ENOMSG == ret) {
/**
* The search request completed but no matching message was found.
*/
*flag = 0;
return OMPI_SUCCESS;
} else if (OPAL_UNLIKELY(0 > ret)) {
if (OPAL_UNLIKELY(0 > ret)) {
MTL_OFI_LOG_FI_ERR(ret, "fi_trecvmsg failed");
return ompi_mtl_ofi_get_error(ret);
}
@ -1136,7 +1139,6 @@ ompi_mtl_ofi_improbe_generic(struct mca_mtl_base_module_t *mtl,
/**
* fi_trecvmsg with FI_PEEK and FI_CLAIM:
* Initiate a search for a match in the hardware or software queue.
* The search can complete immediately with -ENOMSG.
* If successful, libfabric will enqueue a context entry into the completion
* queue to make the search nonblocking. This code will poll until the
* entry is enqueued.
@ -1158,14 +1160,7 @@ ompi_mtl_ofi_improbe_generic(struct mca_mtl_base_module_t *mtl,
ofi_req->mask_bits = mask_bits;
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
if (-FI_ENOMSG == ret) {
/**
* The search request completed but no matching message was found.
*/
*matched = 0;
free(ofi_req);
return OMPI_SUCCESS;
} else if (OPAL_UNLIKELY(0 > ret)) {
if (OPAL_UNLIKELY(0 > ret)) {
MTL_OFI_LOG_FI_ERR(ret, "fi_trecvmsg failed");
free(ofi_req);
return ompi_mtl_ofi_get_error(ret);