REF6976 Silent failure of OMPI over OFI with large message sizes
INTERNAL: STL-59403 The OFI (libfabric) MTL does not respect the maximum message size parameter that OFI provides in the fi_info data. This patch adds this missing max_msg_size field to the mca_ofi_module_t structure and adds a length check to the low-level send routines. (cherry-picked from commit 3aca4af548a3d781b6b52f89f4d6c7e66d379609) Change-Id: Ie50445e5edfb0f30916de0836db0edc64ecf7c60 Signed-off-by: Michael Heinz <michael.william.heinz@intel.com> Reviewed-by: Adam Goldman <adam.goldman@intel.com> Reviewed-by: Brendan Cunningham <brendan.cunningham@intel.com>
Этот коммит содержится в:
родитель
e547a2b94d
Коммит
89be953cfd
@ -16,3 +16,5 @@ unusual; your job may behave unpredictably (and/or abort) after this.
|
||||
Local host: %s
|
||||
Location: %s:%d
|
||||
Error: %s (%zd)
|
||||
[message too big]
|
||||
Message size %llu bigger than supported by selected transport. Max = %llu
|
||||
|
@ -308,13 +308,22 @@ ompi_mtl_ofi_send(struct mca_mtl_base_module_t *mtl,
|
||||
endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
|
||||
|
||||
ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
|
||||
if (OMPI_SUCCESS != ompi_ret) return ompi_ret;
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
|
||||
return ompi_ret;
|
||||
}
|
||||
|
||||
ofi_req.buffer = (free_after) ? start : NULL;
|
||||
ofi_req.length = length;
|
||||
ofi_req.status.MPI_ERROR = OMPI_SUCCESS;
|
||||
ofi_req.completion_count = 0;
|
||||
|
||||
if (OPAL_UNLIKELY(length > endpoint->mtl_ofi_module->max_msg_size)) {
|
||||
opal_show_help("help-mtl-ofi.txt",
|
||||
"message too big", false,
|
||||
length, endpoint->mtl_ofi_module->max_msg_size);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
if (ompi_mtl_ofi.fi_cq_data) {
|
||||
match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag);
|
||||
src_addr = endpoint->peer_fiaddr;
|
||||
@ -438,13 +447,20 @@ ompi_mtl_ofi_isend(struct mca_mtl_base_module_t *mtl,
|
||||
endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
|
||||
|
||||
ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
|
||||
if (OMPI_SUCCESS != ompi_ret) return ompi_ret;
|
||||
if (OMPI_UNLIKELY(OMPI_SUCCESS != ompi_ret)) return ompi_ret;
|
||||
|
||||
ofi_req->buffer = (free_after) ? start : NULL;
|
||||
ofi_req->length = length;
|
||||
ofi_req->status.MPI_ERROR = OMPI_SUCCESS;
|
||||
ofi_req->completion_count = 1;
|
||||
|
||||
if (OPAL_UNLIKELY(length > endpoint->mtl_ofi_module->max_msg_size)) {
|
||||
opal_show_help("help-mtl-ofi.txt",
|
||||
"message too big", false,
|
||||
length, endpoint->mtl_ofi_module->max_msg_size);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
if (ompi_mtl_ofi.fi_cq_data) {
|
||||
match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag);
|
||||
src_addr = endpoint->peer_fiaddr;
|
||||
|
@ -630,9 +630,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
||||
}
|
||||
|
||||
/**
|
||||
* Save the maximum inject size.
|
||||
* Save the maximum sizes.
|
||||
*/
|
||||
ompi_mtl_ofi.max_inject_size = prov->tx_attr->inject_size;
|
||||
ompi_mtl_ofi.max_msg_size = prov->ep_attr->max_msg_size;
|
||||
|
||||
/**
|
||||
* Create the objects that will be bound to the endpoint.
|
||||
|
@ -49,6 +49,9 @@ typedef struct mca_mtl_ofi_module_t {
|
||||
/** Maximum inject size */
|
||||
size_t max_inject_size;
|
||||
|
||||
/** Largest message that can be sent in a single send. */
|
||||
size_t max_msg_size;
|
||||
|
||||
/** Maximum number of CQ events to read in OFI Progress */
|
||||
int ofi_progress_event_count;
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user