diff --git a/ompi/mca/mtl/ofi/help-mtl-ofi.txt b/ompi/mca/mtl/ofi/help-mtl-ofi.txt
index 8131766ae0..fe00f11159 100644
--- a/ompi/mca/mtl/ofi/help-mtl-ofi.txt
+++ b/ompi/mca/mtl/ofi/help-mtl-ofi.txt
@@ -16,3 +16,5 @@ unusual; your job may behave unpredictably (and/or abort) after this.
   Local host: %s
   Location: %s:%d
   Error: %s (%zd)
+[message too big]
+Message size %zu bigger than supported by selected transport. Max = %zu
diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h
index 1a04a5bad2..74f6324c8b 100644
--- a/ompi/mca/mtl/ofi/mtl_ofi.h
+++ b/ompi/mca/mtl/ofi/mtl_ofi.h
@@ -308,13 +308,23 @@ ompi_mtl_ofi_send(struct mca_mtl_base_module_t *mtl,
     endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
 
     ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
-    if (OMPI_SUCCESS != ompi_ret) return ompi_ret;
+    if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
+        return ompi_ret;
+    }
 
     ofi_req.buffer = (free_after) ? start : NULL;
     ofi_req.length = length;
     ofi_req.status.MPI_ERROR = OMPI_SUCCESS;
     ofi_req.completion_count = 0;
 
+    /* Refuse messages larger than the endpoint's recorded max_msg_size. */
+    if (OPAL_UNLIKELY(length > endpoint->mtl_ofi_module->max_msg_size)) {
+        opal_show_help("help-mtl-ofi.txt",
+            "message too big", false,
+            length, endpoint->mtl_ofi_module->max_msg_size);
+        return OMPI_ERROR;
+    }
+
     if (ompi_mtl_ofi.fi_cq_data) {
         match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag);
         src_addr = endpoint->peer_fiaddr;
@@ -438,13 +448,23 @@ ompi_mtl_ofi_isend(struct mca_mtl_base_module_t *mtl,
     endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
 
     ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
-    if (OMPI_SUCCESS != ompi_ret) return ompi_ret;
+    if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
+        return ompi_ret;
+    }
 
     ofi_req->buffer = (free_after) ? start : NULL;
     ofi_req->length = length;
     ofi_req->status.MPI_ERROR = OMPI_SUCCESS;
     ofi_req->completion_count = 1;
 
+    /* Refuse messages larger than the endpoint's recorded max_msg_size. */
+    if (OPAL_UNLIKELY(length > endpoint->mtl_ofi_module->max_msg_size)) {
+        opal_show_help("help-mtl-ofi.txt",
+            "message too big", false,
+            length, endpoint->mtl_ofi_module->max_msg_size);
+        return OMPI_ERROR;
+    }
+
     if (ompi_mtl_ofi.fi_cq_data) {
         match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag);
         src_addr = endpoint->peer_fiaddr;
diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c
index bc1a694789..1da8f2b045 100644
--- a/ompi/mca/mtl/ofi/mtl_ofi_component.c
+++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c
@@ -630,9 +630,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
     }
 
     /**
-     * Save the maximum inject size.
+     * Save the maximum inject and message sizes from the provider attributes.
      */
     ompi_mtl_ofi.max_inject_size = prov->tx_attr->inject_size;
+    ompi_mtl_ofi.max_msg_size = prov->ep_attr->max_msg_size;
 
     /**
      * Create the objects that will be bound to the endpoint.
diff --git a/ompi/mca/mtl/ofi/mtl_ofi_types.h b/ompi/mca/mtl/ofi/mtl_ofi_types.h
index 5514b67a48..ec82cde589 100644
--- a/ompi/mca/mtl/ofi/mtl_ofi_types.h
+++ b/ompi/mca/mtl/ofi/mtl_ofi_types.h
@@ -49,6 +49,9 @@ typedef struct mca_mtl_ofi_module_t {
 
     /** Maximum inject size */
     size_t max_inject_size;
 
+    /** Largest message that can be sent in a single send. */
+    size_t max_msg_size;
+
     /** Maximum number of CQ events to read in OFI Progress */
     int ofi_progress_event_count;