Merge pull request #7004 from mwheinz/REFS6976-master
REF6976 Silent failure of OMPI over OFI with large message sizes
Этот коммит содержится в:
Коммит
ee3564a2dc
@ -75,3 +75,5 @@ recoverable and your application is likely to abort.
|
|||||||
Local host: %s
|
Local host: %s
|
||||||
Remote host: %s
|
Remote host: %s
|
||||||
Error: %s (%d)
|
Error: %s (%d)
|
||||||
|
[message too big]
|
||||||
|
Message size %llu bigger than supported by selected transport. Max = %llu
|
||||||
|
@ -421,13 +421,22 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
|
|||||||
sep_peer_fiaddr = fi_rx_addr(endpoint->peer_fiaddr, ctxt_id, ompi_mtl_ofi.rx_ctx_bits);
|
sep_peer_fiaddr = fi_rx_addr(endpoint->peer_fiaddr, ctxt_id, ompi_mtl_ofi.rx_ctx_bits);
|
||||||
|
|
||||||
ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
|
ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
|
||||||
if (OMPI_SUCCESS != ompi_ret) return ompi_ret;
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
|
||||||
|
return ompi_ret;
|
||||||
|
}
|
||||||
|
|
||||||
ofi_req.buffer = (free_after) ? start : NULL;
|
ofi_req.buffer = (free_after) ? start : NULL;
|
||||||
ofi_req.length = length;
|
ofi_req.length = length;
|
||||||
ofi_req.status.MPI_ERROR = OMPI_SUCCESS;
|
ofi_req.status.MPI_ERROR = OMPI_SUCCESS;
|
||||||
ofi_req.completion_count = 0;
|
ofi_req.completion_count = 0;
|
||||||
|
|
||||||
|
if (OPAL_UNLIKELY(length > endpoint->mtl_ofi_module->max_msg_size)) {
|
||||||
|
opal_show_help("help-mtl-ofi.txt",
|
||||||
|
"message too big", false,
|
||||||
|
length, endpoint->mtl_ofi_module->max_msg_size);
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
if (ofi_cq_data) {
|
if (ofi_cq_data) {
|
||||||
match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag);
|
match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag);
|
||||||
src_addr = sep_peer_fiaddr;
|
src_addr = sep_peer_fiaddr;
|
||||||
@ -553,13 +562,20 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl,
|
|||||||
sep_peer_fiaddr = fi_rx_addr(endpoint->peer_fiaddr, ctxt_id, ompi_mtl_ofi.rx_ctx_bits);
|
sep_peer_fiaddr = fi_rx_addr(endpoint->peer_fiaddr, ctxt_id, ompi_mtl_ofi.rx_ctx_bits);
|
||||||
|
|
||||||
ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
|
ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
|
||||||
if (OMPI_SUCCESS != ompi_ret) return ompi_ret;
|
if (OMPI_UNLIKELY(OMPI_SUCCESS != ompi_ret)) return ompi_ret;
|
||||||
|
|
||||||
ofi_req->buffer = (free_after) ? start : NULL;
|
ofi_req->buffer = (free_after) ? start : NULL;
|
||||||
ofi_req->length = length;
|
ofi_req->length = length;
|
||||||
ofi_req->status.MPI_ERROR = OMPI_SUCCESS;
|
ofi_req->status.MPI_ERROR = OMPI_SUCCESS;
|
||||||
ofi_req->completion_count = 1;
|
ofi_req->completion_count = 1;
|
||||||
|
|
||||||
|
if (OPAL_UNLIKELY(length > endpoint->mtl_ofi_module->max_msg_size)) {
|
||||||
|
opal_show_help("help-mtl-ofi.txt",
|
||||||
|
"message too big", false,
|
||||||
|
length, endpoint->mtl_ofi_module->max_msg_size);
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
if (ofi_cq_data) {
|
if (ofi_cq_data) {
|
||||||
match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag);
|
match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag);
|
||||||
} else {
|
} else {
|
||||||
|
@ -881,9 +881,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Save the maximum inject size.
|
* Save the maximum sizes.
|
||||||
*/
|
*/
|
||||||
ompi_mtl_ofi.max_inject_size = prov->tx_attr->inject_size;
|
ompi_mtl_ofi.max_inject_size = prov->tx_attr->inject_size;
|
||||||
|
ompi_mtl_ofi.max_msg_size = prov->ep_attr->max_msg_size;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The user is not allowed to exceed MTL_OFI_MAX_PROG_EVENT_COUNT.
|
* The user is not allowed to exceed MTL_OFI_MAX_PROG_EVENT_COUNT.
|
||||||
|
@ -70,6 +70,9 @@ typedef struct mca_mtl_ofi_module_t {
|
|||||||
/** Maximum inject size */
|
/** Maximum inject size */
|
||||||
size_t max_inject_size;
|
size_t max_inject_size;
|
||||||
|
|
||||||
|
/** Largest message that can be sent in a single send. */
|
||||||
|
size_t max_msg_size;
|
||||||
|
|
||||||
/** Maximum number of CQ events to read in OFI Progress */
|
/** Maximum number of CQ events to read in OFI Progress */
|
||||||
int ofi_progress_event_count;
|
int ofi_progress_event_count;
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user