1
1

MTL/OFI: Check threshold number of peers allowed per rank

When the provider does not support FI_REMOTE_CQ_DATA, the OFI tag does not have
sizeof(int) bits for the rank. Therefore, unexpected behavior will occur when
this limit is crossed.

Check the max allowed number of ranks during add_procs() and return if there is
danger of exceeding this threshold.

Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@intel.com>
Этот коммит содержится в:
Aravind Gopalakrishnan 2018-11-01 14:03:00 -07:00
родитель 37954b5fda
Коммит 5cf43de445
4 изменённых файлов: 28 добавлений и 3 удалений

Просмотреть файл

@ -54,9 +54,22 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
char *ep_names = NULL;
fi_addr_t *fi_addrs = NULL;
mca_mtl_ofi_endpoint_t *endpoint = NULL;
int num_peers_limit = (1 << ompi_mtl_ofi.num_bits_source_rank) - 1;
namelen = ompi_mtl_ofi.epnamelen;
/* We cannot add more ranks than available tag bits */
if ((false == ompi_mtl_ofi.fi_cq_data) &&
OPAL_UNLIKELY(((int) (nprocs + ompi_mtl_ofi.num_peers) > num_peers_limit))) {
opal_output(0, "%s:%d: OFI provider: %s does not have enough bits for source rank in its tag.\n"
"Adding more ranks will result in undefined behaviour. Please enable\n"
"FI_REMOTE_CQ_DATA feature in the provider. For more info refer fi_cq(3).\n",
__FILE__, __LINE__, ompi_mtl_ofi.provider_name);
fflush(stderr);
ret = OMPI_ERROR;
goto bail;
}
/**
* Create array of EP names.
*/
@ -126,6 +139,9 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint;
}
/* Update global counter of number of procs added to this rank */
ompi_mtl_ofi.num_peers += nprocs;
ret = OMPI_SUCCESS;
bail:

Просмотреть файл

@ -15,7 +15,6 @@
#include "mtl_ofi.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"
#include "opal/util/printf.h"
static int ompi_mtl_ofi_component_open(void);
@ -607,6 +606,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
/* Update the maximum supported Communicator ID */
ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << ofi_tag_bits_for_cid) - 1);
ompi_mtl_ofi.num_peers = 0;
/**
* Open fabric
@ -740,6 +740,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
goto error;
}
ompi_mtl_ofi.provider_name = strdup(prov->fabric_attr->prov_name);
/**
* Free providers info since it's not needed anymore.
*/

Просмотреть файл

@ -41,7 +41,13 @@ typedef struct mca_mtl_ofi_endpoint_t mca_mtl_ofi_endpoint_t;
static inline mca_mtl_ofi_endpoint_t *ompi_mtl_ofi_get_endpoint (struct mca_mtl_base_module_t* mtl, ompi_proc_t *ompi_proc)
{
if (OPAL_UNLIKELY(NULL == ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL])) {
ompi_mtl_ofi_add_procs(mtl, 1, &ompi_proc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_mtl_ofi_add_procs(mtl, 1, &ompi_proc))) {
/* Fatal error. exit() out */
opal_output(0, "%s:%d: *** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__);
fflush(stderr);
exit(1);
}
}
return ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];

Просмотреть файл

@ -43,7 +43,7 @@ typedef struct mca_mtl_ofi_module_t {
/** "Any source" address */
fi_addr_t any_addr;
/** Optional user-specified OFI provider name */
/** OFI provider name */
char *provider_name;
/** Maximum inject size */
@ -61,6 +61,7 @@ typedef struct mca_mtl_ofi_module_t {
unsigned long long source_rank_mask;
unsigned long long mpi_tag_mask;
int num_bits_mpi_tag;
int num_peers;
/** Synchronous protocol tag bits */
unsigned long long sync_send;