Merge pull request #7248 from wckzhang/v4.0.x
MTL/OFI: Check threshold number of peers allowed per rank
This commit is contained in:
Commit
3da939b124
@ -54,9 +54,22 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
|
|||||||
char *ep_names = NULL;
|
char *ep_names = NULL;
|
||||||
fi_addr_t *fi_addrs = NULL;
|
fi_addr_t *fi_addrs = NULL;
|
||||||
mca_mtl_ofi_endpoint_t *endpoint = NULL;
|
mca_mtl_ofi_endpoint_t *endpoint = NULL;
|
||||||
|
int num_peers_limit = (1 << ompi_mtl_ofi.num_bits_source_rank) - 1;
|
||||||
|
|
||||||
namelen = ompi_mtl_ofi.epnamelen;
|
namelen = ompi_mtl_ofi.epnamelen;
|
||||||
|
|
||||||
|
/* We cannot add more ranks than available tag bits */
|
||||||
|
if ((false == ompi_mtl_ofi.fi_cq_data) &&
|
||||||
|
OPAL_UNLIKELY(((int) (nprocs + ompi_mtl_ofi.num_peers) > num_peers_limit))) {
|
||||||
|
opal_output(0, "%s:%d: OFI provider: %s does not have enough bits for source rank in its tag.\n"
|
||||||
|
"Adding more ranks will result in undefined behaviour. Please enable\n"
|
||||||
|
"FI_REMOTE_CQ_DATA feature in the provider. For more info refer fi_cq(3).\n",
|
||||||
|
__FILE__, __LINE__, ompi_mtl_ofi.provider_name);
|
||||||
|
fflush(stderr);
|
||||||
|
ret = OMPI_ERROR;
|
||||||
|
goto bail;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create array of EP names.
|
* Create array of EP names.
|
||||||
*/
|
*/
|
||||||
@ -126,6 +139,9 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
|
|||||||
procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint;
|
procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Update global counter of number of procs added to this rank */
|
||||||
|
ompi_mtl_ofi.num_peers += nprocs;
|
||||||
|
|
||||||
ret = OMPI_SUCCESS;
|
ret = OMPI_SUCCESS;
|
||||||
|
|
||||||
bail:
|
bail:
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
|
|
||||||
#include "mtl_ofi.h"
|
#include "mtl_ofi.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/util/show_help.h"
|
#include "opal/util/printf.h"
|
||||||
|
|
||||||
static int ompi_mtl_ofi_component_open(void);
|
static int ompi_mtl_ofi_component_open(void);
|
||||||
static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
|
static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
|
||||||
@ -576,6 +576,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
ompi_mtl_ofi_define_tag_mode(ofi_tag_mode);
|
ompi_mtl_ofi_define_tag_mode(ofi_tag_mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ompi_mtl_ofi.num_peers = 0;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Open fabric
|
* Open fabric
|
||||||
* The getinfo struct returns a fabric attribute struct that can be used to
|
* The getinfo struct returns a fabric attribute struct that can be used to
|
||||||
@ -709,6 +711,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ompi_mtl_ofi.provider_name = strdup(prov->fabric_attr->prov_name);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Free providers info since it's not needed anymore.
|
* Free providers info since it's not needed anymore.
|
||||||
*/
|
*/
|
||||||
|
@ -41,7 +41,13 @@ typedef struct mca_mtl_ofi_endpoint_t mca_mtl_ofi_endpoint_t;
|
|||||||
static inline mca_mtl_ofi_endpoint_t *ompi_mtl_ofi_get_endpoint (struct mca_mtl_base_module_t* mtl, ompi_proc_t *ompi_proc)
|
static inline mca_mtl_ofi_endpoint_t *ompi_mtl_ofi_get_endpoint (struct mca_mtl_base_module_t* mtl, ompi_proc_t *ompi_proc)
|
||||||
{
|
{
|
||||||
if (OPAL_UNLIKELY(NULL == ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL])) {
|
if (OPAL_UNLIKELY(NULL == ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL])) {
|
||||||
ompi_mtl_ofi_add_procs(mtl, 1, &ompi_proc);
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_mtl_ofi_add_procs(mtl, 1, &ompi_proc))) {
|
||||||
|
/* Fatal error. exit() out */
|
||||||
|
opal_output(0, "%s:%d: *** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
|
||||||
|
__FILE__, __LINE__);
|
||||||
|
fflush(stderr);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
|
return ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
|
||||||
|
@ -43,7 +43,7 @@ typedef struct mca_mtl_ofi_module_t {
|
|||||||
/** "Any source" address */
|
/** "Any source" address */
|
||||||
fi_addr_t any_addr;
|
fi_addr_t any_addr;
|
||||||
|
|
||||||
/** Optional user-specified OFI provider name */
|
/** OFI provider name */
|
||||||
char *provider_name;
|
char *provider_name;
|
||||||
|
|
||||||
/** Maximum inject size */
|
/** Maximum inject size */
|
||||||
@ -64,6 +64,7 @@ typedef struct mca_mtl_ofi_module_t {
|
|||||||
unsigned long long source_rank_mask;
|
unsigned long long source_rank_mask;
|
||||||
unsigned long long mpi_tag_mask;
|
unsigned long long mpi_tag_mask;
|
||||||
int num_bits_mpi_tag;
|
int num_bits_mpi_tag;
|
||||||
|
int num_peers;
|
||||||
|
|
||||||
/** Synchronous protocol tag bits */
|
/** Synchronous protocol tag bits */
|
||||||
unsigned long long sync_send;
|
unsigned long long sync_send;
|
||||||
|
Loading…
x
Reference in a new issue
Block a user