
We currently save the hostname of a proc when we create the ompi_proc_t for it. This was originally done because the only method we had for discovering the host of a proc was to include that info in the modex, and we therefore had to store it somewhere proc-local. Obviously, this carried a memory penalty for storing all those strings, and so we added a "cutoff" parameter so that we wouldn't collect hostnames above a certain number of procs. Unfortunately, this still results in an 8-byte/proc memory cost, as we have a char* pointer in the opal_proc_t that is contained in the ompi_proc_t so that we can store the hostname of the other procs if we fall below the cutoff. At scale, this can consume a fair amount of memory.

With the switch to relying on PMIx, there is no longer a need to cache the proc hostnames. Using the "optional" feature of PMIx_Get, we restrict the retrieval to be purely proc-local - i.e., we retrieve the info either via shared memory or from within the proc-internal hash storage (depending upon the active PMIx components). Thus, the retrieval of a hostname is purely a local operation involving no communication. All RMs are required to provide a complete hostname map of all procs at startup, so we have full access to all hostnames without including them in a modex or having to cache them on each proc. This allows us to remove the char* pointer from the opal_proc_t, saving 8 bytes/proc.

Unfortunately, PMIx_Get does not currently support the return of a static pointer to memory. Thus, even though PMIx has the hostname in its memory, it can only return a malloc'd copy of it. I have therefore ensured that the return from opal_get_proc_hostname is consistently malloc'd and free'd wherever used. This shouldn't be a burden, as the hostname is only used in one of two circumstances: (a) in an error message, or (b) in verbose output for debugging purposes. Thus, there should be no performance penalty associated with the malloc/free requirement.

PMIx will eventually return static pointers, and so we can eventually simplify this method and return a "const char*" - but as noted, this really isn't an issue even today.

Signed-off-by: Ralph Castain <rhc@pmix.org>
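As an illustration of the calling convention described above, here is a minimal sketch of how a caller is expected to handle the malloc'd string returned by opal_get_proc_hostname. The report_peer_error helper and its message text are hypothetical and the header paths are indicative only; the hostname handling itself mirrors the error paths touched by this change (see ompi_mtl_ofi_add_procs below).

/* Hypothetical helper illustrating the malloc/free convention for
 * opal_get_proc_hostname(); the surrounding error-reporting context
 * is made up for the example. */
#include <stdlib.h>

#include "opal/util/output.h"
#include "opal/util/proc.h"

static void report_peer_error(opal_proc_t *peer, int rc)
{
    /* PMIx hands back a malloc'd copy of the hostname, so the caller
     * owns the string and must free it once the message is emitted. */
    char *hostname = opal_get_proc_hostname(peer);

    opal_output(0, "operation involving peer on node %s failed with status %d",
                (NULL == hostname) ? "unknown" : hostname, rc);
    free(hostname);
}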
/*
 * Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
 *
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "mtl_ofi.h"

OMPI_DECLSPEC extern mca_mtl_ofi_component_t mca_mtl_ofi_component;

mca_mtl_ofi_module_t ompi_mtl_ofi = {
    {
        (int)((1ULL << MTL_OFI_CID_BIT_COUNT_1) - 1), /* max cid */
        (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_1 - 1)) - 1), /* max tag value */
        0, /* request reserve space */
        0, /* flags */

        ompi_mtl_ofi_add_procs,
        ompi_mtl_ofi_del_procs,
        ompi_mtl_ofi_finalize,

        NULL,
        NULL,
        NULL,
        NULL,
        ompi_mtl_ofi_imrecv,
        NULL,

        ompi_mtl_ofi_cancel,
        ompi_mtl_ofi_add_comm,
        ompi_mtl_ofi_del_comm
    },

    0,
    0,
    NULL,
    NULL
};

int
ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
                       size_t nprocs,
                       struct ompi_proc_t** procs)
{
    int ret = OMPI_SUCCESS;
    size_t i;
    size_t size;
    size_t namelen;
    int count = 0;
    char *ep_name = NULL;
    char *ep_names = NULL;
    fi_addr_t *fi_addrs = NULL;
    mca_mtl_ofi_endpoint_t *endpoint = NULL;
    int num_peers_limit = (1 << ompi_mtl_ofi.num_bits_source_rank) - 1;

    namelen = ompi_mtl_ofi.epnamelen;

    /* We cannot add more ranks than available tag bits */
    if ((false == ompi_mtl_ofi.fi_cq_data) &&
        OPAL_UNLIKELY(((int) (nprocs + ompi_mtl_ofi.num_peers) > num_peers_limit))) {
        opal_output(0, "%s:%d: OFI provider: %s does not have enough bits for source rank in its tag.\n"
                       "Adding more ranks will result in undefined behaviour. Please enable\n"
                       "FI_REMOTE_CQ_DATA feature in the provider. For more info refer fi_cq(3).\n",
                    __FILE__, __LINE__, ompi_mtl_ofi.provider_name);
        fflush(stderr);
        ret = OMPI_ERROR;
        goto bail;
    }

    /**
     * Create array of EP names.
     */
    ep_names = malloc(nprocs * namelen);
    if (NULL == ep_names) {
        ret = OMPI_ERROR;
        goto bail;
    }

    /**
     * Create array of fi_addrs.
     */
    fi_addrs = malloc(nprocs * sizeof(fi_addr_t));
    if (NULL == fi_addrs) {
        ret = OMPI_ERROR;
        goto bail;
    }

    /**
     * Retrieve the processes' EP names from modex.
     */
    for (i = 0; i < nprocs; ++i) {
        OFI_COMPAT_MODEX_RECV(ret,
                              &mca_mtl_ofi_component.super.mtl_version,
                              procs[i],
                              (void**)&ep_name,
                              &size);
        if (OMPI_SUCCESS != ret) {
            char *errhost = opal_get_proc_hostname(&procs[i]->super);
            opal_show_help("help-mtl-ofi.txt", "modex failed",
                           true, ompi_process_info.nodename,
                           errhost, opal_strerror(ret), ret);
            free(errhost);
            goto bail;
        }
        memcpy(&ep_names[i*namelen], ep_name, namelen);
    }

    /**
     * Map the EP names to fi_addrs.
     */
    count = fi_av_insert(ompi_mtl_ofi.av, ep_names, nprocs, fi_addrs, 0, NULL);
    if ((count < 0) || (nprocs != (size_t)count)) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: fi_av_insert failed: %d\n",
                            __FILE__, __LINE__, count);
        ret = OMPI_ERROR;
        goto bail;
    }

    /**
     * Store the fi_addrs within the endpoint objects.
     */
    for (i = 0; i < nprocs; ++i) {
        endpoint = OBJ_NEW(mca_mtl_ofi_endpoint_t);
        if (NULL == endpoint) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: mtl/ofi: could not allocate endpoint"
                                " structure\n",
                                __FILE__, __LINE__);
            ret = OMPI_ERROR;
            goto bail;
        }

        endpoint->mtl_ofi_module = &ompi_mtl_ofi;
        endpoint->peer_fiaddr = fi_addrs[i];

        /* FIXME: What happens if this endpoint already exists? */
        procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint;
    }

    /* Update global counter of number of procs added to this rank */
    ompi_mtl_ofi.num_peers += nprocs;

    ret = OMPI_SUCCESS;

bail:
    if (fi_addrs)
        free(fi_addrs);

    if (ep_names)
        free(ep_names);

    return ret;
}

int
ompi_mtl_ofi_del_procs(struct mca_mtl_base_module_t *mtl,
                       size_t nprocs,
                       struct ompi_proc_t** procs)
{
    int ret;
    size_t i;
    mca_mtl_ofi_endpoint_t *endpoint = NULL;

    for (i = 0 ; i < nprocs ; ++i) {
        if (NULL != procs[i] &&
            NULL != procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]) {
            endpoint = procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
            ret = fi_av_remove(ompi_mtl_ofi.av, &endpoint->peer_fiaddr, 1, 0);
            if (ret) {
                opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                    "%s:%d: fi_av_remove failed: %s\n", __FILE__, __LINE__, fi_strerror(errno));
                return ret;
            }
            procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = NULL;
            OBJ_RELEASE(endpoint);
        }
    }

    return OMPI_SUCCESS;
}