ompi_proc_t size reduction: part 1
We currently save the hostname of a proc when we create the ompi_proc_t for it. This was originally done because the only method we had for discovering the host of a proc was to include that info in the modex, and we had to therefore store it somewhere proc-local. Obviously, this carried a memory penalty for storing all those strings, and so we added a "cutoff" parameter so that we wouldn't collect hostnames above a certain number of procs. Unfortunately, this still results in an 8-byte/proc memory cost as we have a char* pointer in the opal_proc_t that is contained in the ompi_proc_t so that we can store the hostname of the other procs if we fall below the cutoff. At scale, this can consume a fair amount of memory. With the switch to relying on PMIx, there is no longer a need to cache the proc hostnames. Using the "optional" feature of PMIx_Get, we restrict the retrieval to be purely proc-local - i.e., we retrieve the info either via shared memory or from within the proc-internal hash storage (depending upon the active PMIx components). Thus, the retrieval of a hostname is purely a local operation involving no communication. All RMs are required to provide a complete hostname map of all procs at startup. Thus, we have full access to all hostnames without including them in a modex or having to cache them on each proc. This allows us to remove the char* pointer from the opal_proc_t, saving us 8 bytes/proc. Unfortunately, PMIx_Get does not currently support the return of a static pointer to memory. Thus, even though PMIx has the hostname in its memory, it can only return a malloc'd version of it. I have therefore ensured that the return from opal_get_proc_hostname is consistently malloc'd and free'd wherever used. This shouldn't be a burden as the hostname is only used in one of two circumstances: (a) in an error message, or (b) in verbose output for debugging purposes. Thus, there should be no performance penalty associated with the malloc/free requirement.
PMIx will eventually be returning static pointers, and so we can eventually simplify this method and return a "const char*" - but as noted, this really isn't an issue even today. Signed-off-by: Ralph Castain <rhc@pmix.org>
Этот коммит содержится в:
родитель
48b52478ef
Коммит
33ab928e1b
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
@ -233,12 +233,14 @@ static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_e
|
||||
|
||||
if (!bml_btl || bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity) {
|
||||
/* this btl has higher exclusivity than an existing btl or none exists */
|
||||
|
||||
opal_output_verbose(1, opal_btl_base_framework.framework_output,
|
||||
"mca: bml: Using %s btl for send to %s on node %s",
|
||||
btl->btl_component->btl_version.mca_component_name,
|
||||
OMPI_NAME_PRINT(&proc->super.proc_name),
|
||||
proc->super.proc_hostname);
|
||||
if (0 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) {
|
||||
char *errhost = opal_get_proc_hostname(&proc->super);
|
||||
opal_output(0, "mca: bml: Using %s btl for send to %s on node %s",
|
||||
btl->btl_component->btl_version.mca_component_name,
|
||||
OMPI_NAME_PRINT(&proc->super.proc_name),
|
||||
errhost);
|
||||
free(errhost);
|
||||
}
|
||||
|
||||
/* cache the endpoint on the proc */
|
||||
if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) {
|
||||
@ -252,15 +254,16 @@ static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_e
|
||||
* calculate the bitwise OR of the btl flags
|
||||
*/
|
||||
bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
|
||||
} else {
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
"mca: bml: Not using %s btl for send to %s on node %s "
|
||||
"because %s btl has higher exclusivity (%d > %d)",
|
||||
btl->btl_component->btl_version.mca_component_name,
|
||||
OMPI_NAME_PRINT(&proc->super.proc_name), proc->super.proc_hostname,
|
||||
bml_btl->btl->btl_component->btl_version.mca_component_name,
|
||||
bml_btl->btl->btl_exclusivity,
|
||||
btl->btl_exclusivity);
|
||||
} else if (19 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) {
|
||||
char *errhost = opal_get_proc_hostname(&proc->super);
|
||||
opal_output(0, "mca: bml: Not using %s btl for send to %s on node %s "
|
||||
"because %s btl has higher exclusivity (%d > %d)",
|
||||
btl->btl_component->btl_version.mca_component_name,
|
||||
OMPI_NAME_PRINT(&proc->super.proc_name), errhost,
|
||||
bml_btl->btl->btl_component->btl_version.mca_component_name,
|
||||
bml_btl->btl->btl_exclusivity,
|
||||
btl->btl_exclusivity);
|
||||
free(errhost);
|
||||
}
|
||||
|
||||
btl_in_use = true;
|
||||
@ -424,14 +427,16 @@ static int mca_bml_r2_add_proc (struct ompi_proc_t *proc)
|
||||
OBJ_RELEASE(bml_endpoint);
|
||||
/* no btl is available for this proc */
|
||||
if (mca_bml_r2.show_unreach_errors) {
|
||||
char *errhost = opal_get_proc_hostname(&proc->super);
|
||||
char *localhost = opal_get_proc_hostname(&ompi_proc_local_proc->super);
|
||||
opal_show_help ("help-mca-bml-r2.txt", "unreachable proc", true,
|
||||
OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
|
||||
(NULL != ompi_proc_local_proc->super.proc_hostname ?
|
||||
ompi_proc_local_proc->super.proc_hostname : "unknown!"),
|
||||
localhost,
|
||||
OMPI_NAME_PRINT(&(proc->super.proc_name)),
|
||||
(NULL != proc->super.proc_hostname ?
|
||||
proc->super.proc_hostname : "unknown!"),
|
||||
errhost,
|
||||
btl_names);
|
||||
free(errhost);
|
||||
free(localhost);
|
||||
}
|
||||
|
||||
return OMPI_ERR_UNREACH;
|
||||
@ -578,14 +583,16 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
|
||||
ret = OMPI_ERR_UNREACH;
|
||||
if (mca_bml_r2.show_unreach_errors) {
|
||||
char *errhost = opal_get_proc_hostname(&proc->super);
|
||||
char *localhost = opal_get_proc_hostname(&ompi_proc_local_proc->super);
|
||||
opal_show_help("help-mca-bml-r2.txt", "unreachable proc", true,
|
||||
OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
|
||||
(NULL != ompi_proc_local_proc->super.proc_hostname ?
|
||||
ompi_proc_local_proc->super.proc_hostname : "unknown!"),
|
||||
localhost,
|
||||
OMPI_NAME_PRINT(&(proc->super.proc_name)),
|
||||
(NULL != proc->super.proc_hostname ?
|
||||
proc->super.proc_hostname : "unknown!"),
|
||||
errhost,
|
||||
btl_names);
|
||||
free(errhost);
|
||||
free(localhost);
|
||||
}
|
||||
|
||||
break;
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -98,10 +98,11 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
(void**)&ep_name,
|
||||
&size);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
char *errhost = opal_get_proc_hostname(&procs[i]->super);
|
||||
opal_show_help("help-mtl-ofi.txt", "modex failed",
|
||||
true, ompi_process_info.nodename,
|
||||
procs[i]->super.proc_hostname,
|
||||
opal_strerror(ret), ret);
|
||||
errhost, opal_strerror(ret), ret);
|
||||
free(errhost);
|
||||
goto bail;
|
||||
}
|
||||
memcpy(&ep_names[i*namelen], ep_name, namelen);
|
||||
|
@ -324,8 +324,9 @@ ompi_mtl_psm2_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
errstr ? errstr : "unknown connect error");
|
||||
for (j = 0; j < (int) nprocs; j++) {
|
||||
if (errs_out[j] == thiserr) {
|
||||
opal_output(0, " %s", (NULL == procs[j]->super.proc_hostname) ?
|
||||
"unknown" : procs[j]->super.proc_hostname);
|
||||
char *errhost = opal_get_proc_hostname(&procs[j]->super);
|
||||
opal_output(0, " %s", errhost);
|
||||
free(errhost);
|
||||
}
|
||||
}
|
||||
opal_output(0, "\n");
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -370,12 +370,15 @@ mca_pml_base_pml_check_selected(const char *my_pml,
|
||||
/* if that module doesn't match my own, return an error */
|
||||
if ((size != strlen(my_pml) + 1) ||
|
||||
(0 != strcmp(my_pml, remote_pml))) {
|
||||
char *errhost = opal_get_proc_hostname(&procs[0]->super);
|
||||
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
|
||||
OMPI_NAME_PRINT(&ompi_proc_local()->super.proc_name),
|
||||
my_pml, OMPI_NAME_PRINT(&procs[0]->super.proc_name),
|
||||
(NULL == procs[0]->super.proc_hostname) ? "unknown" : procs[0]->super.proc_hostname,
|
||||
errhost,
|
||||
remote_pml);
|
||||
free(remote_pml); /* cleanup before returning */
|
||||
free(remote_pml);
|
||||
free(errhost);
|
||||
/* cleanup before returning */
|
||||
return OMPI_ERR_UNREACH;
|
||||
}
|
||||
|
||||
|
@ -15,6 +15,7 @@
|
||||
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2020 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -202,14 +203,17 @@ void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t
|
||||
/* Find the corresponding bml and adjust the flag to support CUDA get */
|
||||
for( i = 0; i < (int)ep->btl_send.arr_size; i++ ) {
|
||||
if( ep->btl_send.bml_btls[i].btl == btl ) {
|
||||
if (4 < opal_output_get_verbosity(btl_verbose_stream)) {
|
||||
char *errhost = opal_get_proc_hostname(&errproc->super);
|
||||
opal_output(0, "BTL %s: rank=%d enabling CUDA IPC "
|
||||
"to rank=%d on node=%s \n",
|
||||
btl->btl_component->btl_version.mca_component_name,
|
||||
OMPI_PROC_MY_NAME->vpid,
|
||||
((ompi_process_name_t*)&errproc->super.proc_name)->vpid,
|
||||
errhost);
|
||||
free(errhost);
|
||||
}
|
||||
ep->btl_send.bml_btls[i].btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
|
||||
opal_output_verbose(5, btl_verbose_stream,
|
||||
"BTL %s: rank=%d enabling CUDA IPC "
|
||||
"to rank=%d on node=%s \n",
|
||||
btl->btl_component->btl_version.mca_component_name,
|
||||
OMPI_PROC_MY_NAME->vpid,
|
||||
((ompi_process_name_t*)&errproc->super.proc_name)->vpid,
|
||||
errproc->super.proc_hostname);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -86,9 +86,6 @@ void ompi_proc_destruct(ompi_proc_t* proc)
|
||||
* destroyed here. It will be destroyed later when the ompi_datatype_finalize is called.
|
||||
*/
|
||||
OBJ_RELEASE( proc->super.proc_convertor );
|
||||
if (NULL != proc->super.proc_hostname) {
|
||||
free(proc->super.proc_hostname);
|
||||
}
|
||||
opal_mutex_lock (&ompi_proc_lock);
|
||||
opal_list_remove_item(&ompi_proc_list, (opal_list_item_t*)proc);
|
||||
opal_hash_table_remove_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name));
|
||||
@ -135,22 +132,12 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
|
||||
*/
|
||||
int ompi_proc_complete_init_single (ompi_proc_t *proc)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) &&
|
||||
(OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) {
|
||||
/* nothing else to do */
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* we can retrieve the hostname at no cost because it
|
||||
* was provided at startup - but make it optional so
|
||||
* we don't chase after it if some system doesn't
|
||||
* provide it */
|
||||
proc->super.proc_hostname = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc->super.proc_name,
|
||||
(char**)&(proc->super.proc_hostname), PMIX_STRING);
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
/* get the remote architecture - this might force a modex except
|
||||
* for those environments where the RM provides it */
|
||||
@ -264,7 +251,6 @@ int ompi_proc_init(void)
|
||||
/* set local process data */
|
||||
ompi_proc_local_proc = proc;
|
||||
proc->super.proc_flags = OPAL_PROC_ALL_LOCAL;
|
||||
proc->super.proc_hostname = strdup(ompi_process_info.nodename);
|
||||
proc->super.proc_arch = opal_local_arch;
|
||||
/* Register the local proc with OPAL */
|
||||
opal_proc_local_set(&proc->super);
|
||||
@ -609,7 +595,6 @@ int ompi_proc_refresh(void)
|
||||
if (i == OMPI_PROC_MY_NAME->vpid) {
|
||||
ompi_proc_local_proc = proc;
|
||||
proc->super.proc_flags = OPAL_PROC_ALL_LOCAL;
|
||||
proc->super.proc_hostname = ompi_process_info.nodename;
|
||||
proc->super.proc_arch = opal_local_arch;
|
||||
opal_proc_local_set(&proc->super);
|
||||
} else {
|
||||
@ -676,13 +661,6 @@ ompi_proc_pack(ompi_proc_t **proclist, int proclistsize,
|
||||
opal_mutex_unlock (&ompi_proc_lock);
|
||||
return rc;
|
||||
}
|
||||
/* pass the name of the host this proc is on */
|
||||
rc = opal_dss.pack(buf, &(proc->super.proc_hostname), 1, OPAL_STRING);
|
||||
if(rc != OPAL_SUCCESS) {
|
||||
OMPI_ERROR_LOG(rc);
|
||||
opal_mutex_unlock (&ompi_proc_lock);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
opal_mutex_unlock (&ompi_proc_lock);
|
||||
return OMPI_SUCCESS;
|
||||
@ -747,10 +725,10 @@ ompi_proc_unpack(opal_buffer_t* buf,
|
||||
int32_t count=1;
|
||||
ompi_process_name_t new_name;
|
||||
uint32_t new_arch;
|
||||
char *new_hostname;
|
||||
bool isnew = false;
|
||||
int rc;
|
||||
char *nspace;
|
||||
uint16_t u16, *u16ptr;
|
||||
|
||||
rc = opal_dss.unpack(buf, &new_name, &count, OMPI_NAME);
|
||||
if (rc != OPAL_SUCCESS) {
|
||||
@ -774,13 +752,6 @@ ompi_proc_unpack(opal_buffer_t* buf,
|
||||
free(newprocs);
|
||||
return rc;
|
||||
}
|
||||
rc = opal_dss.unpack(buf, &new_hostname, &count, OPAL_STRING);
|
||||
if (rc != OPAL_SUCCESS) {
|
||||
OMPI_ERROR_LOG(rc);
|
||||
free(plist);
|
||||
free(newprocs);
|
||||
return rc;
|
||||
}
|
||||
/* see if this proc is already on our ompi_proc_list */
|
||||
plist[i] = ompi_proc_find_and_add(&new_name, &isnew);
|
||||
if (isnew) {
|
||||
@ -798,27 +769,25 @@ ompi_proc_unpack(opal_buffer_t* buf,
|
||||
OBJ_RELEASE(plist[i]->super.proc_convertor);
|
||||
plist[i]->super.proc_convertor = opal_convertor_create(plist[i]->super.proc_arch, 0);
|
||||
#else
|
||||
char *errhost = opal_get_proc_hostname(&plist[i]->super);
|
||||
opal_show_help("help-mpi-runtime.txt",
|
||||
"heterogeneous-support-unavailable",
|
||||
true, ompi_process_info.nodename,
|
||||
new_hostname == NULL ? "<hostname unavailable>" :
|
||||
new_hostname);
|
||||
errhost);
|
||||
free(plist);
|
||||
free(newprocs);
|
||||
free(errhost);
|
||||
return OMPI_ERR_NOT_SUPPORTED;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (NULL != new_hostname) {
|
||||
if (0 == strcmp(ompi_proc_local_proc->super.proc_hostname, new_hostname)) {
|
||||
plist[i]->super.proc_flags |= (OPAL_PROC_ON_NODE | OPAL_PROC_ON_CU | OPAL_PROC_ON_CLUSTER);
|
||||
}
|
||||
|
||||
/* Save the hostname */
|
||||
plist[i]->super.proc_hostname = new_hostname;
|
||||
/* get the locality information - all RTEs are required
|
||||
* to provide this information at startup */
|
||||
u16ptr = &u16;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY, &plist[i]->super.proc_name, &u16ptr, PMIX_UINT16);
|
||||
if (OPAL_SUCCESS == rc) {
|
||||
plist[i]->super.proc_flags = u16;
|
||||
}
|
||||
} else if (NULL != new_hostname) {
|
||||
free(new_hostname);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -59,13 +59,15 @@ OPAL_DECLSPEC extern int mca_btl_base_out(const char*, ...) __opal_attribute_for
|
||||
|
||||
#define BTL_PEER_ERROR(proc, args) \
|
||||
do { \
|
||||
char *errhost; \
|
||||
mca_btl_base_err("%s[%s:%d:%s] from %s ", \
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, __func__, \
|
||||
opal_process_info.nodename); \
|
||||
if (proc) { \
|
||||
mca_btl_base_err("to: %s ", \
|
||||
opal_get_proc_hostname(proc)); \
|
||||
errhost = opal_get_proc_hostname(proc); \
|
||||
mca_btl_base_err("to: %s ", errhost); \
|
||||
free(errhost); \
|
||||
} \
|
||||
mca_btl_base_err args; \
|
||||
mca_btl_base_err("\n"); \
|
||||
|
@ -15,6 +15,7 @@
|
||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2020 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -44,6 +45,7 @@
|
||||
|
||||
#include "opal/opal_socket_errno.h"
|
||||
#include "opal/mca/btl/base/btl_base_error.h"
|
||||
#include "opal/util/proc.h"
|
||||
#include "opal/util/show_help.h"
|
||||
|
||||
#include "btl_tcp_frag.h"
|
||||
@ -168,6 +170,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
|
||||
mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint;
|
||||
ssize_t cnt;
|
||||
int32_t i, num_vecs, dont_copy_data = 0;
|
||||
char *errhost;
|
||||
|
||||
repeat:
|
||||
num_vecs = frag->iov_cnt;
|
||||
@ -231,10 +234,11 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
|
||||
return false;
|
||||
|
||||
case ECONNRESET:
|
||||
errhost = opal_get_proc_hostname(btl_endpoint->endpoint_proc->proc_opal);
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "peer hung up",
|
||||
true, opal_process_info.nodename,
|
||||
getpid(),
|
||||
btl_endpoint->endpoint_proc->proc_opal->proc_hostname);
|
||||
getpid(), errhost);
|
||||
free(errhost);
|
||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
return false;
|
||||
|
@ -11,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008-2010 Oracle and/or its affiliates. All rights reserved
|
||||
* Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
|
||||
@ -48,6 +48,7 @@
|
||||
#include "opal/util/proc.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/printf.h"
|
||||
#include "opal/util/proc.h"
|
||||
#include "opal/util/string_copy.h"
|
||||
#include "opal/util/bipartite_graph.h"
|
||||
|
||||
@ -479,21 +480,18 @@ int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t* btl_proc,
|
||||
mca_btl_base_endpoint_t* btl_endpoint)
|
||||
{
|
||||
mca_btl_tcp_module_t* tcp_btl = btl_endpoint->endpoint_btl;
|
||||
const char *proc_hostname;
|
||||
mca_btl_tcp_addr_t *remote_addr;
|
||||
int rc = OPAL_SUCCESS;
|
||||
|
||||
if (NULL == (proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal))) {
|
||||
rc = OPAL_ERR_UNREACH;
|
||||
goto out;
|
||||
}
|
||||
|
||||
rc = opal_hash_table_get_value_uint32(&btl_proc->btl_index_to_endpoint, tcp_btl->btl_index, (void **)&remote_addr);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
opal_output_verbose(10, opal_btl_base_framework.framework_output,
|
||||
"btl:tcp: host %s, process %s UNREACHABLE",
|
||||
proc_hostname,
|
||||
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
|
||||
if (9 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) {
|
||||
char *proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal);
|
||||
opal_output(0, "btl:tcp: host %s, process %s UNREACHABLE",
|
||||
proc_hostname,
|
||||
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
|
||||
free(proc_hostname);
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
btl_endpoint->endpoint_addr = remote_addr;
|
||||
@ -685,14 +683,15 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr
|
||||
}
|
||||
addr_str = tmp;
|
||||
}
|
||||
tmp = opal_get_proc_hostname(btl_proc->proc_opal);
|
||||
opal_show_help("help-mpi-btl-tcp.txt", "dropped inbound connection",
|
||||
true, opal_process_info.nodename,
|
||||
getpid(),
|
||||
btl_proc->proc_opal->proc_hostname,
|
||||
getpid(), tmp,
|
||||
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name),
|
||||
opal_net_get_hostname((struct sockaddr*)addr),
|
||||
btl_proc->proc_endpoint_count,
|
||||
(NULL == addr_str) ? "NONE" : addr_str);
|
||||
free(tmp);
|
||||
if (NULL != addr_str) {
|
||||
free(addr_str);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2019 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -279,13 +279,15 @@ opal_btl_usnic_check_connectivity(opal_btl_usnic_module_t *module,
|
||||
{
|
||||
if (OPAL_LIKELY(mca_btl_usnic_component.connectivity_enabled) &&
|
||||
OPAL_UNLIKELY(!endpoint->endpoint_connectivity_checked)) {
|
||||
char *host = opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal);
|
||||
opal_btl_usnic_connectivity_ping(module->local_modex.ipv4_addr,
|
||||
module->local_modex.connectivity_udp_port,
|
||||
endpoint->endpoint_remote_modex.ipv4_addr,
|
||||
endpoint->endpoint_remote_modex.netmask,
|
||||
endpoint->endpoint_remote_modex.connectivity_udp_port,
|
||||
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
|
||||
host,
|
||||
endpoint->endpoint_remote_modex.max_msg_size);
|
||||
free(host);
|
||||
endpoint->endpoint_connectivity_checked = true;
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -215,8 +215,10 @@ static int map_output_procs(FILE *fp)
|
||||
/* Loop over and print the sorted module device information */
|
||||
int ret = OPAL_SUCCESS;
|
||||
for (i = 0; i < num_procs; ++i) {
|
||||
char *errhost = opal_get_proc_hostname(procs[i]->proc_opal);
|
||||
fprintf(fp, "peer=%d,", procs[i]->proc_opal->proc_name.vpid);
|
||||
fprintf(fp, "hostname=%s,", opal_get_proc_hostname(procs[i]->proc_opal));
|
||||
fprintf(fp, "hostname=%s,", errhost);
|
||||
free(errhost);
|
||||
if (OPAL_SUCCESS != map_output_endpoints(fp, procs[i])) {
|
||||
break;
|
||||
}
|
||||
@ -244,9 +246,10 @@ void opal_btl_usnic_connectivity_map(void)
|
||||
|
||||
/* Filename is of the form: <prefix>-<hostname>.<pid>.<job>.<MCW
|
||||
rank>.txt */
|
||||
host =
|
||||
opal_asprintf(&filename, "%s-%s.pid%d.job%d.mcwrank%d.txt",
|
||||
mca_btl_usnic_component.connectivity_map_prefix,
|
||||
opal_get_proc_hostname(opal_proc_local_get()),
|
||||
opal_process_info.nodename,
|
||||
getpid(),
|
||||
opal_proc_local_get()->proc_name.jobid,
|
||||
opal_proc_local_get()->proc_name.vpid);
|
||||
|
@ -15,7 +15,7 @@
|
||||
* Copyright (c) 2009-2019 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -102,6 +102,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
||||
int rc;
|
||||
opal_proc_t* my_proc;
|
||||
size_t num_created = 0;
|
||||
char *errhost;
|
||||
|
||||
/* get pointer to my proc structure */
|
||||
my_proc = opal_proc_local_get();
|
||||
@ -143,11 +144,14 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
||||
if (OPAL_ERR_UNREACH == rc) {
|
||||
/* If the peer doesn't have usnic modex info, then we just
|
||||
skip it */
|
||||
opal_output_verbose(75, USNIC_OUT,
|
||||
"btl:usnic:add_procs:%s: peer %s on %s does not have usnic modex info; skipping",
|
||||
module->linux_device_name,
|
||||
usnic_compat_proc_name_print(&opal_proc->proc_name),
|
||||
opal_get_proc_hostname(opal_proc));
|
||||
if (74 < opal_output_get_verbosity(USNIC_OUT)) {
|
||||
errhost = opal_get_proc_hostname(opal_proc);
|
||||
opal_output(0, "btl:usnic:add_procs:%s: peer %s on %s does not have usnic modex info; skipping",
|
||||
module->linux_device_name,
|
||||
usnic_compat_proc_name_print(&opal_proc->proc_name),
|
||||
errhost);
|
||||
free(errhost);
|
||||
}
|
||||
continue;
|
||||
} else if (OPAL_SUCCESS != rc) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
@ -159,11 +163,14 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
||||
rc = opal_btl_usnic_create_endpoint(module, usnic_proc,
|
||||
&usnic_endpoint);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic:add_procs:%s: unable to create endpoint to peer %s on %s",
|
||||
module->linux_device_name,
|
||||
usnic_compat_proc_name_print(&opal_proc->proc_name),
|
||||
opal_get_proc_hostname(opal_proc));
|
||||
if (4 < opal_output_get_verbosity(USNIC_OUT)) {
|
||||
errhost = opal_get_proc_hostname(opal_proc);
|
||||
opal_output(0, "btl:usnic:add_procs:%s: unable to create endpoint to peer %s on %s",
|
||||
module->linux_device_name,
|
||||
usnic_compat_proc_name_print(&opal_proc->proc_name),
|
||||
errhost);
|
||||
free(errhost);
|
||||
}
|
||||
OBJ_RELEASE(usnic_proc);
|
||||
continue;
|
||||
}
|
||||
@ -221,6 +228,8 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
||||
static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
|
||||
opal_btl_usnic_endpoint_t *endpoint)
|
||||
{
|
||||
char *errhost;
|
||||
|
||||
/* Only show the warning if it is enabled */
|
||||
if (!mca_btl_usnic_component.show_route_failures) {
|
||||
return;
|
||||
@ -236,13 +245,15 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
|
||||
module->linux_device_name,
|
||||
module->if_ipv4_addr_str,
|
||||
remote);
|
||||
errhost = opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal);
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "unreachable peer IP",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->if_ipv4_addr_str,
|
||||
module->linux_device_name,
|
||||
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
|
||||
errhost,
|
||||
remote);
|
||||
free(errhost);
|
||||
}
|
||||
|
||||
/* A bunch of calls to fi_av_insert() were previously
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -641,14 +641,15 @@ static int match_modex(opal_btl_usnic_module_t *module,
|
||||
if (*index_out >= 0 &&
|
||||
proc->proc_modex[*index_out].max_msg_size !=
|
||||
(uint16_t) module->fabric_info->ep_attr->max_msg_size) {
|
||||
char *errhost = opal_get_proc_hostname(proc->proc_opal);
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
module->linux_device_name,
|
||||
module->fabric_info->ep_attr->max_msg_size,
|
||||
(NULL == proc->proc_opal->proc_hostname) ?
|
||||
"unknown" : proc->proc_opal->proc_hostname,
|
||||
errhost,
|
||||
proc->proc_modex[*index_out].max_msg_size);
|
||||
free(errhost);
|
||||
*index_out = -1;
|
||||
return OPAL_ERR_UNREACH;
|
||||
}
|
||||
|
@ -14,7 +14,7 @@
|
||||
* Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2015 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -104,7 +104,7 @@ attach_and_init(opal_shmem_ds_t *shmem_bufp,
|
||||
/* is addr past end of the shared memory segment? */
|
||||
if ((unsigned char *)seg + shmem_bufp->seg_size < addr) {
|
||||
opal_show_help("help-mpi-common-sm.txt", "mmap too small", 1,
|
||||
opal_proc_local_get()->proc_hostname,
|
||||
opal_process_info.nodename,
|
||||
(unsigned long)shmem_bufp->seg_size,
|
||||
(unsigned long)size_ctl_structure,
|
||||
(unsigned long)data_seg_alignment);
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2015-2020 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -226,7 +226,7 @@ int mca_common_sm_mpool_ft_event(int state) {
|
||||
/* Record the shared memory filename */
|
||||
opal_asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s",
|
||||
opal_process_info.job_session_dir,
|
||||
opal_proc_local_get()->proc_hostname );
|
||||
opal_process_info.nodename );
|
||||
/* Disabled to get FT code compiled again
|
||||
* TODO: FIXIT soon
|
||||
orte_sstore.set_attr(orte_sstore_handle_current, SSTORE_METADATA_LOCAL_TOUCH, file_name);
|
||||
|
@ -18,6 +18,7 @@
|
||||
* Copyright (c) 2015-2018 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||
* Copyright (c) 2020 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -192,13 +193,13 @@ void mca_mpool_base_tree_print(int show_up_to_mem_leaks)
|
||||
show_up_to_mem_leaks < 0) {
|
||||
opal_show_help("help-mpool-base.txt", "all mem leaks",
|
||||
true, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||
opal_proc_local_get()->proc_hostname,
|
||||
opal_process_info.nodename,
|
||||
getpid(), leak_msg);
|
||||
} else {
|
||||
int i = num_leaks - show_up_to_mem_leaks;
|
||||
opal_show_help("help-mpool-base.txt", "some mem leaks",
|
||||
true, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||
opal_proc_local_get()->proc_hostname,
|
||||
opal_process_info.nodename,
|
||||
getpid(), leak_msg, i,
|
||||
(i > 1) ? "s were" : " was",
|
||||
(i > 1) ? "are" : "is");
|
||||
|
@ -12,6 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2020 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -71,7 +72,7 @@ mca_rcache_base_module_t* mca_rcache_base_module_create (const char* name, void
|
||||
} else if (1 == opal_leave_pinned || opal_leave_pinned_pipeline) {
|
||||
opal_show_help("help-rcache-base.txt", "leave pinned failed",
|
||||
true, name, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||
opal_proc_local_get()->proc_hostname);
|
||||
opal_process_info.nodename);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -13,6 +13,7 @@
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012-2015 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2020 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -67,7 +68,7 @@ void mca_rcache_base_mem_cb (void* base, size_t size, void* cbdata, bool from_al
|
||||
int len;
|
||||
len = snprintf(msg, sizeof(msg), "[%s:%05d] Attempt to free memory that is still in "
|
||||
"use by an ongoing MPI communication (buffer %p, size %lu). MPI job "
|
||||
"will now abort.\n", opal_proc_local_get()->proc_hostname,
|
||||
"will now abort.\n", opal_process_info.nodename,
|
||||
getpid(), base, (unsigned long) size);
|
||||
msg[sizeof(msg) - 1] = '\0';
|
||||
write(2, msg, len);
|
||||
@ -75,7 +76,7 @@ void mca_rcache_base_mem_cb (void* base, size_t size, void* cbdata, bool from_al
|
||||
opal_show_help("help-rcache-base.txt",
|
||||
"cannot deregister in-use memory", true,
|
||||
current->rcache_component->rcache_version.mca_component_name,
|
||||
opal_proc_local_get()->proc_hostname,
|
||||
opal_process_info.nodename,
|
||||
base, (unsigned long) size);
|
||||
}
|
||||
|
||||
|
@ -41,11 +41,10 @@ opal_process_info_t opal_process_info = {
|
||||
static opal_proc_t opal_local_proc = {
|
||||
{ .opal_list_next = NULL,
|
||||
.opal_list_prev = NULL},
|
||||
{OPAL_JOBID_INVALID, OPAL_VPID_INVALID},
|
||||
0,
|
||||
0,
|
||||
NULL,
|
||||
NULL
|
||||
.proc_name = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID},
|
||||
.proc_arch = 0,
|
||||
.proc_flags = 0,
|
||||
.proc_convertor = NULL
|
||||
};
|
||||
static opal_proc_t* opal_proc_my_name = &opal_local_proc;
|
||||
|
||||
@ -55,14 +54,12 @@ static void opal_proc_construct(opal_proc_t* proc)
|
||||
proc->proc_convertor = NULL;
|
||||
proc->proc_flags = 0;
|
||||
proc->proc_name = *OPAL_NAME_INVALID;
|
||||
proc->proc_hostname = NULL;
|
||||
}
|
||||
|
||||
static void opal_proc_destruct(opal_proc_t* proc)
|
||||
{
|
||||
proc->proc_flags = 0;
|
||||
proc->proc_name = *OPAL_NAME_INVALID;
|
||||
proc->proc_hostname = NULL;
|
||||
proc->proc_convertor = NULL;
|
||||
}
|
||||
|
||||
@ -188,30 +185,26 @@ struct opal_proc_t *(*opal_proc_for_name) (const opal_process_name_t name) = opa
|
||||
char* opal_get_proc_hostname(const opal_proc_t *proc)
|
||||
{
|
||||
int ret;
|
||||
char *hostname;
|
||||
|
||||
/* if the proc is NULL, then we can't know */
|
||||
if (NULL == proc) {
|
||||
return "unknown";
|
||||
return strdup("unknown");
|
||||
}
|
||||
|
||||
/* if it is my own hostname we are after, then just hand back
|
||||
* a malloc'd copy of the value in opal_process_info */
|
||||
if (proc == opal_proc_my_name) {
|
||||
return opal_process_info.nodename;
|
||||
}
|
||||
|
||||
/* see if we already have the data - if so, pass it back */
|
||||
if (NULL != proc->proc_hostname) {
|
||||
return proc->proc_hostname;
|
||||
return strdup(opal_process_info.nodename);
|
||||
}
|
||||
|
||||
/* if we don't already have it, then try to get it */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc->proc_name,
|
||||
(char**)&(proc->proc_hostname), PMIX_STRING);
|
||||
(char**)&hostname, PMIX_STRING);
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
return "unknown"; // return something so the caller doesn't segfault
|
||||
return strdup("unknown"); // return something so the caller doesn't segfault
|
||||
}
|
||||
|
||||
/* the returned string is malloc'd - the caller must free it */
|
||||
return proc->proc_hostname;
|
||||
return hostname;
|
||||
}
|
||||
|
@ -48,7 +48,7 @@
|
||||
#define OPAL_VPID_WILDCARD (OPAL_VPID_MAX + 1)
|
||||
|
||||
#define OPAL_PROC_MY_NAME (opal_proc_local_get()->proc_name)
|
||||
#define OPAL_PROC_MY_HOSTNAME (opal_proc_local_get()->proc_hostname)
|
||||
#define OPAL_PROC_MY_HOSTNAME (opal_process_info.nodename)
|
||||
|
||||
#define OPAL_NAME_WILDCARD (&opal_name_wildcard)
|
||||
OPAL_DECLSPEC extern opal_process_name_t opal_name_wildcard;
|
||||
@ -91,9 +91,6 @@ typedef struct opal_proc_t {
|
||||
opal_hwloc_locality_t proc_flags;
|
||||
/** Base convertor for the proc described by this process */
|
||||
struct opal_convertor_t* proc_convertor;
|
||||
/** A pointer to the name of this host - data is
|
||||
* actually stored outside of this framework. */
|
||||
char* proc_hostname;
|
||||
} opal_proc_t;
|
||||
OBJ_CLASS_DECLARATION(opal_proc_t);
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user