ompi_proc_t size reduction: part 1
We currently save the hostname of a proc when we create the ompi_proc_t for it. This was originally done because the only method we had for discovering the host of a proc was to include that info in the modex, and we therefore had to store it somewhere proc-local. Obviously, this carried a memory penalty for storing all those strings, and so we added a "cutoff" parameter so that we wouldn't collect hostnames above a certain number of procs. Unfortunately, this still results in an 8-byte/proc memory cost as we have a char* pointer in the opal_proc_t that is contained in the ompi_proc_t so that we can store the hostname of the other procs if we fall below the cutoff. At scale, this can consume a fair amount of memory. With the switch to relying on PMIx, there is no longer a need to cache the proc hostnames. Using the "optional" feature of PMIx_Get, we restrict the retrieval to be purely proc-local - i.e., we retrieve the info either via shared memory or from within the proc-internal hash storage (depending upon the active PMIx components). Thus, the retrieval of a hostname is purely a local operation involving no communication. All RMs are required to provide a complete hostname map of all procs at startup. Thus, we have full access to all hostnames without including them in a modex or having to cache them on each proc. This allows us to remove the char* pointer from the opal_proc_t, saving us 8 bytes/proc. Unfortunately, PMIx_Get does not currently support the return of a static pointer to memory. Thus, even though PMIx has the hostname in its memory, it can only return a malloc'd version of it. I have therefore ensured that the return from opal_get_proc_hostname is consistently malloc'd and free'd wherever used. This shouldn't be a burden as the hostname is only used in one of two circumstances: (a) in an error message, or (b) in verbose output for debugging purposes. Thus, there should be no performance penalty associated with the malloc/free requirement. 
PMIx will eventually be returning static pointers, and so we can eventually simplify this method and return a "const char*" - but as noted, this really isn't an issue even today. Signed-off-by: Ralph Castain <rhc@pmix.org>
Этот коммит содержится в:
родитель
48b52478ef
Коммит
33ab928e1b
@ -13,7 +13,7 @@
|
|||||||
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2013 Intel, Inc. All rights reserved
|
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||||
* Copyright (c) 2014 Research Organization for Information Science
|
* Copyright (c) 2014 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
@ -233,12 +233,14 @@ static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_e
|
|||||||
|
|
||||||
if (!bml_btl || bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity) {
|
if (!bml_btl || bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity) {
|
||||||
/* this btl has higher exclusivity than an existing btl or none exists */
|
/* this btl has higher exclusivity than an existing btl or none exists */
|
||||||
|
if (0 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) {
|
||||||
opal_output_verbose(1, opal_btl_base_framework.framework_output,
|
char *errhost = opal_get_proc_hostname(&proc->super);
|
||||||
"mca: bml: Using %s btl for send to %s on node %s",
|
opal_output(0, "mca: bml: Using %s btl for send to %s on node %s",
|
||||||
btl->btl_component->btl_version.mca_component_name,
|
btl->btl_component->btl_version.mca_component_name,
|
||||||
OMPI_NAME_PRINT(&proc->super.proc_name),
|
OMPI_NAME_PRINT(&proc->super.proc_name),
|
||||||
proc->super.proc_hostname);
|
errhost);
|
||||||
|
free(errhost);
|
||||||
|
}
|
||||||
|
|
||||||
/* cache the endpoint on the proc */
|
/* cache the endpoint on the proc */
|
||||||
if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) {
|
if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) {
|
||||||
@ -252,15 +254,16 @@ static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_e
|
|||||||
* calculate the bitwise OR of the btl flags
|
* calculate the bitwise OR of the btl flags
|
||||||
*/
|
*/
|
||||||
bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
|
bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
|
||||||
} else {
|
} else if (19 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) {
|
||||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
char *errhost = opal_get_proc_hostname(&proc->super);
|
||||||
"mca: bml: Not using %s btl for send to %s on node %s "
|
opal_output(0, "mca: bml: Not using %s btl for send to %s on node %s "
|
||||||
"because %s btl has higher exclusivity (%d > %d)",
|
"because %s btl has higher exclusivity (%d > %d)",
|
||||||
btl->btl_component->btl_version.mca_component_name,
|
btl->btl_component->btl_version.mca_component_name,
|
||||||
OMPI_NAME_PRINT(&proc->super.proc_name), proc->super.proc_hostname,
|
OMPI_NAME_PRINT(&proc->super.proc_name), errhost,
|
||||||
bml_btl->btl->btl_component->btl_version.mca_component_name,
|
bml_btl->btl->btl_component->btl_version.mca_component_name,
|
||||||
bml_btl->btl->btl_exclusivity,
|
bml_btl->btl->btl_exclusivity,
|
||||||
btl->btl_exclusivity);
|
btl->btl_exclusivity);
|
||||||
|
free(errhost);
|
||||||
}
|
}
|
||||||
|
|
||||||
btl_in_use = true;
|
btl_in_use = true;
|
||||||
@ -424,14 +427,16 @@ static int mca_bml_r2_add_proc (struct ompi_proc_t *proc)
|
|||||||
OBJ_RELEASE(bml_endpoint);
|
OBJ_RELEASE(bml_endpoint);
|
||||||
/* no btl is available for this proc */
|
/* no btl is available for this proc */
|
||||||
if (mca_bml_r2.show_unreach_errors) {
|
if (mca_bml_r2.show_unreach_errors) {
|
||||||
|
char *errhost = opal_get_proc_hostname(&proc->super);
|
||||||
|
char *localhost = opal_get_proc_hostname(&ompi_proc_local_proc->super);
|
||||||
opal_show_help ("help-mca-bml-r2.txt", "unreachable proc", true,
|
opal_show_help ("help-mca-bml-r2.txt", "unreachable proc", true,
|
||||||
OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
|
OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
|
||||||
(NULL != ompi_proc_local_proc->super.proc_hostname ?
|
localhost,
|
||||||
ompi_proc_local_proc->super.proc_hostname : "unknown!"),
|
|
||||||
OMPI_NAME_PRINT(&(proc->super.proc_name)),
|
OMPI_NAME_PRINT(&(proc->super.proc_name)),
|
||||||
(NULL != proc->super.proc_hostname ?
|
errhost,
|
||||||
proc->super.proc_hostname : "unknown!"),
|
|
||||||
btl_names);
|
btl_names);
|
||||||
|
free(errhost);
|
||||||
|
free(localhost);
|
||||||
}
|
}
|
||||||
|
|
||||||
return OMPI_ERR_UNREACH;
|
return OMPI_ERR_UNREACH;
|
||||||
@ -578,14 +583,16 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
|||||||
if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
|
if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
|
||||||
ret = OMPI_ERR_UNREACH;
|
ret = OMPI_ERR_UNREACH;
|
||||||
if (mca_bml_r2.show_unreach_errors) {
|
if (mca_bml_r2.show_unreach_errors) {
|
||||||
|
char *errhost = opal_get_proc_hostname(&proc->super);
|
||||||
|
char *localhost = opal_get_proc_hostname(&ompi_proc_local_proc->super);
|
||||||
opal_show_help("help-mca-bml-r2.txt", "unreachable proc", true,
|
opal_show_help("help-mca-bml-r2.txt", "unreachable proc", true,
|
||||||
OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
|
OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
|
||||||
(NULL != ompi_proc_local_proc->super.proc_hostname ?
|
localhost,
|
||||||
ompi_proc_local_proc->super.proc_hostname : "unknown!"),
|
|
||||||
OMPI_NAME_PRINT(&(proc->super.proc_name)),
|
OMPI_NAME_PRINT(&(proc->super.proc_name)),
|
||||||
(NULL != proc->super.proc_hostname ?
|
errhost,
|
||||||
proc->super.proc_hostname : "unknown!"),
|
|
||||||
btl_names);
|
btl_names);
|
||||||
|
free(errhost);
|
||||||
|
free(localhost);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
|
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -98,10 +98,11 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
|
|||||||
(void**)&ep_name,
|
(void**)&ep_name,
|
||||||
&size);
|
&size);
|
||||||
if (OMPI_SUCCESS != ret) {
|
if (OMPI_SUCCESS != ret) {
|
||||||
|
char *errhost = opal_get_proc_hostname(&procs[i]->super);
|
||||||
opal_show_help("help-mtl-ofi.txt", "modex failed",
|
opal_show_help("help-mtl-ofi.txt", "modex failed",
|
||||||
true, ompi_process_info.nodename,
|
true, ompi_process_info.nodename,
|
||||||
procs[i]->super.proc_hostname,
|
errhost, opal_strerror(ret), ret);
|
||||||
opal_strerror(ret), ret);
|
free(errhost);
|
||||||
goto bail;
|
goto bail;
|
||||||
}
|
}
|
||||||
memcpy(&ep_names[i*namelen], ep_name, namelen);
|
memcpy(&ep_names[i*namelen], ep_name, namelen);
|
||||||
|
@ -324,8 +324,9 @@ ompi_mtl_psm2_add_procs(struct mca_mtl_base_module_t *mtl,
|
|||||||
errstr ? errstr : "unknown connect error");
|
errstr ? errstr : "unknown connect error");
|
||||||
for (j = 0; j < (int) nprocs; j++) {
|
for (j = 0; j < (int) nprocs; j++) {
|
||||||
if (errs_out[j] == thiserr) {
|
if (errs_out[j] == thiserr) {
|
||||||
opal_output(0, " %s", (NULL == procs[j]->super.proc_hostname) ?
|
char *errhost = opal_get_proc_hostname(&procs[j]->super);
|
||||||
"unknown" : procs[j]->super.proc_hostname);
|
opal_output(0, " %s", errhost);
|
||||||
|
free(errhost);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
opal_output(0, "\n");
|
opal_output(0, "\n");
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
|
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -370,12 +370,15 @@ mca_pml_base_pml_check_selected(const char *my_pml,
|
|||||||
/* if that module doesn't match my own, return an error */
|
/* if that module doesn't match my own, return an error */
|
||||||
if ((size != strlen(my_pml) + 1) ||
|
if ((size != strlen(my_pml) + 1) ||
|
||||||
(0 != strcmp(my_pml, remote_pml))) {
|
(0 != strcmp(my_pml, remote_pml))) {
|
||||||
|
char *errhost = opal_get_proc_hostname(&procs[0]->super);
|
||||||
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
|
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
|
||||||
OMPI_NAME_PRINT(&ompi_proc_local()->super.proc_name),
|
OMPI_NAME_PRINT(&ompi_proc_local()->super.proc_name),
|
||||||
my_pml, OMPI_NAME_PRINT(&procs[0]->super.proc_name),
|
my_pml, OMPI_NAME_PRINT(&procs[0]->super.proc_name),
|
||||||
(NULL == procs[0]->super.proc_hostname) ? "unknown" : procs[0]->super.proc_hostname,
|
errhost,
|
||||||
remote_pml);
|
remote_pml);
|
||||||
free(remote_pml); /* cleanup before returning */
|
free(remote_pml);
|
||||||
|
free(errhost);
|
||||||
|
/* cleanup before returning */
|
||||||
return OMPI_ERR_UNREACH;
|
return OMPI_ERR_UNREACH;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
|
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
|
||||||
* Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
|
* Copyright (c) 2020 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -202,14 +203,17 @@ void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t
|
|||||||
/* Find the corresponding bml and adjust the flag to support CUDA get */
|
/* Find the corresponding bml and adjust the flag to support CUDA get */
|
||||||
for( i = 0; i < (int)ep->btl_send.arr_size; i++ ) {
|
for( i = 0; i < (int)ep->btl_send.arr_size; i++ ) {
|
||||||
if( ep->btl_send.bml_btls[i].btl == btl ) {
|
if( ep->btl_send.bml_btls[i].btl == btl ) {
|
||||||
|
if (4 < opal_output_get_verbosity(btl_verbose_stream)) {
|
||||||
|
char *errhost = opal_get_proc_hostname(&errproc->super);
|
||||||
|
opal_output(0, "BTL %s: rank=%d enabling CUDA IPC "
|
||||||
|
"to rank=%d on node=%s \n",
|
||||||
|
btl->btl_component->btl_version.mca_component_name,
|
||||||
|
OMPI_PROC_MY_NAME->vpid,
|
||||||
|
((ompi_process_name_t*)&errproc->super.proc_name)->vpid,
|
||||||
|
errhost);
|
||||||
|
free(errhost);
|
||||||
|
}
|
||||||
ep->btl_send.bml_btls[i].btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
|
ep->btl_send.bml_btls[i].btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
|
||||||
opal_output_verbose(5, btl_verbose_stream,
|
|
||||||
"BTL %s: rank=%d enabling CUDA IPC "
|
|
||||||
"to rank=%d on node=%s \n",
|
|
||||||
btl->btl_component->btl_version.mca_component_name,
|
|
||||||
OMPI_PROC_MY_NAME->vpid,
|
|
||||||
((ompi_process_name_t*)&errproc->super.proc_name)->vpid,
|
|
||||||
errproc->super.proc_hostname);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -86,9 +86,6 @@ void ompi_proc_destruct(ompi_proc_t* proc)
|
|||||||
* destroyed here. It will be destroyed later when the ompi_datatype_finalize is called.
|
* destroyed here. It will be destroyed later when the ompi_datatype_finalize is called.
|
||||||
*/
|
*/
|
||||||
OBJ_RELEASE( proc->super.proc_convertor );
|
OBJ_RELEASE( proc->super.proc_convertor );
|
||||||
if (NULL != proc->super.proc_hostname) {
|
|
||||||
free(proc->super.proc_hostname);
|
|
||||||
}
|
|
||||||
opal_mutex_lock (&ompi_proc_lock);
|
opal_mutex_lock (&ompi_proc_lock);
|
||||||
opal_list_remove_item(&ompi_proc_list, (opal_list_item_t*)proc);
|
opal_list_remove_item(&ompi_proc_list, (opal_list_item_t*)proc);
|
||||||
opal_hash_table_remove_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name));
|
opal_hash_table_remove_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name));
|
||||||
@ -135,22 +132,12 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
|
|||||||
*/
|
*/
|
||||||
int ompi_proc_complete_init_single (ompi_proc_t *proc)
|
int ompi_proc_complete_init_single (ompi_proc_t *proc)
|
||||||
{
|
{
|
||||||
int ret;
|
|
||||||
|
|
||||||
if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) &&
|
if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) &&
|
||||||
(OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) {
|
(OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) {
|
||||||
/* nothing else to do */
|
/* nothing else to do */
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* we can retrieve the hostname at no cost because it
|
|
||||||
* was provided at startup - but make it optional so
|
|
||||||
* we don't chase after it if some system doesn't
|
|
||||||
* provide it */
|
|
||||||
proc->super.proc_hostname = NULL;
|
|
||||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc->super.proc_name,
|
|
||||||
(char**)&(proc->super.proc_hostname), PMIX_STRING);
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||||
/* get the remote architecture - this might force a modex except
|
/* get the remote architecture - this might force a modex except
|
||||||
* for those environments where the RM provides it */
|
* for those environments where the RM provides it */
|
||||||
@ -264,7 +251,6 @@ int ompi_proc_init(void)
|
|||||||
/* set local process data */
|
/* set local process data */
|
||||||
ompi_proc_local_proc = proc;
|
ompi_proc_local_proc = proc;
|
||||||
proc->super.proc_flags = OPAL_PROC_ALL_LOCAL;
|
proc->super.proc_flags = OPAL_PROC_ALL_LOCAL;
|
||||||
proc->super.proc_hostname = strdup(ompi_process_info.nodename);
|
|
||||||
proc->super.proc_arch = opal_local_arch;
|
proc->super.proc_arch = opal_local_arch;
|
||||||
/* Register the local proc with OPAL */
|
/* Register the local proc with OPAL */
|
||||||
opal_proc_local_set(&proc->super);
|
opal_proc_local_set(&proc->super);
|
||||||
@ -609,7 +595,6 @@ int ompi_proc_refresh(void)
|
|||||||
if (i == OMPI_PROC_MY_NAME->vpid) {
|
if (i == OMPI_PROC_MY_NAME->vpid) {
|
||||||
ompi_proc_local_proc = proc;
|
ompi_proc_local_proc = proc;
|
||||||
proc->super.proc_flags = OPAL_PROC_ALL_LOCAL;
|
proc->super.proc_flags = OPAL_PROC_ALL_LOCAL;
|
||||||
proc->super.proc_hostname = ompi_process_info.nodename;
|
|
||||||
proc->super.proc_arch = opal_local_arch;
|
proc->super.proc_arch = opal_local_arch;
|
||||||
opal_proc_local_set(&proc->super);
|
opal_proc_local_set(&proc->super);
|
||||||
} else {
|
} else {
|
||||||
@ -676,13 +661,6 @@ ompi_proc_pack(ompi_proc_t **proclist, int proclistsize,
|
|||||||
opal_mutex_unlock (&ompi_proc_lock);
|
opal_mutex_unlock (&ompi_proc_lock);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
/* pass the name of the host this proc is on */
|
|
||||||
rc = opal_dss.pack(buf, &(proc->super.proc_hostname), 1, OPAL_STRING);
|
|
||||||
if(rc != OPAL_SUCCESS) {
|
|
||||||
OMPI_ERROR_LOG(rc);
|
|
||||||
opal_mutex_unlock (&ompi_proc_lock);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
opal_mutex_unlock (&ompi_proc_lock);
|
opal_mutex_unlock (&ompi_proc_lock);
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
@ -747,10 +725,10 @@ ompi_proc_unpack(opal_buffer_t* buf,
|
|||||||
int32_t count=1;
|
int32_t count=1;
|
||||||
ompi_process_name_t new_name;
|
ompi_process_name_t new_name;
|
||||||
uint32_t new_arch;
|
uint32_t new_arch;
|
||||||
char *new_hostname;
|
|
||||||
bool isnew = false;
|
bool isnew = false;
|
||||||
int rc;
|
int rc;
|
||||||
char *nspace;
|
char *nspace;
|
||||||
|
uint16_t u16, *u16ptr;
|
||||||
|
|
||||||
rc = opal_dss.unpack(buf, &new_name, &count, OMPI_NAME);
|
rc = opal_dss.unpack(buf, &new_name, &count, OMPI_NAME);
|
||||||
if (rc != OPAL_SUCCESS) {
|
if (rc != OPAL_SUCCESS) {
|
||||||
@ -774,13 +752,6 @@ ompi_proc_unpack(opal_buffer_t* buf,
|
|||||||
free(newprocs);
|
free(newprocs);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
rc = opal_dss.unpack(buf, &new_hostname, &count, OPAL_STRING);
|
|
||||||
if (rc != OPAL_SUCCESS) {
|
|
||||||
OMPI_ERROR_LOG(rc);
|
|
||||||
free(plist);
|
|
||||||
free(newprocs);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
/* see if this proc is already on our ompi_proc_list */
|
/* see if this proc is already on our ompi_proc_list */
|
||||||
plist[i] = ompi_proc_find_and_add(&new_name, &isnew);
|
plist[i] = ompi_proc_find_and_add(&new_name, &isnew);
|
||||||
if (isnew) {
|
if (isnew) {
|
||||||
@ -798,27 +769,25 @@ ompi_proc_unpack(opal_buffer_t* buf,
|
|||||||
OBJ_RELEASE(plist[i]->super.proc_convertor);
|
OBJ_RELEASE(plist[i]->super.proc_convertor);
|
||||||
plist[i]->super.proc_convertor = opal_convertor_create(plist[i]->super.proc_arch, 0);
|
plist[i]->super.proc_convertor = opal_convertor_create(plist[i]->super.proc_arch, 0);
|
||||||
#else
|
#else
|
||||||
|
char *errhost = opal_get_proc_hostname(&plist[i]->super);
|
||||||
opal_show_help("help-mpi-runtime.txt",
|
opal_show_help("help-mpi-runtime.txt",
|
||||||
"heterogeneous-support-unavailable",
|
"heterogeneous-support-unavailable",
|
||||||
true, ompi_process_info.nodename,
|
true, ompi_process_info.nodename,
|
||||||
new_hostname == NULL ? "<hostname unavailable>" :
|
errhost);
|
||||||
new_hostname);
|
|
||||||
free(plist);
|
free(plist);
|
||||||
free(newprocs);
|
free(newprocs);
|
||||||
|
free(errhost);
|
||||||
return OMPI_ERR_NOT_SUPPORTED;
|
return OMPI_ERR_NOT_SUPPORTED;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NULL != new_hostname) {
|
/* get the locality information - all RTEs are required
|
||||||
if (0 == strcmp(ompi_proc_local_proc->super.proc_hostname, new_hostname)) {
|
* to provide this information at startup */
|
||||||
plist[i]->super.proc_flags |= (OPAL_PROC_ON_NODE | OPAL_PROC_ON_CU | OPAL_PROC_ON_CLUSTER);
|
u16ptr = &u16;
|
||||||
}
|
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY, &plist[i]->super.proc_name, &u16ptr, PMIX_UINT16);
|
||||||
|
if (OPAL_SUCCESS == rc) {
|
||||||
/* Save the hostname */
|
plist[i]->super.proc_flags = u16;
|
||||||
plist[i]->super.proc_hostname = new_hostname;
|
|
||||||
}
|
}
|
||||||
} else if (NULL != new_hostname) {
|
|
||||||
free(new_hostname);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -59,13 +59,15 @@ OPAL_DECLSPEC extern int mca_btl_base_out(const char*, ...) __opal_attribute_for
|
|||||||
|
|
||||||
#define BTL_PEER_ERROR(proc, args) \
|
#define BTL_PEER_ERROR(proc, args) \
|
||||||
do { \
|
do { \
|
||||||
|
char *errhost; \
|
||||||
mca_btl_base_err("%s[%s:%d:%s] from %s ", \
|
mca_btl_base_err("%s[%s:%d:%s] from %s ", \
|
||||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
|
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
|
||||||
__FILE__, __LINE__, __func__, \
|
__FILE__, __LINE__, __func__, \
|
||||||
opal_process_info.nodename); \
|
opal_process_info.nodename); \
|
||||||
if (proc) { \
|
if (proc) { \
|
||||||
mca_btl_base_err("to: %s ", \
|
errhost = opal_get_proc_hostname(proc); \
|
||||||
opal_get_proc_hostname(proc)); \
|
mca_btl_base_err("to: %s ", errhost); \
|
||||||
|
free(errhost); \
|
||||||
} \
|
} \
|
||||||
mca_btl_base_err args; \
|
mca_btl_base_err args; \
|
||||||
mca_btl_base_err("\n"); \
|
mca_btl_base_err("\n"); \
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved.
|
||||||
|
* Copyright (c) 2020 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -44,6 +45,7 @@
|
|||||||
|
|
||||||
#include "opal/opal_socket_errno.h"
|
#include "opal/opal_socket_errno.h"
|
||||||
#include "opal/mca/btl/base/btl_base_error.h"
|
#include "opal/mca/btl/base/btl_base_error.h"
|
||||||
|
#include "opal/util/proc.h"
|
||||||
#include "opal/util/show_help.h"
|
#include "opal/util/show_help.h"
|
||||||
|
|
||||||
#include "btl_tcp_frag.h"
|
#include "btl_tcp_frag.h"
|
||||||
@ -168,6 +170,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
|
|||||||
mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint;
|
mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint;
|
||||||
ssize_t cnt;
|
ssize_t cnt;
|
||||||
int32_t i, num_vecs, dont_copy_data = 0;
|
int32_t i, num_vecs, dont_copy_data = 0;
|
||||||
|
char *errhost;
|
||||||
|
|
||||||
repeat:
|
repeat:
|
||||||
num_vecs = frag->iov_cnt;
|
num_vecs = frag->iov_cnt;
|
||||||
@ -231,10 +234,11 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
case ECONNRESET:
|
case ECONNRESET:
|
||||||
|
errhost = opal_get_proc_hostname(btl_endpoint->endpoint_proc->proc_opal);
|
||||||
opal_show_help("help-mpi-btl-tcp.txt", "peer hung up",
|
opal_show_help("help-mpi-btl-tcp.txt", "peer hung up",
|
||||||
true, opal_process_info.nodename,
|
true, opal_process_info.nodename,
|
||||||
getpid(),
|
getpid(), errhost);
|
||||||
btl_endpoint->endpoint_proc->proc_opal->proc_hostname);
|
free(errhost);
|
||||||
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
|
||||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||||
return false;
|
return false;
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2008-2010 Oracle and/or its affiliates. All rights reserved
|
* Copyright (c) 2008-2010 Oracle and/or its affiliates. All rights reserved
|
||||||
* Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
|
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014-2016 Research Organization for Information Science
|
* Copyright (c) 2014-2016 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
|
||||||
@ -48,6 +48,7 @@
|
|||||||
#include "opal/util/proc.h"
|
#include "opal/util/proc.h"
|
||||||
#include "opal/util/show_help.h"
|
#include "opal/util/show_help.h"
|
||||||
#include "opal/util/printf.h"
|
#include "opal/util/printf.h"
|
||||||
|
#include "opal/util/proc.h"
|
||||||
#include "opal/util/string_copy.h"
|
#include "opal/util/string_copy.h"
|
||||||
#include "opal/util/bipartite_graph.h"
|
#include "opal/util/bipartite_graph.h"
|
||||||
|
|
||||||
@ -479,21 +480,18 @@ int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t* btl_proc,
|
|||||||
mca_btl_base_endpoint_t* btl_endpoint)
|
mca_btl_base_endpoint_t* btl_endpoint)
|
||||||
{
|
{
|
||||||
mca_btl_tcp_module_t* tcp_btl = btl_endpoint->endpoint_btl;
|
mca_btl_tcp_module_t* tcp_btl = btl_endpoint->endpoint_btl;
|
||||||
const char *proc_hostname;
|
|
||||||
mca_btl_tcp_addr_t *remote_addr;
|
mca_btl_tcp_addr_t *remote_addr;
|
||||||
int rc = OPAL_SUCCESS;
|
int rc = OPAL_SUCCESS;
|
||||||
|
|
||||||
if (NULL == (proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal))) {
|
|
||||||
rc = OPAL_ERR_UNREACH;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
rc = opal_hash_table_get_value_uint32(&btl_proc->btl_index_to_endpoint, tcp_btl->btl_index, (void **)&remote_addr);
|
rc = opal_hash_table_get_value_uint32(&btl_proc->btl_index_to_endpoint, tcp_btl->btl_index, (void **)&remote_addr);
|
||||||
if (OPAL_SUCCESS != rc) {
|
if (OPAL_SUCCESS != rc) {
|
||||||
opal_output_verbose(10, opal_btl_base_framework.framework_output,
|
if (9 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) {
|
||||||
"btl:tcp: host %s, process %s UNREACHABLE",
|
char *proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal);
|
||||||
proc_hostname,
|
opal_output(0, "btl:tcp: host %s, process %s UNREACHABLE",
|
||||||
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
|
proc_hostname,
|
||||||
|
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
|
||||||
|
free(proc_hostname);
|
||||||
|
}
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
btl_endpoint->endpoint_addr = remote_addr;
|
btl_endpoint->endpoint_addr = remote_addr;
|
||||||
@ -685,14 +683,15 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr
|
|||||||
}
|
}
|
||||||
addr_str = tmp;
|
addr_str = tmp;
|
||||||
}
|
}
|
||||||
|
tmp = opal_get_proc_hostname(btl_proc->proc_opal);
|
||||||
opal_show_help("help-mpi-btl-tcp.txt", "dropped inbound connection",
|
opal_show_help("help-mpi-btl-tcp.txt", "dropped inbound connection",
|
||||||
true, opal_process_info.nodename,
|
true, opal_process_info.nodename,
|
||||||
getpid(),
|
getpid(), tmp,
|
||||||
btl_proc->proc_opal->proc_hostname,
|
|
||||||
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name),
|
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name),
|
||||||
opal_net_get_hostname((struct sockaddr*)addr),
|
opal_net_get_hostname((struct sockaddr*)addr),
|
||||||
btl_proc->proc_endpoint_count,
|
btl_proc->proc_endpoint_count,
|
||||||
(NULL == addr_str) ? "NONE" : addr_str);
|
(NULL == addr_str) ? "NONE" : addr_str);
|
||||||
|
free(tmp);
|
||||||
if (NULL != addr_str) {
|
if (NULL != addr_str) {
|
||||||
free(addr_str);
|
free(addr_str);
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2014-2019 Cisco Systems, Inc. All rights reserved
|
* Copyright (c) 2014-2019 Cisco Systems, Inc. All rights reserved
|
||||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -279,13 +279,15 @@ opal_btl_usnic_check_connectivity(opal_btl_usnic_module_t *module,
|
|||||||
{
|
{
|
||||||
if (OPAL_LIKELY(mca_btl_usnic_component.connectivity_enabled) &&
|
if (OPAL_LIKELY(mca_btl_usnic_component.connectivity_enabled) &&
|
||||||
OPAL_UNLIKELY(!endpoint->endpoint_connectivity_checked)) {
|
OPAL_UNLIKELY(!endpoint->endpoint_connectivity_checked)) {
|
||||||
|
char *host = opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal);
|
||||||
opal_btl_usnic_connectivity_ping(module->local_modex.ipv4_addr,
|
opal_btl_usnic_connectivity_ping(module->local_modex.ipv4_addr,
|
||||||
module->local_modex.connectivity_udp_port,
|
module->local_modex.connectivity_udp_port,
|
||||||
endpoint->endpoint_remote_modex.ipv4_addr,
|
endpoint->endpoint_remote_modex.ipv4_addr,
|
||||||
endpoint->endpoint_remote_modex.netmask,
|
endpoint->endpoint_remote_modex.netmask,
|
||||||
endpoint->endpoint_remote_modex.connectivity_udp_port,
|
endpoint->endpoint_remote_modex.connectivity_udp_port,
|
||||||
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
|
host,
|
||||||
endpoint->endpoint_remote_modex.max_msg_size);
|
endpoint->endpoint_remote_modex.max_msg_size);
|
||||||
|
free(host);
|
||||||
endpoint->endpoint_connectivity_checked = true;
|
endpoint->endpoint_connectivity_checked = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -215,8 +215,10 @@ static int map_output_procs(FILE *fp)
|
|||||||
/* Loop over and print the sorted module device information */
|
/* Loop over and print the sorted module device information */
|
||||||
int ret = OPAL_SUCCESS;
|
int ret = OPAL_SUCCESS;
|
||||||
for (i = 0; i < num_procs; ++i) {
|
for (i = 0; i < num_procs; ++i) {
|
||||||
|
char *errhost = opal_get_proc_hostname(procs[i]->proc_opal);
|
||||||
fprintf(fp, "peer=%d,", procs[i]->proc_opal->proc_name.vpid);
|
fprintf(fp, "peer=%d,", procs[i]->proc_opal->proc_name.vpid);
|
||||||
fprintf(fp, "hostname=%s,", opal_get_proc_hostname(procs[i]->proc_opal));
|
fprintf(fp, "hostname=%s,", errhost);
|
||||||
|
free(errhost);
|
||||||
if (OPAL_SUCCESS != map_output_endpoints(fp, procs[i])) {
|
if (OPAL_SUCCESS != map_output_endpoints(fp, procs[i])) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -244,9 +246,10 @@ void opal_btl_usnic_connectivity_map(void)
|
|||||||
|
|
||||||
/* Filename is of the form: <prefix>-<hostname>.<pid>.<job>.<MCW
|
/* Filename is of the form: <prefix>-<hostname>.<pid>.<job>.<MCW
|
||||||
rank>.txt */
|
rank>.txt */
|
||||||
|
host =
|
||||||
opal_asprintf(&filename, "%s-%s.pid%d.job%d.mcwrank%d.txt",
|
opal_asprintf(&filename, "%s-%s.pid%d.job%d.mcwrank%d.txt",
|
||||||
mca_btl_usnic_component.connectivity_map_prefix,
|
mca_btl_usnic_component.connectivity_map_prefix,
|
||||||
opal_get_proc_hostname(opal_proc_local_get()),
|
opal_process_info.nodename,
|
||||||
getpid(),
|
getpid(),
|
||||||
opal_proc_local_get()->proc_name.jobid,
|
opal_proc_local_get()->proc_name.jobid,
|
||||||
opal_proc_local_get()->proc_name.vpid);
|
opal_proc_local_get()->proc_name.vpid);
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
* Copyright (c) 2009-2019 Cisco Systems, Inc. All rights reserved
|
* Copyright (c) 2009-2019 Cisco Systems, Inc. All rights reserved
|
||||||
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -102,6 +102,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
|||||||
int rc;
|
int rc;
|
||||||
opal_proc_t* my_proc;
|
opal_proc_t* my_proc;
|
||||||
size_t num_created = 0;
|
size_t num_created = 0;
|
||||||
|
char *errhost;
|
||||||
|
|
||||||
/* get pointer to my proc structure */
|
/* get pointer to my proc structure */
|
||||||
my_proc = opal_proc_local_get();
|
my_proc = opal_proc_local_get();
|
||||||
@ -143,11 +144,14 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
|||||||
if (OPAL_ERR_UNREACH == rc) {
|
if (OPAL_ERR_UNREACH == rc) {
|
||||||
/* If the peer doesn't have usnic modex info, then we just
|
/* If the peer doesn't have usnic modex info, then we just
|
||||||
skip it */
|
skip it */
|
||||||
opal_output_verbose(75, USNIC_OUT,
|
if (74 < opal_output_get_verbosity(USNIC_OUT)) {
|
||||||
"btl:usnic:add_procs:%s: peer %s on %s does not have usnic modex info; skipping",
|
errhost = opal_get_proc_hostname(opal_proc);
|
||||||
module->linux_device_name,
|
opal_output(0, "btl:usnic:add_procs:%s: peer %s on %s does not have usnic modex info; skipping",
|
||||||
usnic_compat_proc_name_print(&opal_proc->proc_name),
|
module->linux_device_name,
|
||||||
opal_get_proc_hostname(opal_proc));
|
usnic_compat_proc_name_print(&opal_proc->proc_name),
|
||||||
|
errhost);
|
||||||
|
free(errhost);
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
} else if (OPAL_SUCCESS != rc) {
|
} else if (OPAL_SUCCESS != rc) {
|
||||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||||
@ -159,11 +163,14 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
|||||||
rc = opal_btl_usnic_create_endpoint(module, usnic_proc,
|
rc = opal_btl_usnic_create_endpoint(module, usnic_proc,
|
||||||
&usnic_endpoint);
|
&usnic_endpoint);
|
||||||
if (OPAL_SUCCESS != rc) {
|
if (OPAL_SUCCESS != rc) {
|
||||||
opal_output_verbose(5, USNIC_OUT,
|
if (4 < opal_output_get_verbosity(USNIC_OUT)) {
|
||||||
"btl:usnic:add_procs:%s: unable to create endpoint to peer %s on %s",
|
errhost = opal_get_proc_hostname(opal_proc);
|
||||||
module->linux_device_name,
|
opal_output(0, "btl:usnic:add_procs:%s: unable to create endpoint to peer %s on %s",
|
||||||
usnic_compat_proc_name_print(&opal_proc->proc_name),
|
module->linux_device_name,
|
||||||
opal_get_proc_hostname(opal_proc));
|
usnic_compat_proc_name_print(&opal_proc->proc_name),
|
||||||
|
errhost);
|
||||||
|
free(errhost);
|
||||||
|
}
|
||||||
OBJ_RELEASE(usnic_proc);
|
OBJ_RELEASE(usnic_proc);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -221,6 +228,8 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
|
|||||||
static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
|
static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
|
||||||
opal_btl_usnic_endpoint_t *endpoint)
|
opal_btl_usnic_endpoint_t *endpoint)
|
||||||
{
|
{
|
||||||
|
char *errhost;
|
||||||
|
|
||||||
/* Only show the warning if it is enabled */
|
/* Only show the warning if it is enabled */
|
||||||
if (!mca_btl_usnic_component.show_route_failures) {
|
if (!mca_btl_usnic_component.show_route_failures) {
|
||||||
return;
|
return;
|
||||||
@ -236,13 +245,15 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
|
|||||||
module->linux_device_name,
|
module->linux_device_name,
|
||||||
module->if_ipv4_addr_str,
|
module->if_ipv4_addr_str,
|
||||||
remote);
|
remote);
|
||||||
|
errhost = opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal);
|
||||||
opal_show_help("help-mpi-btl-usnic.txt", "unreachable peer IP",
|
opal_show_help("help-mpi-btl-usnic.txt", "unreachable peer IP",
|
||||||
true,
|
true,
|
||||||
opal_process_info.nodename,
|
opal_process_info.nodename,
|
||||||
module->if_ipv4_addr_str,
|
module->if_ipv4_addr_str,
|
||||||
module->linux_device_name,
|
module->linux_device_name,
|
||||||
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
|
errhost,
|
||||||
remote);
|
remote);
|
||||||
|
free(errhost);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* A bunch of calls to fi_av_insert() were previously
|
/* A bunch of calls to fi_av_insert() were previously
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -641,14 +641,15 @@ static int match_modex(opal_btl_usnic_module_t *module,
|
|||||||
if (*index_out >= 0 &&
|
if (*index_out >= 0 &&
|
||||||
proc->proc_modex[*index_out].max_msg_size !=
|
proc->proc_modex[*index_out].max_msg_size !=
|
||||||
(uint16_t) module->fabric_info->ep_attr->max_msg_size) {
|
(uint16_t) module->fabric_info->ep_attr->max_msg_size) {
|
||||||
|
char *errhost = opal_get_proc_hostname(proc->proc_opal);
|
||||||
opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
|
opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
|
||||||
true,
|
true,
|
||||||
opal_process_info.nodename,
|
opal_process_info.nodename,
|
||||||
module->linux_device_name,
|
module->linux_device_name,
|
||||||
module->fabric_info->ep_attr->max_msg_size,
|
module->fabric_info->ep_attr->max_msg_size,
|
||||||
(NULL == proc->proc_opal->proc_hostname) ?
|
errhost,
|
||||||
"unknown" : proc->proc_opal->proc_hostname,
|
|
||||||
proc->proc_modex[*index_out].max_msg_size);
|
proc->proc_modex[*index_out].max_msg_size);
|
||||||
|
free(errhost);
|
||||||
*index_out = -1;
|
*index_out = -1;
|
||||||
return OPAL_ERR_UNREACH;
|
return OPAL_ERR_UNREACH;
|
||||||
}
|
}
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
* Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2010-2015 Los Alamos National Security, LLC.
|
* Copyright (c) 2010-2015 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -104,7 +104,7 @@ attach_and_init(opal_shmem_ds_t *shmem_bufp,
|
|||||||
/* is addr past end of the shared memory segment? */
|
/* is addr past end of the shared memory segment? */
|
||||||
if ((unsigned char *)seg + shmem_bufp->seg_size < addr) {
|
if ((unsigned char *)seg + shmem_bufp->seg_size < addr) {
|
||||||
opal_show_help("help-mpi-common-sm.txt", "mmap too small", 1,
|
opal_show_help("help-mpi-common-sm.txt", "mmap too small", 1,
|
||||||
opal_proc_local_get()->proc_hostname,
|
opal_process_info.nodename,
|
||||||
(unsigned long)shmem_bufp->seg_size,
|
(unsigned long)shmem_bufp->seg_size,
|
||||||
(unsigned long)size_ctl_structure,
|
(unsigned long)size_ctl_structure,
|
||||||
(unsigned long)data_seg_alignment);
|
(unsigned long)data_seg_alignment);
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved.
|
* Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved.
|
||||||
* Copyright (c) 2015 Intel, Inc. All rights reserved
|
* Copyright (c) 2015-2020 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -226,7 +226,7 @@ int mca_common_sm_mpool_ft_event(int state) {
|
|||||||
/* Record the shared memory filename */
|
/* Record the shared memory filename */
|
||||||
opal_asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s",
|
opal_asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s",
|
||||||
opal_process_info.job_session_dir,
|
opal_process_info.job_session_dir,
|
||||||
opal_proc_local_get()->proc_hostname );
|
opal_process_info.nodename );
|
||||||
/* Disabled to get FT code compiled again
|
/* Disabled to get FT code compiled again
|
||||||
* TODO: FIXIT soon
|
* TODO: FIXIT soon
|
||||||
orte_sstore.set_attr(orte_sstore_handle_current, SSTORE_METADATA_LOCAL_TOUCH, file_name);
|
orte_sstore.set_attr(orte_sstore_handle_current, SSTORE_METADATA_LOCAL_TOUCH, file_name);
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
* Copyright (c) 2015-2018 Research Organization for Information Science
|
* Copyright (c) 2015-2018 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
||||||
|
* Copyright (c) 2020 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -192,13 +193,13 @@ void mca_mpool_base_tree_print(int show_up_to_mem_leaks)
|
|||||||
show_up_to_mem_leaks < 0) {
|
show_up_to_mem_leaks < 0) {
|
||||||
opal_show_help("help-mpool-base.txt", "all mem leaks",
|
opal_show_help("help-mpool-base.txt", "all mem leaks",
|
||||||
true, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
true, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||||
opal_proc_local_get()->proc_hostname,
|
opal_process_info.nodename,
|
||||||
getpid(), leak_msg);
|
getpid(), leak_msg);
|
||||||
} else {
|
} else {
|
||||||
int i = num_leaks - show_up_to_mem_leaks;
|
int i = num_leaks - show_up_to_mem_leaks;
|
||||||
opal_show_help("help-mpool-base.txt", "some mem leaks",
|
opal_show_help("help-mpool-base.txt", "some mem leaks",
|
||||||
true, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
true, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||||
opal_proc_local_get()->proc_hostname,
|
opal_process_info.nodename,
|
||||||
getpid(), leak_msg, i,
|
getpid(), leak_msg, i,
|
||||||
(i > 1) ? "s were" : " was",
|
(i > 1) ? "s were" : " was",
|
||||||
(i > 1) ? "are" : "is");
|
(i > 1) ? "are" : "is");
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
|
* Copyright (c) 2020 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -71,7 +72,7 @@ mca_rcache_base_module_t* mca_rcache_base_module_create (const char* name, void
|
|||||||
} else if (1 == opal_leave_pinned || opal_leave_pinned_pipeline) {
|
} else if (1 == opal_leave_pinned || opal_leave_pinned_pipeline) {
|
||||||
opal_show_help("help-rcache-base.txt", "leave pinned failed",
|
opal_show_help("help-rcache-base.txt", "leave pinned failed",
|
||||||
true, name, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
true, name, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||||
opal_proc_local_get()->proc_hostname);
|
opal_process_info.nodename);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2012-2015 Los Alamos National Security, LLC.
|
* Copyright (c) 2012-2015 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2020 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -67,7 +68,7 @@ void mca_rcache_base_mem_cb (void* base, size_t size, void* cbdata, bool from_al
|
|||||||
int len;
|
int len;
|
||||||
len = snprintf(msg, sizeof(msg), "[%s:%05d] Attempt to free memory that is still in "
|
len = snprintf(msg, sizeof(msg), "[%s:%05d] Attempt to free memory that is still in "
|
||||||
"use by an ongoing MPI communication (buffer %p, size %lu). MPI job "
|
"use by an ongoing MPI communication (buffer %p, size %lu). MPI job "
|
||||||
"will now abort.\n", opal_proc_local_get()->proc_hostname,
|
"will now abort.\n", opal_process_info.nodename,
|
||||||
getpid(), base, (unsigned long) size);
|
getpid(), base, (unsigned long) size);
|
||||||
msg[sizeof(msg) - 1] = '\0';
|
msg[sizeof(msg) - 1] = '\0';
|
||||||
write(2, msg, len);
|
write(2, msg, len);
|
||||||
@ -75,7 +76,7 @@ void mca_rcache_base_mem_cb (void* base, size_t size, void* cbdata, bool from_al
|
|||||||
opal_show_help("help-rcache-base.txt",
|
opal_show_help("help-rcache-base.txt",
|
||||||
"cannot deregister in-use memory", true,
|
"cannot deregister in-use memory", true,
|
||||||
current->rcache_component->rcache_version.mca_component_name,
|
current->rcache_component->rcache_version.mca_component_name,
|
||||||
opal_proc_local_get()->proc_hostname,
|
opal_process_info.nodename,
|
||||||
base, (unsigned long) size);
|
base, (unsigned long) size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -41,11 +41,10 @@ opal_process_info_t opal_process_info = {
|
|||||||
static opal_proc_t opal_local_proc = {
|
static opal_proc_t opal_local_proc = {
|
||||||
{ .opal_list_next = NULL,
|
{ .opal_list_next = NULL,
|
||||||
.opal_list_prev = NULL},
|
.opal_list_prev = NULL},
|
||||||
{OPAL_JOBID_INVALID, OPAL_VPID_INVALID},
|
.proc_name = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID},
|
||||||
0,
|
.proc_arch = 0,
|
||||||
0,
|
.proc_flags = 0,
|
||||||
NULL,
|
.proc_convertor = NULL
|
||||||
NULL
|
|
||||||
};
|
};
|
||||||
static opal_proc_t* opal_proc_my_name = &opal_local_proc;
|
static opal_proc_t* opal_proc_my_name = &opal_local_proc;
|
||||||
|
|
||||||
@ -55,14 +54,12 @@ static void opal_proc_construct(opal_proc_t* proc)
|
|||||||
proc->proc_convertor = NULL;
|
proc->proc_convertor = NULL;
|
||||||
proc->proc_flags = 0;
|
proc->proc_flags = 0;
|
||||||
proc->proc_name = *OPAL_NAME_INVALID;
|
proc->proc_name = *OPAL_NAME_INVALID;
|
||||||
proc->proc_hostname = NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void opal_proc_destruct(opal_proc_t* proc)
|
static void opal_proc_destruct(opal_proc_t* proc)
|
||||||
{
|
{
|
||||||
proc->proc_flags = 0;
|
proc->proc_flags = 0;
|
||||||
proc->proc_name = *OPAL_NAME_INVALID;
|
proc->proc_name = *OPAL_NAME_INVALID;
|
||||||
proc->proc_hostname = NULL;
|
|
||||||
proc->proc_convertor = NULL;
|
proc->proc_convertor = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -188,30 +185,26 @@ struct opal_proc_t *(*opal_proc_for_name) (const opal_process_name_t name) = opa
|
|||||||
char* opal_get_proc_hostname(const opal_proc_t *proc)
|
char* opal_get_proc_hostname(const opal_proc_t *proc)
|
||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
|
char *hostname;
|
||||||
|
|
||||||
/* if the proc is NULL, then we can't know */
|
/* if the proc is NULL, then we can't know */
|
||||||
if (NULL == proc) {
|
if (NULL == proc) {
|
||||||
return "unknown";
|
return strdup("unknown");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if it is my own hostname we are after, then just hand back
|
/* if it is my own hostname we are after, then just hand back
|
||||||
* the value in opal_process_info */
|
* a malloc'd copy of the value in opal_process_info */
|
||||||
if (proc == opal_proc_my_name) {
|
if (proc == opal_proc_my_name) {
|
||||||
return opal_process_info.nodename;
|
return strdup(opal_process_info.nodename);
|
||||||
}
|
|
||||||
|
|
||||||
/* see if we already have the data - if so, pass it back */
|
|
||||||
if (NULL != proc->proc_hostname) {
|
|
||||||
return proc->proc_hostname;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if we don't already have it, then try to get it */
|
/* not our own proc - try to retrieve its hostname via an optional modex recv */
|
||||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc->proc_name,
|
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc->proc_name,
|
||||||
(char**)&(proc->proc_hostname), PMIX_STRING);
|
(char**)&hostname, PMIX_STRING);
|
||||||
if (OPAL_SUCCESS != ret) {
|
if (OPAL_SUCCESS != ret) {
|
||||||
return "unknown"; // return something so the caller doesn't segfault
|
return strdup("unknown"); // return something so the caller doesn't segfault
|
||||||
}
|
}
|
||||||
|
|
||||||
/* user is not allowed to release the data */
|
/* the returned string is malloc'd - the caller is responsible for free'ing it */
|
||||||
return proc->proc_hostname;
|
return hostname;
|
||||||
}
|
}
|
||||||
|
@ -48,7 +48,7 @@
|
|||||||
#define OPAL_VPID_WILDCARD (OPAL_VPID_MAX + 1)
|
#define OPAL_VPID_WILDCARD (OPAL_VPID_MAX + 1)
|
||||||
|
|
||||||
#define OPAL_PROC_MY_NAME (opal_proc_local_get()->proc_name)
|
#define OPAL_PROC_MY_NAME (opal_proc_local_get()->proc_name)
|
||||||
#define OPAL_PROC_MY_HOSTNAME (opal_proc_local_get()->proc_hostname)
|
#define OPAL_PROC_MY_HOSTNAME (opal_process_info.nodename)
|
||||||
|
|
||||||
#define OPAL_NAME_WILDCARD (&opal_name_wildcard)
|
#define OPAL_NAME_WILDCARD (&opal_name_wildcard)
|
||||||
OPAL_DECLSPEC extern opal_process_name_t opal_name_wildcard;
|
OPAL_DECLSPEC extern opal_process_name_t opal_name_wildcard;
|
||||||
@ -91,9 +91,6 @@ typedef struct opal_proc_t {
|
|||||||
opal_hwloc_locality_t proc_flags;
|
opal_hwloc_locality_t proc_flags;
|
||||||
/** Base convertor for the proc described by this process */
|
/** Base convertor for the proc described by this process */
|
||||||
struct opal_convertor_t* proc_convertor;
|
struct opal_convertor_t* proc_convertor;
|
||||||
/** A pointer to the name of this host - data is
|
|
||||||
* actually stored outside of this framework. */
|
|
||||||
char* proc_hostname;
|
|
||||||
} opal_proc_t;
|
} opal_proc_t;
|
||||||
OBJ_CLASS_DECLARATION(opal_proc_t);
|
OBJ_CLASS_DECLARATION(opal_proc_t);
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user