1
1

ompi_proc_t size reduction: part 1

We currently save the hostname of a proc when we create the ompi_proc_t for it. This was originally done because the only method we had for discovering the host of a proc was to include that info in the modex, and we had to therefore store it somewhere proc-local. Obviously, this carried a memory penalty for storing all those strings, and so we added a "cutoff" parameter so that we wouldn't collect hostnames above a certain number of procs.

Unfortunately, this still results in an 8-byte/proc memory cost as we have a char* pointer in the opal_proc_t that is contained in the ompi_proc_t so that we can store the hostname of the other procs if we fall below the cutoff. At scale, this can consume a fair amount of memory.

With the switch to relying on PMIx, there is no longer a need to cache the proc hostnames. Using the "optional" feature of PMIx_Get, we restrict the retrieval to be purely proc-local - i.e., we retrieve the info either via shared memory or from within the proc-internal hash storage (depending upon the active PMIx components). Thus, the retrieval of a hostname is purely a local operation involving no communication.

All RMs are required to provide a complete hostname map of all procs at startup. Thus, we have full access to all hostnames without including them in a modex or having to cache them on each proc. This allows us to remove the char* pointer from the opal_proc_t, saving us 8 bytes/proc.

Unfortunately, PMIx_Get does not currently support the return of a static pointer to memory. Thus, even though PMIx has the hostname in its memory, it can only return a malloc'd version of it. I have therefore ensured that the return from opal_get_proc_hostname is consistently malloc'd and free'd wherever used. This shouldn't be a burden as the hostname is only used in one of two circumstances:

(a) in an error message
(b) in a verbose output for debugging purposes

Thus, there should be no performance penalty associated with the malloc/free requirement. PMIx will eventually be returning static pointers, and so we can eventually simplify this method and return a "const char*" - but as noted, this really isn't an issue even today.

Signed-off-by: Ralph Castain <rhc@pmix.org>
Этот коммит содержится в:
Ralph Castain 2020-03-22 07:41:54 -07:00 коммит произвёл Jeff Squyres
родитель 48b52478ef
Коммит 33ab928e1b
20 изменённых файлов: 148 добавлений и 148 удалений

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
@ -233,12 +233,14 @@ static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_e
if (!bml_btl || bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity) {
/* this btl has higher exclusivity than an existing btl or none exists */
opal_output_verbose(1, opal_btl_base_framework.framework_output,
"mca: bml: Using %s btl for send to %s on node %s",
btl->btl_component->btl_version.mca_component_name,
OMPI_NAME_PRINT(&proc->super.proc_name),
proc->super.proc_hostname);
if (0 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) {
char *errhost = opal_get_proc_hostname(&proc->super);
opal_output(0, "mca: bml: Using %s btl for send to %s on node %s",
btl->btl_component->btl_version.mca_component_name,
OMPI_NAME_PRINT(&proc->super.proc_name),
errhost);
free(errhost);
}
/* cache the endpoint on the proc */
if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) {
@ -252,15 +254,16 @@ static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_e
* calculate the bitwise OR of the btl flags
*/
bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
} else {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"mca: bml: Not using %s btl for send to %s on node %s "
"because %s btl has higher exclusivity (%d > %d)",
btl->btl_component->btl_version.mca_component_name,
OMPI_NAME_PRINT(&proc->super.proc_name), proc->super.proc_hostname,
bml_btl->btl->btl_component->btl_version.mca_component_name,
bml_btl->btl->btl_exclusivity,
btl->btl_exclusivity);
} else if (19 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) {
char *errhost = opal_get_proc_hostname(&proc->super);
opal_output(0, "mca: bml: Not using %s btl for send to %s on node %s "
"because %s btl has higher exclusivity (%d > %d)",
btl->btl_component->btl_version.mca_component_name,
OMPI_NAME_PRINT(&proc->super.proc_name), errhost,
bml_btl->btl->btl_component->btl_version.mca_component_name,
bml_btl->btl->btl_exclusivity,
btl->btl_exclusivity);
free(errhost);
}
btl_in_use = true;
@ -424,14 +427,16 @@ static int mca_bml_r2_add_proc (struct ompi_proc_t *proc)
OBJ_RELEASE(bml_endpoint);
/* no btl is available for this proc */
if (mca_bml_r2.show_unreach_errors) {
char *errhost = opal_get_proc_hostname(&proc->super);
char *localhost = opal_get_proc_hostname(&ompi_proc_local_proc->super);
opal_show_help ("help-mca-bml-r2.txt", "unreachable proc", true,
OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
(NULL != ompi_proc_local_proc->super.proc_hostname ?
ompi_proc_local_proc->super.proc_hostname : "unknown!"),
localhost,
OMPI_NAME_PRINT(&(proc->super.proc_name)),
(NULL != proc->super.proc_hostname ?
proc->super.proc_hostname : "unknown!"),
errhost,
btl_names);
free(errhost);
free(localhost);
}
return OMPI_ERR_UNREACH;
@ -578,14 +583,16 @@ static int mca_bml_r2_add_procs( size_t nprocs,
if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
ret = OMPI_ERR_UNREACH;
if (mca_bml_r2.show_unreach_errors) {
char *errhost = opal_get_proc_hostname(&proc->super);
char *localhost = opal_get_proc_hostname(&ompi_proc_local_proc->super);
opal_show_help("help-mca-bml-r2.txt", "unreachable proc", true,
OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
(NULL != ompi_proc_local_proc->super.proc_hostname ?
ompi_proc_local_proc->super.proc_hostname : "unknown!"),
localhost,
OMPI_NAME_PRINT(&(proc->super.proc_name)),
(NULL != proc->super.proc_hostname ?
proc->super.proc_hostname : "unknown!"),
errhost,
btl_names);
free(errhost);
free(localhost);
}
break;

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
@ -98,10 +98,11 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
(void**)&ep_name,
&size);
if (OMPI_SUCCESS != ret) {
char *errhost = opal_get_proc_hostname(&procs[i]->super);
opal_show_help("help-mtl-ofi.txt", "modex failed",
true, ompi_process_info.nodename,
procs[i]->super.proc_hostname,
opal_strerror(ret), ret);
errhost, opal_strerror(ret), ret);
free(errhost);
goto bail;
}
memcpy(&ep_names[i*namelen], ep_name, namelen);

Просмотреть файл

@ -324,8 +324,9 @@ ompi_mtl_psm2_add_procs(struct mca_mtl_base_module_t *mtl,
errstr ? errstr : "unknown connect error");
for (j = 0; j < (int) nprocs; j++) {
if (errs_out[j] == thiserr) {
opal_output(0, " %s", (NULL == procs[j]->super.proc_hostname) ?
"unknown" : procs[j]->super.proc_hostname);
char *errhost = opal_get_proc_hostname(&procs[j]->super);
opal_output(0, " %s", errhost);
free(errhost);
}
}
opal_output(0, "\n");

Просмотреть файл

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
@ -370,12 +370,15 @@ mca_pml_base_pml_check_selected(const char *my_pml,
/* if that module doesn't match my own, return an error */
if ((size != strlen(my_pml) + 1) ||
(0 != strcmp(my_pml, remote_pml))) {
char *errhost = opal_get_proc_hostname(&procs[0]->super);
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
OMPI_NAME_PRINT(&ompi_proc_local()->super.proc_name),
my_pml, OMPI_NAME_PRINT(&procs[0]->super.proc_name),
(NULL == procs[0]->super.proc_hostname) ? "unknown" : procs[0]->super.proc_hostname,
errhost,
remote_pml);
free(remote_pml); /* cleanup before returning */
free(remote_pml);
free(errhost);
/* cleanup before returning */
return OMPI_ERR_UNREACH;
}

Просмотреть файл

@ -15,6 +15,7 @@
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2020 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -202,14 +203,17 @@ void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t
/* Find the corresponding bml and adjust the flag to support CUDA get */
for( i = 0; i < (int)ep->btl_send.arr_size; i++ ) {
if( ep->btl_send.bml_btls[i].btl == btl ) {
if (4 < opal_output_get_verbosity(btl_verbose_stream)) {
char *errhost = opal_get_proc_hostname(&errproc->super);
opal_output(0, "BTL %s: rank=%d enabling CUDA IPC "
"to rank=%d on node=%s \n",
btl->btl_component->btl_version.mca_component_name,
OMPI_PROC_MY_NAME->vpid,
((ompi_process_name_t*)&errproc->super.proc_name)->vpid,
errhost);
free(errhost);
}
ep->btl_send.bml_btls[i].btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
opal_output_verbose(5, btl_verbose_stream,
"BTL %s: rank=%d enabling CUDA IPC "
"to rank=%d on node=%s \n",
btl->btl_component->btl_version.mca_component_name,
OMPI_PROC_MY_NAME->vpid,
((ompi_process_name_t*)&errproc->super.proc_name)->vpid,
errproc->super.proc_hostname);
}
}
}

Просмотреть файл

@ -86,9 +86,6 @@ void ompi_proc_destruct(ompi_proc_t* proc)
* destroyed here. It will be destroyed later when the ompi_datatype_finalize is called.
*/
OBJ_RELEASE( proc->super.proc_convertor );
if (NULL != proc->super.proc_hostname) {
free(proc->super.proc_hostname);
}
opal_mutex_lock (&ompi_proc_lock);
opal_list_remove_item(&ompi_proc_list, (opal_list_item_t*)proc);
opal_hash_table_remove_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name));
@ -135,22 +132,12 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
*/
int ompi_proc_complete_init_single (ompi_proc_t *proc)
{
int ret;
if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) &&
(OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) {
/* nothing else to do */
return OMPI_SUCCESS;
}
/* we can retrieve the hostname at no cost because it
* was provided at startup - but make it optional so
* we don't chase after it if some system doesn't
* provide it */
proc->super.proc_hostname = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc->super.proc_name,
(char**)&(proc->super.proc_hostname), PMIX_STRING);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
/* get the remote architecture - this might force a modex except
* for those environments where the RM provides it */
@ -264,7 +251,6 @@ int ompi_proc_init(void)
/* set local process data */
ompi_proc_local_proc = proc;
proc->super.proc_flags = OPAL_PROC_ALL_LOCAL;
proc->super.proc_hostname = strdup(ompi_process_info.nodename);
proc->super.proc_arch = opal_local_arch;
/* Register the local proc with OPAL */
opal_proc_local_set(&proc->super);
@ -609,7 +595,6 @@ int ompi_proc_refresh(void)
if (i == OMPI_PROC_MY_NAME->vpid) {
ompi_proc_local_proc = proc;
proc->super.proc_flags = OPAL_PROC_ALL_LOCAL;
proc->super.proc_hostname = ompi_process_info.nodename;
proc->super.proc_arch = opal_local_arch;
opal_proc_local_set(&proc->super);
} else {
@ -676,13 +661,6 @@ ompi_proc_pack(ompi_proc_t **proclist, int proclistsize,
opal_mutex_unlock (&ompi_proc_lock);
return rc;
}
/* pass the name of the host this proc is on */
rc = opal_dss.pack(buf, &(proc->super.proc_hostname), 1, OPAL_STRING);
if(rc != OPAL_SUCCESS) {
OMPI_ERROR_LOG(rc);
opal_mutex_unlock (&ompi_proc_lock);
return rc;
}
}
opal_mutex_unlock (&ompi_proc_lock);
return OMPI_SUCCESS;
@ -747,10 +725,10 @@ ompi_proc_unpack(opal_buffer_t* buf,
int32_t count=1;
ompi_process_name_t new_name;
uint32_t new_arch;
char *new_hostname;
bool isnew = false;
int rc;
char *nspace;
uint16_t u16, *u16ptr;
rc = opal_dss.unpack(buf, &new_name, &count, OMPI_NAME);
if (rc != OPAL_SUCCESS) {
@ -774,13 +752,6 @@ ompi_proc_unpack(opal_buffer_t* buf,
free(newprocs);
return rc;
}
rc = opal_dss.unpack(buf, &new_hostname, &count, OPAL_STRING);
if (rc != OPAL_SUCCESS) {
OMPI_ERROR_LOG(rc);
free(plist);
free(newprocs);
return rc;
}
/* see if this proc is already on our ompi_proc_list */
plist[i] = ompi_proc_find_and_add(&new_name, &isnew);
if (isnew) {
@ -798,27 +769,25 @@ ompi_proc_unpack(opal_buffer_t* buf,
OBJ_RELEASE(plist[i]->super.proc_convertor);
plist[i]->super.proc_convertor = opal_convertor_create(plist[i]->super.proc_arch, 0);
#else
char *errhost = opal_get_proc_hostname(&plist[i]->super);
opal_show_help("help-mpi-runtime.txt",
"heterogeneous-support-unavailable",
true, ompi_process_info.nodename,
new_hostname == NULL ? "<hostname unavailable>" :
new_hostname);
errhost);
free(plist);
free(newprocs);
free(errhost);
return OMPI_ERR_NOT_SUPPORTED;
#endif
}
if (NULL != new_hostname) {
if (0 == strcmp(ompi_proc_local_proc->super.proc_hostname, new_hostname)) {
plist[i]->super.proc_flags |= (OPAL_PROC_ON_NODE | OPAL_PROC_ON_CU | OPAL_PROC_ON_CLUSTER);
}
/* Save the hostname */
plist[i]->super.proc_hostname = new_hostname;
/* get the locality information - all RTEs are required
* to provide this information at startup */
u16ptr = &u16;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY, &plist[i]->super.proc_name, &u16ptr, PMIX_UINT16);
if (OPAL_SUCCESS == rc) {
plist[i]->super.proc_flags = u16;
}
} else if (NULL != new_hostname) {
free(new_hostname);
}
}

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -59,13 +59,15 @@ OPAL_DECLSPEC extern int mca_btl_base_out(const char*, ...) __opal_attribute_for
#define BTL_PEER_ERROR(proc, args) \
do { \
char *errhost; \
mca_btl_base_err("%s[%s:%d:%s] from %s ", \
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
__FILE__, __LINE__, __func__, \
opal_process_info.nodename); \
if (proc) { \
mca_btl_base_err("to: %s ", \
opal_get_proc_hostname(proc)); \
errhost = opal_get_proc_hostname(proc); \
mca_btl_base_err("to: %s ", errhost); \
free(errhost); \
} \
mca_btl_base_err args; \
mca_btl_base_err("\n"); \

Просмотреть файл

@ -15,6 +15,7 @@
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2020 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -44,6 +45,7 @@
#include "opal/opal_socket_errno.h"
#include "opal/mca/btl/base/btl_base_error.h"
#include "opal/util/proc.h"
#include "opal/util/show_help.h"
#include "btl_tcp_frag.h"
@ -168,6 +170,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint;
ssize_t cnt;
int32_t i, num_vecs, dont_copy_data = 0;
char *errhost;
repeat:
num_vecs = frag->iov_cnt;
@ -231,10 +234,11 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
return false;
case ECONNRESET:
errhost = opal_get_proc_hostname(btl_endpoint->endpoint_proc->proc_opal);
opal_show_help("help-mpi-btl-tcp.txt", "peer hung up",
true, opal_process_info.nodename,
getpid(),
btl_endpoint->endpoint_proc->proc_opal->proc_hostname);
getpid(), errhost);
free(errhost);
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
mca_btl_tcp_endpoint_close(btl_endpoint);
return false;

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2010 Oracle and/or its affiliates. All rights reserved
* Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
@ -48,6 +48,7 @@
#include "opal/util/proc.h"
#include "opal/util/show_help.h"
#include "opal/util/printf.h"
#include "opal/util/proc.h"
#include "opal/util/string_copy.h"
#include "opal/util/bipartite_graph.h"
@ -479,21 +480,18 @@ int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t* btl_proc,
mca_btl_base_endpoint_t* btl_endpoint)
{
mca_btl_tcp_module_t* tcp_btl = btl_endpoint->endpoint_btl;
const char *proc_hostname;
mca_btl_tcp_addr_t *remote_addr;
int rc = OPAL_SUCCESS;
if (NULL == (proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal))) {
rc = OPAL_ERR_UNREACH;
goto out;
}
rc = opal_hash_table_get_value_uint32(&btl_proc->btl_index_to_endpoint, tcp_btl->btl_index, (void **)&remote_addr);
if (OPAL_SUCCESS != rc) {
opal_output_verbose(10, opal_btl_base_framework.framework_output,
"btl:tcp: host %s, process %s UNREACHABLE",
proc_hostname,
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
if (9 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) {
char *proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal);
opal_output(0, "btl:tcp: host %s, process %s UNREACHABLE",
proc_hostname,
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
free(proc_hostname);
}
goto out;
}
btl_endpoint->endpoint_addr = remote_addr;
@ -685,14 +683,15 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr
}
addr_str = tmp;
}
tmp = opal_get_proc_hostname(btl_proc->proc_opal);
opal_show_help("help-mpi-btl-tcp.txt", "dropped inbound connection",
true, opal_process_info.nodename,
getpid(),
btl_proc->proc_opal->proc_hostname,
getpid(), tmp,
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name),
opal_net_get_hostname((struct sockaddr*)addr),
btl_proc->proc_endpoint_count,
(NULL == addr_str) ? "NONE" : addr_str);
free(tmp);
if (NULL != addr_str) {
free(addr_str);
}

Просмотреть файл

@ -1,6 +1,6 @@
/*
* Copyright (c) 2014-2019 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2014 Intel, Inc. All rights reserved
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -279,13 +279,15 @@ opal_btl_usnic_check_connectivity(opal_btl_usnic_module_t *module,
{
if (OPAL_LIKELY(mca_btl_usnic_component.connectivity_enabled) &&
OPAL_UNLIKELY(!endpoint->endpoint_connectivity_checked)) {
char *host = opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal);
opal_btl_usnic_connectivity_ping(module->local_modex.ipv4_addr,
module->local_modex.connectivity_udp_port,
endpoint->endpoint_remote_modex.ipv4_addr,
endpoint->endpoint_remote_modex.netmask,
endpoint->endpoint_remote_modex.connectivity_udp_port,
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
host,
endpoint->endpoint_remote_modex.max_msg_size);
free(host);
endpoint->endpoint_connectivity_checked = true;
}
}

Просмотреть файл

@ -1,6 +1,6 @@
/*
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* $COPYRIGHT$
*
@ -215,8 +215,10 @@ static int map_output_procs(FILE *fp)
/* Loop over and print the sorted module device information */
int ret = OPAL_SUCCESS;
for (i = 0; i < num_procs; ++i) {
char *errhost = opal_get_proc_hostname(procs[i]->proc_opal);
fprintf(fp, "peer=%d,", procs[i]->proc_opal->proc_name.vpid);
fprintf(fp, "hostname=%s,", opal_get_proc_hostname(procs[i]->proc_opal));
fprintf(fp, "hostname=%s,", errhost);
free(errhost);
if (OPAL_SUCCESS != map_output_endpoints(fp, procs[i])) {
break;
}
@ -244,9 +246,10 @@ void opal_btl_usnic_connectivity_map(void)
/* Filename is of the form: <prefix>-<hostname>.<pid>.<job>.<MCW
rank>.txt */
host =
opal_asprintf(&filename, "%s-%s.pid%d.job%d.mcwrank%d.txt",
mca_btl_usnic_component.connectivity_map_prefix,
opal_get_proc_hostname(opal_proc_local_get()),
opal_process_info.nodename,
getpid(),
opal_proc_local_get()->proc_name.jobid,
opal_proc_local_get()->proc_name.vpid);

Просмотреть файл

@ -15,7 +15,7 @@
* Copyright (c) 2009-2019 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* $COPYRIGHT$
*
@ -102,6 +102,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
int rc;
opal_proc_t* my_proc;
size_t num_created = 0;
char *errhost;
/* get pointer to my proc structure */
my_proc = opal_proc_local_get();
@ -143,11 +144,14 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
if (OPAL_ERR_UNREACH == rc) {
/* If the peer doesn't have usnic modex info, then we just
skip it */
opal_output_verbose(75, USNIC_OUT,
"btl:usnic:add_procs:%s: peer %s on %s does not have usnic modex info; skipping",
module->linux_device_name,
usnic_compat_proc_name_print(&opal_proc->proc_name),
opal_get_proc_hostname(opal_proc));
if (74 < opal_output_get_verbosity(USNIC_OUT)) {
errhost = opal_get_proc_hostname(opal_proc);
opal_output(0, "btl:usnic:add_procs:%s: peer %s on %s does not have usnic modex info; skipping",
module->linux_device_name,
usnic_compat_proc_name_print(&opal_proc->proc_name),
errhost);
free(errhost);
}
continue;
} else if (OPAL_SUCCESS != rc) {
return OPAL_ERR_OUT_OF_RESOURCE;
@ -159,11 +163,14 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
rc = opal_btl_usnic_create_endpoint(module, usnic_proc,
&usnic_endpoint);
if (OPAL_SUCCESS != rc) {
opal_output_verbose(5, USNIC_OUT,
"btl:usnic:add_procs:%s: unable to create endpoint to peer %s on %s",
module->linux_device_name,
usnic_compat_proc_name_print(&opal_proc->proc_name),
opal_get_proc_hostname(opal_proc));
if (4 < opal_output_get_verbosity(USNIC_OUT)) {
errhost = opal_get_proc_hostname(opal_proc);
opal_output(0, "btl:usnic:add_procs:%s: unable to create endpoint to peer %s on %s",
module->linux_device_name,
usnic_compat_proc_name_print(&opal_proc->proc_name),
errhost);
free(errhost);
}
OBJ_RELEASE(usnic_proc);
continue;
}
@ -221,6 +228,8 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
opal_btl_usnic_endpoint_t *endpoint)
{
char *errhost;
/* Only show the warning if it is enabled */
if (!mca_btl_usnic_component.show_route_failures) {
return;
@ -236,13 +245,15 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
module->linux_device_name,
module->if_ipv4_addr_str,
remote);
errhost = opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal);
opal_show_help("help-mpi-btl-usnic.txt", "unreachable peer IP",
true,
opal_process_info.nodename,
module->if_ipv4_addr_str,
module->linux_device_name,
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
errhost,
remote);
free(errhost);
}
/* A bunch of calls to fi_av_insert() were previously

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -641,14 +641,15 @@ static int match_modex(opal_btl_usnic_module_t *module,
if (*index_out >= 0 &&
proc->proc_modex[*index_out].max_msg_size !=
(uint16_t) module->fabric_info->ep_attr->max_msg_size) {
char *errhost = opal_get_proc_hostname(proc->proc_opal);
opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
true,
opal_process_info.nodename,
module->linux_device_name,
module->fabric_info->ep_attr->max_msg_size,
(NULL == proc->proc_opal->proc_hostname) ?
"unknown" : proc->proc_opal->proc_hostname,
errhost,
proc->proc_modex[*index_out].max_msg_size);
free(errhost);
*index_out = -1;
return OPAL_ERR_UNREACH;
}

Просмотреть файл

@ -14,7 +14,7 @@
* Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2015 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -104,7 +104,7 @@ attach_and_init(opal_shmem_ds_t *shmem_bufp,
/* is addr past end of the shared memory segment? */
if ((unsigned char *)seg + shmem_bufp->seg_size < addr) {
opal_show_help("help-mpi-common-sm.txt", "mmap too small", 1,
opal_proc_local_get()->proc_hostname,
opal_process_info.nodename,
(unsigned long)shmem_bufp->seg_size,
(unsigned long)size_ctl_structure,
(unsigned long)data_seg_alignment);

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved
* Copyright (c) 2015-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* $COPYRIGHT$
*
@ -226,7 +226,7 @@ int mca_common_sm_mpool_ft_event(int state) {
/* Record the shared memory filename */
opal_asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s",
opal_process_info.job_session_dir,
opal_proc_local_get()->proc_hostname );
opal_process_info.nodename );
/* Disabled to get FT code compiled again
* TODO: FIXIT soon
orte_sstore.set_attr(orte_sstore_handle_current, SSTORE_METADATA_LOCAL_TOUCH, file_name);

Просмотреть файл

@ -18,6 +18,7 @@
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2020 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -192,13 +193,13 @@ void mca_mpool_base_tree_print(int show_up_to_mem_leaks)
show_up_to_mem_leaks < 0) {
opal_show_help("help-mpool-base.txt", "all mem leaks",
true, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
opal_proc_local_get()->proc_hostname,
opal_process_info.nodename,
getpid(), leak_msg);
} else {
int i = num_leaks - show_up_to_mem_leaks;
opal_show_help("help-mpool-base.txt", "some mem leaks",
true, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
opal_proc_local_get()->proc_hostname,
opal_process_info.nodename,
getpid(), leak_msg, i,
(i > 1) ? "s were" : " was",
(i > 1) ? "are" : "is");

Просмотреть файл

@ -12,6 +12,7 @@
* All rights reserved.
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2020 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -71,7 +72,7 @@ mca_rcache_base_module_t* mca_rcache_base_module_create (const char* name, void
} else if (1 == opal_leave_pinned || opal_leave_pinned_pipeline) {
opal_show_help("help-rcache-base.txt", "leave pinned failed",
true, name, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
opal_proc_local_get()->proc_hostname);
opal_process_info.nodename);
return NULL;
}

Просмотреть файл

@ -13,6 +13,7 @@
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2015 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2020 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -67,7 +68,7 @@ void mca_rcache_base_mem_cb (void* base, size_t size, void* cbdata, bool from_al
int len;
len = snprintf(msg, sizeof(msg), "[%s:%05d] Attempt to free memory that is still in "
"use by an ongoing MPI communication (buffer %p, size %lu). MPI job "
"will now abort.\n", opal_proc_local_get()->proc_hostname,
"will now abort.\n", opal_process_info.nodename,
getpid(), base, (unsigned long) size);
msg[sizeof(msg) - 1] = '\0';
write(2, msg, len);
@ -75,7 +76,7 @@ void mca_rcache_base_mem_cb (void* base, size_t size, void* cbdata, bool from_al
opal_show_help("help-rcache-base.txt",
"cannot deregister in-use memory", true,
current->rcache_component->rcache_version.mca_component_name,
opal_proc_local_get()->proc_hostname,
opal_process_info.nodename,
base, (unsigned long) size);
}

Просмотреть файл

@ -41,11 +41,10 @@ opal_process_info_t opal_process_info = {
static opal_proc_t opal_local_proc = {
{ .opal_list_next = NULL,
.opal_list_prev = NULL},
{OPAL_JOBID_INVALID, OPAL_VPID_INVALID},
0,
0,
NULL,
NULL
.proc_name = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID},
.proc_arch = 0,
.proc_flags = 0,
.proc_convertor = NULL
};
static opal_proc_t* opal_proc_my_name = &opal_local_proc;
@ -55,14 +54,12 @@ static void opal_proc_construct(opal_proc_t* proc)
proc->proc_convertor = NULL;
proc->proc_flags = 0;
proc->proc_name = *OPAL_NAME_INVALID;
proc->proc_hostname = NULL;
}
static void opal_proc_destruct(opal_proc_t* proc)
{
proc->proc_flags = 0;
proc->proc_name = *OPAL_NAME_INVALID;
proc->proc_hostname = NULL;
proc->proc_convertor = NULL;
}
@ -188,30 +185,26 @@ struct opal_proc_t *(*opal_proc_for_name) (const opal_process_name_t name) = opa
char* opal_get_proc_hostname(const opal_proc_t *proc)
{
int ret;
char *hostname;
/* if the proc is NULL, then we can't know */
if (NULL == proc) {
return "unknown";
return strdup("unknown");
}
/* if it is my own hostname we are after, then just hand back
* the value in opal_process_info */
if (proc == opal_proc_my_name) {
return opal_process_info.nodename;
}
/* see if we already have the data - if so, pass it back */
if (NULL != proc->proc_hostname) {
return proc->proc_hostname;
return strdup(opal_process_info.nodename);
}
/* if we don't already have it, then try to get it */
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc->proc_name,
(char**)&(proc->proc_hostname), PMIX_STRING);
(char**)&hostname, PMIX_STRING);
if (OPAL_SUCCESS != ret) {
return "unknown"; // return something so the caller doesn't segfault
return strdup("unknown"); // return something so the caller doesn't segfault
}
/* user is not allowed to release the data */
return proc->proc_hostname;
return hostname;
}

Просмотреть файл

@ -48,7 +48,7 @@
#define OPAL_VPID_WILDCARD (OPAL_VPID_MAX + 1)
#define OPAL_PROC_MY_NAME (opal_proc_local_get()->proc_name)
#define OPAL_PROC_MY_HOSTNAME (opal_proc_local_get()->proc_hostname)
#define OPAL_PROC_MY_HOSTNAME (opal_process_info.nodename)
#define OPAL_NAME_WILDCARD (&opal_name_wildcard)
OPAL_DECLSPEC extern opal_process_name_t opal_name_wildcard;
@ -91,9 +91,6 @@ typedef struct opal_proc_t {
opal_hwloc_locality_t proc_flags;
/** Base convertor for the proc described by this process */
struct opal_convertor_t* proc_convertor;
/** A pointer to the name of this host - data is
* actually stored outside of this framework. */
char* proc_hostname;
} opal_proc_t;
OBJ_CLASS_DECLARATION(opal_proc_t);