diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index 44d7a783da..7abace2d34 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -13,7 +13,7 @@ * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013 Intel, Inc. All rights reserved + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -233,12 +233,14 @@ static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_e if (!bml_btl || bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity) { /* this btl has higher exclusivity than an existing btl or none exists */ - - opal_output_verbose(1, opal_btl_base_framework.framework_output, - "mca: bml: Using %s btl for send to %s on node %s", - btl->btl_component->btl_version.mca_component_name, - OMPI_NAME_PRINT(&proc->super.proc_name), - proc->super.proc_hostname); + if (0 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) { + char *errhost = opal_get_proc_hostname(&proc->super); + opal_output(0, "mca: bml: Using %s btl for send to %s on node %s", + btl->btl_component->btl_version.mca_component_name, + OMPI_NAME_PRINT(&proc->super.proc_name), + errhost); + free(errhost); + } /* cache the endpoint on the proc */ if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) { @@ -252,15 +254,16 @@ static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_e * calculate the bitwise OR of the btl flags */ bml_endpoint->btl_flags_or |= bml_btl->btl_flags; - } else { - opal_output_verbose(20, opal_btl_base_framework.framework_output, - "mca: bml: Not using %s btl for send to %s on node %s " - "because %s btl has higher exclusivity (%d > %d)", - 
btl->btl_component->btl_version.mca_component_name, - OMPI_NAME_PRINT(&proc->super.proc_name), proc->super.proc_hostname, - bml_btl->btl->btl_component->btl_version.mca_component_name, - bml_btl->btl->btl_exclusivity, - btl->btl_exclusivity); + } else if (19 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) { + char *errhost = opal_get_proc_hostname(&proc->super); + opal_output(0, "mca: bml: Not using %s btl for send to %s on node %s " + "because %s btl has higher exclusivity (%d > %d)", + btl->btl_component->btl_version.mca_component_name, + OMPI_NAME_PRINT(&proc->super.proc_name), errhost, + bml_btl->btl->btl_component->btl_version.mca_component_name, + bml_btl->btl->btl_exclusivity, + btl->btl_exclusivity); + free(errhost); } btl_in_use = true; @@ -424,14 +427,16 @@ static int mca_bml_r2_add_proc (struct ompi_proc_t *proc) OBJ_RELEASE(bml_endpoint); /* no btl is available for this proc */ if (mca_bml_r2.show_unreach_errors) { + char *errhost = opal_get_proc_hostname(&proc->super); + char *localhost = opal_get_proc_hostname(&ompi_proc_local_proc->super); opal_show_help ("help-mca-bml-r2.txt", "unreachable proc", true, OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)), - (NULL != ompi_proc_local_proc->super.proc_hostname ? - ompi_proc_local_proc->super.proc_hostname : "unknown!"), + localhost, OMPI_NAME_PRINT(&(proc->super.proc_name)), - (NULL != proc->super.proc_hostname ? 
- proc->super.proc_hostname : "unknown!"), + errhost, btl_names); + free(errhost); + free(localhost); } return OMPI_ERR_UNREACH; @@ -578,14 +583,16 @@ static int mca_bml_r2_add_procs( size_t nprocs, if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { ret = OMPI_ERR_UNREACH; if (mca_bml_r2.show_unreach_errors) { + char *errhost = opal_get_proc_hostname(&proc->super); + char *localhost = opal_get_proc_hostname(&ompi_proc_local_proc->super); opal_show_help("help-mca-bml-r2.txt", "unreachable proc", true, OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)), - (NULL != ompi_proc_local_proc->super.proc_hostname ? - ompi_proc_local_proc->super.proc_hostname : "unknown!"), + localhost, OMPI_NAME_PRINT(&(proc->super.proc_name)), - (NULL != proc->super.proc_hostname ? - proc->super.proc_hostname : "unknown!"), + errhost, btl_names); + free(errhost); + free(localhost); } break; diff --git a/ompi/mca/mtl/ofi/mtl_ofi.c b/ompi/mca/mtl/ofi/mtl_ofi.c index 6c679b88b9..789350c49e 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.c +++ b/ompi/mca/mtl/ofi/mtl_ofi.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel, Inc. All rights reserved + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * * $COPYRIGHT$ * @@ -98,10 +98,11 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl, (void**)&ep_name, &size); if (OMPI_SUCCESS != ret) { + char *errhost = opal_get_proc_hostname(&procs[i]->super); opal_show_help("help-mtl-ofi.txt", "modex failed", true, ompi_process_info.nodename, - procs[i]->super.proc_hostname, - opal_strerror(ret), ret); + errhost, opal_strerror(ret), ret); + free(errhost); goto bail; } memcpy(&ep_names[i*namelen], ep_name, namelen); diff --git a/ompi/mca/mtl/psm2/mtl_psm2.c b/ompi/mca/mtl/psm2/mtl_psm2.c index 8a15bda8b6..ab7668e2a2 100644 --- a/ompi/mca/mtl/psm2/mtl_psm2.c +++ b/ompi/mca/mtl/psm2/mtl_psm2.c @@ -324,8 +324,9 @@ ompi_mtl_psm2_add_procs(struct mca_mtl_base_module_t *mtl, errstr ? 
errstr : "unknown connect error"); for (j = 0; j < (int) nprocs; j++) { if (errs_out[j] == thiserr) { - opal_output(0, " %s", (NULL == procs[j]->super.proc_hostname) ? - "unknown" : procs[j]->super.proc_hostname); + char *errhost = opal_get_proc_hostname(&procs[j]->super); + opal_output(0, " %s", errhost); + free(errhost); } } opal_output(0, "\n"); diff --git a/ompi/mca/pml/base/pml_base_select.c b/ompi/mca/pml/base/pml_base_select.c index 9953fe9c10..c6cd025586 100644 --- a/ompi/mca/pml/base/pml_base_select.c +++ b/ompi/mca/pml/base/pml_base_select.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2012 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * @@ -370,12 +370,15 @@ mca_pml_base_pml_check_selected(const char *my_pml, /* if that module doesn't match my own, return an error */ if ((size != strlen(my_pml) + 1) || (0 != strcmp(my_pml, remote_pml))) { + char *errhost = opal_get_proc_hostname(&procs[0]->super); opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s", OMPI_NAME_PRINT(&ompi_proc_local()->super.proc_name), my_pml, OMPI_NAME_PRINT(&procs[0]->super.proc_name), - (NULL == procs[0]->super.proc_hostname) ? "unknown" : procs[0]->super.proc_hostname, + errhost, remote_pml); - free(remote_pml); /* cleanup before returning */ + free(remote_pml); + free(errhost); + /* cleanup before returning */ return OMPI_ERR_UNREACH; } diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index eb46914a90..a71776e03e 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -15,6 +15,7 @@ * Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved. * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2020 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -202,14 +203,17 @@ void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t /* Find the corresponding bml and adjust the flag to support CUDA get */ for( i = 0; i < (int)ep->btl_send.arr_size; i++ ) { if( ep->btl_send.bml_btls[i].btl == btl ) { + if (4 < opal_output_get_verbosity(btl_verbose_stream)) { + char *errhost = opal_get_proc_hostname(&errproc->super); + opal_output(0, "BTL %s: rank=%d enabling CUDA IPC " + "to rank=%d on node=%s \n", + btl->btl_component->btl_version.mca_component_name, + OMPI_PROC_MY_NAME->vpid, + ((ompi_process_name_t*)&errproc->super.proc_name)->vpid, + errhost); + free(errhost); + } ep->btl_send.bml_btls[i].btl_flags |= MCA_BTL_FLAGS_CUDA_GET; - opal_output_verbose(5, btl_verbose_stream, - "BTL %s: rank=%d enabling CUDA IPC " - "to rank=%d on node=%s \n", - btl->btl_component->btl_version.mca_component_name, - OMPI_PROC_MY_NAME->vpid, - ((ompi_process_name_t*)&errproc->super.proc_name)->vpid, - errproc->super.proc_hostname); } } } diff --git a/ompi/proc/proc.c b/ompi/proc/proc.c index a4907d8385..fe37a17e31 100644 --- a/ompi/proc/proc.c +++ b/ompi/proc/proc.c @@ -86,9 +86,6 @@ void ompi_proc_destruct(ompi_proc_t* proc) * destroyed here. It will be destroyed later when the ompi_datatype_finalize is called. 
*/ OBJ_RELEASE( proc->super.proc_convertor ); - if (NULL != proc->super.proc_hostname) { - free(proc->super.proc_hostname); - } opal_mutex_lock (&ompi_proc_lock); opal_list_remove_item(&ompi_proc_list, (opal_list_item_t*)proc); opal_hash_table_remove_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name)); @@ -135,22 +132,12 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t */ int ompi_proc_complete_init_single (ompi_proc_t *proc) { - int ret; - if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) && (OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) { /* nothing else to do */ return OMPI_SUCCESS; } - /* we can retrieve the hostname at no cost because it - * was provided at startup - but make it optional so - * we don't chase after it if some system doesn't - * provide it */ - proc->super.proc_hostname = NULL; - OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc->super.proc_name, - (char**)&(proc->super.proc_hostname), PMIX_STRING); - #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT /* get the remote architecture - this might force a modex except * for those environments where the RM provides it */ @@ -264,7 +251,6 @@ int ompi_proc_init(void) /* set local process data */ ompi_proc_local_proc = proc; proc->super.proc_flags = OPAL_PROC_ALL_LOCAL; - proc->super.proc_hostname = strdup(ompi_process_info.nodename); proc->super.proc_arch = opal_local_arch; /* Register the local proc with OPAL */ opal_proc_local_set(&proc->super); @@ -609,7 +595,6 @@ int ompi_proc_refresh(void) if (i == OMPI_PROC_MY_NAME->vpid) { ompi_proc_local_proc = proc; proc->super.proc_flags = OPAL_PROC_ALL_LOCAL; - proc->super.proc_hostname = ompi_process_info.nodename; proc->super.proc_arch = opal_local_arch; opal_proc_local_set(&proc->super); } else { @@ -676,13 +661,6 @@ ompi_proc_pack(ompi_proc_t **proclist, int proclistsize, opal_mutex_unlock (&ompi_proc_lock); return rc; } - /* 
pass the name of the host this proc is on */ - rc = opal_dss.pack(buf, &(proc->super.proc_hostname), 1, OPAL_STRING); - if(rc != OPAL_SUCCESS) { - OMPI_ERROR_LOG(rc); - opal_mutex_unlock (&ompi_proc_lock); - return rc; - } } opal_mutex_unlock (&ompi_proc_lock); return OMPI_SUCCESS; @@ -747,10 +725,10 @@ ompi_proc_unpack(opal_buffer_t* buf, int32_t count=1; ompi_process_name_t new_name; uint32_t new_arch; - char *new_hostname; bool isnew = false; int rc; char *nspace; + uint16_t u16, *u16ptr; rc = opal_dss.unpack(buf, &new_name, &count, OMPI_NAME); if (rc != OPAL_SUCCESS) { @@ -774,13 +752,6 @@ ompi_proc_unpack(opal_buffer_t* buf, free(newprocs); return rc; } - rc = opal_dss.unpack(buf, &new_hostname, &count, OPAL_STRING); - if (rc != OPAL_SUCCESS) { - OMPI_ERROR_LOG(rc); - free(plist); - free(newprocs); - return rc; - } /* see if this proc is already on our ompi_proc_list */ plist[i] = ompi_proc_find_and_add(&new_name, &isnew); if (isnew) { @@ -798,27 +769,25 @@ ompi_proc_unpack(opal_buffer_t* buf, OBJ_RELEASE(plist[i]->super.proc_convertor); plist[i]->super.proc_convertor = opal_convertor_create(plist[i]->super.proc_arch, 0); #else + char *errhost = opal_get_proc_hostname(&plist[i]->super); opal_show_help("help-mpi-runtime.txt", "heterogeneous-support-unavailable", true, ompi_process_info.nodename, - new_hostname == NULL ? 
"" : - new_hostname); + errhost); free(plist); free(newprocs); + free(errhost); return OMPI_ERR_NOT_SUPPORTED; #endif } - if (NULL != new_hostname) { - if (0 == strcmp(ompi_proc_local_proc->super.proc_hostname, new_hostname)) { - plist[i]->super.proc_flags |= (OPAL_PROC_ON_NODE | OPAL_PROC_ON_CU | OPAL_PROC_ON_CLUSTER); - } - - /* Save the hostname */ - plist[i]->super.proc_hostname = new_hostname; + /* get the locality information - all RTEs are required + * to provide this information at startup */ + u16ptr = &u16; + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY, &plist[i]->super.proc_name, &u16ptr, PMIX_UINT16); + if (OPAL_SUCCESS == rc) { + plist[i]->super.proc_flags = u16; } - } else if (NULL != new_hostname) { - free(new_hostname); } } diff --git a/opal/mca/btl/base/btl_base_error.h b/opal/mca/btl/base/btl_base_error.h index fb18eb8a05..633fd18f04 100644 --- a/opal/mca/btl/base/btl_base_error.h +++ b/opal/mca/btl/base/btl_base_error.h @@ -13,7 +13,7 @@ * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -59,13 +59,15 @@ OPAL_DECLSPEC extern int mca_btl_base_out(const char*, ...) 
__opal_attribute_for #define BTL_PEER_ERROR(proc, args) \ do { \ + char *errhost; \ mca_btl_base_err("%s[%s:%d:%s] from %s ", \ OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \ __FILE__, __LINE__, __func__, \ opal_process_info.nodename); \ if (proc) { \ - mca_btl_base_err("to: %s ", \ - opal_get_proc_hostname(proc)); \ + errhost = opal_get_proc_hostname(proc); \ + mca_btl_base_err("to: %s ", errhost); \ + free(errhost); \ } \ mca_btl_base_err args; \ mca_btl_base_err("\n"); \ diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c index c655891742..cb5eb4d92c 100644 --- a/opal/mca/btl/tcp/btl_tcp_frag.c +++ b/opal/mca/btl/tcp/btl_tcp_frag.c @@ -15,6 +15,7 @@ * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2020 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -44,6 +45,7 @@ #include "opal/opal_socket_errno.h" #include "opal/mca/btl/base/btl_base_error.h" +#include "opal/util/proc.h" #include "opal/util/show_help.h" #include "btl_tcp_frag.h" @@ -168,6 +170,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd) mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint; ssize_t cnt; int32_t i, num_vecs, dont_copy_data = 0; + char *errhost; repeat: num_vecs = frag->iov_cnt; @@ -231,10 +234,11 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd) return false; case ECONNRESET: + errhost = opal_get_proc_hostname(btl_endpoint->endpoint_proc->proc_opal); opal_show_help("help-mpi-btl-tcp.txt", "peer hung up", true, opal_process_info.nodename, - getpid(), - btl_endpoint->endpoint_proc->proc_opal->proc_hostname); + getpid(), errhost); + free(errhost); btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; mca_btl_tcp_endpoint_close(btl_endpoint); return false; diff --git a/opal/mca/btl/tcp/btl_tcp_proc.c b/opal/mca/btl/tcp/btl_tcp_proc.c index 
9a0300a610..20c7e22d85 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.c +++ b/opal/mca/btl/tcp/btl_tcp_proc.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008-2010 Oracle and/or its affiliates. All rights reserved - * Copyright (c) 2013-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights @@ -48,6 +48,7 @@ #include "opal/util/proc.h" #include "opal/util/show_help.h" #include "opal/util/printf.h" +#include "opal/util/proc.h" #include "opal/util/string_copy.h" #include "opal/util/bipartite_graph.h" @@ -479,21 +480,18 @@ int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t* btl_proc, mca_btl_base_endpoint_t* btl_endpoint) { mca_btl_tcp_module_t* tcp_btl = btl_endpoint->endpoint_btl; - const char *proc_hostname; mca_btl_tcp_addr_t *remote_addr; int rc = OPAL_SUCCESS; - if (NULL == (proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal))) { - rc = OPAL_ERR_UNREACH; - goto out; - } - rc = opal_hash_table_get_value_uint32(&btl_proc->btl_index_to_endpoint, tcp_btl->btl_index, (void **)&remote_addr); if (OPAL_SUCCESS != rc) { - opal_output_verbose(10, opal_btl_base_framework.framework_output, - "btl:tcp: host %s, process %s UNREACHABLE", - proc_hostname, - OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name)); + if (9 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) { + char *proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal); + opal_output(0, "btl:tcp: host %s, process %s UNREACHABLE", + proc_hostname, + OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name)); + free(proc_hostname); + } goto out; } btl_endpoint->endpoint_addr = remote_addr; @@ -685,14 +683,15 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr } 
addr_str = tmp; } + tmp = opal_get_proc_hostname(btl_proc->proc_opal); opal_show_help("help-mpi-btl-tcp.txt", "dropped inbound connection", true, opal_process_info.nodename, - getpid(), - btl_proc->proc_opal->proc_hostname, + getpid(), tmp, OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name), opal_net_get_hostname((struct sockaddr*)addr), btl_proc->proc_endpoint_count, (NULL == addr_str) ? "NONE" : addr_str); + free(tmp); if (NULL != addr_str) { free(addr_str); } diff --git a/opal/mca/btl/usnic/btl_usnic_connectivity.h b/opal/mca/btl/usnic/btl_usnic_connectivity.h index c951debd95..7bde19f81f 100644 --- a/opal/mca/btl/usnic/btl_usnic_connectivity.h +++ b/opal/mca/btl/usnic/btl_usnic_connectivity.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2014-2019 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2014 Intel, Inc. All rights reserved + * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -279,13 +279,15 @@ opal_btl_usnic_check_connectivity(opal_btl_usnic_module_t *module, { if (OPAL_LIKELY(mca_btl_usnic_component.connectivity_enabled) && OPAL_UNLIKELY(!endpoint->endpoint_connectivity_checked)) { + char *host = opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal); opal_btl_usnic_connectivity_ping(module->local_modex.ipv4_addr, module->local_modex.connectivity_udp_port, endpoint->endpoint_remote_modex.ipv4_addr, endpoint->endpoint_remote_modex.netmask, endpoint->endpoint_remote_modex.connectivity_udp_port, - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), + host, endpoint->endpoint_remote_modex.max_msg_size); + free(host); endpoint->endpoint_connectivity_checked = true; } } diff --git a/opal/mca/btl/usnic/btl_usnic_map.c b/opal/mca/btl/usnic/btl_usnic_map.c index 9be2ed2522..c7774a901c 100644 --- a/opal/mca/btl/usnic/btl_usnic_map.c +++ b/opal/mca/btl/usnic/btl_usnic_map.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014 Intel, Inc. 
All rights reserved + * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. * $COPYRIGHT$ * @@ -215,8 +215,10 @@ static int map_output_procs(FILE *fp) /* Loop over and print the sorted module device information */ int ret = OPAL_SUCCESS; for (i = 0; i < num_procs; ++i) { + char *errhost = opal_get_proc_hostname(procs[i]->proc_opal); fprintf(fp, "peer=%d,", procs[i]->proc_opal->proc_name.vpid); - fprintf(fp, "hostname=%s,", opal_get_proc_hostname(procs[i]->proc_opal)); + fprintf(fp, "hostname=%s,", errhost); + free(errhost); if (OPAL_SUCCESS != map_output_endpoints(fp, procs[i])) { break; } @@ -244,9 +246,9 @@ void opal_btl_usnic_connectivity_map(void) /* Filename is of the form: -....txt */ opal_asprintf(&filename, "%s-%s.pid%d.job%d.mcwrank%d.txt", mca_btl_usnic_component.connectivity_map_prefix, - opal_get_proc_hostname(opal_proc_local_get()), + opal_process_info.nodename, getpid(), opal_proc_local_get()->proc_name.jobid, opal_proc_local_get()->proc_name.vpid); diff --git a/opal/mca/btl/usnic/btl_usnic_module.c b/opal/mca/btl/usnic/btl_usnic_module.c index c16001908e..19fae2e16e 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.c +++ b/opal/mca/btl/usnic/btl_usnic_module.c @@ -15,7 +15,7 @@ * Copyright (c) 2009-2019 Cisco Systems, Inc. All rights reserved * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved + * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. 
* $COPYRIGHT$ * @@ -102,6 +102,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, int rc; opal_proc_t* my_proc; size_t num_created = 0; + char *errhost; /* get pointer to my proc structure */ my_proc = opal_proc_local_get(); @@ -143,11 +144,14 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, if (OPAL_ERR_UNREACH == rc) { /* If the peer doesn't have usnic modex info, then we just skip it */ - opal_output_verbose(75, USNIC_OUT, - "btl:usnic:add_procs:%s: peer %s on %s does not have usnic modex info; skipping", - module->linux_device_name, - usnic_compat_proc_name_print(&opal_proc->proc_name), - opal_get_proc_hostname(opal_proc)); + if (74 < opal_output_get_verbosity(USNIC_OUT)) { + errhost = opal_get_proc_hostname(opal_proc); + opal_output(0, "btl:usnic:add_procs:%s: peer %s on %s does not have usnic modex info; skipping", + module->linux_device_name, + usnic_compat_proc_name_print(&opal_proc->proc_name), + errhost); + free(errhost); + } continue; } else if (OPAL_SUCCESS != rc) { return OPAL_ERR_OUT_OF_RESOURCE; @@ -159,11 +163,14 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, rc = opal_btl_usnic_create_endpoint(module, usnic_proc, &usnic_endpoint); if (OPAL_SUCCESS != rc) { - opal_output_verbose(5, USNIC_OUT, - "btl:usnic:add_procs:%s: unable to create endpoint to peer %s on %s", - module->linux_device_name, - usnic_compat_proc_name_print(&opal_proc->proc_name), - opal_get_proc_hostname(opal_proc)); + if (4 < opal_output_get_verbosity(USNIC_OUT)) { + errhost = opal_get_proc_hostname(opal_proc); + opal_output(0, "btl:usnic:add_procs:%s: unable to create endpoint to peer %s on %s", + module->linux_device_name, + usnic_compat_proc_name_print(&opal_proc->proc_name), + errhost); + free(errhost); + } OBJ_RELEASE(usnic_proc); continue; } @@ -221,6 +228,8 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, static void 
add_procs_warn_unreachable(opal_btl_usnic_module_t *module, opal_btl_usnic_endpoint_t *endpoint) { + char *errhost; + /* Only show the warning if it is enabled */ if (!mca_btl_usnic_component.show_route_failures) { return; @@ -236,13 +245,15 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module, module->linux_device_name, module->if_ipv4_addr_str, remote); + errhost = opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal); opal_show_help("help-mpi-btl-usnic.txt", "unreachable peer IP", true, opal_process_info.nodename, module->if_ipv4_addr_str, module->linux_device_name, - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), + errhost, remote); + free(errhost); } /* A bunch of calls to fi_av_insert() were previously diff --git a/opal/mca/btl/usnic/btl_usnic_proc.c b/opal/mca/btl/usnic/btl_usnic_proc.c index bb5239d503..a2878c5b50 100644 --- a/opal/mca/btl/usnic/btl_usnic_proc.c +++ b/opal/mca/btl/usnic/btl_usnic_proc.c @@ -12,7 +12,7 @@ * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -641,14 +641,15 @@ static int match_modex(opal_btl_usnic_module_t *module, if (*index_out >= 0 && proc->proc_modex[*index_out].max_msg_size != (uint16_t) module->fabric_info->ep_attr->max_msg_size) { + char *errhost = opal_get_proc_hostname(proc->proc_opal); opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch", true, opal_process_info.nodename, module->linux_device_name, module->fabric_info->ep_attr->max_msg_size, - (NULL == proc->proc_opal->proc_hostname) ? 
- "unknown" : proc->proc_opal->proc_hostname, + errhost, proc->proc_modex[*index_out].max_msg_size); + free(errhost); *index_out = -1; return OPAL_ERR_UNREACH; } diff --git a/opal/mca/common/sm/common_sm.c b/opal/mca/common/sm/common_sm.c index c6e2a0fdaf..3c9168d16f 100644 --- a/opal/mca/common/sm/common_sm.c +++ b/opal/mca/common/sm/common_sm.c @@ -14,7 +14,7 @@ * Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2015 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved + * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -104,7 +104,7 @@ attach_and_init(opal_shmem_ds_t *shmem_bufp, /* is addr past end of the shared memory segment? */ if ((unsigned char *)seg + shmem_bufp->seg_size < addr) { opal_show_help("help-mpi-common-sm.txt", "mmap too small", 1, - opal_proc_local_get()->proc_hostname, + opal_process_info.nodename, (unsigned long)shmem_bufp->seg_size, (unsigned long)size_ctl_structure, (unsigned long)data_seg_alignment); diff --git a/opal/mca/common/sm/common_sm_mpool.c b/opal/mca/common/sm/common_sm_mpool.c index 03366426ad..e390315077 100644 --- a/opal/mca/common/sm/common_sm_mpool.c +++ b/opal/mca/common/sm/common_sm_mpool.c @@ -13,7 +13,7 @@ * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved + * Copyright (c) 2015-2020 Intel, Inc. All rights reserved. * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. 
* $COPYRIGHT$ * @@ -226,7 +226,7 @@ int mca_common_sm_mpool_ft_event(int state) { /* Record the shared memory filename */ opal_asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s", opal_process_info.job_session_dir, - opal_proc_local_get()->proc_hostname ); + opal_process_info.nodename ); /* Disabled to get FT code compiled again * TODO: FIXIT soon orte_sstore.set_attr(orte_sstore_handle_current, SSTORE_METADATA_LOCAL_TOUCH, file_name); diff --git a/opal/mca/mpool/base/mpool_base_tree.c b/opal/mca/mpool/base/mpool_base_tree.c index a497151311..a5937f0382 100644 --- a/opal/mca/mpool/base/mpool_base_tree.c +++ b/opal/mca/mpool/base/mpool_base_tree.c @@ -18,6 +18,7 @@ * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. + * Copyright (c) 2020 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -192,13 +193,13 @@ void mca_mpool_base_tree_print(int show_up_to_mem_leaks) show_up_to_mem_leaks < 0) { opal_show_help("help-mpool-base.txt", "all mem leaks", true, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), - opal_proc_local_get()->proc_hostname, + opal_process_info.nodename, getpid(), leak_msg); } else { int i = num_leaks - show_up_to_mem_leaks; opal_show_help("help-mpool-base.txt", "some mem leaks", true, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), - opal_proc_local_get()->proc_hostname, + opal_process_info.nodename, getpid(), leak_msg, i, (i > 1) ? "s were" : " was", (i > 1) ? "are" : "is"); diff --git a/opal/mca/rcache/base/rcache_base_create.c b/opal/mca/rcache/base/rcache_base_create.c index b41e315ced..df74b7784c 100644 --- a/opal/mca/rcache/base/rcache_base_create.c +++ b/opal/mca/rcache/base/rcache_base_create.c @@ -12,6 +12,7 @@ * All rights reserved. * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2020 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -71,7 +72,7 @@ mca_rcache_base_module_t* mca_rcache_base_module_create (const char* name, void } else if (1 == opal_leave_pinned || opal_leave_pinned_pipeline) { opal_show_help("help-rcache-base.txt", "leave pinned failed", true, name, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), - opal_proc_local_get()->proc_hostname); + opal_process_info.nodename); return NULL; } diff --git a/opal/mca/rcache/base/rcache_base_mem_cb.c b/opal/mca/rcache/base/rcache_base_mem_cb.c index 48039fde3a..8c164d00e6 100644 --- a/opal/mca/rcache/base/rcache_base_mem_cb.c +++ b/opal/mca/rcache/base/rcache_base_mem_cb.c @@ -13,6 +13,7 @@ * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2015 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2020 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -67,7 +68,7 @@ void mca_rcache_base_mem_cb (void* base, size_t size, void* cbdata, bool from_al int len; len = snprintf(msg, sizeof(msg), "[%s:%05d] Attempt to free memory that is still in " "use by an ongoing MPI communication (buffer %p, size %lu). 
MPI job " - "will now abort.\n", opal_proc_local_get()->proc_hostname, + "will now abort.\n", opal_process_info.nodename, getpid(), base, (unsigned long) size); msg[sizeof(msg) - 1] = '\0'; write(2, msg, len); @@ -75,7 +76,7 @@ void mca_rcache_base_mem_cb (void* base, size_t size, void* cbdata, bool from_al opal_show_help("help-rcache-base.txt", "cannot deregister in-use memory", true, current->rcache_component->rcache_version.mca_component_name, - opal_proc_local_get()->proc_hostname, + opal_process_info.nodename, base, (unsigned long) size); } diff --git a/opal/util/proc.c b/opal/util/proc.c index d1f4d84aa5..16b1968bd8 100644 --- a/opal/util/proc.c +++ b/opal/util/proc.c @@ -41,11 +41,10 @@ opal_process_info_t opal_process_info = { static opal_proc_t opal_local_proc = { { .opal_list_next = NULL, .opal_list_prev = NULL}, - {OPAL_JOBID_INVALID, OPAL_VPID_INVALID}, - 0, - 0, - NULL, - NULL + .proc_name = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID}, + .proc_arch = 0, + .proc_flags = 0, + .proc_convertor = NULL }; static opal_proc_t* opal_proc_my_name = &opal_local_proc; @@ -55,14 +54,12 @@ static void opal_proc_construct(opal_proc_t* proc) proc->proc_convertor = NULL; proc->proc_flags = 0; proc->proc_name = *OPAL_NAME_INVALID; - proc->proc_hostname = NULL; } static void opal_proc_destruct(opal_proc_t* proc) { proc->proc_flags = 0; proc->proc_name = *OPAL_NAME_INVALID; - proc->proc_hostname = NULL; proc->proc_convertor = NULL; } @@ -188,30 +185,26 @@ struct opal_proc_t *(*opal_proc_for_name) (const opal_process_name_t name) = opa char* opal_get_proc_hostname(const opal_proc_t *proc) { int ret; + char *hostname; /* if the proc is NULL, then we can't know */ if (NULL == proc) { - return "unknown"; + return strdup("unknown"); } /* if it is my own hostname we are after, then just hand back * the value in opal_process_info */ if (proc == opal_proc_my_name) { - return opal_process_info.nodename; - } - - /* see if we already have the data - if so, pass it back */ - if (NULL != 
proc->proc_hostname) { - return proc->proc_hostname; + return strdup(opal_process_info.nodename); } /* if we don't already have it, then try to get it */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc->proc_name, - (char**)&(proc->proc_hostname), PMIX_STRING); + (char**)&hostname, PMIX_STRING); if (OPAL_SUCCESS != ret) { - return "unknown"; // return something so the caller doesn't segfault + return strdup("unknown"); // return something so the caller doesn't segfault } - /* user is not allowed to release the data */ - return proc->proc_hostname; + /* the returned string is heap-allocated: the caller owns it and must free() it */ + return hostname; } diff --git a/opal/util/proc.h b/opal/util/proc.h index bd54dcb7ec..3bd86b6292 100644 --- a/opal/util/proc.h +++ b/opal/util/proc.h @@ -48,7 +48,7 @@ #define OPAL_VPID_WILDCARD (OPAL_VPID_MAX + 1) #define OPAL_PROC_MY_NAME (opal_proc_local_get()->proc_name) -#define OPAL_PROC_MY_HOSTNAME (opal_proc_local_get()->proc_hostname) +#define OPAL_PROC_MY_HOSTNAME (opal_process_info.nodename) #define OPAL_NAME_WILDCARD (&opal_name_wildcard) OPAL_DECLSPEC extern opal_process_name_t opal_name_wildcard; @@ -91,9 +91,6 @@ typedef struct opal_proc_t { opal_hwloc_locality_t proc_flags; /** Base convertor for the proc described by this process */ struct opal_convertor_t* proc_convertor; - /** A pointer to the name of this host - data is - * actually stored outside of this framework. */ - char* proc_hostname; } opal_proc_t; OBJ_CLASS_DECLARATION(opal_proc_t);