As per the email discussion, revise the sparse handling of hostnames so that we avoid potential infinite loops while allowing large-scale users to improve their startup time:

* add a new MCA param orte_hostname_cutoff to specify the number of nodes at which we stop including hostnames. This defaults to UINT_MAX, so hostnames are always included. If a value is given, we include hostnames only for allocations smaller than that limit (usage example below).
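
For example, a large-scale user who wants to skip hostname distribution on any allocation of 1024 nodes or more could launch with (hypothetical values):

    mpirun -mca orte_hostname_cutoff 1024 -np 8192 ./my_app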

* remove ompi_proc_get_hostname. Replace all occurrences with direct access to ompi_proc_t's proc_hostname field, protected by an appropriate NULL check (see the sketch below)
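
A minimal sketch of the replacement pattern (the helper is hypothetical and not part of the commit; the actual call sites inline the ternary, as the diffs below show):

    #include "ompi/proc/proc.h"

    /* Hypothetical helper illustrating the pattern: direct field access,
     * guarded against NULL because above the cutoff the field stays unset
     * until modex info for that peer is retrieved. */
    static inline const char *proc_hostname_or(const ompi_proc_t *proc,
                                               const char *fallback)
    {
        return (NULL == proc->proc_hostname) ? fallback : proc->proc_hostname;
    }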

* modify the OMPI-ORTE integration component so that any call to modex_recv automatically loads the ompi_proc_t->proc_hostname field in addition to returning the requested info. Thus, retrieving any modex info for a process automatically supplies that process's hostname. Note that on-demand retrieval is still enabled - i.e., if we are running under direct launch with PMI, the hostname is fetched upon the first call to modex_recv, and the ompi_proc_t->proc_hostname field is then loaded (condensed sketch below)
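
A condensed sketch of the shape of that change, mirroring the fetch wrapper in the integration diff below (error handling abbreviated):

    int ompi_rte_db_fetch(const orte_process_name_t *nm, const char *key,
                          void **data, opal_data_type_t type)
    {
        int rc = opal_db.fetch((opal_identifier_t*)nm, key, data, type);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }
        /* side effect: cache the peer's hostname on first contact */
        ompi_proc_t *proct = ompi_proc_find(nm);
        if (NULL == proct->proc_hostname) {
            opal_db.fetch_pointer((opal_identifier_t*)nm, ORTE_DB_HOSTNAME,
                                  (void**)&proct->proc_hostname, OPAL_STRING);
        }
        return OMPI_SUCCESS;
    }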

* remove a stale MCA param, mpi_keep_peer_hostnames, that was no longer used anywhere in the code base

* add an envar lookup in ess/pmi for the number of nodes in the allocation. Sadly, PMI itself doesn't provide that info, so we have to get it a different way. Currently, we support PBS-based systems and SLURM; for any other environment, rank 0 will emit a warning and we assume the max number of daemons, so hostnames are always retained (see the sketch below)
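
The fallback chain, condensed from the ess/pmi diff below:

    char *tmp;
    if (NULL != (tmp = getenv("SLURM_NNODES"))) {           /* SLURM */
        orte_process_info.num_daemons = strtol(tmp, NULL, 10);
    } else if (NULL != (tmp = getenv("PBS_NUM_NODES"))) {   /* PBS/Torque */
        orte_process_info.num_daemons = strtol(tmp, NULL, 10);
    } else {
        /* unknown environment: rank 0 emits a warning, and we assume the
         * max so that hostnames are always retained */
        orte_process_info.num_daemons = UINT_MAX;
    }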

This commit was SVN r29052.
Ralph Castain 2013-08-20 18:59:36 +00:00
parent f49f879b2d
commit 45e695928f
27 changed files with 365 additions and 254 deletions


@ -13,6 +13,7 @@
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -408,11 +409,11 @@ static int mca_bml_r2_add_procs( size_t nprocs,
"unreachable proc",
true,
OMPI_NAME_PRINT(&(ompi_proc_local_proc->proc_name)),
(ompi_proc_get_hostname(ompi_proc_local_proc) ?
ompi_proc_get_hostname(ompi_proc_local_proc) : "unknown!"),
(NULL != ompi_proc_local_proc->proc_hostname ?
ompi_proc_local_proc->proc_hostname : "unknown!"),
OMPI_NAME_PRINT(&(unreach_proc->proc_name)),
(ompi_proc_get_hostname(unreach_proc) ?
ompi_proc_get_hostname(unreach_proc) : "unknown!"),
(NULL != unreach_proc->proc_hostname ?
unreach_proc->proc_hostname : "unknown!"),
btl_names);
}


@ -13,7 +13,8 @@
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
@ -62,8 +63,9 @@ do { \
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
__FILE__, __LINE__, __func__, \
ompi_process_info.nodename); \
if(proc && ompi_proc_get_hostname(proc)) { \
mca_btl_base_err("to: %s ", ompi_proc_get_hostname(proc)); \
if(proc) { \
mca_btl_base_err("to: %s ", (NULL == proc->proc_hostname) ? \
"unknown" : proc->proc_hostname); \
} \
mca_btl_base_err args; \
mca_btl_base_err("\n"); \


@ -17,6 +17,7 @@
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -483,16 +484,17 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
if(mca_btl_openib_get_transport_type(openib_btl) != endpoint->rem_info.rem_transport_type) {
opal_show_help("help-mpi-btl-openib.txt",
"conflicting transport types", true,
ompi_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev),
(openib_btl->device->ib_dev_attr).vendor_id,
(openib_btl->device->ib_dev_attr).vendor_part_id,
mca_btl_openib_transport_name_strings[mca_btl_openib_get_transport_type(openib_btl)],
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id,
mca_btl_openib_transport_name_strings[endpoint->rem_info.rem_transport_type]);
"conflicting transport types", true,
ompi_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev),
(openib_btl->device->ib_dev_attr).vendor_id,
(openib_btl->device->ib_dev_attr).vendor_part_id,
mca_btl_openib_transport_name_strings[mca_btl_openib_get_transport_type(openib_btl)],
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id,
mca_btl_openib_transport_name_strings[endpoint->rem_info.rem_transport_type]);
return OMPI_ERROR;
}
@ -551,7 +553,8 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
(openib_btl->device->ib_dev_attr).vendor_id,
(openib_btl->device->ib_dev_attr).vendor_part_id,
mca_btl_openib_component.receive_queues,
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id,
recv_qps);
@ -573,7 +576,8 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
(openib_btl->device->ib_dev_attr).vendor_id,
(openib_btl->device->ib_dev_attr).vendor_part_id,
mca_btl_openib_component.receive_queues,
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id,
values.receive_queues);


@ -18,6 +18,7 @@
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -535,7 +536,8 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
break;
case MCA_BTL_OPENIB_CONTROL_CTS:
OPAL_OUTPUT((-1, "received CTS from %s (buffer %p): posted recvs %d, sent cts %d",
ompi_proc_get_hostname(ep->endpoint_proc->proc_ompi),
(NULL == ep->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : ep->endpoint_proc->proc_ompi->proc_hostname,
(void*) ctl_hdr,
ep->endpoint_posted_recvs, ep->endpoint_cts_sent));
ep->endpoint_cts_received = true;
@ -3530,9 +3532,9 @@ error:
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
IBV_WC_RETRY_EXC_ERR == wc->status) {
char *peer_hostname =
(NULL != ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)) ?
(char*)ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi) :
const char *peer_hostname =
(NULL != endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
endpoint->endpoint_proc->proc_ompi->proc_hostname :
"<unknown -- please run with mpi_keep_peer_hostnames=1>";
const char *device_name =
ibv_get_device_name(endpoint->qps[qp].qp->lcl_qp->context->device);
@ -3543,12 +3545,15 @@ error:
"pp rnr retry exceeded" :
"srq rnr retry exceeded", true,
ompi_process_info.nodename, device_name,
peer_hostname);
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname);
} else if (IBV_WC_RETRY_EXC_ERR == wc->status) {
opal_show_help("help-mpi-btl-openib.txt",
"pp retry exceeded", true,
ompi_process_info.nodename,
device_name, peer_hostname);
device_name,
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname);
}
}


@ -17,6 +17,7 @@
* Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2010-2011 IBM Corporation. All rights reserved.
* Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved
* Copyright (c) 2013 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
@ -507,7 +508,8 @@ static void cts_sent(mca_btl_base_module_t* btl,
/* Nothing to do/empty function (we can't pass in a NULL pointer
for the des_cbfunc) */
OPAL_OUTPUT((-1, "CTS send to %s completed",
ompi_proc_get_hostname(ep->endpoint_proc->proc_ompi)));
(NULL == ep->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : ep->endpoint_proc->proc_ompi->proc_hostname));
}
/*
@ -522,7 +524,8 @@ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
mca_btl_openib_control_header_t *ctl_hdr;
OPAL_OUTPUT((-1, "SENDING CTS to %s on qp index %d (QP num %d)",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
mca_btl_openib_component.credits_qp,
endpoint->qps[mca_btl_openib_component.credits_qp].qp->lcl_qp->qp_num));
sc_frag = alloc_control_frag(endpoint->endpoint_btl);
@ -592,7 +595,8 @@ void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint)
transport_type_ib_p = (IBV_TRANSPORT_IB == endpoint->endpoint_btl->device->ib_dev->transport_type);
#endif
OPAL_OUTPUT((-1, "cpc_complete to peer %s: is IB %d, initiatior %d, cts received: %d",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
transport_type_ib_p,
endpoint->endpoint_initiator,
endpoint->endpoint_cts_received));
@ -605,7 +609,8 @@ void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint)
mark us as connected */
if (endpoint->endpoint_cts_received) {
OPAL_OUTPUT((-1, "cpc_complete to %s -- already got CTS, so marking endpoint as complete",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)));
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
mca_btl_openib_endpoint_connected(endpoint);
}
}


@ -3,6 +3,7 @@
* Copyright (c) 2007 Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
@ -457,7 +458,8 @@ int ompi_btl_openib_connect_base_alloc_cts(mca_btl_base_endpoint_t *endpoint)
mca_btl_openib_component.credits_qp;
endpoint->endpoint_cts_frag.super.endpoint = endpoint;
OPAL_OUTPUT((-1, "Got a CTS frag for peer %s, addr %p, length %d, lkey %d",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
(void*) endpoint->endpoint_cts_frag.super.sg_entry.addr,
endpoint->endpoint_cts_frag.super.sg_entry.length,
endpoint->endpoint_cts_frag.super.sg_entry.lkey));


@ -6,6 +6,7 @@
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
@ -716,7 +717,8 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp
(void*) endpoint,
(void*) endpoint->endpoint_local_cpc,
endpoint->endpoint_initiator ? "am" : "am NOT",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)));
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
/* If we're the initiator, then open all the QPs */
if (contents->endpoint->endpoint_initiator) {
@ -845,7 +847,8 @@ static int handle_connect_request(struct rdma_cm_event *event)
(void*) endpoint,
(void*) endpoint->endpoint_local_cpc,
endpoint->endpoint_initiator ? "am" : "am NOT",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)));
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
if (endpoint->endpoint_initiator) {
reject_reason_t reason = REJECT_WRONG_DIRECTION;
@ -906,7 +909,8 @@ static int handle_connect_request(struct rdma_cm_event *event)
}
OPAL_OUTPUT((-1, "Posted CTS receiver buffer (%p) for peer %s, qp index %d (QP num %d), WR ID %p, SG addr %p, len %d, lkey %d",
(void*) wr->sg_list[0].addr,
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
qpnum,
endpoint->qps[qpnum].qp->lcl_qp->qp_num,
(void*) wr->wr_id,
@ -1097,7 +1101,8 @@ static void *local_endpoint_cpc_complete(void *context)
mca_btl_openib_endpoint_t *endpoint = (mca_btl_openib_endpoint_t *)context;
OPAL_OUTPUT((-1, "MAIN local_endpoint_cpc_complete to %s",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)));
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
mca_btl_openib_endpoint_cpc_complete(endpoint);
return NULL;
@ -1117,7 +1122,8 @@ static int rdmacm_connect_endpoint(id_context_t *context,
if (contents->server) {
endpoint = context->endpoint;
OPAL_OUTPUT((-1, "SERVICE Server CPC complete to %s",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)));
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
} else {
endpoint = contents->endpoint;
endpoint->rem_info.rem_index =
@ -1132,7 +1138,8 @@ static int rdmacm_connect_endpoint(id_context_t *context,
contents->on_client_list = true;
}
OPAL_OUTPUT((-1, "SERVICE Client CPC complete to %s",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)));
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
}
if (NULL == endpoint) {
BTL_ERROR(("Can't find endpoint"));
@ -1144,8 +1151,12 @@ static int rdmacm_connect_endpoint(id_context_t *context,
/* Only notify the upper layers after the last QP has been
connected */
if (++data->rdmacm_counter < mca_btl_openib_component.num_qps) {
BTL_VERBOSE(("%s to peer %s, count == %d", contents->server?"server":"client", ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), data->rdmacm_counter));
OPAL_OUTPUT((-1, "%s to peer %s, count == %d", contents->server?"server":"client", ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), data->rdmacm_counter));
BTL_VERBOSE(("%s to peer %s, count == %d", contents->server?"server":"client",
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname, data->rdmacm_counter));
OPAL_OUTPUT((-1, "%s to peer %s, count == %d", contents->server?"server":"client",
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname, data->rdmacm_counter));
return OMPI_SUCCESS;
}
@ -1376,7 +1387,8 @@ static int finish_connect(id_context_t *context)
OPAL_OUTPUT((-1, "Posted initiator CTS buffer (%p, length %d) for peer %s, qp index %d (QP num %d)",
(void*) wr->sg_list[0].addr,
wr->sg_list[0].length,
ompi_proc_get_hostname(contents->endpoint->endpoint_proc->proc_ompi),
(NULL == contents->endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : contents->endpoint->endpoint_proc->proc_ompi->proc_hostname,
context->qpnum,
contents->endpoint->qps[context->qpnum].qp->lcl_qp->qp_num));
}
@ -1443,7 +1455,8 @@ static int finish_connect(id_context_t *context)
(void*) contents->endpoint,
(void*) contents->endpoint->endpoint_local_cpc,
contents->endpoint->endpoint_initiator ? "am" : "am NOT",
ompi_proc_get_hostname(contents->endpoint->endpoint_proc->proc_ompi)));
(NULL == contents->endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : contents->endpoint->endpoint_proc->proc_ompi->proc_hostname));
rc = rdma_connect(context->id, &conn_param);
if (0 != rc) {
BTL_ERROR(("rdma_connect Failed with %d", rc));
@ -1485,7 +1498,8 @@ static void *show_help_rdmacm_event_error(void *c)
ompi_process_info.nodename,
device,
rdma_event_str(event->event),
ompi_proc_get_hostname(context->endpoint->endpoint_proc->proc_ompi));
(NULL == context->endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : context->endpoint->endpoint_proc->proc_ompi->proc_hostname));
}
return NULL;


@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2010 Oracle and/or its affiliates. All rights reserved
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -377,7 +378,9 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
int rc, *a = NULL;
size_t i, j;
proc_hostname = ompi_proc_get_hostname(btl_proc->proc_ompi);
if (NULL == (proc_hostname = btl_proc->proc_ompi->proc_hostname)) {
return OMPI_ERR_UNREACH;
}
#ifndef WORDS_BIGENDIAN
/* if we are little endian and our peer is not so lucky, then we


@ -12,6 +12,7 @@
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -259,7 +260,8 @@ static int mca_btl_udapl_proc_address_match(
BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP,
("help-mpi-btl-udapl.txt", "no network match",
true, btl_addr_string, ompi_process_info.nodename,
ompi_proc_get_hostname(peer_proc->proc_ompi)));
(NULL == peer_proc->proc_ompi->proc_hostname) ?
"unknown" : peer_proc->proc_ompi->proc_hostname));
return OMPI_ERR_OUT_OF_RESOURCE;
}


@ -12,6 +12,7 @@
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -295,22 +296,15 @@ static int match_modex(ompi_btl_usnic_module_t *module,
/* If MTU does not match, throw an error */
if (proc->proc_modex[i].mtu != module->if_mtu) {
const char *peer_hostname;
if (NULL != ompi_proc_get_hostname(proc->proc_ompi)) {
peer_hostname = ompi_proc_get_hostname(proc->proc_ompi);
} else {
peer_hostname =
"<unknown -- please run with mpi_keep_peer_hostnames=1>";
}
opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
true,
ompi_process_info.nodename,
ibv_get_device_name(module->device),
module->port_num,
module->if_mtu,
peer_hostname,
proc->proc_modex[i].mtu);
true,
ompi_process_info.nodename,
ibv_get_device_name(module->device),
module->port_num,
module->if_mtu,
(NULL == proc->proc_ompi->proc_hostname) ?
"unknown" : proc->proc_ompi->proc_hostname,
proc->proc_modex[i].mtu);
return -1;
}


@ -3,6 +3,7 @@
* Copyright (c) 2007-2012 Mellanox Technologies. All rights reserved.
*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -621,7 +622,8 @@ int ompi_common_ofacm_base_alloc_cts(mca_btl_base_endpoint_t *endpoint)
mca_btl_openib_component.credits_qp;
endpoint->endpoint_cts_frag.super.endpoint = endpoint;
OPAL_OUTPUT((-1, "Got a CTS frag for peer %s, addr %p, length %d, lkey %d",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
(void*) endpoint->endpoint_cts_frag.super.sg_entry.addr,
endpoint->endpoint_cts_frag.super.sg_entry.length,
endpoint->endpoint_cts_frag.super.sg_entry.lkey));


@ -1,5 +1,6 @@
/*
* Copyright (C) Mellanox Technologies Ltd. 2001-2011. ALL RIGHTS RESERVED.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -508,7 +509,9 @@ int ompi_mtl_mxm_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs,
MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
for (i = 0; i < nprocs; ++i) {
if (MXM_OK != conn_reqs[i].error) {
MXM_ERROR("MXM EP connect to %s error: %s\n", ompi_proc_get_hostname(procs[i]),
MXM_ERROR("MXM EP connect to %s error: %s\n",
(NULL == procs[i]->proc_hostname) ?
"unknown" : procs[i]->proc_hostname,
mxm_error_string(conn_reqs[i].error));
}
}


@ -10,6 +10,7 @@
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 QLogic Corporation. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -313,7 +314,8 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
errstr ? errstr : "unknown connect error");
for (j = 0; j < (int) nprocs; j++) {
if (errs_out[j] == thiserr) {
opal_output(0, " %s", ompi_proc_get_hostname(procs[j]));
opal_output(0, " %s", (NULL == procs[j]->proc_hostname) ?
"unknown" : procs[j]->proc_hostname);
}
}
opal_output(0, "\n");


@ -12,6 +12,7 @@
* All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -368,18 +369,11 @@ mca_pml_base_pml_check_selected(const char *my_pml,
/* if that module doesn't match my own, return an error */
if ((size != strlen(my_pml) + 1) ||
(0 != strcmp(my_pml, remote_pml))) {
if (ompi_proc_get_hostname(procs[0])) {
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
OMPI_NAME_PRINT(&ompi_proc_local()->proc_name),
my_pml, OMPI_NAME_PRINT(&procs[0]->proc_name),
ompi_proc_get_hostname(procs[0]),
remote_pml);
} else {
opal_output(0, "%s selected pml %s, but peer %s selected pml %s",
OMPI_NAME_PRINT(&ompi_proc_local()->proc_name),
my_pml, OMPI_NAME_PRINT(&procs[0]->proc_name),
remote_pml);
}
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
OMPI_NAME_PRINT(&ompi_proc_local()->proc_name),
my_pml, OMPI_NAME_PRINT(&procs[0]->proc_name),
(NULL == procs[0]->proc_hostname) ? "unknown" : procs[0]->proc_hostname,
remote_pml);
free(remote_pml); /* cleanup before returning */
return OMPI_ERR_UNREACH;
}


@ -2,6 +2,7 @@
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -408,7 +409,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
recvreq->remote_req_send.pval, (void *)recvreq,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
ompi_proc_get_hostname(ompi_proc));
(NULL == ompi_proc->proc_hostname) ? "unknown" : ompi_proc->proc_hostname);
mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false);
return;
}
@ -1415,7 +1416,7 @@ void mca_pml_bfo_map_out_btl(struct mca_btl_base_module_t* btl,
btl->btl_component->btl_version.mca_component_name,
OMPI_PROC_MY_NAME->vpid,
btlname, errproc->proc_name.vpid,
ompi_proc_get_hostname(errproc));
(NULL == errproc->proc_hostname) ? "unknown" : errproc->proc_hostname);
/* Need to search for any pending packets associated
* with this endpoint and remove them. We may also


@ -1,6 +1,7 @@
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
@ -65,6 +66,7 @@ typedef orte_node_rank_t ompi_node_rank_t;
typedef orte_local_rank_t ompi_local_rank_t;
#define ompi_process_info orte_process_info
#define ompi_rte_proc_is_bound orte_proc_is_bound
#define ompi_rte_hostname_cutoff orte_hostname_cutoff
/* Error handling objects and operations */
OMPI_DECLSPEC void ompi_rte_abort(int error_code, char *fmt, ...);


@ -1,6 +1,7 @@
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
*/
#include "ompi_config.h"
#include "ompi/constants.h"
@ -35,6 +36,7 @@
#include "ompi/mca/rte/base/base.h"
#include "ompi/mca/rte/rte.h"
#include "ompi/debuggers/debuggers.h"
#include "ompi/proc/proc.h"
void ompi_rte_abort(int error_code, char *fmt, ...)
{
@ -149,21 +151,54 @@ int ompi_rte_db_fetch(const orte_process_name_t *nm,
const char *key,
void **data, opal_data_type_t type)
{
return opal_db.fetch((opal_identifier_t*)nm, key, data, type);
ompi_proc_t *proct;
int rc;
if (OPAL_SUCCESS != (rc = opal_db.fetch((opal_identifier_t*)nm, key, data, type))) {
return rc;
}
/* update the hostname */
proct = ompi_proc_find(nm);
if (NULL == proct->proc_hostname) {
opal_db.fetch_pointer((opal_identifier_t*)nm, ORTE_DB_HOSTNAME, (void**)&proct->proc_hostname, OPAL_STRING);
}
return OMPI_SUCCESS;
}
int ompi_rte_db_fetch_pointer(const orte_process_name_t *nm,
const char *key,
void **data, opal_data_type_t type)
{
return opal_db.fetch_pointer((opal_identifier_t*)nm, key, data, type);
ompi_proc_t *proct;
int rc;
if (OPAL_SUCCESS != (rc = opal_db.fetch_pointer((opal_identifier_t*)nm, key, data, type))) {
return rc;
}
/* update the hostname */
proct = ompi_proc_find(nm);
if (NULL == proct->proc_hostname) {
opal_db.fetch_pointer((opal_identifier_t*)nm, ORTE_DB_HOSTNAME, (void**)&proct->proc_hostname, OPAL_STRING);
}
return OMPI_SUCCESS;
}
int ompi_rte_db_fetch_multiple(const orte_process_name_t *nm,
const char *key,
opal_list_t *kvs)
{
return opal_db.fetch_multiple((opal_identifier_t*)nm, key, kvs);
ompi_proc_t *proct;
int rc;
if (OPAL_SUCCESS != (rc = opal_db.fetch_multiple((opal_identifier_t*)nm, key, kvs))) {
return rc;
}
/* update the hostname */
proct = ompi_proc_find(nm);
if (NULL == proct->proc_hostname) {
opal_db.fetch_pointer((opal_identifier_t*)nm, ORTE_DB_HOSTNAME, (void**)&proct->proc_hostname, OPAL_STRING);
}
return OMPI_SUCCESS;
}
int ompi_rte_db_remove(const orte_process_name_t *nm,


@ -12,6 +12,7 @@
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -156,8 +157,20 @@ int ompi_proc_complete_init(void)
break;
}
/* get the remote architecture */
if (ompi_process_info.num_daemons < ompi_rte_hostname_cutoff) {
/* retrieve the hostname */
ret = ompi_modex_recv_string_pointer(OMPI_DB_HOSTNAME, proc, (void**)&(proc->proc_hostname), OPAL_STRING);
if (OMPI_SUCCESS != ret) {
break;
}
} else {
/* just set the hostname to NULL for now - we'll fill it in
* as modex_recv's are called for procs we will talk to
*/
proc->proc_hostname = NULL;
}
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
/* get the remote architecture */
{
uint32_t *ui32ptr;
ui32ptr = &(proc->proc_arch);
@ -185,21 +198,6 @@ int ompi_proc_complete_init(void)
return errcode;
}
const char *ompi_proc_get_hostname (ompi_proc_t *proc)
{
int ret;
if (NULL == proc->proc_hostname) {
/* get a pointer to the name of the node it is on */
ret = ompi_modex_recv_string_pointer(OMPI_DB_HOSTNAME, proc, (void**)&(proc->proc_hostname), OPAL_STRING);
if (OMPI_SUCCESS != ret) {
return NULL;
}
}
return proc->proc_hostname;
}
int ompi_proc_finalize (void)
{
opal_list_item_t *item;
@ -371,7 +369,6 @@ int ompi_proc_refresh(void) {
ompi_vpid_t i = 0;
int ret=OMPI_SUCCESS;
opal_hwloc_locality_t *hwlocale;
uint32_t *uiptr;
OPAL_THREAD_LOCK(&ompi_proc_lock);
@ -397,25 +394,31 @@ int ompi_proc_refresh(void) {
if (OMPI_SUCCESS != ret) {
break;
}
proc->proc_hostname = NULL;
if (ompi_process_info.num_daemons < ompi_rte_hostname_cutoff) {
/* retrieve the hostname */
ret = ompi_modex_recv_string_pointer(OMPI_DB_HOSTNAME, proc, (void**)&(proc->proc_hostname), OPAL_STRING);
if (OMPI_SUCCESS != ret) {
break;
}
} else {
/* just set the hostname to NULL for now - we'll fill it in
* as modex_recv's are called for procs we will talk to
*/
proc->proc_hostname = NULL;
}
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
/* get the remote architecture */
uiptr = &(proc->proc_arch);
ret = ompi_modex_recv_key_value("OMPI_ARCH", proc, (void**)&uiptr, OPAL_UINT32);
/* if arch is different than mine, create a new convertor for this proc */
if (proc->proc_arch != opal_local_arch) {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
OBJ_RELEASE(proc->proc_convertor);
proc->proc_convertor = opal_convertor_create(proc->proc_arch, 0);
#else
opal_show_help("help-mpi-runtime",
"heterogeneous-support-unavailable",
true, ompi_process_info.nodename,
proc->proc_hostname == NULL ? "<hostname unavailable>" :
proc->proc_hostname);
OPAL_THREAD_UNLOCK(&ompi_proc_lock);
return OMPI_ERR_NOT_SUPPORTED;
#endif
}
#else
/* must be same arch as my own */
proc->proc_arch = opal_local_arch;
#endif
}
}
@ -456,7 +459,6 @@ ompi_proc_pack(ompi_proc_t **proclist, int proclistsize, opal_buffer_t* buf)
OPAL_THREAD_UNLOCK(&ompi_proc_lock);
return rc;
}
(void) ompi_proc_get_hostname (proclist[i]);
rc = opal_dss.pack(buf, &(proclist[i]->proc_hostname), 1, OPAL_STRING);
if(rc != OPAL_SUCCESS) {
OMPI_ERROR_LOG(rc);


@ -12,6 +12,7 @@
* Copyright (c) 2006-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -304,15 +305,6 @@ OMPI_DECLSPEC int ompi_proc_unpack(opal_buffer_t *buf,
*/
OMPI_DECLSPEC int ompi_proc_refresh(void);
/**
* Retrieve the hostname for a process
*
* @note Retrieving the hostname may require communication.
*
* @param proc process to retrieve hostname from
*/
OMPI_DECLSPEC const char *ompi_proc_get_hostname (ompi_proc_t *proc);
END_C_DECLS
#endif /* OMPI_PROC_PROC_H */


@ -14,6 +14,7 @@
* Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -57,7 +58,6 @@ bool ompi_mpi_show_mca_params = false;
char *ompi_mpi_show_mca_params_file = NULL;
bool ompi_mpi_abort_print_stack = false;
int ompi_mpi_abort_delay = 0;
bool ompi_mpi_keep_peer_hostnames = true;
bool ompi_mpi_keep_fqdn_hostnames = false;
int ompi_mpi_leave_pinned = -1;
bool ompi_mpi_leave_pinned_pipeline = false;
@ -211,16 +211,6 @@ int ompi_mpi_register_params(void)
/* User-level process pinning controls */
/* Do we want to save hostnames for debugging messages? This can
eat quite a bit of memory... */
ompi_mpi_keep_peer_hostnames = true;
(void) mca_base_var_register("ompi", "mpi", NULL, "keep_peer_hostnames",
"If nonzero, save the string hostnames of all MPI peer processes (mostly for error / debugging output messages). This can add quite a bit of memory usage to each MPI process.",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mpi_keep_peer_hostnames);
/* MPI_ABORT controls */
ompi_mpi_abort_delay = 0;
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_delay",


@ -13,6 +13,7 @@
* reserved.
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -96,12 +97,6 @@ OMPI_DECLSPEC extern bool ompi_mpi_show_mca_params;
*/
OMPI_DECLSPEC extern char * ompi_mpi_show_mca_params_file;
/**
* Whether we should keep the string hostnames of all the MPI
* process peers around or not (eats up a good bit of memory).
*/
OMPI_DECLSPEC extern bool ompi_mpi_keep_peer_hostnames;
/**
* Whether an MPI_ABORT should print out a stack trace or not.
*/


@ -97,6 +97,7 @@ static int rte_init(void)
orte_node_rank_t node_rank;
char *rmluri;
opal_hwloc_locality_t locality;
char *tmp;
/* run the prolog */
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
@ -346,6 +347,21 @@ static int rte_init(void)
orte_process_info.max_procs = orte_process_info.num_procs;
}
/* set the number of nodes - have to test as it could be
* one of multiple environments
*/
if (NULL != (tmp = getenv("SLURM_NNODES"))) {
orte_process_info.num_daemons = strtol(tmp, NULL, 10);
} else if (NULL != (tmp = getenv("PBS_NUM_NODES"))) {
orte_process_info.num_daemons = strtol(tmp, NULL, 10);
} else {
if (0 == ORTE_PROC_MY_NAME->vpid) {
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:num_daemons", true);
}
orte_process_info.num_daemons = UINT_MAX;
}
/* construct the PMI RTE string */
rmluri = orte_rml.get_contact_info();


@ -56,3 +56,8 @@ again.
An error occurred while trying to pack the information about the job. More nodes
have been found than the %d expected. Please check your configuration files such
as the mapping.
#
[orte_init:startup:num_daemons]
Open MPI was unable to determine the number of nodes in your allocation. We
are therefore assuming a very large number to ensure you receive proper error
messages.


@ -13,6 +13,7 @@
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -78,6 +79,7 @@ bool orte_have_fqdn_allocation = false;
bool orte_show_resolved_nodenames;
bool orte_retain_aliases;
int orte_use_hostname_alias;
orte_vpid_t orte_hostname_cutoff;
int orted_debug_failure;
int orted_debug_failure_delay;


@ -13,6 +13,7 @@
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -606,6 +607,7 @@ ORTE_DECLSPEC extern bool orte_have_fqdn_allocation;
ORTE_DECLSPEC extern bool orte_show_resolved_nodenames;
ORTE_DECLSPEC extern bool orte_retain_aliases;
ORTE_DECLSPEC extern int orte_use_hostname_alias;
ORTE_DECLSPEC extern orte_vpid_t orte_hostname_cutoff;
/* debug flags */
ORTE_DECLSPEC extern int orted_debug_failure;


@ -13,6 +13,7 @@
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -435,6 +436,15 @@ int orte_register_params(void)
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_use_hostname_alias);
/* cutoff for including hostnames in modex */
orte_hostname_cutoff = UINT_MAX;
(void) mca_base_var_register ("orte", "orte", NULL, "hostname_cutoff",
"If the number of nodes in the allocation exceeds the provided value,"
"hostnames for remote processes will not be supplied to applications [default: UINT_MAX]",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_hostname_cutoff);
orte_xml_output = false;
(void) mca_base_var_register ("orte", "orte", NULL, "xml_output",
"Display all output in XML format (default: false)",


@ -11,6 +11,7 @@
* All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
@ -272,6 +273,12 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
/* setup a buffer for tmp use */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
/* send the number of nodes */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &daemons->num_procs, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* only send info on nodes that have daemons on them, and
* only regarding daemons that have changed - i.e., new
* daemons since the last time we sent the info - so we
@ -299,40 +306,42 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the name of the node */
if (!orte_keep_fqdn_hostnames) {
nodename = strdup(node->name);
/* if the nodename is an IP address, do not mess with it! */
if (!opal_net_isaddr(nodename)) {
/* not an IP address */
if (NULL != (ptr = strchr(nodename, '.'))) {
*ptr = '\0';
if (daemons->num_procs < orte_hostname_cutoff) {
/* pack the name of the node */
if (!orte_keep_fqdn_hostnames) {
nodename = strdup(node->name);
/* if the nodename is an IP address, do not mess with it! */
if (!opal_net_isaddr(nodename)) {
/* not an IP address */
if (NULL != (ptr = strchr(nodename, '.'))) {
*ptr = '\0';
}
}
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodename, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(nodename);
} else {
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->name, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* if requested, pack any aliases */
if (orte_retain_aliases) {
uint8_t naliases, ni;
naliases = opal_argv_count(node->alias);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &naliases, 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
for (ni=0; ni < naliases; ni++) {
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->alias[ni], 1, OPAL_STRING))) {
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodename, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(nodename);
} else {
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->name, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* if requested, pack any aliases */
if (orte_retain_aliases) {
uint8_t naliases, ni;
naliases = opal_argv_count(node->alias);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &naliases, 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
for (ni=0; ni < naliases; ni++) {
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->alias[ni], 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
}
@ -358,7 +367,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
int orte_util_decode_nodemap(opal_byte_object_t *bo)
{
int n;
int32_t num_daemons;
orte_vpid_t num_daemons;
orte_process_name_t daemon;
opal_buffer_t buf;
int rc=ORTE_SUCCESS;
@ -378,75 +387,82 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
OBJ_CONSTRUCT(&buf, opal_buffer_t);
opal_dss.load(&buf, bo->bytes, bo->size);
/* unpack the number of daemons */
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_daemons, &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* set the daemon jobid */
daemon.jobid = ORTE_DAEMON_JOBID(ORTE_PROC_MY_NAME->jobid);
num_daemons = 0;
n=1;
while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &daemon.vpid, &n, ORTE_VPID))) {
++num_daemons;
/* unpack and store the node's name */
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &nodename, &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&daemon, OPAL_DB_INTERNAL, ORTE_DB_HOSTNAME, nodename, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* now store a direct reference so we can quickly lookup the daemon from a hostname */
opal_output_verbose(2, orte_nidmap_output,
"%s storing nodename %s for daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
nodename, ORTE_VPID_PRINT(daemon.vpid));
if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)ORTE_NAME_WILDCARD, OPAL_DB_INTERNAL, nodename, &daemon.vpid, OPAL_UINT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
"%s orte:util:decode:nidmap daemon %s node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_VPID_PRINT(daemon.vpid), nodename));
/* if this is my daemon, then store the data for me too */
if (daemon.vpid == ORTE_PROC_MY_DAEMON->vpid) {
if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)ORTE_PROC_MY_NAME, OPAL_DB_INTERNAL, ORTE_DB_HOSTNAME, nodename, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)ORTE_PROC_MY_NAME, OPAL_DB_INTERNAL, ORTE_DB_DAEMON_VPID, &daemon.vpid, OPAL_UINT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* if requested, unpack any aliases */
if (orte_retain_aliases) {
char *alias;
uint8_t naliases, ni;
if (num_daemons < orte_hostname_cutoff) {
/* unpack and store the node's name */
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &naliases, &n, OPAL