As per the email discussion, revise the sparse handling of hostnames so that we avoid potential infinite loops while allowing large-scale users to improve their startup time:
* Add a new MCA param, orte_hostname_cutoff, to specify the number of nodes at which we stop including hostnames. This defaults to UINT_MAX, i.e., always include hostnames. If a value is given, then we include hostnames for any allocation smaller than the given limit.
* Remove ompi_proc_get_hostname. Replace all occurrences with a direct reference to ompi_proc_t's proc_hostname field, protected by an appropriate "if NULL" check.
* Modify the OMPI-ORTE integration component so that any call to modex_recv automatically loads the ompi_proc_t->proc_hostname field as well as returning the requested info. Thus, any process whose modex info you retrieve will automatically receive the hostname. Note that on-demand retrieval is still enabled - i.e., if we are running under direct launch with PMI, the hostname will be fetched upon the first call to modex_recv, and then the ompi_proc_t->proc_hostname field will be loaded.
* Remove a stale MCA param, mpi_keep_peer_hostnames, that was no longer used anywhere in the code base.
* Add an envar lookup in ess/pmi for the number of nodes in the allocation. Sadly, PMI itself doesn't provide that info, so we have to get it a different way. Currently, we support PBS-based systems and SLURM; for any other, rank 0 will emit a warning and we assume the max number of daemons so we will always retain hostnames.

Illustrative sketches of the key changes follow this list.

This commit was SVN r29052.
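How the cutoff param is registered, condensed from the orte_register_params() hunk below (the default is UINT_MAX, so hostnames are always retained unless the user lowers it):

    /* cutoff for including hostnames in modex */
    orte_hostname_cutoff = UINT_MAX;
    (void) mca_base_var_register ("orte", "orte", NULL, "hostname_cutoff",
                                  "If the number of nodes in the allocation exceeds the provided value, "
                                  "hostnames for remote processes will not be supplied to applications [default: UINT_MAX]",
                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                  OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
                                  &orte_hostname_cutoff);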
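Every former ompi_proc_get_hostname() call site now reads proc_hostname directly behind a NULL guard. A minimal sketch of the pattern; the PROC_HOSTNAME_OR macro is a hypothetical convenience for illustration only, not something this patch adds:

    /* hypothetical helper showing the guard written out verbatim at each call site */
    #define PROC_HOSTNAME_OR(p, fallback) \
        ((NULL == (p)->proc_hostname) ? (fallback) : (p)->proc_hostname)

    /* typical use, mirroring the show_help/OPAL_OUTPUT changes below */
    opal_output(0, "peer %s selected a different pml",
                PROC_HOSTNAME_OR(procs[0], "unknown"));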
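The integration-side change in one sketch: every successful fetch for a peer also primes that peer's proc_hostname, which is what keeps on-demand retrieval working under direct PMI launch (condensed from the ompi_rte_db_fetch* bodies below):

    int ompi_rte_db_fetch(const orte_process_name_t *nm,
                          const char *key,
                          void **data, opal_data_type_t type)
    {
        ompi_proc_t *proct;
        int rc;

        if (OPAL_SUCCESS != (rc = opal_db.fetch((opal_identifier_t*)nm, key, data, type))) {
            return rc;
        }
        /* the first successful fetch for this peer also loads its hostname */
        proct = ompi_proc_find(nm);
        if (NULL == proct->proc_hostname) {
            opal_db.fetch_pointer((opal_identifier_t*)nm, ORTE_DB_HOSTNAME,
                                  (void**)&proct->proc_hostname, OPAL_STRING);
        }
        return OMPI_SUCCESS;
    }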
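The node-count discovery in ess/pmi, condensed from the rte_init() hunk below; only SLURM and PBS are recognized, and the fallback deliberately overestimates so hostnames are always retained:

    char *tmp;
    if (NULL != (tmp = getenv("SLURM_NNODES"))) {
        orte_process_info.num_daemons = strtol(tmp, NULL, 10);
    } else if (NULL != (tmp = getenv("PBS_NUM_NODES"))) {
        orte_process_info.num_daemons = strtol(tmp, NULL, 10);
    } else {
        /* unknown environment - warn once (rank 0) and assume the worst case */
        if (0 == ORTE_PROC_MY_NAME->vpid) {
            orte_show_help("help-orte-runtime.txt",
                           "orte_init:startup:num_daemons", true);
        }
        orte_process_info.num_daemons = UINT_MAX;
    }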
parent f49f879b2d
commit 45e695928f
@@ -13,6 +13,7 @@
  * Copyright (c) 2007-2012 Los Alamos National Security, LLC.  All rights
  *                         reserved.
  * Copyright (c) 2008-2009 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -408,11 +409,11 @@ static int mca_bml_r2_add_procs( size_t nprocs,
                            "unreachable proc",
                            true,
                            OMPI_NAME_PRINT(&(ompi_proc_local_proc->proc_name)),
-                           (ompi_proc_get_hostname(ompi_proc_local_proc) ?
-                            ompi_proc_get_hostname(ompi_proc_local_proc) : "unknown!"),
+                           (NULL != ompi_proc_local_proc->proc_hostname ?
+                            ompi_proc_local_proc->proc_hostname : "unknown!"),
                            OMPI_NAME_PRINT(&(unreach_proc->proc_name)),
-                           (ompi_proc_get_hostname(unreach_proc) ?
-                            ompi_proc_get_hostname(unreach_proc) : "unknown!"),
+                           (NULL != unreach_proc->proc_hostname ?
+                            unreach_proc->proc_hostname : "unknown!"),
                            btl_names);
        }

@@ -13,7 +13,8 @@
  * Copyright (c) 2007      Sun Microsystems, Inc.  All rights reserved.
  * Copyright (c) 2012      Los Alamos National Security, LLC.
  *                         All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
  *
@@ -62,8 +63,9 @@ do { \
     OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
     __FILE__, __LINE__, __func__, \
     ompi_process_info.nodename); \
-    if(proc && ompi_proc_get_hostname(proc)) { \
-        mca_btl_base_err("to: %s ", ompi_proc_get_hostname(proc)); \
+    if(proc) { \
+        mca_btl_base_err("to: %s ", (NULL == proc->proc_hostname) ? \
+                         "unknown" : proc->proc_hostname); \
     } \
     mca_btl_base_err args; \
     mca_btl_base_err("\n"); \
@@ -17,6 +17,7 @@
  * Copyright (c) 2006-2007 Voltaire All rights reserved.
  * Copyright (c) 2008-2012 Oracle and/or its affiliates.  All rights reserved.
  * Copyright (c) 2009      IBM Corporation.  All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -483,16 +484,17 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,

    if(mca_btl_openib_get_transport_type(openib_btl) != endpoint->rem_info.rem_transport_type) {
        opal_show_help("help-mpi-btl-openib.txt",
                       "conflicting transport types", true,
                       ompi_process_info.nodename,
                       ibv_get_device_name(openib_btl->device->ib_dev),
                       (openib_btl->device->ib_dev_attr).vendor_id,
                       (openib_btl->device->ib_dev_attr).vendor_part_id,
                       mca_btl_openib_transport_name_strings[mca_btl_openib_get_transport_type(openib_btl)],
-                      ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
+                      (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                      "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
                       endpoint->rem_info.rem_vendor_id,
                       endpoint->rem_info.rem_vendor_part_id,
                       mca_btl_openib_transport_name_strings[endpoint->rem_info.rem_transport_type]);

        return OMPI_ERROR;
    }
@@ -551,7 +553,8 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
                       (openib_btl->device->ib_dev_attr).vendor_id,
                       (openib_btl->device->ib_dev_attr).vendor_part_id,
                       mca_btl_openib_component.receive_queues,
-                      ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
+                      (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                      "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
                       endpoint->rem_info.rem_vendor_id,
                       endpoint->rem_info.rem_vendor_part_id,
                       recv_qps);
@@ -573,7 +576,8 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
                       (openib_btl->device->ib_dev_attr).vendor_id,
                       (openib_btl->device->ib_dev_attr).vendor_part_id,
                       mca_btl_openib_component.receive_queues,
-                      ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
+                      (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                      "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
                       endpoint->rem_info.rem_vendor_id,
                       endpoint->rem_info.rem_vendor_part_id,
                       values.receive_queues);

@@ -18,6 +18,7 @@
  * Copyright (c) 2009-2012 Oracle and/or its affiliates.  All rights reserved.
  * Copyright (c) 2011-2013 NVIDIA Corporation.  All rights reserved.
  * Copyright (c) 2012      Oak Ridge National Laboratory.  All rights reserved
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -535,7 +536,8 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
        break;
    case MCA_BTL_OPENIB_CONTROL_CTS:
        OPAL_OUTPUT((-1, "received CTS from %s (buffer %p): posted recvs %d, sent cts %d",
-                    ompi_proc_get_hostname(ep->endpoint_proc->proc_ompi),
+                    (NULL == ep->endpoint_proc->proc_ompi->proc_hostname) ?
+                    "unknown" : ep->endpoint_proc->proc_ompi->proc_hostname,
                     (void*) ctl_hdr,
                     ep->endpoint_posted_recvs, ep->endpoint_cts_sent));
        ep->endpoint_cts_received = true;
@@ -3530,9 +3532,9 @@ error:

    if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
        IBV_WC_RETRY_EXC_ERR == wc->status) {
-        char *peer_hostname =
-            (NULL != ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)) ?
-            (char*)ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi) :
+        const char *peer_hostname =
+            (NULL != endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+            endpoint->endpoint_proc->proc_ompi->proc_hostname :
            "<unknown -- please run with mpi_keep_peer_hostnames=1>";
        const char *device_name =
            ibv_get_device_name(endpoint->qps[qp].qp->lcl_qp->context->device);
@@ -3543,12 +3545,15 @@ error:
                           "pp rnr retry exceeded" :
                           "srq rnr retry exceeded", true,
                           ompi_process_info.nodename, device_name,
-                          peer_hostname);
+                          (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                          "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname);
        } else if (IBV_WC_RETRY_EXC_ERR == wc->status) {
            opal_show_help("help-mpi-btl-openib.txt",
                           "pp retry exceeded", true,
                           ompi_process_info.nodename,
-                          device_name, peer_hostname);
+                          device_name,
+                          (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                          "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname);
        }
    }

@@ -17,6 +17,7 @@
  * Copyright (c) 2006-2009 Mellanox Technologies, Inc.  All rights reserved.
  * Copyright (c) 2010-2011 IBM Corporation.  All rights reserved.
  * Copyright (c) 2010-2011 Oracle and/or its affiliates.  All rights reserved
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
  *
  * $COPYRIGHT$
  *
@@ -507,7 +508,8 @@ static void cts_sent(mca_btl_base_module_t* btl,
    /* Nothing to do/empty function (we can't pass in a NULL pointer
       for the des_cbfunc) */
    OPAL_OUTPUT((-1, "CTS send to %s completed",
-                ompi_proc_get_hostname(ep->endpoint_proc->proc_ompi)));
+                (NULL == ep->endpoint_proc->proc_ompi->proc_hostname) ?
+                "unknown" : ep->endpoint_proc->proc_ompi->proc_hostname));
}

/*
@@ -522,7 +524,8 @@ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
    mca_btl_openib_control_header_t *ctl_hdr;

    OPAL_OUTPUT((-1, "SENDING CTS to %s on qp index %d (QP num %d)",
-                ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
+                (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
                 mca_btl_openib_component.credits_qp,
                 endpoint->qps[mca_btl_openib_component.credits_qp].qp->lcl_qp->qp_num));
    sc_frag = alloc_control_frag(endpoint->endpoint_btl);
@@ -592,7 +595,8 @@ void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint)
    transport_type_ib_p = (IBV_TRANSPORT_IB == endpoint->endpoint_btl->device->ib_dev->transport_type);
#endif
    OPAL_OUTPUT((-1, "cpc_complete to peer %s: is IB %d, initiatior %d, cts received: %d",
-                ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
+                (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
                 transport_type_ib_p,
                 endpoint->endpoint_initiator,
                 endpoint->endpoint_cts_received));
@@ -605,7 +609,8 @@ void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint)
       mark us as connected */
    if (endpoint->endpoint_cts_received) {
        OPAL_OUTPUT((-1, "cpc_complete to %s -- already got CTS, so marking endpoint as complete",
-                    ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)));
+                    (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                    "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
        mca_btl_openib_endpoint_connected(endpoint);
    }
}

@@ -3,6 +3,7 @@
  * Copyright (c) 2007      Mellanox Technologies, Inc.  All rights reserved.
  * Copyright (c) 2012      Los Alamos National Security, LLC.  All rights
  *                         reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
  *
  * $COPYRIGHT$
  *
@@ -457,7 +458,8 @@ int ompi_btl_openib_connect_base_alloc_cts(mca_btl_base_endpoint_t *endpoint)
        mca_btl_openib_component.credits_qp;
    endpoint->endpoint_cts_frag.super.endpoint = endpoint;
    OPAL_OUTPUT((-1, "Got a CTS frag for peer %s, addr %p, length %d, lkey %d",
-                ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
+                (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
                 (void*) endpoint->endpoint_cts_frag.super.sg_entry.addr,
                 endpoint->endpoint_cts_frag.super.sg_entry.length,
                 endpoint->endpoint_cts_frag.super.sg_entry.lkey));

@@ -6,6 +6,7 @@
  * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
  * Copyright (c) 2012-2013 Los Alamos National Security, LLC.  All rights
  *                         reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
  *
  * $COPYRIGHT$
  *
@@ -716,7 +717,8 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp
                 (void*) endpoint,
                 (void*) endpoint->endpoint_local_cpc,
                 endpoint->endpoint_initiator ? "am" : "am NOT",
-                ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)));
+                (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));

    /* If we're the initiator, then open all the QPs */
    if (contents->endpoint->endpoint_initiator) {
@@ -845,7 +847,8 @@ static int handle_connect_request(struct rdma_cm_event *event)
                 (void*) endpoint,
                 (void*) endpoint->endpoint_local_cpc,
                 endpoint->endpoint_initiator ? "am" : "am NOT",
-                ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)));
+                (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
    if (endpoint->endpoint_initiator) {
        reject_reason_t reason = REJECT_WRONG_DIRECTION;

@@ -906,7 +909,8 @@ static int handle_connect_request(struct rdma_cm_event *event)
    }
    OPAL_OUTPUT((-1, "Posted CTS receiver buffer (%p) for peer %s, qp index %d (QP num %d), WR ID %p, SG addr %p, len %d, lkey %d",
                 (void*) wr->sg_list[0].addr,
-                ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
+                (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
                 qpnum,
                 endpoint->qps[qpnum].qp->lcl_qp->qp_num,
                 (void*) wr->wr_id,
@@ -1097,7 +1101,8 @@ static void *local_endpoint_cpc_complete(void *context)
    mca_btl_openib_endpoint_t *endpoint = (mca_btl_openib_endpoint_t *)context;

    OPAL_OUTPUT((-1, "MAIN local_endpoint_cpc_complete to %s",
-                ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)));
+                (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
    mca_btl_openib_endpoint_cpc_complete(endpoint);

    return NULL;
@@ -1117,7 +1122,8 @@ static int rdmacm_connect_endpoint(id_context_t *context,
    if (contents->server) {
        endpoint = context->endpoint;
        OPAL_OUTPUT((-1, "SERVICE Server CPC complete to %s",
-                    ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)));
+                    (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                    "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
    } else {
        endpoint = contents->endpoint;
        endpoint->rem_info.rem_index =
@@ -1132,7 +1138,8 @@ static int rdmacm_connect_endpoint(id_context_t *context,
            contents->on_client_list = true;
        }
        OPAL_OUTPUT((-1, "SERVICE Client CPC complete to %s",
-                    ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)));
+                    (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                    "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
    }
    if (NULL == endpoint) {
        BTL_ERROR(("Can't find endpoint"));
@@ -1144,8 +1151,12 @@ static int rdmacm_connect_endpoint(id_context_t *context,
    /* Only notify the upper layers after the last QP has been
       connected */
    if (++data->rdmacm_counter < mca_btl_openib_component.num_qps) {
-        BTL_VERBOSE(("%s to peer %s, count == %d", contents->server?"server":"client", ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), data->rdmacm_counter));
-        OPAL_OUTPUT((-1, "%s to peer %s, count == %d", contents->server?"server":"client", ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), data->rdmacm_counter));
+        BTL_VERBOSE(("%s to peer %s, count == %d", contents->server?"server":"client",
+                     (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                     "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname, data->rdmacm_counter));
+        OPAL_OUTPUT((-1, "%s to peer %s, count == %d", contents->server?"server":"client",
+                     (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                     "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname, data->rdmacm_counter));
        return OMPI_SUCCESS;
    }

@@ -1376,7 +1387,8 @@ static int finish_connect(id_context_t *context)
        OPAL_OUTPUT((-1, "Posted initiator CTS buffer (%p, length %d) for peer %s, qp index %d (QP num %d)",
                     (void*) wr->sg_list[0].addr,
                     wr->sg_list[0].length,
-                     ompi_proc_get_hostname(contents->endpoint->endpoint_proc->proc_ompi),
+                     (NULL == contents->endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                     "unknown" : contents->endpoint->endpoint_proc->proc_ompi->proc_hostname,
                     context->qpnum,
                     contents->endpoint->qps[context->qpnum].qp->lcl_qp->qp_num));
    }
@@ -1443,7 +1455,8 @@ static int finish_connect(id_context_t *context)
                 (void*) contents->endpoint,
                 (void*) contents->endpoint->endpoint_local_cpc,
                 contents->endpoint->endpoint_initiator ? "am" : "am NOT",
-                ompi_proc_get_hostname(contents->endpoint->endpoint_proc->proc_ompi)));
+                (NULL == contents->endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                "unknown" : contents->endpoint->endpoint_proc->proc_ompi->proc_hostname));
    rc = rdma_connect(context->id, &conn_param);
    if (0 != rc) {
        BTL_ERROR(("rdma_connect Failed with %d", rc));
@@ -1485,7 +1498,8 @@ static void *show_help_rdmacm_event_error(void *c)
                       ompi_process_info.nodename,
                       device,
                       rdma_event_str(event->event),
-                       ompi_proc_get_hostname(context->endpoint->endpoint_proc->proc_ompi));
+                       (NULL == context->endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                       "unknown" : context->endpoint->endpoint_proc->proc_ompi->proc_hostname);
    }

    return NULL;

@@ -10,6 +10,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2008-2010 Oracle and/or its affiliates.  All rights reserved
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -377,7 +378,9 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
    int rc, *a = NULL;
    size_t i, j;

-    proc_hostname = ompi_proc_get_hostname(btl_proc->proc_ompi);
+    if (NULL == (proc_hostname = btl_proc->proc_ompi->proc_hostname)) {
+        return OMPI_ERR_UNREACH;
+    }

#ifndef WORDS_BIGENDIAN
    /* if we are little endian and our peer is not so lucky, then we
@@ -12,6 +12,7 @@
  * Copyright (c) 2006      Sandia National Laboratories. All rights
  *                         reserved.
  * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -259,7 +260,8 @@ static int mca_btl_udapl_proc_address_match(
        BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP,
            ("help-mpi-btl-udapl.txt", "no network match",
            true, btl_addr_string, ompi_process_info.nodename,
-            ompi_proc_get_hostname(peer_proc->proc_ompi)));
+            (NULL == peer_proc->proc_ompi->proc_hostname) ?
+            "unknown" : peer_proc->proc_ompi->proc_hostname));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

@@ -12,6 +12,7 @@
  * Copyright (c) 2006      Sandia National Laboratories. All rights
  *                         reserved.
  * Copyright (c) 2013      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -295,22 +296,15 @@ static int match_modex(ompi_btl_usnic_module_t *module,

        /* If MTU does not match, throw an error */
        if (proc->proc_modex[i].mtu != module->if_mtu) {
-            const char *peer_hostname;
-
-            if (NULL != ompi_proc_get_hostname(proc->proc_ompi)) {
-                peer_hostname = ompi_proc_get_hostname(proc->proc_ompi);
-            } else {
-                peer_hostname =
-                    "<unknown -- please run with mpi_keep_peer_hostnames=1>";
-            }
            opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
                           true,
                           ompi_process_info.nodename,
                           ibv_get_device_name(module->device),
                           module->port_num,
                           module->if_mtu,
-                          peer_hostname,
+                          (NULL == proc->proc_ompi->proc_hostname) ?
+                          "unknown" : proc->proc_ompi->proc_hostname,
                           proc->proc_modex[i].mtu);
            return -1;
        }

@@ -3,6 +3,7 @@
  * Copyright (c) 2007-2012 Mellanox Technologies.  All rights reserved.
  *
  * Copyright (c) 2009-2012 Oak Ridge National Laboratory.  All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -621,7 +622,8 @@ int ompi_common_ofacm_base_alloc_cts(mca_btl_base_endpoint_t *endpoint)
        mca_btl_openib_component.credits_qp;
    endpoint->endpoint_cts_frag.super.endpoint = endpoint;
    OPAL_OUTPUT((-1, "Got a CTS frag for peer %s, addr %p, length %d, lkey %d",
-                ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi),
+                (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
+                "unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
                 (void*) endpoint->endpoint_cts_frag.super.sg_entry.addr,
                 endpoint->endpoint_cts_frag.super.sg_entry.length,
                 endpoint->endpoint_cts_frag.super.sg_entry.lkey));

@@ -1,5 +1,6 @@
/*
 * Copyright (C) Mellanox Technologies Ltd. 2001-2011.  ALL RIGHTS RESERVED.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -508,7 +509,9 @@ int ompi_mtl_mxm_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs,
        MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
        for (i = 0; i < nprocs; ++i) {
            if (MXM_OK != conn_reqs[i].error) {
-                MXM_ERROR("MXM EP connect to %s error: %s\n", ompi_proc_get_hostname(procs[i]),
+                MXM_ERROR("MXM EP connect to %s error: %s\n",
+                          (NULL == procs[i]->proc_hostname) ?
+                          "unknown" : procs[i]->proc_hostname,
                          mxm_error_string(conn_reqs[i].error));
            }
        }

@@ -10,6 +10,7 @@
 * Copyright (c) 2004-2006 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006      QLogic Corporation. All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -313,7 +314,8 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
                errstr ? errstr : "unknown connect error");
            for (j = 0; j < (int) nprocs; j++) {
                if (errs_out[j] == thiserr) {
-                    opal_output(0, " %s", ompi_proc_get_hostname(procs[j]));
+                    opal_output(0, " %s", (NULL == procs[j]->proc_hostname) ?
+                                "unknown" : procs[j]->proc_hostname);
                }
            }
            opal_output(0, "\n");

@@ -12,6 +12,7 @@
 *                         All rights reserved.
 * Copyright (c) 2012      Los Alamos National Security, LLC.  All rights
 *                         reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -368,18 +369,11 @@ mca_pml_base_pml_check_selected(const char *my_pml,
        /* if that module doesn't match my own, return an error */
        if ((size != strlen(my_pml) + 1) ||
            (0 != strcmp(my_pml, remote_pml))) {
-            if (ompi_proc_get_hostname(procs[0])) {
-                opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
-                            OMPI_NAME_PRINT(&ompi_proc_local()->proc_name),
-                            my_pml, OMPI_NAME_PRINT(&procs[0]->proc_name),
-                            ompi_proc_get_hostname(procs[0]),
-                            remote_pml);
-            } else {
-                opal_output(0, "%s selected pml %s, but peer %s selected pml %s",
-                            OMPI_NAME_PRINT(&ompi_proc_local()->proc_name),
-                            my_pml, OMPI_NAME_PRINT(&procs[0]->proc_name),
-                            remote_pml);
-            }
+            opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
+                        OMPI_NAME_PRINT(&ompi_proc_local()->proc_name),
+                        my_pml, OMPI_NAME_PRINT(&procs[0]->proc_name),
+                        (NULL == procs[0]->proc_hostname) ? "unknown" : procs[0]->proc_hostname,
+                        remote_pml);
            free(remote_pml); /* cleanup before returning */
            return OMPI_ERR_UNREACH;
        }

@@ -2,6 +2,7 @@
 * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
 * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
 *                         All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -408,7 +409,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
                    recvreq->remote_req_send.pval, (void *)recvreq,
                    recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
                    hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
-                    ompi_proc_get_hostname(ompi_proc));
+                    (NULL == ompi_proc->proc_hostname) ? "unknown" : ompi_proc->proc_hostname);
        mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false);
        return;
    }
@@ -1415,7 +1416,7 @@ void mca_pml_bfo_map_out_btl(struct mca_btl_base_module_t* btl,
                        btl->btl_component->btl_version.mca_component_name,
                        OMPI_PROC_MY_NAME->vpid,
                        btlname, errproc->proc_name.vpid,
-                        ompi_proc_get_hostname(errproc));
+                        (NULL == errproc->proc_hostname) ? "unknown" : errproc->proc_hostname);

        /* Need to search for any pending packets associated
         * with this endpoint and remove them. We may also
@@ -1,6 +1,7 @@
/*
 * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
 *                         All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 *
 * $COPYRIGHT$
 *
@@ -65,6 +66,7 @@ typedef orte_node_rank_t ompi_node_rank_t;
typedef orte_local_rank_t ompi_local_rank_t;
#define ompi_process_info orte_process_info
#define ompi_rte_proc_is_bound orte_proc_is_bound
+#define ompi_rte_hostname_cutoff orte_hostname_cutoff

/* Error handling objects and operations */
OMPI_DECLSPEC void ompi_rte_abort(int error_code, char *fmt, ...);

@@ -1,6 +1,7 @@
/*
 * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
 *                         All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 */
#include "ompi_config.h"
#include "ompi/constants.h"
@@ -35,6 +36,7 @@
#include "ompi/mca/rte/base/base.h"
#include "ompi/mca/rte/rte.h"
#include "ompi/debuggers/debuggers.h"
+#include "ompi/proc/proc.h"

void ompi_rte_abort(int error_code, char *fmt, ...)
{
@@ -149,21 +151,54 @@ int ompi_rte_db_fetch(const orte_process_name_t *nm,
                      const char *key,
                      void **data, opal_data_type_t type)
{
-    return opal_db.fetch((opal_identifier_t*)nm, key, data, type);
+    ompi_proc_t *proct;
+    int rc;
+
+    if (OPAL_SUCCESS != (rc = opal_db.fetch((opal_identifier_t*)nm, key, data, type))) {
+        return rc;
+    }
+    /* update the hostname */
+    proct = ompi_proc_find(nm);
+    if (NULL == proct->proc_hostname) {
+        opal_db.fetch_pointer((opal_identifier_t*)nm, ORTE_DB_HOSTNAME, (void**)&proct->proc_hostname, OPAL_STRING);
+    }
+    return OMPI_SUCCESS;
}

int ompi_rte_db_fetch_pointer(const orte_process_name_t *nm,
                              const char *key,
                              void **data, opal_data_type_t type)
{
-    return opal_db.fetch_pointer((opal_identifier_t*)nm, key, data, type);
+    ompi_proc_t *proct;
+    int rc;
+
+    if (OPAL_SUCCESS != (rc = opal_db.fetch_pointer((opal_identifier_t*)nm, key, data, type))) {
+        return rc;
+    }
+    /* update the hostname */
+    proct = ompi_proc_find(nm);
+    if (NULL == proct->proc_hostname) {
+        opal_db.fetch_pointer((opal_identifier_t*)nm, ORTE_DB_HOSTNAME, (void**)&proct->proc_hostname, OPAL_STRING);
+    }
+    return OMPI_SUCCESS;
}

int ompi_rte_db_fetch_multiple(const orte_process_name_t *nm,
                               const char *key,
                               opal_list_t *kvs)
{
-    return opal_db.fetch_multiple((opal_identifier_t*)nm, key, kvs);
+    ompi_proc_t *proct;
+    int rc;
+
+    if (OPAL_SUCCESS != (rc = opal_db.fetch_multiple((opal_identifier_t*)nm, key, kvs))) {
+        return rc;
+    }
+    /* update the hostname */
+    proct = ompi_proc_find(nm);
+    if (NULL == proct->proc_hostname) {
+        opal_db.fetch_pointer((opal_identifier_t*)nm, ORTE_DB_HOSTNAME, (void**)&proct->proc_hostname, OPAL_STRING);
+    }
+    return OMPI_SUCCESS;
}

int ompi_rte_db_remove(const orte_process_name_t *nm,

@@ -12,6 +12,7 @@
 * Copyright (c) 2006-2007 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2012      Los Alamos National Security, LLC.  All rights
 *                         reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -156,8 +157,20 @@ int ompi_proc_complete_init(void)
                break;
            }

-            /* get the remote architecture */
+            if (ompi_process_info.num_daemons < ompi_rte_hostname_cutoff) {
+                /* retrieve the hostname */
+                ret = ompi_modex_recv_string_pointer(OMPI_DB_HOSTNAME, proc, (void**)&(proc->proc_hostname), OPAL_STRING);
+                if (OMPI_SUCCESS != ret) {
+                    break;
+                }
+            } else {
+                /* just set the hostname to NULL for now - we'll fill it in
+                 * as modex_recv's are called for procs we will talk to
+                 */
+                proc->proc_hostname = NULL;
+            }
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+            /* get the remote architecture */
            {
                uint32_t *ui32ptr;
                ui32ptr = &(proc->proc_arch);
@@ -185,21 +198,6 @@ int ompi_proc_complete_init(void)
    return errcode;
}

-const char *ompi_proc_get_hostname (ompi_proc_t *proc)
-{
-    int ret;
-
-    if (NULL == proc->proc_hostname) {
-        /* get a pointer to the name of the node it is on */
-        ret = ompi_modex_recv_string_pointer(OMPI_DB_HOSTNAME, proc, (void**)&(proc->proc_hostname), OPAL_STRING);
-        if (OMPI_SUCCESS != ret) {
-            return NULL;
-        }
-    }
-
-    return proc->proc_hostname;
-}
-
int ompi_proc_finalize (void)
{
    opal_list_item_t *item;
@@ -371,7 +369,6 @@ int ompi_proc_refresh(void) {
    ompi_vpid_t i = 0;
    int ret=OMPI_SUCCESS;
    opal_hwloc_locality_t *hwlocale;
    uint32_t *uiptr;

    OPAL_THREAD_LOCK(&ompi_proc_lock);

@@ -397,25 +394,31 @@ int ompi_proc_refresh(void) {
            if (OMPI_SUCCESS != ret) {
                break;
            }
-            proc->proc_hostname = NULL;
+            if (ompi_process_info.num_daemons < ompi_rte_hostname_cutoff) {
+                /* retrieve the hostname */
+                ret = ompi_modex_recv_string_pointer(OMPI_DB_HOSTNAME, proc, (void**)&(proc->proc_hostname), OPAL_STRING);
+                if (OMPI_SUCCESS != ret) {
+                    break;
+                }
+            } else {
+                /* just set the hostname to NULL for now - we'll fill it in
+                 * as modex_recv's are called for procs we will talk to
+                 */
+                proc->proc_hostname = NULL;
+            }
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
            /* get the remote architecture */
            uiptr = &(proc->proc_arch);
            ret = ompi_modex_recv_key_value("OMPI_ARCH", proc, (void**)&uiptr, OPAL_UINT32);
            /* if arch is different than mine, create a new convertor for this proc */
            if (proc->proc_arch != opal_local_arch) {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
                OBJ_RELEASE(proc->proc_convertor);
                proc->proc_convertor = opal_convertor_create(proc->proc_arch, 0);
#else
                opal_show_help("help-mpi-runtime",
                               "heterogeneous-support-unavailable",
                               true, ompi_process_info.nodename,
                               proc->proc_hostname == NULL ? "<hostname unavailable>" :
                               proc->proc_hostname);
                OPAL_THREAD_UNLOCK(&ompi_proc_lock);
                return OMPI_ERR_NOT_SUPPORTED;
#endif
            }
#else
            /* must be same arch as my own */
            proc->proc_arch = opal_local_arch;
#endif
        }
    }

@@ -456,7 +459,6 @@ ompi_proc_pack(ompi_proc_t **proclist, int proclistsize, opal_buffer_t* buf)
            OPAL_THREAD_UNLOCK(&ompi_proc_lock);
            return rc;
        }
-        (void) ompi_proc_get_hostname (proclist[i]);
        rc = opal_dss.pack(buf, &(proclist[i]->proc_hostname), 1, OPAL_STRING);
        if(rc != OPAL_SUCCESS) {
            OMPI_ERROR_LOG(rc);

@@ -12,6 +12,7 @@
 * Copyright (c) 2006-2012 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2007-2012 Los Alamos National Security, LLC.  All rights
 *                         reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -304,15 +305,6 @@ OMPI_DECLSPEC int ompi_proc_unpack(opal_buffer_t *buf,
 */
OMPI_DECLSPEC int ompi_proc_refresh(void);

-/**
- * Retrieve the hostname for a process
- *
- * @note Retrieving the hostname may require communication.
- *
- * @param proc process to retrieve hostname from
- */
-OMPI_DECLSPEC const char *ompi_proc_get_hostname (ompi_proc_t *proc);
-
END_C_DECLS

#endif /* OMPI_PROC_PROC_H */

@@ -14,6 +14,7 @@
 * Copyright (c) 2007-2013 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2013      NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -57,7 +58,6 @@ bool ompi_mpi_show_mca_params = false;
char *ompi_mpi_show_mca_params_file = NULL;
bool ompi_mpi_abort_print_stack = false;
int ompi_mpi_abort_delay = 0;
-bool ompi_mpi_keep_peer_hostnames = true;
bool ompi_mpi_keep_fqdn_hostnames = false;
int ompi_mpi_leave_pinned = -1;
bool ompi_mpi_leave_pinned_pipeline = false;
@@ -211,16 +211,6 @@ int ompi_mpi_register_params(void)

    /* User-level process pinning controls */

-    /* Do we want to save hostnames for debugging messages? This can
-       eat quite a bit of memory... */
-    ompi_mpi_keep_peer_hostnames = true;
-    (void) mca_base_var_register("ompi", "mpi", NULL, "keep_peer_hostnames",
-                                 "If nonzero, save the string hostnames of all MPI peer processes (mostly for error / debugging output messages). This can add quite a bit of memory usage to each MPI process.",
-                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
-                                 OPAL_INFO_LVL_9,
-                                 MCA_BASE_VAR_SCOPE_READONLY,
-                                 &ompi_mpi_keep_peer_hostnames);
-
    /* MPI_ABORT controls */
    ompi_mpi_abort_delay = 0;
    (void) mca_base_var_register("ompi", "mpi", NULL, "abort_delay",

@@ -13,6 +13,7 @@
 *                         reserved.
 * Copyright (c) 2006-2009 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2013      NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -96,12 +97,6 @@ OMPI_DECLSPEC extern bool ompi_mpi_show_mca_params;
 */
OMPI_DECLSPEC extern char * ompi_mpi_show_mca_params_file;

-/**
- * Whether we should keep the string hostnames of all the MPI
- * process peers around or not (eats up a good bit of memory).
- */
-OMPI_DECLSPEC extern bool ompi_mpi_keep_peer_hostnames;
-
/**
 * Whether an MPI_ABORT should print out a stack trace or not.
 */

@@ -97,6 +97,7 @@ static int rte_init(void)
    orte_node_rank_t node_rank;
    char *rmluri;
    opal_hwloc_locality_t locality;
+    char *tmp;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
@@ -346,6 +347,21 @@ static int rte_init(void)
        orte_process_info.max_procs = orte_process_info.num_procs;
    }

+    /* set the number of nodes - have to test as it could be
+     * one of multiple environments
+     */
+    if (NULL != (tmp = getenv("SLURM_NNODES"))) {
+        orte_process_info.num_daemons = strtol(tmp, NULL, 10);
+    } else if (NULL != (tmp = getenv("PBS_NUM_NODES"))) {
+        orte_process_info.num_daemons = strtol(tmp, NULL, 10);
+    } else {
+        if (0 == ORTE_PROC_MY_NAME->vpid) {
+            orte_show_help("help-orte-runtime.txt",
+                           "orte_init:startup:num_daemons", true);
+        }
+        orte_process_info.num_daemons = UINT_MAX;
+    }
+
    /* construct the PMI RTE string */
    rmluri = orte_rml.get_contact_info();

@@ -56,3 +56,8 @@ again.
An error occurred while trying to pack the information about the job. More nodes
have been found than the %d expected. Please check your configuration files such
as the mapping.
+#
+[orte_init:startup:num_daemons]
+Open MPI was unable to determine the number of nodes in your allocation. We
+are therefore assuming a very large number to ensure you receive proper error
+messages.
@@ -13,6 +13,7 @@
 * Copyright (c) 2009-2010 Oracle and/or its affiliates.  All rights reserved.
 * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
 *                         All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -78,6 +79,7 @@ bool orte_have_fqdn_allocation = false;
bool orte_show_resolved_nodenames;
bool orte_retain_aliases;
int orte_use_hostname_alias;
+orte_vpid_t orte_hostname_cutoff;

int orted_debug_failure;
int orted_debug_failure_delay;

@@ -13,6 +13,7 @@
 * Copyright (c) 2007-2012 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
 *                         All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -606,6 +607,7 @@ ORTE_DECLSPEC extern bool orte_have_fqdn_allocation;
ORTE_DECLSPEC extern bool orte_show_resolved_nodenames;
ORTE_DECLSPEC extern bool orte_retain_aliases;
ORTE_DECLSPEC extern int orte_use_hostname_alias;
+ORTE_DECLSPEC extern orte_vpid_t orte_hostname_cutoff;

/* debug flags */
ORTE_DECLSPEC extern int orted_debug_failure;

@@ -13,6 +13,7 @@
 * Copyright (c) 2009-2010 Oracle and/or its affiliates.  All rights reserved.
 * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
 *                         All rights reserved
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -435,6 +436,15 @@ int orte_register_params(void)
                                  OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
                                  &orte_use_hostname_alias);

+    /* cutoff for including hostnames in modex */
+    orte_hostname_cutoff = UINT_MAX;
+    (void) mca_base_var_register ("orte", "orte", NULL, "hostname_cutoff",
+                                  "If the number of nodes in the allocation exceeds the provided value, "
+                                  "hostnames for remote processes will not be supplied to applications [default: UINT_MAX]",
+                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                  OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
+                                  &orte_hostname_cutoff);
+
    orte_xml_output = false;
    (void) mca_base_var_register ("orte", "orte", NULL, "xml_output",
                                  "Display all output in XML format (default: false)",

@@ -11,6 +11,7 @@
 *                         All rights reserved.
 * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
 *                         All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved
 *
 * $COPYRIGHT$
 *
@@ -272,6 +273,12 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
    /* setup a buffer for tmp use */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

+    /* send the number of nodes */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &daemons->num_procs, 1, ORTE_VPID))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
    /* only send info on nodes that have daemons on them, and
     * only regarding daemons that have changed - i.e., new
     * daemons since the last time we sent the info - so we
@@ -299,40 +306,42 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
            ORTE_ERROR_LOG(rc);
            return rc;
        }
-        /* pack the name of the node */
-        if (!orte_keep_fqdn_hostnames) {
-            nodename = strdup(node->name);
-            /* if the nodename is an IP address, do not mess with it! */
-            if (!opal_net_isaddr(nodename)) {
-                /* not an IP address */
-                if (NULL != (ptr = strchr(nodename, '.'))) {
-                    *ptr = '\0';
-                }
-            }
-            if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodename, 1, OPAL_STRING))) {
-                ORTE_ERROR_LOG(rc);
-                return rc;
-            }
-            free(nodename);
-        } else {
-            if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->name, 1, OPAL_STRING))) {
-                ORTE_ERROR_LOG(rc);
-                return rc;
-            }
-        }
-        /* if requested, pack any aliases */
-        if (orte_retain_aliases) {
-            uint8_t naliases, ni;
-            naliases = opal_argv_count(node->alias);
-            if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &naliases, 1, OPAL_UINT8))) {
-                ORTE_ERROR_LOG(rc);
-                return rc;
-            }
-            for (ni=0; ni < naliases; ni++) {
-                if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->alias[ni], 1, OPAL_STRING))) {
-                    ORTE_ERROR_LOG(rc);
-                    return rc;
-                }
-            }
-        }
+        if (daemons->num_procs < orte_hostname_cutoff) {
+            /* pack the name of the node */
+            if (!orte_keep_fqdn_hostnames) {
+                nodename = strdup(node->name);
+                /* if the nodename is an IP address, do not mess with it! */
+                if (!opal_net_isaddr(nodename)) {
+                    /* not an IP address */
+                    if (NULL != (ptr = strchr(nodename, '.'))) {
+                        *ptr = '\0';
+                    }
+                }
+                if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodename, 1, OPAL_STRING))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+                free(nodename);
+            } else {
+                if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->name, 1, OPAL_STRING))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+            }
+            /* if requested, pack any aliases */
+            if (orte_retain_aliases) {
+                uint8_t naliases, ni;
+                naliases = opal_argv_count(node->alias);
+                if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &naliases, 1, OPAL_UINT8))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+                for (ni=0; ni < naliases; ni++) {
+                    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->alias[ni], 1, OPAL_STRING))) {
+                        ORTE_ERROR_LOG(rc);
+                        return rc;
+                    }
+                }
+            }
+        }

@@ -358,7 +367,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
int orte_util_decode_nodemap(opal_byte_object_t *bo)
{
    int n;
-    int32_t num_daemons;
+    orte_vpid_t num_daemons;
    orte_process_name_t daemon;
    opal_buffer_t buf;
    int rc=ORTE_SUCCESS;
@@ -378,75 +387,82 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    opal_dss.load(&buf, bo->bytes, bo->size);

+    /* unpack the number of daemons */
+    n=1;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_daemons, &n, ORTE_VPID))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
    /* set the daemon jobid */
    daemon.jobid = ORTE_DAEMON_JOBID(ORTE_PROC_MY_NAME->jobid);

-    num_daemons = 0;
    n=1;
    while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &daemon.vpid, &n, ORTE_VPID))) {
-        ++num_daemons;
        /* unpack and store the node's name */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &nodename, &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&daemon, OPAL_DB_INTERNAL, ORTE_DB_HOSTNAME, nodename, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* now store a direct reference so we can quickly lookup the daemon from a hostname */
        opal_output_verbose(2, orte_nidmap_output,
                            "%s storing nodename %s for daemon %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            nodename, ORTE_VPID_PRINT(daemon.vpid));
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)ORTE_NAME_WILDCARD, OPAL_DB_INTERNAL, nodename, &daemon.vpid, OPAL_UINT32))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                             "%s orte:util:decode:nidmap daemon %s node %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_VPID_PRINT(daemon.vpid), nodename));

        /* if this is my daemon, then store the data for me too */
        if (daemon.vpid == ORTE_PROC_MY_DAEMON->vpid) {
            if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)ORTE_PROC_MY_NAME, OPAL_DB_INTERNAL, ORTE_DB_HOSTNAME, nodename, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)ORTE_PROC_MY_NAME, OPAL_DB_INTERNAL, ORTE_DB_DAEMON_VPID, &daemon.vpid, OPAL_UINT32))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }

-        /* if requested, unpack any aliases */
-        if (orte_retain_aliases) {
-            char *alias;
-            uint8_t naliases, ni;
+        if (num_daemons < orte_hostname_cutoff) {
+            /* unpack and store the node's name */
+            n=1;
+            if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &naliases, &n, OPAL_UINT8))) {