1
1

As per the email discussion, revise the sparse handling of hostnames so that we avoid potential infinite loops while allowing large-scale users to improve their startup time:

* add a new MCA param orte_hostname_cutoff to specify the number of nodes at which we stop including hostnames. This defaults to INT_MAX => always include hostnames. If a value is given, then we will include hostnames for any allocation smaller than the given limit.

* remove ompi_proc_get_hostname. Replace all occurrences with direct access to ompi_proc_t's proc_hostname field, protected by an appropriate NULL check

* modify the OMPI-ORTE integration component so that any call to modex_recv automatically loads the ompi_proc_t->proc_hostname field as well as returning the requested info. Thus, any process whose modex info you retrieve will automatically have its proc_hostname field populated. Note that on-demand retrieval is still enabled - i.e., if we are running under direct launch with PMI, the hostname will be fetched upon the first call to modex_recv, and then the ompi_proc_t->proc_hostname field will be loaded

* removed a stale MCA param "mpi_keep_peer_hostnames" that was no longer used anywhere in the code base

* added an envar lookup in ess/pmi for the number of nodes in the allocation. Sadly, PMI itself doesn't provide that info, so we have to get it a different way. Currently, we support PBS-based systems and SLURM; on any other system, rank 0 will emit a warning and we assume the max number of daemons, so we will always retain hostnames

This commit was SVN r29052.
Этот коммит содержится в:
Ralph Castain 2013-08-20 18:59:36 +00:00
родитель f49f879b2d
Коммит 45e695928f
27 изменённых файлов: 365 добавлений и 254 удалений

Просмотреть файл

@ -13,6 +13,7 @@
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -408,11 +409,11 @@ static int mca_bml_r2_add_procs( size_t nprocs,
"unreachable proc", "unreachable proc",
true, true,
OMPI_NAME_PRINT(&(ompi_proc_local_proc->proc_name)), OMPI_NAME_PRINT(&(ompi_proc_local_proc->proc_name)),
(ompi_proc_get_hostname(ompi_proc_local_proc) ? (NULL != ompi_proc_local_proc->proc_hostname ?
ompi_proc_get_hostname(ompi_proc_local_proc) : "unknown!"), ompi_proc_local_proc->proc_hostname : "unknown!"),
OMPI_NAME_PRINT(&(unreach_proc->proc_name)), OMPI_NAME_PRINT(&(unreach_proc->proc_name)),
(ompi_proc_get_hostname(unreach_proc) ? (NULL != ompi_proc_local_proc->proc_hostname ?
ompi_proc_get_hostname(unreach_proc) : "unknown!"), ompi_proc_local_proc->proc_hostname : "unknown!"),
btl_names); btl_names);
} }

Просмотреть файл

@ -13,6 +13,7 @@
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. * Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -62,8 +63,9 @@ do { \
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \ OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
__FILE__, __LINE__, __func__, \ __FILE__, __LINE__, __func__, \
ompi_process_info.nodename); \ ompi_process_info.nodename); \
if(proc && ompi_proc_get_hostname(proc)) { \ if(proc) { \
mca_btl_base_err("to: %s ", ompi_proc_get_hostname(proc)); \ mca_btl_base_err("to: %s ", (NULL == proc->proc_hostname) ? \
"unknown" : proc->proc_hostname); \
} \ } \
mca_btl_base_err args; \ mca_btl_base_err args; \
mca_btl_base_err("\n"); \ mca_btl_base_err("\n"); \

Просмотреть файл

@ -17,6 +17,7 @@
* Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -489,7 +490,8 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
(openib_btl->device->ib_dev_attr).vendor_id, (openib_btl->device->ib_dev_attr).vendor_id,
(openib_btl->device->ib_dev_attr).vendor_part_id, (openib_btl->device->ib_dev_attr).vendor_part_id,
mca_btl_openib_transport_name_strings[mca_btl_openib_get_transport_type(openib_btl)], mca_btl_openib_transport_name_strings[mca_btl_openib_get_transport_type(openib_btl)],
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname,
endpoint->rem_info.rem_vendor_id, endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id, endpoint->rem_info.rem_vendor_part_id,
mca_btl_openib_transport_name_strings[endpoint->rem_info.rem_transport_type]); mca_btl_openib_transport_name_strings[endpoint->rem_info.rem_transport_type]);
@ -551,7 +553,8 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
(openib_btl->device->ib_dev_attr).vendor_id, (openib_btl->device->ib_dev_attr).vendor_id,
(openib_btl->device->ib_dev_attr).vendor_part_id, (openib_btl->device->ib_dev_attr).vendor_part_id,
mca_btl_openib_component.receive_queues, mca_btl_openib_component.receive_queues,
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown", endpoint->endpoint_proc->proc_ompi->proc_hostname,
endpoint->rem_info.rem_vendor_id, endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id, endpoint->rem_info.rem_vendor_part_id,
recv_qps); recv_qps);
@ -573,7 +576,8 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
(openib_btl->device->ib_dev_attr).vendor_id, (openib_btl->device->ib_dev_attr).vendor_id,
(openib_btl->device->ib_dev_attr).vendor_part_id, (openib_btl->device->ib_dev_attr).vendor_part_id,
mca_btl_openib_component.receive_queues, mca_btl_openib_component.receive_queues,
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown", endpoint->endpoint_proc->proc_ompi->proc_hostname,
endpoint->rem_info.rem_vendor_id, endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id, endpoint->rem_info.rem_vendor_part_id,
values.receive_queues); values.receive_queues);

Просмотреть файл

@ -18,6 +18,7 @@
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved. * Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved * Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -535,7 +536,8 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
break; break;
case MCA_BTL_OPENIB_CONTROL_CTS: case MCA_BTL_OPENIB_CONTROL_CTS:
OPAL_OUTPUT((-1, "received CTS from %s (buffer %p): posted recvs %d, sent cts %d", OPAL_OUTPUT((-1, "received CTS from %s (buffer %p): posted recvs %d, sent cts %d",
ompi_proc_get_hostname(ep->endpoint_proc->proc_ompi), (NULL == ep->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : ep->endpoint_proc->proc_ompi->proc_hostname,
(void*) ctl_hdr, (void*) ctl_hdr,
ep->endpoint_posted_recvs, ep->endpoint_cts_sent)); ep->endpoint_posted_recvs, ep->endpoint_cts_sent));
ep->endpoint_cts_received = true; ep->endpoint_cts_received = true;
@ -3530,9 +3532,9 @@ error:
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status || if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
IBV_WC_RETRY_EXC_ERR == wc->status) { IBV_WC_RETRY_EXC_ERR == wc->status) {
char *peer_hostname = const char *peer_hostname =
(NULL != ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi)) ? (NULL != endpoint->endpoint_proc->proc_ompi) ?
(char*)ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi) : endpoint->endpoint_proc->proc_ompi) :
"<unknown -- please run with mpi_keep_peer_hostnames=1>"; "<unknown -- please run with mpi_keep_peer_hostnames=1>";
const char *device_name = const char *device_name =
ibv_get_device_name(endpoint->qps[qp].qp->lcl_qp->context->device); ibv_get_device_name(endpoint->qps[qp].qp->lcl_qp->context->device);
@ -3543,12 +3545,15 @@ error:
"pp rnr retry exceeded" : "pp rnr retry exceeded" :
"srq rnr retry exceeded", true, "srq rnr retry exceeded", true,
ompi_process_info.nodename, device_name, ompi_process_info.nodename, device_name,
peer_hostname); (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname);
} else if (IBV_WC_RETRY_EXC_ERR == wc->status) { } else if (IBV_WC_RETRY_EXC_ERR == wc->status) {
opal_show_help("help-mpi-btl-openib.txt", opal_show_help("help-mpi-btl-openib.txt",
"pp retry exceeded", true, "pp retry exceeded", true,
ompi_process_info.nodename, ompi_process_info.nodename,
device_name, peer_hostname); device_name,
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname);
} }
} }

Просмотреть файл

@ -17,6 +17,7 @@
* Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2010-2011 IBM Corporation. All rights reserved. * Copyright (c) 2010-2011 IBM Corporation. All rights reserved.
* Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved * Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved
* Copyright (c) 2013 Intel, Inc. All rights reserved
* *
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -507,7 +508,8 @@ static void cts_sent(mca_btl_base_module_t* btl,
/* Nothing to do/empty function (we can't pass in a NULL pointer /* Nothing to do/empty function (we can't pass in a NULL pointer
for the des_cbfunc) */ for the des_cbfunc) */
OPAL_OUTPUT((-1, "CTS send to %s completed", OPAL_OUTPUT((-1, "CTS send to %s completed",
ompi_proc_get_hostname(ep->endpoint_proc->proc_ompi))); (NULL == ep->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : ep->endpoint_proc->proc_ompi->proc_hostname));
} }
/* /*
@ -522,7 +524,8 @@ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
mca_btl_openib_control_header_t *ctl_hdr; mca_btl_openib_control_header_t *ctl_hdr;
OPAL_OUTPUT((-1, "SENDING CTS to %s on qp index %d (QP num %d)", OPAL_OUTPUT((-1, "SENDING CTS to %s on qp index %d (QP num %d)",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
mca_btl_openib_component.credits_qp, mca_btl_openib_component.credits_qp,
endpoint->qps[mca_btl_openib_component.credits_qp].qp->lcl_qp->qp_num)); endpoint->qps[mca_btl_openib_component.credits_qp].qp->lcl_qp->qp_num));
sc_frag = alloc_control_frag(endpoint->endpoint_btl); sc_frag = alloc_control_frag(endpoint->endpoint_btl);
@ -592,7 +595,8 @@ void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint)
transport_type_ib_p = (IBV_TRANSPORT_IB == endpoint->endpoint_btl->device->ib_dev->transport_type); transport_type_ib_p = (IBV_TRANSPORT_IB == endpoint->endpoint_btl->device->ib_dev->transport_type);
#endif #endif
OPAL_OUTPUT((-1, "cpc_complete to peer %s: is IB %d, initiatior %d, cts received: %d", OPAL_OUTPUT((-1, "cpc_complete to peer %s: is IB %d, initiatior %d, cts received: %d",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
transport_type_ib_p, transport_type_ib_p,
endpoint->endpoint_initiator, endpoint->endpoint_initiator,
endpoint->endpoint_cts_received)); endpoint->endpoint_cts_received));
@ -605,7 +609,8 @@ void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint)
mark us as connected */ mark us as connected */
if (endpoint->endpoint_cts_received) { if (endpoint->endpoint_cts_received) {
OPAL_OUTPUT((-1, "cpc_complete to %s -- already got CTS, so marking endpoint as complete", OPAL_OUTPUT((-1, "cpc_complete to %s -- already got CTS, so marking endpoint as complete",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi))); (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
mca_btl_openib_endpoint_connected(endpoint); mca_btl_openib_endpoint_connected(endpoint);
} }
} }

Просмотреть файл

@ -3,6 +3,7 @@
* Copyright (c) 2007 Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2007 Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* *
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -457,7 +458,8 @@ int ompi_btl_openib_connect_base_alloc_cts(mca_btl_base_endpoint_t *endpoint)
mca_btl_openib_component.credits_qp; mca_btl_openib_component.credits_qp;
endpoint->endpoint_cts_frag.super.endpoint = endpoint; endpoint->endpoint_cts_frag.super.endpoint = endpoint;
OPAL_OUTPUT((-1, "Got a CTS frag for peer %s, addr %p, length %d, lkey %d", OPAL_OUTPUT((-1, "Got a CTS frag for peer %s, addr %p, length %d, lkey %d",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
(void*) endpoint->endpoint_cts_frag.super.sg_entry.addr, (void*) endpoint->endpoint_cts_frag.super.sg_entry.addr,
endpoint->endpoint_cts_frag.super.sg_entry.length, endpoint->endpoint_cts_frag.super.sg_entry.length,
endpoint->endpoint_cts_frag.super.sg_entry.lkey)); endpoint->endpoint_cts_frag.super.sg_entry.lkey));

Просмотреть файл

@ -6,6 +6,7 @@
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights * Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* *
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -716,7 +717,8 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp
(void*) endpoint, (void*) endpoint,
(void*) endpoint->endpoint_local_cpc, (void*) endpoint->endpoint_local_cpc,
endpoint->endpoint_initiator ? "am" : "am NOT", endpoint->endpoint_initiator ? "am" : "am NOT",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi))); (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
/* If we're the initiator, then open all the QPs */ /* If we're the initiator, then open all the QPs */
if (contents->endpoint->endpoint_initiator) { if (contents->endpoint->endpoint_initiator) {
@ -845,7 +847,8 @@ static int handle_connect_request(struct rdma_cm_event *event)
(void*) endpoint, (void*) endpoint,
(void*) endpoint->endpoint_local_cpc, (void*) endpoint->endpoint_local_cpc,
endpoint->endpoint_initiator ? "am" : "am NOT", endpoint->endpoint_initiator ? "am" : "am NOT",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi))); (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
if (endpoint->endpoint_initiator) { if (endpoint->endpoint_initiator) {
reject_reason_t reason = REJECT_WRONG_DIRECTION; reject_reason_t reason = REJECT_WRONG_DIRECTION;
@ -906,7 +909,8 @@ static int handle_connect_request(struct rdma_cm_event *event)
} }
OPAL_OUTPUT((-1, "Posted CTS receiver buffer (%p) for peer %s, qp index %d (QP num %d), WR ID %p, SG addr %p, len %d, lkey %d", OPAL_OUTPUT((-1, "Posted CTS receiver buffer (%p) for peer %s, qp index %d (QP num %d), WR ID %p, SG addr %p, len %d, lkey %d",
(void*) wr->sg_list[0].addr, (void*) wr->sg_list[0].addr,
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
qpnum, qpnum,
endpoint->qps[qpnum].qp->lcl_qp->qp_num, endpoint->qps[qpnum].qp->lcl_qp->qp_num,
(void*) wr->wr_id, (void*) wr->wr_id,
@ -1097,7 +1101,8 @@ static void *local_endpoint_cpc_complete(void *context)
mca_btl_openib_endpoint_t *endpoint = (mca_btl_openib_endpoint_t *)context; mca_btl_openib_endpoint_t *endpoint = (mca_btl_openib_endpoint_t *)context;
OPAL_OUTPUT((-1, "MAIN local_endpoint_cpc_complete to %s", OPAL_OUTPUT((-1, "MAIN local_endpoint_cpc_complete to %s",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi))); (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
mca_btl_openib_endpoint_cpc_complete(endpoint); mca_btl_openib_endpoint_cpc_complete(endpoint);
return NULL; return NULL;
@ -1117,7 +1122,8 @@ static int rdmacm_connect_endpoint(id_context_t *context,
if (contents->server) { if (contents->server) {
endpoint = context->endpoint; endpoint = context->endpoint;
OPAL_OUTPUT((-1, "SERVICE Server CPC complete to %s", OPAL_OUTPUT((-1, "SERVICE Server CPC complete to %s",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi))); (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
} else { } else {
endpoint = contents->endpoint; endpoint = contents->endpoint;
endpoint->rem_info.rem_index = endpoint->rem_info.rem_index =
@ -1132,7 +1138,8 @@ static int rdmacm_connect_endpoint(id_context_t *context,
contents->on_client_list = true; contents->on_client_list = true;
} }
OPAL_OUTPUT((-1, "SERVICE Client CPC complete to %s", OPAL_OUTPUT((-1, "SERVICE Client CPC complete to %s",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi))); (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname));
} }
if (NULL == endpoint) { if (NULL == endpoint) {
BTL_ERROR(("Can't find endpoint")); BTL_ERROR(("Can't find endpoint"));
@ -1144,8 +1151,12 @@ static int rdmacm_connect_endpoint(id_context_t *context,
/* Only notify the upper layers after the last QP has been /* Only notify the upper layers after the last QP has been
connected */ connected */
if (++data->rdmacm_counter < mca_btl_openib_component.num_qps) { if (++data->rdmacm_counter < mca_btl_openib_component.num_qps) {
BTL_VERBOSE(("%s to peer %s, count == %d", contents->server?"server":"client", ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), data->rdmacm_counter)); BTL_VERBOSE(("%s to peer %s, count == %d", contents->server?"server":"client",
OPAL_OUTPUT((-1, "%s to peer %s, count == %d", contents->server?"server":"client", ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), data->rdmacm_counter)); (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname, data->rdmacm_counter));
OPAL_OUTPUT((-1, "%s to peer %s, count == %d", contents->server?"server":"client",
(NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname, data->rdmacm_counter));
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -1376,7 +1387,8 @@ static int finish_connect(id_context_t *context)
OPAL_OUTPUT((-1, "Posted initiator CTS buffer (%p, length %d) for peer %s, qp index %d (QP num %d)", OPAL_OUTPUT((-1, "Posted initiator CTS buffer (%p, length %d) for peer %s, qp index %d (QP num %d)",
(void*) wr->sg_list[0].addr, (void*) wr->sg_list[0].addr,
wr->sg_list[0].length, wr->sg_list[0].length,
ompi_proc_get_hostname(contents->endpoint->endpoint_proc->proc_ompi), (NULL == contents->endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : contents->endpoint->endpoint_proc->proc_ompi->proc_hostname,
context->qpnum, context->qpnum,
contents->endpoint->qps[context->qpnum].qp->lcl_qp->qp_num)); contents->endpoint->qps[context->qpnum].qp->lcl_qp->qp_num));
} }
@ -1443,7 +1455,8 @@ static int finish_connect(id_context_t *context)
(void*) contents->endpoint, (void*) contents->endpoint,
(void*) contents->endpoint->endpoint_local_cpc, (void*) contents->endpoint->endpoint_local_cpc,
contents->endpoint->endpoint_initiator ? "am" : "am NOT", contents->endpoint->endpoint_initiator ? "am" : "am NOT",
ompi_proc_get_hostname(contents->endpoint->endpoint_proc->proc_ompi))); (NULL == contents->endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : contents->endpoint->endpoint_proc->proc_ompi->proc_hostname));
rc = rdma_connect(context->id, &conn_param); rc = rdma_connect(context->id, &conn_param);
if (0 != rc) { if (0 != rc) {
BTL_ERROR(("rdma_connect Failed with %d", rc)); BTL_ERROR(("rdma_connect Failed with %d", rc));
@ -1485,7 +1498,8 @@ static void *show_help_rdmacm_event_error(void *c)
ompi_process_info.nodename, ompi_process_info.nodename,
device, device,
rdma_event_str(event->event), rdma_event_str(event->event),
ompi_proc_get_hostname(context->endpoint->endpoint_proc->proc_ompi)); (NULL == context->endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : context->endpoint->endpoint_proc->proc_ompi->proc_hostname));
} }
return NULL; return NULL;

Просмотреть файл

@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2008-2010 Oracle and/or its affiliates. All rights reserved * Copyright (c) 2008-2010 Oracle and/or its affiliates. All rights reserved
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -377,7 +378,9 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
int rc, *a = NULL; int rc, *a = NULL;
size_t i, j; size_t i, j;
proc_hostname = ompi_proc_get_hostname(btl_proc->proc_ompi); if (NULL == (proc_hostname = btl_proc->proc_ompi->proc_hostname)) {
return OMPI_ERR_UNREACH;
}
#ifndef WORDS_BIGENDIAN #ifndef WORDS_BIGENDIAN
/* if we are little endian and our peer is not so lucky, then we /* if we are little endian and our peer is not so lucky, then we

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2006 Sandia National Laboratories. All rights * Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved. * reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -259,7 +260,8 @@ static int mca_btl_udapl_proc_address_match(
BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP, BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP,
("help-mpi-btl-udapl.txt", "no network match", ("help-mpi-btl-udapl.txt", "no network match",
true, btl_addr_string, ompi_process_info.nodename, true, btl_addr_string, ompi_process_info.nodename,
ompi_proc_get_hostname(peer_proc->proc_ompi))); (NULL == peer_proc->proc_ompi->proc_hostname) ?
"unknown" : peer_proc->proc_ompi->proc_hostname));
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2006 Sandia National Laboratories. All rights * Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved. * reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -295,21 +296,14 @@ static int match_modex(ompi_btl_usnic_module_t *module,
/* If MTU does not match, throw an error */ /* If MTU does not match, throw an error */
if (proc->proc_modex[i].mtu != module->if_mtu) { if (proc->proc_modex[i].mtu != module->if_mtu) {
const char *peer_hostname;
if (NULL != ompi_proc_get_hostname(proc->proc_ompi)) {
peer_hostname = ompi_proc_get_hostname(proc->proc_ompi);
} else {
peer_hostname =
"<unknown -- please run with mpi_keep_peer_hostnames=1>";
}
opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch", opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
true, true,
ompi_process_info.nodename, ompi_process_info.nodename,
ibv_get_device_name(module->device), ibv_get_device_name(module->device),
module->port_num, module->port_num,
module->if_mtu, module->if_mtu,
peer_hostname, (NULL == proc->proc_ompi->proc_hostname) ?
"unknown" : proc->proc_ompi->proc_hostname,
proc->proc_modex[i].mtu); proc->proc_modex[i].mtu);
return -1; return -1;
} }

Просмотреть файл

@ -3,6 +3,7 @@
* Copyright (c) 2007-2012 Mellanox Technologies. All rights reserved. * Copyright (c) 2007-2012 Mellanox Technologies. All rights reserved.
* *
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -621,7 +622,8 @@ int ompi_common_ofacm_base_alloc_cts(mca_btl_base_endpoint_t *endpoint)
mca_btl_openib_component.credits_qp; mca_btl_openib_component.credits_qp;
endpoint->endpoint_cts_frag.super.endpoint = endpoint; endpoint->endpoint_cts_frag.super.endpoint = endpoint;
OPAL_OUTPUT((-1, "Got a CTS frag for peer %s, addr %p, length %d, lkey %d", OPAL_OUTPUT((-1, "Got a CTS frag for peer %s, addr %p, length %d, lkey %d",
ompi_proc_get_hostname(endpoint->endpoint_proc->proc_ompi), (NULL == endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
"unknown" : endpoint->endpoint_proc->proc_ompi->proc_hostname,
(void*) endpoint->endpoint_cts_frag.super.sg_entry.addr, (void*) endpoint->endpoint_cts_frag.super.sg_entry.addr,
endpoint->endpoint_cts_frag.super.sg_entry.length, endpoint->endpoint_cts_frag.super.sg_entry.length,
endpoint->endpoint_cts_frag.super.sg_entry.lkey)); endpoint->endpoint_cts_frag.super.sg_entry.lkey));

Просмотреть файл

@ -1,5 +1,6 @@
/* /*
* Copyright (C) Mellanox Technologies Ltd. 2001-2011. ALL RIGHTS RESERVED. * Copyright (C) Mellanox Technologies Ltd. 2001-2011. ALL RIGHTS RESERVED.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -508,7 +509,9 @@ int ompi_mtl_mxm_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs,
MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
for (i = 0; i < nprocs; ++i) { for (i = 0; i < nprocs; ++i) {
if (MXM_OK != conn_reqs[i].error) { if (MXM_OK != conn_reqs[i].error) {
MXM_ERROR("MXM EP connect to %s error: %s\n", ompi_proc_get_hostname(procs[i]), MXM_ERROR("MXM EP connect to %s error: %s\n",
(NULL == procs[i]->proc_hostname) ?
"unknown" : procs[i]->proc_hostname,
mxm_error_string(conn_reqs[i].error)); mxm_error_string(conn_reqs[i].error));
} }
} }

Просмотреть файл

@ -10,6 +10,7 @@
* Copyright (c) 2004-2006 The Regents of the University of California. * Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2006 QLogic Corporation. All rights reserved. * Copyright (c) 2006 QLogic Corporation. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -313,7 +314,8 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
errstr ? errstr : "unknown connect error"); errstr ? errstr : "unknown connect error");
for (j = 0; j < (int) nprocs; j++) { for (j = 0; j < (int) nprocs; j++) {
if (errs_out[j] == thiserr) { if (errs_out[j] == thiserr) {
opal_output(0, " %s", ompi_proc_get_hostname(procs[j])); opal_output(0, " %s", (NULL == procs[j]->proc_hostname) ?
"unknown" : procs[j]->proc_hostname);
} }
} }
opal_output(0, "\n"); opal_output(0, "\n");

Просмотреть файл

@ -12,6 +12,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -368,18 +369,11 @@ mca_pml_base_pml_check_selected(const char *my_pml,
/* if that module doesn't match my own, return an error */ /* if that module doesn't match my own, return an error */
if ((size != strlen(my_pml) + 1) || if ((size != strlen(my_pml) + 1) ||
(0 != strcmp(my_pml, remote_pml))) { (0 != strcmp(my_pml, remote_pml))) {
if (ompi_proc_get_hostname(procs[0])) {
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s", opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
OMPI_NAME_PRINT(&ompi_proc_local()->proc_name), OMPI_NAME_PRINT(&ompi_proc_local()->proc_name),
my_pml, OMPI_NAME_PRINT(&procs[0]->proc_name), my_pml, OMPI_NAME_PRINT(&procs[0]->proc_name),
ompi_proc_get_hostname(procs[0]), (NULL == procs[0]->proc_hostname) ? "unknown" : procs[0]->proc_hostname,
remote_pml); remote_pml);
} else {
opal_output(0, "%s selected pml %s, but peer %s selected pml %s",
OMPI_NAME_PRINT(&ompi_proc_local()->proc_name),
my_pml, OMPI_NAME_PRINT(&procs[0]->proc_name),
remote_pml);
}
free(remote_pml); /* cleanup before returning */ free(remote_pml); /* cleanup before returning */
return OMPI_ERR_UNREACH; return OMPI_ERR_UNREACH;
} }

Просмотреть файл

@ -2,6 +2,7 @@
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -408,7 +409,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
recvreq->remote_req_send.pval, (void *)recvreq, recvreq->remote_req_send.pval, (void *)recvreq,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid, hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
ompi_proc_get_hostname(ompi_proc)); (NULL == ompi_proc->proc_hostname) ? "unknown" : ompi_proc->proc_hostname);
mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false); mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false);
return; return;
} }
@ -1415,7 +1416,7 @@ void mca_pml_bfo_map_out_btl(struct mca_btl_base_module_t* btl,
btl->btl_component->btl_version.mca_component_name, btl->btl_component->btl_version.mca_component_name,
OMPI_PROC_MY_NAME->vpid, OMPI_PROC_MY_NAME->vpid,
btlname, errproc->proc_name.vpid, btlname, errproc->proc_name.vpid,
ompi_proc_get_hostname(errproc)); (NULL == errproc->proc_hostname) ? "unknown" : errproc->proc_hostname);
/* Need to search for any pending packets associated /* Need to search for any pending packets associated
* with this endpoint and remove them. We may also * with this endpoint and remove them. We may also

Просмотреть файл

@ -1,6 +1,7 @@
/* /*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* *
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -65,6 +66,7 @@ typedef orte_node_rank_t ompi_node_rank_t;
typedef orte_local_rank_t ompi_local_rank_t; typedef orte_local_rank_t ompi_local_rank_t;
#define ompi_process_info orte_process_info #define ompi_process_info orte_process_info
#define ompi_rte_proc_is_bound orte_proc_is_bound #define ompi_rte_proc_is_bound orte_proc_is_bound
#define ompi_rte_hostname_cutoff orte_hostname_cutoff
/* Error handling objects and operations */ /* Error handling objects and operations */
OMPI_DECLSPEC void ompi_rte_abort(int error_code, char *fmt, ...); OMPI_DECLSPEC void ompi_rte_abort(int error_code, char *fmt, ...);

Просмотреть файл

@ -1,6 +1,7 @@
/* /*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
*/ */
#include "ompi_config.h" #include "ompi_config.h"
#include "ompi/constants.h" #include "ompi/constants.h"
@ -35,6 +36,7 @@
#include "ompi/mca/rte/base/base.h" #include "ompi/mca/rte/base/base.h"
#include "ompi/mca/rte/rte.h" #include "ompi/mca/rte/rte.h"
#include "ompi/debuggers/debuggers.h" #include "ompi/debuggers/debuggers.h"
#include "ompi/proc/proc.h"
void ompi_rte_abort(int error_code, char *fmt, ...) void ompi_rte_abort(int error_code, char *fmt, ...)
{ {
@ -149,21 +151,54 @@ int ompi_rte_db_fetch(const orte_process_name_t *nm,
const char *key, const char *key,
void **data, opal_data_type_t type) void **data, opal_data_type_t type)
{ {
return opal_db.fetch((opal_identifier_t*)nm, key, data, type); ompi_proc_t *proct;
int rc;
if (OPAL_SUCCESS != (rc = opal_db.fetch((opal_identifier_t*)nm, key, data, type))) {
return rc;
}
/* update the hostname */
proct = ompi_proc_find(nm);
if (NULL == proct->proc_hostname) {
opal_db.fetch_pointer((opal_identifier_t*)nm, ORTE_DB_HOSTNAME, (void**)&proct->proc_hostname, OPAL_STRING);
}
return OMPI_SUCCESS;
} }
/*
 * Fetch the value stored under "key" for process "nm" via
 * opal_db.fetch_pointer and, as a side effect, try to load the peer's
 * hostname into its ompi_proc_t when that field has not been filled in
 * yet.
 *
 * Returns the opal_db.fetch_pointer error code if the primary fetch
 * fails, OMPI_SUCCESS otherwise. The hostname refresh never changes the
 * return value.
 */
int ompi_rte_db_fetch_pointer(const orte_process_name_t *nm,
                              const char *key,
                              void **data, opal_data_type_t type)
{
    ompi_proc_t *proct;
    int rc;

    if (OPAL_SUCCESS != (rc = opal_db.fetch_pointer((opal_identifier_t*)nm, key, data, type))) {
        return rc;
    }

    /* update the hostname - but only when the lookup produced a proc
     * object; dereferencing a failed lookup would crash even though the
     * requested data was retrieved successfully
     */
    proct = ompi_proc_find(nm);
    if (NULL != proct && NULL == proct->proc_hostname) {
        /* best effort: the return code is intentionally ignored */
        (void) opal_db.fetch_pointer((opal_identifier_t*)nm, ORTE_DB_HOSTNAME, (void**)&proct->proc_hostname, OPAL_STRING);
    }
    return OMPI_SUCCESS;
}
/*
 * Fetch the key/value entries matching "key" for process "nm" into the
 * list "kvs" via opal_db.fetch_multiple and, as a side effect, try to
 * load the peer's hostname into its ompi_proc_t when that field has not
 * been filled in yet.
 *
 * Returns the opal_db.fetch_multiple error code if the primary fetch
 * fails, OMPI_SUCCESS otherwise. The hostname refresh never changes the
 * return value.
 */
int ompi_rte_db_fetch_multiple(const orte_process_name_t *nm,
                               const char *key,
                               opal_list_t *kvs)
{
    ompi_proc_t *proct;
    int rc;

    if (OPAL_SUCCESS != (rc = opal_db.fetch_multiple((opal_identifier_t*)nm, key, kvs))) {
        return rc;
    }

    /* update the hostname - but only when the lookup produced a proc
     * object; dereferencing a failed lookup would crash even though the
     * requested data was retrieved successfully
     */
    proct = ompi_proc_find(nm);
    if (NULL != proct && NULL == proct->proc_hostname) {
        /* best effort: the return code is intentionally ignored */
        (void) opal_db.fetch_pointer((opal_identifier_t*)nm, ORTE_DB_HOSTNAME, (void**)&proct->proc_hostname, OPAL_STRING);
    }
    return OMPI_SUCCESS;
}
int ompi_rte_db_remove(const orte_process_name_t *nm, int ompi_rte_db_remove(const orte_process_name_t *nm,

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -156,8 +157,20 @@ int ompi_proc_complete_init(void)
break; break;
} }
/* get the remote architecture */ if (ompi_process_info.num_daemons < ompi_rte_hostname_cutoff) {
/* retrieve the hostname */
ret = ompi_modex_recv_string_pointer(OMPI_DB_HOSTNAME, proc, (void**)&(proc->proc_hostname), OPAL_STRING);
if (OMPI_SUCCESS != ret) {
break;
}
} else {
/* just set the hostname to NULL for now - we'll fill it in
* as modex_recv's are called for procs we will talk to
*/
proc->proc_hostname = NULL;
}
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
/* get the remote architecture */
{ {
uint32_t *ui32ptr; uint32_t *ui32ptr;
ui32ptr = &(proc->proc_arch); ui32ptr = &(proc->proc_arch);
@ -185,21 +198,6 @@ int ompi_proc_complete_init(void)
return errcode; return errcode;
} }
/*
 * Pre-change accessor that this commit removes (callers now read
 * proc->proc_hostname directly, guarded by a NULL check).
 *
 * Returns proc->proc_hostname, first populating it through
 * ompi_modex_recv_string_pointer() when it is still NULL. Returns NULL
 * if that retrieval does not report OMPI_SUCCESS.
 */
const char *ompi_proc_get_hostname (ompi_proc_t *proc)
{
int ret;
/* only go to the modex when the hostname has not been cached yet */
if (NULL == proc->proc_hostname) {
/* get a pointer to the name of the node it is on */
ret = ompi_modex_recv_string_pointer(OMPI_DB_HOSTNAME, proc, (void**)&(proc->proc_hostname), OPAL_STRING);
if (OMPI_SUCCESS != ret) {
/* retrieval failed - report "no hostname" to the caller */
return NULL;
}
}
return proc->proc_hostname;
}
int ompi_proc_finalize (void) int ompi_proc_finalize (void)
{ {
opal_list_item_t *item; opal_list_item_t *item;
@ -371,7 +369,6 @@ int ompi_proc_refresh(void) {
ompi_vpid_t i = 0; ompi_vpid_t i = 0;
int ret=OMPI_SUCCESS; int ret=OMPI_SUCCESS;
opal_hwloc_locality_t *hwlocale; opal_hwloc_locality_t *hwlocale;
uint32_t *uiptr;
OPAL_THREAD_LOCK(&ompi_proc_lock); OPAL_THREAD_LOCK(&ompi_proc_lock);
@ -397,25 +394,31 @@ int ompi_proc_refresh(void) {
if (OMPI_SUCCESS != ret) { if (OMPI_SUCCESS != ret) {
break; break;
} }
if (ompi_process_info.num_daemons < ompi_rte_hostname_cutoff) {
/* retrieve the hostname */
ret = ompi_modex_recv_string_pointer(OMPI_DB_HOSTNAME, proc, (void**)&(proc->proc_hostname), OPAL_STRING);
if (OMPI_SUCCESS != ret) {
break;
}
} else {
/* just set the hostname to NULL for now - we'll fill it in
* as modex_recv's are called for procs we will talk to
*/
proc->proc_hostname = NULL; proc->proc_hostname = NULL;
}
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
/* get the remote architecture */ /* get the remote architecture */
uiptr = &(proc->proc_arch); uiptr = &(proc->proc_arch);
ret = ompi_modex_recv_key_value("OMPI_ARCH", proc, (void**)&uiptr, OPAL_UINT32); ret = ompi_modex_recv_key_value("OMPI_ARCH", proc, (void**)&uiptr, OPAL_UINT32);
/* if arch is different than mine, create a new convertor for this proc */ /* if arch is different than mine, create a new convertor for this proc */
if (proc->proc_arch != opal_local_arch) { if (proc->proc_arch != opal_local_arch) {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
OBJ_RELEASE(proc->proc_convertor); OBJ_RELEASE(proc->proc_convertor);
proc->proc_convertor = opal_convertor_create(proc->proc_arch, 0); proc->proc_convertor = opal_convertor_create(proc->proc_arch, 0);
#else
opal_show_help("help-mpi-runtime",
"heterogeneous-support-unavailable",
true, ompi_process_info.nodename,
proc->proc_hostname == NULL ? "<hostname unavailable>" :
proc->proc_hostname);
OPAL_THREAD_UNLOCK(&ompi_proc_lock);
return OMPI_ERR_NOT_SUPPORTED;
#endif
} }
#else
/* must be same arch as my own */
proc->proc_arch = opal_local_arch;
#endif
} }
} }
@ -456,7 +459,6 @@ ompi_proc_pack(ompi_proc_t **proclist, int proclistsize, opal_buffer_t* buf)
OPAL_THREAD_UNLOCK(&ompi_proc_lock); OPAL_THREAD_UNLOCK(&ompi_proc_lock);
return rc; return rc;
} }
(void) ompi_proc_get_hostname (proclist[i]);
rc = opal_dss.pack(buf, &(proclist[i]->proc_hostname), 1, OPAL_STRING); rc = opal_dss.pack(buf, &(proclist[i]->proc_hostname), 1, OPAL_STRING);
if(rc != OPAL_SUCCESS) { if(rc != OPAL_SUCCESS) {
OMPI_ERROR_LOG(rc); OMPI_ERROR_LOG(rc);

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2006-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -304,15 +305,6 @@ OMPI_DECLSPEC int ompi_proc_unpack(opal_buffer_t *buf,
*/ */
OMPI_DECLSPEC int ompi_proc_refresh(void); OMPI_DECLSPEC int ompi_proc_refresh(void);
/**
* Retrieve the hostname for a process
*
* @note Retrieving the hostname may require communication.
*
* @param proc process to retrieve hostname from
*/
OMPI_DECLSPEC const char *ompi_proc_get_hostname (ompi_proc_t *proc);
END_C_DECLS END_C_DECLS
#endif /* OMPI_PROC_PROC_H */ #endif /* OMPI_PROC_PROC_H */

Просмотреть файл

@ -14,6 +14,7 @@
* Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights * Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -57,7 +58,6 @@ bool ompi_mpi_show_mca_params = false;
char *ompi_mpi_show_mca_params_file = NULL; char *ompi_mpi_show_mca_params_file = NULL;
bool ompi_mpi_abort_print_stack = false; bool ompi_mpi_abort_print_stack = false;
int ompi_mpi_abort_delay = 0; int ompi_mpi_abort_delay = 0;
bool ompi_mpi_keep_peer_hostnames = true;
bool ompi_mpi_keep_fqdn_hostnames = false; bool ompi_mpi_keep_fqdn_hostnames = false;
int ompi_mpi_leave_pinned = -1; int ompi_mpi_leave_pinned = -1;
bool ompi_mpi_leave_pinned_pipeline = false; bool ompi_mpi_leave_pinned_pipeline = false;
@ -211,16 +211,6 @@ int ompi_mpi_register_params(void)
/* User-level process pinning controls */ /* User-level process pinning controls */
/* Do we want to save hostnames for debugging messages? This can
eat quite a bit of memory... */
ompi_mpi_keep_peer_hostnames = true;
(void) mca_base_var_register("ompi", "mpi", NULL, "keep_peer_hostnames",
"If nonzero, save the string hostnames of all MPI peer processes (mostly for error / debugging output messages). This can add quite a bit of memory usage to each MPI process.",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mpi_keep_peer_hostnames);
/* MPI_ABORT controls */ /* MPI_ABORT controls */
ompi_mpi_abort_delay = 0; ompi_mpi_abort_delay = 0;
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_delay", (void) mca_base_var_register("ompi", "mpi", NULL, "abort_delay",

Просмотреть файл

@ -13,6 +13,7 @@
* reserved. * reserved.
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -96,12 +97,6 @@ OMPI_DECLSPEC extern bool ompi_mpi_show_mca_params;
*/ */
OMPI_DECLSPEC extern char * ompi_mpi_show_mca_params_file; OMPI_DECLSPEC extern char * ompi_mpi_show_mca_params_file;
/**
* Whether we should keep the string hostnames of all the MPI
* process peers around or not (eats up a good bit of memory).
*/
OMPI_DECLSPEC extern bool ompi_mpi_keep_peer_hostnames;
/** /**
* Whether an MPI_ABORT should print out a stack trace or not. * Whether an MPI_ABORT should print out a stack trace or not.
*/ */

Просмотреть файл

@ -97,6 +97,7 @@ static int rte_init(void)
orte_node_rank_t node_rank; orte_node_rank_t node_rank;
char *rmluri; char *rmluri;
opal_hwloc_locality_t locality; opal_hwloc_locality_t locality;
char *tmp;
/* run the prolog */ /* run the prolog */
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
@ -346,6 +347,21 @@ static int rte_init(void)
orte_process_info.max_procs = orte_process_info.num_procs; orte_process_info.max_procs = orte_process_info.num_procs;
} }
/* set the number of nodes - have to test as it could be
* one of multiple environments
*/
if (NULL != (tmp = getenv("SLURM_NNODES"))) {
orte_process_info.num_daemons = strtol(tmp, NULL, 10);
} else if (NULL != (tmp = getenv("PBS_NUM_NODES"))) {
orte_process_info.num_daemons = strtol(tmp, NULL, 10);
} else {
if (0 == ORTE_PROC_MY_NAME->vpid) {
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:num_daemons", true);
}
orte_process_info.num_daemons = UINT_MAX;
}
/* construct the PMI RTE string */ /* construct the PMI RTE string */
rmluri = orte_rml.get_contact_info(); rmluri = orte_rml.get_contact_info();

Просмотреть файл

@ -56,3 +56,8 @@ again.
An error occurred while trying to pack the information about the job. More nodes An error occurred while trying to pack the information about the job. More nodes
have been found than the %d expected. Please check your configuration files such have been found than the %d expected. Please check your configuration files such
as the mapping. as the mapping.
#
[orte_init:startup:num_daemons]
Open MPI was unable to determine the number of nodes in your allocation. We
are therefore assuming a very large number to ensure you receive proper error
messages.

Просмотреть файл

@ -13,6 +13,7 @@
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -78,6 +79,7 @@ bool orte_have_fqdn_allocation = false;
bool orte_show_resolved_nodenames; bool orte_show_resolved_nodenames;
bool orte_retain_aliases; bool orte_retain_aliases;
int orte_use_hostname_alias; int orte_use_hostname_alias;
orte_vpid_t orte_hostname_cutoff;
int orted_debug_failure; int orted_debug_failure;
int orted_debug_failure_delay; int orted_debug_failure_delay;

Просмотреть файл

@ -13,6 +13,7 @@
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -606,6 +607,7 @@ ORTE_DECLSPEC extern bool orte_have_fqdn_allocation;
ORTE_DECLSPEC extern bool orte_show_resolved_nodenames; ORTE_DECLSPEC extern bool orte_show_resolved_nodenames;
ORTE_DECLSPEC extern bool orte_retain_aliases; ORTE_DECLSPEC extern bool orte_retain_aliases;
ORTE_DECLSPEC extern int orte_use_hostname_alias; ORTE_DECLSPEC extern int orte_use_hostname_alias;
ORTE_DECLSPEC extern orte_vpid_t orte_hostname_cutoff;
/* debug flags */ /* debug flags */
ORTE_DECLSPEC extern int orted_debug_failure; ORTE_DECLSPEC extern int orted_debug_failure;

Просмотреть файл

@ -13,6 +13,7 @@
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved * All rights reserved
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -435,6 +436,15 @@ int orte_register_params(void)
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_use_hostname_alias); &orte_use_hostname_alias);
/* cutoff for including hostnames in modex */
orte_hostname_cutoff = UINT_MAX;
(void) mca_base_var_register ("orte", "orte", NULL, "hostname_cutoff",
"If the number of nodes in the allocation exceeds the provided value,"
"hostnames for remote processes will not be supplied to applications [default: UINT_MAX]",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_hostname_cutoff);
orte_xml_output = false; orte_xml_output = false;
(void) mca_base_var_register ("orte", "orte", NULL, "xml_output", (void) mca_base_var_register ("orte", "orte", NULL, "xml_output",
"Display all output in XML format (default: false)", "Display all output in XML format (default: false)",

Просмотреть файл

@ -11,6 +11,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* *
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -272,6 +273,12 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
/* setup a buffer for tmp use */ /* setup a buffer for tmp use */
OBJ_CONSTRUCT(&buf, opal_buffer_t); OBJ_CONSTRUCT(&buf, opal_buffer_t);
/* send the number of nodes */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &daemons->num_procs, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* only send info on nodes that have daemons on them, and /* only send info on nodes that have daemons on them, and
* only regarding daemons that have changed - i.e., new * only regarding daemons that have changed - i.e., new
* daemons since the last time we sent the info - so we * daemons since the last time we sent the info - so we
@ -299,6 +306,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
if (daemons->num_procs < orte_hostname_cutoff) {
/* pack the name of the node */ /* pack the name of the node */
if (!orte_keep_fqdn_hostnames) { if (!orte_keep_fqdn_hostnames) {
nodename = strdup(node->name); nodename = strdup(node->name);
@ -335,6 +343,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
} }
} }
} }
}
/* pack the oversubscribed flag */ /* pack the oversubscribed flag */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->oversubscribed, 1, OPAL_UINT8))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->oversubscribed, 1, OPAL_UINT8))) {
@ -358,7 +367,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
int orte_util_decode_nodemap(opal_byte_object_t *bo) int orte_util_decode_nodemap(opal_byte_object_t *bo)
{ {
int n; int n;
int32_t num_daemons; orte_vpid_t num_daemons;
orte_process_name_t daemon; orte_process_name_t daemon;
opal_buffer_t buf; opal_buffer_t buf;
int rc=ORTE_SUCCESS; int rc=ORTE_SUCCESS;
@ -378,13 +387,19 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
OBJ_CONSTRUCT(&buf, opal_buffer_t); OBJ_CONSTRUCT(&buf, opal_buffer_t);
opal_dss.load(&buf, bo->bytes, bo->size); opal_dss.load(&buf, bo->bytes, bo->size);
/* unpack the number of daemons */
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_daemons, &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* set the daemon jobid */ /* set the daemon jobid */
daemon.jobid = ORTE_DAEMON_JOBID(ORTE_PROC_MY_NAME->jobid); daemon.jobid = ORTE_DAEMON_JOBID(ORTE_PROC_MY_NAME->jobid);
num_daemons = 0;
n=1; n=1;
while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &daemon.vpid, &n, ORTE_VPID))) { while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &daemon.vpid, &n, ORTE_VPID))) {
++num_daemons; if (num_daemons < orte_hostname_cutoff) {
/* unpack and store the node's name */ /* unpack and store the node's name */
n=1; n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &nodename, &n, OPAL_STRING))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &nodename, &n, OPAL_STRING))) {
@ -449,6 +464,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
free(alias); free(alias);
} }
} }
}
/* unpack and discard the oversubscribed flag - procs don't need it */ /* unpack and discard the oversubscribed flag - procs don't need it */
n=1; n=1;
@ -482,6 +498,7 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
char *name; char *name;
orte_job_t *daemons; orte_job_t *daemons;
orte_proc_t *dptr; orte_proc_t *dptr;
orte_vpid_t num_daemons;
OPAL_OUTPUT_VERBOSE((1, orte_nidmap_output, OPAL_OUTPUT_VERBOSE((1, orte_nidmap_output,
"%s decode:nidmap decoding daemon nodemap", "%s decode:nidmap decoding daemon nodemap",
@ -496,12 +513,19 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
OBJ_CONSTRUCT(&buf, opal_buffer_t); OBJ_CONSTRUCT(&buf, opal_buffer_t);
opal_dss.load(&buf, bo->bytes, bo->size); opal_dss.load(&buf, bo->bytes, bo->size);
/* transfer the data to the nodes, counting the number of /* unpack the number of procs */
* daemons in the system n=1;
*/ if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_daemons, &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* transfer the data to the nodes */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
daemons->num_procs = num_daemons;
n=1; n=1;
while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &vpid, &n, ORTE_VPID))) { while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &vpid, &n, ORTE_VPID))) {
if (daemons->num_procs < orte_hostname_cutoff) {
/* unpack and store the node's name */ /* unpack and store the node's name */
n=1; n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &name, &n, OPAL_STRING))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &name, &n, OPAL_STRING))) {
@ -535,6 +559,7 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
free(alias); free(alias);
} }
} }
}
/* unpack the oversubscribed flag */ /* unpack the oversubscribed flag */
n=1; n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &oversub, &n, OPAL_UINT8))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &oversub, &n, OPAL_UINT8))) {
@ -546,7 +571,6 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
dptr->name.jobid = ORTE_PROC_MY_NAME->jobid; dptr->name.jobid = ORTE_PROC_MY_NAME->jobid;
dptr->name.vpid = vpid; dptr->name.vpid = vpid;
opal_pointer_array_set_item(daemons->procs, vpid, dptr); opal_pointer_array_set_item(daemons->procs, vpid, dptr);
daemons->num_procs++;
} }
if (NULL != node->daemon) { if (NULL != node->daemon) {
OBJ_RELEASE(node->daemon); OBJ_RELEASE(node->daemon);
@ -904,6 +928,7 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto cleanup; goto cleanup;
} }
if (orte_process_info.num_daemons < orte_hostname_cutoff) {
/* lookup and store the hostname for this proc */ /* lookup and store the hostname for this proc */
if (ORTE_SUCCESS != (rc = opal_db.fetch_pointer((opal_identifier_t*)&dmn, ORTE_DB_HOSTNAME, (void**)&hostname, OPAL_STRING))) { if (ORTE_SUCCESS != (rc = opal_db.fetch_pointer((opal_identifier_t*)&dmn, ORTE_DB_HOSTNAME, (void**)&hostname, OPAL_STRING))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -915,6 +940,7 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo)
} }
} }
} }
}
/* see if there is a file map */ /* see if there is a file map */
n=1; n=1;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &flag, &n, OPAL_UINT8))) { if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &flag, &n, OPAL_UINT8))) {