/*
 * Copyright (C) Mellanox Technologies Ltd. 2001-2011. ALL RIGHTS RESERVED.
 * Copyright (c) 2013 Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "ompi/mca/mtl/mtl.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
#include "ompi/proc/proc.h"
#include "ompi/communicator/communicator.h"
#include "opal/memoryhooks/memory.h"
#include "opal/util/show_help.h"

#include "mtl_mxm.h"
#include "mtl_mxm_types.h"
#include "mtl_mxm_endpoint.h"
#include "mtl_mxm_request.h"

mca_mtl_mxm_module_t ompi_mtl_mxm = {
    {
        0, /* max context id */
        0, /* max tag value */
        0, /* request reserve space */
        0, /* flags */
        ompi_mtl_mxm_add_procs,
        ompi_mtl_mxm_del_procs,
        ompi_mtl_mxm_finalize,
        ompi_mtl_mxm_send,
        ompi_mtl_mxm_isend,
        ompi_mtl_mxm_irecv,
        ompi_mtl_mxm_iprobe,
        ompi_mtl_mxm_imrecv,
        ompi_mtl_mxm_improbe,
        ompi_mtl_mxm_cancel,
        ompi_mtl_mxm_add_comm,
        ompi_mtl_mxm_del_comm
    },
    0,
    0,
    NULL,
    NULL
};

static uint32_t ompi_mtl_mxm_get_job_id(void)
{
    uint8_t unique_job_key[16];
    uint32_t job_key;
    unsigned long long *uu;
    char *generated_key;

    uu = (unsigned long long *) unique_job_key;

    generated_key = getenv("OMPI_MCA_orte_precondition_transports");
    memset(uu, 0, sizeof(unique_job_key));

    if (!generated_key || (strlen(generated_key) != 33) ||
        sscanf(generated_key, "%016llx-%016llx", &uu[0], &uu[1]) != 2) {
        opal_show_help("help-mtl-mxm.txt", "no uuid present", true,
                       generated_key ? "could not be parsed from" :
                       "not present in", ompi_process_info.nodename);
        return 0;
    }

    /*
     * Decode OMPI_MCA_orte_precondition_transports, which looks like
     *     000003ca00000000-0000000100000000
     *     (jobfam)-(stepid)
     * to recover the jobid as coded by ORTE_CONSTRUCT_LOCAL_JOBID().
     */
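    /*
     * Worked example (hypothetical key, assuming a 32-bit int so that
     * 8 * sizeof(int) == 32): "000003ca00000000-0000000100000000" parses to
     * uu[0] = 0x000003ca00000000 and uu[1] = 0x0000000100000000. Then
     * (uu[0] >> 32) << 16 == 0x03ca0000 and uu[1] >> 32 == 0x1, so
     * GET_LOCAL_JOBID() yields 0x03ca0000 | 0x0001 == 0x03ca0001.
     */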
#define GET_LOCAL_JOBID(local, job) \
    ( ((local) & 0xffff0000) | ((job) & 0x0000ffff) )

    job_key = GET_LOCAL_JOBID((uu[0]>>(8 * sizeof(int))) << 16, uu[1]>>(8 * sizeof(int)));

    return job_key;
}

int ompi_mtl_mxm_progress(void);

#if MXM_API >= MXM_VERSION(2,0)
static void ompi_mtl_mxm_mem_release_cb(void *buf, size_t length,
                                        void *cbdata, bool from_alloc);
#endif

#if MXM_API < MXM_VERSION(2,0)
static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info, mxm_ptl_id_t ptlid)
{
    size_t addrlen;
    mxm_error_t err;

    addrlen = sizeof(ep_info->ptl_addr[ptlid]);
    err = mxm_ep_address(ompi_mtl_mxm.ep, ptlid,
                         (struct sockaddr *) &ep_info->ptl_addr[ptlid], &addrlen);
    if (MXM_OK != err) {
        opal_show_help("help-mtl-mxm.txt", "unable to extract endpoint ptl address",
                       true, (int)ptlid, mxm_error_string(err));
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
#else
static int ompi_mtl_mxm_get_ep_address(void **address_p, size_t *address_len_p)
{
    mxm_error_t err;

    *address_len_p = 0;
    err = mxm_ep_get_address(ompi_mtl_mxm.ep, NULL, address_len_p);
    if (err != MXM_ERR_BUFFER_TOO_SMALL) {
        MXM_ERROR("Failed to get ep address length");
        return OMPI_ERROR;
    }

    *address_p = malloc(*address_len_p);
    if (*address_p == NULL) {
        MXM_ERROR("Failed to allocate ep address buffer");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    err = mxm_ep_get_address(ompi_mtl_mxm.ep, *address_p, address_len_p);
    if (MXM_OK != err) {
        opal_show_help("help-mtl-mxm.txt", "unable to extract endpoint address",
                       true, mxm_error_string(err));
        free(*address_p);  /* don't leak the buffer on the error path */
        *address_p = NULL;
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
#endif

#define max(a,b) ((a)>(b)?(a):(b))
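/* Note: as a macro, max() evaluates each argument twice; the uses below pass
 * side-effect-free expressions, so that is safe here. */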

static mxm_error_t
ompi_mtl_mxm_create_ep(mxm_h ctx, mxm_ep_h *ep, unsigned ptl_bitmap, int lr,
                       uint32_t jobid, uint64_t mxlr, int nlps)
{
    mxm_error_t err;

#if MXM_API < MXM_VERSION(2,0)
    ompi_mtl_mxm.mxm_ep_opts->job_id          = jobid;
    ompi_mtl_mxm.mxm_ep_opts->local_rank      = lr;
    ompi_mtl_mxm.mxm_ep_opts->num_local_procs = nlps;
    err = mxm_ep_create(ctx, ompi_mtl_mxm.mxm_ep_opts, ep);
#else
    err = mxm_ep_create(ctx, ompi_mtl_mxm.mxm_ep_opts, ep);
#endif
    return err;
}

/*
 * Publish the local endpoint address via the modex. Some environments
 * (e.g. ess/pmi) limit how much data can go into a single modex entry,
 * so the address is sent in fixed-size chunks.
 */
static int ompi_mtl_mxm_send_ep_address(void *address, size_t address_len)
{
    char *modex_component_name = mca_base_component_to_string(&mca_mtl_mxm_component.super.mtl_version);
    char *modex_name = malloc(strlen(modex_component_name) + 16); /* room for "-len" or "-<index>" */
    const size_t modex_max_size = 0x60;
    unsigned char *modex_buf_ptr;
    size_t modex_buf_size;
    size_t modex_cur_size;
    int modex_name_id = 0;
    int rc;

    /* Send address length */
    sprintf(modex_name, "%s-len", modex_component_name);
    rc = ompi_modex_send_string((const char *)modex_name, &address_len, sizeof(address_len));
    if (OMPI_SUCCESS != rc) {
        MXM_ERROR("failed to send address length");
        goto bail;
    }

    /* Send the address in parts. Each part's modex name looks like
     * "mtl.mxm.1.5-18", where "mtl.mxm.1.5" is the component name and
     * 18 is the part index. */
    modex_buf_size = address_len;
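    /* For example (hypothetical sizes): a 200-byte address with
     * modex_max_size == 0x60 (96) goes out as three entries,
     * "<component>-0" (96 bytes), "<component>-1" (96 bytes) and
     * "<component>-2" (8 bytes). */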
    modex_buf_ptr = address;
    while (modex_buf_size) {
        sprintf(modex_name, "%s-%d", modex_component_name, modex_name_id);
        modex_cur_size = (modex_buf_size < modex_max_size) ? modex_buf_size : modex_max_size;
        rc = ompi_modex_send_string(modex_name, modex_buf_ptr, modex_cur_size);
        if (OMPI_SUCCESS != rc) {
            MXM_ERROR("Open MPI couldn't distribute EP connection details");
            goto bail;
        }

        modex_name_id++;
        modex_buf_ptr += modex_cur_size;
        modex_buf_size -= modex_cur_size;
    }

    rc = OMPI_SUCCESS;

bail:
    free(modex_component_name);
    free(modex_name);
    return rc;
}

/*
 * Receive a remote endpoint address via the modex, reassembling the parts
 * published by ompi_mtl_mxm_send_ep_address().
 */
static int ompi_mtl_mxm_recv_ep_address(ompi_proc_t *source_proc, void **address_p,
                                        size_t *address_len_p)
{
    char *modex_component_name = mca_base_component_to_string(&mca_mtl_mxm_component.super.mtl_version);
    char *modex_name = malloc(strlen(modex_component_name) + 16); /* room for "-len" or "-<index>" */
    unsigned char *modex_buf_ptr;
    size_t modex_cur_size;
    size_t modex_buf_size;
    size_t *address_len_buf_ptr;
    int modex_name_id = 0;
    int rc;

    *address_p = NULL;
    *address_len_p = 0;

    /* Receive address length */
    sprintf(modex_name, "%s-len", modex_component_name);
    rc = ompi_modex_recv_string(modex_name, source_proc, (void**)&address_len_buf_ptr,
                                &modex_cur_size);
    if (OMPI_SUCCESS != rc) {
        MXM_ERROR("Failed to receive ep address length");
        goto bail;
    }

    /* Allocate buffer to hold the address */
    *address_len_p = *address_len_buf_ptr;
    *address_p = malloc(*address_len_p);
    if (*address_p == NULL) {
        MXM_ERROR("Failed to allocate modex receive buffer");
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto bail;
    }

    /* Receive the data, in parts */
    modex_buf_size = 0;
    while (modex_buf_size < *address_len_p) {
        sprintf(modex_name, "%s-%d", modex_component_name, modex_name_id);
        rc = ompi_modex_recv_string(modex_name, source_proc, (void**)&modex_buf_ptr,
                                    &modex_cur_size);
        if (OMPI_SUCCESS != rc) {
            MXM_ERROR("Failed to receive EP connection details");
            goto bail;
        }

        memcpy((char*)(*address_p) + modex_buf_size, modex_buf_ptr, modex_cur_size);
        modex_buf_size += modex_cur_size;
        modex_name_id++;
    }

    rc = OMPI_SUCCESS;
bail:
    free(modex_component_name);
    free(modex_name);
    return rc;
}

int ompi_mtl_mxm_module_init(void)
{
#if MXM_API < MXM_VERSION(2,0)
    ompi_mtl_mxm_ep_conn_info_t ep_info;
#endif
    void *ep_address;
    size_t ep_address_len;
    mxm_error_t err;
    uint32_t jobid;
    uint64_t mxlr;
    ompi_proc_t **procs;
    unsigned ptl_bitmap;
    size_t totps, proc;
    int lr, nlps;
    int rc;

    mxlr = 0;
    lr = -1;

    jobid = ompi_mtl_mxm_get_job_id();
    if (0 == jobid) {
        MXM_ERROR("Failed to generate jobid");
        return OMPI_ERROR;
    }

    if (NULL == (procs = ompi_proc_world(&totps))) {
        MXM_ERROR("Unable to obtain process list");
        return OMPI_ERROR;
    }

    if (totps < (size_t)ompi_mtl_mxm.mxm_np) {
        MXM_VERBOSE(1, "MXM support will be disabled because the total number "
                    "of processes (%lu) is less than the minimum set by the "
                    "mtl_mxm_np MCA parameter (%u)", totps, ompi_mtl_mxm.mxm_np);
        return OMPI_ERR_NOT_SUPPORTED;
    }
    MXM_VERBOSE(1, "MXM support enabled");

    if (ORTE_NODE_RANK_INVALID == (lr = ompi_process_info.my_node_rank)) {
        MXM_ERROR("Unable to obtain local node rank");
        return OMPI_ERROR;
    }
    nlps = ompi_process_info.num_local_peers + 1;
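
    /* Find the highest vpid among the processes running on this node */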
    for (proc = 0; proc < totps; proc++) {
        if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
            mxlr = max(mxlr, procs[proc]->proc_name.vpid);
        }
    }

    /* Setup the endpoint options and local addresses to bind to. */
#if MXM_API < MXM_VERSION(2,0)
    ptl_bitmap = ompi_mtl_mxm.mxm_ctx_opts->ptl_bitmap;
#else
    ptl_bitmap = 0;
#endif

    /* Open MXM endpoint */
    err = ompi_mtl_mxm_create_ep(ompi_mtl_mxm.mxm_context, &ompi_mtl_mxm.ep,
                                 ptl_bitmap, lr, jobid, mxlr, nlps);
    if (MXM_OK != err) {
        opal_show_help("help-mtl-mxm.txt", "unable to create endpoint", true,
                       mxm_error_string(err));
        return OMPI_ERROR;
    }

    /*
     * Get address for each PTL on this endpoint, and share it with other ranks.
     */
#if MXM_API < MXM_VERSION(2,0)
    if ((ptl_bitmap & MXM_BIT(MXM_PTL_SELF)) &&
        OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_SELF)) {
        return OMPI_ERROR;
    }
    if ((ptl_bitmap & MXM_BIT(MXM_PTL_RDMA)) &&
        OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_RDMA)) {
        return OMPI_ERROR;
    }
    if ((ptl_bitmap & MXM_BIT(MXM_PTL_SHM)) &&
        OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_SHM)) {
        return OMPI_ERROR;
    }

    ep_address = &ep_info;
    ep_address_len = sizeof(ep_info);
#else
    rc = ompi_mtl_mxm_get_ep_address(&ep_address, &ep_address_len);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }
#endif

    rc = ompi_mtl_mxm_send_ep_address(ep_address, ep_address_len);
    if (OMPI_SUCCESS != rc) {
        MXM_ERROR("Modex session failed.");
        return rc;
    }

#if MXM_API >= MXM_VERSION(2,0)
    free(ep_address);
#endif

    /* Register the MXM progress function */
    opal_progress_register(ompi_mtl_mxm_progress);

#if MXM_API >= MXM_VERSION(2,0)
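    /* MXM 2.0+ may cache memory registrations, so hook the allocator's
     * release events and invalidate cached mappings (see
     * ompi_mtl_mxm_mem_release_cb) before memory is returned to the OS. */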
    if (ompi_mtl_mxm.using_mem_hooks) {
        opal_mem_hooks_register_release(ompi_mtl_mxm_mem_release_cb, NULL);
    }
#endif

    return OMPI_SUCCESS;
}

int ompi_mtl_mxm_finalize(struct mca_mtl_base_module_t* mtl)
{
#if MXM_API >= MXM_VERSION(2,0)
    if (ompi_mtl_mxm.using_mem_hooks) {
        opal_mem_hooks_unregister_release(ompi_mtl_mxm_mem_release_cb);
    }
#endif
    opal_progress_unregister(ompi_mtl_mxm_progress);
    mxm_ep_destroy(ompi_mtl_mxm.ep);
    return OMPI_SUCCESS;
}

int ompi_mtl_mxm_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs,
                           struct ompi_proc_t** procs)
{
#if MXM_API < MXM_VERSION(2,0)
    ompi_mtl_mxm_ep_conn_info_t *ep_info;
    mxm_conn_req_t *conn_reqs;
#endif
    void *ep_address;
    size_t ep_address_len;
    mxm_error_t err;
    size_t i;
    int rc;
    mca_mtl_mxm_endpoint_t *endpoint;

    assert(mtl == &ompi_mtl_mxm.super);

#if MXM_API < MXM_VERSION(2,0)
    /* Allocate connection requests */
    conn_reqs = calloc(nprocs, sizeof(mxm_conn_req_t));
    ep_info = calloc(nprocs, sizeof(ompi_mtl_mxm_ep_conn_info_t));
    if (NULL == conn_reqs || NULL == ep_info) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto bail;
    }
#endif

    /* Get the EP connection requests for all the processes from modex */
    for (i = 0; i < nprocs; ++i) {
        rc = ompi_mtl_mxm_recv_ep_address(procs[i], &ep_address, &ep_address_len);
        if (rc != OMPI_SUCCESS) {
            goto bail;
        }

#if MXM_API < MXM_VERSION(2,0)
        if (ep_address_len != sizeof(ep_info[i])) {
            MXM_ERROR("Invalid endpoint address length");
            free(ep_address);  /* the per-iteration free below is skipped on this path */
            rc = OMPI_ERROR;
            goto bail;
        }

        memcpy(&ep_info[i], ep_address, ep_address_len);
        conn_reqs[i].ptl_addr[MXM_PTL_SELF] = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_SELF]);
        conn_reqs[i].ptl_addr[MXM_PTL_SHM]  = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_SHM]);
        conn_reqs[i].ptl_addr[MXM_PTL_RDMA] = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_RDMA]);
#else
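        /* With MXM 2.0+, connect to the peer right away and hang the
         * resulting endpoint object off its proc structure. */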
        endpoint = OBJ_NEW(mca_mtl_mxm_endpoint_t);
        endpoint->mtl_mxm_module = &ompi_mtl_mxm;
        err = mxm_ep_connect(ompi_mtl_mxm.ep, ep_address, &endpoint->mxm_conn);
        if (err != MXM_OK) {
            MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
            free(ep_address);  /* the per-iteration free below is skipped on this path */
            rc = OMPI_ERROR;
            goto bail;
        }
        procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint;
#endif
        free(ep_address);
    }

#if MXM_API < MXM_VERSION(2,0)
    /* Connect to remote peers */
    err = mxm_ep_connect(ompi_mtl_mxm.ep, conn_reqs, nprocs, -1);
    if (MXM_OK != err) {
        MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
        for (i = 0; i < nprocs; ++i) {
            if (MXM_OK != conn_reqs[i].error) {
                MXM_ERROR("MXM EP connect to %s error: %s\n",
                          (NULL == procs[i]->proc_hostname) ?
                          "unknown" : procs[i]->proc_hostname,
                          mxm_error_string(conn_reqs[i].error));
            }
        }
        rc = OMPI_ERROR;
        goto bail;
    }

    /* Save returned connections */
    for (i = 0; i < nprocs; ++i) {
        endpoint = OBJ_NEW(mca_mtl_mxm_endpoint_t);
        endpoint->mtl_mxm_module = &ompi_mtl_mxm;
        endpoint->mxm_conn = conn_reqs[i].conn;
        procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint;
    }
#endif

    rc = OMPI_SUCCESS;

bail:
#if MXM_API < MXM_VERSION(2,0)
    free(conn_reqs);
    free(ep_info);
#endif
    return rc;
}

int ompi_mtl_mxm_del_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs,
                           struct ompi_proc_t** procs)
{
    size_t i;

    for (i = 0; i < nprocs; ++i) {
        mca_mtl_mxm_endpoint_t *endpoint = (mca_mtl_mxm_endpoint_t*)
            procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
        mxm_ep_disconnect(endpoint->mxm_conn);
        OBJ_RELEASE(endpoint);
    }
    return OMPI_SUCCESS;
}
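
/* Each communicator is backed by its own MXM matching queue (MQ), created
 * with the communicator's context id, so tag matching stays scoped to the
 * communicator. */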
int ompi_mtl_mxm_add_comm(struct mca_mtl_base_module_t *mtl,
                          struct ompi_communicator_t *comm)
{
    mxm_error_t err;
    mxm_mq_h mq;

    assert(mtl == &ompi_mtl_mxm.super);
    assert(NULL != ompi_mtl_mxm.mxm_context);

    err = mxm_mq_create(ompi_mtl_mxm.mxm_context, comm->c_contextid, &mq);
    if (MXM_OK != err) {
        opal_show_help("help-mtl-mxm.txt", "mxm mq create", true, mxm_error_string(err));
        return OMPI_ERROR;
    }

    comm->c_pml_comm = (void*)mq;
    return OMPI_SUCCESS;
}

int ompi_mtl_mxm_del_comm(struct mca_mtl_base_module_t *mtl,
                          struct ompi_communicator_t *comm)
{
    assert(mtl == &ompi_mtl_mxm.super);
    if (NULL != ompi_mtl_mxm.mxm_context) {
        mxm_mq_destroy((mxm_mq_h)comm->c_pml_comm);
    }
    return OMPI_SUCCESS;
}

int ompi_mtl_mxm_progress(void)
{
    mxm_error_t err;

    err = mxm_progress(ompi_mtl_mxm.mxm_context);
    if ((MXM_OK != err) && (MXM_ERR_NO_PROGRESS != err)) {
        opal_show_help("help-mtl-mxm.txt", "errors during mxm_progress", true, mxm_error_string(err));
    }
    return 1;
}

#if MXM_API >= MXM_VERSION(2,0)
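/* Memory-hook release callback: drop the released range from MXM's
 * registration cache. MXM_MEM_UNMAP_MARK_INVALID additionally marks
 * allocator-owned memory as invalid (flag semantics assumed from its name). */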
static void ompi_mtl_mxm_mem_release_cb(void *buf, size_t length,
                                        void *cbdata, bool from_alloc)
{
    mxm_mem_unmap(ompi_mtl_mxm.mxm_context, buf, length,
                  from_alloc ? MXM_MEM_UNMAP_MARK_INVALID : 0);
}
#endif

OBJ_CLASS_INSTANCE(
    ompi_mtl_mxm_message_t,
    ompi_free_list_item_t,
    NULL,
    NULL);