0e878c1ac3
Signed-off-by: Ralph Castain <rhc@pmix.org>
1315 строки
48 KiB
C
1315 строки
48 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2017 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007-2018 Cisco Systems, Inc. All rights reserved
|
|
* Copyright (c) 2006-2009 University of Houston. All rights reserved.
|
|
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
|
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
|
|
* Copyright (c) 2014-2017 Research Organization for Information Science
|
|
* and Technology (RIST). All rights reserved.
|
|
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "ompi_config.h"
|
|
#include "ompi/constants.h"
|
|
|
|
#include <string.h>
|
|
#include <stdio.h>
|
|
#include <ctype.h>
|
|
#include <time.h>
|
|
#if HAVE_SYS_TIME_H
|
|
#include <sys/time.h>
|
|
#endif
|
|
|
|
#include "opal/util/alfg.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/opal_getcwd.h"
|
|
#include "opal/util/proc.h"
|
|
#include "opal/util/show_help.h"
|
|
#include "opal/util/printf.h"
|
|
#include "opal/dss/dss.h"
|
|
#include "opal/mca/hwloc/base/base.h"
|
|
#include "opal/mca/pmix/pmix.h"
|
|
|
|
#include "ompi/communicator/communicator.h"
|
|
#include "ompi/group/group.h"
|
|
#include "ompi/proc/proc.h"
|
|
#include "ompi/mca/pml/pml.h"
|
|
#include "ompi/mca/rte/rte.h"
|
|
#include "ompi/info/info.h"
|
|
|
|
#include "ompi/dpm/dpm.h"
|
|
|
|
static opal_rng_buff_t rnd;
|
|
|
|
typedef struct {
|
|
ompi_communicator_t *comm;
|
|
int size;
|
|
struct ompi_request_t **reqs;
|
|
int buf;
|
|
} ompi_dpm_disconnect_obj;
|
|
static int disconnect_waitall (int count, ompi_dpm_disconnect_obj **objs);
|
|
static ompi_dpm_disconnect_obj *disconnect_init(ompi_communicator_t *comm);
|
|
|
|
typedef struct {
|
|
opal_list_item_t super;
|
|
ompi_proc_t *p;
|
|
} ompi_dpm_proct_caddy_t;
|
|
static OBJ_CLASS_INSTANCE(ompi_dpm_proct_caddy_t,
|
|
opal_list_item_t,
|
|
NULL, NULL);
|
|
|
|
/*
|
|
* Init the module
|
|
*/
|
|
int ompi_dpm_init(void)
|
|
{
|
|
time_t now;
|
|
|
|
/* seed our random number generator */
|
|
now = time(NULL);
|
|
if (!opal_srand(&rnd, now)) {
|
|
return OMPI_ERROR;
|
|
}
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
|
|
const char *port_string, bool send_first,
|
|
ompi_communicator_t **newcomm)
|
|
{
|
|
int k, size, rsize, rank, rc, rportlen=0;
|
|
char **members = NULL, *nstring, *rport=NULL;
|
|
bool dense, isnew;
|
|
opal_process_name_t pname;
|
|
opal_list_t ilist, mlist, rlist;
|
|
opal_value_t info;
|
|
opal_pmix_pdata_t pdat;
|
|
opal_namelist_t *nm;
|
|
opal_jobid_t jobid;
|
|
|
|
ompi_communicator_t *newcomp=MPI_COMM_NULL;
|
|
ompi_proc_t *proc;
|
|
ompi_group_t *group=comm->c_local_group;
|
|
ompi_proc_t **proc_list=NULL, **new_proc_list = NULL;
|
|
int32_t i;
|
|
ompi_group_t *new_group_pointer;
|
|
ompi_dpm_proct_caddy_t *cd;
|
|
|
|
if (NULL == opal_pmix.publish || NULL == opal_pmix.connect ||
|
|
NULL == opal_pmix.unpublish ||
|
|
(NULL == opal_pmix.lookup && NULL == opal_pmix.lookup_nb)) {
|
|
/* print a nice message explaining we don't have support */
|
|
opal_show_help("help-mpi-runtime.txt", "noconxcpt", true);
|
|
return OMPI_ERR_NOT_SUPPORTED;
|
|
}
|
|
if (!ompi_rte_connect_accept_support(port_string)) {
|
|
/* they will have printed the help message */
|
|
return OMPI_ERR_NOT_SUPPORTED;
|
|
}
|
|
|
|
/* set default error return */
|
|
*newcomm = MPI_COMM_NULL;
|
|
|
|
size = ompi_comm_size ( comm );
|
|
rank = ompi_comm_rank ( comm );
|
|
|
|
/* the "send_first" end will append ":connect" to the port name and publish
|
|
* the list of its participating procs on that key. The receiving root proc
|
|
* will append ":accept" to the port name and publish the list of its
|
|
* participants on that key. Each proc will then block waiting for lookup
|
|
* to complete on the other's key. Once that completes, the list of remote
|
|
* procs is used to complete construction of the intercommunicator. */
|
|
|
|
/* everyone constructs the list of members from their communicator */
|
|
if (MPI_COMM_WORLD == comm) {
|
|
pname.jobid = OMPI_PROC_MY_NAME->jobid;
|
|
pname.vpid = OPAL_VPID_WILDCARD;
|
|
rc = opal_convert_process_name_to_string(&nstring, &pname);
|
|
if (OPAL_SUCCESS != rc) {
|
|
return OMPI_ERROR;
|
|
}
|
|
opal_argv_append_nosize(&members, nstring);
|
|
free(nstring);
|
|
/* have to add the number of procs in the job so the remote side
|
|
* can correctly add the procs by computing their names, and our nspace
|
|
* so they can update their records */
|
|
if (NULL == (nstring = (char*)opal_pmix.get_nspace(OMPI_PROC_MY_NAME->jobid))) {
|
|
opal_argv_free(members);
|
|
return OMPI_ERR_NOT_SUPPORTED;
|
|
}
|
|
opal_argv_append_nosize(&members, nstring);
|
|
(void)opal_asprintf(&nstring, "%d", size);
|
|
opal_argv_append_nosize(&members, nstring);
|
|
free(nstring);
|
|
} else {
|
|
if (OMPI_GROUP_IS_DENSE(group)) {
|
|
proc_list = group->grp_proc_pointers;
|
|
dense = true;
|
|
} else {
|
|
proc_list = (ompi_proc_t**)calloc(group->grp_proc_count,
|
|
sizeof(ompi_proc_t *));
|
|
for (i=0 ; i<group->grp_proc_count ; i++) {
|
|
if (NULL == (proc_list[i] = ompi_group_peer_lookup(group,i))) {
|
|
OMPI_ERROR_LOG(OMPI_ERR_NOT_FOUND);
|
|
rc = OMPI_ERR_NOT_FOUND;
|
|
free(proc_list);
|
|
goto exit;
|
|
}
|
|
}
|
|
dense = false;
|
|
}
|
|
for (i=0; i < size; i++) {
|
|
opal_process_name_t proc_name;
|
|
if (ompi_proc_is_sentinel (proc_list[i])) {
|
|
proc_name = ompi_proc_sentinel_to_name ((uintptr_t) proc_list[i]);
|
|
} else {
|
|
proc_name = proc_list[i]->super.proc_name;
|
|
}
|
|
rc = opal_convert_process_name_to_string(&nstring, &proc_name);
|
|
if (OPAL_SUCCESS != rc) {
|
|
if (!dense) {
|
|
free(proc_list);
|
|
proc_list = NULL;
|
|
}
|
|
return OMPI_ERROR;
|
|
}
|
|
opal_argv_append_nosize(&members, nstring);
|
|
free(nstring);
|
|
if (NULL == (nstring = (char*)opal_pmix.get_nspace(proc_name.jobid))) {
|
|
opal_argv_free(members);
|
|
free (proc_list);
|
|
return OMPI_ERR_NOT_SUPPORTED;
|
|
}
|
|
opal_argv_append_nosize(&members, nstring);
|
|
}
|
|
if (!dense) {
|
|
free(proc_list);
|
|
proc_list = NULL;
|
|
}
|
|
}
|
|
|
|
if (rank == root) {
|
|
/* the roots for each side exchange their list of participants */
|
|
OBJ_CONSTRUCT(&info, opal_value_t);
|
|
OBJ_CONSTRUCT(&pdat, opal_pmix_pdata_t);
|
|
if (send_first) {
|
|
(void)opal_asprintf(&info.key, "%s:connect", port_string);
|
|
(void)opal_asprintf(&pdat.value.key, "%s:accept", port_string);
|
|
} else {
|
|
(void)opal_asprintf(&info.key, "%s:accept", port_string);
|
|
(void)opal_asprintf(&pdat.value.key, "%s:connect", port_string);
|
|
}
|
|
info.type = OPAL_STRING;
|
|
info.data.string = opal_argv_join(members, ':');
|
|
pdat.value.type = OPAL_STRING;
|
|
|
|
OPAL_PMIX_EXCHANGE(rc, &info, &pdat, 600); // give them 10 minutes
|
|
OBJ_DESTRUCT(&info);
|
|
if (OPAL_SUCCESS != rc) {
|
|
OBJ_DESTRUCT(&pdat);
|
|
return rc;
|
|
}
|
|
|
|
/* save the result */
|
|
rport = strdup(pdat.value.data.string); // need this later
|
|
rportlen = strlen(rport) + 1; // retain the NULL terminator
|
|
OBJ_DESTRUCT(&pdat);
|
|
}
|
|
|
|
/* if we aren't in a comm_spawn, the non-root members won't have
|
|
* the port_string - so let's make sure everyone knows the other
|
|
* side's participants */
|
|
|
|
/* bcast the list-length to all processes in the local comm */
|
|
rc = comm->c_coll->coll_bcast(&rportlen, 1, MPI_INT, root, comm,
|
|
comm->c_coll->coll_bcast_module);
|
|
if (OMPI_SUCCESS != rc) {
|
|
free(rport);
|
|
goto exit;
|
|
}
|
|
|
|
if (rank != root) {
|
|
/* non root processes need to allocate the buffer manually */
|
|
rport = (char*)malloc(rportlen);
|
|
if (NULL == rport) {
|
|
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
|
goto exit;
|
|
}
|
|
}
|
|
/* now share the list of remote participants */
|
|
rc = comm->c_coll->coll_bcast(rport, rportlen, MPI_BYTE, root, comm,
|
|
comm->c_coll->coll_bcast_module);
|
|
if (OMPI_SUCCESS != rc) {
|
|
free(rport);
|
|
goto exit;
|
|
}
|
|
|
|
/* initiate a list of participants for the connect,
|
|
* starting with our own members */
|
|
OBJ_CONSTRUCT(&mlist, opal_list_t);
|
|
for (i=0; NULL != members[i]; i++) {
|
|
nm = OBJ_NEW(opal_namelist_t);
|
|
if (OPAL_SUCCESS != (rc = opal_convert_string_to_process_name(&nm->name, members[i]))) {
|
|
OMPI_ERROR_LOG(rc);
|
|
opal_argv_free(members);
|
|
free(rport);
|
|
OPAL_LIST_DESTRUCT(&mlist);
|
|
goto exit;
|
|
}
|
|
/* step over the nspace */
|
|
++i;
|
|
if (NULL == members[i]) {
|
|
/* this shouldn't happen and is an error */
|
|
OMPI_ERROR_LOG(OMPI_ERR_BAD_PARAM);
|
|
OPAL_LIST_DESTRUCT(&mlist);
|
|
opal_argv_free(members);
|
|
free(rport);
|
|
rc = OMPI_ERR_BAD_PARAM;
|
|
goto exit;
|
|
}
|
|
/* if the rank is wildcard, then we need to add all procs
|
|
* in that job to the list */
|
|
if (OPAL_VPID_WILDCARD == nm->name.vpid) {
|
|
jobid = nm->name.jobid;
|
|
OBJ_RELEASE(nm);
|
|
for (k=0; k < size; k++) {
|
|
nm = OBJ_NEW(opal_namelist_t);
|
|
nm->name.jobid = jobid;
|
|
nm->name.vpid = k;
|
|
opal_list_append(&mlist, &nm->super);
|
|
}
|
|
/* now step over the size */
|
|
if (NULL == members[i+1]) {
|
|
/* this shouldn't happen and is an error */
|
|
OMPI_ERROR_LOG(OMPI_ERR_BAD_PARAM);
|
|
OPAL_LIST_DESTRUCT(&mlist);
|
|
opal_argv_free(members);
|
|
free(rport);
|
|
rc = OMPI_ERR_BAD_PARAM;
|
|
goto exit;
|
|
}
|
|
++i;
|
|
} else {
|
|
opal_list_append(&mlist, &nm->super);
|
|
}
|
|
}
|
|
opal_argv_free(members);
|
|
members = NULL;
|
|
|
|
/* rport contains a colon-delimited list
|
|
* of process names for the remote procs - convert it
|
|
* into an argv array */
|
|
members = opal_argv_split(rport, ':');
|
|
free(rport);
|
|
|
|
/* add the list of remote procs to our list, and
|
|
* keep a list of them for later */
|
|
OBJ_CONSTRUCT(&ilist, opal_list_t);
|
|
OBJ_CONSTRUCT(&rlist, opal_list_t);
|
|
|
|
for (i=0; NULL != members[i]; i++) {
|
|
nm = OBJ_NEW(opal_namelist_t);
|
|
if (OPAL_SUCCESS != (rc = opal_convert_string_to_process_name(&nm->name, members[i]))) {
|
|
OMPI_ERROR_LOG(rc);
|
|
opal_argv_free(members);
|
|
OPAL_LIST_DESTRUCT(&ilist);
|
|
OPAL_LIST_DESTRUCT(&rlist);
|
|
goto exit;
|
|
}
|
|
/* next entry is the nspace - register it */
|
|
++i;
|
|
if (NULL == members[i]) {
|
|
OMPI_ERROR_LOG(OMPI_ERR_NOT_SUPPORTED);
|
|
opal_argv_free(members);
|
|
OPAL_LIST_DESTRUCT(&ilist);
|
|
OPAL_LIST_DESTRUCT(&rlist);
|
|
goto exit;
|
|
}
|
|
opal_pmix.register_jobid(nm->name.jobid, members[i]);
|
|
if (OPAL_VPID_WILDCARD == nm->name.vpid) {
|
|
jobid = nm->name.jobid;
|
|
OBJ_RELEASE(nm);
|
|
/* if the vpid is wildcard, then we are including all ranks
|
|
* of that job, and the next entry in members should be the
|
|
* number of procs in the job */
|
|
if (NULL == members[i+1]) {
|
|
/* just protect against the error */
|
|
OMPI_ERROR_LOG(OMPI_ERR_BAD_PARAM);
|
|
opal_argv_free(members);
|
|
OPAL_LIST_DESTRUCT(&ilist);
|
|
OPAL_LIST_DESTRUCT(&rlist);
|
|
rc = OMPI_ERR_BAD_PARAM;
|
|
goto exit;
|
|
}
|
|
rsize = strtoul(members[i+1], NULL, 10);
|
|
++i;
|
|
for (k=0; k < rsize; k++) {
|
|
nm = OBJ_NEW(opal_namelist_t);
|
|
nm->name.jobid = jobid;
|
|
nm->name.vpid = k;
|
|
opal_list_append(&mlist, &nm->super);
|
|
/* see if this needs to be added to our ompi_proc_t array */
|
|
proc = ompi_proc_find_and_add(&nm->name, &isnew);
|
|
if (isnew) {
|
|
cd = OBJ_NEW(ompi_dpm_proct_caddy_t);
|
|
cd->p = proc;
|
|
opal_list_append(&ilist, &cd->super);
|
|
}
|
|
/* either way, add to the remote list */
|
|
cd = OBJ_NEW(ompi_dpm_proct_caddy_t);
|
|
cd->p = proc;
|
|
opal_list_append(&rlist, &cd->super);
|
|
}
|
|
} else {
|
|
opal_list_append(&mlist, &nm->super);
|
|
/* see if this needs to be added to our ompi_proc_t array */
|
|
proc = ompi_proc_find_and_add(&nm->name, &isnew);
|
|
if (isnew) {
|
|
cd = OBJ_NEW(ompi_dpm_proct_caddy_t);
|
|
cd->p = proc;
|
|
opal_list_append(&ilist, &cd->super);
|
|
}
|
|
/* either way, add to the remote list */
|
|
cd = OBJ_NEW(ompi_dpm_proct_caddy_t);
|
|
cd->p = proc;
|
|
opal_list_append(&rlist, &cd->super);
|
|
}
|
|
}
|
|
opal_argv_free(members);
|
|
|
|
/* tell the host RTE to connect us - this will download
|
|
* all known data for the nspace's of participating procs
|
|
* so that add_procs will not result in a slew of lookups */
|
|
rc = opal_pmix.connect(&mlist);
|
|
OPAL_LIST_DESTRUCT(&mlist);
|
|
if (OPAL_SUCCESS != rc) {
|
|
OMPI_ERROR_LOG(rc);
|
|
OPAL_LIST_DESTRUCT(&ilist);
|
|
OPAL_LIST_DESTRUCT(&rlist);
|
|
goto exit;
|
|
}
|
|
if (0 < opal_list_get_size(&ilist)) {
|
|
uint32_t *peer_ranks = NULL;
|
|
int prn, nprn;
|
|
char *val, *mycpuset;
|
|
uint16_t u16;
|
|
opal_process_name_t wildcard_rank;
|
|
/* convert the list of new procs to a proc_t array */
|
|
new_proc_list = (ompi_proc_t**)calloc(opal_list_get_size(&ilist),
|
|
sizeof(ompi_proc_t *));
|
|
/* get the list of local peers for the new procs */
|
|
cd = (ompi_dpm_proct_caddy_t*)opal_list_get_first(&ilist);
|
|
proc = cd->p;
|
|
wildcard_rank.jobid = proc->super.proc_name.jobid;
|
|
wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
|
|
/* retrieve the local peers */
|
|
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCAL_PEERS,
|
|
&wildcard_rank, &val, OPAL_STRING);
|
|
if (OPAL_SUCCESS == rc && NULL != val) {
|
|
char **peers = opal_argv_split(val, ',');
|
|
free(val);
|
|
nprn = opal_argv_count(peers);
|
|
peer_ranks = (uint32_t*)calloc(nprn, sizeof(uint32_t));
|
|
for (prn = 0; NULL != peers[prn]; prn++) {
|
|
peer_ranks[prn] = strtoul(peers[prn], NULL, 10);
|
|
}
|
|
opal_argv_free(peers);
|
|
}
|
|
|
|
/* get my locality string */
|
|
val = NULL;
|
|
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
|
|
OMPI_PROC_MY_NAME, &val, OPAL_STRING);
|
|
if (OPAL_SUCCESS == rc && NULL != val) {
|
|
mycpuset = val;
|
|
} else {
|
|
mycpuset = NULL;
|
|
}
|
|
|
|
i = 0;
|
|
OPAL_LIST_FOREACH(cd, &ilist, ompi_dpm_proct_caddy_t) {
|
|
opal_value_t *kv;
|
|
proc = cd->p;
|
|
new_proc_list[i] = proc ;
|
|
/* ompi_proc_complete_init_single() initializes and optionally retrieves
|
|
* OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without
|
|
* them, we are just fine */
|
|
ompi_proc_complete_init_single(proc);
|
|
/* if this proc is local, then get its locality */
|
|
if (NULL != peer_ranks) {
|
|
for (prn=0; prn < nprn; prn++) {
|
|
if (peer_ranks[prn] == proc->super.proc_name.vpid) {
|
|
/* get their locality string */
|
|
val = NULL;
|
|
OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, OPAL_PMIX_LOCALITY_STRING,
|
|
&proc->super.proc_name, &val, OPAL_STRING);
|
|
if (OPAL_SUCCESS == rc && NULL != val) {
|
|
u16 = opal_hwloc_compute_relative_locality(mycpuset, val);
|
|
free(val);
|
|
} else {
|
|
/* all we can say is that it shares our node */
|
|
u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
|
|
}
|
|
proc->super.proc_flags = u16;
|
|
/* save the locality for later */
|
|
kv = OBJ_NEW(opal_value_t);
|
|
kv->key = strdup(OPAL_PMIX_LOCALITY);
|
|
kv->type = OPAL_UINT16;
|
|
kv->data.uint16 = proc->super.proc_flags;
|
|
opal_pmix.store_local(&proc->super.proc_name, kv);
|
|
OBJ_RELEASE(kv); // maintain accounting
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
++i;
|
|
}
|
|
if (NULL != mycpuset) {
|
|
free(mycpuset);
|
|
}
|
|
if (NULL != peer_ranks) {
|
|
free(peer_ranks);
|
|
}
|
|
/* call add_procs on the new ones */
|
|
rc = MCA_PML_CALL(add_procs(new_proc_list, opal_list_get_size(&ilist)));
|
|
free(new_proc_list);
|
|
new_proc_list = NULL;
|
|
if (OMPI_SUCCESS != rc) {
|
|
OMPI_ERROR_LOG(rc);
|
|
OPAL_LIST_DESTRUCT(&ilist);
|
|
goto exit;
|
|
}
|
|
}
|
|
OPAL_LIST_DESTRUCT(&ilist);
|
|
|
|
/* now deal with the remote group */
|
|
rsize = opal_list_get_size(&rlist);
|
|
new_group_pointer=ompi_group_allocate(rsize);
|
|
if (NULL == new_group_pointer) {
|
|
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
|
OPAL_LIST_DESTRUCT(&rlist);
|
|
goto exit;
|
|
}
|
|
/* assign group elements */
|
|
i=0;
|
|
OPAL_LIST_FOREACH(cd, &rlist, ompi_dpm_proct_caddy_t) {
|
|
new_group_pointer->grp_proc_pointers[i++] = cd->p;
|
|
/* retain the proc */
|
|
OBJ_RETAIN(cd->p);
|
|
}
|
|
OPAL_LIST_DESTRUCT(&rlist);
|
|
|
|
/* set up communicator structure */
|
|
rc = ompi_comm_set ( &newcomp, /* new comm */
|
|
comm, /* old comm */
|
|
group->grp_proc_count, /* local_size */
|
|
NULL, /* local_procs */
|
|
rsize, /* remote_size */
|
|
NULL , /* remote_procs */
|
|
NULL, /* attrs */
|
|
comm->error_handler, /* error handler */
|
|
NULL, /* topo component */
|
|
group, /* local group */
|
|
new_group_pointer /* remote group */
|
|
);
|
|
if (OMPI_SUCCESS != rc) {
|
|
goto exit;
|
|
}
|
|
|
|
OBJ_RELEASE(new_group_pointer);
|
|
new_group_pointer = MPI_GROUP_NULL;
|
|
|
|
/* allocate comm_cid */
|
|
rc = ompi_comm_nextcid ( newcomp, /* new communicator */
|
|
comm, /* old communicator */
|
|
NULL, /* bridge comm */
|
|
&root, /* local leader */
|
|
(void*)port_string, /* rendezvous point */
|
|
send_first, /* send or recv first */
|
|
OMPI_COMM_CID_INTRA_PMIX); /* mode */
|
|
if (OMPI_SUCCESS != rc) {
|
|
goto exit;
|
|
}
|
|
|
|
/* activate comm and init coll-component */
|
|
rc = ompi_comm_activate ( &newcomp, /* new communicator */
|
|
comm, /* old communicator */
|
|
NULL, /* bridge comm */
|
|
&root, /* local leader */
|
|
(void*)port_string, /* rendezvous point */
|
|
send_first, /* send or recv first */
|
|
OMPI_COMM_CID_INTRA_PMIX); /* mode */
|
|
if (OMPI_SUCCESS != rc) {
|
|
goto exit;
|
|
}
|
|
|
|
/* Question: do we have to re-start some low level stuff
|
|
to enable the usage of fast communication devices
|
|
between the two worlds ?
|
|
*/
|
|
|
|
exit:
|
|
if (OMPI_SUCCESS != rc) {
|
|
if (MPI_COMM_NULL != newcomp && NULL != newcomp) {
|
|
OBJ_RELEASE(newcomp);
|
|
newcomp = MPI_COMM_NULL;
|
|
}
|
|
}
|
|
|
|
*newcomm = newcomp;
|
|
return rc;
|
|
}
|
|
|
|
static int construct_peers(ompi_group_t *group, opal_list_t *peers)
|
|
{
|
|
int i;
|
|
opal_namelist_t *nm, *n2;
|
|
ompi_proc_t *proct;
|
|
opal_process_name_t proc_name;
|
|
|
|
for (i=0; i < group->grp_proc_count; i++) {
|
|
if (OMPI_GROUP_IS_DENSE(group)) {
|
|
proct = group->grp_proc_pointers[i];
|
|
} else {
|
|
proct = ompi_group_peer_lookup(group, i);
|
|
}
|
|
if (NULL == proct) {
|
|
OMPI_ERROR_LOG(OMPI_ERR_NOT_FOUND);
|
|
return OMPI_ERR_NOT_FOUND;
|
|
}
|
|
if (ompi_proc_is_sentinel (proct)) {
|
|
proc_name = ompi_proc_sentinel_to_name ((uintptr_t)proct);
|
|
} else {
|
|
proc_name = proct->super.proc_name;
|
|
}
|
|
|
|
/* add to the list of peers */
|
|
nm = OBJ_NEW(opal_namelist_t);
|
|
nm->name = proc_name;
|
|
/* need to maintain an ordered list to ensure the tracker signatures
|
|
* match across all procs */
|
|
OPAL_LIST_FOREACH(n2, peers, opal_namelist_t) {
|
|
if (opal_compare_proc(nm->name, n2->name) < 0) {
|
|
opal_list_insert_pos(peers, &n2->super, &nm->super);
|
|
nm = NULL;
|
|
break;
|
|
}
|
|
}
|
|
if (NULL != nm) {
|
|
/* append to the end */
|
|
opal_list_append(peers, &nm->super);
|
|
}
|
|
}
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
int ompi_dpm_disconnect(ompi_communicator_t *comm)
|
|
{
|
|
int ret;
|
|
ompi_group_t *group;
|
|
opal_list_t coll;
|
|
|
|
/* Note that we explicitly use an RTE-based barrier (vs. an MPI
|
|
barrier). See a lengthy comment in
|
|
ompi/runtime/ompi_mpi_finalize.c for a much more detailed
|
|
rationale. */
|
|
|
|
/* setup the collective */
|
|
OBJ_CONSTRUCT(&coll, opal_list_t);
|
|
/* RHC: assuming for now that this must flow across all
|
|
* local and remote group members */
|
|
group = comm->c_local_group;
|
|
if (OMPI_SUCCESS != (ret = construct_peers(group, &coll))) {
|
|
OMPI_ERROR_LOG(ret);
|
|
OPAL_LIST_DESTRUCT(&coll);
|
|
return ret;
|
|
}
|
|
/* do the same for the remote group */
|
|
group = comm->c_remote_group;
|
|
if (OMPI_SUCCESS != (ret = construct_peers(group, &coll))) {
|
|
OMPI_ERROR_LOG(ret);
|
|
OPAL_LIST_DESTRUCT(&coll);
|
|
return ret;
|
|
}
|
|
|
|
/* ensure we tell the host RM to disconnect us - this
|
|
* is a blocking operation so just use a fence */
|
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(&coll, false))) {
|
|
OMPI_ERROR_LOG(ret);
|
|
OPAL_LIST_DESTRUCT(&coll);
|
|
return ret;
|
|
}
|
|
OPAL_LIST_DESTRUCT(&coll);
|
|
|
|
return ret;
|
|
}
|
|
|
|
int ompi_dpm_spawn(int count, const char *array_of_commands[],
|
|
char **array_of_argv[],
|
|
const int array_of_maxprocs[],
|
|
const MPI_Info array_of_info[],
|
|
const char *port_name)
|
|
{
|
|
int rc, i, j;
|
|
int have_wdir=0;
|
|
int flag=0;
|
|
char cwd[OPAL_PATH_MAX];
|
|
char host[OPAL_MAX_INFO_VAL]; /*** should define OMPI_HOST_MAX ***/
|
|
char prefix[OPAL_MAX_INFO_VAL];
|
|
char stdin_target[OPAL_MAX_INFO_VAL];
|
|
char params[OPAL_MAX_INFO_VAL];
|
|
char mapper[OPAL_MAX_INFO_VAL];
|
|
char slot_list[OPAL_MAX_INFO_VAL];
|
|
uint32_t ui32;
|
|
bool personality = false;
|
|
opal_jobid_t jobid;
|
|
|
|
opal_list_t apps;
|
|
opal_list_t job_info;
|
|
opal_pmix_app_t *app;
|
|
opal_value_t *info;
|
|
bool local_spawn, non_mpi;
|
|
char **envars;
|
|
|
|
/* parse the info object */
|
|
/* check potentially for:
|
|
- "host": desired host where to spawn the processes
|
|
- "hostfile": hostfile containing hosts where procs are
|
|
to be spawned
|
|
- "add-host": add the specified hosts to the known list
|
|
of available resources and spawn these
|
|
procs on them
|
|
- "add-hostfile": add the hosts in the hostfile to the
|
|
known list of available resources and spawn
|
|
these procs on them
|
|
- "env": a newline-delimited list of envar values to be
|
|
placed into the app's environment (of form "foo=bar")
|
|
- "ompi_prefix": the path to the root of the directory tree where ompi
|
|
executables and libraries can be found on all nodes
|
|
used to spawn these procs
|
|
- "arch": desired architecture
|
|
- "wdir": directory, where executable can be found
|
|
- "path": list of directories where to look for the executable
|
|
- "file": filename, where additional information is provided.
|
|
- "soft": see page 92 of MPI-2.
|
|
- "mapper": indicate the mapper to be used for the job
|
|
- "display_map": display the map of the spawned job
|
|
- "npernode": number of procs/node to spawn
|
|
- "pernode": spawn one proc/node
|
|
- "ppr": spawn specified number of procs per specified object
|
|
- "map_by": specify object by which the procs should be mapped
|
|
- "rank_by": specify object by which the procs should be ranked
|
|
- "bind_to": specify object to which the procs should be bound
|
|
- "ompi_preload_binary": move binaries to nodes prior to execution
|
|
- "ompi_preload_files": move specified files to nodes prior to execution
|
|
- "ompi_non_mpi": spawned job will not call MPI_Init
|
|
- "ompi_param": list of MCA params to be in the spawned job's environment
|
|
- "env": newline (\n) delimited list of envar values to be passed to spawned procs
|
|
*/
|
|
|
|
/* setup the job object */
|
|
OBJ_CONSTRUCT(&job_info, opal_list_t);
|
|
OBJ_CONSTRUCT(&apps, opal_list_t);
|
|
|
|
/* Convert the list of commands to list of opal_pmix_app_t */
|
|
for (i = 0; i < count; ++i) {
|
|
app = OBJ_NEW(opal_pmix_app_t);
|
|
if (NULL == app) {
|
|
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
|
|
OPAL_LIST_DESTRUCT(&apps);
|
|
opal_progress_event_users_decrement();
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
/* add the app to the job data */
|
|
opal_list_append(&apps, &app->super);
|
|
|
|
/* copy over the name of the executable */
|
|
app->cmd = strdup(array_of_commands[i]);
|
|
opal_argv_append_nosize(&app->argv, app->cmd);
|
|
|
|
/* record the number of procs to be generated */
|
|
app->maxprocs = array_of_maxprocs[i];
|
|
|
|
/* copy over the argv array */
|
|
if (MPI_ARGVS_NULL != array_of_argv &&
|
|
MPI_ARGV_NULL != array_of_argv[i]) {
|
|
for (j=0; NULL != array_of_argv[i][j]; j++) {
|
|
opal_argv_append_nosize(&app->argv, array_of_argv[i][j]);
|
|
}
|
|
}
|
|
|
|
/* Add environment variable with the contact information for the
|
|
child processes.
|
|
*/
|
|
opal_setenv("OMPI_PARENT_PORT", port_name, true, &app->env);
|
|
for (j = 0; NULL != environ[j]; ++j) {
|
|
if (0 == strncmp(OPAL_MCA_PREFIX, environ[j], strlen(OPAL_MCA_PREFIX))) {
|
|
opal_argv_append_nosize(&app->env, environ[j]);
|
|
}
|
|
}
|
|
|
|
/* Check for well-known info keys */
|
|
have_wdir = 0;
|
|
if ( array_of_info != NULL && array_of_info[i] != MPI_INFO_NULL ) {
|
|
|
|
/* check for personality - this is a job-level key */
|
|
ompi_info_get (array_of_info[i], "personality", sizeof(host) - 1, host, &flag);
|
|
if ( flag ) {
|
|
personality = true;
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_PERSONALITY);
|
|
opal_value_load(info, host, OPAL_STRING);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
|
|
/* check for 'host' */
|
|
ompi_info_get (array_of_info[i], "host", sizeof(host) - 1, host, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_HOST);
|
|
opal_value_load(info, host, OPAL_STRING);
|
|
opal_list_append(&app->info, &info->super);
|
|
}
|
|
|
|
/* check for 'hostfile' */
|
|
ompi_info_get (array_of_info[i], "hostfile", sizeof(host) - 1, host, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_HOSTFILE);
|
|
opal_value_load(info, host, OPAL_STRING);
|
|
opal_list_append(&app->info, &info->super);
|
|
}
|
|
|
|
/* check for 'add-hostfile' */
|
|
ompi_info_get (array_of_info[i], "add-hostfile", sizeof(host) - 1, host, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_ADD_HOSTFILE);
|
|
opal_value_load(info, host, OPAL_STRING);
|
|
opal_list_append(&app->info, &info->super);
|
|
}
|
|
|
|
/* check for 'add-host' */
|
|
ompi_info_get (array_of_info[i], "add-host", sizeof(host) - 1, host, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_ADD_HOST);
|
|
opal_value_load(info, host, OPAL_STRING);
|
|
opal_list_append(&app->info, &info->super);
|
|
}
|
|
|
|
/* check for env */
|
|
ompi_info_get (array_of_info[i], "env", sizeof(host)-1, host, &flag);
|
|
if ( flag ) {
|
|
envars = opal_argv_split(host, '\n');
|
|
for (j=0; NULL != envars[j]; j++) {
|
|
opal_argv_append_nosize(&app->env, envars[j]);
|
|
}
|
|
opal_argv_free(envars);
|
|
}
|
|
|
|
/* 'path', 'arch', 'file', 'soft' -- to be implemented */
|
|
|
|
/* check for 'ompi_prefix' (OMPI-specific -- to effect the same
|
|
* behavior as --prefix option to orterun)
|
|
*
|
|
* This is a job-level key
|
|
*/
|
|
ompi_info_get (array_of_info[i], "ompi_prefix", sizeof(prefix) - 1, prefix, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_PREFIX);
|
|
opal_value_load(info, prefix, OPAL_STRING);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
|
|
/* check for 'wdir' */
|
|
ompi_info_get (array_of_info[i], "wdir", sizeof(cwd) - 1, cwd, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_WDIR);
|
|
opal_value_load(info, cwd, OPAL_STRING);
|
|
opal_list_append(&app->info, &info->super);
|
|
have_wdir = 1;
|
|
}
|
|
|
|
/* check for 'mapper' - a job-level key */
|
|
ompi_info_get(array_of_info[i], "mapper", sizeof(mapper) - 1, mapper, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_MAPPER);
|
|
opal_value_load(info, mapper, OPAL_STRING);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
|
|
/* check for 'display_map' - a job-level key */
|
|
ompi_info_get_bool(array_of_info[i], "display_map", &local_spawn, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_DISPLAY_MAP);
|
|
opal_value_load(info, &local_spawn, OPAL_BOOL);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
|
|
/* check for 'npernode' and 'ppr' - job-level key */
|
|
ompi_info_get (array_of_info[i], "npernode", sizeof(slot_list) - 1, slot_list, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_PPR);
|
|
info->type = OPAL_STRING;
|
|
(void)opal_asprintf(&(info->data.string), "%s:n", slot_list);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
ompi_info_get (array_of_info[i], "pernode", sizeof(slot_list) - 1, slot_list, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_PPR);
|
|
opal_value_load(info, "1:n", OPAL_STRING);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
ompi_info_get (array_of_info[i], "ppr", sizeof(slot_list) - 1, slot_list, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_PPR);
|
|
opal_value_load(info, slot_list, OPAL_STRING);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
|
|
/* check for 'map_by' - job-level key */
|
|
ompi_info_get(array_of_info[i], "map_by", sizeof(slot_list) - 1, slot_list, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_MAPBY);
|
|
opal_value_load(info, slot_list, OPAL_STRING);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
|
|
/* check for 'rank_by' - job-level key */
|
|
ompi_info_get(array_of_info[i], "rank_by", sizeof(slot_list) - 1, slot_list, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_RANKBY);
|
|
opal_value_load(info, slot_list, OPAL_STRING);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
|
|
/* check for 'bind_to' - job-level key */
|
|
ompi_info_get(array_of_info[i], "bind_to", sizeof(slot_list) - 1, slot_list, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_BINDTO);
|
|
opal_value_load(info, slot_list, OPAL_STRING);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
|
|
/* check for 'preload_binary' - job-level key */
|
|
ompi_info_get_bool(array_of_info[i], "ompi_preload_binary", &local_spawn, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_PRELOAD_BIN);
|
|
opal_value_load(info, &local_spawn, OPAL_BOOL);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
|
|
/* check for 'preload_files' - job-level key */
|
|
ompi_info_get (array_of_info[i], "ompi_preload_files", sizeof(cwd) - 1, cwd, &flag);
|
|
if ( flag ) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_PRELOAD_FILES);
|
|
opal_value_load(info, cwd, OPAL_STRING);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
|
|
/* see if this is a non-mpi job - if so, then set the flag so ORTE
|
|
* knows what to do - job-level key
|
|
*/
|
|
ompi_info_get_bool(array_of_info[i], "ompi_non_mpi", &non_mpi, &flag);
|
|
if (flag && non_mpi) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_NON_PMI);
|
|
opal_value_load(info, &non_mpi, OPAL_BOOL);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
|
|
/* see if this is an MCA param that the user wants applied to the child job */
|
|
ompi_info_get (array_of_info[i], "ompi_param", sizeof(params) - 1, params, &flag);
|
|
if ( flag ) {
|
|
opal_argv_append_unique_nosize(&app->env, params, true);
|
|
}
|
|
|
|
/* see if user specified what to do with stdin - defaults to
|
|
* not forwarding stdin to child processes - job-level key
|
|
*/
|
|
ompi_info_get (array_of_info[i], "ompi_stdin_target", sizeof(stdin_target) - 1, stdin_target, &flag);
|
|
if ( flag ) {
|
|
if (0 == strcmp(stdin_target, "all")) {
|
|
ui32 = OPAL_VPID_WILDCARD;
|
|
} else if (0 == strcmp(stdin_target, "none")) {
|
|
ui32 = OPAL_VPID_INVALID;
|
|
} else {
|
|
ui32 = strtoul(stdin_target, NULL, 10);
|
|
}
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_STDIN_TGT);
|
|
opal_value_load(info, &ui32, OPAL_UINT32);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
}
|
|
|
|
/* default value: If the user did not tell us where to look for the
|
|
* executable, we assume the current working directory
|
|
*/
|
|
if ( !have_wdir ) {
|
|
if (OMPI_SUCCESS != (rc = opal_getcwd(cwd, OPAL_PATH_MAX))) {
|
|
OMPI_ERROR_LOG(rc);
|
|
OPAL_LIST_DESTRUCT(&apps);
|
|
opal_progress_event_users_decrement();
|
|
return rc;
|
|
}
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_WDIR);
|
|
opal_value_load(info, cwd, OPAL_STRING);
|
|
opal_list_append(&app->info, &info->super);
|
|
}
|
|
|
|
/* leave the map info alone - the launcher will
|
|
* decide where to put things
|
|
*/
|
|
} /* for (i = 0 ; i < count ; ++i) */
|
|
|
|
/* default the personality - job-level key */
|
|
if (!personality) {
|
|
info = OBJ_NEW(opal_value_t);
|
|
info->key = strdup(OPAL_PMIX_PERSONALITY);
|
|
opal_value_load(info, "ompi", OPAL_STRING);
|
|
opal_list_append(&job_info, &info->super);
|
|
}
|
|
|
|
/* spawn procs */
|
|
rc = opal_pmix.spawn(&job_info, &apps, &jobid);
|
|
OPAL_LIST_DESTRUCT(&job_info);
|
|
OPAL_LIST_DESTRUCT(&apps);
|
|
|
|
if (OPAL_SUCCESS != rc) {
|
|
opal_progress_event_users_decrement();
|
|
return MPI_ERR_SPAWN;
|
|
}
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/* Create a rendezvous tag consisting of our name + a random number */
|
|
int ompi_dpm_open_port(char *port_name)
|
|
{
|
|
uint32_t r;
|
|
char *tmp;
|
|
|
|
r = opal_rand(&rnd);
|
|
opal_convert_process_name_to_string(&tmp, OMPI_PROC_MY_NAME);
|
|
snprintf(port_name, MPI_MAX_PORT_NAME-1, "%s:%u", tmp, r);
|
|
port_name[MPI_MAX_PORT_NAME - 1] = '\0';
|
|
free(tmp);
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
int ompi_dpm_close_port(const char *port_name)
|
|
{
|
|
/* nothing to do here - user is responsible for the memory */
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
int ompi_dpm_dyn_init(void)
|
|
{
|
|
int root=0, rc;
|
|
bool send_first = true;
|
|
ompi_communicator_t *newcomm=NULL;
|
|
char *port_name=NULL, *tmp, *ptr;
|
|
|
|
/* check for appropriate env variable */
|
|
tmp = getenv("OMPI_PARENT_PORT");
|
|
if (NULL == tmp) {
|
|
/* nothing to do */
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/* the value passed to us may have quote marks around it to protect
|
|
* the value if passed on the command line. We must remove those
|
|
* to have a correct string
|
|
*/
|
|
if ('"' == tmp[0]) {
|
|
/* if the first char is a quote, then so will the last one be */
|
|
tmp[strlen(tmp)-1] = '\0';
|
|
ptr = &tmp[1];
|
|
} else {
|
|
ptr = &tmp[0];
|
|
}
|
|
port_name = strdup(ptr);
|
|
|
|
rc = ompi_dpm_connect_accept(MPI_COMM_WORLD, root, port_name, send_first, &newcomm);
|
|
free(port_name);
|
|
if (OMPI_SUCCESS != rc) {
|
|
return rc;
|
|
}
|
|
|
|
/* originally, we set comm_parent to comm_null (in comm_init),
|
|
* now we have to decrease the reference counters to the according
|
|
* objects
|
|
*/
|
|
OBJ_RELEASE(ompi_mpi_comm_parent->c_local_group);
|
|
OBJ_RELEASE(ompi_mpi_comm_parent->error_handler);
|
|
OBJ_RELEASE(ompi_mpi_comm_parent);
|
|
|
|
/* Set the parent communicator */
|
|
ompi_mpi_comm_parent = newcomm;
|
|
|
|
/* Set name for debugging purposes */
|
|
snprintf(newcomm->c_name, MPI_MAX_OBJECT_NAME, "MPI_COMM_PARENT");
|
|
newcomm->c_flags |= OMPI_COMM_NAMEISSET;
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
|
|
/*
|
|
* finalize the module
|
|
*/
|
|
int ompi_dpm_finalize(void)
|
|
{
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
|
|
/**********************************************************************/
|
|
/**********************************************************************/
|
|
/**********************************************************************/
|
|
/* this routine runs through the list of communicators
|
|
and does the disconnect for all dynamic communicators */
|
|
int ompi_dpm_dyn_finalize(void)
|
|
{
|
|
int i,j=0, max=0;
|
|
ompi_dpm_disconnect_obj **objs=NULL;
|
|
ompi_communicator_t *comm=NULL;
|
|
|
|
if (1 <ompi_comm_num_dyncomm) {
|
|
objs = (ompi_dpm_disconnect_obj**)malloc(ompi_comm_num_dyncomm*
|
|
sizeof(ompi_dpm_disconnect_obj*));
|
|
if (NULL == objs) {
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
max = opal_pointer_array_get_size(&ompi_mpi_communicators);
|
|
for (i=3; i<max; i++) {
|
|
comm = (ompi_communicator_t*)opal_pointer_array_get_item(&ompi_mpi_communicators,i);
|
|
if (NULL != comm && OMPI_COMM_IS_DYNAMIC(comm)) {
|
|
objs[j++] = disconnect_init(comm);
|
|
}
|
|
}
|
|
|
|
if (j != ompi_comm_num_dyncomm+1) {
|
|
free(objs);
|
|
return OMPI_ERROR;
|
|
}
|
|
|
|
disconnect_waitall(ompi_comm_num_dyncomm, objs);
|
|
free(objs);
|
|
}
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/* the next two routines implement a kind of non-blocking barrier.
|
|
the only difference is, that you can wait for the completion
|
|
of more than one initiated ibarrier. This is required for waiting
|
|
for all still connected processes in MPI_Finalize.
|
|
|
|
disconnect_init returns a handle, which has to be passed in
|
|
to disconnect_waitall. The second routine blocks, until
|
|
all non-blocking barriers described by the handles are finished.
|
|
The communicators can than be released.
|
|
*/
|
|
/**********************************************************************/
|
|
/**********************************************************************/
|
|
/**********************************************************************/
|
|
|
|
static ompi_dpm_disconnect_obj *disconnect_init(ompi_communicator_t *comm)
|
|
{
|
|
ompi_dpm_disconnect_obj *obj=NULL;
|
|
int ret;
|
|
int i;
|
|
|
|
obj = (ompi_dpm_disconnect_obj*)calloc(1,sizeof(ompi_dpm_disconnect_obj));
|
|
if (NULL == obj) {
|
|
opal_output(0, "Could not allocate disconnect object");
|
|
return NULL;
|
|
}
|
|
|
|
if (OMPI_COMM_IS_INTER(comm)) {
|
|
obj->size = ompi_comm_remote_size(comm);
|
|
} else {
|
|
obj->size = ompi_comm_size(comm);
|
|
}
|
|
|
|
obj->comm = comm;
|
|
obj->reqs = (ompi_request_t**)malloc(2*obj->size*sizeof(ompi_request_t *));
|
|
if (NULL == obj->reqs) {
|
|
opal_output(0, "Could not allocate request array for disconnect object");
|
|
free(obj);
|
|
return NULL;
|
|
}
|
|
|
|
/* initiate all isend_irecvs. We use a dummy buffer stored on
|
|
the object, since we are sending zero size messages anyway. */
|
|
for (i=0; i < obj->size; i++) {
|
|
ret = MCA_PML_CALL(irecv(&(obj->buf), 0, MPI_INT, i,
|
|
OMPI_COMM_BARRIER_TAG, comm,
|
|
&(obj->reqs[2*i])));
|
|
|
|
if (OMPI_SUCCESS != ret) {
|
|
opal_output(0, "dpm_disconnect_init: error %d in irecv to process %d", ret, i);
|
|
free(obj->reqs);
|
|
free(obj);
|
|
return NULL;
|
|
}
|
|
ret = MCA_PML_CALL(isend(&(obj->buf), 0, MPI_INT, i,
|
|
OMPI_COMM_BARRIER_TAG,
|
|
MCA_PML_BASE_SEND_SYNCHRONOUS,
|
|
comm, &(obj->reqs[2*i+1])));
|
|
|
|
if (OMPI_SUCCESS != ret) {
|
|
opal_output(0, "dpm_disconnect_init: error %d in isend to process %d", ret, i);
|
|
free(obj->reqs);
|
|
free(obj);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
/* return handle */
|
|
return obj;
|
|
}
|
|
/**********************************************************************/
|
|
/**********************************************************************/
|
|
/**********************************************************************/
|
|
/* - count how many requests are active
|
|
* - generate a request array large enough to hold
|
|
all active requests
|
|
* - call waitall on the overall request array
|
|
* - free the objects
|
|
*/
|
|
static int disconnect_waitall (int count, ompi_dpm_disconnect_obj **objs)
|
|
{
|
|
|
|
ompi_request_t **reqs=NULL;
|
|
char *treq=NULL;
|
|
int totalcount = 0;
|
|
int i;
|
|
int ret;
|
|
|
|
for (i=0; i<count; i++) {
|
|
if (NULL == objs[i]) {
|
|
opal_output(0, "Error in comm_disconnect_waitall");
|
|
return OMPI_ERROR;
|
|
}
|
|
|
|
totalcount += objs[i]->size;
|
|
}
|
|
|
|
reqs = (ompi_request_t**)malloc(2*totalcount*sizeof(ompi_request_t *));
|
|
if (NULL == reqs) {
|
|
opal_output(0, "ompi_comm_disconnect_waitall: error allocating memory");
|
|
return OMPI_ERROR;
|
|
}
|
|
|
|
/* generate a single, large array of pending requests */
|
|
treq = (char *)reqs;
|
|
for (i=0; i<count; i++) {
|
|
memcpy(treq, objs[i]->reqs, 2*objs[i]->size * sizeof(ompi_request_t *));
|
|
treq += 2*objs[i]->size * sizeof(ompi_request_t *);
|
|
}
|
|
|
|
/* force all non-blocking all-to-alls to finish */
|
|
ret = ompi_request_wait_all(2*totalcount, reqs, MPI_STATUSES_IGNORE);
|
|
|
|
/* Finally, free everything */
|
|
for (i=0; i< count; i++ ) {
|
|
if (NULL != objs[i]->reqs ) {
|
|
free(objs[i]->reqs );
|
|
}
|
|
free(objs[i]);
|
|
}
|
|
|
|
free(reqs);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/**********************************************************************/
|
|
/**********************************************************************/
|
|
/**********************************************************************/
|
|
static bool ompi_dpm_group_is_dyn (ompi_group_t *group, ompi_jobid_t thisjobid)
|
|
{
|
|
int size = group ? ompi_group_size (group) : 0;
|
|
|
|
for (int i = 0 ; i < size ; ++i) {
|
|
opal_process_name_t name = ompi_group_get_proc_name (group, i);
|
|
|
|
if (thisjobid != ((ompi_process_name_t *) &name)->jobid) {
|
|
/* at least one is different */
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/* All we want to do in this function is determine if the number of
|
|
* jobids in the local and/or remote group is > 1. This tells us to
|
|
* set the disconnect flag. We don't actually care what the true
|
|
* number -is-, only that it is > 1
|
|
*/
|
|
void ompi_dpm_mark_dyncomm(ompi_communicator_t *comm)
|
|
{
|
|
bool found;
|
|
ompi_jobid_t thisjobid;
|
|
|
|
/* special case for MPI_COMM_NULL */
|
|
if (comm == MPI_COMM_NULL) {
|
|
return;
|
|
}
|
|
|
|
thisjobid = ompi_group_get_proc_name (comm->c_local_group, 0).jobid;
|
|
|
|
/* loop over all processes in local group and check for
|
|
* a different jobid
|
|
*/
|
|
found = ompi_dpm_group_is_dyn (comm->c_local_group, thisjobid);
|
|
if (!found) {
|
|
/* if inter-comm, loop over all processes in remote_group
|
|
* and see if any are different from thisjobid
|
|
*/
|
|
found = ompi_dpm_group_is_dyn (comm->c_remote_group, thisjobid);
|
|
}
|
|
|
|
/* if a different jobid was found, set the disconnect flag*/
|
|
if (found) {
|
|
ompi_comm_num_dyncomm++;
|
|
OMPI_COMM_SET_DYNAMIC(comm);
|
|
}
|
|
}
|