425 строки
15 KiB
C
425 строки
15 KiB
C
/* -*- C -*-
|
|
*
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
/** @file:
|
|
*
|
|
*/
|
|
|
|
/*
|
|
* includes
|
|
*/
|
|
#include "orte_config.h"
|
|
|
|
|
|
#include "opal/dss/dss.h"
|
|
|
|
#include "orte/util/proc_info.h"
|
|
#include "orte/util/error_strings.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/mca/odls/base/base.h"
|
|
#include "orte/mca/rmaps/rmaps_types.h"
|
|
#include "orte/mca/rml/rml.h"
|
|
#include "orte/mca/routed/routed.h"
|
|
#include "orte/mca/state/state.h"
|
|
#include "orte/util/name_fns.h"
|
|
#include "orte/util/nidmap.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
|
|
#include "orte/mca/grpcomm/grpcomm.h"
|
|
#include "orte/mca/grpcomm/base/base.h"
|
|
|
|
static int pack_xcast(orte_grpcomm_signature_t *sig,
|
|
opal_buffer_t *buffer,
|
|
opal_buffer_t *message,
|
|
orte_rml_tag_t tag);
|
|
|
|
static int create_dmns(orte_grpcomm_signature_t *sig,
|
|
orte_vpid_t **dmns, size_t *ndmns);
|
|
|
|
typedef struct {
|
|
opal_object_t super;
|
|
opal_event_t ev;
|
|
orte_grpcomm_signature_t *sig;
|
|
opal_buffer_t *buf;
|
|
orte_grpcomm_cbfunc_t cbfunc;
|
|
void *cbdata;
|
|
} orte_grpcomm_caddy_t;
|
|
static void gccon(orte_grpcomm_caddy_t *p)
|
|
{
|
|
p->sig = NULL;
|
|
p->buf = NULL;
|
|
p->cbfunc = NULL;
|
|
p->cbdata = NULL;
|
|
}
|
|
static OBJ_CLASS_INSTANCE(orte_grpcomm_caddy_t,
|
|
opal_object_t,
|
|
gccon, NULL);
|
|
|
|
int orte_grpcomm_API_xcast(orte_grpcomm_signature_t *sig,
|
|
orte_rml_tag_t tag,
|
|
opal_buffer_t *msg)
|
|
{
|
|
int rc = ORTE_ERROR;
|
|
opal_buffer_t *buf;
|
|
orte_grpcomm_base_active_t *active;
|
|
orte_vpid_t *dmns;
|
|
size_t ndmns;
|
|
|
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
|
|
"%s grpcomm:base:xcast sending %u bytes to tag %ld",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
(NULL == msg) ? 0 : (unsigned int)msg->bytes_used, (long)tag));
|
|
|
|
/* this function does not access any framework-global data, and
|
|
* so it does not require us to push it into the event library */
|
|
|
|
/* prep the output buffer */
|
|
buf = OBJ_NEW(opal_buffer_t);
|
|
|
|
/* create the array of participating daemons */
|
|
if (ORTE_SUCCESS != (rc = create_dmns(sig, &dmns, &ndmns))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
OBJ_RELEASE(buf);
|
|
return rc;
|
|
}
|
|
|
|
/* setup the payload */
|
|
if (ORTE_SUCCESS != (rc = pack_xcast(sig, buf, msg, tag))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
OBJ_RELEASE(buf);
|
|
if (NULL != dmns) {
|
|
free(dmns);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/* cycle thru the actives and see who can send it */
|
|
OPAL_LIST_FOREACH(active, &orte_grpcomm_base.actives, orte_grpcomm_base_active_t) {
|
|
if (NULL != active->module->xcast) {
|
|
if (ORTE_SUCCESS == (rc = active->module->xcast(dmns, ndmns, buf))) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
OBJ_RELEASE(buf); // if the module needs to keep the buf, it should OBJ_RETAIN it
|
|
if (NULL != dmns) {
|
|
free(dmns);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
static void allgather_stub(int fd, short args, void *cbdata)
|
|
{
|
|
orte_grpcomm_caddy_t *cd = (orte_grpcomm_caddy_t*)cbdata;
|
|
int ret = OPAL_SUCCESS;
|
|
int rc;
|
|
orte_grpcomm_base_active_t *active;
|
|
orte_grpcomm_coll_t *coll;
|
|
void *seq_number;
|
|
|
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
|
|
"%s grpcomm:base:allgather stub",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
|
|
/* retrieve an existing tracker, create it if not
|
|
* already found. The allgather module is responsible
|
|
* for releasing it upon completion of the collective */
|
|
ret = opal_hash_table_get_value_ptr(&orte_grpcomm_base.sig_table, (void *)cd->sig->signature, cd->sig->sz * sizeof(orte_process_name_t), &seq_number);
|
|
if (OPAL_ERR_NOT_FOUND == ret) {
|
|
cd->sig->seq_num = 0;
|
|
} else if (OPAL_SUCCESS == ret) {
|
|
cd->sig->seq_num = *((uint32_t *)(seq_number)) + 1;
|
|
} else {
|
|
OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
|
|
"%s rpcomm:base:allgather can't not get signature from hash table",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
ORTE_ERROR_LOG(ret);
|
|
OBJ_RELEASE(cd);
|
|
return;
|
|
}
|
|
ret = opal_hash_table_set_value_ptr(&orte_grpcomm_base.sig_table, (void *)cd->sig->signature, cd->sig->sz * sizeof(orte_process_name_t), (void *)&cd->sig->seq_num);
|
|
if (OPAL_SUCCESS != ret) {
|
|
OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
|
|
"%s rpcomm:base:allgather can't not add new signature to hash table",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
ORTE_ERROR_LOG(ret);
|
|
OBJ_RELEASE(cd);
|
|
return;
|
|
}
|
|
coll = orte_grpcomm_base_get_tracker(cd->sig, true);
|
|
coll->cbfunc = cd->cbfunc;
|
|
coll->cbdata = cd->cbdata;
|
|
|
|
/* cycle thru the actives and see who can process it */
|
|
OPAL_LIST_FOREACH(active, &orte_grpcomm_base.actives, orte_grpcomm_base_active_t) {
|
|
if (NULL != active->module->allgather) {
|
|
if (ORTE_SUCCESS == (rc = active->module->allgather(coll, cd->buf))) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
OBJ_RELEASE(cd);
|
|
}
|
|
|
|
int orte_grpcomm_API_allgather(orte_grpcomm_signature_t *sig,
|
|
opal_buffer_t *buf,
|
|
orte_grpcomm_cbfunc_t cbfunc,
|
|
void *cbdata)
|
|
{
|
|
orte_grpcomm_caddy_t *cd;
|
|
|
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
|
|
"%s grpcomm:base:allgather",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
|
|
/* must push this into the event library to ensure we can
|
|
* access framework-global data safely */
|
|
cd = OBJ_NEW(orte_grpcomm_caddy_t);
|
|
/* ensure the data doesn't go away */
|
|
OBJ_RETAIN(buf);
|
|
opal_dss.copy((void **)&cd->sig, (void *)sig, ORTE_SIGNATURE);
|
|
cd->buf = buf;
|
|
cd->cbfunc = cbfunc;
|
|
cd->cbdata = cbdata;
|
|
opal_event_set(orte_event_base, &cd->ev, -1, OPAL_EV_WRITE, allgather_stub, cd);
|
|
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
|
|
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
orte_grpcomm_coll_t* orte_grpcomm_base_get_tracker(orte_grpcomm_signature_t *sig, bool create)
|
|
{
|
|
orte_grpcomm_coll_t *coll;
|
|
int rc;
|
|
|
|
/* search the existing tracker list to see if this already exists */
|
|
OPAL_LIST_FOREACH(coll, &orte_grpcomm_base.ongoing, orte_grpcomm_coll_t) {
|
|
if (NULL == sig->signature) {
|
|
if (NULL == coll->sig->signature) {
|
|
/* only one collective can operate at a time
|
|
* across every process in the system */
|
|
return coll;
|
|
}
|
|
/* if only one is NULL, then we can't possibly match */
|
|
break;
|
|
}
|
|
if (OPAL_EQUAL == (rc = opal_dss.compare(sig, coll->sig, ORTE_SIGNATURE))) {
|
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
|
|
"%s grpcomm:base:returning existing collective",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
return coll;
|
|
}
|
|
}
|
|
/* if we get here, then this is a new collective - so create
|
|
* the tracker for it */
|
|
if (!create) {
|
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
|
|
"%s grpcomm:base: not creating new coll",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
|
|
return NULL;
|
|
}
|
|
coll = OBJ_NEW(orte_grpcomm_coll_t);
|
|
opal_dss.copy((void **)&coll->sig, (void *)sig, ORTE_SIGNATURE);
|
|
|
|
if (1 < opal_output_get_verbosity(orte_grpcomm_base_framework.framework_output)) {
|
|
char *tmp=NULL;
|
|
(void)opal_dss.print(&tmp, NULL, coll->sig, ORTE_SIGNATURE);
|
|
opal_output(0, "%s grpcomm:base: creating new coll for%s",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
|
|
free(tmp);
|
|
}
|
|
|
|
opal_list_append(&orte_grpcomm_base.ongoing, &coll->super);
|
|
|
|
/* now get the daemons involved */
|
|
if (ORTE_SUCCESS != (rc = create_dmns(sig, &coll->dmns, &coll->ndmns))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return NULL;
|
|
}
|
|
return coll;
|
|
}
|
|
|
|
static int create_dmns(orte_grpcomm_signature_t *sig,
|
|
orte_vpid_t **dmns, size_t *ndmns)
|
|
{
|
|
size_t n;
|
|
orte_job_t *jdata;
|
|
orte_proc_t *proc;
|
|
orte_node_t *node;
|
|
int i;
|
|
opal_list_t ds;
|
|
orte_namelist_t *nm;
|
|
orte_vpid_t vpid;
|
|
bool found;
|
|
size_t nds;
|
|
orte_vpid_t *dns;
|
|
|
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
|
|
"%s grpcomm:base:create_dmns called with %s signature",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
(NULL == sig->signature) ? "NULL" : "NON-NULL"));
|
|
|
|
/* if NULL == procs, then all daemons are participating */
|
|
if (NULL == sig->signature) {
|
|
*ndmns = orte_process_info.num_procs;
|
|
*dmns = NULL;
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
if (ORTE_VPID_WILDCARD == sig->signature[0].vpid) {
|
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
|
|
"%s grpcomm:base:create_dmns called for all procs in job %s",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_JOBID_PRINT(sig->signature[0].jobid)));
|
|
/* all daemons hosting this jobid are participating */
|
|
if (NULL == (jdata = orte_get_job_data_object(sig->signature[0].jobid))) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
return ORTE_ERR_NOT_FOUND;
|
|
}
|
|
if (NULL == jdata->map) {
|
|
/* we haven't generated a job map yet - if we are the HNP,
|
|
* then we should only involve ourselves. Otherwise, we have
|
|
* no choice but to abort to avoid hangs */
|
|
if (ORTE_PROC_IS_HNP) {
|
|
dns = (orte_vpid_t*)malloc(sizeof(vpid));
|
|
dns[0] = ORTE_PROC_MY_NAME->vpid;
|
|
*ndmns = 1;
|
|
*dmns = dns;
|
|
return ORTE_SUCCESS;
|
|
}
|
|
ORTE_FORCED_TERMINATE(ORTE_ERR_NOT_FOUND);
|
|
*ndmns = 0;
|
|
*dmns = NULL;
|
|
return ORTE_ERR_NOT_FOUND;
|
|
}
|
|
/* get the array */
|
|
dns = (orte_vpid_t*)malloc(jdata->map->num_nodes * sizeof(vpid));
|
|
nds = 0;
|
|
for (i=0; i < jdata->map->nodes->size && (int)nds < jdata->map->num_nodes; i++) {
|
|
if (NULL == (node = opal_pointer_array_get_item(jdata->map->nodes, i))) {
|
|
continue;
|
|
}
|
|
if (NULL == node->daemon) {
|
|
/* should never happen */
|
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
|
free(dns);
|
|
return ORTE_ERROR;
|
|
}
|
|
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
|
|
"%s grpcomm:base:create_dmns adding daemon %s to array",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_NAME_PRINT(&node->daemon->name)));
|
|
dns[nds++] = node->daemon->name.vpid;
|
|
}
|
|
} else {
|
|
/* lookup the daemon for each proc and add it to the list, checking to
|
|
* ensure any daemon only gets added once. Yes, this isn't a scalable
|
|
* algo - someone can come up with something better! */
|
|
OBJ_CONSTRUCT(&ds, opal_list_t);
|
|
for (n=0; n < sig->sz; n++) {
|
|
if (NULL == (jdata = orte_get_job_data_object(sig->signature[n].jobid))) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
OPAL_LIST_DESTRUCT(&ds);
|
|
return ORTE_ERR_NOT_FOUND;
|
|
}
|
|
opal_output_verbose(5, orte_grpcomm_base_framework.framework_output,
|
|
"%s sign: GETTING PROC OBJECT FOR %s",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_NAME_PRINT(&sig->signature[n]));
|
|
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, sig->signature[n].vpid))) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
OPAL_LIST_DESTRUCT(&ds);
|
|
return ORTE_ERR_NOT_FOUND;
|
|
}
|
|
if (NULL == proc->node || NULL == proc->node->daemon) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
OPAL_LIST_DESTRUCT(&ds);
|
|
ORTE_FORCED_TERMINATE(ORTE_ERR_NOT_FOUND);
|
|
return ORTE_ERR_NOT_FOUND;
|
|
}
|
|
vpid = proc->node->daemon->name.vpid;
|
|
found = false;
|
|
OPAL_LIST_FOREACH(nm, &ds, orte_namelist_t) {
|
|
if (nm->name.vpid == vpid) {
|
|
found = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!found) {
|
|
nm = OBJ_NEW(orte_namelist_t);
|
|
nm->name.vpid = vpid;
|
|
opal_list_append(&ds, &nm->super);
|
|
}
|
|
}
|
|
if (0 == opal_list_get_size(&ds)) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
|
OPAL_LIST_DESTRUCT(&ds);
|
|
return ORTE_ERR_BAD_PARAM;
|
|
}
|
|
dns = (orte_vpid_t*)malloc(opal_list_get_size(&ds) * sizeof(orte_vpid_t));
|
|
nds = 0;
|
|
while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&ds))) {
|
|
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
|
|
"%s grpcomm:base:create_dmns adding daemon %s to array",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_NAME_PRINT(&nm->name)));
|
|
dns[nds++] = nm->name.vpid;
|
|
OBJ_RELEASE(nm);
|
|
}
|
|
OPAL_LIST_DESTRUCT(&ds);
|
|
}
|
|
*dmns = dns;
|
|
*ndmns = nds;
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int pack_xcast(orte_grpcomm_signature_t *sig,
|
|
opal_buffer_t *buffer,
|
|
opal_buffer_t *message,
|
|
orte_rml_tag_t tag)
|
|
{
|
|
int rc;
|
|
|
|
/* pass along the signature */
|
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &sig, 1, ORTE_SIGNATURE))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
goto CLEANUP;
|
|
}
|
|
/* pass the final tag */
|
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tag, 1, ORTE_RML_TAG))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
goto CLEANUP;
|
|
}
|
|
|
|
/* copy the payload into the new buffer - this is non-destructive, so our
|
|
* caller is still responsible for releasing any memory in the buffer they
|
|
* gave to us
|
|
*/
|
|
if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(buffer, message))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
goto CLEANUP;
|
|
}
|
|
|
|
CLEANUP:
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|