
Multiple conduits can exist at the same time, and can even point to the same base transport. Each conduit can have its own characteristics (e.g., flow control) based on the info keys provided to the "open_conduit" call. For ease during the transition period, the "legacy" RML interfaces remain as wrappers over the new conduit-based APIs using a default conduit opened during orte_init - this default conduit is tied to the OOB framework so that current behaviors are preserved. Once the transition has been completed, a one-time cleanup will be done to update all RML calls to the new APIs and the "legacy" interfaces will be deleted. While we are at it: Remove oob/usock component to eliminate the TMPDIR length problem - get all working, including oob_stress
180 строки
5.6 KiB
C
180 строки
5.6 KiB
C
/*
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
/** @file */
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/constants.h"
|
|
#include "orte/types.h"
|
|
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/output.h"
|
|
|
|
#include "opal/dss/dss.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/mca/routed/routed.h"
|
|
#include "orte/util/name_fns.h"
|
|
#include "orte/util/proc_info.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
|
|
#include "orte/mca/rml/rml.h"
|
|
#include "orte/mca/rml/base/rml_contact.h"
|
|
#include "orte/mca/rml/base/base.h"
|
|
|
|
|
|
/**
 * Pack the contact URI of every proc in a job into a buffer.
 *
 * Walks the job's proc array and appends each non-NULL rml_uri to
 * @p data as an OPAL_STRING. Empty array slots and procs without a
 * URI are skipped.
 *
 * @param job   jobid whose procs are to be reported
 * @param data  buffer the URIs are packed into
 *
 * @return ORTE_SUCCESS when the walk completes, ORTE_ERR_BAD_PARAM if
 *         no job data object exists for @p job, otherwise the error
 *         returned by opal_dss.pack.
 */
int orte_rml_base_get_contact_info(orte_jobid_t job, opal_buffer_t *data)
{
    orte_job_t *jobdat;
    orte_proc_t *entry;
    int idx;
    int ret;

    /* resolve the jobid to its job object - an unknown jobid is an error */
    jobdat = orte_get_job_data_object(job);
    if (NULL == jobdat) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* visit every slot of the proc array */
    for (idx = 0; idx < jobdat->procs->size; ++idx) {
        entry = (orte_proc_t*)opal_pointer_array_get_item(jobdat->procs, idx);

        /* nothing to report for an empty slot or a proc lacking a URI */
        if (NULL == entry || NULL == entry->rml_uri) {
            continue;
        }

        ret = opal_dss.pack(data, &entry->rml_uri, 1, OPAL_STRING);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }

    return ORTE_SUCCESS;
}
|
|
|
|
/**
 * Consume a buffer of contact URIs and apply them locally.
 *
 * Strings are unpacked one at a time until the unpack call stops
 * returning ORTE_SUCCESS. Each non-NULL URI is passed to
 * orte_rml.set_contact_info. The process name is parsed out of the
 * first non-NULL URI only; if that name belongs to a different job
 * family than ours, a route to it is registered. Every unpacked entry
 * (NULL or not) is counted, and if we are a daemon, the name is in our
 * own jobid, and the count exceeds our recorded num_procs, the process
 * info counters are raised and the routing plan is refreshed.
 *
 * @param data  buffer holding the packed URI strings
 *
 * @return ORTE_SUCCESS when the buffer was read to its end; otherwise
 *         the error from unpacking, URI parsing, or route update.
 */
int orte_rml_base_update_contact_info(opal_buffer_t* data)
{
    orte_process_name_t sender;
    orte_vpid_t entries = 0;
    orte_std_cntr_t one = 1;
    bool have_sender = false;
    char *uri;
    int ret;

    sender.jobid = ORTE_JOBID_INVALID;

    /* pull strings out of the buffer until unpack reports something
     * other than success */
    for (;;) {
        ret = opal_dss.unpack(data, &uri, &one, OPAL_STRING);
        if (ORTE_SUCCESS != ret) {
            break;
        }

        OPAL_OUTPUT_VERBOSE((5, orte_rml_base_framework.framework_output,
                             "%s rml:base:update:contact:info got uri %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             NULL == uri ? "NULL" : uri));

        /* every entry in the message is counted, even a NULL one; the
         * count is only consulted after the loop finishes normally */
        ++entries;

        if (NULL == uri) {
            continue;
        }

        /* hand the contact info to the RML */
        orte_rml.set_contact_info(uri);

        if (!have_sender) {
            /* all entries in one update belong to a single jobid - the
             * command that builds the message never mixes jobids - so
             * the name only needs to be extracted once
             */
            ret = orte_rml_base_parse_uris(uri, &sender, NULL);
            if (ORTE_SUCCESS == ret) {
                have_sender = true;
                /* a proc from another job family needs a route entry */
                if (ORTE_JOB_FAMILY(sender.jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
                    ret = orte_routed.update_route(&sender, &sender);
                }
            }
            if (ORTE_SUCCESS != ret) {
                /* parsing or routing failed: release the string and bail */
                ORTE_ERROR_LOG(ret);
                free(uri);
                return ret;
            }
        }

        free(uri);
    }

    /* running off the end of the buffer is the expected way to stop;
     * anything else is a genuine failure */
    if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /* Only a daemon that received info about its own jobid, listing more
     * procs than it currently knows about, has anything further to do.
     * Such an update carries contact info for all daemons in the system,
     * meaning the daemon count changed since we were launched.
     */
    if (ORTE_PROC_MY_NAME->jobid != sender.jobid ||
        !(ORTE_PROC_IS_DAEMON) ||
        orte_process_info.num_procs >= entries) {
        return ORTE_SUCCESS;
    }

    /* record the larger count so messages can be routed correctly */
    orte_process_info.num_procs = entries;
    if (orte_process_info.max_procs < orte_process_info.num_procs) {
        orte_process_info.max_procs = orte_process_info.num_procs;
    }

    /* the count changed, so the routing plan must be rebuilt for
     * daemon collectives to work correctly
     */
    orte_routed.update_routing_plan();

    return ORTE_SUCCESS;
}
|
|
|
|
int
|
|
orte_rml_base_parse_uris(const char* uri,
|
|
orte_process_name_t* peer,
|
|
char*** uris)
|
|
{
|
|
int rc;
|
|
|
|
/* parse the process name */
|
|
char* cinfo = strdup(uri);
|
|
char* ptr = strchr(cinfo, ';');
|
|
if(NULL == ptr) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
|
free(cinfo);
|
|
return ORTE_ERR_BAD_PARAM;
|
|
}
|
|
*ptr = '\0';
|
|
ptr++;
|
|
if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_process_name(peer, cinfo))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
free(cinfo);
|
|
return rc;
|
|
}
|
|
|
|
if (NULL != uris) {
|
|
/* parse the remainder of the string into an array of uris */
|
|
*uris = opal_argv_split(ptr, ';');
|
|
}
|
|
free(cinfo);
|
|
return ORTE_SUCCESS;
|
|
}
|