4e7ff0bd5e
To enable the epochs and the resilient ORTE code, use the configure flag --enable-resilient-orte. This will define both ORTE_ENABLE_EPOCH and ORTE_RESIL_ORTE. This commit was SVN r25093.
264 строки
7.9 KiB
C
264 строки
7.9 KiB
C
/*
|
|
* Copyright (c) 2007 Los Alamos National Security, LLC.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/constants.h"
|
|
|
|
#include "opal/mca/mca.h"
|
|
#include "opal/class/opal_bitmap.h"
|
|
#include "opal/dss/dss.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/mca/base/mca_base_component_repository.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/mca/rml/base/rml_contact.h"
|
|
#include "orte/util/proc_info.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
|
|
#include "orte/mca/routed/routed.h"
|
|
#include "orte/mca/routed/base/base.h"
|
|
|
|
|
|
/* The following file was created by configure. It contains extern
|
|
* statements and the definition of an array of pointers to each
|
|
* component's public mca_base_component_t struct. */
|
|
#include "orte/mca/routed/base/static-components.h"
|
|
|
|
#if ORTE_DISABLE_FULL_SUPPORT
|
|
/* have to include a bogus function here so that
|
|
* the build system sees at least one function
|
|
* in the library
|
|
*/
|
|
int orte_routed_base_open(void)
|
|
{
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
#else
|
|
|
|
/*
 * Constructor for orte_routed_tree_t: constructs the embedded
 * "relatives" bitmap and marks the vpid as not yet assigned.
 */
static void construct(orte_routed_tree_t *tree)
{
    OBJ_CONSTRUCT(&tree->relatives, opal_bitmap_t);
    tree->vpid = ORTE_VPID_INVALID;
}
|
|
/*
 * Destructor for orte_routed_tree_t: tears down the embedded
 * "relatives" bitmap that the constructor set up.
 */
static void destruct(orte_routed_tree_t *tree)
{
    OBJ_DESTRUCT(&tree->relatives);
}
|
|
/* Class instance for orte_routed_tree_t, with opal_list_item_t as its parent class */
OBJ_CLASS_INSTANCE(orte_routed_tree_t, opal_list_item_t, construct, destruct);
|
|
|
|
/*
 * Constructor for orte_routed_jobfam_t: no HNP uri yet, and the route
 * is set to the "invalid" jobid/vpid with the minimum epoch.
 */
static void jfamconst(orte_routed_jobfam_t *fam)
{
    fam->hnp_uri = NULL;

    fam->route.jobid = ORTE_JOBID_INVALID;
    fam->route.vpid = ORTE_VPID_INVALID;
    ORTE_EPOCH_SET(fam->route.epoch, ORTE_EPOCH_MIN);
}
|
|
/*
 * Destructor for orte_routed_jobfam_t: releases the HNP uri string
 * owned by the object, if one was set.
 */
static void jfamdest(orte_routed_jobfam_t *fam)
{
    /* free() is a no-op on NULL, so an unset uri needs no special case */
    free(fam->hnp_uri);
}
|
|
/* Class instance for orte_routed_jobfam_t, with opal_object_t as its parent class */
OBJ_CLASS_INSTANCE(orte_routed_jobfam_t, opal_object_t, jfamconst, jfamdest);
|
|
|
|
int orte_routed_base_output = -1;
|
|
orte_routed_module_t orte_routed = {0};
|
|
opal_list_t orte_routed_base_components;
|
|
opal_mutex_t orte_routed_base_lock;
|
|
opal_condition_t orte_routed_base_cond;
|
|
bool orte_routed_base_wait_sync;
|
|
opal_pointer_array_t orte_routed_jobfams;
|
|
|
|
static orte_routed_component_t *active_component = NULL;
|
|
static bool component_open_called = false;
|
|
static bool opened = false;
|
|
static bool selected = false;
|
|
|
|
int
|
|
orte_routed_base_open(void)
|
|
{
|
|
int ret;
|
|
orte_routed_jobfam_t *jfam;
|
|
|
|
if (opened) {
|
|
return ORTE_SUCCESS;
|
|
}
|
|
opened = true;
|
|
|
|
/* setup the output stream */
|
|
orte_routed_base_output = opal_output_open(NULL);
|
|
OBJ_CONSTRUCT(&orte_routed_base_lock, opal_mutex_t);
|
|
OBJ_CONSTRUCT(&orte_routed_base_cond, opal_condition_t);
|
|
orte_routed_base_wait_sync = false;
|
|
|
|
/* Initialize globals */
|
|
OBJ_CONSTRUCT(&orte_routed_base_components, opal_list_t);
|
|
|
|
/* Initialize storage of remote hnp uris */
|
|
OBJ_CONSTRUCT(&orte_routed_jobfams, opal_pointer_array_t);
|
|
opal_pointer_array_init(&orte_routed_jobfams, 8, INT_MAX, 8);
|
|
/* prime it with our HNP uri */
|
|
jfam = OBJ_NEW(orte_routed_jobfam_t);
|
|
jfam->route.jobid = ORTE_PROC_MY_HNP->jobid;
|
|
jfam->route.vpid = ORTE_PROC_MY_HNP->vpid;
|
|
ORTE_EPOCH_SET(jfam->route.epoch,ORTE_PROC_MY_HNP->epoch);
|
|
jfam->job_family = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
|
|
if (NULL != orte_process_info.my_hnp_uri) {
|
|
jfam->hnp_uri = strdup(orte_process_info.my_hnp_uri);
|
|
}
|
|
opal_pointer_array_add(&orte_routed_jobfams, jfam);
|
|
|
|
/* Open up all available components */
|
|
ret = mca_base_components_open("routed",
|
|
orte_routed_base_output,
|
|
mca_routed_base_static_components,
|
|
&orte_routed_base_components,
|
|
true);
|
|
component_open_called = true;
|
|
|
|
return ret;
|
|
}
|
|
|
|
int
|
|
orte_routed_base_select(void)
|
|
{
|
|
int ret, exit_status = OPAL_SUCCESS;
|
|
orte_routed_component_t *best_component = NULL;
|
|
orte_routed_module_t *best_module = NULL;
|
|
|
|
if (selected) {
|
|
return ORTE_SUCCESS;
|
|
}
|
|
selected = true;
|
|
|
|
/*
|
|
* Select the best component
|
|
*/
|
|
if( OPAL_SUCCESS != mca_base_select("routed", orte_routed_base_output,
|
|
&orte_routed_base_components,
|
|
(mca_base_module_t **) &best_module,
|
|
(mca_base_component_t **) &best_component) ) {
|
|
/* This will only happen if no component was selected */
|
|
exit_status = ORTE_ERR_NOT_FOUND;
|
|
goto cleanup;
|
|
}
|
|
|
|
/* Save the winner */
|
|
orte_routed = *best_module;
|
|
active_component = best_component;
|
|
|
|
/* initialize the selected component */
|
|
opal_output_verbose(10, orte_routed_base_output,
|
|
"orte_routed_base_select: initializing selected component %s",
|
|
best_component->base_version.mca_component_name);
|
|
if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
|
|
int
|
|
orte_routed_base_close(void)
|
|
{
|
|
int i;
|
|
orte_routed_jobfam_t *jfam;
|
|
|
|
/* finalize the selected component */
|
|
if (NULL != orte_routed.finalize) {
|
|
orte_routed.finalize();
|
|
}
|
|
|
|
/* shutdown any remaining opened components */
|
|
if (component_open_called) {
|
|
mca_base_components_close(orte_routed_base_output,
|
|
&orte_routed_base_components, NULL);
|
|
}
|
|
|
|
for (i=0; i < orte_routed_jobfams.size; i++) {
|
|
if (NULL != (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
|
|
OBJ_RELEASE(jfam);
|
|
}
|
|
}
|
|
OBJ_DESTRUCT(&orte_routed_jobfams);
|
|
|
|
OBJ_DESTRUCT(&orte_routed_base_components);
|
|
OBJ_DESTRUCT(&orte_routed_base_lock);
|
|
OBJ_DESTRUCT(&orte_routed_base_cond);
|
|
|
|
opened = false;
|
|
selected = false;
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
/*
 * Record the HNP contact info contained in a buffer.
 *
 * buf holds a sequence of packed uri strings (OPAL_STRING).  Strings
 * are unpacked one at a time until unpack no longer returns
 * ORTE_SUCCESS.  For each uri:
 *   - the process name is extracted with orte_rml_base_parse_uris();
 *     a uri that fails to parse is logged and skipped;
 *   - if orte_routed_jobfams already has an entry for that job
 *     family, its stored uri is replaced;
 *   - otherwise a new entry is created and added to
 *     orte_routed_jobfams.
 *
 * Each unpacked uri string is freed here; entries keep their own copy.
 */
void orte_routed_base_update_hnps(opal_buffer_t *buf)
{
    int n, i, rc;
    char *uri;
    orte_process_name_t name;
    orte_routed_jobfam_t *jfam, *known;
    uint16_t jobfamily;

    n = 1;
    while (ORTE_SUCCESS == opal_dss.unpack(buf, &uri, &n, OPAL_STRING)) {
        /* extract the name */
        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(uri, &name, NULL))) {
            ORTE_ERROR_LOG(rc);
            free(uri);
            n = 1;
            continue;
        }
        jobfamily = ORTE_JOB_FAMILY(name.jobid);

        /* see if we already have this connection */
        known = NULL;
        for (i = 0; i < orte_routed_jobfams.size; i++) {
            jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i);
            if (NULL != jfam && jobfamily == jfam->job_family) {
                known = jfam;
                break;
            }
        }

        if (NULL != known) {
            /* update uri */
            free(known->hnp_uri);
            known->hnp_uri = strdup(uri);
            OPAL_OUTPUT_VERBOSE((10, orte_routed_base_output,
                                 "%s adding remote HNP %s\n\t%s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&name), uri));
        } else {
            /* nope - create it */
            jfam = OBJ_NEW(orte_routed_jobfam_t);
            jfam->job_family = jobfamily;
            jfam->route.jobid = name.jobid;
            jfam->route.vpid = name.vpid;
            ORTE_EPOCH_SET(jfam->route.epoch,name.epoch);
            jfam->hnp_uri = strdup(uri);
            /* store it - otherwise the entry is leaked and this HNP
             * can never be found by a later lookup */
            rc = opal_pointer_array_add(&orte_routed_jobfams, jfam);
            if (0 > rc) {
                /* a negative index means the array could not take it */
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(jfam);
            }
        }

        free(uri);
        /* unpack one string per iteration */
        n = 1;
    }
}
|
|
|
|
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|