7c795f4416
Do a little optimization and minimize recomputation of the routing plan. Signed-off-by: Ralph Castain <rhc@open-mpi.org>
358 строки
12 KiB
C
358 строки
12 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
|
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/constants.h"
|
|
#include "orte/types.h"
|
|
|
|
#include "opal/dss/dss.h"
|
|
#include "opal/util/argv.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/mca/ess/ess.h"
|
|
#include "orte/mca/odls/odls_types.h"
|
|
#include "orte/mca/rml/rml.h"
|
|
#include "orte/mca/state/state.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/runtime/orte_wait.h"
|
|
|
|
#include "orte/mca/routed/base/base.h"
|
|
|
|
char* orte_routed_base_assign_module(char *modules)
|
|
{
|
|
orte_routed_base_active_t *active;
|
|
char **desired;
|
|
int i;
|
|
|
|
/* the incoming param contains a comma-delimited, prioritized
|
|
* list of desired routing modules. If it is NULL, then we
|
|
* simply return the module at the top of our list */
|
|
if (NULL == modules) {
|
|
active = (orte_routed_base_active_t*)opal_list_get_first(&orte_routed_base.actives);
|
|
return active->component->base_version.mca_component_name;
|
|
}
|
|
|
|
/* otherwise, cycle thru the provided list of desired modules
|
|
* and pick the highest priority one that matches */
|
|
desired = opal_argv_split(modules, ',');
|
|
for (i=0; NULL != desired[i]; i++) {
|
|
OPAL_LIST_FOREACH(active, &orte_routed_base.actives, orte_routed_base_active_t) {
|
|
if (0 == strcasecmp(desired[i], active->component->base_version.mca_component_name)) {
|
|
opal_argv_free(desired);
|
|
return active->component->base_version.mca_component_name;
|
|
}
|
|
}
|
|
}
|
|
opal_argv_free(desired);
|
|
|
|
/* get here if none match */
|
|
return NULL;
|
|
}
|
|
|
|
/*
 * Ask the active routed module(s) to drop their route to a process.
 *
 * module - name of the module to address, or NULL to address every
 *          active module
 * proc   - passed through unchanged to each module's delete_route
 *
 * Returns ORTE_SUCCESS when every addressed module that provides
 * delete_route succeeded (or none provides it). The first non-success
 * code is returned immediately and later modules are not called.
 */
int orte_routed_base_delete_route(char *module, orte_process_name_t *proc)
{
    orte_routed_base_active_t *entry;
    int status;

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        /* skip modules the caller did not ask for */
        if (NULL != module &&
            0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* the entry point is optional */
        if (NULL == entry->module->delete_route) {
            continue;
        }
        status = entry->module->delete_route(proc);
        if (ORTE_SUCCESS != status) {
            return status;
        }
    }

    return ORTE_SUCCESS;
}
|
|
|
|
/*
 * Forward a route update to the active routed module(s).
 *
 * module - name of the module to address, or NULL to address every
 *          active module
 * target - passed through as the first argument of update_route
 * route  - passed through as the second argument of update_route
 *
 * Returns ORTE_SUCCESS when every addressed module that provides
 * update_route succeeded (or none provides it). The first non-success
 * code is returned immediately and later modules are not called.
 */
int orte_routed_base_update_route(char *module, orte_process_name_t *target,
                                  orte_process_name_t *route)
{
    orte_routed_base_active_t *entry;
    int status;

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        /* skip modules the caller did not ask for */
        if (NULL != module &&
            0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* the entry point is optional */
        if (NULL == entry->module->update_route) {
            continue;
        }
        status = entry->module->update_route(target, route);
        if (ORTE_SUCCESS != status) {
            return status;
        }
    }

    return ORTE_SUCCESS;
}
|
|
|
|
/*
 * Look up the route to a target via a named routed module.
 *
 * module - name of the module to consult; NULL corresponds to direct
 * target - process to be reached
 *
 * Returns a copy of *target when routing is disabled or module is NULL.
 * Otherwise returns whatever the first active module with a matching
 * name reports from get_route, or *ORTE_NAME_INVALID if that module has
 * no get_route or no active module carries the name.
 */
orte_process_name_t orte_routed_base_get_route(char *module, orte_process_name_t *target)
{
    orte_routed_base_active_t *entry;

    /* a NULL module corresponds to direct */
    if (NULL == module || !orte_routed_base.routing_enabled) {
        return *target;
    }

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        if (0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* the first name match decides the outcome */
        if (NULL == entry->module->get_route) {
            break;
        }
        return entry->module->get_route(target);
    }

    return *ORTE_NAME_INVALID;
}
|
|
|
|
int orte_routed_base_route_lost(char *module, const orte_process_name_t *route)
|
|
{
|
|
orte_routed_base_active_t *active;
|
|
int rc;
|
|
|
|
OPAL_LIST_FOREACH(active, &orte_routed_base.actives, orte_routed_base_active_t) {
|
|
if (NULL == module ||
|
|
0 == strcmp(module, active->component->base_version.mca_component_name)) {
|
|
if (NULL != active->module->route_lost) {
|
|
if (ORTE_SUCCESS != (rc = active->module->route_lost(route))) {
|
|
return rc;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
bool orte_routed_base_route_is_defined(char *module, const orte_process_name_t *target)
|
|
{
|
|
orte_routed_base_active_t *active;
|
|
|
|
/* a NULL module corresponds to direct */
|
|
if (NULL == module) {
|
|
return true;
|
|
}
|
|
|
|
OPAL_LIST_FOREACH(active, &orte_routed_base.actives, orte_routed_base_active_t) {
|
|
if (0 == strcmp(module, active->component->base_version.mca_component_name)) {
|
|
if (NULL != active->module->route_is_defined) {
|
|
return active->module->route_is_defined(target);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* if we didn't find the specified module, or it doesn't have
|
|
* the required API, then the route isn't defined */
|
|
return false;
|
|
}
|
|
|
|
void orte_routed_base_update_routing_plan(char *module)
|
|
{
|
|
orte_routed_base_active_t *active;
|
|
|
|
OPAL_LIST_FOREACH(active, &orte_routed_base.actives, orte_routed_base_active_t) {
|
|
if (NULL == module ||
|
|
0 == strcmp(module, active->component->base_version.mca_component_name)) {
|
|
if (NULL != active->module->update_routing_plan) {
|
|
active->module->update_routing_plan();
|
|
}
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
/*
 * Pass a list to the get_routing_list entry point of the active routed
 * module(s).
 *
 * module - name of the module to address, or NULL to address every
 *          active module
 * coll   - list handed unchanged to each addressed module
 *
 * Modules that do not provide get_routing_list are silently skipped.
 */
void orte_routed_base_get_routing_list(char *module, opal_list_t *coll)
{
    orte_routed_base_active_t *entry;

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        /* skip modules the caller did not ask for */
        if (NULL != module &&
            0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* the entry point is optional */
        if (NULL == entry->module->get_routing_list) {
            continue;
        }
        entry->module->get_routing_list(coll);
    }
}
|
|
|
|
/*
 * Forward a lifeline assignment to the active routed module(s).
 *
 * module - name of the module to address, or NULL to address every
 *          active module
 * proc   - passed through unchanged to each module's set_lifeline
 *
 * Returns ORTE_SUCCESS when every addressed module that provides
 * set_lifeline succeeded (or none provides it). The first non-success
 * code is returned immediately and later modules are not called.
 */
int orte_routed_base_set_lifeline(char *module, orte_process_name_t *proc)
{
    orte_routed_base_active_t *entry;
    int status;

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        /* skip modules the caller did not ask for */
        if (NULL != module &&
            0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* the entry point is optional */
        if (NULL == entry->module->set_lifeline) {
            continue;
        }
        status = entry->module->set_lifeline(proc);
        if (ORTE_SUCCESS != status) {
            return status;
        }
    }

    return ORTE_SUCCESS;
}
|
|
|
|
size_t orte_routed_base_num_routes(char *module)
|
|
{
|
|
orte_routed_base_active_t *active;
|
|
size_t rc = 0;
|
|
|
|
OPAL_LIST_FOREACH(active, &orte_routed_base.actives, orte_routed_base_active_t) {
|
|
if (NULL == module ||
|
|
0 == strcmp(module, active->component->base_version.mca_component_name)) {
|
|
if (NULL != active->module->num_routes) {
|
|
rc += active->module->num_routes();
|
|
}
|
|
}
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
int orte_routed_base_ft_event(char *module, int state)
|
|
{
|
|
orte_routed_base_active_t *active;
|
|
int rc;
|
|
|
|
OPAL_LIST_FOREACH(active, &orte_routed_base.actives, orte_routed_base_active_t) {
|
|
if (NULL == module ||
|
|
0 == strcmp(module, active->component->base_version.mca_component_name)) {
|
|
if (NULL != active->module->ft_event) {
|
|
if (ORTE_SUCCESS != (rc = active->module->ft_event(state))) {
|
|
return rc;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
|
|
/*
 * Build the list of recipients for an xcast.
 *
 * coll        - list that receives one orte_namelist_t per recipient
 * my_children - list of orte_routed_tree_t entries for this process
 *
 * On the HNP, when an abnormal termination has been ordered or routing
 * is not enabled, the recipients are taken from the job data object of
 * our own jobid: every entry from index 1 upward that is flagged alive.
 * If that yields an empty list, the DAEMONS_TERMINATED job state is
 * activated. In every other case the recipients are my_children.
 * All names appended carry our own jobid.
 */
void orte_routed_base_xcast_routing(opal_list_t *coll, opal_list_t *my_children)
{
    orte_routed_tree_t *kid;
    orte_namelist_t *entry;
    orte_proc_t *dmn;
    orte_job_t *daemons;
    int idx;

    /* if we are the HNP and an abnormal termination is underway
     * (or routing is off), then send it directly to everyone */
    if (ORTE_PROC_IS_HNP &&
        (orte_abnormal_term_ordered || !orte_routing_is_enabled)) {
        daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
        for (idx = 1; idx < daemons->procs->size; idx++) {
            dmn = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, idx);
            /* skip empty slots and anyone known not alive */
            if (NULL == dmn || !ORTE_FLAG_TEST(dmn, ORTE_PROC_FLAG_ALIVE)) {
                continue;
            }
            entry = OBJ_NEW(orte_namelist_t);
            entry->name.jobid = ORTE_PROC_MY_NAME->jobid;
            entry->name.vpid = dmn->name.vpid;
            opal_list_append(coll, &entry->super);
        }
        /* if nobody is known alive, then we need to die */
        if (0 == opal_list_get_size(coll)) {
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
        }
        return;
    }

    /* HNP in normal operation, or a daemon - route to my children */
    OPAL_LIST_FOREACH(kid, my_children, orte_routed_tree_t) {
        entry = OBJ_NEW(orte_namelist_t);
        entry->name.jobid = ORTE_PROC_MY_NAME->jobid;
        entry->name.vpid = kid->vpid;
        opal_list_append(coll, &entry->super);
    }
}
|
|
|
|
/*
 * Unpack (vpid, rml_uri) pairs from a buffer and record each URI on the
 * matching proc object of the given job.
 *
 * job    - job whose proc records are to be updated
 * buffer - buffer holding zero or more ORTE_VPID / OPAL_STRING pairs
 *
 * Returns ORTE_SUCCESS once the buffer has been read to its end;
 * ORTE_ERR_NOT_FOUND if the job has no data object; ORTE_ERR_FATAL if a
 * NULL URI is unpacked; otherwise the error that stopped the vpid
 * unpack. A failed URI unpack or an unknown vpid is logged and skipped.
 */
int orte_routed_base_process_callback(orte_jobid_t job, opal_buffer_t *buffer)
{
    orte_proc_t *proc;
    orte_job_t *jdata;
    orte_std_cntr_t cnt;
    char *rml_uri;
    orte_vpid_t vpid;
    int rc;

    /* lookup the job object for this process */
    if (NULL == (jdata = orte_get_job_data_object(job))) {
        /* came from a different job family - this is an error */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    /* unpack the data for each entry */
    for (;;) {
        /* cnt is handed to unpack by pointer, so re-arm it before
         * every call - including after a skipped entry */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &vpid, &cnt, ORTE_VPID))) {
            break;
        }

        cnt = 1;
        rml_uri = NULL;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            continue;
        }

        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                             "%s routed_base:callback got uri %s for job %s rank %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == rml_uri) ? "NULL" : rml_uri,
                             ORTE_JOBID_PRINT(job), ORTE_VPID_PRINT(vpid)));

        if (NULL == rml_uri) {
            /* should not happen */
            ORTE_ERROR_LOG(ORTE_ERR_FATAL);
            return ORTE_ERR_FATAL;
        }

        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            /* nobody will take this string - release it before skipping */
            free(rml_uri);
            continue;
        }

        /* update the record: the unpacked string is already a private
         * heap copy, so hand it over instead of duplicating and freeing.
         * NOTE(review): any URI previously stored on the proc is
         * overwritten without being released, as before - confirm
         * whether it should be freed here */
        proc->rml_uri = rml_uri;
    }

    /* running off the end of the buffer is the normal way out */
    if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return ORTE_SUCCESS;
}
|