1
1
openmpi/orte/mca/rmaps/resilient/rmaps_resilient.c

882 строки
34 KiB
C
Исходник Обычный вид История

/*
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/error_strings.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_resilient.h"
/*
* Local variable
*/
static char *orte_getline(FILE *fp);
static bool have_ftgrps=false, made_ftgrps=false;
static int construct_ftgrps(void);
static int get_ftgrp_target(orte_proc_t *proc,
orte_rmaps_res_ftgrp_t **target,
orte_node_t **nd);
static int get_new_node(orte_proc_t *proc,
orte_app_context_t *app,
orte_job_map_t *map,
orte_node_t **ndret);
static int map_to_ftgrps(orte_job_t *jdata);
/*
* Loadbalance the cluster
*/
static int orte_rmaps_resilient_map(orte_job_t *jdata)
{
orte_app_context_t *app;
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
int i, j;
int rc = ORTE_SUCCESS;
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
orte_node_t *nd=NULL, *oldnode, *node, *nptr;
orte_rmaps_res_ftgrp_t *target = NULL;
orte_proc_t *proc;
orte_vpid_t totprocs;
opal_list_t node_list;
orte_std_cntr_t num_slots;
opal_list_item_t *item;
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
mca_base_component_t *c = &mca_rmaps_resilient_component.super.base_version;
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
bool found;
if (ORTE_JOB_STATE_INIT == jdata->state) {
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
if (NULL != jdata->map->req_mapper &&
0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
/* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:resilient: job %s not using resilient mapper",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
if (NULL == mca_rmaps_resilient_component.fault_group_file) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:resilient: cannot perform initial map of job %s - no fault groups",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
} else if (ORTE_JOB_STATE_RESTART != jdata->state &&
ORTE_JOB_STATE_PROCS_MIGRATING != jdata->state) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:resilient: cannot map job %s - not in restart or migrating",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:resilient: mapping job %s",
ORTE_JOBID_PRINT(jdata->jobid));
/* flag that I did the mapping */
if (NULL != jdata->map->last_mapper) {
free(jdata->map->last_mapper);
}
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
jdata->map->last_mapper = strdup(c->mca_component_name);
/* have we already constructed the fault group list? */
if (!made_ftgrps) {
construct_ftgrps();
}
if (ORTE_JOB_STATE_INIT == jdata->state) {
/* this is an initial map - let the fault group mapper
* handle it
*/
return map_to_ftgrps(jdata);
}
/*
* NOTE: if a proc is being ADDED to an existing job, then its
* node field will be NULL.
*/
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: remapping job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
/* cycle through all the procs in this job to find the one(s) that failed */
for (i=0; i < jdata->procs->size; i++) {
/* get the proc object */
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
continue;
}
OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
"%s PROC %s STATE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
orte_proc_state_to_str(proc->state)));
/* is this proc to be restarted? */
if (proc->state != ORTE_PROC_STATE_RESTART) {
continue;
}
/* save the current node */
oldnode = proc->node;
/* point to the app */
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx);
if( NULL == app ) {
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_MAP);
rc = ORTE_ERR_FAILED_TO_MAP;
goto error;
}
if (NULL == oldnode) {
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: proc %s is to be started",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
} else {
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: proc %s from node %s[%s] is to be restarted",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
(NULL == oldnode->name) ? "NULL" : oldnode->name,
(NULL == oldnode->daemon) ? "--" : ORTE_VPID_PRINT(oldnode->daemon->name.vpid)));
}
if (NULL == oldnode) {
/* this proc was not previously running - likely it is being added
* to the job. So place it on the node with the fewest procs to
* balance the load
*/
OBJ_CONSTRUCT(&node_list, opal_list_t);
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list,
&num_slots,
app,
jdata->map->mapping,
false))) {
ORTE_ERROR_LOG(rc);
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
goto error;
}
if (opal_list_is_empty(&node_list)) {
/* put the proc on "hold" until resources are available */
OBJ_DESTRUCT(&node_list);
proc->state = ORTE_PROC_STATE_MIGRATING;
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto error;
}
totprocs = 1000000;
nd = NULL;
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
if (node->num_procs < totprocs) {
nd = node;
totprocs = node->num_procs;
}
OBJ_RELEASE(item); /* maintain accounting */
}
OBJ_DESTRUCT(&node_list);
/* we already checked to ensure there was at least one node,
* so we couldn't have come out of the loop with nd=NULL
*/
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: Placing new process on node %s[%s] (no ftgrp)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
nd->name,
(NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
} else {
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: proc %s from node %s is to be restarted",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
(NULL == proc->node) ? "NULL" : proc->node->name));
/* if we have fault groups, use them */
if (have_ftgrps) {
if (ORTE_SUCCESS != (rc = get_ftgrp_target(proc, &target, &nd))) {
ORTE_ERROR_LOG(rc);
goto error;
}
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: placing proc %s into fault group %d node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name), target->ftgrp, nd->name));
} else {
if (ORTE_SUCCESS != (rc = get_new_node(proc, app, jdata->map, &nd))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
/* add node to map if necessary - nothing we can do here
* but search for it
*/
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
found = false;
for (j=0; j < jdata->map->nodes->size; j++) {
if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, j))) {
continue;
}
if (nptr == nd) {
found = true;
break;
}
}
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
if (!found) {
OBJ_RETAIN(nd);
opal_pointer_array_add(jdata->map->nodes, nd);
nd->mapped = true;
}
OBJ_RETAIN(nd); /* maintain accounting on object */
proc->node = nd;
proc->nodename = nd->name;
nd->num_procs++;
opal_pointer_array_add(nd->procs, (void*)proc);
/* retain the proc struct so that we correctly track its release */
OBJ_RETAIN(proc);
/* flag the proc state as non-launched so we'll know to launch it */
proc->state = ORTE_PROC_STATE_INIT;
/* update the node and local ranks so static ports can
* be properly selected if active
*/
orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc);
}
error:
return rc;
}
orte_rmaps_base_module_t orte_rmaps_resilient_module = {
orte_rmaps_resilient_map
};
static char *orte_getline(FILE *fp)
{
char *ret, *buff;
char input[1024];
ret = fgets(input, 1024, fp);
if (NULL != ret) {
input[strlen(input)-1] = '\0'; /* remove newline */
buff = strdup(input);
return buff;
}
return NULL;
}
static int construct_ftgrps(void)
{
orte_rmaps_res_ftgrp_t *ftgrp;
orte_node_t *node;
FILE *fp;
char *ftinput;
int grp;
char **nodes;
bool found;
int i, k;
/* flag that we did this */
made_ftgrps = true;
if (NULL == mca_rmaps_resilient_component.fault_group_file) {
/* nothing to build */
return ORTE_SUCCESS;
}
/* construct it */
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: constructing fault groups",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
fp = fopen(mca_rmaps_resilient_component.fault_group_file, "r");
if (NULL == fp) { /* not found */
orte_show_help("help-orte-rmaps-resilient.txt", "orte-rmaps-resilient:file-not-found",
true, mca_rmaps_resilient_component.fault_group_file);
return ORTE_ERR_FAILED_TO_MAP;
}
/* build list of fault groups */
grp = 0;
while (NULL != (ftinput = orte_getline(fp))) {
ftgrp = OBJ_NEW(orte_rmaps_res_ftgrp_t);
ftgrp->ftgrp = grp++;
nodes = opal_argv_split(ftinput, ',');
/* find the referenced nodes */
for (k=0; k < opal_argv_count(nodes); k++) {
found = false;
for (i=0; i < orte_node_pool->size && !found; i++) {
if (NULL == (node = opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
}
if (0 == strcmp(node->name, nodes[k])) {
OBJ_RETAIN(node);
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: adding node %s to fault group %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name, ftgrp->ftgrp));
opal_pointer_array_add(&ftgrp->nodes, node);
found = true;
break;
}
}
}
opal_list_append(&mca_rmaps_resilient_component.fault_grps, &ftgrp->super);
opal_argv_free(nodes);
free(ftinput);
}
fclose(fp);
/* flag that we have fault grps */
have_ftgrps = true;
return ORTE_SUCCESS;
}
static int get_ftgrp_target(orte_proc_t *proc,
orte_rmaps_res_ftgrp_t **tgt,
orte_node_t **ndret)
{
opal_list_item_t *item;
int k, totnodes;
orte_node_t *node, *nd;
orte_rmaps_res_ftgrp_t *target, *ftgrp;
float avgload, minload;
orte_vpid_t totprocs, lowprocs;
/* set defaults */
*tgt = NULL;
*ndret = NULL;
/* flag all the fault groups that
* include this node so we don't reuse them
*/
minload = 1000000.0;
target = NULL;
for (item = opal_list_get_first(&mca_rmaps_resilient_component.fault_grps);
item != opal_list_get_end(&mca_rmaps_resilient_component.fault_grps);
item = opal_list_get_next(item)) {
ftgrp = (orte_rmaps_res_ftgrp_t*)item;
/* see if the node is in this fault group */
ftgrp->included = true;
ftgrp->used = false;
for (k=0; k < ftgrp->nodes.size; k++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&ftgrp->nodes, k))) {
continue;
}
if (NULL != proc->node && 0 == strcmp(node->name, proc->node->name)) {
/* yes - mark it to not be included */
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: node %s is in fault group %d, which will be excluded",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
proc->node->name, ftgrp->ftgrp));
ftgrp->included = false;
break;
}
}
/* if this ftgrp is not included, then skip it */
if (!ftgrp->included) {
continue;
}
/* compute the load average on this fault group */
totprocs = 0;
totnodes = 0;
for (k=0; k < ftgrp->nodes.size; k++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&ftgrp->nodes, k))) {
continue;
}
totnodes++;
totprocs += node->num_procs;
}
avgload = (float)totprocs / (float)totnodes;
/* now find the lightest loaded of the included fault groups */
if (avgload < minload) {
minload = avgload;
target = ftgrp;
OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: found new min load ftgrp %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ftgrp->ftgrp));
}
}
if (NULL == target) {
/* nothing found */
return ORTE_ERR_NOT_FOUND;
}
/* if we did find a target, re-map the proc to the lightest loaded
* node in that group
*/
lowprocs = 1000000;
nd = NULL;
for (k=0; k < target->nodes.size; k++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&target->nodes, k))) {
continue;
}
if (node->num_procs < lowprocs) {
lowprocs = node->num_procs;
nd = node;
}
}
/* return the results */
*tgt = target;
*ndret = nd;
return ORTE_SUCCESS;
}
static int get_new_node(orte_proc_t *proc,
orte_app_context_t *app,
orte_job_map_t *map,
orte_node_t **ndret)
{
orte_node_t *nd, *oldnode, *node;
orte_proc_t *pptr;
int rc, j;
opal_list_t node_list, candidates;
opal_list_item_t *item, *next;
orte_std_cntr_t num_slots;
bool found;
/* set defaults */
*ndret = NULL;
nd = NULL;
oldnode = proc->node;
/*
* Get a list of all nodes
*/
OBJ_CONSTRUCT(&node_list, opal_list_t);
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list,
&num_slots,
app,
map->mapping,
false))) {
ORTE_ERROR_LOG(rc);
goto release;
}
if (opal_list_is_empty(&node_list)) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto release;
}
if (1 == opal_list_get_size(&node_list)) {
/* if we have only one node, all we can do is put the proc on that
* node, even if it is the same one - better than not restarting at
* all
*/
nd = (orte_node_t*)opal_list_get_first(&node_list);
proc->prior_node = oldnode;
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: Placing process %s on node %s[%s] (only one avail node)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
nd->name,
(NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
goto release;
}
/*
* Cycle thru the list, transferring
* all available nodes to the candidate list
* so we can get them in the right order
*
*/
OBJ_CONSTRUCT(&candidates, opal_list_t);
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
/* don't put it back on current node */
if (node == oldnode) {
OBJ_RELEASE(item);
continue;
}
if (0 == node->num_procs) {
OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
"%s PREPENDING EMPTY NODE %s[%s] TO CANDIDATES",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == node->name) ? "NULL" : node->name,
(NULL == node->daemon) ? "--" : ORTE_VPID_PRINT(node->daemon->name.vpid)));
opal_list_prepend(&candidates, item);
} else {
OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
"%s APPENDING NON-EMPTY NODE %s[%s] TO CANDIDATES",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == node->name) ? "NULL" : node->name,
(NULL == node->daemon) ? "--" : ORTE_VPID_PRINT(node->daemon->name.vpid)));
opal_list_append(&candidates, item);
}
}
/* search the candidates
* try to use a semi-intelligent selection logic here that:
*
* (a) avoids putting the proc on a node where a peer is already
* located as this degrades our fault tolerance
*
* (b) avoids "ricochet effect" where a process would ping-pong
* between two nodes as it fails
*/
nd = NULL;
item = opal_list_get_first(&candidates);
while (item != opal_list_get_end(&candidates)) {
node = (orte_node_t*)item;
next = opal_list_get_next(item);
/* don't return to our prior location to avoid
* "ricochet" effect
*/
if (NULL != proc->prior_node &&
node == proc->prior_node) {
OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
"%s REMOVING PRIOR NODE %s[%s] FROM CANDIDATES",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == node->name) ? "NULL" : node->name,
(NULL == node->daemon) ? "--" : ORTE_VPID_PRINT(node->daemon->name.vpid)));
opal_list_remove_item(&candidates, item);
OBJ_RELEASE(item); /* maintain acctg */
item = next;
continue;
}
/* if this node is empty, then it is the winner */
if (0 == node->num_procs) {
nd = node;
proc->prior_node = oldnode;
break;
}
/* if this node has someone from my job, then skip it
* to avoid (a)
*/
found = false;
for (j=0; j < node->procs->size; j++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
if (pptr->name.jobid == proc->name.jobid) {
OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
"%s FOUND PEER %s ON NODE %s[%s]",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pptr->name),
(NULL == node->name) ? "NULL" : node->name,
(NULL == node->daemon) ? "--" : ORTE_VPID_PRINT(node->daemon->name.vpid)));
found = true;
break;
}
}
if (found) {
item = next;
continue;
}
/* get here if all tests pass - take this node */
nd = node;
proc->prior_node = oldnode;
break;
}
if (NULL == nd) {
/* didn't find anything */
if (NULL != proc->prior_node) {
nd = proc->prior_node;
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: Placing process %s on prior node %s[%s] (no ftgrp)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
(NULL == nd->name) ? "NULL" : nd->name,
(NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
} else {
nd = oldnode;
proc->prior_node = oldnode;
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: Placing process %s back on old node %s[%s] (no ftgrp)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
(NULL == nd->name) ? "NULL" : nd->name,
(NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
}
}
/* cleanup candidate list */
while (NULL != (item = opal_list_remove_first(&candidates))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&candidates);
release:
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: Placing process on node %s[%s] (no ftgrp)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == nd->name) ? "NULL" : nd->name,
(NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
*ndret = nd;
return rc;
}
static void flag_nodes(opal_list_t *node_list)
{
opal_list_item_t *item, *nitem;
orte_node_t *node, *nd;
orte_rmaps_res_ftgrp_t *ftgrp;
int k;
for (item = opal_list_get_first(&mca_rmaps_resilient_component.fault_grps);
item != opal_list_get_end(&mca_rmaps_resilient_component.fault_grps);
item = opal_list_get_next(item)) {
ftgrp = (orte_rmaps_res_ftgrp_t*)item;
/* reset the flags */
ftgrp->used = false;
ftgrp->included = false;
/* if at least one node in our list is included in this
* ftgrp, then flag it as included
*/
for (nitem = opal_list_get_first(node_list);
!ftgrp->included && nitem != opal_list_get_end(node_list);
nitem = opal_list_get_next(nitem)) {
node = (orte_node_t*)nitem;
for (k=0; k < ftgrp->nodes.size; k++) {
if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(&ftgrp->nodes, k))) {
continue;
}
if (0 == strcmp(nd->name, node->name)) {
ftgrp->included = true;
break;
}
}
}
}
}
static int map_to_ftgrps(orte_job_t *jdata)
{
orte_job_map_t *map;
orte_app_context_t *app;
int i, j, k, totnodes;
opal_list_t node_list;
opal_list_item_t *item, *next, *curitem;
orte_std_cntr_t num_slots;
int rc = ORTE_SUCCESS;
float avgload, minload;
orte_node_t *node, *nd=NULL;
orte_rmaps_res_ftgrp_t *ftgrp, *target = NULL;
orte_vpid_t totprocs, num_assigned;
orte_proc_t *proc;
bool initial_map=true;
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: creating initial map for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
/* start at the beginning... */
jdata->num_procs = 0;
map = jdata->map;
for (i=0; i < jdata->apps->size; i++) {
/* get the app_context */
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
/* you cannot use this mapper unless you specify the number of procs to
* launch for each app
*/
if (0 == app->num_procs) {
orte_show_help("help-orte-rmaps-resilient.txt",
"orte-rmaps-resilient:num-procs",
true);
return ORTE_ERR_SILENT;
}
num_assigned = 0;
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
*/
OBJ_CONSTRUCT(&node_list, opal_list_t);
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
map->mapping, initial_map))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* flag that all subsequent requests should not reset the node->mapped flag */
initial_map = false;
/* remove all nodes that are not "up" or do not have a running daemon on them */
item = opal_list_get_first(&node_list);
while (item != opal_list_get_end(&node_list)) {
next = opal_list_get_next(item);
node = (orte_node_t*)item;
if (ORTE_NODE_STATE_UP != node->state ||
NULL == node->daemon ||
ORTE_PROC_STATE_RUNNING != node->daemon->state) {
opal_list_remove_item(&node_list, item);
OBJ_RELEASE(item);
}
item = next;
}
curitem = opal_list_get_first(&node_list);
/* flag the fault groups included by these nodes */
flag_nodes(&node_list);
/* map each copy to a different fault group - if more copies are
* specified than fault groups, then overlap in a round-robin fashion
*/
for (j=0; j < app->num_procs; j++) {
/* find unused included fault group with lowest average load - if none
* found, then break
*/
target = NULL;
minload = 1000000000.0;
for (item = opal_list_get_first(&mca_rmaps_resilient_component.fault_grps);
item != opal_list_get_end(&mca_rmaps_resilient_component.fault_grps);
item = opal_list_get_next(item)) {
ftgrp = (orte_rmaps_res_ftgrp_t*)item;
OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: fault group %d used: %s included %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ftgrp->ftgrp,
ftgrp->used ? "YES" : "NO",
ftgrp->included ? "YES" : "NO" ));
/* if this ftgrp has already been used or is not included, then
* skip it
*/
if (ftgrp->used || !ftgrp->included) {
continue;
}
/* compute the load average on this fault group */
totprocs = 0;
totnodes = 0;
for (k=0; k < ftgrp->nodes.size; k++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&ftgrp->nodes, k))) {
continue;
}
totnodes++;
totprocs += node->num_procs;
}
avgload = (float)totprocs / (float)totnodes;
if (avgload < minload) {
minload = avgload;
target = ftgrp;
OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: found new min load ftgrp %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ftgrp->ftgrp));
}
}
/* if we have more procs than fault groups, then we simply
* map the remaining procs on available nodes in a round-robin
* fashion - it doesn't matter where they go as they will not
* be contributing to fault tolerance by definition
*/
if (NULL == target) {
OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: more procs than fault groups - mapping excess rr",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
nd = (orte_node_t*)curitem;
curitem = opal_list_get_next(curitem);
if (curitem == opal_list_get_end(&node_list)) {
curitem = opal_list_get_first(&node_list);
}
} else {
/* pick node with lowest load from within that group */
totprocs = 1000000;
for (k=0; k < target->nodes.size; k++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&target->nodes, k))) {
continue;
}
if (node->num_procs < totprocs) {
totprocs = node->num_procs;
nd = node;
}
}
}
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: placing proc into fault group %d node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == target) ? -1 : target->ftgrp, nd->name));
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
/* if the node isn't in the map, add it */
if (!nd->mapped) {
OBJ_RETAIN(node);
opal_pointer_array_add(map->nodes, nd);
nd->mapped = true;
}
proc = orte_rmaps_base_setup_proc(jdata, node, app->idx);
if ((nd->slots < (int)nd->num_procs) ||
(0 < nd->slots_max && nd->slots_max < (int)nd->num_procs)) {
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, nd->num_procs, app->app);
return ORTE_ERR_SILENT;
}
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 03:40:11 +00:00
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
nd->oversubscribed = true;
}
/* track number of procs mapped */
num_assigned++;
/* flag this fault group as used */
if (NULL != target) {
target->used = true;
}
}
/* track number of procs */
jdata->num_procs += app->num_procs;
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* cleanup the node list - it can differ from one app_context
* to another, so we have to get it every time
*/
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
}
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
}