Update the resilient recovery mapping algorithm to be a bit more sophisticated. Track the prior node a proc was on so we avoid the ricochet effect. Also avoid putting a recovering proc onto a node that is already occupied by a peer, as this degrades fault tolerance.
This commit was SVN r24417.
Этот коммит содержится в:
родитель
a8cf19a7bc
Коммит
f014284f91
@ -28,6 +28,7 @@
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
@ -116,6 +117,11 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
|
||||
"%s PROC %s STATE %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name),
|
||||
orte_proc_state_to_str(proc->state)));
|
||||
/* is this proc to be restarted? */
|
||||
if (proc->state != ORTE_PROC_STATE_RESTART) {
|
||||
continue;
|
||||
@ -130,6 +136,20 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (NULL == oldnode) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:resilient: proc %s is to be started",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:resilient: proc %s from node %s[%s] is to be restarted",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name),
|
||||
(NULL == oldnode->name) ? "NULL" : oldnode->name,
|
||||
(NULL == oldnode->daemon) ? "--" : ORTE_VPID_PRINT(oldnode->daemon->name.vpid)));
|
||||
}
|
||||
|
||||
if (NULL == oldnode) {
|
||||
/* this proc was not previously running - likely it is being added
|
||||
* to the job. So place it on the node with the fewest procs to
|
||||
@ -141,11 +161,17 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
app,
|
||||
jdata->map->policy))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&node_list);
|
||||
goto error;
|
||||
}
|
||||
if (0 == opal_list_get_size(&node_list)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
rc = ORTE_ERROR;
|
||||
if (opal_list_is_empty(&node_list)) {
|
||||
/* put the proc on "hold" until resources are available */
|
||||
OBJ_DESTRUCT(&node_list);
|
||||
proc->state = ORTE_PROC_STATE_MIGRATING;
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto error;
|
||||
}
|
||||
totprocs = 1000000;
|
||||
@ -163,9 +189,10 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
* so we couldn't have come out of the loop with nd=NULL
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:resilient: Placing new process on node %s daemon %s (no ftgrp)",
|
||||
"%s rmaps:resilient: Placing new process on node %s[%s] (no ftgrp)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
nd->name, ORTE_NAME_PRINT((&nd->daemon->name))));
|
||||
nd->name,
|
||||
(NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
|
||||
} else {
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
@ -413,20 +440,14 @@ static int get_new_node(orte_proc_t *proc,
|
||||
orte_node_t **ndret)
|
||||
{
|
||||
orte_node_t *nd, *oldnode, *node;
|
||||
int rc;
|
||||
opal_list_t node_list;
|
||||
orte_proc_t *pptr;
|
||||
int rc, j;
|
||||
opal_list_t node_list, candidates;
|
||||
opal_list_item_t *item, *next;
|
||||
orte_std_cntr_t num_slots;
|
||||
bool found;
|
||||
|
||||
/* if no ftgrps are available, then just put it on the next node
|
||||
* on the list - obviously, this is a rather unintelligent decision.
|
||||
* However, we want to ensure that we don't just keep bouncing
|
||||
* back/forth between the same two nodes.
|
||||
*
|
||||
* Note: if the list only has oldnode on it, then this installs
|
||||
* the proc back on its original node - this is better than not
|
||||
* restarting at all
|
||||
*/
|
||||
/* set defaults */
|
||||
*ndret = NULL;
|
||||
nd = NULL;
|
||||
oldnode = proc->node;
|
||||
@ -440,46 +461,158 @@ static int get_new_node(orte_proc_t *proc,
|
||||
app,
|
||||
map->policy))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto error;
|
||||
goto release;
|
||||
}
|
||||
if (0 == opal_list_get_size(&node_list)) {
|
||||
if (opal_list_is_empty(&node_list)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto error;
|
||||
goto release;
|
||||
}
|
||||
|
||||
if (1 == opal_list_get_size(&node_list)) {
|
||||
/* if we have only one node, all we can do is put the proc on that
|
||||
* node, even if it is the same one - better than not restarting at
|
||||
* all
|
||||
*/
|
||||
nd = (orte_node_t*)opal_list_get_first(&node_list);
|
||||
proc->prior_node = oldnode;
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:resilient: Placing process %s on node %s[%s] (only one avail node)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name),
|
||||
nd->name,
|
||||
(NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
|
||||
goto release;
|
||||
}
|
||||
|
||||
/*
|
||||
* Cycle thru the list to find the current node
|
||||
* Cycle thru the list, transferring
|
||||
* all available nodes to the candidate list
|
||||
* so we can get them in the right order
|
||||
*
|
||||
*/
|
||||
item = opal_list_get_first(&node_list);
|
||||
while (item != opal_list_get_end(&node_list)) {
|
||||
next = opal_list_get_next(item);
|
||||
OBJ_CONSTRUCT(&candidates, opal_list_t);
|
||||
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
||||
node = (orte_node_t*)item;
|
||||
OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
|
||||
"%s CHECKING NODE %s[%s] AGAINST NODE %s[%s]",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
node->name,
|
||||
(NULL == node->daemon) ? "?" : ORTE_VPID_PRINT(node->daemon->name.vpid),
|
||||
oldnode->name,
|
||||
(NULL == oldnode->daemon) ? "?" : ORTE_VPID_PRINT(oldnode->daemon->name.vpid)));
|
||||
/* don't put it back on current node */
|
||||
if (node == oldnode) {
|
||||
if (next == opal_list_get_end(&node_list)) {
|
||||
nd = (orte_node_t*)opal_list_get_first(&node_list);
|
||||
} else {
|
||||
nd = (orte_node_t*)next;
|
||||
}
|
||||
OBJ_RELEASE(item);
|
||||
continue;
|
||||
}
|
||||
if (0 == node->num_procs) {
|
||||
OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
|
||||
"%s PREPENDING EMPTY NODE %s[%s] TO CANDIDATES",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == node->name) ? "NULL" : node->name,
|
||||
(NULL == node->daemon) ? "--" : ORTE_VPID_PRINT(node->daemon->name.vpid)));
|
||||
opal_list_prepend(&candidates, item);
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
|
||||
"%s APPENDING NON-EMPTY NODE %s[%s] TO CANDIDATES",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == node->name) ? "NULL" : node->name,
|
||||
(NULL == node->daemon) ? "--" : ORTE_VPID_PRINT(node->daemon->name.vpid)));
|
||||
opal_list_append(&candidates, item);
|
||||
}
|
||||
}
|
||||
/* search the candidates
|
||||
* try to use a semi-intelligent selection logic here that:
|
||||
*
|
||||
* (a) avoids putting the proc on a node where a peer is already
|
||||
* located as this degrades our fault tolerance
|
||||
*
|
||||
* (b) avoids "ricochet effect" where a process would ping-pong
|
||||
* between two nodes as it fails
|
||||
*/
|
||||
nd = NULL;
|
||||
item = opal_list_get_first(&candidates);
|
||||
while (item != opal_list_get_end(&candidates)) {
|
||||
node = (orte_node_t*)item;
|
||||
next = opal_list_get_next(item);
|
||||
/* don't return to our prior location to avoid
|
||||
* "ricochet" effect
|
||||
*/
|
||||
if (NULL != proc->prior_node &&
|
||||
node == proc->prior_node) {
|
||||
OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
|
||||
"%s REMOVING PRIOR NODE %s[%s] FROM CANDIDATES",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == node->name) ? "NULL" : node->name,
|
||||
(NULL == node->daemon) ? "--" : ORTE_VPID_PRINT(node->daemon->name.vpid)));
|
||||
opal_list_remove_item(&candidates, item);
|
||||
OBJ_RELEASE(item); /* maintain acctg */
|
||||
item = next;
|
||||
continue;
|
||||
}
|
||||
/* if this node is empty, then it is the winner */
|
||||
if (0 == node->num_procs) {
|
||||
nd = node;
|
||||
proc->prior_node = oldnode;
|
||||
break;
|
||||
}
|
||||
item = next;
|
||||
/* if this node has someone from my job, then skip it
|
||||
* to avoid (a)
|
||||
*/
|
||||
found = false;
|
||||
for (j=0; j < node->procs->size; j++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
|
||||
continue;
|
||||
}
|
||||
if (pptr->name.jobid == proc->name.jobid) {
|
||||
OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
|
||||
"%s FOUND PEER %s ON NODE %s[%s]",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&pptr->name),
|
||||
(NULL == node->name) ? "NULL" : node->name,
|
||||
(NULL == node->daemon) ? "--" : ORTE_VPID_PRINT(node->daemon->name.vpid)));
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (found) {
|
||||
item = next;
|
||||
continue;
|
||||
}
|
||||
/* get here if all tests pass - take this node */
|
||||
nd = node;
|
||||
proc->prior_node = oldnode;
|
||||
break;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:resilient: Placing process on node %s daemon %s (no ftgrp)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(nd == oldnode) ? "OLDNODE" : nd->name,
|
||||
ORTE_NAME_PRINT((&nd->daemon->name))));
|
||||
if (NULL == nd) {
|
||||
/* didn't find anything */
|
||||
if (NULL != proc->prior_node) {
|
||||
nd = proc->prior_node;
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:resilient: Placing process %s on prior node %s[%s] (no ftgrp)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name),
|
||||
(NULL == nd->name) ? "NULL" : nd->name,
|
||||
(NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
|
||||
} else {
|
||||
nd = oldnode;
|
||||
proc->prior_node = oldnode;
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:resilient: Placing process %s back on old node %s[%s] (no ftgrp)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name),
|
||||
(NULL == nd->name) ? "NULL" : nd->name,
|
||||
(NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
|
||||
}
|
||||
|
||||
}
|
||||
/* cleanup candidate list */
|
||||
while (NULL != (item = opal_list_remove_first(&candidates))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&candidates);
|
||||
|
||||
release:
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:resilient: Placing process on node %s[%s] (no ftgrp)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == nd->name) ? "NULL" : nd->name,
|
||||
(NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
|
||||
|
||||
error:
|
||||
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
|
@ -885,6 +885,7 @@ static void orte_proc_construct(orte_proc_t* proc)
|
||||
proc->app_idx = 0;
|
||||
proc->slot_list = NULL;
|
||||
proc->node = NULL;
|
||||
proc->prior_node = NULL;
|
||||
proc->nodename = NULL;
|
||||
proc->rml_uri = NULL;
|
||||
proc->restarts = 0;
|
||||
|
@ -473,6 +473,8 @@ struct orte_proc_t {
|
||||
char *slot_list;
|
||||
/* pointer to the node where this proc is executing */
|
||||
orte_node_t *node;
|
||||
/* pointer to the node where this proc last executed */
|
||||
orte_node_t *prior_node;
|
||||
/* name of the node where this proc is executing - this
|
||||
* is used simply to pass that info to a calling
|
||||
* tool since it may not have a node array available
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user