Update resilient recovery mapping algorithm to be a bit more sophisticated. Track the prior node a proc was on so we avoid ricochet effect. Also avoid putting recovering proc onto node that is already occupied by a peer as this degrades fault tolerance.

This commit was SVN r24417.
2011-02-20 18:46:21 +00:00 · 2011-02-20 18:46:21 +00:00 · f014284f91
--- a/orte/mca/rmaps/resilient/rmaps_resilient.c
+++ b/orte/mca/rmaps/resilient/rmaps_resilient.c
@ -28,6 +28,7 @@
 #include "opal/util/argv.h"
 #include "opal/class/opal_pointer_array.h"

+#include "orte/util/error_strings.h"
 #include "orte/util/show_help.h"
 #include "orte/mca/errmgr/errmgr.h"

@ -116,6 +117,11 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
            continue;
        }
+        OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
+                             "%s PROC %s STATE %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             ORTE_NAME_PRINT(&proc->name),
+                             orte_proc_state_to_str(proc->state)));
        /* is this proc to be restarted? */
        if (proc->state != ORTE_PROC_STATE_RESTART) {
            continue;
@ -130,6 +136,20 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
            goto error;
        }

+        if (NULL == oldnode) {
+            OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
+                                 "%s rmaps:resilient: proc %s is to be started",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 ORTE_NAME_PRINT(&proc->name)));
+        } else {
+            OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
+                                 "%s rmaps:resilient: proc %s from node %s[%s] is to be restarted",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 ORTE_NAME_PRINT(&proc->name),
+                                 (NULL == oldnode->name) ? "NULL" : oldnode->name,
+                                 (NULL == oldnode->daemon) ? "--" : ORTE_VPID_PRINT(oldnode->daemon->name.vpid)));
+        }
+
        if (NULL == oldnode) {
            /* this proc was not previously running - likely it is being added
             * to the job. So place it on the node with the fewest procs to
@ -141,11 +161,17 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
                                                                       app,
                                                                       jdata->map->policy))) {
                ORTE_ERROR_LOG(rc);
+                while (NULL != (item = opal_list_remove_first(&node_list))) {
+                    OBJ_RELEASE(item);
+                }
+                OBJ_DESTRUCT(&node_list);
                goto error;
            }
-            if (0 == opal_list_get_size(&node_list)) {
-                ORTE_ERROR_LOG(ORTE_ERROR);
-                rc = ORTE_ERROR;
+            if (opal_list_is_empty(&node_list)) {
+                /* put the proc on "hold" until resources are available */
+                OBJ_DESTRUCT(&node_list);
+                proc->state = ORTE_PROC_STATE_MIGRATING;
+                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto error;
            }
            totprocs = 1000000;
@ -163,9 +189,10 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
             * so we couldn't have come out of the loop with nd=NULL
             */
            OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
-                                 "%s rmaps:resilient: Placing new process on node %s daemon %s (no ftgrp)",
+                                 "%s rmaps:resilient: Placing new process on node %s[%s] (no ftgrp)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                 nd->name, ORTE_NAME_PRINT((&nd->daemon->name))));
+                                 nd->name,
+                                 (NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
        } else {

            OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
@ -413,20 +440,14 @@ static int get_new_node(orte_proc_t *proc,
                        orte_node_t **ndret)
 {
    orte_node_t *nd, *oldnode, *node;
-    int rc;
-    opal_list_t node_list;
+    orte_proc_t *pptr;
+    int rc, j;
+    opal_list_t node_list, candidates;
    opal_list_item_t *item, *next;
    orte_std_cntr_t num_slots;
+    bool found;

-    /* if no ftgrps are available, then just put it on the next node
-     * on the list - obviously, this is a rather unintelligent decision.
-     * However, we want to ensure  that we don't just keep bouncing
-     * back/forth between the same two nodes.
-     *
-     * Note: if the list only has oldnode on it, then this installs
-     * the proc back on its original node - this is better than not
-     * restarting at all
-     */
+    /* set defaults */
    *ndret = NULL;
    nd = NULL;
    oldnode = proc->node;
@ -440,46 +461,158 @@ static int get_new_node(orte_proc_t *proc,
                                                               app,
                                                               map->policy))) {
        ORTE_ERROR_LOG(rc);
-        goto error;
+        goto release;
    }
-    if (0 == opal_list_get_size(&node_list)) {
+    if (opal_list_is_empty(&node_list)) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
-        goto error;
+        goto release;
+    }
+
+    if (1 == opal_list_get_size(&node_list)) {
+        /* if we have only one node, all we can do is put the proc on that
+         * node, even if it is the same one - better than not restarting at
+         * all
+         */
+        nd = (orte_node_t*)opal_list_get_first(&node_list);
+        proc->prior_node = oldnode;
+        OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
+                             "%s rmaps:resilient: Placing process %s on node %s[%s] (only one avail node)",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             ORTE_NAME_PRINT(&proc->name),
+                             nd->name,
+                             (NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
+        goto release;
    }

    /*
-     * Cycle thru the list to find the current node
+     * Cycle thru the list, transferring
+     * all available nodes to the candidate list
+     * so we can get them in the right order
     * 
     */
-    item = opal_list_get_first(&node_list);
-    while (item != opal_list_get_end(&node_list)) {
-        next = opal_list_get_next(item);
+    OBJ_CONSTRUCT(&candidates, opal_list_t);
+    while (NULL != (item = opal_list_remove_first(&node_list))) {
        node = (orte_node_t*)item;
-        OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
-                             "%s CHECKING NODE %s[%s] AGAINST NODE %s[%s]",
-                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                             node->name,
-                             (NULL == node->daemon) ? "?" : ORTE_VPID_PRINT(node->daemon->name.vpid),
-                             oldnode->name,
-                             (NULL == oldnode->daemon) ? "?" : ORTE_VPID_PRINT(oldnode->daemon->name.vpid)));
+        /* don't put it back on current node */
        if (node == oldnode) {
-            if (next == opal_list_get_end(&node_list)) {
-                nd = (orte_node_t*)opal_list_get_first(&node_list);
-            } else {
-                nd = (orte_node_t*)next;
-            }
+            OBJ_RELEASE(item);
+            continue;
+        }
+        if (0 == node->num_procs) {
+            OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
+                                 "%s PREPENDING EMPTY NODE %s[%s] TO CANDIDATES",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 (NULL == node->name) ? "NULL" : node->name,
+                                 (NULL == node->daemon) ? "--" : ORTE_VPID_PRINT(node->daemon->name.vpid)));
+            opal_list_prepend(&candidates, item);
+        } else {
+            OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
+                                 "%s APPENDING NON-EMPTY NODE %s[%s] TO CANDIDATES",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 (NULL == node->name) ? "NULL" : node->name,
+                                 (NULL == node->daemon) ? "--" : ORTE_VPID_PRINT(node->daemon->name.vpid)));
+            opal_list_append(&candidates, item);
+        }
+    }
+    /* search the candidates
+     * try to use a semi-intelligent selection logic here that:
+     *
+     * (a) avoids putting the proc on a node where a peer is already
+     *     located as this degrades our fault tolerance
+     *
+     * (b) avoids "ricochet effect" where a process would ping-pong
+     *     between two nodes as it fails
+     */
+    nd = NULL;
+    item = opal_list_get_first(&candidates);
+    while (item != opal_list_get_end(&candidates)) {
+        node = (orte_node_t*)item;
+        next = opal_list_get_next(item);
+        /* don't return to our prior location to avoid
+         * "ricochet" effect
+         */
+        if (NULL != proc->prior_node &&
+            node == proc->prior_node) {
+            OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
+                                 "%s REMOVING PRIOR NODE %s[%s] FROM CANDIDATES",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 (NULL == node->name) ? "NULL" : node->name,
+                                 (NULL == node->daemon) ? "--" : ORTE_VPID_PRINT(node->daemon->name.vpid)));
+            opal_list_remove_item(&candidates, item);
+            OBJ_RELEASE(item);  /* maintain acctg */
+            item = next;
+            continue;
+        }
+        /* if this node is empty, then it is the winner */
+        if (0 == node->num_procs) {
+            nd = node;
+            proc->prior_node = oldnode;
            break;
        }
-        item = next;
+        /* if this node has someone from my job, then skip it
+         * to avoid (a)
+         */
+        found = false;
+        for (j=0; j < node->procs->size; j++) {
+            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
+                continue;
+            }
+            if (pptr->name.jobid == proc->name.jobid) {
+                OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output,
+                                     "%s FOUND PEER %s ON NODE %s[%s]",
+                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                     ORTE_NAME_PRINT(&pptr->name),
+                                     (NULL == node->name) ? "NULL" : node->name,
+                                     (NULL == node->daemon) ? "--" : ORTE_VPID_PRINT(node->daemon->name.vpid)));
+                found = true;
+                break;
+            }
+        }
+        if (found) {
+            item = next;
+            continue;
+        }
+        /* get here if all tests pass - take this node */
+        nd = node;
+        proc->prior_node = oldnode;
+        break;
    }
-    OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
-                         "%s rmaps:resilient: Placing process on node %s daemon %s (no ftgrp)",
-                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                         (nd == oldnode) ? "OLDNODE" : nd->name,
-                         ORTE_NAME_PRINT((&nd->daemon->name))));
+    if (NULL == nd) {
+        /* didn't find anything */
+        if (NULL != proc->prior_node) {
+            nd = proc->prior_node;
+            OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
+                                 "%s rmaps:resilient: Placing process %s on prior node %s[%s] (no ftgrp)",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 ORTE_NAME_PRINT(&proc->name),
+                                 (NULL == nd->name) ? "NULL" : nd->name,
+                                 (NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
+        } else {
+            nd = oldnode;
+            proc->prior_node = oldnode;
+            OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
+                                 "%s rmaps:resilient: Placing process %s back on old node %s[%s] (no ftgrp)",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 ORTE_NAME_PRINT(&proc->name),
+                                 (NULL == nd->name) ? "NULL" : nd->name,
+                                 (NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));
+        }
+
+    }
+    /* cleanup candidate list */
+    while (NULL != (item = opal_list_remove_first(&candidates))) {
+        OBJ_RELEASE(item);
+    }
+    OBJ_DESTRUCT(&candidates);
+
+ release:
+    OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
+                         "%s rmaps:resilient: Placing process on node %s[%s] (no ftgrp)",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         (NULL == nd->name) ? "NULL" : nd->name,
+                         (NULL == nd->daemon) ? "--" : ORTE_VPID_PRINT(nd->daemon->name.vpid)));

- error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
--- a/orte/runtime/orte_globals.c
+++ b/orte/runtime/orte_globals.c
@ -885,6 +885,7 @@ static void orte_proc_construct(orte_proc_t* proc)
    proc->app_idx = 0;
    proc->slot_list = NULL;
    proc->node = NULL;
+    proc->prior_node = NULL;
    proc->nodename = NULL;
    proc->rml_uri = NULL;
    proc->restarts = 0;
--- a/orte/runtime/orte_globals.h
+++ b/orte/runtime/orte_globals.h
@ -473,6 +473,8 @@ struct orte_proc_t {
    char *slot_list;
    /* pointer to the node where this proc is executing */
    orte_node_t *node;
+    /* pointer to the node where this proc last executed */
+    orte_node_t *prior_node;
    /* name of the node where this proc is executing - this
     * is used simply to pass that info to a calling
     * tool since it may not have a node array available