Do a little better job of catching up on missed mcast messages, and provide a way out of scenarios where catch-up is impossible.

This commit was SVN r24955.
2011-07-27 14:58:30 +00:00 · 2011-07-27 14:58:30 +00:00 · decab98fb2
--- a/orte/mca/rmcast/base/private.h
+++ b/orte/mca/rmcast/base/private.h
@ -109,6 +109,7 @@ typedef struct {
    opal_list_item_t super;
    orte_rmcast_channel_t channel;
    orte_rmcast_seq_t seq_num;
+    bool recovering;
 } rmcast_seq_tracker_t;
 ORTE_DECLSPEC OBJ_CLASS_DECLARATION(rmcast_seq_tracker_t);

--- a/orte/mca/rmcast/base/rmcast_base_open.c
+++ b/orte/mca/rmcast/base/rmcast_base_open.c
@ -420,6 +420,7 @@ static void trk_construct(rmcast_seq_tracker_t *ptr)
 {
    ptr->channel = ORTE_RMCAST_INVALID_CHANNEL;
    ptr->seq_num = ORTE_RMCAST_SEQ_INVALID;
+    ptr->recovering = false;
 }
 OBJ_CLASS_INSTANCE(rmcast_seq_tracker_t,
                   opal_list_item_t,
--- a/orte/mca/rmcast/base/rmcast_base_threads.c
+++ b/orte/mca/rmcast/base/rmcast_base_threads.c
@ -218,13 +218,25 @@ void orte_rmcast_base_process_msg(orte_rmcast_msg_t *msg)
                                         "%s Repeat msg %d on channel %d from source %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recvd_seq_num, channel,
                                         ORTE_NAME_PRINT(&name)));
+                    goto cleanup;
                }
                if (1 != (recvd_seq_num - trkr->seq_num) ||
                    (ORTE_RMCAST_SEQ_MAX == trkr->seq_num && 0 != recvd_seq_num)) {
+                    /* if we are already recovering, don't bother complaining again - this
+                     * lets us drain the pipe of any messages we receive prior to the
+                     * recovery message stream starting. So if (ahem) someone holds us in gdb,
+                     * for example, then we need to jettison all the messages that might have
+                     * stacked up in the interim or else we'll generate a bunch of recovery
+                     * requests.
+                     */
+                    if (trkr->recovering) {
+                        goto cleanup;
+                    }
                    /* missing a message - request it */
                    opal_output(0, "%s Missed msg %d (%d) on channel %d from source %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recvd_seq_num,
                                trkr->seq_num, channel, ORTE_NAME_PRINT(&name));
+                    trkr->recovering = true;
                    alert = OBJ_NEW(opal_buffer_t);
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &channel, 1, ORTE_RMCAST_CHANNEL_T))) {
                        ORTE_ERROR_LOG(rc);
@ -246,6 +258,10 @@ void orte_rmcast_base_process_msg(orte_rmcast_msg_t *msg)
                                     channel, ORTE_NAME_PRINT(&log->name), recvd_seq_num));
            }
            trkr->seq_num = recvd_seq_num;
+            /* always reset the recovering flag so we will bark if
+             * another message is lost
+             */
+            trkr->recovering = false;
        }
    }

--- a/orte/mca/rmcast/udp/rmcast_udp.c
+++ b/orte/mca/rmcast/udp/rmcast_udp.c
@ -1109,6 +1109,18 @@ static void resend_data(int status, orte_process_name_t* sender,
        goto release;
    }

+    /* if the channel is UINT32_MAX, then we know that this is
+     * a response from a sender telling us that our request for
+     * missing messages is too far behind, so we should just
+     * abort
+     */
+    if (UINT32_MAX == channel) {
+        opal_output(0, "%s CANNOT RECOVER FROM LOST MESSAGE - TOO FAR BEHIND - ABORTING",
+                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
+        orte_errmgr.abort(1, NULL);
+        goto release;
+    }
+
    n=1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &start, &n, ORTE_RMCAST_SEQ_T))) {
        ORTE_ERROR_LOG(rc);
@ -1125,6 +1137,25 @@ static void resend_data(int status, orte_process_name_t* sender,
        goto release;
    }

+    /* see if we can bring the proc up to date - if it is too
+     * far behind, then there is no hope of recovery
+     */
+    log = (rmcast_send_log_t*)opal_ring_buffer_poke(&ch->cache, 0);
+    if (NULL == log || start < log->seq_num) {
+        /* no hope - tell them */
+        channel = UINT32_MAX;
+        recover = OBJ_NEW(opal_buffer_t);
+        if (ORTE_SUCCESS != (rc = opal_dss.pack(recover, &channel, 1, ORTE_RMCAST_CHANNEL_T))) {
+            ORTE_ERROR_LOG(rc);
+            goto release;
+        }
+        if (0 > (rc = orte_rml.send_buffer_nb(sender, recover, ORTE_RML_TAG_MULTICAST, 0, cbfunc, NULL))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_RELEASE(recover);
+        }
+        goto release;
+    }
+
    /* search its ring buffer for the starting message - function
     * automatically starts at the oldest message and works up
     * from there