1
1

Do a little better job of catching up on missed mcast messages, and provide a way out of scenarios where catch-up is impossible.

This commit was SVN r24955.
Этот коммит содержится в:
Ralph Castain 2011-07-27 14:58:30 +00:00
родитель c3bc33b3fb
Коммит decab98fb2
4 изменённых файлов: 49 добавлений и 0 удалений

Просмотреть файл

@ -109,6 +109,7 @@ typedef struct {
opal_list_item_t super;
orte_rmcast_channel_t channel;
orte_rmcast_seq_t seq_num;
bool recovering;
} rmcast_seq_tracker_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(rmcast_seq_tracker_t);

Просмотреть файл

@ -420,6 +420,7 @@ static void trk_construct(rmcast_seq_tracker_t *ptr)
{
ptr->channel = ORTE_RMCAST_INVALID_CHANNEL;
ptr->seq_num = ORTE_RMCAST_SEQ_INVALID;
ptr->recovering = false;
}
OBJ_CLASS_INSTANCE(rmcast_seq_tracker_t,
opal_list_item_t,

Просмотреть файл

@ -218,13 +218,25 @@ void orte_rmcast_base_process_msg(orte_rmcast_msg_t *msg)
"%s Repeat msg %d on channel %d from source %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recvd_seq_num, channel,
ORTE_NAME_PRINT(&name)));
goto cleanup;
}
if (1 != (recvd_seq_num - trkr->seq_num) ||
(ORTE_RMCAST_SEQ_MAX == trkr->seq_num && 0 != recvd_seq_num)) {
/* if we are already recovering, don't bother complaining again - this
* lets us drain the pipe of any messages we receive prior to the
* recovery message stream starting. So if (ahem) someone holds us in gdb,
* for example, then we need to jettison all the messages that might have
* stacked up in the interim or else we'll generate a bunch of recovery
* requests.
*/
if (trkr->recovering) {
goto cleanup;
}
/* missing a message - request it */
opal_output(0, "%s Missed msg %d (%d) on channel %d from source %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recvd_seq_num,
trkr->seq_num, channel, ORTE_NAME_PRINT(&name));
trkr->recovering = true;
alert = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &channel, 1, ORTE_RMCAST_CHANNEL_T))) {
ORTE_ERROR_LOG(rc);
@ -246,6 +258,10 @@ void orte_rmcast_base_process_msg(orte_rmcast_msg_t *msg)
channel, ORTE_NAME_PRINT(&log->name), recvd_seq_num));
}
trkr->seq_num = recvd_seq_num;
/* always reset the recovering flag so we will bark if
* another message is lost
*/
trkr->recovering = false;
}
}

Просмотреть файл

@ -1109,6 +1109,18 @@ static void resend_data(int status, orte_process_name_t* sender,
goto release;
}
/* if the channel is UINT32_MAX, then we know that this is
* a response from a sender telling us that our request for
* missing messages is too far behind, so we should just
* abort
*/
if (UINT32_MAX == channel) {
opal_output(0, "%s CANNOT RECOVER FROM LOST MESSAGE - TOO FAR BEHIND - ABORTING",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
orte_errmgr.abort(1, NULL);
goto release;
}
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &start, &n, ORTE_RMCAST_SEQ_T))) {
ORTE_ERROR_LOG(rc);
@ -1125,6 +1137,25 @@ static void resend_data(int status, orte_process_name_t* sender,
goto release;
}
/* see if we can bring the proc up to date - if it is too
* far behind, then there is no hope of recovery
*/
log = (rmcast_send_log_t*)opal_ring_buffer_poke(&ch->cache, 0);
if (NULL == log || start < log->seq_num) {
/* no hope - tell them */
channel = UINT32_MAX;
recover = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (rc = opal_dss.pack(recover, &channel, 1, ORTE_RMCAST_CHANNEL_T))) {
ORTE_ERROR_LOG(rc);
goto release;
}
if (0 > (rc = orte_rml.send_buffer_nb(sender, recover, ORTE_RML_TAG_MULTICAST, 0, cbfunc, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(recover);
}
goto release;
}
/* search its ring buffer for the starting message - function
* automatically starts at the oldest message and works up
* from there