diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 073605d204..ae90079b27 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -898,6 +898,9 @@ static void orte_proc_construct(orte_proc_t* proc) proc->exit_code = 0; /* Assume we won't fail unless otherwise notified */ proc->rml_uri = NULL; proc->restarts = 0; + proc->fast_failures = 0; + proc->last_failure.tv_sec = 0; + proc->last_failure.tv_usec = 0; proc->reported = false; proc->beat = 0; OBJ_CONSTRUCT(&proc->stats, opal_pstats_t); diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index ff51640888..3edf2fee98 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -488,6 +488,10 @@ struct orte_proc_t { char *rml_uri; /* number of times this process has been restarted */ int32_t restarts; + /* time of last failure (NOTE(review): previously described as "last restart" - confirm where this is set) */ + struct timeval last_failure; + /* number of failures in "fast" window */ + int32_t fast_failures; /* flag to indicate proc has reported in */ bool reported; /* if heartbeat recvd during last time period */