Further cleanup of orte-ps so it doesn't abort when hitting a stale HNP - only report that event once and just keep working.
Refs trac:3992. This commit was SVN r29974. The following Trac ticket was referenced above: Ticket 3992 --> https://svn.open-mpi.org/trac/ompi/ticket/3992
Этот коммит содержится в:
родитель
bf5e314f76
Коммит
6239e64f36
@ -1129,7 +1129,6 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
|
|||||||
/* get the peer object */
|
/* get the peer object */
|
||||||
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_tcp_component.peers,
|
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_tcp_component.peers,
|
||||||
ui64, (void**)&pr) || NULL == pr) {
|
ui64, (void**)&pr) || NULL == pr) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -37,8 +37,8 @@ Error: You specified a jobid (%d) without also specifying a vpid.
|
|||||||
Error: The specified vpid (%d) is not valid for job %d.
|
Error: The specified vpid (%d) is not valid for job %d.
|
||||||
#
|
#
|
||||||
[stale-hnp]
|
[stale-hnp]
|
||||||
An attempt was made to obtain ps information from a non-responsive
|
An attempt was made to obtain ps information from at least
|
||||||
HNP:
|
one non-responsive HNP:
|
||||||
|
|
||||||
HNP name: %s
|
HNP name: %s
|
||||||
|
|
||||||
|
@ -223,7 +223,8 @@ main(int argc, char *argv[])
|
|||||||
opal_list_t hnp_list;
|
opal_list_t hnp_list;
|
||||||
opal_list_item_t* item = NULL;
|
opal_list_item_t* item = NULL;
|
||||||
orte_ps_mpirun_info_t hnpinfo;
|
orte_ps_mpirun_info_t hnpinfo;
|
||||||
|
bool reported = false;
|
||||||
|
|
||||||
/***************
|
/***************
|
||||||
* Initialize
|
* Initialize
|
||||||
***************/
|
***************/
|
||||||
@ -277,9 +278,10 @@ main(int argc, char *argv[])
|
|||||||
/* this could be due to a stale session directory - if so,
|
/* this could be due to a stale session directory - if so,
|
||||||
* just skip this entry, but don't abort
|
* just skip this entry, but don't abort
|
||||||
*/
|
*/
|
||||||
if (ORTE_ERR_SILENT == ret) {
|
if (!reported && ORTE_ERR_SILENT == ret) {
|
||||||
orte_show_help("help-orte-ps.txt", "stale-hnp", true,
|
orte_show_help("help-orte-ps.txt", "stale-hnp", true,
|
||||||
ORTE_NAME_PRINT(&(hnpinfo.hnp->name)));
|
ORTE_NAME_PRINT(&(hnpinfo.hnp->name)));
|
||||||
|
reported = true;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
|
@ -56,6 +56,10 @@ static void quicktime_cb(int fd, short event, void *cbdata)
|
|||||||
opal_event_free(quicktime);
|
opal_event_free(quicktime);
|
||||||
quicktime = NULL;
|
quicktime = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* cancel the recv */
|
||||||
|
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_TOOL);
|
||||||
|
|
||||||
error_exit = ORTE_ERR_SILENT;
|
error_exit = ORTE_ERR_SILENT;
|
||||||
/* declare it fired */
|
/* declare it fired */
|
||||||
timer_fired = true;
|
timer_fired = true;
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user