1
1

Further cleanup of orte-ps so that it doesn't abort when it hits a stale HNP: report that event only once and keep working.

Refs trac:3992

This commit was SVN r29974.

The following Trac tickets were found above:
  Ticket 3992 --> https://svn.open-mpi.org/trac/ompi/ticket/3992
Этот коммит содержится в:
Ralph Castain 2013-12-19 03:28:05 +00:00
родитель bf5e314f76
Коммит 6239e64f36
4 изменённых файла: 10 добавлений и 5 удалений

Просмотреть файл

@ -1129,7 +1129,6 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
/* get the peer object */ /* get the peer object */
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_tcp_component.peers, if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_tcp_component.peers,
ui64, (void**)&pr) || NULL == pr) { ui64, (void**)&pr) || NULL == pr) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
goto cleanup; goto cleanup;
} }

Просмотреть файл

@ -37,8 +37,8 @@ Error: You specified a jobid (%d) without also specifying a vpid.
Error: The specified vpid (%d) is not valid for job %d. Error: The specified vpid (%d) is not valid for job %d.
# #
[stale-hnp] [stale-hnp]
An attempt was made to obtain ps information from a non-responsive An attempt was made to obtain ps information from at least
HNP: one non-responsive HNP:
HNP name: %s HNP name: %s

Просмотреть файл

@ -223,7 +223,8 @@ main(int argc, char *argv[])
opal_list_t hnp_list; opal_list_t hnp_list;
opal_list_item_t* item = NULL; opal_list_item_t* item = NULL;
orte_ps_mpirun_info_t hnpinfo; orte_ps_mpirun_info_t hnpinfo;
bool reported = false;
/*************** /***************
* Initialize * Initialize
***************/ ***************/
@ -277,9 +278,10 @@ main(int argc, char *argv[])
/* this could be due to a stale session directory - if so, /* this could be due to a stale session directory - if so,
* just skip this entry, but don't abort * just skip this entry, but don't abort
*/ */
if (ORTE_ERR_SILENT == ret) { if (!reported && ORTE_ERR_SILENT == ret) {
orte_show_help("help-orte-ps.txt", "stale-hnp", true, orte_show_help("help-orte-ps.txt", "stale-hnp", true,
ORTE_NAME_PRINT(&(hnpinfo.hnp->name))); ORTE_NAME_PRINT(&(hnpinfo.hnp->name)));
reported = true;
continue; continue;
} }
goto cleanup; goto cleanup;

Просмотреть файл

@ -56,6 +56,10 @@ static void quicktime_cb(int fd, short event, void *cbdata)
opal_event_free(quicktime); opal_event_free(quicktime);
quicktime = NULL; quicktime = NULL;
} }
/* cancel the recv */
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_TOOL);
error_exit = ORTE_ERR_SILENT; error_exit = ORTE_ERR_SILENT;
/* declare it fired */ /* declare it fired */
timer_fired = true; timer_fired = true;