1
1

Add a verbose debug option, and add some error prints in the ESS ft_event code.

This commit was SVN r22003.
Этот коммит содержится в:
Josh Hursey 2009-09-23 17:05:49 +00:00
родитель 2769091261
Коммит a6ee73156c
2 изменённых файлов: 19 добавлений и 0 удалений

15
orte/mca/ess/env/ess_env_module.c поставляемый
Просмотреть файл

@ -422,6 +422,7 @@ static int rte_ft_event(int state)
* Notify SnapC * Notify SnapC
*/ */
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT))) { if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT))) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -430,6 +431,7 @@ static int rte_ft_event(int state)
* Notify Routed * Notify Routed
*/ */
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CHECKPOINT))) { if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CHECKPOINT))) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -438,6 +440,7 @@ static int rte_ft_event(int state)
* Notify RML -> OOB * Notify RML -> OOB
*/ */
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) { if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -448,6 +451,7 @@ static int rte_ft_event(int state)
* Notify RML -> OOB * Notify RML -> OOB
*/ */
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) { if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -456,6 +460,7 @@ static int rte_ft_event(int state)
* Notify Routed * Notify Routed
*/ */
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CONTINUE))) { if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CONTINUE))) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -464,6 +469,7 @@ static int rte_ft_event(int state)
* Notify SnapC * Notify SnapC
*/ */
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CONTINUE))) { if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CONTINUE))) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -490,6 +496,7 @@ static int rte_ft_event(int state)
* Notify RML -> OOB * Notify RML -> OOB
*/ */
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) { if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -501,11 +508,13 @@ static int rte_ft_event(int state)
svtype = orte_process_info.proc_type; svtype = orte_process_info.proc_type;
orte_process_info.proc_type = ORTE_PROC_TOOL; orte_process_info.proc_type = ORTE_PROC_TOOL;
if (ORTE_SUCCESS != (ret = orte_routed.finalize()) ) { if (ORTE_SUCCESS != (ret = orte_routed.finalize()) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
orte_process_info.proc_type = svtype; orte_process_info.proc_type = svtype;
if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) { if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -515,10 +524,12 @@ static int rte_ft_event(int state)
*/ */
orte_grpcomm.finalize(); orte_grpcomm.finalize();
if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) { if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
if (ORTE_SUCCESS != (ret = orte_grpcomm.purge_proc_attrs())) { if (ORTE_SUCCESS != (ret = orte_grpcomm.purge_proc_attrs())) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -542,6 +553,7 @@ static int rte_ft_event(int state)
* RML - Enable communications * RML - Enable communications
*/ */
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -564,6 +576,7 @@ static int rte_ft_event(int state)
* Notify Routed * Notify Routed
*/ */
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_RESTART))) { if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_RESTART))) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -572,6 +585,7 @@ static int rte_ft_event(int state)
* Notify SnapC * Notify SnapC
*/ */
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_RESTART))) { if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_RESTART))) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
@ -585,6 +599,7 @@ static int rte_ft_event(int state)
* of the program across checkpoints * of the program across checkpoints
*/ */
if( ORTE_SUCCESS != (ret = ess_env_ft_event_update_process_info(orte_process_info.my_name, getpid())) ) { if( ORTE_SUCCESS != (ret = ess_env_ft_event_update_process_info(orte_process_info.my_name, getpid())) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }

Просмотреть файл

@ -531,6 +531,10 @@ static void snapc_full_local_process_app_update_cmd(int fd, short event, void *c
goto cleanup; goto cleanup;
} }
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
"Local) Updated PID: %s : %d -> %d",
ORTE_NAME_PRINT(&vpid_snapshot->super.process_name), vpid_snapshot->process_pid, proc_pid));
/* JJH: Maybe we should save the old and the newly restarted pid? */ /* JJH: Maybe we should save the old and the newly restarted pid? */
vpid_snapshot->process_pid = proc_pid; vpid_snapshot->process_pid = proc_pid;