/* * Copyright (c) 2009-2011 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2011 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "orte_config.h" #include #ifdef HAVE_UNISTD_H #include #endif /* HAVE_UNISTD_H */ #ifdef HAVE_STRING_H #include #endif #ifdef HAVE_SYS_WAIT_H #include #endif #include "opal/util/output.h" #include "opal/dss/dss.h" #include "orte/mca/rml/rml.h" #include "orte/mca/odls/odls.h" #include "orte/mca/odls/base/base.h" #include "orte/mca/odls/base/odls_private.h" #include "orte/mca/plm/base/plm_private.h" #include "orte/mca/plm/plm.h" #include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/sensor/sensor.h" #include "orte/mca/routed/routed.h" #include "orte/mca/debugger/base/base.h" #include "orte/mca/notifier/notifier.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/ess/ess.h" #include "orte/util/error_strings.h" #include "orte/util/name_fns.h" #include "orte/util/proc_info.h" #include "orte/util/show_help.h" #include "orte/util/nidmap.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_locks.h" #include "orte/runtime/orte_quit.h" #include "orte/runtime/data_type_support/orte_dt_support.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h" #include "errmgr_hnp.h" /********************** * C/R Mgr Components * Global: HNP **********************/ static orte_errmgr_base_module_t global_module = { /** Initialization Function */ orte_errmgr_hnp_global_module_init, /** Finalization Function */ orte_errmgr_hnp_global_module_finalize, /** Error Log */ orte_errmgr_base_log, /** Forced Abort */ orte_errmgr_base_abort, /** Peer Force Abort */ orte_errmgr_base_abort_peers, /** Update State */ orte_errmgr_hnp_global_update_state, /* Predicted Fault */ orte_errmgr_hnp_global_predicted_fault, /* Suggest proc to node mapping */ orte_errmgr_hnp_global_suggest_map_targets, /* FT Event hook */ orte_errmgr_hnp_global_ft_event, orte_errmgr_base_register_migration_warning #if ORTE_RESIL_ORTE /* Set the callback */ ,orte_errmgr_base_set_fault_callback #endif }; /* * Local functions */ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code); static void failed_start(orte_job_t *jdata); static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, orte_proc_state_t state, orte_exit_code_t exit_code); static void check_job_complete(orte_job_t *jdata); static void killprocs(orte_jobid_t job, orte_vpid_t vpid); static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, orte_proc_state_t state, orte_exit_code_t exit_code); static orte_odls_child_t* proc_is_local(orte_process_name_t *proc); #if ORTE_RESIL_ORTE static int send_to_local_applications(opal_pointer_array_t *dead_names); static void failure_notification(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void* cbdata); #endif /************************ * API Definitions ************************/ int orte_errmgr_hnp_component_query(mca_base_module_t **module, int *priority) { opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, "errmgr:hnp:component_query()"); if( ORTE_PROC_IS_HNP ) { *priority = mca_errmgr_hnp_component.super.priority; *module = (mca_base_module_t *)&global_module; } /* Daemons and Apps have their own components */ else { *module = NULL; *priority = -1; } return ORTE_SUCCESS; } /******************* * Global Functions ********************/ int orte_errmgr_hnp_global_module_init(void) { int ret, exit_status = ORTE_SUCCESS; #if OPAL_ENABLE_FT_CR if( mca_errmgr_hnp_component.crmig_enabled ) { if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_module_init()) ) { exit_status = ret; goto cleanup; } } else { /* Still need the tool listener so we can tell it that we cannot do * anything if they ask. */ if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_init()) ) { ORTE_ERROR_LOG(ret); return ret; } } if( mca_errmgr_hnp_component.autor_enabled ) { if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_module_init()) ) { exit_status = ret; goto cleanup; } } #endif /* OPAL_ENABLE_FT_CR */ if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_init()) ) { exit_status = ret; goto cleanup; } cleanup: return exit_status; } int orte_errmgr_hnp_global_module_finalize(void) { int ret, exit_status = ORTE_SUCCESS; #if OPAL_ENABLE_FT_CR if( mca_errmgr_hnp_component.crmig_enabled ) { if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_module_finalize()) ) { exit_status = ret; goto cleanup; } } else { /* Still need the tool listener so we can tell it that we cannot do * anything if they ask. */ if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_finalize()) ) { ORTE_ERROR_LOG(ret); return ret; } } if( mca_errmgr_hnp_component.autor_enabled ) { if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_module_finalize()) ) { exit_status = ret; goto cleanup; } } #endif /* OPAL_ENABLE_FT_CR */ if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_finalize()) ) { exit_status = ret; goto cleanup; } cleanup: return exit_status; } int orte_errmgr_hnp_global_update_state(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc_name, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code) { int ret, exit_status = ORTE_SUCCESS; mca_errmgr_hnp_component.ignore_current_update = false; if (orte_finalizing || orte_job_term_ordered || ORTE_PROC_STATE_TERMINATED == state ) { mca_errmgr_hnp_component.term_in_progress = true; } OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, "errmgr:hnp:update_state() %s) " "------- %s state updated for process %s to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ((NULL == proc_name) ? "App. Process" : (proc_name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")), (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name), orte_proc_state_to_str(state))); #if OPAL_ENABLE_FT_CR if( mca_errmgr_hnp_component.crmig_enabled && !mca_errmgr_hnp_component.autor_in_progress) { if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_update_state(job, jobstate, proc_name, state, pid, exit_code)) ) { exit_status = ret; goto cleanup; } } if( mca_errmgr_hnp_component.autor_enabled && !mca_errmgr_hnp_component.crmig_in_progress) { if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_update_state(job, jobstate, proc_name, state, pid, exit_code)) ) { exit_status = ret; goto cleanup; } } #endif /* OPAL_ENABLE_FT_CR */ if( !mca_errmgr_hnp_component.ignore_current_update ) { if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_update_state(job, jobstate, proc_name, state, pid, exit_code)) ) { exit_status = ret; goto cleanup; } } cleanup: return exit_status; } int orte_errmgr_hnp_global_predicted_fault(opal_list_t *proc_list, opal_list_t *node_list, opal_list_t *suggested_map) { #if OPAL_ENABLE_FT_CR int ret, exit_status = ORTE_SUCCESS; if( mca_errmgr_hnp_component.crmig_enabled ) { if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_predicted_fault(proc_list, node_list, suggested_map)) ) { exit_status = ret; goto cleanup; } } /* * If Process migration is not enabled, then return an error the tool * which will print an appropriate message for the user. */ else { OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, "errmgr:hnp:predicted_fault() Command line asked for a migration, but it is not enabled\n")); orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERROR); exit_status = ORTE_ERR_NOT_IMPLEMENTED; goto cleanup; } cleanup: return exit_status; #else return ORTE_ERR_NOT_IMPLEMENTED; #endif /* OPAL_ENABLE_FT_CR */ } int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list) { #if OPAL_ENABLE_FT_CR int ret, exit_status = ORTE_ERR_NOT_IMPLEMENTED; if( mca_errmgr_hnp_component.crmig_enabled && !mca_errmgr_hnp_component.autor_in_progress ) { exit_status = ORTE_SUCCESS; if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_suggest_map_targets(proc, oldnode, node_list)) ) { exit_status = ret; goto cleanup; } } if( mca_errmgr_hnp_component.autor_enabled && !mca_errmgr_hnp_component.crmig_in_progress ) { exit_status = ORTE_SUCCESS; if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_suggest_map_targets(proc, oldnode, node_list)) ) { exit_status = ret; goto cleanup; } } cleanup: return exit_status; #else return ORTE_ERR_NOT_IMPLEMENTED; #endif /* OPAL_ENABLE_FT_CR */ } int orte_errmgr_hnp_global_ft_event(int state) { int ret, exit_status = ORTE_SUCCESS; #if OPAL_ENABLE_FT_CR if( !mca_errmgr_hnp_component.crmig_enabled ) { if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_ft_event(state)) ) { exit_status = ret; goto cleanup; } } if( !mca_errmgr_hnp_component.autor_enabled ) { if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_ft_event(state)) ) { exit_status = ret; goto cleanup; } } #endif /* OPAL_ENABLE_FT_CR */ if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_ft_event(state)) ) { exit_status = ret; goto cleanup; } cleanup: return exit_status; } /********************** * From HNP **********************/ int orte_errmgr_hnp_base_global_init(void) { int ret = ORTE_SUCCESS; #if ORTE_RESIL_ORTE ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE, ORTE_RML_PERSISTENT, failure_notification, NULL); #endif return ret; } int orte_errmgr_hnp_base_global_finalize(void) { #if ORTE_RESIL_ORTE orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE); #endif return ORTE_SUCCESS; } int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code) { orte_job_t *jdata; orte_exit_code_t sts; orte_odls_child_t *child; int rc; orte_app_context_t *app; orte_proc_t *pdat; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp: job %s reported state %s" " for proc %s state %s pid %d exit_code %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), orte_job_state_to_str(jobstate), (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state), pid, exit_code)); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { return ORTE_SUCCESS; } if (NULL == proc) { /* this is an update for an entire local job */ if (ORTE_JOBID_INVALID == job) { /* whatever happened, we don't know what job * it happened to */ if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate) { orte_never_launched = true; } orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:unknown-job-error", true, orte_job_state_to_str(jobstate)); hnp_abort(job, exit_code); return ORTE_SUCCESS; } /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(job))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } /* update the state */ jdata->state = jobstate; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp: job %s reported state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), orte_job_state_to_str(jobstate))); switch (jobstate) { case ORTE_JOB_STATE_TERMINATED: /* support batch-operated jobs */ update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_TERMINATED, 0); jdata->num_terminated = jdata->num_procs; check_job_complete(jdata); break; case ORTE_JOB_STATE_ABORTED: /* support batch-operated jobs */ update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_ABORTED, exit_code); jdata->num_terminated = jdata->num_procs; check_job_complete(jdata); break; case ORTE_JOB_STATE_FAILED_TO_START: failed_start(jdata); check_job_complete(jdata); /* set the local proc states */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. If it isn't * NULL, then we need to tell everyone else to die */ if (NULL != (jdata = orte_get_job_data_object(job))) { sts = exit_code; if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) { /* set the flag indicating that a daemon failed so we use the proper * methods for attempting to shutdown the rest of the system */ orte_abnormal_term_ordered = true; if (WIFSIGNALED(exit_code)) { /* died on signal */ #ifdef WCOREDUMP if (WCOREDUMP(exit_code)) { orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true, WTERMSIG(exit_code)); sts = WTERMSIG(exit_code); } else { orte_show_help("help-plm-base.txt", "daemon-died-signal", true, WTERMSIG(exit_code)); sts = WTERMSIG(exit_code); } #else orte_show_help("help-plm-base.txt", "daemon-died-signal", true, WTERMSIG(exit_code)); sts = WTERMSIG(exit_code); #endif /* WCOREDUMP */ } else { orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true, WEXITSTATUS(exit_code)); sts = WEXITSTATUS(exit_code); } } hnp_abort(jdata->jobid, sts); } break; case ORTE_JOB_STATE_SILENT_ABORT: failed_start(jdata); check_job_complete(jdata); /* set the local proc states */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. If it isn't * NULL, then we need to tell everyone else to die */ if (NULL != (jdata = orte_get_job_data_object(job))) { if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) { /* set the flag indicating that a daemon failed so we use the proper * methods for attempting to shutdown the rest of the system */ orte_abnormal_term_ordered = true; } hnp_abort(jdata->jobid, exit_code); } break; case ORTE_JOB_STATE_RUNNING: /* update all procs in job */ update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0); /* record that we reported */ jdata->num_daemons_reported++; /* report if requested */ if (orte_report_launch_progress) { if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) { opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs", (int)jdata->num_daemons_reported, (int)orte_process_info.num_procs, (int)jdata->num_launched, (int)jdata->num_procs); } } break; case ORTE_JOB_STATE_NEVER_LAUNCHED: orte_never_launched = true; jdata->num_terminated = jdata->num_procs; check_job_complete(jdata); /* set the local proc states */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. If it isn't * NULL, then we need to tell everyone else to die */ if (NULL != (jdata = orte_get_job_data_object(job))) { hnp_abort(jdata->jobid, exit_code); } break; case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: /* update all procs in job */ update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, exit_code); /* order all local procs for this job to be killed */ killprocs(jdata->jobid, ORTE_VPID_WILDCARD); check_job_complete(jdata); /* set the local proc states */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. If it isn't * NULL, then we need to tell everyone else to die */ if (NULL != (jdata = orte_get_job_data_object(job))) { hnp_abort(jdata->jobid, exit_code); } break; case ORTE_JOB_STATE_COMM_FAILED: /* order all local procs for this job to be killed */ killprocs(jdata->jobid, ORTE_VPID_WILDCARD); check_job_complete(jdata); /* set the local proc states */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. If it isn't * NULL, then we need to tell everyone else to die */ if (NULL != (jdata = orte_get_job_data_object(job))) { hnp_abort(jdata->jobid, exit_code); } break; case ORTE_JOB_STATE_HEARTBEAT_FAILED: /* order all local procs for this job to be killed */ killprocs(jdata->jobid, ORTE_VPID_WILDCARD); check_job_complete(jdata); /* set the local proc states */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. If it isn't * NULL, then we need to tell everyone else to die */ if (NULL != (jdata = orte_get_job_data_object(job))) { hnp_abort(jdata->jobid, exit_code); } break; default: break; } return ORTE_SUCCESS; } /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* if the orteds are terminating, check job complete */ if (orte_orteds_term_ordered) { opal_output(0, "TERM ORDERED - CHECKING COMPLETE"); check_job_complete(NULL); return ORTE_SUCCESS; } else { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } } #if OPAL_ENABLE_FT_CR /* Notify the process state to the notifier framework if it is active and selected. */ orte_errmgr_base_proc_state_notify(state, proc); #endif /* update is for a specific proc */ switch (state) { case ORTE_PROC_STATE_ABORTED: case ORTE_PROC_STATE_ABORTED_BY_SIG: case ORTE_PROC_STATE_TERM_WO_SYNC: if( jdata->enable_recovery ) { /* is this a local proc */ if (NULL != (child = proc_is_local(proc))) { /* local proc - see if it has reached its restart limit */ app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); if (child->restarts < app->max_restarts) { child->restarts++; if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { return ORTE_SUCCESS; } /* reset the child's state as restart_proc would * have cleared it */ child->state = state; /* see if we can relocate it somewhere else */ if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { return ORTE_SUCCESS; } /* let it fall thru to abort */ } } else { /* this is a remote process - see if we can relocate it */ if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { return ORTE_SUCCESS; } /* guess not - let it fall thru to abort */ } } orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); check_job_complete(jdata); /* need to set the job state */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. If it isn't * NULL, then we need to tell everyone else to die */ if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { hnp_abort(jdata->jobid, exit_code); } break; case ORTE_PROC_STATE_FAILED_TO_START: case ORTE_PROC_STATE_CALLED_ABORT: orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); check_job_complete(jdata); /* the job object for this job will have been NULL'd * in the array if the job was solely local. If it isn't * NULL, then we need to tell everyone else to die */ if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { hnp_abort(jdata->jobid, exit_code); } break; case ORTE_PROC_STATE_REGISTERED: case ORTE_PROC_STATE_RUNNING: orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); break; case ORTE_PROC_STATE_LAUNCHED: /* record the pid for this child */ orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); break; case ORTE_PROC_STATE_TERMINATED: case ORTE_PROC_STATE_TERM_NON_ZERO: case ORTE_PROC_STATE_KILLED_BY_CMD: orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); check_job_complete(jdata); break; case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: if (jdata->enable_recovery) { killprocs(proc->jobid, proc->vpid); /* is this a local proc */ if (NULL != (child = proc_is_local(proc))) { /* local proc - see if it has reached its restart limit */ app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); if (child->restarts < app->max_restarts) { child->restarts++; if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { return ORTE_SUCCESS; } /* reset the child's state as restart_proc would * have cleared it */ child->state = state; /* see if we can relocate it somewhere else */ if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { return ORTE_SUCCESS; } /* let it fall thru to abort */ } } else { /* this is a remote process - see if we can relocate it */ if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { return ORTE_SUCCESS; } /* guess not - let it fall thru to abort */ } } /* kill all jobs */ orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); check_job_complete(jdata); /* need to set the job state */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. If it isn't * NULL, then we need to tell everyone else to die */ if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { hnp_abort(jdata->jobid, exit_code); } break; case ORTE_PROC_STATE_COMM_FAILED: /* is this to a daemon? */ if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { /* if this is my own connection, ignore it */ if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s My own connection - ignoring it", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); break; } /* if we have ordered orteds to terminate, record it */ if (orte_orteds_term_ordered) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s Daemons terminating - recording daemon %s as gone", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* remove from dependent routes, if it is one */ orte_routed.route_lost(proc); /* update daemon job */ orte_errmgr_hnp_record_dead_process(proc); /* We'll check if the job was complete when we get the * message back from the HNP notifying us of the dead * process */ check_job_complete(jdata); break; } /* if abort is in progress, see if this one failed to tell * us it had terminated */ if (orte_abnormal_term_ordered) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s Abort in progress - recording daemon %s as gone", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* remove from dependent routes, if it is one */ orte_routed.route_lost(proc); /* update daemon job */ orte_errmgr_hnp_record_dead_process(proc); /* We'll check if the job was complete when we get the * message back from the HNP notifying us of the dead * process */ check_job_complete(jdata); break; } /* remove from dependent routes, if it is one */ orte_routed.route_lost(proc); /* delete the route */ orte_routed.delete_route(proc); /* purge the oob */ orte_rml.purge(proc); if( orte_enable_recovery ) { /* relocate its processes */ if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) { /* unable to relocate for some reason */ opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); /* kill all local procs */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* kill all jobs */ hnp_abort(ORTE_JOBID_WILDCARD, exit_code); /* check if all is complete so we can terminate */ check_job_complete(jdata); } } else { #if !ORTE_RESIL_ORTE if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true, ORTE_VPID_PRINT(proc->vpid), "Unknown"); } else { orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true, ORTE_VPID_PRINT(proc->vpid), (NULL == pdat->node) ? "Unknown" : ((NULL == pdat->node->name) ? "Unknown" : pdat->node->name)); } #endif if (ORTE_SUCCESS != orte_errmgr_hnp_record_dead_process(proc)) { /* The process is already dead so don't keep trying to do * this stuff. */ return ORTE_SUCCESS; } #if !ORTE_RESIL_ORTE /* kill all local procs */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* kill all jobs */ hnp_abort(ORTE_JOBID_WILDCARD, exit_code); #endif /* We'll check if the job was complete when we get the * message back from the HNP notifying us of the dead * process */ check_job_complete(jdata); } } break; case ORTE_PROC_STATE_HEARTBEAT_FAILED: /* heartbeats are only from daemons */ if( orte_enable_recovery ) { /* relocate its processes */ } else { orte_errmgr_hnp_record_dead_process(proc); /* kill all local procs */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* kill all jobs */ hnp_abort(ORTE_JOBID_WILDCARD, exit_code); return ORTE_ERR_UNRECOVERABLE; } break; default: break; } return ORTE_SUCCESS; } int orte_errmgr_hnp_base_global_ft_event(int state) { return ORTE_SUCCESS; } #if ORTE_RESIL_ORTE static void failure_notification(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void* cbdata) { orte_std_cntr_t n; int ret = ORTE_SUCCESS, num_failed; opal_pointer_array_t *dead_names; int32_t i; orte_process_name_t *name_item; orte_epoch_t epoch; orte_job_t *jdat; orte_proc_t *pdat, *pdat2; opal_buffer_t *answer; /* If processes have started terminating, don't worry about reported * failures. The ORTEDs don't know the difference. */ if (mca_errmgr_hnp_component.term_in_progress) { return; } if (orte_debug_daemons_flag) { opal_output(0, "%s errmgr:hnp HNP received process failed from orted %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender)); } n = 1; /* Get the number of failed procs */ if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) { ORTE_ERROR_LOG(ret); return; } dead_names = OBJ_NEW(opal_pointer_array_t); for (i = 0; i < num_failed; i++) { name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); /* Unpack the buffer to get the dead process' name. */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) { ORTE_ERROR_LOG(ret); return; } /* Check to see if the message is telling us about an old epoch. * If so ignore the message. */ epoch = orte_util_lookup_epoch(name_item); if (name_item->epoch < epoch) { if (orte_debug_daemons_flag) { opal_output(0, "%s errmgr:hnp HNP ignoring duplicate notification for %s failure (reported epoch: %s local epoch: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(name_item), ORTE_EPOCH_PRINT(name_item->epoch), ORTE_EPOCH_PRINT(epoch)); } free(name_item); continue; } else { if (orte_debug_daemons_flag) { opal_output(0, "%s errmgr:hnp HNP received notification for %s failure (reported epoch: %s local epoch: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(name_item), ORTE_EPOCH_PRINT(name_item->epoch), ORTE_EPOCH_PRINT(epoch)); } } opal_pointer_array_add(dead_names, name_item); /* Check to see if the message is telling us about an orted and * it is from another orted. Orteds don't have the list of all * the application processes so they don't know if there were * any child processes on the nodes that they are reporting. */ if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, sender, ORTE_PROC_MY_NAME)) { if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { continue; } else if (NULL == (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid))) { continue; } else if (NULL == pdat->node) { continue; } if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { if (NULL == (pdat2 = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { continue; } /* ignore this process if it has already terminated */ if (ORTE_PROC_STATE_TERMINATED <= pdat2->state) { continue; } /* the proc must have been alive, so notify everyone that it died */ name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); name_item->jobid = pdat2->name.jobid; name_item->vpid = pdat2->name.vpid; name_item->epoch = orte_util_lookup_epoch(&(pdat2->name)); opal_pointer_array_add(dead_names, name_item); } } } } /* Update the number of failed process so any duplicates don't get * re-reported. */ num_failed = opal_pointer_array_get_size(dead_names); if (num_failed > 0) { orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); if (!orte_orteds_term_ordered) { /* Send a message out to all the orteds to inform them that the * process is dead. Long live the process (or not if it is so * decided)! */ answer = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_failed, 1, ORTE_VPID))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); return; } for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) { if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, name_item, 1, ORTE_NAME))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); return; } } } if (ORTE_SUCCESS != (ret = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, answer, ORTE_RML_TAG_FAILURE_NOTICE))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); return; } /* Tell the applications' ORTE layers that there is a failure. */ if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) { return; } } for (i = 0; i < num_failed; i++) { name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i); free(name_item); } } OBJ_RELEASE(dead_names); } #endif /***************** * Local Functions *****************/ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) { int rc; /* if we are already in progress, then ignore this call */ if (opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp: abort in progress, ignoring abort on job %s with status %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), exit_code)); return; } OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp: abort called on job %s with status %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), exit_code)); /* if debuggers are running, clean up */ orte_debugger.finalize(); /* set control params to indicate we are terminating */ orte_job_term_ordered = true; orte_abnormal_term_ordered = true; orte_enable_recovery = false; /* set the exit status, just in case whomever called us failed * to do so - it can only be done once, so we are protected * from overwriting it */ ORTE_UPDATE_EXIT_STATUS(exit_code); /* tell the plm to terminate the orteds - they will automatically * kill their local procs */ if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) { ORTE_ERROR_LOG(rc); } } static void failed_start(orte_job_t *jdata) { opal_list_item_t *item, *next; orte_odls_job_t *jobdat; orte_odls_child_t *child; orte_proc_t *proc; /* lookup the local jobdat for this job */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; /* is this the specified job? */ if (jobdat->jobid == jdata->jobid) { break; } } if (NULL == jobdat) { /* race condition - may not have been formed yet */ return; } jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; OPAL_THREAD_LOCK(&orte_odls_globals.mutex); for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = next) { next = opal_list_get_next(item); child = (orte_odls_child_t*)item; if (child->name->jobid == jobdat->jobid) { if (ORTE_PROC_STATE_LAUNCHED > child->state || ORTE_PROC_STATE_UNTERMINATED < child->state) { /* get the master proc object */ proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); proc->state = child->state; proc->exit_code = child->exit_code; /* update the counter so we can terminate */ jdata->num_terminated++; /* remove the child from our list */ opal_list_remove_item(&orte_local_children, &child->super); OBJ_RELEASE(child); jobdat->num_local_procs--; } } } opal_condition_signal(&orte_odls_globals.cond); OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp: job %s reported incomplete start", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); } static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, orte_proc_state_t state, orte_exit_code_t exit_code) { opal_list_item_t *item, *next; orte_odls_job_t *jobdat; orte_odls_child_t *child; orte_proc_t *proc; /* lookup the local jobdat for this job */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; /* is this the specified job? */ if (jobdat->jobid == jdata->jobid) { break; } } if (NULL == jobdat) { /* race condition - may not have been formed yet */ return; } jobdat->state = jobstate; jdata->state = jobstate; OPAL_THREAD_LOCK(&orte_odls_globals.mutex); for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = next) { next = opal_list_get_next(item); child = (orte_odls_child_t*)item; if (jdata->jobid == child->name->jobid) { child->state = state; proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); proc->state = state; if (proc->exit_code < exit_code) { proc->exit_code = exit_code; } if (ORTE_PROC_STATE_UNTERMINATED < state) { opal_list_remove_item(&orte_local_children, &child->super); OBJ_RELEASE(child); jdata->num_terminated++; jobdat->num_local_procs--; } else if (ORTE_PROC_STATE_RUNNING) { jdata->num_launched++; } else if (ORTE_PROC_STATE_REGISTERED == state) { jdata->num_reported++; if (jdata->dyn_spawn_active && jdata->num_reported == jdata->num_procs) { OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, &jdata->dyn_spawn_cond, &jdata->dyn_spawn_active); } } } } opal_condition_signal(&orte_odls_globals.cond); OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); } void orte_errmgr_hnp_update_proc(orte_job_t *jdata, orte_process_name_t *proc, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code) { opal_list_item_t *item, *next; orte_odls_child_t *child; orte_proc_t *proct; orte_odls_job_t *jobdat, *jdat; int i; jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jdat = (orte_odls_job_t*)item; if (jdat->jobid == jdata->jobid) { jobdat = jdat; break; } } if (NULL == jobdat) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); } OPAL_THREAD_LOCK(&orte_odls_globals.mutex); /*** UPDATE LOCAL CHILD ***/ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = next) { next = opal_list_get_next(item); child = (orte_odls_child_t*)item; if (child->name->jobid == proc->jobid) { if (child->name->vpid == proc->vpid) { child->state = state; if (0 < pid) { child->pid = pid; } child->exit_code = exit_code; proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); proct->state = state; if (0 < pid) { proct->pid = pid; } proct->exit_code = exit_code; if (ORTE_PROC_STATE_UNTERMINATED < state) { if (!jdata->enable_recovery) { opal_list_remove_item(&orte_local_children, &child->super); OBJ_RELEASE(child); if (NULL != jobdat) { jobdat->num_local_procs--; } } jdata->num_terminated++; } else if (ORTE_PROC_STATE_RUNNING == state) { jdata->num_launched++; if (jdata->num_launched == jdata->num_procs) { jdata->state = ORTE_JOB_STATE_RUNNING; } } else if (ORTE_PROC_STATE_REGISTERED == state) { jdata->num_reported++; if (jdata->dyn_spawn_active && jdata->num_reported == jdata->num_procs) { OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, &jdata->dyn_spawn_cond, &jdata->dyn_spawn_active); } } return; } } } opal_condition_signal(&orte_odls_globals.cond); OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); /*** UPDATE REMOTE CHILD ***/ for (i=0; i < jdata->procs->size; i++) { if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { continue; } if (proct->name.jobid != proc->jobid || proct->name.vpid != proc->vpid) { continue; } proct->state = state; if (0 < pid) { proct->pid = pid; } proct->exit_code = exit_code; if (ORTE_PROC_STATE_REGISTERED == state) { jdata->num_reported++; if (jdata->dyn_spawn_active && jdata->num_reported == jdata->num_procs) { OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, &jdata->dyn_spawn_cond, &jdata->dyn_spawn_active); } } else if (ORTE_PROC_STATE_UNTERMINATED < state) { /* update the counter so we can terminate */ jdata->num_terminated++; } else if (ORTE_PROC_STATE_RUNNING == state) { jdata->num_launched++; if (jdata->num_launched == jdata->num_procs) { jdata->state = ORTE_JOB_STATE_RUNNING; } } return; } } static void check_job_complete(orte_job_t *jdata) { orte_proc_t *proc; int i; orte_std_cntr_t j; orte_job_t *job; orte_node_t *node; orte_job_map_t *map; orte_std_cntr_t index; bool one_still_alive; orte_vpid_t non_zero=0, lowest=0; char *msg; #if 0 /* Check if FileM is active. If so then keep processing. */ OPAL_ACQUIRE_THREAD(&orte_filem_base_lock, &orte_filem_base_cond, &orte_filem_base_is_active); #endif if (NULL == jdata) { /* just check to see if the daemons are complete */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_complete - received NULL job, checking daemons", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto CHECK_DAEMONS; } for (i=0; i < jdata->procs->size && !jdata->abort; i++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { /* the proc array may no longer be left justified, so * we need to check everything */ continue; } if (0 != proc->exit_code) { non_zero++; if (0 == lowest) { lowest = proc->exit_code; } } switch (proc->state) { case ORTE_PROC_STATE_KILLED_BY_CMD: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed proc %s killed by cmd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name))); /* we ordered this proc to die, so it isn't an abnormal termination * and we don't flag it as such - just check the remaining jobs to * see if anyone is still alive */ if (jdata->num_terminated >= jdata->num_procs) { /* this job has terminated - now we need to check to see if ALL * the other jobs have also completed and wakeup if that is true */ if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD; } } goto CHECK_ALIVE; break; case ORTE_PROC_STATE_ABORTED: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed proc %s aborted", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name))); if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_ABORTED; /* point to the lowest rank to cause the problem */ jdata->aborted_proc = proc; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(proc); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } break; case ORTE_PROC_STATE_FAILED_TO_START: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr_hnp:check_job_completed proc %s failed to start", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name))); if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_FAILED_TO_START; /* point to the lowest rank to cause the problem */ jdata->aborted_proc = proc; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(proc); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } break; case ORTE_PROC_STATE_ABORTED_BY_SIG: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed proc %s aborted by signal", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name))); if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; /* point to the lowest rank to cause the problem */ jdata->aborted_proc = proc; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(proc); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } break; case ORTE_PROC_STATE_TERM_WO_SYNC: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed proc %s terminated without sync", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name))); if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; /* point to the lowest rank to cause the problem */ jdata->aborted_proc = proc; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(proc); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); /* now treat a special case - if the proc exit'd without a required * sync, it may have done so with a zero exit code. We want to ensure * that the user realizes there was an error, so in this -one- case, * we overwrite the process' exit code with the default error code */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); } break; case ORTE_PROC_STATE_COMM_FAILED: if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_COMM_FAILED; /* point to the lowest rank to cause the problem */ jdata->aborted_proc = proc; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(proc); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } break; case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED; /* point to the lowest rank to cause the problem */ jdata->aborted_proc = proc; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(proc); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } break; case ORTE_PROC_STATE_CALLED_ABORT: if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_CALLED_ABORT; /* point to the first proc to cause the problem */ jdata->aborted_proc = proc; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(proc); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } break; case ORTE_PROC_STATE_HEARTBEAT_FAILED: if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; /* point to the lowest rank to cause the problem */ jdata->aborted_proc = proc; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(proc); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } break; case ORTE_PROC_STATE_TERM_NON_ZERO: ORTE_UPDATE_EXIT_STATUS(proc->exit_code); if (orte_abort_non_zero_exit) { if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM; /* point to the lowest rank to cause the problem */ jdata->aborted_proc = proc; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(proc); jdata->abort = true; } } break; default: if (ORTE_PROC_STATE_UNTERMINATED < proc->state && jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed proc %s terminated and continuous", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name))); if (!jdata->abort) { proc->state = ORTE_PROC_STATE_ABORTED; jdata->state = ORTE_JOB_STATE_ABORTED; /* point to the lowest rank to cause the problem */ jdata->aborted_proc = proc; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(proc); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } } break; } } if (jdata->abort) { /* the job aborted - turn off any sensors on this job */ orte_sensor.stop(jdata->jobid); } if (ORTE_JOB_STATE_UNTERMINATED > jdata->state && jdata->num_terminated >= jdata->num_procs) { /* this job has terminated */ jdata->state = ORTE_JOB_STATE_TERMINATED; /* turn off any sensor monitors on this job */ orte_sensor.stop(jdata->jobid); if (0 < non_zero) { if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { /* update the exit code */ ORTE_UPDATE_EXIT_STATUS(lowest); } /* warn user */ opal_output(orte_clean_output, "-------------------------------------------------------\n" "While %s job %s terminated normally, %s %s. Further examination may be required.\n" "-------------------------------------------------------", (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child", (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(non_zero), (1 == non_zero) ? "process returned\na non-zero exit code." : "processes returned\nnon-zero exit codes."); } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); } /* if this job is a continuously operating one, then don't do * anything further - just return here */ if (NULL != jdata && (ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls || ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls)) { goto CHECK_ALIVE; } /* if the job that is being checked is the HNP, then we are * trying to terminate the orteds. In that situation, we * do -not- check all jobs - we simply notify the HNP * that the orteds are complete. Also check special case * if jdata is NULL - we want * to definitely declare the job done if the orteds * have completed, no matter what else may be happening. * This can happen if a ctrl-c hits in the "wrong" place * while launching */ CHECK_DAEMONS: if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { if (0 == orte_routed.num_routes()) { /* orteds are done! */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s orteds complete - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (NULL == jdata) { jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); } jdata->state = ORTE_JOB_STATE_TERMINATED; orte_quit(); return; } return; } /* Release the resources used by this job. Since some errmgrs may want * to continue using resources allocated to the job as part of their * fault recovery procedure, we only do this once the job is "complete". * Note that an aborted/killed job -is- flagged as complete and will * therefore have its resources released. We need to do this after * we call the errmgr so that any attempt to restart the job will * avoid doing so in the exact same place as the current job */ if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { map = jdata->map; for (index = 0; index < map->nodes->size; index++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { continue; } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s releasing procs from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); for (i = 0; i < node->procs->size; i++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (proc->name.jobid != jdata->jobid) { /* skip procs from another job */ continue; } node->slots_inuse--; node->num_procs--; OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s releasing proc %s from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), node->name)); /* set the entry in the node array to NULL */ opal_pointer_array_set_item(node->procs, i, NULL); /* release the proc once for the map entry */ OBJ_RELEASE(proc); } } OBJ_RELEASE(map); jdata->map = NULL; } CHECK_ALIVE: /* now check to see if all jobs are done - release this jdata * object when we find it */ one_still_alive = false; for (j=1; j < orte_job_data->size; j++) { if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) { /* since we are releasing jdata objects as we * go, we can no longer assume that the job_data * array is left justified */ continue; } /* if this is the job we are checking AND it normally terminated, * then go ahead and release it. We cannot release it if it * abnormally terminated as mpirun needs the info so it can * report appropriately to the user * * NOTE: do not release the primary job (j=1) so we * can pretty-print completion message */ if (NULL != jdata && job->jobid == jdata->jobid && (jdata->state == ORTE_JOB_STATE_TERMINATED || jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) { /* release this object, ensuring that the * pointer array internal accounting * is maintained! */ if (1 < j) { opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ OBJ_RELEASE(jdata); } continue; } /* if the job is flagged to not be monitored, skip it */ if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & job->controls) { continue; } /* when checking for job termination, we must be sure to NOT check * our own job as it - rather obviously - has NOT terminated! */ if (job->num_terminated < job->num_procs) { /* we have at least one job that is not done yet - we cannot * just return, though, as we need to ensure we cleanout the * job data for the job that just completed */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed job %s is not terminated (%d:%d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job->jobid), job->num_terminated, job->num_procs)); one_still_alive = true; } else { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed job %s is terminated (%d vs %d [%s])", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job->jobid), job->num_terminated, job->num_procs, (NULL == jdata) ? "UNKNOWN" : orte_job_state_to_str(jdata->state) )); } } /* if a job is still alive, we just return */ if (one_still_alive) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed at least one job is not terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return; } /* if we get here, then all jobs are done, so terminate */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed all jobs terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* set the exit status to 0 - this will only happen if it * wasn't already set by an error condition */ ORTE_UPDATE_EXIT_STATUS(0); /* provide a notifier message if that framework is active - ignored otherwise */ if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1))) { if (NULL == job->name) { job->name = strdup(orte_process_info.nodename); } if (NULL == job->instance) { asprintf(&job->instance, "%d", orte_process_info.pid); } if (0 == orte_exit_status) { asprintf(&msg, "Job %s:%s complete", job->name, job->instance); orte_notifier.log(ORTE_NOTIFIER_INFO, 0, msg); } else { asprintf(&msg, "Job %s:%s terminated abnormally", job->name, job->instance); orte_notifier.log(ORTE_NOTIFIER_ALERT, orte_exit_status, msg); } free(msg); /* this job object will be release during finalize */ } orte_jobs_complete(); /* if I am the only daemon alive, then I can exit now */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s orteds complete - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); orte_quit(); } } static void killprocs(orte_jobid_t job, orte_vpid_t vpid) { opal_pointer_array_t cmd; orte_proc_t proc; int rc; /* stop local sensors for this job */ if (ORTE_VPID_WILDCARD == vpid) { orte_sensor.stop(job); } if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) { if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { ORTE_ERROR_LOG(rc); } return; } OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); OBJ_CONSTRUCT(&proc, orte_proc_t); proc.name.jobid = job; proc.name.vpid = vpid; ORTE_EPOCH_SET(proc.name.epoch,orte_ess.proc_get_epoch(&(proc.name))); opal_pointer_array_add(&cmd, &proc); if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { ORTE_ERROR_LOG(rc); } OBJ_DESTRUCT(&cmd); OBJ_DESTRUCT(&proc); } static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, orte_proc_state_t state, orte_exit_code_t exit_code) { orte_job_t *jdat; orte_proc_t *pdata, *pdt, *pdt2; orte_node_t *node, *nd; orte_app_context_t *app; char *app_name; int rc, i, n; OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s CHECKING ON RELOCATE FOR APP %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* get the proc_t object for this process */ pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (NULL == pdata) { opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc)); return ORTE_ERR_NOT_FOUND; } /* set the state */ pdata->state = state; /* retain the node id */ node = pdata->node; /* if it is a daemon that died, we need to flag all of its procs * to be relocated */ if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { /* remove this proc from the daemon job */ orte_errmgr_hnp_record_dead_process(proc); /* check to see if any other nodes are "alive" */ if (!orte_hnp_is_allocated && jdata->num_procs == 1) { return ORTE_ERR_FATAL; } app_name = "orted"; /* scan the procs looking for each unique jobid on the node */ for (i=0; i < node->procs->size; i++) { if (NULL == (pdt = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } /* get the job data object for this process */ if (NULL == (jdat = orte_get_job_data_object(pdt->name.jobid))) { /* major problem */ ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); continue; } /* since the node was used in this job's map, release * it so that accounting is maintained */ OBJ_RELEASE(node); /* mark this proc as dead so it will be restarted */ pdt->state = ORTE_PROC_STATE_ABORTED; /* remove this proc from the node */ OBJ_RELEASE(pdt); /* maintains accounting */ opal_pointer_array_set_item(node->procs, i, NULL); /* maintain accounting on num procs alive in case this can't restart */ jdat->num_terminated++; /* look for all other procs on this node from the same job */ for (n=0; n < node->procs->size; n++) { if (NULL == (pdt2 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) { continue; } if (pdt2->name.jobid == pdt->name.jobid) { /* mark this proc as having aborted */ pdt2->state = ORTE_PROC_STATE_ABORTED; /* remove it from the node */ OBJ_RELEASE(pdt2); opal_pointer_array_set_item(node->procs, n, NULL); /* maintain accounting on num procs alive */ jdat->num_terminated++; } } /* and remove the node from the map */ for (n=0; n < jdat->map->nodes->size; n++) { if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(jdat->map->nodes, n))) { continue; } if (nd->index == node->index) { opal_pointer_array_set_item(jdat->map->nodes, n, NULL); OBJ_RELEASE(node); /* maintain accounting */ break; } } /* reset the job params for this job */ orte_plm_base_reset_job(jdat); /* relaunch the job */ opal_output(0, "%s RELOCATING APPS FOR JOB %s FROM NODE %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdat->jobid), node->name); if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdat))) { opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc)); return rc; } } return ORTE_SUCCESS; } /* otherwise, we are an app - try to relocate us to another node */ app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx); if (NULL == app) { /* no way to restart this job */ orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:cannot-relocate", true, ORTE_NAME_PRINT(proc)); return ORTE_ERR_NOT_FOUND; } app_name = app->app; /* track that we are attempting to restart */ pdata->restarts++; /* have we exceeded the number of restarts for this proc? */ if (app->max_restarts < pdata->restarts) { return ORTE_ERR_RESTART_LIMIT_EXCEEDED; } /* reset the job params for restart */ orte_plm_base_reset_job(jdata); /* flag the current node as not-to-be-used */ pdata->node->state = ORTE_NODE_STATE_DO_NOT_USE; /* restart the job - the spawn function will remap and * launch the replacement proc(s) */ OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s RELOCATING APP %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) { opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc)); return rc; } return ORTE_SUCCESS; } static orte_odls_child_t* proc_is_local(orte_process_name_t *proc) { orte_odls_child_t *child; opal_list_item_t *item; child = NULL; for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; if (child->name->jobid == proc->jobid && child->name->vpid == proc->vpid) { return child; } } return NULL; } #if ORTE_RESIL_ORTE static void cbfunc(int status, orte_process_name_t *peer, opal_buffer_t *buffer, orte_rml_tag_t tag, void* cbdata) { OBJ_RELEASE(buffer); } #endif int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) { orte_job_t *jdat; orte_proc_t *pdat, *proc_item; int i; opal_pointer_array_t *dead_names; OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s RECORDING DEAD PROCESS %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (NULL == (jdat = orte_get_job_data_object(proc->jobid))) { opal_output(0, "Can't find job object"); return ORTE_ERR_NOT_FOUND; } if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) && ORTE_PROC_STATE_TERMINATED > pdat->state) { #if ORTE_ENABLE_EPOCH /* Make sure that the epochs match. */ if (proc->epoch != pdat->name.epoch) { opal_output(1, "The epoch does not match the current epoch. Throwing the request out."); return ORTE_SUCCESS; } #endif dead_names = OBJ_NEW(opal_pointer_array_t); if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { opal_pointer_array_add(dead_names, &(pdat->name)); for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { if (NULL == (proc_item = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { continue; } opal_pointer_array_add(dead_names, &(proc_item->name)); } } #if ORTE_RESIL_ORTE if (!mca_errmgr_hnp_component.term_in_progress) { /* * Send a message to the other daemons so they know that a daemon has * died. */ int rc, num_failed = opal_pointer_array_get_size(dead_names); opal_buffer_t* buffer = OBJ_NEW(opal_buffer_t); orte_process_name_t *proc_name; if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buffer); } else { /* Iterate over the list of dead procs and send them along with * the rest. The HNP needs this info so it can tell the other * ORTEDs and they can inform the appropriate applications. */ for (i = 0; i < num_failed; i++) { if (NULL != (proc_name = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc_name, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buffer); } } } OBJ_RELEASE(dead_names); OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s SENDING DEAD PROCESS MESSAGE TO HNP", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_FAILURE_NOTICE, 0, cbfunc, NULL); } } else { orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); } #else orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); #endif } return ORTE_SUCCESS; } int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs) { int i; orte_process_name_t *name_item; orte_job_t *jdat; orte_proc_t *pdat; orte_node_t *node; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "HNP %s marking procs as dead", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* Iterate over the list of processes */ for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) { if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) { opal_output(1, "NULL found in dead process list."); continue; } if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s Job data not found.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return ORTE_ERR_NOT_FOUND; } if (NULL != (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid)) && pdat->state < ORTE_PROC_STATE_TERMINATED) { OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "HNP %s marking %s as dead", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pdat->name))); #if ORTE_RESIL_ORTE /* Make sure the epochs match, if not it probably means that we * already reported this failure. */ if (name_item->epoch != pdat->name.epoch) { continue; } orte_util_set_epoch(name_item, name_item->epoch + 1); #endif /* Remove it from the job array */ opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL); orte_process_info.num_procs--; jdat->num_procs--; /* Check if this is an ORTED */ if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { /* Mark the node as down so it won't be used in mapping anymore. */ node = pdat->node; node->state = ORTE_NODE_STATE_DOWN; node->daemon = NULL; } OBJ_RELEASE(pdat); #if ORTE_RESIL_ORTE /* Create a new proc object that will keep track of the epoch * information */ pdat = OBJ_NEW(orte_proc_t); pdat->name.jobid = jdat->jobid; pdat->name.vpid = name_item->vpid; pdat->name.epoch = name_item->epoch + 1; opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); jdat->num_procs++; jdat->num_terminated++; #endif /* Set the state as terminated so we'll know the process isn't * actually there. */ pdat->state = ORTE_PROC_STATE_TERMINATED; } else { #if ORTE_RESIL_ORTE opal_output(0, "Proc data not found for %s", ORTE_NAME_PRINT(name_item)); /* Create a new proc object that will keep track of the epoch * information */ pdat = OBJ_NEW(orte_proc_t); pdat->name.jobid = jdat->jobid; pdat->name.vpid = name_item->vpid; pdat->name.epoch = name_item->epoch + 1; /* Set the state as terminated so we'll know the process isn't * actually there. */ pdat->state = ORTE_PROC_STATE_TERMINATED; opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); jdat->num_procs++; jdat->num_terminated++; #endif } check_job_complete(jdat); } #if ORTE_RESIL_ORTE if (!mca_errmgr_hnp_component.term_in_progress) { /* Need to update the orted routing module. */ orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); if (NULL != fault_cbfunc) { (*fault_cbfunc)(dead_procs); } } #endif return ORTE_SUCCESS; } #if ORTE_RESIL_ORTE int send_to_local_applications(opal_pointer_array_t *dead_names) { opal_buffer_t *buf; int ret = ORTE_SUCCESS; orte_process_name_t *name_item; int size, i; OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, "%s Sending failure to local applications.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); buf = OBJ_NEW(opal_buffer_t); size = opal_pointer_array_get_size(dead_names); if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buf); return ret; } for (i = 0; i < size; i++) { if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buf); return ret; } } } if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buf); return ret; } OBJ_RELEASE(buf); return ret; } #endif