1
1
openmpi/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c
Wesley Bland 4e7ff0bd5e By popular demand the epoch code is now disabled by default.
To enable the epochs and the resilient orte code, use the configure flag:

--enable-resilient-orte

This will define both:

ORTE_ENABLE_EPOCH
ORTE_RESIL_ORTE

This commit was SVN r25093.
2011-08-26 22:16:14 +00:00

1518 строки
54 KiB
C

/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/show_help.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/basename.h"
#include "opal/util/argv.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/orte_globals.h"
#include "opal/dss/dss.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_hnp.h"
#include MCA_timer_IMPLEMENTATION_HEADER
#if OPAL_ENABLE_FT_CR
/************************************
* Locally Global vars & functions :)
************************************/
/*
 * Job selected by the most recent predicted_fault() request. Both values are
 * reset (ORTE_JOBID_INVALID / NULL) by module init and module finalize.
 */
static orte_jobid_t current_global_jobid = ORTE_JOBID_INVALID;
static orte_job_t *current_global_jobdata = NULL;
/* Set on entry to errmgr_crmig_global_migrate() and cleared at its cleanup label. */
static bool migrating_underway = false;
/* Set by check_if_terminated() once every migrating proc has the KILLED_BY_CMD bit in its state. */
static bool migrating_terminated = false;
/* Set by check_if_restarted() once every migrating proc has the RUNNING bit in its state. */
static bool migrating_restarted = false;
/*
 * 'onto' mapping suggestions taken from the caller of the migration:
 *  - general:   entries whose proc_name.vpid is ORTE_VPID_INVALID
 *  - exclusive: entries that name a specific process
 * Both lists are re-created at the start of every migration.
 */
static opal_list_t *current_onto_mapping_general = NULL;
static opal_list_t *current_onto_mapping_exclusive = NULL;
/*** Command Line Interactions */
/* Migration status tracked by this file; updates are pushed out with orte_errmgr_base_migrate_update(). */
static int current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;
/* Drives one complete migration: select procs, checkpoint, terminate, respawn, reconnect. */
static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_map);
/* Dispatches a reported fault to the daemon or application handler based on the proc's jobid. */
static int orte_errmgr_hnp_crmig_global_process_fault(orte_job_t *jdata,
orte_process_name_t *proc_name,
orte_proc_state_t state);
/* Fault handler for application processes. */
static void errmgr_crmig_process_fault_app(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state);
/* Fault handler for daemon processes. */
static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state);
/* Returns true when a proc with the same vpid is already in 'migrating_procs'. */
static bool check_if_duplicate_proc(orte_proc_t *proc, opal_pointer_array_t *migrating_procs);
/* Polling helpers used by the wait loops in errmgr_crmig_global_migrate(). */
static int check_if_terminated(opal_pointer_array_t *migrating_procs);
static int check_if_restarted(opal_pointer_array_t *migrating_procs);
/* Final pass over the migration list (pre-mapping); defined outside this part of the file. */
static int check_and_pre_map(opal_list_t *off_procs,
opal_list_t *off_nodes,
orte_snapc_base_quiesce_t *cur_datum);
/* Prints the migration request before it is processed; defined outside this part of the file. */
static void display_request(opal_list_t *off_procs,
opal_list_t *off_nodes,
orte_snapc_base_quiesce_t *cur_datum);
/*
 * Timer stuff
 */
static void errmgr_crmig_set_time(int idx);
static void errmgr_crmig_display_all_timers(void);
static void errmgr_crmig_clear_timers(void);
static double errmgr_crmig_get_time(void);
static void errmgr_crmig_display_indv_timer_core(double diff, char *str);
/*
 * Presumably one timestamp slot per ERRMGR_CRMIG_TIMER_* index below.
 * NOTE(review): the array is sized by OPAL_CR_TIMER_MAX, not by
 * ERRMGR_CRMIG_TIMER_MAX -- confirm OPAL_CR_TIMER_MAX >= ERRMGR_CRMIG_TIMER_MAX.
 */
static double timer_start[OPAL_CR_TIMER_MAX];
/*
 * Timer indices, passed to ERRMGR_CRMIG_SET_TIMER() at successive phases of
 * errmgr_crmig_global_migrate(): entry, setup done, checkpoint done,
 * termination done, re-setup done, restart done, finish.
 */
#define ERRMGR_CRMIG_TIMER_START 0
#define ERRMGR_CRMIG_TIMER_SETUP 1
#define ERRMGR_CRMIG_TIMER_CKPT 2
#define ERRMGR_CRMIG_TIMER_TERM 3
#define ERRMGR_CRMIG_TIMER_RESETUP 4
#define ERRMGR_CRMIG_TIMER_RESTART 5
#define ERRMGR_CRMIG_TIMER_FINISH 6
#define ERRMGR_CRMIG_TIMER_MAX 7
/*
 * The three macros below forward to the matching timer function only when
 * mca_errmgr_hnp_component.crmig_timing_enabled is greater than zero.
 * NOTE(review): each expands to a bare { ... } block rather than
 * do { ... } while (0), so "MACRO();" directly before an 'else' in an
 * unbraced if would not parse. Keep call sites inside braces.
 */
#define ERRMGR_CRMIG_CLEAR_TIMERS() \
{ \
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \
errmgr_crmig_clear_timers(); \
} \
}
#define ERRMGR_CRMIG_SET_TIMER(idx) \
{ \
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \
errmgr_crmig_set_time(idx); \
} \
}
#define ERRMGR_CRMIG_DISPLAY_ALL_TIMERS() \
{ \
if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \
errmgr_crmig_display_all_timers(); \
} \
}
/************************
* Function Definitions: Global
************************/
/*
 * Initialize the HNP-side migration support.
 *
 * Clears the "current job" bookkeeping and the in-progress flag, opens the
 * connection to the orte-migrate tool, and clears the timers (when timing is
 * enabled).
 *
 * Returns ORTE_SUCCESS, or the error reported by orte_errmgr_base_tool_init().
 */
int orte_errmgr_hnp_crmig_global_module_init(void)
{
    int rc;

    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp(crmig): init()");

    /* No job selected and no migration running yet */
    current_global_jobdata = NULL;
    current_global_jobid   = ORTE_JOBID_INVALID;
    migrating_underway     = false;

    /* Bring up the channel used by the orte-migrate tool */
    rc = orte_errmgr_base_tool_init();
    if( ORTE_SUCCESS != rc ) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    ERRMGR_CRMIG_CLEAR_TIMERS();

    return ORTE_SUCCESS;
}
/*
 * Shut down the HNP-side migration support.
 *
 * Closes the connection to the orte-migrate tool first; only when that
 * succeeds are the in-progress flag, the "current job" bookkeeping and the
 * timers reset.
 *
 * Returns ORTE_SUCCESS, or the error reported by orte_errmgr_base_tool_finalize().
 */
int orte_errmgr_hnp_crmig_global_module_finalize(void)
{
    int rc;

    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp(crmig): finalize()");

    /* Tear down the channel used by the orte-migrate tool */
    rc = orte_errmgr_base_tool_finalize();
    if( ORTE_SUCCESS != rc ) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* Forget the job we were tracking */
    current_global_jobdata = NULL;
    current_global_jobid   = ORTE_JOBID_INVALID;
    migrating_underway     = false;

    ERRMGR_CRMIG_CLEAR_TIMERS();

    return ORTE_SUCCESS;
}
/*
 * Handle a predicted-fault (migration) request.
 *
 * Selects the first job in orte_job_data that is not our own, marks it
 * recoverable, reports the REQUEST state, runs the migration and finally
 * reports the NONE state again so that another request can be accepted.
 *
 * proc_list      processes to move off their nodes
 * node_list      nodes to evacuate
 * suggested_map  target mapping suggestions
 *
 * Returns ORTE_SUCCESS, ORTE_ERROR when no application job can be found, or
 * the error reported by the status update / migration.
 */
int orte_errmgr_hnp_crmig_global_predicted_fault(opal_list_t *proc_list,
                                                 opal_list_t *node_list,
                                                 opal_list_t *suggested_map)
{
    int ret, exit_status = ORTE_SUCCESS;
    orte_job_t *jdata = NULL;
    orte_job_t *target_job = NULL;
    int i;

    /*
     * JJH: RETURN HERE
     * If we are already migrating, then reject this request
     * (not implemented yet: the request is currently processed anyway)
     */
    if( migrating_underway ) {
        ;
    }

    /*
     * Determine the jobid for this migration
     * JJH: Assumes only one job active at any one time
     *
     * Search into a local variable: the file-scope 'current_global_jobdata'
     * keeps its value between requests, so testing it directly would let a
     * pointer left over from an earlier request pass the "not found" check.
     */
    for(i = 0; i < orte_job_data->size; ++i ) {
        jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i);
        if( NULL == jdata ) {
            continue;
        }
        /* Exclude ourselves */
        if( jdata->jobid == ORTE_PROC_MY_NAME->jobid ) {
            continue;
        }
        target_job = jdata;
        break;
    }

    if( NULL == target_job ) {
        opal_output(0, "errmgr:hnp(crmig):predicted_fault(): Global) Error: Cannot find the jdata for the current job.");
        ORTE_ERROR_LOG(ORTE_ERROR);
        return ORTE_ERROR;
    }

    /* Publish the selection only once we know it is valid */
    current_global_jobdata = target_job;
    current_global_jobid   = target_job->jobid;
    current_global_jobdata->controls |= ORTE_JOB_CONTROL_RECOVERABLE;

    current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_REQUEST;
    if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /*************************
     * Kick off the migration
     *************************/
    if( ORTE_SUCCESS != (ret = errmgr_crmig_global_migrate(proc_list, node_list, suggested_map)) ) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /************************
     * Set up the Command Line listener again
     *************************/
    if( ORTE_ERRMGR_MIGRATE_STATE_ERROR != current_migration_status ) {
        if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_NONE)) ) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }
        opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrated_job", true);
    }
    current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;

 cleanup:
    return exit_status;
}
/*
 * Migration-aware hook for process/job state updates.
 *
 * job        job the update refers to (used when proc_name is NULL)
 * jobstate   reported job state (only printed here)
 * proc_name  process the update refers to, or NULL
 * state      reported process state
 * pid        unused here
 * exit_code  reported exit code
 *
 * Does nothing while ORTE is terminating or when the update comes from a
 * tool. ABORTED_BY_SIG / COMM_FAILED are forwarded to the fault handler;
 * KILLED_BY_CMD during a migration is recorded locally and the lower level
 * is told to ignore the update.
 *
 * Returns ORTE_SUCCESS, ORTE_ERROR when the job cannot be found, or the
 * error reported by the fault handler.
 */
int orte_errmgr_hnp_crmig_global_update_state(orte_jobid_t job,
                                              orte_job_state_t jobstate,
                                              orte_process_name_t *proc_name,
                                              orte_proc_state_t state,
                                              pid_t pid,
                                              orte_exit_code_t exit_code)
{
    orte_job_t *job_obj = NULL;
    int rc = ORTE_SUCCESS;

    /* If ORTE is trying to shut down, stay out of the way */
    if( mca_errmgr_hnp_component.term_in_progress ) {
        return ORTE_SUCCESS;
    }

    /* Prefer the jobid carried by the process name; fall back to 'job' */
    job_obj = orte_get_job_data_object( (NULL != proc_name) ? proc_name->jobid : job );
    if( NULL == job_obj ) {
        opal_output(0, "%s errmgr:hnp(crmig):update_state() Error: Cannot find job %s for Process %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_JOBID_PRINT(job),
                    (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) );
        rc = ORTE_ERROR;
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* A job without apps that is not in our own job family is a tool: ignore it */
    if( 0 == job_obj->num_apps ) {
        if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) {
            OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                                 "%s errmgr:hnp(crmig): An external tool disconnected. Ignore...",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            return ORTE_SUCCESS;
        }
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:hnp(crmig): job %s reported state %s"
                         " for proc %s state %s exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         orte_job_state_to_str(jobstate),
                         (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
                         orte_proc_state_to_str(state), exit_code));

    /* Real faults go to the fault handler */
    if( ORTE_PROC_STATE_ABORTED_BY_SIG == state ||
        ORTE_PROC_STATE_COMM_FAILED    == state ) {
        rc = orte_errmgr_hnp_crmig_global_process_fault(job_obj, proc_name, state);
        if( ORTE_SUCCESS != rc ) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        return ORTE_SUCCESS;
    }

    /*
     * A kill we issued ourselves while migrating must be masked so that the
     * lower level does not terminate the job because of it.
     */
    if( ORTE_PROC_STATE_KILLED_BY_CMD == state && migrating_underway ) {
        mca_errmgr_hnp_component.ignore_current_update = true;
        orte_errmgr_hnp_update_proc(job_obj, proc_name, state, 0, exit_code);
    }

    return ORTE_SUCCESS;
}
/*
 * Prune 'node_list' down to the nodes onto which 'proc' may be re-mapped
 * during a migration.
 *
 * proc       process being re-mapped
 * oldnode    node the process is currently on
 * node_list  candidate nodes; entries that are not acceptable are removed
 *            from the list and released
 *
 * Order of precedence:
 *   1. an exclusive mapping for this process (pre-mapped node, "off current
 *      node", "same node as peer process", explicit node name)
 *   2. otherwise drop the old node when more than one candidate exists
 *   3. then use the general suggestions, if any, as an include list
 *
 * Nothing is changed when no migration is under way.
 *
 * Every pruning loop fetches the successor *before* an entry is removed:
 * opal_list_remove_item() + OBJ_RELEASE() may invalidate (or NULL) the item,
 * so it must not be used to advance the iteration afterwards.
 *
 * Always returns ORTE_SUCCESS.
 */
int orte_errmgr_hnp_crmig_global_suggest_map_targets(orte_proc_t *proc,
                                                     orte_node_t *oldnode,
                                                     opal_list_t *node_list)
{
    int exit_status = ORTE_SUCCESS;
    opal_list_item_t *item = NULL, *next_item = NULL, *m_item = NULL;
    orte_errmgr_predicted_map_t *onto_map = NULL, *current_proc_map = NULL;
    orte_node_t *node = NULL;
    bool found = false;
    int num_suggested = 0;
    orte_std_cntr_t i_proc;
    orte_proc_t *peer_proc = NULL, *candidate = NULL;

    /*
     * If not migrating, then suggest nothing
     */
    if( !migrating_underway ) {
        return ORTE_SUCCESS;
    }

    /*
     * First look for an exclusive mapping for this process
     */
    for(item  = opal_list_get_first(current_onto_mapping_exclusive);
        item != opal_list_get_end(current_onto_mapping_exclusive);
        item  = opal_list_get_next(item) ) {
        onto_map = (orte_errmgr_predicted_map_t*) item;
        if( onto_map->proc_name.vpid == proc->name.vpid ) {
            current_proc_map = onto_map;
            break;
        }
    }

    /*
     * If there is an exclusive mapping then...
     */
    if( NULL != current_proc_map ) {
        /*
         * If we made an exclusive mapping during the check_and_pre_map()
         * then honor it here.
         */
        if( NULL != current_proc_map->pre_map_fixed_node ) {
            for( item  = opal_list_get_first(node_list);
                 item != opal_list_get_end(node_list);
                 item  = next_item ) {
                next_item = opal_list_get_next(item);
                node = (orte_node_t*)item;

                /*
                 * Exclude all other nodes. Keep scanning after a match so
                 * that the nodes following it are pruned as well.
                 */
                if( 0 != strncmp(node->name, current_proc_map->pre_map_fixed_node,
                                 strlen(current_proc_map->pre_map_fixed_node)) ) {
                    opal_list_remove_item(node_list, item);
                    OBJ_RELEASE(item);
                    continue;
                }

                OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                                     "errmgr:hnp(crmig):suggest() ------- Fixed use of node [%15s : %10s -> %10s (%10s)] -------",
                                     ORTE_NAME_PRINT(&proc->name), oldnode->name,
                                     current_proc_map->pre_map_fixed_node, node->name));
            }

            /* All done with mapping */
            exit_status = ORTE_SUCCESS;
            goto cleanup;
        }

        /*
         * If 'off_current_node' then exclude current node
         */
        if( current_proc_map->off_current_node ) {
            OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                                 "errmgr:hnp(crmig):suggest() ------- Remove old node (info) [%15s : %10s] -------",
                                 ORTE_NAME_PRINT(&proc->name), oldnode->name));
            for( item  = opal_list_get_first(node_list);
                 item != opal_list_get_end(node_list);
                 item  = opal_list_get_next(item) ) {
                node = (orte_node_t*)item;

                /* Exclude the old node; leaving the loop right away keeps this safe */
                if( node == oldnode ) {
                    opal_list_remove_item(node_list, item);
                    OBJ_RELEASE(item);
                    break;
                }
            }
        }

        /*
         * If 'map_proc_name' then map to the node where this process resides
         * Note: Only do this if there was no 'other' node suggested. If there
         *       was an 'other' node suggested then we need to honor that before
         *       we honor the peer suggestion.
         */
        if( ORTE_VPID_INVALID != current_proc_map->map_proc_name.vpid &&
            current_proc_map->proc_name.vpid != current_proc_map->map_proc_name.vpid &&
            NULL == current_proc_map->map_node_name ) {
            /*
             * Find the node containing the target process
             */
            peer_proc = NULL;
            for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) {
                candidate = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc);
                if( NULL == candidate ) {
                    continue;
                }
                if( candidate->name.vpid == current_proc_map->map_proc_name.vpid ) {
                    peer_proc = candidate;
                    break;
                }
            }

            if( NULL != peer_proc ) {
                current_proc_map->map_node_name = strdup(peer_proc->node->name);

                OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                                     "errmgr:hnp(crmig):suggest() ------- Force use of node with proc [%15s -> %15s: %10s -> %10s] -------",
                                     ORTE_NAME_PRINT(&proc->name), ORTE_NAME_PRINT(&peer_proc->name),
                                     oldnode->name, current_proc_map->map_node_name));
            } else {
                /* No such peer: fall through and map as if non-exclusive */
                OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                                     "errmgr:hnp(crmig):suggest() ------- Peer process (vpid %d) not found for [%15s : %10s] -------",
                                     (int)current_proc_map->map_proc_name.vpid,
                                     ORTE_NAME_PRINT(&proc->name), oldnode->name));
            }
        }

        /*
         * If 'map_node_name' then use this node exclusively
         */
        if( NULL != current_proc_map->map_node_name ) {
            for( item  = opal_list_get_first(node_list);
                 item != opal_list_get_end(node_list);
                 item  = next_item ) {
                next_item = opal_list_get_next(item);
                node = (orte_node_t*)item;

                /* Exclude all nodes not in the include list */
                if( 0 != strncmp(node->name, current_proc_map->map_node_name,
                                 strlen(current_proc_map->map_node_name)) ) {
                    opal_list_remove_item(node_list, item);
                    OBJ_RELEASE(item);
                    continue;
                }

                OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                                     "errmgr:hnp(crmig):suggest() ------- Force use of node [%15s : %10s -> %10s (%10s)] -------",
                                     ORTE_NAME_PRINT(&proc->name), oldnode->name,
                                     current_proc_map->map_node_name, node->name));
            }

            /* All done with mapping */
            exit_status = ORTE_SUCCESS;
            goto cleanup;
        }

        /*
         * Otherwise then map as if there was no exclusive mapping
         */
        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "errmgr:hnp(crmig):suggest() ------- Suggesting as if non-exclusive [%15s : 0x%x : %10s] -------",
                             ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name));
    }
    /*
     * If no exclusive mapping (or exclusive did not yield any results) then...
     */
    else {
        /*
         * Remove the old node from the list, if there are more than 1 nodes available
         */
        if( 1 < opal_list_get_size(node_list) ) {
            OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                                 "errmgr:hnp(crmig):suggest() ------- Remove old node [%15s : %10s] -------",
                                 ORTE_NAME_PRINT(&proc->name), oldnode->name));
            for( item  = opal_list_get_first(node_list);
                 item != opal_list_get_end(node_list);
                 item  = opal_list_get_next(item) ) {
                node = (orte_node_t*)item;

                /* Exclude the old node; leaving the loop right away keeps this safe */
                if( node == oldnode ) {
                    opal_list_remove_item(node_list, item);
                    OBJ_RELEASE(item);
                    break;
                }
            }
        }
    }

    /*
     * If we do not have any general suggestions, then just return
     */
    if( opal_list_get_size(current_onto_mapping_general) <= 0 ) {
        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "errmgr:hnp(crmig):suggest() ------- No suggestions for target [%15s : 0x%x : %10s] -------",
                             ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name));
        exit_status = ORTE_SUCCESS;
        goto cleanup;
    }

    /*
     * Otherwise look through the general suggestions as an include list
     */
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "errmgr:hnp(crmig):suggest() ------- Suggest a target for [%15s : 0x%x : %10s] -------",
                         ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name));

    num_suggested = 0;
    for( item  = opal_list_get_first(node_list);
         item != opal_list_get_end(node_list);
         item  = next_item ) {
        next_item = opal_list_get_next(item);
        node = (orte_node_t*)item;

        /* Exclude all nodes not in the include list */
        found = false;
        for(m_item  = opal_list_get_first(current_onto_mapping_general);
            m_item != opal_list_get_end(current_onto_mapping_general);
            m_item  = opal_list_get_next(m_item) ) {
            onto_map = (orte_errmgr_predicted_map_t*) m_item;

            /* An entry without a node name cannot match anything */
            if( NULL != onto_map->map_node_name &&
                0 == strncmp(node->name, onto_map->map_node_name, strlen(onto_map->map_node_name)) ) {
                found = true;
                break;
            }
        }

        if( !found ) {
            opal_list_remove_item(node_list, item);
            OBJ_RELEASE(item);
            continue;
        }

        ++num_suggested;
        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "errmgr:hnp(crmig):suggest() ------- Suggesting target %2d [%15s : 0x%x : %10s -> %10s] -------",
                             num_suggested, ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name, node->name));
    }

 cleanup:
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "errmgr:hnp(crmig):suggest() ------- Suggested %2d nodes for [%15s : 0x%x : %10s] -------",
                         (int)opal_list_get_size(node_list), ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name));

    return exit_status;
}
/*
 * Fault-tolerance event hook for the migration code.
 *
 * state  event being reported; not used here
 *
 * Nothing needs to be done for any event, so this always returns ORTE_SUCCESS.
 */
int orte_errmgr_hnp_crmig_global_ft_event(int state)
{
    (void)state;  /* intentionally unused */

    return ORTE_SUCCESS;
}
/************************
* Function Definitions: Static
************************/
/*
 * Route a reported fault to the daemon or the application fault handler.
 *
 * jdata      job the failed process belongs to
 * proc_name  failed process; must not be NULL
 * state      reported process state
 *
 * A process whose jobid equals our own jobid is handled by the daemon
 * handler; everything else by the application handler.
 *
 * Returns ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM when proc_name is NULL.
 */
static int orte_errmgr_hnp_crmig_global_process_fault(orte_job_t *jdata,
                                                      orte_process_name_t *proc_name,
                                                      orte_proc_state_t state)
{
    /*
     * JJH: Todo
     * The expected logic here is:
     * if( a daemon with children fails ) {
     *   abort migration.
     * }
     * if( a daemon without children fails ) {
     *   continue. No processes lost
     * }
     * if( an application process fails ) {
     *   abort migration. Might be a bad checkpoint, or a process that we were
     *   not migrating that died.
     * }
     * else {
     *   continue;
     * }
     */

    /*
     * The caller accepts job-level updates that carry no process name, so
     * do not assume one is present before looking at its jobid.
     */
    if( NULL == proc_name ) {
        return ORTE_ERR_BAD_PARAM;
    }

    if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) {
        errmgr_crmig_process_fault_daemon(jdata, proc_name, state);
    } else {
        errmgr_crmig_process_fault_app(jdata, proc_name, state);
    }

    return ORTE_SUCCESS;
}
static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_maps)
{
int ret, exit_status = ORTE_SUCCESS;
orte_std_cntr_t i_node;
orte_std_cntr_t i_proc;
orte_node_t *node = NULL;
orte_proc_t *proc = NULL;
bool found = false;
orte_snapc_base_quiesce_t *cur_datum = NULL;
bool close_iof_stdin = false;
orte_process_name_t iof_name = {ORTE_JOBID_INVALID, 0};
char * err_str_procs = NULL;
char * err_str_nodes = NULL;
char * tmp_str = NULL;
orte_errmgr_predicted_proc_t *off_proc = NULL;
orte_errmgr_predicted_node_t *off_node = NULL;
orte_errmgr_predicted_map_t *onto_map = NULL;
opal_list_item_t *item = NULL;
ERRMGR_CRMIG_CLEAR_TIMERS();
ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_START);
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Migrating (%3d, %3d, %3d) -------",
(int)opal_list_get_size(off_procs),
(int)opal_list_get_size(off_nodes),
(int)opal_list_get_size(onto_maps)));
/*
* Modeled after orte_plm_base_reset_job
*/
cur_datum = OBJ_NEW(orte_snapc_base_quiesce_t);
cur_datum->migrating = true;
migrating_underway = true;
mca_errmgr_hnp_component.crmig_in_progress = true;
current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_RUNNING;
if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
/*
* Check to make sure that the 'off' and 'onto' nodes exist
* - if 'onto' nodes do not, then add them (JJH XXX)
* - if 'off' nodes do not, then return an error (JJH XXX)
* JJH TODO...
*/
/*
* Copy over the onto_nodes so we can suggest them later
*/
if( NULL != current_onto_mapping_general ) {
OBJ_RELEASE(current_onto_mapping_general);
current_onto_mapping_general = NULL;
}
if( NULL != current_onto_mapping_exclusive ) {
OBJ_RELEASE(current_onto_mapping_exclusive);
current_onto_mapping_exclusive = NULL;
}
current_onto_mapping_general = OBJ_NEW(opal_list_t);
current_onto_mapping_exclusive = OBJ_NEW(opal_list_t);
if( NULL != onto_maps ) {
while( NULL != (item = opal_list_remove_first(onto_maps)) ) {
onto_map = (orte_errmgr_predicted_map_t*) item;
/* Determine if process exclude mapping, or general */
if( onto_map->proc_name.vpid == ORTE_VPID_INVALID ) {
opal_list_append(current_onto_mapping_general, item);
} else {
opal_list_append(current_onto_mapping_exclusive, item);
}
}
}
for(item = opal_list_get_first(current_onto_mapping_exclusive);
item != opal_list_get_end(current_onto_mapping_exclusive);
item = opal_list_get_next(item) ) {
onto_map = (orte_errmgr_predicted_map_t*) item;
/*
* Find the node currently containing this process
*/
found = false;
for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) {
proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc);
if( NULL == proc ) {
continue;
}
if( proc->name.vpid == onto_map->proc_name.vpid) {
found = true;
break;
}
}
/*
* Check to see if this process hsould be skipped
*/
if( !onto_map->off_current_node &&
(ORTE_VPID_INVALID == onto_map->map_proc_name.vpid ||
onto_map->proc_name.vpid == onto_map->map_proc_name.vpid ) &&
(NULL == onto_map->map_node_name ||
0 == strncmp(onto_map->map_node_name, proc->node->name, strlen(proc->node->name))) ) {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Process %15s does not wish to move -------",
ORTE_NAME_PRINT(&proc->name)));
} else {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Process %15s will be moved -------",
ORTE_NAME_PRINT(&proc->name)));
/*
* Set the process to restarting
*/
proc->state = ORTE_PROC_STATE_MIGRATING;
opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc);
OBJ_RETAIN(proc);
(cur_datum->num_migrating)++;
if( current_global_jobdata->stdin_target == proc->name.vpid ) {
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch);
}
}
}
migrating_terminated = false;
migrating_restarted = false;
/*
* Create a list of processes to migrate, if 'off_nodes' specified
*/
for(item = opal_list_get_first(off_nodes);
item != opal_list_get_end(off_nodes);
item = opal_list_get_next(item) ) {
off_node = (orte_errmgr_predicted_node_t*)item;
/*
* Find the node in the job structure
* - Make sure that 'odin00' doesn't match all 'odin00*'
*/
found = false;
for(i_node = 0; i_node < opal_pointer_array_get_size(current_global_jobdata->map->nodes); ++i_node) {
node = (orte_node_t*)opal_pointer_array_get_item(current_global_jobdata->map->nodes, i_node);
if( NULL == node ) {
continue;
}
if( 0 == strncmp(node->name, off_node->node_name, strlen(off_node->node_name)) ) {
found = true;
break;
}
}
if( !found ) {
; /* Warn about invalid node */
} else {
/*
* Add all processes from this node
*/
for(i_proc = 0; i_proc < opal_pointer_array_get_size(node->procs); ++i_proc) {
proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i_proc);
if( NULL == proc ) {
continue;
}
/*
* Set the process to restarting
*/
proc->state = ORTE_PROC_STATE_MIGRATING;
opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc);
OBJ_RETAIN(proc);
(cur_datum->num_migrating)++;
if( current_global_jobdata->stdin_target == proc->name.vpid ) {
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch);
}
}
}
}
/*
* Create a list of processes to migrate, if 'off_procs' specified
*/
for(item = opal_list_get_first(off_procs);
item != opal_list_get_end(off_procs);
item = opal_list_get_next(item) ) {
off_proc = (orte_errmgr_predicted_proc_t*)item;
/*
* Find the process in the job structure
*/
found = false;
for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) {
proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc);
if( NULL == proc ) {
continue;
}
if( proc->name.vpid == off_proc->proc_name.vpid) {
found = true;
break;
}
}
/*
* Make sure the process is not listed multiple times
*/
if( found ) {
found = check_if_duplicate_proc(proc, &(cur_datum->migrating_procs));
if( !found ) {
/*
* Set the process to restarting
*/
proc->state = ORTE_PROC_STATE_MIGRATING;
opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc);
OBJ_RETAIN(proc);
(cur_datum->num_migrating)++;
if( current_global_jobdata->stdin_target == proc->name.vpid ) {
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch);
}
}
}
}
/*
* If we did not find any processes to migrate, then throw a warning, and skip it.
*/
if( 0 >= cur_datum->num_migrating ) {
for(item = opal_list_get_first(off_nodes);
item != opal_list_get_end(off_nodes);
item = opal_list_get_next(item) ) {
off_node = (orte_errmgr_predicted_node_t*)item;
if( NULL != err_str_nodes ) {
asprintf(&tmp_str, "%s, %s", err_str_nodes, off_node->node_name);
free(err_str_nodes);
err_str_nodes = strdup(tmp_str);
free(tmp_str);
tmp_str = NULL;
} else {
asprintf(&err_str_nodes, "%s", off_node->node_name);
}
}
for(item = opal_list_get_first(off_procs);
item != opal_list_get_end(off_procs);
item = opal_list_get_next(item) ) {
off_proc = (orte_errmgr_predicted_proc_t*)item;
if( NULL != err_str_procs ) {
asprintf(&tmp_str, "%s, %d", err_str_procs, (int)off_proc->proc_name.vpid);
free(err_str_procs);
err_str_procs = strdup(tmp_str);
free(tmp_str);
tmp_str = NULL;
} else {
asprintf(&err_str_procs, "%d", off_proc->proc_name.vpid);
}
}
opal_show_help("help-orte-errmgr-hnp.txt", "crmig_no_migrating_procs", true,
err_str_nodes,
err_str_procs);
current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_ERROR;
if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
goto cleanup;
}
/*
* Final pass on the migration list to pre-map processes and remove
* processes that should not be migrated.
*/
if( ORTE_SUCCESS != (ret = check_and_pre_map(off_procs, off_nodes, cur_datum)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
/*
* Display the request before processing it.
*/
display_request(off_procs, off_nodes, cur_datum);
ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_SETUP);
/*
* Checkpoint the job
* - Hold all non-migrating processes
* - Abort the marked processes
* -
*/
current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_RUN_CKPT;
if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Starting the checkpoint of job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
if( ORTE_SUCCESS != (ret = orte_snapc.start_ckpt(cur_datum)) ) {
opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to start the checkpoint.");
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_CKPT);
/*
* Terminate the migrating processes
*/
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Terminate old processes in job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
orte_plm.terminate_procs(&cur_datum->migrating_procs);
/*
* Clear the IOF stdin target if necessary
*/
if( close_iof_stdin ) {
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Closing old STDIN target for job %s (%s)-------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid),
ORTE_NAME_PRINT(&iof_name) ));
orte_iof.close(&iof_name, ORTE_IOF_STDIN);
}
/*
* Wait for the processes to finish terminating
*/
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Waiting for termination -------");
while( !migrating_terminated ) {
opal_progress();
check_if_terminated(&(cur_datum->migrating_procs));
}
ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_TERM);
/*
* Start remapping the processes
*/
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Checkpoint finished, setting up job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_STARTUP;
if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
/*
* Reset the job parameters for restart
* This will set the state of the job to 'restart'
*/
orte_plm_base_reset_job(current_global_jobdata);
/*
* Adjust the application context information
*/
for(i_proc = 0; i_proc < opal_pointer_array_get_size(&(cur_datum->migrating_procs)); ++i_proc) {
proc = (orte_proc_t*)opal_pointer_array_get_item(&(cur_datum->migrating_procs), i_proc);
if( NULL == proc ) {
continue;
}
if( ORTE_SUCCESS != (ret = orte_errmgr_base_update_app_context_for_cr_recovery(current_global_jobdata,
proc,
&(cur_datum->ss_snapshot->local_snapshots))) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
"\tAdjusted: \"%s\" [0x%d] [%s]\n",
ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name));
}
ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_RESETUP);
/*
* Restart the job
* - spawn function will remap and launch the replacement proc(s)
*/
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Respawning migrating processes in job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
orte_plm.spawn(current_global_jobdata);
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Waiting for restart -------");
migrating_restarted = false;
while( !migrating_restarted ) {
opal_progress();
check_if_restarted(&(cur_datum->migrating_procs));
}
ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_RESTART);
/*
* Finish the checkpoint
*/
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Reconnecting processes in job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
if( ORTE_SUCCESS != (ret = orte_snapc.end_ckpt(cur_datum)) ) {
opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to end the checkpoint.");
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
/*
* All done
*/
opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
"errmgr:hnp(crmig):migrate() ------- Finished migrating processes in job %s -------",
ORTE_JOBID_PRINT(current_global_jobdata->jobid));
OBJ_RELEASE(cur_datum);
current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_FINISH;
if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_FINISH);
ERRMGR_CRMIG_DISPLAY_ALL_TIMERS();
cleanup:
migrating_underway = false;
migrating_terminated = false;
migrating_restarted = false;
mca_errmgr_hnp_component.crmig_in_progress = false;
if( NULL != err_str_procs ) {
free(err_str_procs);
err_str_procs = NULL;
}
if( NULL != err_str_nodes ) {
free(err_str_nodes);
err_str_nodes = NULL;
}
return exit_status;
}
/*
 * Tell whether 'proc' is already on the migration list.
 *
 * proc             process to look for
 * migrating_procs  pointer array of orte_proc_t entries; empty slots are skipped
 *
 * Two processes are considered the same when their vpids are equal.
 * Returns true when a matching entry exists, false otherwise.
 */
static bool check_if_duplicate_proc(orte_proc_t *proc, opal_pointer_array_t *migrating_procs)
{
    orte_std_cntr_t idx;
    orte_proc_t *entry = NULL;

    for(idx = 0; idx < opal_pointer_array_get_size(migrating_procs); ++idx) {
        entry = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, idx);
        if( NULL != entry && entry->name.vpid == proc->name.vpid ) {
            return true;
        }
    }

    return false;
}
/*
 * Poll the migration list for completed termination.
 *
 * migrating_procs  pointer array of orte_proc_t entries; empty slots are skipped
 *
 * Sets the file-scope flag 'migrating_terminated' when every entry has the
 * ORTE_PROC_STATE_KILLED_BY_CMD bit set in its state (also when the list has
 * no entries). Otherwise the first entry still lacking the bit is reported
 * in the verbose output and the flag is left untouched.
 *
 * Always returns ORTE_SUCCESS.
 */
static int check_if_terminated(opal_pointer_array_t *migrating_procs)
{
    orte_std_cntr_t idx;
    orte_proc_t *cur = NULL;
    orte_proc_t *pending = NULL;   /* first process that has not been killed yet */

    for(idx = 0; idx < opal_pointer_array_get_size(migrating_procs); ++idx) {
        cur = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, idx);
        if( NULL == cur ) {
            continue;
        }
        if( 0 == (ORTE_PROC_STATE_KILLED_BY_CMD & cur->state) ) {
            pending = cur;
            break;
        }
    }

    if( NULL == pending ) {
        migrating_terminated = true;
    } else {
        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "\t Still waiting for termination: \"%s\" [0x%x] != [0x%x]\n",
                             ORTE_NAME_PRINT(&pending->name), pending->state, ORTE_PROC_STATE_KILLED_BY_CMD));
    }

    return ORTE_SUCCESS;
}
/*
 * Check whether every process in 'migrating_procs' has the
 * ORTE_PROC_STATE_RUNNING bit set in its state.
 *
 * When all non-NULL entries carry that bit, the file-level flag
 * 'migrating_restarted' is set to true. Otherwise the first process
 * still lacking the bit is reported at verbosity level 10 and the flag
 * is left untouched.
 *
 * Always returns ORTE_SUCCESS.
 */
static int check_if_restarted(opal_pointer_array_t *migrating_procs)
{
    orte_std_cntr_t slot;
    orte_proc_t *cur = NULL;
    orte_proc_t *pending = NULL;   /* first process not yet running, if any */

    for(slot = 0; slot < opal_pointer_array_get_size(migrating_procs); ++slot) {
        cur = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, slot);
        /* The test is on the RUNNING bit, not on ORTE_PROC_STATE_LAUNCHED */
        if( NULL != cur && 0 == (ORTE_PROC_STATE_RUNNING & cur->state) ) {
            pending = cur;
            break;
        }
    }

    if( NULL == pending ) {
        migrating_restarted = true;
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "\tStill waiting for restart: \"%s\" [0x%x] != [0x%x]\n",
                         ORTE_NAME_PRINT(&pending->name), pending->state, ORTE_PROC_STATE_RUNNING));

    return ORTE_SUCCESS;
}
/*
 * Handle a fault reported for an application process.
 *
 * This function only logs the event (process name, reported state, and
 * whether a migration is currently underway) at verbosity level 10.
 * 'jdata' is not used.
 */
static void errmgr_crmig_process_fault_app(orte_job_t *jdata,
                                           orte_process_name_t *proc,
                                           orte_proc_state_t state)
{
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "errmgr:hnp(crmig):process_fault_app() ------- Application fault reported! proc %s (0x%x) - %s",
                         ORTE_NAME_PRINT(proc),
                         state,
                         (migrating_underway ? "Migrating" : "Not Migrating") ));
}
/*
 * Handle a fault reported for a daemon.
 *
 * The event is logged at verbosity level 10. When the reported state is
 * ORTE_PROC_STATE_COMM_FAILED a second message notes that the failure is
 * being ignored. No route removal or other recovery is performed here;
 * the function only logs. 'jdata' is not used.
 */
static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata,
                                              orte_process_name_t *proc,
                                              orte_proc_state_t state)
{
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "errmgr:hnp(crmig):process_fault_daemon() ------- Daemon fault reported! proc %s (0x%x) - %s",
                         ORTE_NAME_PRINT(proc),
                         state,
                         (migrating_underway ? "Migrating" : "Not Migrating") ));

    /* Anything other than a communication failure needs no further note */
    if( ORTE_PROC_STATE_COMM_FAILED != state ) {
        return;
    }

    /*
     * Failed communication can be ignored for the most part.
     * JJH: Check to make sure this is not a new daemon loss.
     */
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "errmgr:hnp(crmig):process_fault_daemon() ------- Daemon fault reported! proc %s (0x%x) - Communication failure, keep going",
                         ORTE_NAME_PRINT(proc),
                         state ));
}
/*
 * Validate a migration request and pre-compute process placement.
 *
 * NOTE: this is currently a placeholder. None of the arguments are
 * inspected and the function unconditionally returns ORTE_SUCCESS. The
 * comments in the body list the checks that remain to be implemented.
 */
static int check_and_pre_map(opal_list_t *off_procs,
opal_list_t *off_nodes,
orte_snapc_base_quiesce_t *cur_datum)
{
/*
* TODO: Check the 'off_procs' list for processes that should not be migrated
*/
/*
* TODO: Check the 'current_onto_mapping_exclusive' for processes that are moving
* 'near/with' other processes that are also moving. Be sure to watch out
* for circular deadlock.
*/
/*
* TODO: Use the 'pre_map_fixed_node' structure to fix this process' mapping.
*/
return ORTE_SUCCESS;
}
/*
 * Display the details of a migration request.
 *
 * At verbosity level 10 this prints, in order:
 *   - every process in 'off_procs', looked up by vpid in
 *     current_global_jobdata->procs,
 *   - every node in 'off_nodes' that matches a node in the job map,
 *     followed by the processes on that node,
 *   - the entries of current_onto_mapping_general and
 *     current_onto_mapping_exclusive,
 *   - every process in cur_datum->migrating_procs.
 *
 * Finally a summary line per migrating process is accumulated and handed
 * to opal_show_help() under the "crmig_migrating_job" topic.
 *
 * A requested process that cannot be found in the job is reported and
 * skipped rather than dereferenced.
 */
static void display_request(opal_list_t *off_procs,
                            opal_list_t *off_nodes,
                            orte_snapc_base_quiesce_t *cur_datum)
{
    orte_std_cntr_t i_node;
    orte_std_cntr_t i_proc;
    orte_node_t *node = NULL;
    orte_proc_t *proc = NULL;
    bool found = false;
    char * status_str = NULL;
    char * prev_str = NULL;
    orte_errmgr_predicted_proc_t *off_proc = NULL;
    orte_errmgr_predicted_node_t *off_node = NULL;
    orte_errmgr_predicted_map_t  *onto_map = NULL;
    opal_list_item_t *item = NULL;

    /*
     * Display all requested processes to migrate
     */
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "errmgr:hnp(crmig):migrate() Requested Processes to migrate: (%d procs)\n",
                         (int) opal_list_get_size(off_procs) ));
    for(item  = opal_list_get_first(off_procs);
        item != opal_list_get_end(off_procs);
        item  = opal_list_get_next(item) ) {
        off_proc = (orte_errmgr_predicted_proc_t*)item;

        /*
         * Find the process in the job structure
         */
        found = false;
        for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) {
            proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc);
            if( NULL == proc ) {
                continue;
            }
            if( proc->name.vpid == off_proc->proc_name.vpid ) {
                found = true;
                break;
            }
        }

        /*
         * Without a match 'proc' is either NULL or refers to an unrelated
         * process, so it must not be used.
         */
        if( !found ) {
            OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                                 "\t(Rank %3d) was not found in the job\n",
                                 (int)off_proc->proc_name.vpid));
            continue;
        }

        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "\t%s (Rank %3d) on node %s\n",
                             ORTE_NAME_PRINT(&proc->name), (int)off_proc->proc_name.vpid, proc->node->name));
    }

    /*
     * Display Off Nodes
     */
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "errmgr:hnp(crmig):migrate() Requested Nodes to migration: (%d nodes)\n",
                         (int)opal_list_get_size(off_nodes) ));
    for(item  = opal_list_get_first(off_nodes);
        item != opal_list_get_end(off_nodes);
        item  = opal_list_get_next(item) ) {
        off_node = (orte_errmgr_predicted_node_t*)item;

        /*
         * Reset before searching so that a result from an earlier search
         * cannot leak through when the node array has no usable entry.
         */
        found = false;
        for(i_node = 0; i_node < opal_pointer_array_get_size(current_global_jobdata->map->nodes); ++i_node) {
            node = (orte_node_t*)opal_pointer_array_get_item(current_global_jobdata->map->nodes, i_node);
            if( NULL == node ) {
                continue;
            }
            /* Matches when the requested name is a prefix of the node name */
            if( 0 == strncmp(node->name, off_node->node_name, strlen(off_node->node_name)) ) {
                found = true;
                break;
            }
        }

        if( !found ) {
            continue;
        }

        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "\t\"%s\" \t%d\n",
                             node->name, node->num_procs));
        for(i_proc = 0; i_proc < opal_pointer_array_get_size(node->procs); ++i_proc) {
            proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i_proc);
            if( NULL == proc ) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                                 "\t\t\"%s\" [0x%x]\n",
                                 ORTE_NAME_PRINT(&proc->name), proc->state));
        }
    }

    /*
     * Suggested onto nodes
     */
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "errmgr:hnp(crmig):migrate() Suggested nodes to migration onto: (%d nodes)\n",
                         (int)opal_list_get_size(current_onto_mapping_general) ));
    for(item  = opal_list_get_first(current_onto_mapping_general);
        item != opal_list_get_end(current_onto_mapping_general);
        item  = opal_list_get_next(item) ) {
        onto_map = (orte_errmgr_predicted_map_t*) item;
        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "\t\"%s\"\n",
                             onto_map->map_node_name));
    }

    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "errmgr:hnp(crmig):migrate() Suggested nodes to migration onto (exclusive): (%d nodes)\n",
                         (int)opal_list_get_size(current_onto_mapping_exclusive) ));
    for(item  = opal_list_get_first(current_onto_mapping_exclusive);
        item != opal_list_get_end(current_onto_mapping_exclusive);
        item  = opal_list_get_next(item) ) {
        onto_map = (orte_errmgr_predicted_map_t*) item;
        /* vpid is cast to int to match the %d conversion, as done above */
        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "\t%d\t(%c)\t\"%s\"\n",
                             (int)onto_map->proc_name.vpid,
                             (onto_map->off_current_node ? 'T' : 'F'),
                             onto_map->map_node_name));
    }

    /*
     * Display all processes scheduled to migrate
     */
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "errmgr:hnp(crmig):migrate() All Migrating Processes: (%d procs)\n",
                         cur_datum->num_migrating));
    for(i_proc = 0; i_proc < opal_pointer_array_get_size(&(cur_datum->migrating_procs)); ++i_proc) {
        proc = (orte_proc_t*)opal_pointer_array_get_item(&(cur_datum->migrating_procs), i_proc);
        if( NULL == proc ) {
            continue;
        }

        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "\t\"%s\" [0x%x] [%s]\n",
                             ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name));

        if( NULL == status_str ) {
            if( 0 > asprintf(&status_str, "\t%s Rank %d on Node %s\n",
                             ORTE_NAME_PRINT(&proc->name),
                             (int)proc->name.vpid,
                             proc->node->name) ) {
                /* The pointer is unspecified after a failed asprintf */
                status_str = NULL;
            }
        } else {
            /*
             * Append by formatting the previous text plus the new line
             * into a fresh buffer, then release the previous buffer.
             */
            prev_str = status_str;
            status_str = NULL;
            if( 0 > asprintf(&status_str, "%s\t%s Rank %d on Node %s\n",
                             prev_str,
                             ORTE_NAME_PRINT(&proc->name),
                             (int)proc->name.vpid,
                             proc->node->name) ) {
                /* Keep what was accumulated so far */
                status_str = prev_str;
            } else {
                free(prev_str);
            }
            prev_str = NULL;
        }
    }

    /* Never hand a NULL pointer to a %s conversion */
    opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrating_job", true,
                   (NULL != status_str ? status_str : ""));

    if( NULL != status_str ) {
        free(status_str);
        status_str = NULL;
    }

    return;
}
/************************
* Timing
************************/
/*
 * Record the current time in timer slot 'idx'.
 *
 * The slot is only written when its current value is <= 0.0, so the
 * first recorded time is kept until the slot is cleared.
 * An 'idx' outside [0, ERRMGR_CRMIG_TIMER_MAX) is ignored.
 */
static void errmgr_crmig_set_time(int idx)
{
    /* Reject both ends of the range; a negative index would also be out of bounds */
    if( idx < 0 || idx >= ERRMGR_CRMIG_TIMER_MAX ) {
        return;
    }

    if( timer_start[idx] <= 0.0 ) {
        timer_start[idx] = errmgr_crmig_get_time();
    }
}
/*
 * Print a timing summary for the migration.
 *
 * Each phase is reported as the difference between two consecutive
 * timer slots, bracketed by "Summary Begin" / "Summary End" lines.
 *
 * The labels live in a local table instead of being strdup'ed per
 * phase: that removes six heap allocations whose result was never
 * checked before being printed.
 */
static void errmgr_crmig_display_all_timers(void)
{
    /* Writable label storage, because the display helper takes a 'char *' */
    struct {
        char label[16];
        int  from_idx;
        int  to_idx;
    } phases[] = {
        { "Setup",          ERRMGR_CRMIG_TIMER_START,   ERRMGR_CRMIG_TIMER_SETUP   },
        { "Checkpoint",     ERRMGR_CRMIG_TIMER_SETUP,   ERRMGR_CRMIG_TIMER_CKPT    },
        { "Terminate",      ERRMGR_CRMIG_TIMER_CKPT,    ERRMGR_CRMIG_TIMER_TERM    },
        { "Setup Relaunch", ERRMGR_CRMIG_TIMER_TERM,    ERRMGR_CRMIG_TIMER_RESETUP },
        { "Restart",        ERRMGR_CRMIG_TIMER_RESETUP, ERRMGR_CRMIG_TIMER_RESTART },
        { "Finalize",       ERRMGR_CRMIG_TIMER_RESTART, ERRMGR_CRMIG_TIMER_FINISH  }
    };
    const int num_phases = (int)(sizeof(phases) / sizeof(phases[0]));
    double diff = 0.0;
    int i;

    opal_output(0, "Process Migration Timing: ******************** Summary Begin\n");

    for(i = 0; i < num_phases; ++i) {
        diff = timer_start[phases[i].to_idx] - timer_start[phases[i].from_idx];
        errmgr_crmig_display_indv_timer_core(diff, phases[i].label);
    }

    opal_output(0, "Process Migration Timing: ******************** Summary End\n");
}
/*
 * Reset every slot of the timer_start array to 0.0.
 */
static void errmgr_crmig_clear_timers(void)
{
    int slot = ERRMGR_CRMIG_TIMER_MAX;

    while( slot-- > 0 ) {
        timer_start[slot] = 0.0;
    }
}
/*
 * Return the current time in seconds as a double.
 *
 * When OPAL_TIMER_USEC_NATIVE is set the value comes from
 * opal_timer_base_get_usec(); otherwise it is built from gettimeofday().
 * Both sources are converted from microseconds to fractional seconds.
 */
static double errmgr_crmig_get_time(void)
{
#if OPAL_TIMER_USEC_NATIVE
    return (double)opal_timer_base_get_usec() / 1000000.0;
#else
    struct timeval now;

    gettimeofday(&now, NULL);
    return (double)now.tv_sec + (double)now.tv_usec / 1000000.0;
#endif
}
/*
 * Print one line of the timing summary.
 *
 * diff : duration of the phase, in seconds
 * str  : label of the phase
 *
 * The line shows the label, the phase duration, the total elapsed time
 * (last timer slot minus the START slot) and the phase's share of that
 * total in percent. When the total is not positive the percentage is
 * reported as 0 instead of dividing by zero.
 */
static void errmgr_crmig_display_indv_timer_core(double diff, char *str)
{
    double total = timer_start[ERRMGR_CRMIG_TIMER_MAX-1] - timer_start[ERRMGR_CRMIG_TIMER_START];
    double perc  = 0.0;

    /* A zero or negative total has no meaningful percentage */
    if( total > 0.0 ) {
        perc = (diff / total) * 100.0;
    }

    opal_output(0,
                "errmgr_crmig: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n",
                str,
                diff,
                total,
                perc);
}
#endif /* OPAL_ENABLE_FT_CR */