/* * Copyright (c) 2004-2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2011 The Trustees of the University of Tennessee. * All rights reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Evergrid, Inc. All rights reserved. * * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "orte_config.h" #include #ifdef HAVE_UNISTD_H #include #endif /* HAVE_UNISTD_H */ #include #include "opal/include/opal/prefetch.h" #include "opal/util/output.h" #include "opal/util/opal_environ.h" #include "opal/util/basename.h" #include "opal/util/show_help.h" #include "orte/mca/mca.h" #include "opal/mca/base/base.h" #include "opal/mca/crs/crs.h" #include "opal/mca/crs/base/base.h" #include "orte/util/name_fns.h" #include "orte/util/proc_info.h" #include "orte/runtime/orte_globals.h" #include "opal/dss/dss.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" #include "orte/mca/rmaps/rmaps.h" #include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/plm/plm.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/runtime/orte_wait.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/snapc/snapc.h" #include "orte/mca/snapc/base/base.h" #include "snapc_full.h" #include MCA_timer_IMPLEMENTATION_HEADER /************************************ * Locally Global vars & functions :) ************************************/ #define INC_SEQ_NUM() \ { \ if(orte_snapc_base_store_only_one_seq) { \ orte_snapc_base_snapshot_seq_number = 0; \ } else { \ orte_snapc_base_snapshot_seq_number++; \ } \ } static orte_jobid_t current_global_jobid = ORTE_JOBID_INVALID; static orte_snapc_base_global_snapshot_t global_snapshot; static int current_total_orteds = 0; static bool updated_job_to_running; static int current_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE; static bool cleanup_on_establish = false; static bool global_coord_has_local_children = false; static bool currently_migrating = false; static opal_list_t *migrating_procs = NULL; static int global_init_job_structs(void); static int global_refresh_job_structs(void); static bool snapc_orted_recv_issued = false; static bool is_orte_checkpoint_connected = false; static bool is_app_checkpointable = false; static int snapc_full_global_start_listener(void); static int snapc_full_global_stop_listener(void); static void snapc_full_global_orted_recv(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata); static void snapc_full_process_restart_proc_info_cmd(orte_process_name_t* sender, opal_buffer_t* buffer); static void snapc_full_process_request_op_cmd(orte_process_name_t* sender, opal_buffer_t* buffer); /*** Command Line Interactions */ static orte_process_name_t orte_checkpoint_sender; static bool snapc_cmdline_recv_issued = false; static int snapc_full_global_start_cmdline_listener(void); static int snapc_full_global_stop_cmdline_listener(void); static void snapc_full_global_cmdline_recv(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata); static int snapc_full_establish_snapshot_dir(bool empty_metadata); /*** */ static int snapc_full_global_checkpoint(opal_crs_base_ckpt_options_t *options); static int snapc_full_global_notify_checkpoint(orte_jobid_t jobid, opal_crs_base_ckpt_options_t *options); static int orte_snapc_full_global_set_job_ckpt_info( orte_jobid_t jobid, int ckpt_state, orte_sstore_base_handle_t handle, bool quick, opal_crs_base_ckpt_options_t *options); int global_coord_job_state_update(orte_jobid_t jobid, int job_ckpt_state, orte_sstore_base_handle_t handle, opal_crs_base_ckpt_options_t *options); static void snapc_full_process_job_update_cmd(orte_process_name_t* sender, opal_buffer_t* buffer, bool quick); static int snapc_full_process_orted_update_cmd(orte_process_name_t* sender, opal_buffer_t* buffer, bool quick); static orte_snapc_full_orted_snapshot_t *find_orted_snapshot(orte_process_name_t *name ); static int snapc_full_global_get_min_state(void); static int write_out_global_metadata(void); static int orte_snapc_full_global_reset_coord(void); /* * Timer stuff */ static void snapc_full_set_time(int idx); static void snapc_full_display_all_timers(void); static void snapc_full_display_recovered_timers(void); static void snapc_full_clear_timers(void); static double snapc_full_get_time(void); static void snapc_full_display_indv_timer_core(double diff, char *str); #define SNAPC_FULL_TIMER_START 0 #define SNAPC_FULL_TIMER_RUNNING 1 #define SNAPC_FULL_TIMER_FIN_LOCAL 2 #define SNAPC_FULL_TIMER_SS_SYNC 3 #define SNAPC_FULL_TIMER_ESTABLISH 4 #define SNAPC_FULL_TIMER_RECOVERED 5 #define SNAPC_FULL_TIMER_MAX 6 static double timer_start[SNAPC_FULL_TIMER_MAX]; #define SNAPC_FULL_CLEAR_TIMERS() \ { \ if(OPAL_UNLIKELY(orte_snapc_full_timing_enabled)) { \ snapc_full_clear_timers(); \ } \ } #define SNAPC_FULL_SET_TIMER(idx) \ { \ if(OPAL_UNLIKELY(orte_snapc_full_timing_enabled)) { \ snapc_full_set_time(idx); \ } \ } #define SNAPC_FULL_DISPLAY_ALL_TIMERS() \ { \ if(OPAL_UNLIKELY(orte_snapc_full_timing_enabled)) { \ snapc_full_display_all_timers(); \ } \ } #define SNAPC_FULL_DISPLAY_RECOVERED_TIMER() \ { \ if(OPAL_UNLIKELY(orte_snapc_full_timing_enabled)) { \ snapc_full_display_recovered_timers(); \ } \ } /* * Progress */ static void snapc_full_report_progress(orte_snapc_full_orted_snapshot_t *orted_snapshot, int total, int min_state); static int report_progress_cur_loc_finished = 0; static double report_progress_last_reported_loc_finished = 0; #define SNAPC_FULL_REPORT_PROGRESS(orted, total, min_state) \ { \ if(OPAL_UNLIKELY(orte_snapc_full_progress_meter > 0)) { \ snapc_full_report_progress(orted, total, min_state); \ } \ } /************************ * Function Definitions ************************/ int global_coord_init(void) { current_global_jobid = ORTE_JOBID_INVALID; orte_snapc_base_snapshot_seq_number = -1; orte_checkpoint_sender = orte_name_invalid; SNAPC_FULL_CLEAR_TIMERS(); return ORTE_SUCCESS; } int global_coord_finalize(void) { current_global_jobid = ORTE_JOBID_INVALID; orte_snapc_base_snapshot_seq_number = -1; SNAPC_FULL_CLEAR_TIMERS(); return ORTE_SUCCESS; } int global_coord_setup_job(orte_jobid_t jobid) { int ret, exit_status = ORTE_SUCCESS; orte_job_t *jdata = NULL; /* * Only allow one job at a time. * * It is possible to pass through this function twice since HNP may also be * a local daemon. So it may be both a global and local coordinator. * Global: orte_plm_base_setup_job() * Local : odls_default_module.c */ /* Global Coordinator pass */ if( ORTE_JOBID_INVALID == current_global_jobid ) { current_global_jobid = jobid; OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Setup job %s as the Global Coordinator\n", ORTE_JOBID_PRINT(jobid))); SNAPC_FULL_CLEAR_TIMERS(); SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_START); } /* Local Coordinator pass - Always happens after global coordinator pass */ else if ( jobid == current_global_jobid ) { /* look up job data object */ if (NULL == (jdata = orte_get_job_data_object(current_global_jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Restarting Job %s...", ORTE_JOBID_PRINT(jobid))); SNAPC_FULL_CLEAR_TIMERS(); SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_START); if( ORTE_SUCCESS != (ret = global_refresh_job_structs()) ) { ORTE_ERROR_LOG(ret); return ret; } if( ORTE_SNAPC_LOCAL_COORD_TYPE == (orte_snapc_coord_type & ORTE_SNAPC_LOCAL_COORD_TYPE) ) { return local_coord_setup_job(jobid); } return ORTE_SUCCESS; } /* If there are no local children, do not become a local coordinator */ if( !global_coord_has_local_children ) { return ORTE_SUCCESS; } OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Setup job %s as the Local Coordinator\n", ORTE_JOBID_PRINT(jobid))); orte_snapc_coord_type |= ORTE_SNAPC_LOCAL_COORD_TYPE; return local_coord_setup_job(jobid); } /* Only allow one job at a time */ else { opal_output(mca_snapc_full_component.super.output_handle, "Global) Setup of job %s Failed! Already setup job %s\n", ORTE_JOBID_PRINT(jobid), ORTE_JOBID_PRINT(current_global_jobid)); ORTE_ERROR_LOG(ORTE_ERROR); return ORTE_ERROR; } /* * Start out with a sequence number just below the first * This will be incremented when we checkpoint */ orte_snapc_base_snapshot_seq_number = -1; /* * Allocate structure to track node status */ if( ORTE_SUCCESS != (ret = global_init_job_structs()) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Setup Global Coordinator command processing listener */ if( ORTE_SUCCESS != (ret = snapc_full_global_start_listener()) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Setup command line tool checkpoint request listener */ if( ORTE_SUCCESS != (ret = snapc_full_global_start_cmdline_listener()) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * If requested pre-establish the global snapshot directory */ #if 0 if(orte_snapc_base_establish_global_snapshot_dir) { opal_output(0, "Global) Error: Pre-establishment of snapshot directory currently not supported!"); ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED); OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Pre-establish the global snapshot directory\n")); if( ORTE_SUCCESS != (ret = snapc_full_establish_snapshot_dir(true))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } #endif OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Finished setup of job %s ", ORTE_JOBID_PRINT(jobid))); cleanup: return exit_status; } int global_coord_release_job(orte_jobid_t jobid) { int ret, exit_status = ORTE_SUCCESS; /* * Make sure we are not waiting on a checkpoint to complete */ if( is_orte_checkpoint_connected ) { if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender, global_snapshot.ss_handle, ORTE_SNAPC_CKPT_STATE_ERROR)) ) { ORTE_ERROR_LOG(ret); } } /* * Clean up listeners */ if( ORTE_SUCCESS != (ret = snapc_full_global_stop_cmdline_listener()) ) { ORTE_ERROR_LOG(ret); exit_status = ret; } if( ORTE_SUCCESS != (ret = snapc_full_global_stop_listener()) ) { ORTE_ERROR_LOG(ret); exit_status = ret; } OBJ_DESTRUCT(&global_snapshot); return exit_status; } int global_coord_start_ckpt(orte_snapc_base_quiesce_t *datum) { int ret, exit_status = ORTE_SUCCESS; orte_std_cntr_t i_proc; orte_proc_t *proc = NULL; orte_proc_t *new_proc = NULL; opal_list_item_t *item = NULL; opal_crs_base_ckpt_options_t *options = NULL; char *tmp_str = NULL; OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Starting checkpoint (internally requested)")); orte_checkpoint_sender = orte_name_invalid; /* * If migrating */ if( datum->migrating ) { currently_migrating = true; if( NULL != migrating_procs ) { while( NULL != (item = opal_list_remove_first(migrating_procs)) ) { proc = (orte_proc_t*)item; OBJ_RELEASE(proc); } } else { migrating_procs = OBJ_NEW(opal_list_t); } /* * Copy over the procs into a list */ for(i_proc = 0; i_proc < opal_pointer_array_get_size(&(datum->migrating_procs)); ++i_proc) { proc = (orte_proc_t*)opal_pointer_array_get_item(&(datum->migrating_procs), i_proc); if( NULL == proc ) { continue; } new_proc = OBJ_NEW(orte_proc_t); new_proc->name.jobid = proc->name.jobid; new_proc->name.vpid = proc->name.vpid; new_proc->node = OBJ_NEW(orte_node_t); new_proc->node->name = proc->node->name; opal_list_append(migrating_procs, &new_proc->super); OBJ_RETAIN(new_proc); } OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) SnapC Migrating Processes: (%d procs) [Updated]\n", (int)opal_list_get_size(migrating_procs) )); for (item = opal_list_get_first(migrating_procs); item != opal_list_get_end(migrating_procs); item = opal_list_get_next(item)) { new_proc = (orte_proc_t*)item; OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "\t\"%s\" [%s]\n", ORTE_NAME_PRINT(&new_proc->name),new_proc->node->name)); } } /************************* * Kick off the checkpoint (local coord will release the processes) *************************/ options = OBJ_NEW(opal_crs_base_ckpt_options_t); if( ORTE_SUCCESS != (ret = snapc_full_global_checkpoint(options) ) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Wait for checkpoint to locally finish on all nodes */ while(((currently_migrating && current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_MIGRATING) || (!currently_migrating && current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL)) && current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_ESTABLISHED && current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_RECOVERED && current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_ERROR && current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_NONE ) { opal_progress(); } /* * Update the quiesce structure with the handle */ datum->snapshot = OBJ_NEW(orte_snapc_base_global_snapshot_t); datum->ss_handle = global_snapshot.ss_handle; datum->ss_snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t); if( ORTE_SUCCESS != (ret = orte_sstore.request_global_snapshot_data(&(datum->ss_handle), datum->ss_snapshot)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* JJH Is the snapc structure useful with the sstore structure ??? */ orte_sstore.get_attr(global_snapshot.ss_handle, SSTORE_METADATA_GLOBAL_SNAP_SEQ, &tmp_str); datum->epoch = atoi(tmp_str); if( NULL != tmp_str ) { free(tmp_str); tmp_str = NULL; } cleanup: if( NULL != options ) { OBJ_RELEASE(options); options = NULL; } return exit_status; } int global_coord_end_ckpt(orte_snapc_base_quiesce_t *datum) { int ret, exit_status = ORTE_SUCCESS; opal_list_item_t* item = NULL; OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Finishing checkpoint (internally requested) [%3d]", current_job_ckpt_state)); if( currently_migrating ) { OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) End Ckpt: Flush the modex cached data\n")); /* TODO: You can't pass NULL as the identifier - what you'll need to do is * close all open dstore handles, and then open the ones you need */ #if 0 if (OPAL_SUCCESS != (ret = opal_dstore.remove(NULL, NULL))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } #endif SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_ESTABLISH); if( ORTE_SUCCESS != (ret = orte_snapc_full_global_set_job_ckpt_info(current_global_jobid, ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL, global_snapshot.ss_handle, true, NULL) ) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } while(current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_RECOVERED && current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_ERROR && current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_NONE ) { opal_progress(); } /* * Update the job structure since processes may have moved around */ if( ORTE_SUCCESS != (ret = global_refresh_job_structs()) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Finished checkpoint (internally requested) [%d]", current_job_ckpt_state)); if( currently_migrating ) { current_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE; cleanup_on_establish = false; report_progress_cur_loc_finished = 0; report_progress_last_reported_loc_finished = 0; } cleanup: currently_migrating = false; if( NULL != migrating_procs ) { while( NULL != (item = opal_list_remove_first(migrating_procs)) ) { OBJ_RELEASE(item); } OBJ_RELEASE(migrating_procs); migrating_procs = NULL; } return exit_status; } /****************** * Local functions ******************/ static int global_init_job_structs(void) { orte_snapc_full_orted_snapshot_t *orted_snapshot = NULL; orte_snapc_base_local_snapshot_t *app_snapshot = NULL; opal_list_item_t* orted_item = NULL; orte_node_t *cur_node = NULL; orte_job_map_t *map = NULL; orte_job_t *jdata = NULL; orte_proc_t **procs = NULL; orte_std_cntr_t i = 0; orte_vpid_t p = 0; orte_ns_cmp_bitmask_t mask; bool found = false; /* look up job data object */ if (NULL == (jdata = orte_get_job_data_object(current_global_jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } OBJ_CONSTRUCT(&global_snapshot, orte_snapc_base_global_snapshot_t); map = jdata->map; for (i=0; i < map->nodes->size; i++) { if (NULL == (cur_node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { continue; } procs = (orte_proc_t**)cur_node->procs->addr; /* * Look out for duplicates * JJH: Should not happen, but does if rmaps get a bug in setting up the map. */ found = false; for(orted_item = opal_list_get_first(&(global_snapshot.local_snapshots)); orted_item != opal_list_get_end(&(global_snapshot.local_snapshots)); orted_item = opal_list_get_next(orted_item) ) { orted_snapshot = (orte_snapc_full_orted_snapshot_t*)orted_item; /* * Is in list? */ if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &(cur_node->daemon->name), &(orted_snapshot->process_name) )) { found = true; break; } } if( found ) { OPAL_OUTPUT_VERBOSE((1, mca_snapc_full_component.super.output_handle, "Global) [%d] Found Daemon %s with %d procs - Duplicate!! - Should not happen!", i, ORTE_NAME_PRINT(&(cur_node->daemon->name)), cur_node->num_procs)); continue; } OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) [%d] Found Daemon %s with %d procs", i, ORTE_NAME_PRINT(&(cur_node->daemon->name)), cur_node->num_procs)); orted_snapshot = OBJ_NEW(orte_snapc_full_orted_snapshot_t); orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid; orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid; mask = ORTE_NS_CMP_JOBID; if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &orted_snapshot->process_name, ORTE_PROC_MY_NAME)) { global_coord_has_local_children = true; } for(p = 0; p < cur_node->num_procs; ++p) { OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) \t [%d] Found Process %s on Daemon %s", p, ORTE_NAME_PRINT(&(procs[p]->name)), ORTE_NAME_PRINT(&(cur_node->daemon->name)) )); app_snapshot = OBJ_NEW(orte_snapc_base_local_snapshot_t); app_snapshot->process_name.jobid = procs[p]->name.jobid; app_snapshot->process_name.vpid = procs[p]->name.vpid; opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super)); } opal_list_append(&global_snapshot.local_snapshots, &(orted_snapshot->super.super)); } return ORTE_SUCCESS; } static int global_refresh_job_structs(void) { orte_snapc_full_orted_snapshot_t *orted_snapshot = NULL; orte_snapc_base_local_snapshot_t *app_snapshot = NULL; opal_list_item_t* orted_item = NULL; opal_list_item_t* app_item = NULL; opal_list_item_t* item = NULL; orte_node_t *cur_node = NULL; orte_job_map_t *map = NULL; orte_job_t *jdata = NULL; orte_proc_t **procs = NULL; orte_proc_t *new_proc = NULL; orte_std_cntr_t i = 0; orte_vpid_t p = 0; bool found = false; orte_ns_cmp_bitmask_t mask; /* look up job data object */ if (NULL == (jdata = orte_get_job_data_object(current_global_jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Refreshing Job Structures... [%3d]", current_job_ckpt_state)); if( NULL != migrating_procs ) { for (item = opal_list_get_first(migrating_procs); item != opal_list_get_end(migrating_procs); item = opal_list_get_next(item)) { new_proc = (orte_proc_t*)item; /* * Look through all daemons */ found = false; for(orted_item = opal_list_get_first(&(global_snapshot.local_snapshots)); orted_item != opal_list_get_end(&(global_snapshot.local_snapshots)); orted_item = opal_list_get_next(orted_item) ) { orted_snapshot = (orte_snapc_full_orted_snapshot_t*)orted_item; /* * Look through all processes tracked by this daemon */ for(app_item = opal_list_get_first(&(orted_snapshot->super.local_snapshots)); app_item != opal_list_get_end(&(orted_snapshot->super.local_snapshots)); app_item = opal_list_get_next(app_item) ) { app_snapshot = (orte_snapc_base_local_snapshot_t*)app_item; if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &(new_proc->name), &(app_snapshot->process_name) )) { found = true; opal_list_remove_item(&(orted_snapshot->super.local_snapshots), app_item); break; } } if( found ) { break; } } } } /* * First make sure that all of the orted's have the proper number of * children, if no children, then stop tracking. */ map = jdata->map; for(orted_item = opal_list_get_first(&(global_snapshot.local_snapshots)); orted_item != opal_list_get_end(&(global_snapshot.local_snapshots)); orted_item = opal_list_get_next(orted_item) ) { orted_snapshot = (orte_snapc_full_orted_snapshot_t*)orted_item; /* Make sure this orted is in the map */ found = false; for (i=0; i < map->nodes->size; i++) { if (NULL == (cur_node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { continue; } if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &(cur_node->daemon->name), &(orted_snapshot->process_name) )) { found = true; break; } } /* If not, then remove all processes, keep ref. we might reuse it later */ if( !found ) { OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Found Empty Daemon %s not in map (Refresh)", ORTE_NAME_PRINT(&(orted_snapshot->process_name)) )); while( NULL != (item = opal_list_remove_first(&(orted_snapshot->super.local_snapshots))) ) { OBJ_RELEASE(item); } } } /* * Look for new nodes */ for (i=0; i < map->nodes->size; i++) { if (NULL == (cur_node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { continue; } procs = (orte_proc_t**)cur_node->procs->addr; /* * See if we are already tracking it, if so refresh it * (This daemon could have been restarted, and processes migrated back to it) */ found = false; for(orted_item = opal_list_get_first(&(global_snapshot.local_snapshots)); orted_item != opal_list_get_end(&(global_snapshot.local_snapshots)); orted_item = opal_list_get_next(orted_item) ) { orted_snapshot = (orte_snapc_full_orted_snapshot_t*)orted_item; if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &(cur_node->daemon->name), &(orted_snapshot->process_name) )) { found = true; break; } } if( found ) { OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) [%d] Found Daemon %s with %d procs (Refresh)", i, ORTE_NAME_PRINT(&(cur_node->daemon->name)), cur_node->num_procs)); /* Remove all old processes */ while( NULL != (item = opal_list_remove_first(&(orted_snapshot->super.local_snapshots))) ) { OBJ_RELEASE(item); } /* Add back new processes (a bit of overkill, sure, but it works) */ for(p = 0; p < cur_node->num_procs; ++p) { if( NULL == procs[p] ) { continue; } OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) \t [%d] Found Process %s on Daemon %s", p, ORTE_NAME_PRINT(&(procs[p]->name)), ORTE_NAME_PRINT(&(cur_node->daemon->name)) )); app_snapshot = OBJ_NEW(orte_snapc_base_local_snapshot_t); app_snapshot->process_name.jobid = procs[p]->name.jobid; app_snapshot->process_name.vpid = procs[p]->name.vpid; opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super)); } continue; } OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) [%d] Found Daemon %s with %d procs", i, ORTE_NAME_PRINT(&(cur_node->daemon->name)), cur_node->num_procs)); orted_snapshot = OBJ_NEW(orte_snapc_full_orted_snapshot_t); orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid; orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid; mask = ORTE_NS_CMP_ALL; if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &orted_snapshot->process_name, ORTE_PROC_MY_NAME)) { global_coord_has_local_children = true; } for(p = 0; p < cur_node->num_procs; ++p) { if( NULL == procs[p] ) { continue; } OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) \t [%d] Found Process %s on Daemon %s", p, ORTE_NAME_PRINT(&(procs[p]->name)), ORTE_NAME_PRINT(&(cur_node->daemon->name)) )); app_snapshot = OBJ_NEW(orte_snapc_base_local_snapshot_t); app_snapshot->process_name.jobid = procs[p]->name.jobid; app_snapshot->process_name.vpid = procs[p]->name.vpid; opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super)); } opal_list_append(&global_snapshot.local_snapshots, &(orted_snapshot->super.super)); } return ORTE_SUCCESS; } /***************** * Setup listeners *****************/ static int snapc_full_global_start_listener(void) { if (snapc_orted_recv_issued && ORTE_PROC_IS_HNP) { return ORTE_SUCCESS; } OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) Startup Coordinator Channel")); /* * Coordinator command listener */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SNAPC_FULL, ORTE_RML_PERSISTENT, snapc_full_global_orted_recv, NULL); snapc_orted_recv_issued = true; return ORTE_SUCCESS; } static int snapc_full_global_stop_listener(void) { if (!snapc_orted_recv_issued && ORTE_PROC_IS_HNP) { return ORTE_SUCCESS; } OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) Shutdown Coordinator Channel")); orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SNAPC_FULL); snapc_orted_recv_issued = false; return ORTE_SUCCESS; } static int snapc_full_global_start_cmdline_listener(void) { if (snapc_cmdline_recv_issued && ORTE_PROC_IS_HNP) { return ORTE_SUCCESS; } OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) Startup Command Line Channel")); /* * Coordinator command listener */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_CKPT, 0, snapc_full_global_cmdline_recv, NULL); snapc_cmdline_recv_issued = true; return ORTE_SUCCESS; } static int snapc_full_global_stop_cmdline_listener(void) { if (!snapc_cmdline_recv_issued && ORTE_PROC_IS_HNP) { return ORTE_SUCCESS; } OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) Shutdown Command Line Channel")); orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_CKPT); snapc_cmdline_recv_issued = false; return ORTE_SUCCESS; } /***************** * Listener Callbacks *****************/ static void snapc_full_global_cmdline_recv(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata) { int ret; orte_snapc_cmd_flag_t command; orte_std_cntr_t count = 1; orte_jobid_t jobid; opal_crs_base_ckpt_options_t *options = NULL; if( ORTE_RML_TAG_CKPT != tag ) { opal_output(mca_snapc_full_component.super.output_handle, "Global) Error: Unknown tag: Received a command message from %s (tag = %d).", ORTE_NAME_PRINT(sender), tag); ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return; } OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Command Line: Start a checkpoint operation [Sender = %s]", ORTE_NAME_PRINT(sender))); snapc_cmdline_recv_issued = false; /* Not a persistent RML message */ options = OBJ_NEW(opal_crs_base_ckpt_options_t); count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &count, ORTE_SNAPC_CMD))) { ORTE_ERROR_LOG(ret); goto cleanup; } /* * orte_checkpoint has requested that a checkpoint be taken */ if (ORTE_SNAPC_GLOBAL_INIT_CMD == command) { OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Command line requested a checkpoint [command %d]\n", command)); /* * Unpack the buffer from the orte_checkpoint command */ if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_init_cmd(sender, buffer, options, &jobid)) ) { ORTE_ERROR_LOG(ret); goto cleanup; } orte_checkpoint_sender = *sender; is_orte_checkpoint_connected = true; /* * If the application is not ready for a checkpoint, * then send back an error. */ if( !is_app_checkpointable ) { OPAL_OUTPUT_VERBOSE((1, mca_snapc_full_component.super.output_handle, "Global) request_cmd(): Checkpointing currently disabled, rejecting request")); if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender, 0, ORTE_SNAPC_CKPT_STATE_ERROR))) { ORTE_ERROR_LOG(ret); } orte_checkpoint_sender = orte_name_invalid; is_orte_checkpoint_connected = false; /* Reset the listener */ if( ORTE_SUCCESS != (ret = snapc_full_global_start_cmdline_listener() ) ){ ORTE_ERROR_LOG(ret); } goto cleanup; } /* * If the jobid was specified, and does not match the current job, then fail */ if( ORTE_JOBID_INVALID != jobid && jobid != current_global_jobid) { opal_output(mca_snapc_full_component.super.output_handle, "Global) Error: Jobid %s does not match the current jobid %s", ORTE_JOBID_PRINT(jobid), ORTE_JOBID_PRINT(current_global_jobid)); ORTE_ERROR_LOG(ORTE_ERROR); goto cleanup; } /************************* * Kick off the checkpoint *************************/ SNAPC_FULL_CLEAR_TIMERS(); SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_START); if( ORTE_SUCCESS != (ret = snapc_full_global_checkpoint(options) ) ) { ORTE_ERROR_LOG(ret); goto cleanup; } } /* * Terminate the connection (Not currently implemented) */ else if (ORTE_SNAPC_GLOBAL_TERM_CMD == command) { OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Command line requested to terminate connection (command %d)\n", command)); ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED); goto cleanup; } /* * Unknown command */ else { OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Command line sent an unknown command (command %d)\n", command)); ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED); goto cleanup; } cleanup: if( NULL != options ) { OBJ_RELEASE(options); options = NULL; } return; } void snapc_full_global_orted_recv(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata) { int ret; orte_snapc_full_cmd_flag_t command; orte_std_cntr_t count; static int num_inside = 0; if( ORTE_RML_TAG_SNAPC_FULL != tag ) { opal_output(mca_snapc_full_component.super.output_handle, "Global) Error: Unknown tag: Received a command message from %s (tag = %d).", ORTE_NAME_PRINT(sender), tag); ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return; } /* * This is a message from a Local Coordinator */ OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) Receive a command message from %s.", ORTE_NAME_PRINT(sender))); count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &count, ORTE_SNAPC_FULL_CMD))) { ORTE_ERROR_LOG(ret); return; } ++num_inside; switch (command) { case ORTE_SNAPC_FULL_UPDATE_JOB_STATE_QUICK_CMD: OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Command: Job State Update (quick)")); snapc_full_process_job_update_cmd(sender, buffer, true); break; case ORTE_SNAPC_FULL_UPDATE_JOB_STATE_CMD: OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Command: Job State Update")); snapc_full_process_job_update_cmd(sender, buffer, false); break; case ORTE_SNAPC_FULL_UPDATE_ORTED_STATE_QUICK_CMD: OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Command: Daemon State Update (quick)")); snapc_full_process_orted_update_cmd(sender, buffer, true); break; case ORTE_SNAPC_FULL_UPDATE_ORTED_STATE_CMD: OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Command: Daemon State Update")); snapc_full_process_orted_update_cmd(sender, buffer, false); break; case ORTE_SNAPC_FULL_RESTART_PROC_INFO: OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Command: Update hostname/pid associations")); snapc_full_process_restart_proc_info_cmd(sender, buffer); break; case ORTE_SNAPC_FULL_REQUEST_OP_CMD: OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Command: Request Op")); snapc_full_process_request_op_cmd(sender, buffer); break; default: ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); } return; } static void snapc_full_process_request_op_cmd(orte_process_name_t* sender, opal_buffer_t* sbuffer) { int ret; orte_std_cntr_t count = 1; orte_jobid_t jobid; int op_event, op_state; opal_crs_base_ckpt_options_t *options = NULL; opal_buffer_t *buffer = NULL; orte_snapc_full_cmd_flag_t command = ORTE_SNAPC_FULL_REQUEST_OP_CMD; int seq_num = -1, i; char * global_handle = NULL, *tmp_str = NULL; orte_snapc_base_request_op_t *datum = NULL; orte_checkpoint_sender = orte_name_invalid; count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(sbuffer, &jobid, &count, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); goto cleanup; } count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(sbuffer, &op_event, &count, OPAL_INT))) { ORTE_ERROR_LOG(ret); goto cleanup; } OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) process_request_op(): Op Code %2d\n", op_event)); /************************************ * Application have been initialized, and are ready for checkpointing ************************************/ if( ORTE_SNAPC_OP_INIT == op_event ) { OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle, "Global) process_request_op(): Checkpointing Enabled (%2d)\n", op_event)); is_app_checkpointable = true; } /************************************ * Application is finalizing, and no longer ready for checkpointing. ************************************/ else if( ORTE_SNAPC_OP_FIN == op_event ) { OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle, "Global) process_request_op(): Checkpointing Disabled (%2d)\n", op_event)); is_app_checkpointable = false; /* * Wait for any ongoing checkpoints to finish */ if( current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_ERROR && current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_NONE ) { OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle, "Global) process_request_op(): Wait for ongoing checkpoint to complete...")); while( current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_ERROR && current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_NONE ) { opal_progress(); } } /* * Tell application that it is now ok to finailze */ OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle, "Global) process_request_op(): Send Finalize ACK to the job")); buffer = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_SNAPC_FULL_CMD))) { ORTE_ERROR_LOG(ret); goto cleanup; } op_event = ORTE_SNAPC_OP_FIN_ACK; if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &op_event, 1, OPAL_INT))) { ORTE_ERROR_LOG(ret); goto cleanup; } if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(sender, buffer, ORTE_RML_TAG_SNAPC_FULL, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); goto cleanup; } /* buffer should not be released here; the callback releases it */ buffer = NULL; } /************************************ * Start a checkpoint operation ************************************/ else if( ORTE_SNAPC_OP_CHECKPOINT == op_event ) { OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) process_request_op(): Starting checkpoint (%2d)\n", op_event)); options = OBJ_NEW(opal_crs_base_ckpt_options_t); if( ORTE_SUCCESS != (ret = snapc_full_global_checkpoint(options) ) ) { ORTE_ERROR_LOG(ret); goto cleanup; } /* * Wait for the operation to complete */ while( current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_ERROR && current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_NONE ) { opal_progress(); } if( ORTE_SNAPC_CKPT_STATE_ERROR == current_job_ckpt_state ) { op_state = -1; } else { op_state = 0; } /* * Tell the sender that the operation is finished */ buffer = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_SNAPC_FULL_CMD))) { ORTE_ERROR_LOG(ret); goto cleanup; } if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &op_event, 1, OPAL_INT))) { ORTE_ERROR_LOG(ret); goto cleanup; } if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &op_state, 1, OPAL_INT))) { ORTE_ERROR_LOG(ret); goto cleanup; } if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(sender, buffer, ORTE_RML_TAG_SNAPC_FULL, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); goto cleanup; } /* buffer should not be released here; the callback releases it */ buffer = NULL; } /************************************ * Start the Restart operation ************************************/ else if( ORTE_SNAPC_OP_RESTART == op_event ) { OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) process_request_op(): Starting restart (%2d)\n", op_event)); count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(sbuffer, &seq_num, &count, OPAL_INT))) { ORTE_ERROR_LOG(ret); orte_snapc_ckpt_state_notify(ORTE_SNAPC_CKPT_STATE_NO_RESTART); goto cleanup; } count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(sbuffer, &global_handle, &count, OPAL_STRING))) { ORTE_ERROR_LOG(ret); orte_snapc_ckpt_state_notify(ORTE_SNAPC_CKPT_STATE_NO_RESTART); goto cleanup; } /* * Kick off the restart */ if( ORTE_SUCCESS != (ret = orte_errmgr_base_restart_job(current_global_jobid, global_handle, seq_num) ) ) { ORTE_ERROR_LOG(ret); orte_snapc_ckpt_state_notify(ORTE_SNAPC_CKPT_STATE_NO_RESTART); goto cleanup; } } /************************************ * Start the Migration operation ************************************/ else if( ORTE_SNAPC_OP_MIGRATE == op_event ) { OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) process_request_op(): Starting migration (%2d)\n", op_event)); datum = OBJ_NEW(orte_snapc_base_request_op_t); /* * Unpack migration information */ count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(sbuffer, &(datum->mig_num), &count, OPAL_INT))) { ORTE_ERROR_LOG(ret); goto cleanup; } datum->mig_vpids = malloc(sizeof(int) * datum->mig_num); datum->mig_host_pref = malloc(sizeof(char) * datum->mig_num * OPAL_MAX_PROCESSOR_NAME); datum->mig_vpid_pref = malloc(sizeof(int) * datum->mig_num); datum->mig_off_node = malloc(sizeof(int) * datum->mig_num); for( i = 0; i < datum->mig_num; ++i ) { (datum->mig_vpids)[i] = 0; (datum->mig_host_pref)[i][0] = '\0'; (datum->mig_vpid_pref)[i] = 0; (datum->mig_off_node)[i] = (int)false; } for( i = 0; i < datum->mig_num; ++i ) { count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(sbuffer, &((datum->mig_vpids)[i]), &count, OPAL_INT))) { ORTE_ERROR_LOG(ret); goto cleanup; } if(NULL != tmp_str ) { free(tmp_str); tmp_str = NULL; } count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(sbuffer, &tmp_str, &count, OPAL_STRING))) { ORTE_ERROR_LOG(ret); goto cleanup; } strncpy( ((datum->mig_host_pref)[i]), tmp_str, OPAL_MAX_PROCESSOR_NAME); count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(sbuffer, &((datum->mig_vpid_pref)[i]), &count, OPAL_INT))) { ORTE_ERROR_LOG(ret); goto cleanup; } count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(sbuffer, &((datum->mig_off_node)[i]), &count, OPAL_INT))) { ORTE_ERROR_LOG(ret); goto cleanup; } OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle, "Global) Migration %3d/%3d: Received Rank %3d - Requested <%s> (%3d) %c\n", datum->mig_num, i, (datum->mig_vpids)[i], (datum->mig_host_pref)[i], (datum->mig_vpid_pref)[i], (OPAL_INT_TO_BOOL((datum->mig_off_node)[i]) ? 'T' : 'F') )); } /* * Kick off the migration */ OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle, "Global) ------ Kick Off Migration -----")); if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_job(current_global_jobid, datum) ) ) { ORTE_ERROR_LOG(ret); goto cleanup; } /* * Tell the sender that the operation is finished */ OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle, "Global) ------ Finished Migration. Release processes (%15s )-----", ORTE_NAME_PRINT(sender) )); buffer = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_SNAPC_FULL_CMD))) { ORTE_ERROR_LOG(ret); goto cleanup; } if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &op_event, 1, OPAL_INT))) { ORTE_ERROR_LOG(ret); goto cleanup; } op_state = 0; if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &op_state, 1, OPAL_INT))) { ORTE_ERROR_LOG(ret); goto cleanup; } if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(sender, buffer, ORTE_RML_TAG_SNAPC_FULL, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); goto cleanup; } OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle, "Global) ------ Finished Migration. Released processes (%15s )-----", ORTE_NAME_PRINT(sender) )); } /************************************ * Start the Quiesce operation ************************************/ else if( ORTE_SNAPC_OP_QUIESCE_START == op_event) { OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) process_request_op(): Starting quiesce (%2d)\n", op_event)); options = OBJ_NEW(opal_crs_base_ckpt_options_t); options->inc_prep_only = true; if( ORTE_SUCCESS != (ret = snapc_full_global_checkpoint(options) ) ) { ORTE_ERROR_LOG(ret); goto cleanup; } /* * Wait for quiescence */ while( current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_ERROR && current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_INC_PREPED ) { opal_progress(); } OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) process_request_op(): Quiesce_start finished(%2d)\n", op_event)); } /************************************ * End the Quiesce operation ************************************/ else if( ORTE_SNAPC_OP_QUIESCE_END == op_event) { OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) process_request_op(): Ending quiesce (%2d)\n", op_event)); /* * Wait for the checkpoint operation to finish */ while( current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_ERROR && current_job_ckpt_state != ORTE_SNAPC_CKPT_STATE_NONE ) { opal_progress(); } OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) process_request_op(): Quiesce_end finished(%2d)\n", op_event)); } cleanup: if (NULL != buffer) { OBJ_RELEASE(buffer); buffer = NULL; } if( NULL != options ) { OBJ_RELEASE(options); options = NULL; } if(NULL != tmp_str ) { free(tmp_str); tmp_str = NULL; } return; } static int snapc_full_process_orted_update_cmd(orte_process_name_t* sender, opal_buffer_t* buffer, bool quick) { int ret, exit_status = ORTE_SUCCESS; orte_std_cntr_t count; int remote_ckpt_state; opal_list_item_t* item = NULL; opal_list_item_t* aitem = NULL; orte_snapc_full_orted_snapshot_t *orted_snapshot = NULL; orte_snapc_base_local_snapshot_t *app_snapshot = NULL; int loc_min_state; char *state_str = NULL; orted_snapshot = find_orted_snapshot(sender); if( NULL == orted_snapshot ) { opal_output(mca_snapc_full_component.super.output_handle, "Global) Error: Unknown Daemon %s", ORTE_NAME_PRINT(sender) ); exit_status = ORTE_ERROR; ORTE_ERROR_LOG(ORTE_ERROR); goto cleanup; } OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle, "Global) Daemon %s: Changed state to:\n", ORTE_NAME_PRINT(&(orted_snapshot->process_name)) )); /* * Unpack the data (quick) * - state * Unpack the data (long) * - state * - # procs * - Foreach proc * - process name */ count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &remote_ckpt_state, &count, OPAL_INT))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } orted_snapshot->state = remote_ckpt_state; orte_snapc_ckpt_state_str(&state_str, orted_snapshot->state); OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle, "Global) State: %d (%s)\n", (int)(orted_snapshot->state), state_str)); free(state_str); state_str = NULL; /* JJH: Though there is currently no additional information sent in a long * message versus a small message, keep this logic so that in the * future it can be easily reused without substantially modifying * the component. */ if( quick ) { exit_status = ORTE_SUCCESS; goto post_process; } post_process: loc_min_state = snapc_full_global_get_min_state(); SNAPC_FULL_REPORT_PROGRESS(orted_snapshot, current_total_orteds, loc_min_state); /* * Notify the orte-checkpoint command once we have everyone running. * No need to broadcast this to everyone since they already know. */ if( ORTE_SNAPC_CKPT_STATE_RUNNING == loc_min_state && ORTE_SNAPC_CKPT_STATE_RUNNING != current_job_ckpt_state) { current_job_ckpt_state = loc_min_state; SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_RUNNING); if( is_orte_checkpoint_connected && ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender, global_snapshot.ss_handle, current_job_ckpt_state)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } /* * If we are just prep'ing the INC, then acknowledge the state change */ if( ORTE_SNAPC_CKPT_STATE_INC_PREPED == loc_min_state && ORTE_SNAPC_CKPT_STATE_INC_PREPED > current_job_ckpt_state) { current_job_ckpt_state = loc_min_state; OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle, "Global) All Processes have finished the INC prep!\n")); } /* * Notify the orte-checkpoint command once we have everyone stopped. * No need to broadcast this to everyone since they already know. */ if( ORTE_SNAPC_CKPT_STATE_STOPPED == loc_min_state && ORTE_SNAPC_CKPT_STATE_STOPPED > current_job_ckpt_state) { current_job_ckpt_state = loc_min_state; OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle, "Global) All Processes have been stopped!\n")); if( is_orte_checkpoint_connected && ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender, global_snapshot.ss_handle, current_job_ckpt_state)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* orte-checkpoint detaches at this point */ is_orte_checkpoint_connected = false; /* * Synchronize the checkpoint here */ write_out_global_metadata(); } /* * If all daemons have finished, let everyone know we are locally finished. */ if( ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL == loc_min_state && ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL > current_job_ckpt_state) { SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_FIN_LOCAL); if( ORTE_SNAPC_CKPT_STATE_NONE != current_job_ckpt_state ) { if( loc_min_state == current_job_ckpt_state) { opal_output(0, "Global) JJH WARNING!!: (%d) == (%d)", loc_min_state, current_job_ckpt_state); } } if( currently_migrating ) { write_out_global_metadata(); current_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_MIGRATING; } else { current_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL; } if( NULL != state_str ) { free(state_str); } orte_snapc_ckpt_state_str(&state_str, current_job_ckpt_state); OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle, "Global) Job State Changed: %d (%s)\n", (int)current_job_ckpt_state, state_str )); free(state_str); state_str = NULL; if( ORTE_SUCCESS != (ret = orte_snapc_full_global_set_job_ckpt_info(current_global_jobid, current_job_ckpt_state, global_snapshot.ss_handle, true, NULL) ) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Now that we have finished locally, * - Write out the metadata * - Sync the snapshot to SStore * if we are stopping then we have already written out this data. */ if( !(global_snapshot.options->stop) && !currently_migrating ) { write_out_global_metadata(); } SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_ESTABLISH); if( ORTE_SUCCESS != (ret = orte_snapc_full_global_set_job_ckpt_info(current_global_jobid, ORTE_SNAPC_CKPT_STATE_ESTABLISHED, global_snapshot.ss_handle, true, NULL) ) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } /* * If all daemons have confirmed that their local proces are finished * and we have finished establishing the checkpoint, * then let the command line tool know and cleanup. */ if( ORTE_SNAPC_CKPT_STATE_RECOVERED == loc_min_state && ORTE_SNAPC_CKPT_STATE_RECOVERED > current_job_ckpt_state ) { /* * If this is a job restarting then we do something different */ if( current_job_ckpt_state == ORTE_SNAPC_CKPT_STATE_NONE ) { OPAL_OUTPUT_VERBOSE((5, mca_snapc_full_component.super.output_handle, "Global) Job has been successfully restarted")); /*current_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_RECOVERED;*/ orte_snapc_ckpt_state_notify(ORTE_SNAPC_CKPT_STATE_RECOVERED); for(item = opal_list_get_first(&(global_snapshot.local_snapshots)); item != opal_list_get_end(&(global_snapshot.local_snapshots)); item = opal_list_get_next(item) ) { orted_snapshot = (orte_snapc_full_orted_snapshot_t*)item; orted_snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; for(aitem = opal_list_get_first(&(orted_snapshot->super.local_snapshots)); aitem != opal_list_get_end(&(orted_snapshot->super.local_snapshots)); aitem = opal_list_get_next(aitem) ) { app_snapshot = (orte_snapc_base_local_snapshot_t*)aitem; app_snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; } } SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_RECOVERED); SNAPC_FULL_DISPLAY_RECOVERED_TIMER(); orte_snapc_base_has_recovered = true; is_app_checkpointable = true; exit_status = ORTE_SUCCESS; goto cleanup; } /* * If the checkpoint has not been established yet, then do not clear the * snapshot structure just yet. */ if(ORTE_SNAPC_CKPT_STATE_ESTABLISHED != current_job_ckpt_state ) { cleanup_on_establish = true; } current_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_RECOVERED; if( NULL != state_str ) { free(state_str); } orte_snapc_ckpt_state_str(&state_str, current_job_ckpt_state); OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle, "Global) Job State Changed: %d (%s)\n", (int)current_job_ckpt_state, state_str )); free(state_str); state_str = NULL; /* * Notify the orte-checkpoint command */ if( is_orte_checkpoint_connected && ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender, global_snapshot.ss_handle, current_job_ckpt_state)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_RECOVERED); /* * If the checkpoint has been established at this point, then cleanup. */ if( !cleanup_on_establish && ORTE_SNAPC_CKPT_STATE_RECOVERED == current_job_ckpt_state) { if( ORTE_SUCCESS != (ret = orte_snapc_full_global_reset_coord()) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } } cleanup: if( NULL != state_str ) { free(state_str); state_str = NULL; } return exit_status; } static void snapc_full_process_restart_proc_info_cmd(orte_process_name_t* sender, opal_buffer_t* buffer) { int ret; orte_std_cntr_t count; size_t num_vpids = 0, i; pid_t tmp_pid; char * tmp_hostname = NULL; count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &tmp_hostname, &count, OPAL_STRING))) { opal_output(mca_snapc_full_component.super.output_handle, "Global) vpid_assoc: Failed to unpack process Hostname from peer %s\n", ORTE_NAME_PRINT(sender)); goto cleanup; } count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_vpids, &count, OPAL_SIZE))) { opal_output(mca_snapc_full_component.super.output_handle, "Global) vpid_assoc: Failed to unpack num_vpids from peer %s\n", ORTE_NAME_PRINT(sender)); goto cleanup; } for(i = 0; i < num_vpids; ++i) { count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &tmp_pid, &count, OPAL_PID))) { opal_output(mca_snapc_full_component.super.output_handle, "Global) vpid_assoc: Failed to unpack process PID from peer %s\n", ORTE_NAME_PRINT(sender)); goto cleanup; } global_coord_restart_proc_info(tmp_pid, tmp_hostname); } /* stdout may be buffered by the C library so it needs to be flushed so * that the debugger can read the process info. */ fflush(stdout); cleanup: return; } int global_coord_restart_proc_info(pid_t local_pid, char * local_hostname) { printf("MPIR_debug_info) %s:%d\n", local_hostname, local_pid); return 0; } static void snapc_full_process_job_update_cmd(orte_process_name_t* sender, opal_buffer_t* buffer, bool quick) { int ret; orte_std_cntr_t count; orte_jobid_t jobid; int job_ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE; opal_crs_base_ckpt_options_t *options = NULL; bool loc_migrating = false; size_t loc_num_procs = 0; orte_proc_t *proc = NULL; size_t i; orte_sstore_base_handle_t ss_handle; /* * Unpack the data (quick) * - jobid * - ckpt_state * - sstore_handle * Unpack the data (long) * - jobid * - ckpt_state * - ckpt_options */ count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &jobid, &count, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); return; } count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job_ckpt_state, &count, OPAL_INT))) { ORTE_ERROR_LOG(ret); return; } if( !quick ) { if (ORTE_SUCCESS != (ret = orte_sstore.unpack_handle(sender, buffer, &ss_handle)) ) { ORTE_ERROR_LOG(ret); return; } options = OBJ_NEW(opal_crs_base_ckpt_options_t); if( ORTE_SUCCESS != (ret = orte_snapc_base_unpack_options(buffer, options)) ) { ORTE_ERROR_LOG(ret); return; } /* In this case we want to use the current_options that are cached * so that we do not have to send them every time. */ opal_crs_base_copy_options(options, global_snapshot.options); count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(loc_migrating), &count, OPAL_BOOL))) { ORTE_ERROR_LOG(ret); goto cleanup; } if( loc_migrating ) { count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &loc_num_procs, &count, OPAL_SIZE))) { ORTE_ERROR_LOG(ret); goto cleanup; } for( i = 0; i < loc_num_procs; ++i ) { count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc, &count, ORTE_NAME))) { ORTE_ERROR_LOG(ret); goto cleanup; } /* JJH: Update local info as needed */ } } } if( ORTE_SUCCESS != (ret = global_coord_job_state_update(jobid, job_ckpt_state, ss_handle, global_snapshot.options) ) ) { ORTE_ERROR_LOG(ret); } cleanup: if( NULL != options ) { OBJ_RELEASE(options); options = NULL; } return; } static int snapc_full_establish_snapshot_dir(bool empty_metadata) { char **value = NULL; int idx = 0; /********************* * Contact the Stable Storage Framework to setup the storage directory *********************/ INC_SEQ_NUM(); orte_sstore.request_checkpoint_handle(&(global_snapshot.ss_handle), orte_snapc_base_snapshot_seq_number, current_global_jobid); if( currently_migrating ) { orte_sstore.set_attr(global_snapshot.ss_handle, SSTORE_METADATA_GLOBAL_MIGRATING, "1"); } orte_sstore.register_handle(global_snapshot.ss_handle); /* * Save the AMCA parameter used into the metadata file */ if( 0 > (idx = mca_base_var_find("opal", "mca", "base", "param_file_prefix")) ) { opal_show_help("help-orte-restart.txt", "amca_param_not_found", true); } if( 0 < idx ) { mca_base_var_get_value (idx, &value, NULL, NULL); if (*value) { orte_sstore.set_attr(global_snapshot.ss_handle, SSTORE_METADATA_GLOBAL_AMCA_PARAM, *value); OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) AMCA Parameter Preserved: %s", *value)); } } /* * Save the TUNE parameter used into the metadata file */ if( 0 > (idx = mca_base_var_find("opal", "mca", "base", "envar_file_prefix")) ) { opal_show_help("help-orte-restart.txt", "tune_param_not_found", true); } if( 0 < idx ) { mca_base_var_get_value (idx, &value, NULL, NULL); if (*value) { orte_sstore.set_attr(global_snapshot.ss_handle, SSTORE_METADATA_GLOBAL_TUNE_PARAM, *value); OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) TUNE Parameter Preserved: %s", *value)); } } return ORTE_SUCCESS; } static int snapc_full_global_checkpoint(opal_crs_base_ckpt_options_t *options) { int ret, exit_status = ORTE_SUCCESS; OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) Checkpoint of job %s has been requested\n", ORTE_JOBID_PRINT(current_global_jobid))); /* opal_output(0, "================> JJH Checkpoint Started"); */ current_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_REQUEST; /********************* * Generate the global snapshot directory, and unique global snapshot handle *********************/ if( ORTE_SUCCESS != (ret = snapc_full_establish_snapshot_dir(false))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /*********************************** * Do an update handshake with the orte_checkpoint command ***********************************/ updated_job_to_running = false; if( is_orte_checkpoint_connected && ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender, global_snapshot.ss_handle, current_job_ckpt_state) ) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /********************** * Notify the Local Snapshot Coordinators of the checkpoint request **********************/ OPAL_OUTPUT_VERBOSE((15, mca_snapc_full_component.super.output_handle, "Global) Notifying the Local Coordinators\n")); if( ORTE_SUCCESS != (ret = snapc_full_global_notify_checkpoint(current_global_jobid, options)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } cleanup: return exit_status; } static int snapc_full_global_notify_checkpoint(orte_jobid_t jobid, opal_crs_base_ckpt_options_t *options) { int ret, exit_status = ORTE_SUCCESS; orte_snapc_full_orted_snapshot_t *orted_snapshot = NULL; opal_list_item_t* item = NULL; int ckpt_state; ckpt_state = ORTE_SNAPC_CKPT_STATE_PENDING; /* * Copy over the options */ opal_crs_base_copy_options(options, global_snapshot.options); /* * Update the global structure */ for(item = opal_list_get_first(&global_snapshot.local_snapshots); item != opal_list_get_end(&global_snapshot.local_snapshots); item = opal_list_get_next(item) ) { orted_snapshot = (orte_snapc_full_orted_snapshot_t*)item; orted_snapshot->state = ckpt_state; } /* * Update the job state, and broadcast to all local daemons */ if( ORTE_SUCCESS != (ret = orte_snapc_full_global_set_job_ckpt_info(jobid, ckpt_state, global_snapshot.ss_handle, false, options) ) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } cleanup: return exit_status; } /********************************** * Job/Proc State Set/Get Routines **********************************/ static int orte_snapc_full_global_set_job_ckpt_info( orte_jobid_t jobid, int ckpt_state, orte_sstore_base_handle_t handle, bool quick, opal_crs_base_ckpt_options_t *options) { int ret, exit_status = ORTE_SUCCESS; orte_snapc_full_cmd_flag_t command; opal_buffer_t *buffer = NULL; char * state_str = NULL; orte_proc_t *proc = NULL; opal_list_item_t *item = NULL; size_t num_procs; orte_grpcomm_signature_t *sig; /* * Update all Local Coordinators (broadcast operation) */ buffer = OBJ_NEW(opal_buffer_t); if( quick ) { command = ORTE_SNAPC_FULL_UPDATE_JOB_STATE_QUICK_CMD; } else { command = ORTE_SNAPC_FULL_UPDATE_JOB_STATE_CMD; } if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_SNAPC_FULL_CMD))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &ckpt_state, 1, OPAL_INT))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } if( quick ) { goto process_msg; } if (ORTE_SUCCESS != (ret = orte_sstore.pack_handle(NULL, buffer, handle))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } if(ORTE_SUCCESS != (ret = orte_snapc_base_pack_options(buffer, options))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(currently_migrating), 1, OPAL_BOOL))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } if( currently_migrating ) { num_procs = opal_list_get_size(migrating_procs); if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &num_procs, 1, OPAL_SIZE))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } for (item = opal_list_get_first(migrating_procs); item != opal_list_get_end(migrating_procs); item = opal_list_get_next(item)) { proc = (orte_proc_t*)item; if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(proc->name), 1, ORTE_NAME))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } } process_msg: orte_snapc_ckpt_state_str(&state_str, ckpt_state); OPAL_OUTPUT_VERBOSE((15, mca_snapc_full_component.super.output_handle, "Global) Notify Local Coordinators of job %s state change to %d (%s)\n", ORTE_JOBID_PRINT(jobid), (int)ckpt_state, state_str )); free(state_str); state_str = NULL; /* goes to all daemons */ sig = OBJ_NEW(orte_grpcomm_signature_t); sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t)); sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid; sig->signature[0].vpid = ORTE_VPID_WILDCARD; if (ORTE_SUCCESS != (ret = orte_grpcomm.xcast(sig, ORTE_RML_TAG_SNAPC_FULL, buffer))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * We will also receive the job update, and process in the RML callback */ cleanup: if( NULL != state_str ) { free(state_str); state_str = NULL; } OBJ_RELEASE(buffer); OBJ_RELEASE(sig); return exit_status; } int global_coord_job_state_update(orte_jobid_t jobid, int job_ckpt_state, orte_sstore_base_handle_t ss_handle, opal_crs_base_ckpt_options_t *options) { int ret, exit_status = ORTE_SUCCESS; char * state_str = NULL; orte_snapc_ckpt_state_str(&state_str, job_ckpt_state); OPAL_OUTPUT_VERBOSE((15, mca_snapc_full_component.super.output_handle, "Global) Job update command: jobid %s -> state %d (%s)\n", ORTE_JOBID_PRINT(jobid), (int)job_ckpt_state, state_str )); free(state_str); state_str = NULL; /************************ * Update the orte_checkpoint command ************************/ current_job_ckpt_state = job_ckpt_state; if( is_orte_checkpoint_connected && ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(&orte_checkpoint_sender, global_snapshot.ss_handle, current_job_ckpt_state)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Global Coordinator: If also a Local coordinator then act locally before globally */ if( ORTE_SNAPC_LOCAL_COORD_TYPE == (orte_snapc_coord_type & ORTE_SNAPC_LOCAL_COORD_TYPE) ) { if( ORTE_SUCCESS != (ret = local_coord_job_state_update(jobid, job_ckpt_state, ss_handle, options)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } /* * Process the cmd */ if(ORTE_SNAPC_CKPT_STATE_ESTABLISHED == job_ckpt_state ) { /* * If the processes recovered before the checkpoint was established, * then we need to cleanup here instead of in the recovery block */ if( cleanup_on_establish ) { if( ORTE_SUCCESS != (ret = orte_snapc_full_global_reset_coord()) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } } else if(ORTE_SNAPC_CKPT_STATE_ERROR == job_ckpt_state ) { opal_output(mca_snapc_full_component.super.output_handle, "Error: Checkpoint failed!"); } /* * This should not happen, since this state is always handled locally */ else if(ORTE_SNAPC_CKPT_STATE_STOPPED == job_ckpt_state ) { ; } /* * This should not happen, since we do not handle this case */ else if(ORTE_SNAPC_CKPT_STATE_REQUEST == job_ckpt_state ) { opal_output(mca_snapc_full_component.super.output_handle, "ERROR: Internal Checkpoint request not implemented."); ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED); } cleanup: if( NULL != state_str) { free(state_str); state_str = NULL; } return exit_status; } static int write_out_global_metadata(void) { orte_snapc_full_orted_snapshot_t *orted_snapshot = NULL; opal_list_item_t* orted_item = NULL; OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle, "Global) Updating Metadata")); /* * Check for an error * JJH CLEANUP: Check might be good, but mostly unnecessary * JJH: Do we want to pass this along to the SStore? Probably */ for(orted_item = opal_list_get_first(&(global_snapshot.local_snapshots)); orted_item != opal_list_get_end(&(global_snapshot.local_snapshots)); orted_item = opal_list_get_next(orted_item) ) { orted_snapshot = (orte_snapc_full_orted_snapshot_t*)orted_item; if( ORTE_SNAPC_CKPT_STATE_ERROR == orted_snapshot->state ) { return ORTE_ERROR; } } /* * Sync the stable storage */ orte_sstore.sync(global_snapshot.ss_handle); SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_SS_SYNC); return ORTE_SUCCESS; } static orte_snapc_full_orted_snapshot_t *find_orted_snapshot(orte_process_name_t *name ) { int ret; orte_snapc_full_orted_snapshot_t *orted_snapshot = NULL; opal_list_item_t* item = NULL; orte_ns_cmp_bitmask_t mask; for(item = opal_list_get_first(&(global_snapshot.local_snapshots)); item != opal_list_get_end(&(global_snapshot.local_snapshots)); item = opal_list_get_next(item) ) { orted_snapshot = (orte_snapc_full_orted_snapshot_t*)item; mask = ORTE_NS_CMP_ALL; if (OPAL_EQUAL == orte_util_compare_name_fields(mask, name, &orted_snapshot->process_name)) { return orted_snapshot; } } /* * Refresh the job structure, and try again */ OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle, "Global) find_orted(%s) failed. Refreshing and trying again...", ORTE_NAME_PRINT(name) )); if( ORTE_SUCCESS != (ret = global_refresh_job_structs()) ) { ORTE_ERROR_LOG(ret); return NULL; } for(item = opal_list_get_first(&(global_snapshot.local_snapshots)); item != opal_list_get_end(&(global_snapshot.local_snapshots)); item = opal_list_get_next(item) ) { orted_snapshot = (orte_snapc_full_orted_snapshot_t*)item; mask = ORTE_NS_CMP_ALL; if (OPAL_EQUAL == orte_util_compare_name_fields(mask, name, &orted_snapshot->process_name)) { return orted_snapshot; } } return NULL; } static int snapc_full_global_get_min_state(void) { int min_state = ORTE_SNAPC_CKPT_MAX; orte_snapc_full_orted_snapshot_t *orted_snapshot = NULL; opal_list_item_t* item = NULL; char * state_str_a = NULL; char * state_str_b = NULL; current_total_orteds = 0; for(item = opal_list_get_first(&(global_snapshot.local_snapshots)); item != opal_list_get_end(&(global_snapshot.local_snapshots)); item = opal_list_get_next(item) ) { orted_snapshot = (orte_snapc_full_orted_snapshot_t*)item; /* Ignore orteds with no processes */ if( 0 >= opal_list_get_size(&(orted_snapshot->super.local_snapshots)) ) { OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) ... %s Skipping - (no children)", ORTE_NAME_PRINT(&orted_snapshot->process_name) )); continue; } current_total_orteds++; if( NULL != state_str_a ) { free(state_str_a); state_str_a = NULL; } if( NULL != state_str_b ) { free(state_str_b); state_str_b = NULL; } orte_snapc_ckpt_state_str(&state_str_a, orted_snapshot->state); orte_snapc_ckpt_state_str(&state_str_b, min_state); OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) ... %s Checking [%d %s] vs [%d %s]", ORTE_NAME_PRINT(&orted_snapshot->process_name), (int)orted_snapshot->state, state_str_a, min_state, state_str_b )); if( (int)min_state > (int)orted_snapshot->state ) { min_state = orted_snapshot->state; OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) ... %s Update --> Min State [%d %s]", ORTE_NAME_PRINT(&orted_snapshot->process_name), (int)min_state, state_str_a )); } } if( NULL != state_str_b ) { free(state_str_b); state_str_b = NULL; } orte_snapc_ckpt_state_str(&state_str_b, min_state); OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "Global) ... Min State [%d %s]", (int)min_state, state_str_b )); if( NULL != state_str_a ) { free(state_str_a); state_str_a = NULL; } if( NULL != state_str_b ) { free(state_str_b); state_str_b = NULL; } return min_state; } static int orte_snapc_full_global_reset_coord(void) { int ret, exit_status = ORTE_SUCCESS; opal_list_item_t* item = NULL; opal_list_item_t* aitem = NULL; orte_snapc_full_orted_snapshot_t *orted_snapshot = NULL; orte_snapc_base_local_snapshot_t *app_snapshot = NULL; /******************************** * Terminate the job if requested * At this point the application should have already exited, but do this * just to make doubly sure that the job is terminated. *********************************/ if( global_snapshot.options->term ) { SNAPC_FULL_DISPLAY_ALL_TIMERS(); orte_plm.terminate_job(current_global_jobid); } else { SNAPC_FULL_DISPLAY_ALL_TIMERS(); } /* * Just cleanup, do not need to send out another message */ opal_crs_base_clear_options(global_snapshot.options); /* * Reset global data structures */ for(item = opal_list_get_first(&(global_snapshot.local_snapshots)); item != opal_list_get_end(&(global_snapshot.local_snapshots)); item = opal_list_get_next(item) ) { orted_snapshot = (orte_snapc_full_orted_snapshot_t*)item; orted_snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; for(aitem = opal_list_get_first(&(orted_snapshot->super.local_snapshots)); aitem != opal_list_get_end(&(orted_snapshot->super.local_snapshots)); aitem = opal_list_get_next(aitem) ) { app_snapshot = (orte_snapc_base_local_snapshot_t*)aitem; app_snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; } } /************************ * Set up the Command Line listener again *************************/ is_orte_checkpoint_connected = false; if( ORTE_SUCCESS != (ret = snapc_full_global_start_cmdline_listener() ) ){ ORTE_ERROR_LOG(ret); exit_status = ret; } current_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE; cleanup_on_establish = false; report_progress_cur_loc_finished = 0; report_progress_last_reported_loc_finished = 0; return exit_status; } /************************ * Timing ************************/ static void snapc_full_set_time(int idx) { if(idx < SNAPC_FULL_TIMER_MAX ) { if( timer_start[idx] <= 0.0 ) { timer_start[idx] = snapc_full_get_time(); } } } static void snapc_full_display_all_timers(void) { double diff = 0.0; char * label = NULL; opal_output(0, "Snapshot Coordination Timing: ******************** Summary Begin\n"); /********** Startup time **********/ label = strdup("Running"); diff = timer_start[SNAPC_FULL_TIMER_RUNNING] - timer_start[SNAPC_FULL_TIMER_START]; snapc_full_display_indv_timer_core(diff, label); free(label); /********** Time to finish locally **********/ label = strdup("Finish Locally"); diff = timer_start[SNAPC_FULL_TIMER_FIN_LOCAL] - timer_start[SNAPC_FULL_TIMER_RUNNING]; snapc_full_display_indv_timer_core(diff, label); free(label); if( timer_start[SNAPC_FULL_TIMER_SS_SYNC] <= timer_start[SNAPC_FULL_TIMER_RECOVERED] ) { /********** SStore Sync **********/ label = strdup("SStore Sync"); diff = timer_start[SNAPC_FULL_TIMER_SS_SYNC] - timer_start[SNAPC_FULL_TIMER_FIN_LOCAL]; snapc_full_display_indv_timer_core(diff, label); free(label); /********** Establish Ckpt **********/ label = strdup("Establish"); diff = timer_start[SNAPC_FULL_TIMER_ESTABLISH] - timer_start[SNAPC_FULL_TIMER_SS_SYNC]; snapc_full_display_indv_timer_core(diff, label); free(label); /********** Recover **********/ label = strdup("Continue/Recover"); diff = timer_start[SNAPC_FULL_TIMER_RECOVERED] - timer_start[SNAPC_FULL_TIMER_ESTABLISH]; snapc_full_display_indv_timer_core(diff, label); free(label); } else { /* Established after procs recovered */ /********** SStore Sync **********/ label = strdup("SStore Sync*"); diff = timer_start[SNAPC_FULL_TIMER_SS_SYNC] - timer_start[SNAPC_FULL_TIMER_RECOVERED]; snapc_full_display_indv_timer_core(diff, label); free(label); /********** Establish Ckpt **********/ label = strdup("Establish*"); diff = timer_start[SNAPC_FULL_TIMER_ESTABLISH] - timer_start[SNAPC_FULL_TIMER_SS_SYNC]; snapc_full_display_indv_timer_core(diff, label); free(label); /********** Recover **********/ label = strdup("Continue/Recover*"); diff = timer_start[SNAPC_FULL_TIMER_RECOVERED] - timer_start[SNAPC_FULL_TIMER_FIN_LOCAL]; snapc_full_display_indv_timer_core(diff, label); free(label); } opal_output(0, "Snapshot Coordination Timing: ******************** Summary End\n"); } static void snapc_full_display_recovered_timers(void) { double diff = 0.0; char * label = NULL; opal_output(0, "Snapshot Coordination Timing: ******************** Summary Begin\n"); /********** Recover **********/ label = strdup("Recover"); diff = timer_start[SNAPC_FULL_TIMER_RECOVERED] - timer_start[SNAPC_FULL_TIMER_START]; snapc_full_display_indv_timer_core(diff, label); free(label); opal_output(0, "Snapshot Coordination Timing: ******************** Summary End\n"); } static void snapc_full_clear_timers(void) { int i; for(i = 0; i < SNAPC_FULL_TIMER_MAX; ++i) { timer_start[i] = 0.0; } } static double snapc_full_get_time(void) { double wtime; #if OPAL_TIMER_USEC_NATIVE wtime = (double)opal_timer_base_get_usec() / 1000000.0; #else struct timeval tv; gettimeofday(&tv, NULL); wtime = tv.tv_sec; wtime += (double)tv.tv_usec / 1000000.0; #endif return wtime; } static void snapc_full_display_indv_timer_core(double diff, char *str) { double total = 0; double perc = 0; if( timer_start[SNAPC_FULL_TIMER_SS_SYNC] <= timer_start[SNAPC_FULL_TIMER_RECOVERED] ) { total = timer_start[SNAPC_FULL_TIMER_RECOVERED] - timer_start[SNAPC_FULL_TIMER_START]; } else { total = timer_start[SNAPC_FULL_TIMER_ESTABLISH] - timer_start[SNAPC_FULL_TIMER_START]; } perc = (diff/total) * 100; opal_output(0, "snapc_full: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n", str, diff, total, perc); return; } static void snapc_full_report_progress(orte_snapc_full_orted_snapshot_t *orted_snapshot, int total, int min_state) { orte_snapc_full_orted_snapshot_t *loc_orted_snapshot = NULL; opal_list_item_t* item = NULL; double perc_done; if( ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL != orted_snapshot->state ) { return; } report_progress_cur_loc_finished++; perc_done = (total-report_progress_cur_loc_finished)/(total*1.0); perc_done = (perc_done-1)*(-100.0); if( perc_done >= (report_progress_last_reported_loc_finished + orte_snapc_full_progress_meter) || report_progress_last_reported_loc_finished == 0.0 ) { report_progress_last_reported_loc_finished = perc_done; opal_output(0, "snapc_full: progress: %10.2f %c Locally Finished\n", perc_done, '%'); } if( perc_done > 95.0 ) { opal_output(0, "snapc_full: progress: Waiting on the following daemons (%10.2f %c):", perc_done, '%'); for(item = opal_list_get_first(&(global_snapshot.local_snapshots)); item != opal_list_get_end(&(global_snapshot.local_snapshots)); item = opal_list_get_next(item) ) { loc_orted_snapshot = (orte_snapc_full_orted_snapshot_t*)item; if( ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL != loc_orted_snapshot->state ) { opal_output(0, "snapc_full: progress: Daemon %s", ORTE_NAME_PRINT(&loc_orted_snapshot->process_name)); } } } return; }