1
1
openmpi/orte/tools/orte-migrate/orte-migrate.c
2011-01-26 15:36:34 +00:00

778 строки
21 KiB
C

/*
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
* ORTE Process Migration Tool for migrating processes in a multiprocess job
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include <errno.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h> /* for mkfifo */
#endif /* HAVE_SYS_STAT_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif /* HAVE_SYS_WAIT_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/util/cmd_line.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_cr.h"
#include "orte/util/hnp_contact.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "opal/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "opal/dss/dss.h"
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include MCA_timer_IMPLEMENTATION_HEADER
/******************
* Local Functions
******************/
static int tool_init(int argc, char *argv[]); /* Initalization routine */
static int tool_finalize(void); /* Finalization routine */
static int parse_args(int argc, char *argv[]);
static int find_hnp(void);
static int start_listener(void);
static int stop_listener(void);
static void hnp_receiver(int status,
orte_process_name_t* sender,
opal_buffer_t* buffer,
orte_rml_tag_t tag,
void* cbdata);
static void process_ckpt_update_cmd(orte_process_name_t* sender,
opal_buffer_t* buffer);
static int notify_hnp(void);
static int pretty_print_status(void);
static int pretty_print_migration(void);
static orte_hnp_contact_t *orterun_hnp = NULL;
static int orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;
/*****************************************
* Global Vars for Command line Arguments
*****************************************/
static bool listener_started = false;
static double timer_start = 0;
static double timer_last = 0;
static double get_time(void);
typedef struct {
bool help;
int pid;
bool verbose;
int verbose_level;
bool status;
int output;
char *off_nodes;
char *off_procs;
char *onto_nodes;
} orte_migrate_globals_t;
orte_migrate_globals_t orte_migrate_globals;
opal_cmd_line_init_t cmd_line_opts[] = {
{ NULL, NULL, NULL,
'h', NULL, "help",
0,
&orte_migrate_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
{ NULL, NULL, NULL,
'v', NULL, "verbose",
0,
&orte_migrate_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Be Verbose" },
{ NULL, NULL, NULL,
'V', NULL, NULL,
1,
&orte_migrate_globals.verbose_level, OPAL_CMD_LINE_TYPE_INT,
"Set the verbosity level (For additional debugging information)" },
{ "hnp-pid", NULL, NULL,
'\0', NULL, "hnp-pid",
1,
&orte_migrate_globals.pid, OPAL_CMD_LINE_TYPE_INT,
"This should be the pid of the mpirun whose applications you wish "
"to migrate." },
{ NULL, NULL, NULL,
'x', NULL, "off",
1,
&orte_migrate_globals.off_nodes, OPAL_CMD_LINE_TYPE_STRING,
"List of nodes to migrate off of (comma separated)" },
{ NULL, NULL, NULL,
'r', NULL, "ranks",
1,
&orte_migrate_globals.off_procs, OPAL_CMD_LINE_TYPE_STRING,
"List of MPI_COMM_WORLD ranks to migrate (comma separated)" },
{ NULL, NULL, NULL,
't', NULL, "onto",
1,
&orte_migrate_globals.onto_nodes, OPAL_CMD_LINE_TYPE_STRING,
"List of nodes to migrate onto (comma separated)" },
/* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL,
NULL }
};
int
main(int argc, char *argv[])
{
int ret, exit_status = ORTE_SUCCESS;
/***************
* Initialize
***************/
if (ORTE_SUCCESS != (ret = tool_init(argc, argv))) {
exit_status = ret;
goto cleanup;
}
/***************************
* Find the HNP that we want to connect to, if it exists
***************************/
if( orte_migrate_globals.verbose ) {
opal_output_verbose(10, orte_migrate_globals.output,
"orte_migrate: Finding HNP...");
}
if (ORTE_SUCCESS != (ret = find_hnp())) {
opal_show_help("help-orte-migrate.txt", "invalid_pid",
true, orte_migrate_globals.pid);
exit_status = ret;
goto cleanup;
}
/*******************************
* Send migration information to HNP
*******************************/
if( orte_migrate_globals.verbose ) {
opal_output_verbose(10, orte_migrate_globals.output,
"orte_migrate: Sending info to HNP...");
}
if (ORTE_SUCCESS != (ret = notify_hnp())) {
opal_output(0,
"HNP with PID %d Not found!",
orte_migrate_globals.pid);
exit_status = ret;
goto cleanup;
}
/*******************************
* Wait for migration to complete
*******************************/
while( ORTE_ERRMGR_MIGRATE_STATE_FINISH != orte_migrate_ckpt_status &&
ORTE_ERRMGR_MIGRATE_STATE_ERROR != orte_migrate_ckpt_status &&
ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS != orte_migrate_ckpt_status) {
opal_progress();
}
if( orte_migrate_globals.status ) {
orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_FINISH;
pretty_print_status();
}
cleanup:
/***************
* Cleanup
***************/
if (ORTE_SUCCESS != (ret = tool_finalize())) {
return ret;
}
return exit_status;
}
static int parse_args(int argc, char *argv[]) {
int i, ret, len, exit_status = ORTE_SUCCESS ;
opal_cmd_line_t cmd_line;
char **app_env = NULL, **global_env = NULL;
char * tmp_env_var = NULL;
/* Init structure */
memset(&orte_migrate_globals, 0, sizeof(orte_migrate_globals_t));
orte_migrate_globals.help = false;
orte_migrate_globals.pid = -1;
orte_migrate_globals.verbose = false;
orte_migrate_globals.verbose_level = 0;
orte_migrate_globals.status = false;
orte_migrate_globals.output = -1;
orte_migrate_globals.off_nodes = NULL;
orte_migrate_globals.off_procs = NULL;
orte_migrate_globals.onto_nodes = NULL;
/* Parse the command line options */
opal_cmd_line_create(&cmd_line, cmd_line_opts);
mca_base_open();
mca_base_cmd_line_setup(&cmd_line);
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
/**
* Put all of the MCA arguments in the environment
*/
mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
len = opal_argv_count(app_env);
for(i = 0; i < len; ++i) {
putenv(app_env[i]);
}
len = opal_argv_count(global_env);
for(i = 0; i < len; ++i) {
putenv(global_env[i]);
}
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"1",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/**
* Now start parsing our specific arguments
*/
/* get the remaining bits */
opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
#if OPAL_ENABLE_FT_CR == 0
/* Warn and exit if not configured with Migrate/Restart */
{
char *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line);
opal_show_help("help-orte-migrate.txt", "usage-no-cr",
true, args);
free(args);
exit_status = ORTE_ERROR;
goto cleanup;
}
#endif
if (OPAL_SUCCESS != ret ||
orte_migrate_globals.help ||
0 >= argc ||
(NULL == orte_migrate_globals.off_nodes && NULL == orte_migrate_globals.off_procs) ) {
char *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line);
opal_show_help("help-orte-migrate.txt", "usage", true,
args);
free(args);
exit_status = ORTE_ERROR;
goto cleanup;
}
if(orte_migrate_globals.verbose_level < 0 ) {
orte_migrate_globals.verbose_level = 0;
}
if(orte_migrate_globals.verbose_level > 0) {
orte_migrate_globals.verbose = true;
}
/*
* If the user did not supply an hnp jobid, then they must
* supply the PID of MPIRUN
*/
if(0 >= argc ) {
exit_status = ORTE_SUCCESS;
goto cleanup;
}
orte_migrate_globals.pid = atoi(argv[0]);
if ( 0 >= orte_migrate_globals.pid ) {
opal_show_help("help-orte-migrate.txt", "invalid_pid", true,
orte_migrate_globals.pid);
exit_status = ORTE_ERROR;
goto cleanup;
}
if(orte_migrate_globals.verbose) {
orte_migrate_globals.status = true;
}
if(orte_migrate_globals.verbose) {
pretty_print_migration();
}
cleanup:
return exit_status;
}
/*
* This function attempts to find an HNP to connect to.
*/
static int find_hnp(void) {
int ret, exit_status = ORTE_SUCCESS;
opal_list_t hnp_list;
opal_list_item_t *item;
orte_hnp_contact_t *hnpcandidate;
/* get the list of local hnp's available to us and setup
* contact info for them into the RML
*/
OBJ_CONSTRUCT(&hnp_list, opal_list_t);
if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
/* search the list for the desired hnp */
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
hnpcandidate = (orte_hnp_contact_t*)item;
if( hnpcandidate->pid == orte_migrate_globals.pid) {
/* this is the one we want */
orterun_hnp = hnpcandidate;
exit_status = ORTE_SUCCESS;
goto cleanup;
}
}
cleanup:
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&hnp_list);
if( NULL == orterun_hnp ) {
return ORTE_ERROR;
} else {
return exit_status;
}
}
static int tool_init(int argc, char *argv[]) {
int exit_status = ORTE_SUCCESS, ret;
char * tmp_env_var = NULL;
listener_started = false;
/*
* Make sure to init util before parse_args
* to ensure installdirs is setup properly
* before calling mca_base_open();
*/
if( ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
return ret;
}
/*
* Parse Command Line Arguments
*/
if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
return ret;
}
/* Disable the migrate notification routine for this
* tool. As we will never need to migrate this tool.
* Note: This must happen before opal_init().
*/
opal_cr_set_enabled(false);
/* Select the none component, since we don't actually use a migrateer */
tmp_env_var = mca_base_param_env_var("crs");
opal_setenv(tmp_env_var,
"none",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/***************************
* We need all of OPAL and the TOOLS portion of ORTE - this
* sets us up so we can talk to any HNP over the wire
***************************/
if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_TOOL))) {
exit_status = ret;
goto cleanup;
}
/*
* Setup ORTE Output handle from the verbose argument
*/
if( orte_migrate_globals.verbose ) {
orte_migrate_globals.output = opal_output_open(NULL);
opal_output_set_verbosity(orte_migrate_globals.output, orte_migrate_globals.verbose_level);
} else {
orte_migrate_globals.output = 0; /* Default=STDERR */
}
/*
* Start the listener
*/
if( ORTE_SUCCESS != (ret = start_listener() ) ) {
exit_status = ret;
}
cleanup:
return exit_status;
}
static int tool_finalize(void) {
int exit_status = ORTE_SUCCESS, ret;
/*
* Stop the listener
*/
if( ORTE_SUCCESS != (ret = stop_listener() ) ) {
exit_status = ret;
}
if (ORTE_SUCCESS != (ret = orte_finalize())) {
exit_status = ret;
}
return exit_status;
}
static int start_listener(void)
{
int ret, exit_status = ORTE_SUCCESS;
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_MIGRATE,
ORTE_RML_PERSISTENT,
hnp_receiver,
NULL))) {
exit_status = ret;
goto cleanup;
}
listener_started = true;
cleanup:
return exit_status;
}
static int stop_listener(void)
{
int ret, exit_status = ORTE_SUCCESS;
if( !listener_started ) {
exit_status = ORTE_ERROR;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = orte_rml.recv_cancel(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_MIGRATE))) {
exit_status = ret;
goto cleanup;
}
listener_started = false;
cleanup:
return exit_status;
}
static void hnp_receiver(int status,
orte_process_name_t* sender,
opal_buffer_t* buffer,
orte_rml_tag_t tag,
void* cbdata)
{
orte_errmgr_tool_cmd_flag_t command;
orte_std_cntr_t count;
int rc;
opal_output_verbose(5, orte_migrate_globals.output,
"orte_migrate: hnp_receiver: Receive a command message.");
/*
* Otherwise this is an inter-coordinator command (usually updating state info).
*/
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_ERRMGR_MIGRATE_TOOL_CMD))) {
ORTE_ERROR_LOG(rc);
return;
}
switch (command) {
case ORTE_ERRMGR_MIGRATE_TOOL_UPDATE_CMD:
opal_output_verbose(10, orte_migrate_globals.output,
"orte_migrate: hnp_receiver: Status Update.");
process_ckpt_update_cmd(sender, buffer);
break;
case ORTE_ERRMGR_MIGRATE_TOOL_INIT_CMD:
/* Do Nothing */
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
}
}
static void process_ckpt_update_cmd(orte_process_name_t* sender,
opal_buffer_t* buffer)
{
int ret, exit_status = ORTE_SUCCESS;
orte_std_cntr_t count = 1;
int ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;
/*
* Receive the data:
* - ckpt_state
*/
count = 1;
if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT)) ) {
exit_status = ret;
goto cleanup;
}
orte_migrate_ckpt_status = ckpt_status;
/*
* If the job is not able to be migrateed, then return
*/
if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_migrate_ckpt_status) {
opal_show_help("help-orte-migrate.txt", "non-ckptable",
true,
orte_migrate_globals.pid);
exit_status = ORTE_ERROR;
goto cleanup;
}
/*
* If a migration is already in progress, then we must tell the user to
* try again later.
*/
if( ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS == orte_migrate_ckpt_status) {
opal_show_help("help-orte-migrate.txt", "err-inprogress",
true,
orte_migrate_globals.pid);
exit_status = ORTE_ERROR;
goto cleanup;
}
/*
* If there was an error, display a message and exit
*/
if( ORTE_ERRMGR_MIGRATE_STATE_ERROR == orte_migrate_ckpt_status ) {
opal_show_help("help-orte-migrate.txt", "err-other",
true,
orte_migrate_globals.pid);
exit_status = ORTE_ERROR;
goto cleanup;
}
/*
* If we are to display the status progression
*/
if( orte_migrate_globals.status ) {
if(ORTE_ERRMGR_MIGRATE_STATE_FINISH != orte_migrate_ckpt_status) {
pretty_print_status();
}
}
cleanup:
return;
}
static int notify_hnp(void)
{
int ret, exit_status = ORTE_SUCCESS;
opal_buffer_t *buffer = NULL;
orte_errmgr_tool_cmd_flag_t command = ORTE_ERRMGR_MIGRATE_TOOL_INIT_CMD;
if (NULL == (buffer = OBJ_NEW(opal_buffer_t))) {
exit_status = ORTE_ERROR;
goto cleanup;
}
opal_output_verbose(10, orte_migrate_globals.output,
"orte_migrate: notify_hnp: Contact Head Node Process PID %d\n",
orte_migrate_globals.pid);
timer_start = get_time();
/***********************************
* Notify HNP of migrate request
* Send:
* - Command
* - Off Nodes
* - Off Procs
* - Onto Nodes
***********************************/
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_ERRMGR_MIGRATE_TOOL_CMD)) ) {
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(orte_migrate_globals.off_procs), 1, OPAL_STRING)) ) {
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(orte_migrate_globals.off_nodes), 1, OPAL_STRING)) ) {
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(orte_migrate_globals.onto_nodes), 1, OPAL_STRING)) ) {
exit_status = ret;
goto cleanup;
}
if ( 0 > (ret = orte_rml.send_buffer(&(orterun_hnp->name), buffer, ORTE_RML_TAG_MIGRATE, 0)) ) {
exit_status = ret;
goto cleanup;
}
cleanup:
if( NULL != buffer) {
OBJ_RELEASE(buffer);
buffer = NULL;
}
if( ORTE_SUCCESS != exit_status ) {
opal_show_help("help-orte-migrate.txt", "unable_to_connect", true,
orte_migrate_globals.pid);
}
return exit_status;
}
/***************
* Pretty Print
***************/
static double get_time(void) {
double wtime;
#if OPAL_TIMER_USEC_NATIVE
wtime = (double)opal_timer_base_get_usec() / 1000000.0;
#else
struct timeval tv;
gettimeofday(&tv, NULL);
wtime = tv.tv_sec;
wtime += (double)tv.tv_usec / 1000000.0;
#endif
return wtime;
}
static int pretty_print_status(void) {
char * state_str = NULL;
double cur_time;
cur_time = get_time();
if( timer_last == 0 ) {
timer_last = cur_time;
}
orte_errmgr_base_migrate_state_str(&state_str, orte_migrate_ckpt_status);
opal_output(0,
"[%6.2f / %6.2f] %*s - ...\n",
(cur_time - timer_last), (cur_time - timer_start),
25, state_str);
if( NULL != state_str) {
free(state_str);
}
timer_last = cur_time;
return ORTE_SUCCESS;
}
static int pretty_print_migration(void)
{
char **loc_off_nodes = NULL;
char **loc_off_procs = NULL;
char **loc_onto_nodes = NULL;
int loc_off_nodes_cnt = 0;
int loc_off_procs_cnt = 0;
int loc_onto_cnt = 0;
int i;
if( NULL != orte_migrate_globals.off_nodes ) {
loc_off_nodes = opal_argv_split(orte_migrate_globals.off_nodes, ',');
loc_off_nodes_cnt = opal_argv_count(loc_off_nodes);
}
if( NULL != orte_migrate_globals.off_procs ) {
loc_off_procs = opal_argv_split(orte_migrate_globals.off_procs, ',');
loc_off_procs_cnt = opal_argv_count(loc_off_procs);
}
if( NULL != orte_migrate_globals.onto_nodes ) {
loc_onto_nodes = opal_argv_split(orte_migrate_globals.onto_nodes, ',');
loc_onto_cnt = opal_argv_count(loc_onto_nodes);
}
printf("Migrate Nodes: (%d nodes)\n", loc_off_nodes_cnt);
for(i = 0; i < loc_off_nodes_cnt; ++i) {
printf("\t\"%s\"\n", loc_off_nodes[i]);
}
printf("Migrate Ranks: (%d ranks)\n", loc_off_procs_cnt);
for(i = 0; i < loc_off_procs_cnt; ++i) {
printf("\t\"%s\"\n", loc_off_procs[i]);
}
printf("Migrate Onto : (%d nodes)\n", loc_onto_cnt);
for(i = 0; i < loc_onto_cnt; ++i) {
printf("\t\"%s\"\n", loc_onto_nodes[i]);
}
return ORTE_SUCCESS;
}