Fix the none.checkpoint command.
orte-checkpoint/orte-restart do not seem to totally like orte_output, so revert them to opal_output for now. Since we have no need for the additional complexity of orte_output, we can drop it for now and revisit this if anyone needs it later. It seems that if you set the verbose level on an output handle and then try to call a normal orte_output() on it, the message will *not* be printed. This is the same for opal_output, and seems incorrect to me because it stops some error messages from being printed out if you do not directly specify opal_output(0, ...). Maybe someone should take a look at this. orte-checkpoint would segv if passed an incorrect PID. Fixed the return code so it errors out properly. Thanks to Eric Roman for bringing this to my attention. This commit was SVN r18583.
Этот коммит содержится в:
родитель
6e37dd0ef0
Коммит
78f14b5255
@ -119,6 +119,8 @@ int opal_crs_base_none_module_finalize(void)
|
|||||||
|
|
||||||
int opal_crs_base_none_checkpoint(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_state_type_t *state)
|
int opal_crs_base_none_checkpoint(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_state_type_t *state)
|
||||||
{
|
{
|
||||||
|
int ret;
|
||||||
|
|
||||||
*state = OPAL_CRS_CONTINUE;
|
*state = OPAL_CRS_CONTINUE;
|
||||||
|
|
||||||
snapshot->component_name = strdup("none");
|
snapshot->component_name = strdup("none");
|
||||||
@ -127,42 +129,16 @@ int opal_crs_base_none_checkpoint(pid_t pid, opal_crs_base_snapshot_t *snapshot,
|
|||||||
snapshot->remote_location = strdup("");
|
snapshot->remote_location = strdup("");
|
||||||
snapshot->cold_start = false;
|
snapshot->cold_start = false;
|
||||||
|
|
||||||
#if 0
|
|
||||||
/* JJH - A more complete alternative if needed */
|
|
||||||
opal_crs_none_snapshot_t *snapshot = OBJ_NEW(opal_crs_none_snapshot_t);
|
|
||||||
|
|
||||||
if(NULL != snapshot->super.reference_name)
|
|
||||||
free(snapshot->super.reference_name);
|
|
||||||
snapshot->super.reference_name = strdup(base_snapshot->reference_name);
|
|
||||||
|
|
||||||
if(NULL != snapshot->super.local_location)
|
|
||||||
free(snapshot->super.local_location);
|
|
||||||
snapshot->super.local_location = strdup(base_snapshot->local_location);
|
|
||||||
|
|
||||||
if(NULL != snapshot->super.remote_location)
|
|
||||||
free(snapshot->super.remote_location);
|
|
||||||
snapshot->super.remote_location = strdup(base_snapshot->remote_location);
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_none_component.super.output_handle,
|
|
||||||
"crs:none: checkpoint(%d, ---)", pid);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Create the snapshot directory
|
* Update the snapshot metadata
|
||||||
*/
|
*/
|
||||||
snapshot->super.component_name = strdup(mca_crs_none_component.super.base_version.mca_component_name);
|
if( OPAL_SUCCESS != (ret = opal_crs_base_metadata_write_token(NULL, CRS_METADATA_COMP, "none") ) ) {
|
||||||
if( OPAL_SUCCESS != (ret = opal_crs_base_init_snapshot_directory(&snapshot->super) )) {
|
opal_output(0,
|
||||||
opal_output(mca_crs_none_component.super.output_handle,
|
"crs:none: checkpoint(): Error: Unable to write component name to the directory for (%s).",
|
||||||
"crs:none: checkpoint(): Error: Unable to initialize the directory for (%s).",
|
snapshot->reference_name);
|
||||||
snapshot->super.reference_name);
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Return to the caller
|
|
||||||
*/
|
|
||||||
base_snapshot = &(snapshot->super);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -190,6 +190,9 @@ main(int argc, char *argv[])
|
|||||||
* Find the HNP that we want to connect to, if it exists
|
* Find the HNP that we want to connect to, if it exists
|
||||||
***************************/
|
***************************/
|
||||||
if (ORTE_SUCCESS != (ret = find_hnp())) {
|
if (ORTE_SUCCESS != (ret = find_hnp())) {
|
||||||
|
opal_output(0,
|
||||||
|
"HNP with PID %d Not found!",
|
||||||
|
orte_checkpoint_globals.pid);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
@ -198,24 +201,24 @@ main(int argc, char *argv[])
|
|||||||
* Checkpoint the requested PID
|
* Checkpoint the requested PID
|
||||||
*******************************/
|
*******************************/
|
||||||
if( orte_checkpoint_globals.verbose ) {
|
if( orte_checkpoint_globals.verbose ) {
|
||||||
orte_output_verbose(10, orte_checkpoint_globals.output,
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||||
"orte_checkpoint: Checkpointing...");
|
"orte_checkpoint: Checkpointing...");
|
||||||
if (0 < orte_checkpoint_globals.pid) {
|
if (0 < orte_checkpoint_globals.pid) {
|
||||||
orte_output_verbose(10, orte_checkpoint_globals.output,
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||||
"\t PID %d",
|
"\t PID %d",
|
||||||
orte_checkpoint_globals.pid);
|
orte_checkpoint_globals.pid);
|
||||||
} else if (ORTE_JOBID_INVALID != orte_checkpoint_globals.req_hnp){
|
} else if (ORTE_JOBID_INVALID != orte_checkpoint_globals.req_hnp){
|
||||||
orte_output_verbose(10, orte_checkpoint_globals.output,
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||||
"\t Mpirun (%s)",
|
"\t Mpirun (%s)",
|
||||||
ORTE_JOBID_PRINT(orte_checkpoint_globals.req_hnp));
|
ORTE_JOBID_PRINT(orte_checkpoint_globals.req_hnp));
|
||||||
}
|
}
|
||||||
|
|
||||||
orte_output_verbose(10, orte_checkpoint_globals.output,
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||||
"\t Connected to Mpirun %s",
|
"\t Connected to Mpirun %s",
|
||||||
ORTE_NAME_PRINT(&orterun_hnp->name));
|
ORTE_NAME_PRINT(&orterun_hnp->name));
|
||||||
|
|
||||||
if(orte_checkpoint_globals.term) {
|
if(orte_checkpoint_globals.term) {
|
||||||
orte_output_verbose(10, orte_checkpoint_globals.output,
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||||
"\t Terminating after checkpoint\n");
|
"\t Terminating after checkpoint\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -381,7 +384,7 @@ static int parse_args(int argc, char *argv[]) {
|
|||||||
* This function attempts to find an HNP to connect to.
|
* This function attempts to find an HNP to connect to.
|
||||||
*/
|
*/
|
||||||
static int find_hnp(void) {
|
static int find_hnp(void) {
|
||||||
int ret;
|
int ret, exit_status = ORTE_SUCCESS;
|
||||||
opal_list_t hnp_list;
|
opal_list_t hnp_list;
|
||||||
opal_list_item_t *item;
|
opal_list_item_t *item;
|
||||||
orte_hnp_contact_t *hnpcandidate;
|
orte_hnp_contact_t *hnpcandidate;
|
||||||
@ -390,6 +393,7 @@ static int find_hnp(void) {
|
|||||||
OBJ_CONSTRUCT(&hnp_list, opal_list_t);
|
OBJ_CONSTRUCT(&hnp_list, opal_list_t);
|
||||||
if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list) ) ) {
|
if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list) ) ) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -400,7 +404,7 @@ static int find_hnp(void) {
|
|||||||
hnpcandidate->pid == orte_checkpoint_globals.pid) {
|
hnpcandidate->pid == orte_checkpoint_globals.pid) {
|
||||||
/* this is the one we want */
|
/* this is the one we want */
|
||||||
orterun_hnp = hnpcandidate;
|
orterun_hnp = hnpcandidate;
|
||||||
ret = ORTE_SUCCESS;
|
exit_status = ORTE_SUCCESS;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -411,7 +415,11 @@ cleanup:
|
|||||||
}
|
}
|
||||||
OBJ_DESTRUCT(&hnp_list);
|
OBJ_DESTRUCT(&hnp_list);
|
||||||
|
|
||||||
return ret;
|
if( NULL == orterun_hnp ) {
|
||||||
|
return ORTE_ERROR;
|
||||||
|
} else {
|
||||||
|
return exit_status;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static int ckpt_init(int argc, char *argv[]) {
|
static int ckpt_init(int argc, char *argv[]) {
|
||||||
@ -436,16 +444,6 @@ static int ckpt_init(int argc, char *argv[]) {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Setup OPAL Output handle from the verbose argument
|
|
||||||
*/
|
|
||||||
if( orte_checkpoint_globals.verbose ) {
|
|
||||||
orte_checkpoint_globals.output = orte_output_open(NULL);
|
|
||||||
orte_output_set_verbosity(orte_checkpoint_globals.output, 10);
|
|
||||||
} else {
|
|
||||||
orte_checkpoint_globals.output = 0; /* Default=STDERR */
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Disable the checkpoint notification routine for this
|
/* Disable the checkpoint notification routine for this
|
||||||
* tool. As we will never need to checkpoint this tool.
|
* tool. As we will never need to checkpoint this tool.
|
||||||
* Note: This must happen before opal_init().
|
* Note: This must happen before opal_init().
|
||||||
@ -469,6 +467,16 @@ static int ckpt_init(int argc, char *argv[]) {
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Setup ORTE Output handle from the verbose argument
|
||||||
|
*/
|
||||||
|
if( orte_checkpoint_globals.verbose ) {
|
||||||
|
orte_checkpoint_globals.output = opal_output_open(NULL);
|
||||||
|
opal_output_set_verbosity(orte_checkpoint_globals.output, 10);
|
||||||
|
} else {
|
||||||
|
orte_checkpoint_globals.output = 0; /* Default=STDERR */
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Start the listener
|
* Start the listener
|
||||||
*/
|
*/
|
||||||
@ -548,7 +556,7 @@ static void hnp_receiver(int status,
|
|||||||
orte_std_cntr_t count;
|
orte_std_cntr_t count;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
orte_output_verbose(5, orte_checkpoint_globals.output,
|
opal_output_verbose(5, orte_checkpoint_globals.output,
|
||||||
"orte_checkpoint: hnp_receiver: Receive a command message.");
|
"orte_checkpoint: hnp_receiver: Receive a command message.");
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -562,7 +570,7 @@ static void hnp_receiver(int status,
|
|||||||
|
|
||||||
switch (command) {
|
switch (command) {
|
||||||
case ORTE_SNAPC_GLOBAL_UPDATE_CMD:
|
case ORTE_SNAPC_GLOBAL_UPDATE_CMD:
|
||||||
orte_output_verbose(10, orte_checkpoint_globals.output,
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||||
"orte_checkpoint: hnp_receiver: Status Update.");
|
"orte_checkpoint: hnp_receiver: Status Update.");
|
||||||
|
|
||||||
process_ckpt_update_cmd(sender, buffer);
|
process_ckpt_update_cmd(sender, buffer);
|
||||||
@ -647,7 +655,7 @@ notify_process_for_checkpoint(int term)
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
orte_output_verbose(10, orte_checkpoint_globals.output,
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||||
"orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n",
|
"orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n",
|
||||||
orte_checkpoint_globals.pid);
|
orte_checkpoint_globals.pid);
|
||||||
|
|
||||||
@ -678,7 +686,7 @@ notify_process_for_checkpoint(int term)
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
orte_output_verbose(10, orte_checkpoint_globals.output,
|
opal_output_verbose(10, orte_checkpoint_globals.output,
|
||||||
"orte_checkpoint: notify_hnp: Requested a checkpoint of jobid %s\n",
|
"orte_checkpoint: notify_hnp: Requested a checkpoint of jobid %s\n",
|
||||||
ORTE_JOBID_PRINT(jobid));
|
ORTE_JOBID_PRINT(jobid));
|
||||||
|
|
||||||
@ -704,7 +712,7 @@ static int pretty_print_status(void) {
|
|||||||
|
|
||||||
state_str = orte_snapc_ckpt_state_str(orte_checkpoint_globals.ckpt_status);
|
state_str = orte_snapc_ckpt_state_str(orte_checkpoint_globals.ckpt_status);
|
||||||
|
|
||||||
orte_output(orte_checkpoint_globals.output,
|
opal_output(0,
|
||||||
"%*s - Global Snapshot Reference: %s\n",
|
"%*s - Global Snapshot Reference: %s\n",
|
||||||
25, state_str, global_snapshot_handle);
|
25, state_str, global_snapshot_handle);
|
||||||
if( NULL != state_str) {
|
if( NULL != state_str) {
|
||||||
|
@ -189,15 +189,15 @@ main(int argc, char *argv[])
|
|||||||
* Restart in this process [mpirun/orterun]
|
* Restart in this process [mpirun/orterun]
|
||||||
******************************/
|
******************************/
|
||||||
if( orte_restart_globals.verbose ) {
|
if( orte_restart_globals.verbose ) {
|
||||||
orte_output_verbose(10, orte_restart_globals.output,
|
opal_output_verbose(10, orte_restart_globals.output,
|
||||||
"Restarting from file (%s)",
|
"Restarting from file (%s)",
|
||||||
orte_restart_globals.filename);
|
orte_restart_globals.filename);
|
||||||
|
|
||||||
if( orte_restart_globals.forked ) {
|
if( orte_restart_globals.forked ) {
|
||||||
orte_output_verbose(10, orte_restart_globals.output,
|
opal_output_verbose(10, orte_restart_globals.output,
|
||||||
"\t Forking off a child");
|
"\t Forking off a child");
|
||||||
} else {
|
} else {
|
||||||
orte_output_verbose(10, orte_restart_globals.output,
|
opal_output_verbose(10, orte_restart_globals.output,
|
||||||
"\t Exec in self");
|
"\t Exec in self");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -209,7 +209,7 @@ main(int argc, char *argv[])
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
orte_output_verbose(10, orte_restart_globals.output,
|
opal_output_verbose(10, orte_restart_globals.output,
|
||||||
"orte_restart: Restarted Child with PID = %d\n", child_pid);
|
"orte_restart: Restarted Child with PID = %d\n", child_pid);
|
||||||
|
|
||||||
/***************
|
/***************
|
||||||
@ -251,8 +251,8 @@ static int initialize(int argc, char *argv[]) {
|
|||||||
* Setup OPAL Output handle from the verbose argument
|
* Setup OPAL Output handle from the verbose argument
|
||||||
*/
|
*/
|
||||||
if( orte_restart_globals.verbose ) {
|
if( orte_restart_globals.verbose ) {
|
||||||
orte_restart_globals.output = orte_output_open(NULL);
|
orte_restart_globals.output = opal_output_open(NULL);
|
||||||
orte_output_set_verbosity(orte_restart_globals.output, 10);
|
opal_output_set_verbosity(orte_restart_globals.output, 10);
|
||||||
} else {
|
} else {
|
||||||
orte_restart_globals.output = 0; /* Default=STDERR */
|
orte_restart_globals.output = 0; /* Default=STDERR */
|
||||||
}
|
}
|
||||||
@ -412,7 +412,7 @@ static int check_file(orte_snapc_base_global_snapshot_t *snapshot)
|
|||||||
{
|
{
|
||||||
int ret, exit_status = ORTE_SUCCESS;
|
int ret, exit_status = ORTE_SUCCESS;
|
||||||
|
|
||||||
orte_output_verbose(10, orte_restart_globals.output,
|
opal_output_verbose(10, orte_restart_globals.output,
|
||||||
"Checking for the existence of (%s)\n",
|
"Checking for the existence of (%s)\n",
|
||||||
snapshot->local_location);
|
snapshot->local_location);
|
||||||
|
|
||||||
@ -545,7 +545,7 @@ static int spawn_children(orte_snapc_base_global_snapshot_t *snapshot, pid_t *ch
|
|||||||
/* Child Process */
|
/* Child Process */
|
||||||
status = execvp(strdup(argv[0]), argv);
|
status = execvp(strdup(argv[0]), argv);
|
||||||
if( 0 > status) {
|
if( 0 > status) {
|
||||||
orte_output(orte_restart_globals.output,
|
opal_output(orte_restart_globals.output,
|
||||||
"orte_restart: execv failed with status = %d\n",
|
"orte_restart: execv failed with status = %d\n",
|
||||||
status);
|
status);
|
||||||
}
|
}
|
||||||
@ -557,7 +557,7 @@ static int spawn_children(orte_snapc_base_global_snapshot_t *snapshot, pid_t *ch
|
|||||||
;
|
;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
orte_output(orte_restart_globals.output,
|
opal_output(orte_restart_globals.output,
|
||||||
"orte_restart: fork failed: This should never happen!");
|
"orte_restart: fork failed: This should never happen!");
|
||||||
/* Fork failed :( */
|
/* Fork failed :( */
|
||||||
exit_status = *child_pid;
|
exit_status = *child_pid;
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user