1
1

Fix the none.checkpoint command.

orte-checkpoint/orte-restart do not seem to totally like orte_output, so revert them to opal_output for now. Since we have no need for the additional complexity of orte_output, we can drop it for now and revisit this if anyone needs it later.

It seems that if you set the verbose level on an output handle and then try to call a normal orte_output() on it, the message will *not* be printed. This is the same for opal_output, and seems incorrect to me because it stops some error messages from being printed out if you do not directly specify opal_output(0, ...). Maybe someone should take a look at this.


orte-checkpoint would segv if passed an incorrect PID. Fixed the return code so it errors out properly.

Thanks to Eric Roman for bringing this to my attention.

This commit was SVN r18583.
Этот коммит содержится в:
Josh Hursey 2008-06-04 14:44:11 +00:00
родитель 6e37dd0ef0
Коммит 78f14b5255
3 изменённых файлов: 48 добавлений и 64 удалений

Просмотреть файл

@ -119,6 +119,8 @@ int opal_crs_base_none_module_finalize(void)
int opal_crs_base_none_checkpoint(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_state_type_t *state)
{
int ret;
*state = OPAL_CRS_CONTINUE;
snapshot->component_name = strdup("none");
@ -127,42 +129,16 @@ int opal_crs_base_none_checkpoint(pid_t pid, opal_crs_base_snapshot_t *snapshot,
snapshot->remote_location = strdup("");
snapshot->cold_start = false;
#if 0
/* JJH - A more complete alternative if needed */
opal_crs_none_snapshot_t *snapshot = OBJ_NEW(opal_crs_none_snapshot_t);
if(NULL != snapshot->super.reference_name)
free(snapshot->super.reference_name);
snapshot->super.reference_name = strdup(base_snapshot->reference_name);
if(NULL != snapshot->super.local_location)
free(snapshot->super.local_location);
snapshot->super.local_location = strdup(base_snapshot->local_location);
if(NULL != snapshot->super.remote_location)
free(snapshot->super.remote_location);
snapshot->super.remote_location = strdup(base_snapshot->remote_location);
opal_output_verbose(10, mca_crs_none_component.super.output_handle,
"crs:none: checkpoint(%d, ---)", pid);
/*
* Create the snapshot directory
* Update the snapshot metadata
*/
snapshot->super.component_name = strdup(mca_crs_none_component.super.base_version.mca_component_name);
if( OPAL_SUCCESS != (ret = opal_crs_base_init_snapshot_directory(&snapshot->super) )) {
opal_output(mca_crs_none_component.super.output_handle,
"crs:none: checkpoint(): Error: Unable to initialize the directory for (%s).",
snapshot->super.reference_name);
if( OPAL_SUCCESS != (ret = opal_crs_base_metadata_write_token(NULL, CRS_METADATA_COMP, "none") ) ) {
opal_output(0,
"crs:none: checkpoint(): Error: Unable to write component name to the directory for (%s).",
snapshot->reference_name);
return ret;
}
/*
* Return to the caller
*/
base_snapshot = &(snapshot->super);
#endif
return OPAL_SUCCESS;
}

Просмотреть файл

@ -190,6 +190,9 @@ main(int argc, char *argv[])
* Find the HNP that we want to connect to, if it exists
***************************/
if (ORTE_SUCCESS != (ret = find_hnp())) {
opal_output(0,
"HNP with PID %d Not found!",
orte_checkpoint_globals.pid);
exit_status = ret;
goto cleanup;
}
@ -198,24 +201,24 @@ main(int argc, char *argv[])
* Checkpoint the requested PID
*******************************/
if( orte_checkpoint_globals.verbose ) {
orte_output_verbose(10, orte_checkpoint_globals.output,
opal_output_verbose(10, orte_checkpoint_globals.output,
"orte_checkpoint: Checkpointing...");
if (0 < orte_checkpoint_globals.pid) {
orte_output_verbose(10, orte_checkpoint_globals.output,
opal_output_verbose(10, orte_checkpoint_globals.output,
"\t PID %d",
orte_checkpoint_globals.pid);
} else if (ORTE_JOBID_INVALID != orte_checkpoint_globals.req_hnp){
orte_output_verbose(10, orte_checkpoint_globals.output,
opal_output_verbose(10, orte_checkpoint_globals.output,
"\t Mpirun (%s)",
ORTE_JOBID_PRINT(orte_checkpoint_globals.req_hnp));
}
orte_output_verbose(10, orte_checkpoint_globals.output,
opal_output_verbose(10, orte_checkpoint_globals.output,
"\t Connected to Mpirun %s",
ORTE_NAME_PRINT(&orterun_hnp->name));
if(orte_checkpoint_globals.term) {
orte_output_verbose(10, orte_checkpoint_globals.output,
opal_output_verbose(10, orte_checkpoint_globals.output,
"\t Terminating after checkpoint\n");
}
}
@ -381,7 +384,7 @@ static int parse_args(int argc, char *argv[]) {
* This function attempts to find an HNP to connect to.
*/
static int find_hnp(void) {
int ret;
int ret, exit_status = ORTE_SUCCESS;
opal_list_t hnp_list;
opal_list_item_t *item;
orte_hnp_contact_t *hnpcandidate;
@ -390,6 +393,7 @@ static int find_hnp(void) {
OBJ_CONSTRUCT(&hnp_list, opal_list_t);
if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list) ) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
@ -400,7 +404,7 @@ static int find_hnp(void) {
hnpcandidate->pid == orte_checkpoint_globals.pid) {
/* this is the one we want */
orterun_hnp = hnpcandidate;
ret = ORTE_SUCCESS;
exit_status = ORTE_SUCCESS;
goto cleanup;
}
}
@ -410,8 +414,12 @@ cleanup:
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&hnp_list);
return ret;
if( NULL == orterun_hnp ) {
return ORTE_ERROR;
} else {
return exit_status;
}
}
static int ckpt_init(int argc, char *argv[]) {
@ -436,16 +444,6 @@ static int ckpt_init(int argc, char *argv[]) {
return ret;
}
/*
* Setup OPAL Output handle from the verbose argument
*/
if( orte_checkpoint_globals.verbose ) {
orte_checkpoint_globals.output = orte_output_open(NULL);
orte_output_set_verbosity(orte_checkpoint_globals.output, 10);
} else {
orte_checkpoint_globals.output = 0; /* Default=STDERR */
}
/* Disable the checkpoint notification routine for this
* tool. As we will never need to checkpoint this tool.
* Note: This must happen before opal_init().
@ -469,6 +467,16 @@ static int ckpt_init(int argc, char *argv[]) {
goto cleanup;
}
/*
* Setup ORTE Output handle from the verbose argument
*/
if( orte_checkpoint_globals.verbose ) {
orte_checkpoint_globals.output = opal_output_open(NULL);
opal_output_set_verbosity(orte_checkpoint_globals.output, 10);
} else {
orte_checkpoint_globals.output = 0; /* Default=STDERR */
}
/*
* Start the listener
*/
@ -548,7 +556,7 @@ static void hnp_receiver(int status,
orte_std_cntr_t count;
int rc;
orte_output_verbose(5, orte_checkpoint_globals.output,
opal_output_verbose(5, orte_checkpoint_globals.output,
"orte_checkpoint: hnp_receiver: Receive a command message.");
/*
@ -562,7 +570,7 @@ static void hnp_receiver(int status,
switch (command) {
case ORTE_SNAPC_GLOBAL_UPDATE_CMD:
orte_output_verbose(10, orte_checkpoint_globals.output,
opal_output_verbose(10, orte_checkpoint_globals.output,
"orte_checkpoint: hnp_receiver: Status Update.");
process_ckpt_update_cmd(sender, buffer);
@ -647,7 +655,7 @@ notify_process_for_checkpoint(int term)
goto cleanup;
}
orte_output_verbose(10, orte_checkpoint_globals.output,
opal_output_verbose(10, orte_checkpoint_globals.output,
"orte_checkpoint: notify_hnp: Contact Head Node Process PID %d\n",
orte_checkpoint_globals.pid);
@ -678,7 +686,7 @@ notify_process_for_checkpoint(int term)
goto cleanup;
}
orte_output_verbose(10, orte_checkpoint_globals.output,
opal_output_verbose(10, orte_checkpoint_globals.output,
"orte_checkpoint: notify_hnp: Requested a checkpoint of jobid %s\n",
ORTE_JOBID_PRINT(jobid));
@ -704,7 +712,7 @@ static int pretty_print_status(void) {
state_str = orte_snapc_ckpt_state_str(orte_checkpoint_globals.ckpt_status);
orte_output(orte_checkpoint_globals.output,
opal_output(0,
"%*s - Global Snapshot Reference: %s\n",
25, state_str, global_snapshot_handle);
if( NULL != state_str) {

Просмотреть файл

@ -189,15 +189,15 @@ main(int argc, char *argv[])
* Restart in this process [mpirun/orterun]
******************************/
if( orte_restart_globals.verbose ) {
orte_output_verbose(10, orte_restart_globals.output,
opal_output_verbose(10, orte_restart_globals.output,
"Restarting from file (%s)",
orte_restart_globals.filename);
if( orte_restart_globals.forked ) {
orte_output_verbose(10, orte_restart_globals.output,
opal_output_verbose(10, orte_restart_globals.output,
"\t Forking off a child");
} else {
orte_output_verbose(10, orte_restart_globals.output,
opal_output_verbose(10, orte_restart_globals.output,
"\t Exec in self");
}
}
@ -209,7 +209,7 @@ main(int argc, char *argv[])
goto cleanup;
}
orte_output_verbose(10, orte_restart_globals.output,
opal_output_verbose(10, orte_restart_globals.output,
"orte_restart: Restarted Child with PID = %d\n", child_pid);
/***************
@ -251,8 +251,8 @@ static int initialize(int argc, char *argv[]) {
* Setup OPAL Output handle from the verbose argument
*/
if( orte_restart_globals.verbose ) {
orte_restart_globals.output = orte_output_open(NULL);
orte_output_set_verbosity(orte_restart_globals.output, 10);
orte_restart_globals.output = opal_output_open(NULL);
opal_output_set_verbosity(orte_restart_globals.output, 10);
} else {
orte_restart_globals.output = 0; /* Default=STDERR */
}
@ -412,7 +412,7 @@ static int check_file(orte_snapc_base_global_snapshot_t *snapshot)
{
int ret, exit_status = ORTE_SUCCESS;
orte_output_verbose(10, orte_restart_globals.output,
opal_output_verbose(10, orte_restart_globals.output,
"Checking for the existence of (%s)\n",
snapshot->local_location);
@ -545,7 +545,7 @@ static int spawn_children(orte_snapc_base_global_snapshot_t *snapshot, pid_t *ch
/* Child Process */
status = execvp(strdup(argv[0]), argv);
if( 0 > status) {
orte_output(orte_restart_globals.output,
opal_output(orte_restart_globals.output,
"orte_restart: execv failed with status = %d\n",
status);
}
@ -557,7 +557,7 @@ static int spawn_children(orte_snapc_base_global_snapshot_t *snapshot, pid_t *ch
;
}
else {
orte_output(orte_restart_globals.output,
opal_output(orte_restart_globals.output,
"orte_restart: fork failed: This should never happen!");
/* Fork failed :( */
exit_status = *child_pid;