One more correction to mpirun exit codes - clean up the application proc's exit codes in the orted so that non-zero exit codes generated by mpirun itself don't get "munged".
Modify the multi_abort function so that the processes all return different exit codes - this allows us to tell which one is being reported. This commit was SVN r17895.
Этот коммит содержится в:
родитель
27a73ad9ee
Коммит
6bb139e4f2
@ -1436,9 +1436,6 @@ GOTCHILD:
|
||||
goto MOVEON;
|
||||
}
|
||||
|
||||
/* save the exit code */
|
||||
child->exit_code = status;
|
||||
|
||||
/* If this child was the (vpid==0), we hooked it up to orterun's
|
||||
STDIN SOURCE earlier (do not change this without also changing
|
||||
odsl_default_fork_local_proc()). So we have to tell the SOURCE
|
||||
@ -1481,6 +1478,9 @@ GOTCHILD:
|
||||
|
||||
/* determine the state of this process */
|
||||
if(WIFEXITED(status)) {
|
||||
/* set the exit status appropriately */
|
||||
child->exit_code = WEXITSTATUS(status);
|
||||
|
||||
/* even though the process exited "normally", it is quite
|
||||
* possible that this happened via an orte_abort call - in
|
||||
* which case, we need to indicate this was an "abnormal"
|
||||
@ -1556,6 +1556,13 @@ GOTCHILD:
|
||||
* abnormal, so indicate that condition
|
||||
*/
|
||||
child->state = ORTE_PROC_STATE_ABORTED_BY_SIG;
|
||||
/* If a process was killed by a signal, then make the
|
||||
* exit code of orterun be "signo + 128" so that "prog"
|
||||
* and "orterun prog" will both set the same status
|
||||
* value for the shell
|
||||
*/
|
||||
child->exit_code = WTERMSIG(status) + 128;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:wait_local_proc child process %s terminated with signal",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
|
@ -18,7 +18,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
printf("Hello, World, I am %d of %d\n", rank, size);
|
||||
|
||||
if (0 != rank) MPI_Abort(MPI_COMM_WORLD, 2);
|
||||
if (0 != rank) MPI_Abort(MPI_COMM_WORLD, rank);
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
|
@ -580,19 +580,6 @@ static void job_completed(int trigpipe, short event, void *arg)
|
||||
num_killed, ((num_killed > 1) ? "es" : ""), orterun_basename);
|
||||
}
|
||||
}
|
||||
/* Make sure we propagate the exit code */
|
||||
if (WIFEXITED(orte_exit_status)) {
|
||||
orte_exit_status = WEXITSTATUS(orte_exit_status);
|
||||
} else if (ORTE_JOB_STATE_FAILED_TO_START == exit_state ||
|
||||
ORTE_JOB_STATE_ABORTED_WO_SYNC == exit_state) {
|
||||
/* ensure we don't treat this like a signal */
|
||||
} else {
|
||||
/* If a process was killed by a signal, then make the
|
||||
* exit code of orterun be "signo + 128" so that "prog"
|
||||
* and "orterun prog" will both set the same status
|
||||
* value for the shell */
|
||||
orte_exit_status = WTERMSIG(orte_exit_status) + 128;
|
||||
}
|
||||
|
||||
/* the job is complete - now setup an event that will
|
||||
* trigger when the orteds are gone and tell the orteds that it is
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user