1
1

One more correction to mpirun exit codes - clean up the application proc's exit codes in the orted so that non-zero exit codes generated by mpirun itself don't get "munged".

Modify the multi_abort test so that each non-zero rank aborts with a different exit code (its own rank) - this allows us to tell which one is being reported.

This commit was SVN r17895.
Этот коммит содержится в:
Ralph Castain 2008-03-20 13:54:11 +00:00
родитель 27a73ad9ee
Коммит 6bb139e4f2
3 изменённых файлов: 11 добавлений и 17 удалений

Просмотреть файл

@ -1436,9 +1436,6 @@ GOTCHILD:
goto MOVEON;
}
/* save the exit code */
child->exit_code = status;
/* If this child was the (vpid==0), we hooked it up to orterun's
STDIN SOURCE earlier (do not change this without also changing
odsl_default_fork_local_proc()). So we have to tell the SOURCE
@ -1481,6 +1478,9 @@ GOTCHILD:
/* determine the state of this process */
if(WIFEXITED(status)) {
/* set the exit status appropriately */
child->exit_code = WEXITSTATUS(status);
/* even though the process exited "normally", it is quite
* possible that this happened via an orte_abort call - in
* which case, we need to indicate this was an "abnormal"
@ -1556,6 +1556,13 @@ GOTCHILD:
* abnormal, so indicate that condition
*/
child->state = ORTE_PROC_STATE_ABORTED_BY_SIG;
/* If a process was killed by a signal, then make the
* exit code of orterun be "signo + 128" so that "prog"
* and "orterun prog" will both set the same status
* value for the shell
*/
child->exit_code = WTERMSIG(status) + 128;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:wait_local_proc child process %s terminated with signal",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -18,7 +18,7 @@ int main(int argc, char* argv[])
printf("Hello, World, I am %d of %d\n", rank, size);
if (0 != rank) MPI_Abort(MPI_COMM_WORLD, 2);
if (0 != rank) MPI_Abort(MPI_COMM_WORLD, rank);
MPI_Finalize();
return 0;

Просмотреть файл

@ -580,19 +580,6 @@ static void job_completed(int trigpipe, short event, void *arg)
num_killed, ((num_killed > 1) ? "es" : ""), orterun_basename);
}
}
/* Make sure we propagate the exit code */
if (WIFEXITED(orte_exit_status)) {
orte_exit_status = WEXITSTATUS(orte_exit_status);
} else if (ORTE_JOB_STATE_FAILED_TO_START == exit_state ||
ORTE_JOB_STATE_ABORTED_WO_SYNC == exit_state) {
/* ensure we don't treat this like a signal */
} else {
/* If a process was killed by a signal, then make the
* exit code of orterun be "signo + 128" so that "prog"
* and "orterun prog" will both set the same status
* value for the shell */
orte_exit_status = WTERMSIG(orte_exit_status) + 128;
}
/* the job is complete - now setup an event that will
* trigger when the orteds are gone and tell the orteds that it is