1
1

Temporary fix -- will be obviated by the size_t (et al.) fixes coming soon.

Make ORTE_EXIT_CODE be the same as INT32, not INT8.  This allows the
full propagation of the value returned by waitpid() rather than just
the lowest 8 bits.  Also change the naming of it in orterun to be
exit_status, not exit_code (per POSIX standard naming convention).

orterun now returns the first nonzero exit status that it receives.

This commit was SVN r5530.
Этот коммит содержится в:
Jeff Squyres 2005-04-29 00:36:07 +00:00
родитель ca35c1276c
Коммит c13c802944
4 изменённых файлов: 26 добавлений и 21 удалений

Просмотреть файл

@ -62,7 +62,6 @@ size_t orte_dps_memory_required(void *src, size_t num_vals, orte_data_type_t typ
case ORTE_DATA_TYPE: case ORTE_DATA_TYPE:
case ORTE_NODE_STATE: case ORTE_NODE_STATE:
case ORTE_PROC_STATE: case ORTE_PROC_STATE:
case ORTE_EXIT_CODE:
case ORTE_BOOL: case ORTE_BOOL:
case ORTE_BYTE: case ORTE_BYTE:
case ORTE_INT8: case ORTE_INT8:
@ -80,6 +79,7 @@ size_t orte_dps_memory_required(void *src, size_t num_vals, orte_data_type_t typ
case ORTE_JOBID: case ORTE_JOBID:
case ORTE_CELLID: case ORTE_CELLID:
case ORTE_GPR_NOTIFY_ID: case ORTE_GPR_NOTIFY_ID:
case ORTE_EXIT_CODE:
case ORTE_INT32: case ORTE_INT32:
case ORTE_UINT32: case ORTE_UINT32:
return (size_t)(num_vals * sizeof(uint32_t)); return (size_t)(num_vals * sizeof(uint32_t));

Просмотреть файл

@ -173,7 +173,6 @@ int orte_dps_pack_nobuffer(void *dst, void *src, size_t num_vals,
case ORTE_DATA_TYPE: case ORTE_DATA_TYPE:
case ORTE_NODE_STATE: case ORTE_NODE_STATE:
case ORTE_PROC_STATE: case ORTE_PROC_STATE:
case ORTE_EXIT_CODE:
case ORTE_BYTE: case ORTE_BYTE:
case ORTE_INT8: case ORTE_INT8:
case ORTE_UINT8: case ORTE_UINT8:
@ -204,6 +203,7 @@ int orte_dps_pack_nobuffer(void *dst, void *src, size_t num_vals,
case ORTE_JOBID: case ORTE_JOBID:
case ORTE_CELLID: case ORTE_CELLID:
case ORTE_GPR_NOTIFY_ID: case ORTE_GPR_NOTIFY_ID:
case ORTE_EXIT_CODE:
case ORTE_INT32: case ORTE_INT32:
case ORTE_UINT32: case ORTE_UINT32:
dptr = (char *) dst; dptr = (char *) dst;

Просмотреть файл

@ -168,7 +168,6 @@ int orte_dps_unpack_nobuffer(void *dst, void *src, size_t num_vals,
case ORTE_DATA_TYPE: case ORTE_DATA_TYPE:
case ORTE_NODE_STATE: case ORTE_NODE_STATE:
case ORTE_PROC_STATE: case ORTE_PROC_STATE:
case ORTE_EXIT_CODE:
case ORTE_BYTE: case ORTE_BYTE:
case ORTE_INT8: case ORTE_INT8:
case ORTE_UINT8: case ORTE_UINT8:
@ -209,6 +208,7 @@ int orte_dps_unpack_nobuffer(void *dst, void *src, size_t num_vals,
case ORTE_JOBID: case ORTE_JOBID:
case ORTE_CELLID: case ORTE_CELLID:
case ORTE_GPR_NOTIFY_ID: case ORTE_GPR_NOTIFY_ID:
case ORTE_EXIT_CODE:
case ORTE_INT32: case ORTE_INT32:
case ORTE_UINT32: case ORTE_UINT32:

Просмотреть файл

@ -86,7 +86,7 @@ struct globals_t {
bool no_wait_for_job_completion; bool no_wait_for_job_completion;
bool debug; bool debug;
int num_procs; int num_procs;
int exit_code; int exit_status;
char *hostfile; char *hostfile;
char *env_val; char *env_val;
char *appfile; char *appfile;
@ -99,7 +99,7 @@ static bool globals_init = false;
struct proc_info_t { struct proc_info_t {
bool reported; bool reported;
int32_t exit_code; int32_t exit_status;
}; };
struct proc_info_t *proc_infos = NULL; struct proc_info_t *proc_infos = NULL;
@ -239,7 +239,7 @@ int main(int argc, char *argv[], char* env[])
} }
for (i = 0; i < j; ++i) { for (i = 0; i < j; ++i) {
proc_infos[i].reported = false; proc_infos[i].reported = false;
proc_infos[i].exit_code = 0; proc_infos[i].exit_status = 0;
} }
/* Intialize our Open RTE environment */ /* Intialize our Open RTE environment */
@ -275,8 +275,13 @@ int main(int argc, char *argv[], char* env[])
&orterun_globals.lock); &orterun_globals.lock);
} }
/* Make sure we propagate the exit code */ /* Make sure we propagate the exit code */
rc = orterun_globals.exit_code; if (WIFEXITED(orterun_globals.exit_status)) {
rc = WEXITSTATUS(orterun_globals.exit_status);
} else {
rc = WTERMSIG(orterun_globals.exit_status);
}
OMPI_THREAD_UNLOCK(&orterun_globals.lock); OMPI_THREAD_UNLOCK(&orterun_globals.lock);
orte_gpr.dump_segments(0);
/* If we showed more abort messages than were allowed, /* If we showed more abort messages than were allowed,
show a followup message here */ show a followup message here */
@ -315,8 +320,8 @@ static void dump_aborted_procs(orte_jobid_t jobid)
orte_gpr_value_t** values = NULL; orte_gpr_value_t** values = NULL;
int i, k, num_values = 0; int i, k, num_values = 0;
int rc; int rc;
int32_t exit_code = 0; int32_t exit_status = 0;
bool exit_code_set; bool exit_status_set;
char *keys[] = { char *keys[] = {
ORTE_PROC_NAME_KEY, ORTE_PROC_NAME_KEY,
ORTE_PROC_PID_KEY, ORTE_PROC_PID_KEY,
@ -353,8 +358,8 @@ static void dump_aborted_procs(orte_jobid_t jobid)
uint32_t rank = -1; uint32_t rank = -1;
char* node_name = NULL; char* node_name = NULL;
exit_code = 0; exit_status = 0;
exit_code_set = false; exit_status_set = false;
for(k=0; k < value->cnt; k++) { for(k=0; k < value->cnt; k++) {
orte_gpr_keyval_t* keyval = value->keyvals[k]; orte_gpr_keyval_t* keyval = value->keyvals[k];
if(strcmp(keyval->key, ORTE_PROC_NAME_KEY) == 0) { if(strcmp(keyval->key, ORTE_PROC_NAME_KEY) == 0) {
@ -370,8 +375,8 @@ static void dump_aborted_procs(orte_jobid_t jobid)
continue; continue;
} }
if(strcmp(keyval->key, ORTE_PROC_EXIT_CODE_KEY) == 0) { if(strcmp(keyval->key, ORTE_PROC_EXIT_CODE_KEY) == 0) {
exit_code = keyval->value.i32; exit_status = keyval->value.exit_code;
exit_code_set = true; exit_status_set = true;
continue; continue;
} }
if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) { if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) {
@ -379,33 +384,33 @@ static void dump_aborted_procs(orte_jobid_t jobid)
continue; continue;
} }
} }
if (rank >= 0 && exit_code_set) { if (rank >= 0 && exit_status_set) {
proc_infos[rank].exit_code = exit_code; proc_infos[rank].exit_status = exit_status;
} }
if (WIFSIGNALED(exit_code) && rank >= 0 && if (WIFSIGNALED(exit_status) && rank >= 0 &&
!proc_infos[rank].reported) { !proc_infos[rank].reported) {
proc_infos[rank].reported = true; proc_infos[rank].reported = true;
if (9 == WTERMSIG(exit_code)) { if (9 == WTERMSIG(exit_status)) {
++num_killed; ++num_killed;
} else { } else {
if (num_aborted < max_display_aborted) { if (num_aborted < max_display_aborted) {
fprintf(stderr, "Job rank %d (pid %d) on node \"%s\" exited on signal %d\n", fprintf(stderr, "Job rank %d (pid %d) on node \"%s\" exited on signal %d\n",
rank, pid, node_name, WTERMSIG(exit_code)); rank, pid, node_name, WTERMSIG(exit_status));
} }
++num_aborted; ++num_aborted;
} }
} }
/* If we haven't done so already, hold the exit_code so we can /* If we haven't done so already, hold the exit_status so we can
return it when exiting. Specifically, keep the first return it when exiting. Specifically, keep the first
non-zero entry. If they all return zero, we'll return non-zero entry. If they all return zero, we'll return
zero. */ zero. */
OMPI_THREAD_LOCK(&orterun_globals.lock); OMPI_THREAD_LOCK(&orterun_globals.lock);
if (0 == orterun_globals.exit_code && exit_code_set) { if (0 == orterun_globals.exit_status && exit_status_set) {
orterun_globals.exit_code = exit_code; orterun_globals.exit_status = exit_status;
} }
OMPI_THREAD_UNLOCK(&orterun_globals.lock); OMPI_THREAD_UNLOCK(&orterun_globals.lock);