diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 587300fb75..c64302dc1f 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1311,7 +1311,7 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) * as abnormally terminated, then do not update its state * * Treat termination of any process in a continuously operating job as - * an error + * an error unless it was specifically commanded */ if (jdata->state < ORTE_JOB_STATE_TERMINATED || jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { @@ -1375,12 +1375,16 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) } break; } else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) { - /* we ordered this proc to die */ - jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD; - jdata->aborted_proc = NULL; /* no reason to save it */ - /* use the default exit code since we ordered the termination */ - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - /* now just check the remaining jobs to see if anyone is still alive */ + /* we ordered this proc to die, so it isn't an abnormal termination + * and we don't flag it as such - just check the remaining jobs to + * see if anyone is still alive + */ + if (jdata->num_terminated >= jdata->num_procs) { + /* this job has terminated - now we need to check to see if ALL + * the other jobs have also completed and wakeup if that is true + */ + jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD; + } goto CHECK_ALL_JOBS; } else if (ORTE_PROC_STATE_UNTERMINATED < proc->state && jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { @@ -1446,7 +1450,7 @@ CHECK_ALL_JOBS: * anything further - just return here */ if (NULL != jdata && ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls) { - return; + goto CHECK_ALIVE; } /* if the job that is being checked is the HNP, then we are @@ -1511,6 +1515,7 @@ CHECK_ALL_JOBS: jdata->map = NULL; } +CHECK_ALIVE: /* now check to see if all jobs are 
done - release this jdata * object when we find it */ @@ -1529,7 +1534,8 @@ CHECK_ALL_JOBS: * report appropriately to the user */ if (NULL != jdata && job->jobid == jdata->jobid && - jdata->state == ORTE_JOB_STATE_TERMINATED) { + (jdata->state == ORTE_JOB_STATE_TERMINATED || + jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) { /* release this object, ensuring that the * pointer array internal accounting * is maintained! diff --git a/orte/test/system/Makefile b/orte/test/system/Makefile index b20cf67300..977553ede1 100644 --- a/orte/test/system/Makefile +++ b/orte/test/system/Makefile @@ -1,4 +1,4 @@ -PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv +PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv orte_spin segfault all: $(PROGS) diff --git a/orte/test/system/orte_mcast.c b/orte/test/system/orte_mcast.c index 3f3927ac74..bcc264a48e 100644 --- a/orte/test/system/orte_mcast.c +++ b/orte/test/system/orte_mcast.c @@ -8,6 +8,7 @@ #include "opal/dss/dss.h" #include "opal/event/event.h" +#include "opal/util/output.h" #include "orte/util/proc_info.h" #include "orte/util/name_fns.h" @@ -43,6 +44,7 @@ int main(int argc, char* argv[]) opal_buffer_t buf, *bfptr; int32_t i32=1; struct iovec iovec_array[3]; + orte_rmcast_channel_t chan=4; if (0 > (rc = orte_init(ORTE_PROC_NON_MPI))) { fprintf(stderr, "orte_nodename: couldn't init orte - error code %d\n", rc); @@ -58,13 +60,17 @@ int main(int argc, char* argv[]) if (0 == ORTE_PROC_MY_NAME->vpid) { + /* open a new channel */ + if (ORTE_SUCCESS != (rc = orte_rmcast.open_channel(&chan, "orte_mcast", NULL, -1, NULL, ORTE_RMCAST_XMIT))) 
{ + ORTE_ERROR_LOG(rc); + goto blast; + } orte_grpcomm.barrier(); - fprintf(stderr, "%d: past barrier\n", (int)ORTE_PROC_MY_NAME->vpid); OBJ_CONSTRUCT(&buf, opal_buffer_t); opal_dss.pack(&buf, &i32, 1, OPAL_INT32); if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer(ORTE_RMCAST_APP_PUBLIC_CHANNEL, - ORTE_RMCAST_TAG_WILDCARD, &buf))) { + ORTE_RMCAST_TAG_ANNOUNCE, &buf))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&buf); goto blast; @@ -74,8 +80,8 @@ int main(int argc, char* argv[]) bfptr = OBJ_NEW(opal_buffer_t); i32 = 2; opal_dss.pack(bfptr, &i32, 1, OPAL_INT32); - if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer_nb(ORTE_RMCAST_APP_PUBLIC_CHANNEL, - ORTE_RMCAST_TAG_WILDCARD, bfptr, + if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer_nb(chan, + ORTE_RMCAST_TAG_OUTPUT, bfptr, cbfunc_buf_snt, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(bfptr); @@ -98,12 +104,22 @@ int main(int argc, char* argv[]) orte_finalize(); return 0; } else { + /* open a new channel */ + if (ORTE_SUCCESS != (rc = orte_rmcast.open_channel(&chan, "orte_mcast", NULL, -1, NULL, ORTE_RMCAST_RECV))) { + ORTE_ERROR_LOG(rc); + } if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_APP_PUBLIC_CHANNEL, ORTE_RMCAST_TAG_WILDCARD, ORTE_RMCAST_PERSISTENT, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } + if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(chan, + ORTE_RMCAST_TAG_WILDCARD, + ORTE_RMCAST_PERSISTENT, + cbfunc, NULL))) { + ORTE_ERROR_LOG(rc); + } if (ORTE_SUCCESS != (rc = orte_rmcast.recv_nb(ORTE_RMCAST_APP_PUBLIC_CHANNEL, ORTE_RMCAST_TAG_WILDCARD, ORTE_RMCAST_PERSISTENT, @@ -112,7 +128,6 @@ int main(int argc, char* argv[]) } orte_grpcomm.barrier(); - fprintf(stderr, "%d: past barrier\n", (int)ORTE_PROC_MY_NAME->vpid); } opal_event_dispatch(); @@ -134,10 +149,9 @@ static void cbfunc(int status, rc = 1; opal_dss.unpack(buffer, &i32, &rc, OPAL_INT32); - fprintf(stderr, "%s GOT BUFFER MESSAGE from %s with value %d\n", + opal_output(0, "%s GOT BUFFER MESSAGE from %s on channel %d tag %d with value %d\n", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender), i32); - fflush(stderr); + ORTE_NAME_PRINT(sender), channel, tag, i32); orte_grpcomm.barrier(); @@ -172,9 +186,9 @@ static void cbfunc_iovec(int status, { int rc; - fprintf(stderr, "%s GOT IOVEC MESSAGE from %s of %d elements\n", + opal_output(0, "%s GOT IOVEC MESSAGE from %s of %d elements\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender), count); - fflush(stderr); + #if 0 if (0 != ORTE_PROC_MY_NAME->vpid) { /* send it back */ @@ -195,8 +209,7 @@ static void cbfunc_buf_snt(int status, orte_process_name_t *sender, opal_buffer_t *buf, void *cbdata) { - fprintf(stderr, "%s BUFFERED_NB SEND COMPLETE\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - fflush(stderr); + opal_output(0, "%s BUFFERED_NB SEND COMPLETE\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); OBJ_RELEASE(buf); } diff --git a/orte/test/system/orte_spin.c b/orte/test/system/orte_spin.c new file mode 100644 index 0000000000..f1b106105c --- /dev/null +++ b/orte/test/system/orte_spin.c @@ -0,0 +1,31 @@ +/* -*- C -*- + * + * $HEADER$ + * + * A program that just spins - provides mechanism for testing user-driven + * abnormal program termination + */ + +#include <stdio.h> + +#include "orte/runtime/runtime.h" + +int main(int argc, char* argv[]) +{ + + int i; + double pi; + + orte_init(ORTE_PROC_NON_MPI); + + i = 0; + while (1) { + i++; + pi = i / 3.14159256; + if (i > 100) i = 0; + } + + orte_finalize(); + + return 0; +} diff --git a/orte/test/system/segfault.c b/orte/test/system/segfault.c new file mode 100644 index 0000000000..1905e38fe2 --- /dev/null +++ b/orte/test/system/segfault.c @@ -0,0 +1,18 @@ +/* -*- C -*- + * + * $HEADER$ + * + * A program that just segfaults + */ + +#include <stdio.h> + +int main(int argc, char* argv[]) +{ + + double pi; + char *dum=NULL; + + pi = (double)*dum; + +}